1/*-
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993
3 *	The Regents of the University of California.
4 * Copyright (c) 2004 The FreeBSD Foundation
5 * Copyright (c) 2004-2008 Robert N. M. Watson
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 * 4. Neither the name of the University nor the names of its contributors
17 *    may be used to endorse or promote products derived from this software
18 *    without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 *
32 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
33 */
34
35/*
36 * Comments on the socket life cycle:
37 *
38 * soalloc() sets up socket layer state for a socket, called only by
39 * socreate() and sonewconn().  Socket layer private.
40 *
41 * sodealloc() tears down socket layer state for a socket, called only by
42 * sofree() and sonewconn().  Socket layer private.
43 *
44 * pru_attach() associates protocol layer state with an allocated socket;
45 * called only once, may fail, aborting socket allocation.  This is called
46 * from socreate() and sonewconn().  Socket layer private.
47 *
48 * pru_detach() disassociates protocol layer state from an attached socket,
49 * and will be called exactly once for sockets in which pru_attach() has
50 * been successfully called.  If pru_attach() returned an error,
51 * pru_detach() will not be called.  Socket layer private.
52 *
53 * pru_abort() and pru_close() notify the protocol layer that the last
54 * consumer of a socket is starting to tear down the socket, and that the
55 * protocol should terminate the connection.  Historically, pru_abort() also
56 * detached protocol state from the socket state, but this is no longer the
57 * case.
58 *
59 * socreate() creates a socket and attaches protocol state.  This is a public
60 * interface that may be used by socket layer consumers to create new
61 * sockets.
62 *
63 * sonewconn() creates a socket and attaches protocol state.  This is a
64 * public interface that may be used by protocols to create new sockets when
65 * a new connection is received and will be available for accept() on a
66 * listen socket.
67 *
68 * soclose() destroys a socket after possibly waiting for it to disconnect.
69 * This is a public interface that socket consumers should use to close and
70 * release a socket when done with it.
71 *
72 * soabort() destroys a socket without waiting for it to disconnect (used
73 * only for incoming connections that are already partially or fully
74 * connected).  This is used internally by the socket layer when clearing
75 * listen socket queues (due to overflow or close on the listen socket), but
76 * is also a public interface protocols may use to abort connections in
77 * their incomplete listen queues should they no longer be required.  Sockets
78 * placed in completed connection listen queues should not be aborted for
79 * reasons described in the comment above the soclose() implementation.  This
80 * is not a general purpose close routine, and except in the specific
81 * circumstances described here, should not be used.
82 *
83 * sofree() will free a socket and its protocol state if all references on
84 * the socket have been released, and is the public interface to attempt to
85 * free a socket when a reference is removed.  This is a socket layer private
86 * interface.
87 *
88 * NOTE: In addition to socreate() and soclose(), which provide a single
89 * socket reference to the consumer to be managed as required, there are two
90 * calls to explicitly manage socket references, soref(), and sorele().
91 * Currently, these are generally required only when transitioning a socket
92 * from a listen queue to a file descriptor, in order to prevent garbage
93 * collection of the socket at an untimely moment.  For a number of reasons,
94 * these interfaces are not preferred, and should be avoided.
95 */
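/*
 * Illustrative sketch (not part of the original file): a kernel consumer
 * might use the public life-cycle interfaces roughly as follows.  Error
 * handling is abbreviated, 'td' is the calling thread, and the port number
 * is hypothetical.
 *
 *	struct socket *so;
 *	struct sockaddr_in sin;
 *	int error;
 *
 *	error = socreate(AF_INET, &so, SOCK_STREAM, IPPROTO_TCP,
 *	    td->td_ucred, td);
 *	if (error)
 *		return (error);
 *	bzero(&sin, sizeof(sin));
 *	sin.sin_len = sizeof(sin);
 *	sin.sin_family = AF_INET;
 *	sin.sin_port = htons(8080);
 *	error = sobind(so, (struct sockaddr *)&sin, td);
 *	if (error == 0)
 *		error = solisten(so, SOMAXCONN, td);
 *	soclose(so);	releases the single reference from socreate()
 */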
96
97#include <sys/cdefs.h>
98__FBSDID("$FreeBSD: head/sys/kern/uipc_socket.c 194672 2009-06-22 23:08:05Z andre $");
99
100#include "opt_inet.h"
101#include "opt_inet6.h"
102#include "opt_zero.h"
103#include "opt_compat.h"
104
105#include <sys/param.h>
106#include <sys/systm.h>
107#include <sys/fcntl.h>
108#include <sys/limits.h>
109#include <sys/lock.h>
110#include <sys/mac.h>
111#include <sys/malloc.h>
112#include <sys/mbuf.h>
113#include <sys/mutex.h>
114#include <sys/domain.h>
115#include <sys/file.h>			/* for struct knote */
116#include <sys/kernel.h>
117#include <sys/event.h>
118#include <sys/eventhandler.h>
119#include <sys/poll.h>
120#include <sys/proc.h>
121#include <sys/protosw.h>
122#include <sys/socket.h>
123#include <sys/socketvar.h>
124#include <sys/resourcevar.h>
125#include <net/route.h>
126#include <sys/signalvar.h>
127#include <sys/stat.h>
128#include <sys/sx.h>
129#include <sys/sysctl.h>
130#include <sys/uio.h>
131#include <sys/jail.h>
132#include <sys/vimage.h>
133
134#include <security/mac/mac_framework.h>
135
136#include <vm/uma.h>
137
138#ifdef COMPAT_IA32
139#include <sys/mount.h>
140#include <sys/sysent.h>
141#include <compat/freebsd32/freebsd32.h>
142#endif
143
144static int	soreceive_rcvoob(struct socket *so, struct uio *uio,
145		    int flags);
146
147static void	filt_sordetach(struct knote *kn);
148static int	filt_soread(struct knote *kn, long hint);
149static void	filt_sowdetach(struct knote *kn);
150static int	filt_sowrite(struct knote *kn, long hint);
151static int	filt_solisten(struct knote *kn, long hint);
152
153static struct filterops solisten_filtops =
154	{ 1, NULL, filt_sordetach, filt_solisten };
155static struct filterops soread_filtops =
156	{ 1, NULL, filt_sordetach, filt_soread };
157static struct filterops sowrite_filtops =
158	{ 1, NULL, filt_sowdetach, filt_sowrite };
159
160uma_zone_t socket_zone;
161so_gen_t	so_gencnt;	/* generation count for sockets */
162
163int	maxsockets;
164
165MALLOC_DEFINE(M_SONAME, "soname", "socket name");
166MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
167
168static int somaxconn = SOMAXCONN;
169static int sysctl_somaxconn(SYSCTL_HANDLER_ARGS);
170/* XXX: we don't have SYSCTL_USHORT */
171SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLTYPE_UINT | CTLFLAG_RW,
172    0, sizeof(int), sysctl_somaxconn, "I", "Maximum pending socket connection "
173    "queue size");
174static int numopensockets;
175SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
176    &numopensockets, 0, "Number of open sockets");
177#ifdef ZERO_COPY_SOCKETS
178/* These aren't static because they're used in other files. */
179int so_zero_copy_send = 1;
180int so_zero_copy_receive = 1;
181SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0,
182    "Zero copy controls");
183SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW,
184    &so_zero_copy_receive, 0, "Enable zero copy receive");
185SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW,
186    &so_zero_copy_send, 0, "Enable zero copy send");
187#endif /* ZERO_COPY_SOCKETS */
188
189/*
190 * accept_mtx locks down per-socket fields relating to accept queues.  See
191 * socketvar.h for an annotation of the protected fields of struct socket.
192 */
193struct mtx accept_mtx;
194MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);
195
196/*
197 * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
198 * so_gencnt field.
199 */
200static struct mtx so_global_mtx;
201MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF);
202
203/*
204 * General IPC sysctl name space, used by sockets and a variety of other IPC
205 * types.
206 */
207SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");
208
209/*
210 * Sysctl to get and set the maximum global sockets limit.  Notify protocols
211 * of the change so that they can update their dependent limits as required.
212 */
213static int
214sysctl_maxsockets(SYSCTL_HANDLER_ARGS)
215{
216	int error, newmaxsockets;
217
218	newmaxsockets = maxsockets;
219	error = sysctl_handle_int(oidp, &newmaxsockets, 0, req);
220	if (error == 0 && req->newptr) {
221		if (newmaxsockets > maxsockets) {
222			maxsockets = newmaxsockets;
223			if (maxsockets > ((maxfiles / 4) * 3)) {
224				maxfiles = (maxsockets * 5) / 4;
225				maxfilesperproc = (maxfiles * 9) / 10;
226			}
227			EVENTHANDLER_INVOKE(maxsockets_change);
228		} else
229			error = EINVAL;
230	}
231	return (error);
232}
233
234SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT|CTLFLAG_RW,
235    &maxsockets, 0, sysctl_maxsockets, "IU",
236    "Maximum number of sockets available");
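/*
 * Usage note (an assumption, not taken from this file): kern.ipc.maxsockets
 * is normally set as a loader tunable or raised at runtime, e.g.:
 *
 *	sysctl kern.ipc.maxsockets=131072
 *	echo 'kern.ipc.maxsockets=131072' >> /boot/loader.conf
 *
 * Attempts to lower the value at runtime are rejected with EINVAL by
 * sysctl_maxsockets() above; the value shown is only an example.
 */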
237
238/*
239 * Initialise maxsockets.  This SYSINIT must be run after
240 * tunable_mbinit().
241 */
242static void
243init_maxsockets(void *ignored)
244{
245
246	TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
247	maxsockets = imax(maxsockets, imax(maxfiles, nmbclusters));
248}
249SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);
250
251/*
252 * Socket operation routines.  These routines are called by the routines in
253 * sys_socket.c or from a system process, and implement the semantics of
254 * socket operations by switching out to the protocol specific routines.
255 */
256
257/*
258 * Get a socket structure from our zone, and initialize it.  Note that it
259 * would probably be better to allocate socket and PCB at the same time, but
260 * I'm not convinced that all the protocols can be easily modified to do
261 * this.
262 *
263 * soalloc() returns a socket with a ref count of 0.
264 */
265static struct socket *
266soalloc(struct vnet *vnet)
267{
268	struct socket *so;
269
270	so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO);
271	if (so == NULL)
272		return (NULL);
273#ifdef MAC
274	if (mac_socket_init(so, M_NOWAIT) != 0) {
275		uma_zfree(socket_zone, so);
276		return (NULL);
277	}
278#endif
279	SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
280	SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
281	sx_init(&so->so_snd.sb_sx, "so_snd_sx");
282	sx_init(&so->so_rcv.sb_sx, "so_rcv_sx");
283	TAILQ_INIT(&so->so_aiojobq);
284	mtx_lock(&so_global_mtx);
285	so->so_gencnt = ++so_gencnt;
286	++numopensockets;
287#ifdef VIMAGE
288	++vnet->sockcnt;	/* Locked with so_global_mtx. */
289	so->so_vnet = vnet;
290#endif
291	mtx_unlock(&so_global_mtx);
292	return (so);
293}
294
295/*
296 * Free the storage associated with a socket at the socket layer, tear down
297 * locks, labels, etc.  All protocol state is assumed already to have been
298 * torn down (and possibly never set up) by the caller.
299 */
300static void
301sodealloc(struct socket *so)
302{
303
304	KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
305	KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL"));
306
307	mtx_lock(&so_global_mtx);
308	so->so_gencnt = ++so_gencnt;
309	--numopensockets;	/* Could be below, but faster here. */
310#ifdef VIMAGE
311	--so->so_vnet->sockcnt;
312#endif
313	mtx_unlock(&so_global_mtx);
314	if (so->so_rcv.sb_hiwat)
315		(void)chgsbsize(so->so_cred->cr_uidinfo,
316		    &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
317	if (so->so_snd.sb_hiwat)
318		(void)chgsbsize(so->so_cred->cr_uidinfo,
319		    &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
320#ifdef INET
321	/* Remove accept filter if one is present. */
322	if (so->so_accf != NULL)
323		do_setopt_accept_filter(so, NULL);
324#endif
325#ifdef MAC
326	mac_socket_destroy(so);
327#endif
328	crfree(so->so_cred);
329	sx_destroy(&so->so_snd.sb_sx);
330	sx_destroy(&so->so_rcv.sb_sx);
331	SOCKBUF_LOCK_DESTROY(&so->so_snd);
332	SOCKBUF_LOCK_DESTROY(&so->so_rcv);
333	uma_zfree(socket_zone, so);
334}
335
336/*
337 * socreate returns a socket with a ref count of 1.  The socket should be
338 * closed with soclose().
339 */
340int
341socreate(int dom, struct socket **aso, int type, int proto,
342    struct ucred *cred, struct thread *td)
343{
344	struct protosw *prp;
345	struct socket *so;
346	int error;
347
348	if (proto)
349		prp = pffindproto(dom, proto, type);
350	else
351		prp = pffindtype(dom, type);
352
353	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL ||
354	    prp->pr_usrreqs->pru_attach == pru_attach_notsupp)
355		return (EPROTONOSUPPORT);
356
357	if (prison_check_af(cred, prp->pr_domain->dom_family) != 0)
358		return (EPROTONOSUPPORT);
359
360	if (prp->pr_type != type)
361		return (EPROTOTYPE);
362	so = soalloc(CRED_TO_VNET(cred));
363	if (so == NULL)
364		return (ENOBUFS);
365
366	TAILQ_INIT(&so->so_incomp);
367	TAILQ_INIT(&so->so_comp);
368	so->so_type = type;
369	so->so_cred = crhold(cred);
370	if ((prp->pr_domain->dom_family == PF_INET) ||
371	    (prp->pr_domain->dom_family == PF_ROUTE))
372		so->so_fibnum = td->td_proc->p_fibnum;
373	else
374		so->so_fibnum = 0;
375	so->so_proto = prp;
376#ifdef MAC
377	mac_socket_create(cred, so);
378#endif
379	knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv));
380	knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd));
381	so->so_count = 1;
382	/*
383	 * Auto-sizing of socket buffers is managed by the protocols and
384	 * the appropriate flags must be set in the pru_attach function.
385	 */
386	CURVNET_SET(so->so_vnet);
387	error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
388	CURVNET_RESTORE();
389	if (error) {
390		KASSERT(so->so_count == 1, ("socreate: so_count %d",
391		    so->so_count));
392		so->so_count = 0;
393		sodealloc(so);
394		return (error);
395	}
396	*aso = so;
397	return (0);
398}
399
400#ifdef REGRESSION
401static int regression_sonewconn_earlytest = 1;
402SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW,
403    &regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test");
404#endif
405
406/*
407 * When an attempt at a new connection is noted on a socket which accepts
408 * connections, sonewconn is called.  If the connection is possible (subject
409 * to space constraints, etc.) then we allocate a new structure, properly
410 * linked into the data structure of the original socket, and return this.
411 * Connstatus may be 0, or SS_ISCONFIRMING, or SS_ISCONNECTED.
412 *
413 * Note: the ref count on the socket is 0 on return.
414 */
415struct socket *
416sonewconn(struct socket *head, int connstatus)
417{
418	struct socket *so;
419	int over;
420
421	ACCEPT_LOCK();
422	over = (head->so_qlen > 3 * head->so_qlimit / 2);
423	ACCEPT_UNLOCK();
424#ifdef REGRESSION
425	if (regression_sonewconn_earlytest && over)
426#else
427	if (over)
428#endif
429		return (NULL);
430	VNET_ASSERT(head->so_vnet);
431	so = soalloc(head->so_vnet);
432	if (so == NULL)
433		return (NULL);
434	if ((head->so_options & SO_ACCEPTFILTER) != 0)
435		connstatus = 0;
436	so->so_head = head;
437	so->so_type = head->so_type;
438	so->so_options = head->so_options &~ SO_ACCEPTCONN;
439	so->so_linger = head->so_linger;
440	so->so_state = head->so_state | SS_NOFDREF;
441	so->so_proto = head->so_proto;
442	so->so_cred = crhold(head->so_cred);
443#ifdef MAC
444	mac_socket_newconn(head, so);
445#endif
446	knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv));
447	knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd));
448	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat) ||
449	    (*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
450		sodealloc(so);
451		return (NULL);
452	}
453	so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
454	so->so_snd.sb_lowat = head->so_snd.sb_lowat;
455	so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
456	so->so_snd.sb_timeo = head->so_snd.sb_timeo;
457	so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
458	so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
459	so->so_state |= connstatus;
460	ACCEPT_LOCK();
461	if (connstatus) {
462		TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
463		so->so_qstate |= SQ_COMP;
464		head->so_qlen++;
465	} else {
466		/*
467		 * Keep removing sockets from the head until there's room for
468		 * us to insert on the tail.  In pre-locking revisions, this
469		 * was a simple if(), but as we could be racing with other
470		 * threads and soabort() requires dropping locks, we must
471		 * loop waiting for the condition to be true.
472		 */
473		while (head->so_incqlen > head->so_qlimit) {
474			struct socket *sp;
475			sp = TAILQ_FIRST(&head->so_incomp);
476			TAILQ_REMOVE(&head->so_incomp, sp, so_list);
477			head->so_incqlen--;
478			sp->so_qstate &= ~SQ_INCOMP;
479			sp->so_head = NULL;
480			ACCEPT_UNLOCK();
481			soabort(sp);
482			ACCEPT_LOCK();
483		}
484		TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
485		so->so_qstate |= SQ_INCOMP;
486		head->so_incqlen++;
487	}
488	ACCEPT_UNLOCK();
489	if (connstatus) {
490		sorwakeup(head);
491		wakeup_one(&head->so_timeo);
492	}
493	return (so);
494}
495
496int
497sobind(struct socket *so, struct sockaddr *nam, struct thread *td)
498{
499	int error;
500
501	CURVNET_SET(so->so_vnet);
502	error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td);
503	CURVNET_RESTORE();
504	return error;
505}
506
507/*
508 * solisten() transitions a socket from a non-listening state to a listening
509 * state, but can also be used to update the listen queue depth on an
510 * existing listen socket.  The protocol will call back into the sockets
511 * layer using solisten_proto_check() and solisten_proto() to check and set
512 * socket-layer listen state.  Call backs are used so that the protocol can
513 * acquire both protocol and socket layer locks in whatever order is required
514 * by the protocol.
515 *
516 * Protocol implementors are advised to hold the socket lock across the
517 * socket-layer test and set to avoid races at the socket layer.
518 */
519int
520solisten(struct socket *so, int backlog, struct thread *td)
521{
522
523	return ((*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td));
524}
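/*
 * Illustrative sketch (assumed, not taken from an actual protocol): a
 * protocol's pru_listen implementation would typically bracket the
 * socket-layer check and set with its own lock, roughly as follows, where
 * XXX_LOCK()/XXX_UNLOCK() stand for a hypothetical protocol lock:
 *
 *	static int
 *	xxx_listen(struct socket *so, int backlog, struct thread *td)
 *	{
 *		int error;
 *
 *		XXX_LOCK();
 *		SOCK_LOCK(so);
 *		error = solisten_proto_check(so);
 *		if (error == 0)
 *			solisten_proto(so, backlog);
 *		SOCK_UNLOCK(so);
 *		XXX_UNLOCK();
 *		return (error);
 *	}
 */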
525
526int
527solisten_proto_check(struct socket *so)
528{
529
530	SOCK_LOCK_ASSERT(so);
531
532	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
533	    SS_ISDISCONNECTING))
534		return (EINVAL);
535	return (0);
536}
537
538void
539solisten_proto(struct socket *so, int backlog)
540{
541
542	SOCK_LOCK_ASSERT(so);
543
544	if (backlog < 0 || backlog > somaxconn)
545		backlog = somaxconn;
546	so->so_qlimit = backlog;
547	so->so_options |= SO_ACCEPTCONN;
548}
549
550/*
551 * Attempt to free a socket.  This should really be sotryfree().
552 *
553 * sofree() will succeed if:
554 *
555 * - There are no outstanding file descriptor references or related consumers
556 *   (so_count == 0).
557 *
558 * - The socket has been closed by user space, if ever open (SS_NOFDREF).
559 *
560 * - The protocol does not have an outstanding strong reference on the socket
561 *   (SS_PROTOREF).
562 *
563 * - The socket is not in a completed connection queue, where a process has
564 *   been notified that it is present.  If it were removed, the user process
565 *   could block in accept() despite select() saying the socket was ready.
566 *
567 * Otherwise, it will quietly abort so that a future call to sofree(), when
568 * conditions are right, can succeed.
569 */
570void
571sofree(struct socket *so)
572{
573	struct protosw *pr = so->so_proto;
574	struct socket *head;
575
576	ACCEPT_LOCK_ASSERT();
577	SOCK_LOCK_ASSERT(so);
578
579	if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 ||
580	    (so->so_state & SS_PROTOREF) || (so->so_qstate & SQ_COMP)) {
581		SOCK_UNLOCK(so);
582		ACCEPT_UNLOCK();
583		return;
584	}
585
586	head = so->so_head;
587	if (head != NULL) {
588		KASSERT((so->so_qstate & SQ_COMP) != 0 ||
589		    (so->so_qstate & SQ_INCOMP) != 0,
590		    ("sofree: so_head != NULL, but neither SQ_COMP nor "
591		    "SQ_INCOMP"));
592		KASSERT((so->so_qstate & SQ_COMP) == 0 ||
593		    (so->so_qstate & SQ_INCOMP) == 0,
594		    ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
595		TAILQ_REMOVE(&head->so_incomp, so, so_list);
596		head->so_incqlen--;
597		so->so_qstate &= ~SQ_INCOMP;
598		so->so_head = NULL;
599	}
600	KASSERT((so->so_qstate & SQ_COMP) == 0 &&
601	    (so->so_qstate & SQ_INCOMP) == 0,
602	    ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
603	    so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
604	if (so->so_options & SO_ACCEPTCONN) {
605		KASSERT((TAILQ_EMPTY(&so->so_comp)), ("sofree: so_comp populated"));
606		KASSERT((TAILQ_EMPTY(&so->so_incomp)), ("sofree: so_incomp populated"));
607	}
608	SOCK_UNLOCK(so);
609	ACCEPT_UNLOCK();
610
611	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
612		(*pr->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
613	if (pr->pr_usrreqs->pru_detach != NULL)
614		(*pr->pr_usrreqs->pru_detach)(so);
615
616	/*
617	 * From this point on, we assume that no other references to this
618	 * socket exist anywhere else in the stack.  Therefore, no locks need
619	 * to be acquired or held.
620	 *
621	 * We used to do a lot of socket buffer and socket locking here, as
622	 * well as invoke sorflush() and perform wakeups.  The direct call to
623	 * dom_dispose() and sbrelease_internal() are an inlining of what was
624	 * necessary from sorflush().
625	 *
626	 * Notice that the socket buffer and kqueue state are torn down
627	 * before calling pru_detach.  This means that protocols should not
628	 * assume they can perform socket wakeups, etc., in their detach code.
629	 */
630	sbdestroy(&so->so_snd, so);
631	sbdestroy(&so->so_rcv, so);
632	knlist_destroy(&so->so_rcv.sb_sel.si_note);
633	knlist_destroy(&so->so_snd.sb_sel.si_note);
634	sodealloc(so);
635}
636
637/*
638 * Close a socket on last file table reference removal.  Initiate disconnect
639 * if connected.  Free socket when disconnect complete.
640 *
641 * This function will sorele() the socket.  Note that soclose() may be called
642 * prior to the ref count reaching zero.  The actual socket structure will
643 * not be freed until the ref count reaches zero.
644 */
645int
646soclose(struct socket *so)
647{
648	int error = 0;
649
650	KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));
651
652	CURVNET_SET(so->so_vnet);
653	funsetown(&so->so_sigio);
654	if (so->so_state & SS_ISCONNECTED) {
655		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
656			error = sodisconnect(so);
657			if (error)
658				goto drop;
659		}
660		if (so->so_options & SO_LINGER) {
661			if ((so->so_state & SS_ISDISCONNECTING) &&
662			    (so->so_state & SS_NBIO))
663				goto drop;
664			while (so->so_state & SS_ISCONNECTED) {
665				error = tsleep(&so->so_timeo,
666				    PSOCK | PCATCH, "soclos", so->so_linger * hz);
667				if (error)
668					break;
669			}
670		}
671	}
672
673drop:
674	if (so->so_proto->pr_usrreqs->pru_close != NULL)
675		(*so->so_proto->pr_usrreqs->pru_close)(so);
676	if (so->so_options & SO_ACCEPTCONN) {
677		struct socket *sp;
678		ACCEPT_LOCK();
679		while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
680			TAILQ_REMOVE(&so->so_incomp, sp, so_list);
681			so->so_incqlen--;
682			sp->so_qstate &= ~SQ_INCOMP;
683			sp->so_head = NULL;
684			ACCEPT_UNLOCK();
685			soabort(sp);
686			ACCEPT_LOCK();
687		}
688		while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
689			TAILQ_REMOVE(&so->so_comp, sp, so_list);
690			so->so_qlen--;
691			sp->so_qstate &= ~SQ_COMP;
692			sp->so_head = NULL;
693			ACCEPT_UNLOCK();
694			soabort(sp);
695			ACCEPT_LOCK();
696		}
697		ACCEPT_UNLOCK();
698	}
699	ACCEPT_LOCK();
700	SOCK_LOCK(so);
701	KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
702	so->so_state |= SS_NOFDREF;
703	sorele(so);
704	CURVNET_RESTORE();
705	return (error);
706}
707
708/*
709 * soabort() is used to abruptly tear down a connection, such as when a
710 * resource limit is reached (listen queue depth exceeded), or if a listen
711 * socket is closed while there are sockets waiting to be accepted.
712 *
713 * This interface is tricky, because it is called on an unreferenced socket,
714 * and must be called only by a thread that has actually removed the socket
715 * from the listen queue it was on, or races with other threads are risked.
716 *
717 * This interface will call into the protocol code, so must not be called
718 * with any socket locks held.  Protocols do call it while holding their own
719 * recursible protocol mutexes, but this is something that should be subject
720 * to review in the future.
721 */
722void
723soabort(struct socket *so)
724{
725
726	/*
727	 * In as much as is possible, assert that no references to this
728	 * socket are held.  This is not quite the same as asserting that the
729	 * current thread is responsible for arranging for no references, but
730	 * is as close as we can get for now.
731	 */
732	KASSERT(so->so_count == 0, ("soabort: so_count"));
733	KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF"));
734	KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF"));
735	KASSERT((so->so_qstate & SQ_COMP) == 0, ("soabort: SQ_COMP"));
736	KASSERT((so->so_qstate & SQ_INCOMP) == 0, ("soabort: SQ_INCOMP"));
737
738	if (so->so_proto->pr_usrreqs->pru_abort != NULL)
739		(*so->so_proto->pr_usrreqs->pru_abort)(so);
740	ACCEPT_LOCK();
741	SOCK_LOCK(so);
742	sofree(so);
743}
744
745int
746soaccept(struct socket *so, struct sockaddr **nam)
747{
748	int error;
749
750	SOCK_LOCK(so);
751	KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
752	so->so_state &= ~SS_NOFDREF;
753	SOCK_UNLOCK(so);
754	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
755	return (error);
756}
757
758int
759soconnect(struct socket *so, struct sockaddr *nam, struct thread *td)
760{
761	int error;
762
763	if (so->so_options & SO_ACCEPTCONN)
764		return (EOPNOTSUPP);
765	/*
766	 * If protocol is connection-based, can only connect once.
767	 * Otherwise, if connected, try to disconnect first.  This allows
768	 * user to disconnect by connecting to, e.g., a null address.
769	 */
770	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
771	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
772	    (error = sodisconnect(so)))) {
773		error = EISCONN;
774	} else {
775		/*
776		 * Prevent accumulated error from previous connection from
777		 * biting us.
778		 */
779		so->so_error = 0;
780		CURVNET_SET(so->so_vnet);
781		error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td);
782		CURVNET_RESTORE();
783	}
784
785	return (error);
786}
787
788int
789soconnect2(struct socket *so1, struct socket *so2)
790{
791
792	return ((*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2));
793}
794
795int
796sodisconnect(struct socket *so)
797{
798	int error;
799
800	if ((so->so_state & SS_ISCONNECTED) == 0)
801		return (ENOTCONN);
802	if (so->so_state & SS_ISDISCONNECTING)
803		return (EALREADY);
804	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
805	return (error);
806}
807
808#ifdef ZERO_COPY_SOCKETS
809struct so_zerocopy_stats{
810	int size_ok;
811	int align_ok;
812	int found_ifp;
813};
814struct so_zerocopy_stats so_zerocp_stats = {0,0,0};
815#include <netinet/in.h>
816#include <net/route.h>
817#include <netinet/in_pcb.h>
818#include <vm/vm.h>
819#include <vm/vm_page.h>
820#include <vm/vm_object.h>
821
822/*
823 * sosend_copyin() is only used if zero copy sockets are enabled.  Otherwise
824 * sosend_dgram() and sosend_generic() use m_uiotombuf().
825 *
826 * sosend_copyin() accepts a uio and prepares an mbuf chain holding part or
827 * all of the data referenced by the uio.  If desired, it uses zero-copy.
828 * *space will be updated to reflect data copied in.
829 *
830 * NB: If atomic I/O is requested, the caller must already have checked that
831 * space can hold resid bytes.
832 *
833 * NB: In the event of an error, the caller may need to free the partial
834 * chain pointed to by *mpp.  The contents of both *uio and *space may be
835 * modified even in the case of an error.
836 */
837static int
838sosend_copyin(struct uio *uio, struct mbuf **retmp, int atomic, long *space,
839    int flags)
840{
841	struct mbuf *m, **mp, *top;
842	long len, resid;
843	int error;
844#ifdef ZERO_COPY_SOCKETS
845	int cow_send;
846#endif
847
848	*retmp = top = NULL;
849	mp = &top;
850	len = 0;
851	resid = uio->uio_resid;
852	error = 0;
853	do {
854#ifdef ZERO_COPY_SOCKETS
855		cow_send = 0;
856#endif /* ZERO_COPY_SOCKETS */
857		if (resid >= MINCLSIZE) {
858#ifdef ZERO_COPY_SOCKETS
859			if (top == NULL) {
860				m = m_gethdr(M_WAITOK, MT_DATA);
861				m->m_pkthdr.len = 0;
862				m->m_pkthdr.rcvif = NULL;
863			} else
864				m = m_get(M_WAITOK, MT_DATA);
865			if (so_zero_copy_send &&
866			    resid>=PAGE_SIZE &&
867			    *space>=PAGE_SIZE &&
868			    uio->uio_iov->iov_len>=PAGE_SIZE) {
869				so_zerocp_stats.size_ok++;
870				so_zerocp_stats.align_ok++;
871				cow_send = socow_setup(m, uio);
872				len = cow_send;
873			}
874			if (!cow_send) {
875				m_clget(m, M_WAITOK);
876				len = min(min(MCLBYTES, resid), *space);
877			}
878#else /* ZERO_COPY_SOCKETS */
879			if (top == NULL) {
880				m = m_getcl(M_WAIT, MT_DATA, M_PKTHDR);
881				m->m_pkthdr.len = 0;
882				m->m_pkthdr.rcvif = NULL;
883			} else
884				m = m_getcl(M_WAIT, MT_DATA, 0);
885			len = min(min(MCLBYTES, resid), *space);
886#endif /* ZERO_COPY_SOCKETS */
887		} else {
888			if (top == NULL) {
889				m = m_gethdr(M_WAIT, MT_DATA);
890				m->m_pkthdr.len = 0;
891				m->m_pkthdr.rcvif = NULL;
892
893				len = min(min(MHLEN, resid), *space);
894				/*
895				 * For datagram protocols, leave room
896				 * for protocol headers in first mbuf.
897				 */
898				if (atomic && m && len < MHLEN)
899					MH_ALIGN(m, len);
900			} else {
901				m = m_get(M_WAIT, MT_DATA);
902				len = min(min(MLEN, resid), *space);
903			}
904		}
905		if (m == NULL) {
906			error = ENOBUFS;
907			goto out;
908		}
909
910		*space -= len;
911#ifdef ZERO_COPY_SOCKETS
912		if (cow_send)
913			error = 0;
914		else
915#endif /* ZERO_COPY_SOCKETS */
916		error = uiomove(mtod(m, void *), (int)len, uio);
917		resid = uio->uio_resid;
918		m->m_len = len;
919		*mp = m;
920		top->m_pkthdr.len += len;
921		if (error)
922			goto out;
923		mp = &m->m_next;
924		if (resid <= 0) {
925			if (flags & MSG_EOR)
926				top->m_flags |= M_EOR;
927			break;
928		}
929	} while (*space > 0 && atomic);
930out:
931	*retmp = top;
932	return (error);
933}
934#endif /*ZERO_COPY_SOCKETS*/
935
936#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
937
938int
939sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio,
940    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
941{
942	long space, resid;
943	int clen = 0, error, dontroute;
944#ifdef ZERO_COPY_SOCKETS
945	int atomic = sosendallatonce(so) || top;
946#endif
947
948	KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM"));
949	KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
950	    ("sosend_dgram: !PR_ATOMIC"));
951
952	if (uio != NULL)
953		resid = uio->uio_resid;
954	else
955		resid = top->m_pkthdr.len;
956	/*
957	 * In theory resid should be unsigned.  However, space must be
958	 * signed, as it might be less than 0 if we over-committed, and we
959	 * must use a signed comparison of space and resid.  On the other
960	 * hand, a negative resid causes us to loop sending 0-length
961	 * segments to the protocol.
962	 *
963	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
964	 * type sockets since that's an error.
965	 */
966	if (resid < 0) {
967		error = EINVAL;
968		goto out;
969	}
970
971	dontroute =
972	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0;
973	if (td != NULL)
974		td->td_ru.ru_msgsnd++;
975	if (control != NULL)
976		clen = control->m_len;
977
978	SOCKBUF_LOCK(&so->so_snd);
979	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
980		SOCKBUF_UNLOCK(&so->so_snd);
981		error = EPIPE;
982		goto out;
983	}
984	if (so->so_error) {
985		error = so->so_error;
986		so->so_error = 0;
987		SOCKBUF_UNLOCK(&so->so_snd);
988		goto out;
989	}
990	if ((so->so_state & SS_ISCONNECTED) == 0) {
991		/*
992		 * `sendto' and `sendmsg' are allowed on a connection-based
993		 * socket if it supports implied connect.  Return ENOTCONN if
994		 * not connected and no address is supplied.
995		 */
996		if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
997		    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
998			if ((so->so_state & SS_ISCONFIRMING) == 0 &&
999			    !(resid == 0 && clen != 0)) {
1000				SOCKBUF_UNLOCK(&so->so_snd);
1001				error = ENOTCONN;
1002				goto out;
1003			}
1004		} else if (addr == NULL) {
1005			if (so->so_proto->pr_flags & PR_CONNREQUIRED)
1006				error = ENOTCONN;
1007			else
1008				error = EDESTADDRREQ;
1009			SOCKBUF_UNLOCK(&so->so_snd);
1010			goto out;
1011		}
1012	}
1013
1014	/*
1015	 * Do we need MSG_OOB support in SOCK_DGRAM?  The signed arithmetic here
1016	 * may be a problem and need fixing.
1017	 */
1018	space = sbspace(&so->so_snd);
1019	if (flags & MSG_OOB)
1020		space += 1024;
1021	space -= clen;
1022	SOCKBUF_UNLOCK(&so->so_snd);
1023	if (resid > space) {
1024		error = EMSGSIZE;
1025		goto out;
1026	}
1027	if (uio == NULL) {
1028		resid = 0;
1029		if (flags & MSG_EOR)
1030			top->m_flags |= M_EOR;
1031	} else {
1032#ifdef ZERO_COPY_SOCKETS
1033		error = sosend_copyin(uio, &top, atomic, &space, flags);
1034		if (error)
1035			goto out;
1036#else
1037		/*
1038		 * Copy the data from userland into a mbuf chain.
1039		 * If no data is to be copied in, a single empty mbuf
1040		 * is returned.
1041		 */
1042		top = m_uiotombuf(uio, M_WAITOK, space, max_hdr,
1043		    (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0)));
1044		if (top == NULL) {
1045			error = EFAULT;	/* only possible error */
1046			goto out;
1047		}
1048		space -= resid - uio->uio_resid;
1049#endif
1050		resid = uio->uio_resid;
1051	}
1052	KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
1053	/*
1054	 * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock
1055	 * than with.
1056	 */
1057	if (dontroute) {
1058		SOCK_LOCK(so);
1059		so->so_options |= SO_DONTROUTE;
1060		SOCK_UNLOCK(so);
1061	}
1062	/*
1063	 * XXX all the SBS_CANTSENDMORE checks previously done could be out
1064	 * of date.  We could have received a reset packet in an interrupt or
1065	 * maybe we slept while doing page faults in uiomove() etc.  We could
1066	 * probably recheck again inside the locking protection here, but
1067	 * there are probably other places that this also happens.  We must
1068	 * rethink this.
1069	 */
1070	error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1071	    (flags & MSG_OOB) ? PRUS_OOB :
1072	/*
1073	 * If the user set MSG_EOF, the protocol understands this flag, and there
1074	 * is nothing left to send, then use PRU_SEND_EOF instead of PRU_SEND.
1075	 */
1076	    ((flags & MSG_EOF) &&
1077	     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1078	     (resid <= 0)) ?
1079		PRUS_EOF :
1080		/* If there is more to send set PRUS_MORETOCOME */
1081		(resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
1082		top, addr, control, td);
1083	if (dontroute) {
1084		SOCK_LOCK(so);
1085		so->so_options &= ~SO_DONTROUTE;
1086		SOCK_UNLOCK(so);
1087	}
1088	clen = 0;
1089	control = NULL;
1090	top = NULL;
1091out:
1092	if (top != NULL)
1093		m_freem(top);
1094	if (control != NULL)
1095		m_freem(control);
1096	return (error);
1097}
1098
1099/*
1100 * Send on a socket.  If send must go all at once and message is larger than
1101 * send buffering, then hard error.  Lock against other senders.  If must go
1102 * all at once and not enough room now, then inform user that this would
1103 * block and do nothing.  Otherwise, if nonblocking, send as much as
1104 * possible.  The data to be sent is described by "uio" if nonzero, otherwise
1105 * by the mbuf chain "top" (which must be null if uio is not).  Data provided
1106 * in mbuf chain must be small enough to send all at once.
1107 *
1108 * Returns nonzero on error, timeout or signal; callers must check for short
1109 * counts if EINTR/ERESTART are returned.  Data and control buffers are freed
1110 * on return.
1111 */
1112int
1113sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio,
1114    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
1115{
1116	long space, resid;
1117	int clen = 0, error, dontroute;
1118	int atomic = sosendallatonce(so) || top;
1119
1120	if (uio != NULL)
1121		resid = uio->uio_resid;
1122	else
1123		resid = top->m_pkthdr.len;
1124	/*
1125	 * In theory resid should be unsigned.  However, space must be
1126	 * signed, as it might be less than 0 if we over-committed, and we
1127	 * must use a signed comparison of space and resid.  On the other
1128	 * hand, a negative resid causes us to loop sending 0-length
1129	 * segments to the protocol.
1130	 *
1131	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
1132	 * type sockets since that's an error.
1133	 */
1134	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
1135		error = EINVAL;
1136		goto out;
1137	}
1138
1139	dontroute =
1140	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
1141	    (so->so_proto->pr_flags & PR_ATOMIC);
1142	if (td != NULL)
1143		td->td_ru.ru_msgsnd++;
1144	if (control != NULL)
1145		clen = control->m_len;
1146
1147	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1148	if (error)
1149		goto out;
1150
1151restart:
1152	do {
1153		SOCKBUF_LOCK(&so->so_snd);
1154		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
1155			SOCKBUF_UNLOCK(&so->so_snd);
1156			error = EPIPE;
1157			goto release;
1158		}
1159		if (so->so_error) {
1160			error = so->so_error;
1161			so->so_error = 0;
1162			SOCKBUF_UNLOCK(&so->so_snd);
1163			goto release;
1164		}
1165		if ((so->so_state & SS_ISCONNECTED) == 0) {
1166			/*
1167			 * `sendto' and `sendmsg' are allowed on a connection-
1168			 * based socket if it supports implied connect.
1169			 * Return ENOTCONN if not connected and no address is
1170			 * supplied.
1171			 */
1172			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
1173			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
1174				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1175				    !(resid == 0 && clen != 0)) {
1176					SOCKBUF_UNLOCK(&so->so_snd);
1177					error = ENOTCONN;
1178					goto release;
1179				}
1180			} else if (addr == NULL) {
1181				SOCKBUF_UNLOCK(&so->so_snd);
1182				if (so->so_proto->pr_flags & PR_CONNREQUIRED)
1183					error = ENOTCONN;
1184				else
1185					error = EDESTADDRREQ;
1186				goto release;
1187			}
1188		}
1189		space = sbspace(&so->so_snd);
1190		if (flags & MSG_OOB)
1191			space += 1024;
1192		if ((atomic && resid > so->so_snd.sb_hiwat) ||
1193		    clen > so->so_snd.sb_hiwat) {
1194			SOCKBUF_UNLOCK(&so->so_snd);
1195			error = EMSGSIZE;
1196			goto release;
1197		}
1198		if (space < resid + clen &&
1199		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
1200			if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) {
1201				SOCKBUF_UNLOCK(&so->so_snd);
1202				error = EWOULDBLOCK;
1203				goto release;
1204			}
1205			error = sbwait(&so->so_snd);
1206			SOCKBUF_UNLOCK(&so->so_snd);
1207			if (error)
1208				goto release;
1209			goto restart;
1210		}
1211		SOCKBUF_UNLOCK(&so->so_snd);
1212		space -= clen;
1213		do {
1214			if (uio == NULL) {
1215				resid = 0;
1216				if (flags & MSG_EOR)
1217					top->m_flags |= M_EOR;
1218			} else {
1219#ifdef ZERO_COPY_SOCKETS
1220				error = sosend_copyin(uio, &top, atomic,
1221				    &space, flags);
1222				if (error != 0)
1223					goto release;
1224#else
1225				/*
1226				 * Copy the data from userland into a mbuf
1227				 * chain.  If no data is to be copied in,
1228				 * a single empty mbuf is returned.
1229				 */
1230				top = m_uiotombuf(uio, M_WAITOK, space,
1231				    (atomic ? max_hdr : 0),
1232				    (atomic ? M_PKTHDR : 0) |
1233				    ((flags & MSG_EOR) ? M_EOR : 0));
1234				if (top == NULL) {
1235					error = EFAULT; /* only possible error */
1236					goto release;
1237				}
1238				space -= resid - uio->uio_resid;
1239#endif
1240				resid = uio->uio_resid;
1241			}
1242			if (dontroute) {
1243				SOCK_LOCK(so);
1244				so->so_options |= SO_DONTROUTE;
1245				SOCK_UNLOCK(so);
1246			}
1247			/*
1248			 * XXX all the SBS_CANTSENDMORE checks previously
1249			 * done could be out of date.  We could have received
1250			 * a reset packet in an interrupt or maybe we slept
1251			 * while doing page faults in uiomove() etc.  We
1252			 * could probably recheck again inside the locking
1253			 * protection here, but there are probably other
1254			 * places that this also happens.  We must rethink
1255			 * this.
1256			 */
1257			error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1258			    (flags & MSG_OOB) ? PRUS_OOB :
1259			/*
1260			 * If the user set MSG_EOF, the protocol understands
1261			 * this flag, and there is nothing left to send, then use
1262			 * PRU_SEND_EOF instead of PRU_SEND.
1263			 */
1264			    ((flags & MSG_EOF) &&
1265			     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1266			     (resid <= 0)) ?
1267				PRUS_EOF :
1268			/* If there is more to send set PRUS_MORETOCOME. */
1269			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
1270			    top, addr, control, td);
1271			if (dontroute) {
1272				SOCK_LOCK(so);
1273				so->so_options &= ~SO_DONTROUTE;
1274				SOCK_UNLOCK(so);
1275			}
1276			clen = 0;
1277			control = NULL;
1278			top = NULL;
1279			if (error)
1280				goto release;
1281		} while (resid && space > 0);
1282	} while (resid);
1283
1284release:
1285	sbunlock(&so->so_snd);
1286out:
1287	if (top != NULL)
1288		m_freem(top);
1289	if (control != NULL)
1290		m_freem(control);
1291	return (error);
1292}
1293
1294int
1295sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
1296    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
1297{
1298	int error;
1299
1300	CURVNET_SET(so->so_vnet);
1301	error = so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top,
1302	    control, flags, td);
1303	CURVNET_RESTORE();
1304	return (error);
1305}
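/*
 * Illustrative sketch (an assumption, not part of this file): a kernel
 * consumer that already has its data in an mbuf chain can pass it to
 * sosend() with a NULL uio; the chain and any control mbufs are consumed
 * on return.  'so', 'data' and 'len' are assumed to be set up by the
 * caller, and len must fit in a single cluster here:
 *
 *	struct mbuf *m;
 *	int error;
 *
 *	m = m_getcl(M_WAITOK, MT_DATA, M_PKTHDR);
 *	memcpy(mtod(m, void *), data, len);
 *	m->m_len = m->m_pkthdr.len = len;
 *	error = sosend(so, NULL, NULL, m, NULL, 0, curthread);
 */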
1306
1307/*
1308 * The part of soreceive() that implements reading non-inline out-of-band
1309 * data from a socket.  For more complete comments, see soreceive(), from
1310 * which this code originated.
1311 *
1312 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
1313 * unable to return an mbuf chain to the caller.
1314 */
1315static int
1316soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
1317{
1318	struct protosw *pr = so->so_proto;
1319	struct mbuf *m;
1320	int error;
1321
1322	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
1323
1324	m = m_get(M_WAIT, MT_DATA);
1325	error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
1326	if (error)
1327		goto bad;
1328	do {
1329#ifdef ZERO_COPY_SOCKETS
1330		if (so_zero_copy_receive) {
1331			int disposable;
1332
1333			if ((m->m_flags & M_EXT)
1334			 && (m->m_ext.ext_type == EXT_DISPOSABLE))
1335				disposable = 1;
1336			else
1337				disposable = 0;
1338
1339			error = uiomoveco(mtod(m, void *),
1340					  min(uio->uio_resid, m->m_len),
1341					  uio, disposable);
1342		} else
1343#endif /* ZERO_COPY_SOCKETS */
1344		error = uiomove(mtod(m, void *),
1345		    (int) min(uio->uio_resid, m->m_len), uio);
1346		m = m_free(m);
1347	} while (uio->uio_resid && error == 0 && m);
1348bad:
1349	if (m != NULL)
1350		m_freem(m);
1351	return (error);
1352}
1353
1354/*
1355 * Following replacement or removal of the first mbuf on the first mbuf chain
1356 * of a socket buffer, push necessary state changes back into the socket
1357 * buffer so that other consumers see the values consistently.  'nextrecord'
1358 * is the caller's locally stored copy of the original value of
1359 * sb->sb_mb->m_nextpkt, which must be restored when the lead mbuf changes.
1360 * NOTE: 'nextrecord' may be NULL.
1361 */
1362static __inline void
1363sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
1364{
1365
1366	SOCKBUF_LOCK_ASSERT(sb);
1367	/*
1368	 * First, update for the new value of nextrecord.  If necessary, make
1369	 * it the first record.
1370	 */
1371	if (sb->sb_mb != NULL)
1372		sb->sb_mb->m_nextpkt = nextrecord;
1373	else
1374		sb->sb_mb = nextrecord;
1375
1376        /*
1377         * Now update any dependent socket buffer fields to reflect the new
1378         * state.  This is an expanded inline of SB_EMPTY_FIXUP(), with the
1379	 * addition of a second clause that takes care of the case where
1380	 * sb_mb has been updated, but remains the last record.
1381         */
1382        if (sb->sb_mb == NULL) {
1383                sb->sb_mbtail = NULL;
1384                sb->sb_lastrecord = NULL;
1385        } else if (sb->sb_mb->m_nextpkt == NULL)
1386                sb->sb_lastrecord = sb->sb_mb;
1387}
1388
1389
1390/*
1391 * Implement receive operations on a socket.  We depend on the way that
1392 * records are added to the sockbuf by sbappend.  In particular, each record
1393 * (mbufs linked through m_next) must begin with an address if the protocol
1394 * so specifies, followed by an optional mbuf or mbufs containing ancillary
1395 * data, and then zero or more mbufs of data.  In order to allow parallelism
1396 * between network receive and copying to user space, as well as avoid
1397 * sleeping with a mutex held, we release the socket buffer mutex during the
1398 * user space copy.  Although the sockbuf is locked, new data may still be
1399 * appended, and thus we must maintain consistency of the sockbuf during that
1400 * time.
1401 *
1402 * The caller may receive the data as a single mbuf chain by supplying an
1403 * mbuf **mp0 for use in returning the chain.  The uio is then used only for
1404 * the count in uio_resid.
1405 */
1406int
1407soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio,
1408    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1409{
1410	struct mbuf *m, **mp;
1411	int flags, len, error, offset;
1412	struct protosw *pr = so->so_proto;
1413	struct mbuf *nextrecord;
1414	int moff, type = 0;
1415	int orig_resid = uio->uio_resid;
1416
1417	mp = mp0;
1418	if (psa != NULL)
1419		*psa = NULL;
1420	if (controlp != NULL)
1421		*controlp = NULL;
1422	if (flagsp != NULL)
1423		flags = *flagsp &~ MSG_EOR;
1424	else
1425		flags = 0;
1426	if (flags & MSG_OOB)
1427		return (soreceive_rcvoob(so, uio, flags));
1428	if (mp != NULL)
1429		*mp = NULL;
1430	if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING)
1431	    && uio->uio_resid)
1432		(*pr->pr_usrreqs->pru_rcvd)(so, 0);
1433
1434	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
1435	if (error)
1436		return (error);
1437
1438restart:
1439	SOCKBUF_LOCK(&so->so_rcv);
1440	m = so->so_rcv.sb_mb;
1441	/*
1442	 * If we have less data than requested, block awaiting more (subject
1443	 * to any timeout) if:
1444	 *   1. the current count is less than the low water mark, or
1445	 *   2. MSG_WAITALL is set, and it is possible to do the entire
1446	 *	receive operation at once if we block (resid <= hiwat).
1447	 *   3. MSG_DONTWAIT is not set
1448	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
1449	 * we have to do the receive in sections, and thus risk returning a
1450	 * short count if a timeout or signal occurs after we start.
1451	 */
1452	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
1453	    so->so_rcv.sb_cc < uio->uio_resid) &&
1454	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
1455	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
1456	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
1457		KASSERT(m != NULL || !so->so_rcv.sb_cc,
1458		    ("receive: m == %p so->so_rcv.sb_cc == %u",
1459		    m, so->so_rcv.sb_cc));
1460		if (so->so_error) {
1461			if (m != NULL)
1462				goto dontblock;
1463			error = so->so_error;
1464			if ((flags & MSG_PEEK) == 0)
1465				so->so_error = 0;
1466			SOCKBUF_UNLOCK(&so->so_rcv);
1467			goto release;
1468		}
1469		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1470		if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
1471			if (m == NULL) {
1472				SOCKBUF_UNLOCK(&so->so_rcv);
1473				goto release;
1474			} else
1475				goto dontblock;
1476		}
1477		for (; m != NULL; m = m->m_next)
1478			if (m->m_type == MT_OOBDATA  || (m->m_flags & M_EOR)) {
1479				m = so->so_rcv.sb_mb;
1480				goto dontblock;
1481			}
1482		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
1483		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
1484			SOCKBUF_UNLOCK(&so->so_rcv);
1485			error = ENOTCONN;
1486			goto release;
1487		}
1488		if (uio->uio_resid == 0) {
1489			SOCKBUF_UNLOCK(&so->so_rcv);
1490			goto release;
1491		}
1492		if ((so->so_state & SS_NBIO) ||
1493		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
1494			SOCKBUF_UNLOCK(&so->so_rcv);
1495			error = EWOULDBLOCK;
1496			goto release;
1497		}
1498		SBLASTRECORDCHK(&so->so_rcv);
1499		SBLASTMBUFCHK(&so->so_rcv);
1500		error = sbwait(&so->so_rcv);
1501		SOCKBUF_UNLOCK(&so->so_rcv);
1502		if (error)
1503			goto release;
1504		goto restart;
1505	}
1506dontblock:
1507	/*
1508	 * From this point onward, we maintain 'nextrecord' as a cache of the
1509	 * pointer to the next record in the socket buffer.  We must keep the
1510	 * various socket buffer pointers and local stack versions of the
1511	 * pointers in sync, pushing out modifications before dropping the
1512	 * socket buffer mutex, and re-reading them when picking it up.
1513	 *
1514	 * Otherwise, we will race with the network stack appending new data
1515	 * or records onto the socket buffer by using inconsistent/stale
1516	 * versions of the field, possibly resulting in socket buffer
1517	 * corruption.
1518	 *
1519	 * By holding the high-level sblock(), we prevent simultaneous
1520	 * readers from pulling off the front of the socket buffer.
1521	 */
1522	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1523	if (uio->uio_td)
1524		uio->uio_td->td_ru.ru_msgrcv++;
1525	KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
1526	SBLASTRECORDCHK(&so->so_rcv);
1527	SBLASTMBUFCHK(&so->so_rcv);
1528	nextrecord = m->m_nextpkt;
1529	if (pr->pr_flags & PR_ADDR) {
1530		KASSERT(m->m_type == MT_SONAME,
1531		    ("m->m_type == %d", m->m_type));
1532		orig_resid = 0;
1533		if (psa != NULL)
1534			*psa = sodupsockaddr(mtod(m, struct sockaddr *),
1535			    M_NOWAIT);
1536		if (flags & MSG_PEEK) {
1537			m = m->m_next;
1538		} else {
1539			sbfree(&so->so_rcv, m);
1540			so->so_rcv.sb_mb = m_free(m);
1541			m = so->so_rcv.sb_mb;
1542			sockbuf_pushsync(&so->so_rcv, nextrecord);
1543		}
1544	}
1545
1546	/*
1547	 * Process one or more MT_CONTROL mbufs present before any data mbufs
1548	 * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
1549	 * just copy the data; if !MSG_PEEK, we call into the protocol to
1550	 * perform externalization (or freeing if controlp == NULL).
1551	 */
1552	if (m != NULL && m->m_type == MT_CONTROL) {
1553		struct mbuf *cm = NULL, *cmn;
1554		struct mbuf **cme = &cm;
1555
1556		do {
1557			if (flags & MSG_PEEK) {
1558				if (controlp != NULL) {
1559					*controlp = m_copy(m, 0, m->m_len);
1560					controlp = &(*controlp)->m_next;
1561				}
1562				m = m->m_next;
1563			} else {
1564				sbfree(&so->so_rcv, m);
1565				so->so_rcv.sb_mb = m->m_next;
1566				m->m_next = NULL;
1567				*cme = m;
1568				cme = &(*cme)->m_next;
1569				m = so->so_rcv.sb_mb;
1570			}
1571		} while (m != NULL && m->m_type == MT_CONTROL);
1572		if ((flags & MSG_PEEK) == 0)
1573			sockbuf_pushsync(&so->so_rcv, nextrecord);
1574		while (cm != NULL) {
1575			cmn = cm->m_next;
1576			cm->m_next = NULL;
1577			if (pr->pr_domain->dom_externalize != NULL) {
1578				SOCKBUF_UNLOCK(&so->so_rcv);
1579				error = (*pr->pr_domain->dom_externalize)
1580				    (cm, controlp);
1581				SOCKBUF_LOCK(&so->so_rcv);
1582			} else if (controlp != NULL)
1583				*controlp = cm;
1584			else
1585				m_freem(cm);
1586			if (controlp != NULL) {
1587				orig_resid = 0;
1588				while (*controlp != NULL)
1589					controlp = &(*controlp)->m_next;
1590			}
1591			cm = cmn;
1592		}
1593		if (m != NULL)
1594			nextrecord = so->so_rcv.sb_mb->m_nextpkt;
1595		else
1596			nextrecord = so->so_rcv.sb_mb;
1597		orig_resid = 0;
1598	}
1599	if (m != NULL) {
1600		if ((flags & MSG_PEEK) == 0) {
1601			KASSERT(m->m_nextpkt == nextrecord,
1602			    ("soreceive: post-control, nextrecord !sync"));
1603			if (nextrecord == NULL) {
1604				KASSERT(so->so_rcv.sb_mb == m,
1605				    ("soreceive: post-control, sb_mb!=m"));
1606				KASSERT(so->so_rcv.sb_lastrecord == m,
1607				    ("soreceive: post-control, lastrecord!=m"));
1608			}
1609		}
1610		type = m->m_type;
1611		if (type == MT_OOBDATA)
1612			flags |= MSG_OOB;
1613	} else {
1614		if ((flags & MSG_PEEK) == 0) {
1615			KASSERT(so->so_rcv.sb_mb == nextrecord,
1616			    ("soreceive: sb_mb != nextrecord"));
1617			if (so->so_rcv.sb_mb == NULL) {
1618				KASSERT(so->so_rcv.sb_lastrecord == NULL,
1619				    ("soreceive: sb_lastrecord != NULL"));
1620			}
1621		}
1622	}
1623	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1624	SBLASTRECORDCHK(&so->so_rcv);
1625	SBLASTMBUFCHK(&so->so_rcv);
1626
1627	/*
1628	 * Now continue to read any data mbufs off of the head of the socket
1629	 * buffer until the read request is satisfied.  Note that 'type' is
1630	 * used to store the type of any mbuf reads that have happened so far
1631	 * such that soreceive() can stop reading if the type changes, which
1632	 * causes soreceive() to return only one of regular data and inline
1633	 * out-of-band data in a single socket receive operation.
1634	 */
1635	moff = 0;
1636	offset = 0;
1637	while (m != NULL && uio->uio_resid > 0 && error == 0) {
1638		/*
1639		 * If the type of mbuf has changed since the last mbuf
1640		 * examined ('type'), end the receive operation.
1641	 	 */
1642		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1643		if (m->m_type == MT_OOBDATA) {
1644			if (type != MT_OOBDATA)
1645				break;
1646		} else if (type == MT_OOBDATA)
1647			break;
1648		else
1649		    KASSERT(m->m_type == MT_DATA,
1650			("m->m_type == %d", m->m_type));
1651		so->so_rcv.sb_state &= ~SBS_RCVATMARK;
1652		len = uio->uio_resid;
1653		if (so->so_oobmark && len > so->so_oobmark - offset)
1654			len = so->so_oobmark - offset;
1655		if (len > m->m_len - moff)
1656			len = m->m_len - moff;
1657		/*
1658		 * If mp is set, just pass back the mbufs.  Otherwise copy
1659		 * them out via the uio, then free.  Sockbuf must be
1660		 * consistent here (points to current mbuf, it points to next
1661		 * record) when we drop priority; we must note any additions
1662		 * to the sockbuf when we block interrupts again.
1663		 */
1664		if (mp == NULL) {
1665			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1666			SBLASTRECORDCHK(&so->so_rcv);
1667			SBLASTMBUFCHK(&so->so_rcv);
1668			SOCKBUF_UNLOCK(&so->so_rcv);
1669#ifdef ZERO_COPY_SOCKETS
1670			if (so_zero_copy_receive) {
1671				int disposable;
1672
1673				if ((m->m_flags & M_EXT)
1674				 && (m->m_ext.ext_type == EXT_DISPOSABLE))
1675					disposable = 1;
1676				else
1677					disposable = 0;
1678
1679				error = uiomoveco(mtod(m, char *) + moff,
1680						  (int)len, uio,
1681						  disposable);
1682			} else
1683#endif /* ZERO_COPY_SOCKETS */
1684			error = uiomove(mtod(m, char *) + moff, (int)len, uio);
1685			SOCKBUF_LOCK(&so->so_rcv);
1686			if (error) {
1687				/*
1688				 * The MT_SONAME mbuf has already been removed
1689				 * from the record, so it is necessary to
1690				 * remove the data mbufs, if any, to preserve
1691				 * the invariant in the case of PR_ADDR that
1692				 * requires MT_SONAME mbufs at the head of
1693				 * each record.
1694				 */
1695				if (m && pr->pr_flags & PR_ATOMIC &&
1696				    ((flags & MSG_PEEK) == 0))
1697					(void)sbdroprecord_locked(&so->so_rcv);
1698				SOCKBUF_UNLOCK(&so->so_rcv);
1699				goto release;
1700			}
1701		} else
1702			uio->uio_resid -= len;
1703		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1704		if (len == m->m_len - moff) {
1705			if (m->m_flags & M_EOR)
1706				flags |= MSG_EOR;
1707			if (flags & MSG_PEEK) {
1708				m = m->m_next;
1709				moff = 0;
1710			} else {
1711				nextrecord = m->m_nextpkt;
1712				sbfree(&so->so_rcv, m);
1713				if (mp != NULL) {
1714					*mp = m;
1715					mp = &m->m_next;
1716					so->so_rcv.sb_mb = m = m->m_next;
1717					*mp = NULL;
1718				} else {
1719					so->so_rcv.sb_mb = m_free(m);
1720					m = so->so_rcv.sb_mb;
1721				}
1722				sockbuf_pushsync(&so->so_rcv, nextrecord);
1723				SBLASTRECORDCHK(&so->so_rcv);
1724				SBLASTMBUFCHK(&so->so_rcv);
1725			}
1726		} else {
1727			if (flags & MSG_PEEK)
1728				moff += len;
1729			else {
1730				if (mp != NULL) {
1731					int copy_flag;
1732
1733					if (flags & MSG_DONTWAIT)
1734						copy_flag = M_DONTWAIT;
1735					else
1736						copy_flag = M_WAIT;
1737					if (copy_flag == M_WAIT)
1738						SOCKBUF_UNLOCK(&so->so_rcv);
1739					*mp = m_copym(m, 0, len, copy_flag);
1740					if (copy_flag == M_WAIT)
1741						SOCKBUF_LOCK(&so->so_rcv);
1742 					if (*mp == NULL) {
1743 						/*
1744 						 * m_copym() couldn't
1745						 * allocate an mbuf.  Adjust
1746						 * uio_resid back (it was
1747						 * adjusted down by len
1748						 * bytes, which we didn't end
1749						 * up "copying" over).
1750 						 */
1751 						uio->uio_resid += len;
1752 						break;
1753 					}
1754				}
1755				m->m_data += len;
1756				m->m_len -= len;
1757				so->so_rcv.sb_cc -= len;
1758			}
1759		}
1760		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1761		if (so->so_oobmark) {
1762			if ((flags & MSG_PEEK) == 0) {
1763				so->so_oobmark -= len;
1764				if (so->so_oobmark == 0) {
1765					so->so_rcv.sb_state |= SBS_RCVATMARK;
1766					break;
1767				}
1768			} else {
1769				offset += len;
1770				if (offset == so->so_oobmark)
1771					break;
1772			}
1773		}
1774		if (flags & MSG_EOR)
1775			break;
1776		/*
1777		 * If the MSG_WAITALL flag is set (for a non-atomic socket), we
1778		 * must not quit until "uio->uio_resid == 0" or an error
1779		 * termination.  If a signal/timeout occurs, return with a
1780		 * short count but without error.  Keep sockbuf locked
1781		 * against other readers.
1782		 */
1783		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
1784		    !sosendallatonce(so) && nextrecord == NULL) {
1785			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1786			if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE)
1787				break;
1788			/*
1789			 * Notify the protocol that some data has been
1790			 * drained before blocking.
1791			 */
1792			if (pr->pr_flags & PR_WANTRCVD) {
1793				SOCKBUF_UNLOCK(&so->so_rcv);
1794				(*pr->pr_usrreqs->pru_rcvd)(so, flags);
1795				SOCKBUF_LOCK(&so->so_rcv);
1796			}
1797			SBLASTRECORDCHK(&so->so_rcv);
1798			SBLASTMBUFCHK(&so->so_rcv);
1799			error = sbwait(&so->so_rcv);
1800			if (error) {
1801				SOCKBUF_UNLOCK(&so->so_rcv);
1802				goto release;
1803			}
1804			m = so->so_rcv.sb_mb;
1805			if (m != NULL)
1806				nextrecord = m->m_nextpkt;
1807		}
1808	}
1809
1810	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1811	if (m != NULL && pr->pr_flags & PR_ATOMIC) {
1812		flags |= MSG_TRUNC;
1813		if ((flags & MSG_PEEK) == 0)
1814			(void) sbdroprecord_locked(&so->so_rcv);
1815	}
1816	if ((flags & MSG_PEEK) == 0) {
1817		if (m == NULL) {
1818			/*
1819			 * First part is an inline SB_EMPTY_FIXUP().  Second
1820			 * part makes sure sb_lastrecord is up-to-date if
1821			 * there is still data in the socket buffer.
1822			 */
1823			so->so_rcv.sb_mb = nextrecord;
1824			if (so->so_rcv.sb_mb == NULL) {
1825				so->so_rcv.sb_mbtail = NULL;
1826				so->so_rcv.sb_lastrecord = NULL;
1827			} else if (nextrecord->m_nextpkt == NULL)
1828				so->so_rcv.sb_lastrecord = nextrecord;
1829		}
1830		SBLASTRECORDCHK(&so->so_rcv);
1831		SBLASTMBUFCHK(&so->so_rcv);
1832		/*
1833		 * If soreceive() is being done from the socket callback,
1834		 * there is no need to generate an ACK to the peer to update the
1835		 * window, since the ACK will be generated on return to TCP.
1836		 */
1837		if (!(flags & MSG_SOCALLBCK) &&
1838		    (pr->pr_flags & PR_WANTRCVD)) {
1839			SOCKBUF_UNLOCK(&so->so_rcv);
1840			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
1841			SOCKBUF_LOCK(&so->so_rcv);
1842		}
1843	}
1844	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1845	if (orig_resid == uio->uio_resid && orig_resid &&
1846	    (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
1847		SOCKBUF_UNLOCK(&so->so_rcv);
1848		goto restart;
1849	}
1850	SOCKBUF_UNLOCK(&so->so_rcv);
1851
1852	if (flagsp != NULL)
1853		*flagsp |= flags;
1854release:
1855	sbunlock(&so->so_rcv);
1856	return (error);
1857}
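
/*
 * A minimal sketch of an in-kernel read through soreceive(), assuming the
 * caller supplies a kernel buffer 'buf' of 'buflen' bytes and holds a valid
 * reference on 'so'; psa, mp0 and controlp are left NULL so data is copied
 * straight into the uio, and 'received' is the resulting byte count:
 *
 *	struct iovec iov;
 *	struct uio auio;
 *	int flags, error;
 *
 *	iov.iov_base = buf;
 *	iov.iov_len = buflen;
 *	auio.uio_iov = &iov;
 *	auio.uio_iovcnt = 1;
 *	auio.uio_offset = 0;
 *	auio.uio_resid = buflen;
 *	auio.uio_segflg = UIO_SYSSPACE;
 *	auio.uio_rw = UIO_READ;
 *	auio.uio_td = curthread;
 *	flags = MSG_DONTWAIT;
 *	error = soreceive(so, NULL, &auio, NULL, NULL, &flags);
 *	received = buflen - auio.uio_resid;
 */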
1858
1859/*
1860 * Optimized version of soreceive() for stream (TCP) sockets.
1861 */
1862int
1863soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio,
1864    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1865{
1866	int len = 0, error = 0, flags, oresid;
1867	struct sockbuf *sb;
1868	struct mbuf *m, *n = NULL;
1869
1870	/* We only do stream sockets. */
1871	if (so->so_type != SOCK_STREAM)
1872		return (EINVAL);
1873	if (psa != NULL)
1874		*psa = NULL;
1875	if (controlp != NULL)
1876		return (EINVAL);
1877	if (flagsp != NULL)
1878		flags = *flagsp &~ MSG_EOR;
1879	else
1880		flags = 0;
1881	if (flags & MSG_OOB)
1882		return (soreceive_rcvoob(so, uio, flags));
1883	if (mp0 != NULL)
1884		*mp0 = NULL;
1885
1886	sb = &so->so_rcv;
1887
1888	/* Prevent other readers from entering the socket. */
1889	error = sblock(sb, SBLOCKWAIT(flags));
1890	if (error)
1891		goto out;
1892	SOCKBUF_LOCK(sb);
1893
1894	/* Easy one, no space to copyout anything. */
1895	if (uio->uio_resid == 0) {
1896		error = EINVAL;
1897		goto out;
1898	}
1899	oresid = uio->uio_resid;
1900
1901	/* We will never ever get anything unless we are connected. */
1902	if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
1903		/* When disconnecting there may still be some data left. */
1904		if (sb->sb_cc > 0)
1905			goto deliver;
1906		if (!(so->so_state & SS_ISDISCONNECTED))
1907			error = ENOTCONN;
1908		goto out;
1909	}
1910
1911	/* Socket buffer is empty and we shall not block. */
1912	if (sb->sb_cc == 0 &&
1913	    ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
1914		error = EAGAIN;
1915		goto out;
1916	}
1917
1918restart:
1919	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1920
1921	/* Abort if socket has reported problems. */
1922	if (so->so_error) {
1923		if (sb->sb_cc > 0)
1924			goto deliver;
1925		if (oresid > uio->uio_resid)
1926			goto out;
1927		error = so->so_error;
1928		if (!(flags & MSG_PEEK))
1929			so->so_error = 0;
1930		goto out;
1931	}
1932
1933	/* Door is closed.  Deliver what is left, if any. */
1934	if (sb->sb_state & SBS_CANTRCVMORE) {
1935		if (sb->sb_cc > 0)
1936			goto deliver;
1937		else
1938			goto out;
1939	}
1940
1941	/* Socket buffer got some data that we shall deliver now. */
1942	if (sb->sb_cc > 0 && !(flags & MSG_WAITALL) &&
1943	    ((so->so_state & SS_NBIO) ||
1944	     (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
1945	     sb->sb_cc >= sb->sb_lowat ||
1946	     sb->sb_cc >= uio->uio_resid ||
1947	     sb->sb_cc >= sb->sb_hiwat)) {
1948		goto deliver;
1949	}
1950
1951	/* On MSG_WAITALL we must wait until all data or error arrives. */
1952	if ((flags & MSG_WAITALL) &&
1953	    (sb->sb_cc >= uio->uio_resid || sb->sb_cc >= sb->sb_lowat))
1954		goto deliver;
1955
1956	/*
1957	 * Wait and block until (more) data comes in.
1958	 * NB: Drops the sockbuf lock during wait.
1959	 */
1960	error = sbwait(sb);
1961	if (error)
1962		goto out;
1963	goto restart;
1964
1965deliver:
1966	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1967	KASSERT(sb->sb_cc > 0, ("%s: sockbuf empty", __func__));
1968	KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));
1969
1970	/* Statistics. */
1971	if (uio->uio_td)
1972		uio->uio_td->td_ru.ru_msgrcv++;
1973
1974	/* Fill uio until full or current end of socket buffer is reached. */
1975	len = min(uio->uio_resid, sb->sb_cc);
1976	if (mp0 != NULL) {
1977		/* Dequeue as many mbufs as possible. */
1978		if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
1979			for (*mp0 = m = sb->sb_mb;
1980			     m != NULL && m->m_len <= len;
1981			     m = m->m_next) {
1982				len -= m->m_len;
1983				uio->uio_resid -= m->m_len;
1984				sbfree(sb, m);
1985				n = m;
1986			}
1987			sb->sb_mb = m;
1988			if (sb->sb_mb == NULL)
1989				SB_EMPTY_FIXUP(sb);
1990			n->m_next = NULL;
1991		}
1992		/* Copy the remainder. */
1993		if (len > 0) {
1994			KASSERT(sb->sb_mb != NULL,
1995			    ("%s: len > 0 && sb->sb_mb empty", __func__));
1996
1997			m = m_copym(sb->sb_mb, 0, len, M_DONTWAIT);
1998			if (m == NULL)
1999				len = 0;	/* Don't flush data from sockbuf. */
2000			else
2001				uio->uio_resid -= m->m_len;
2002			if (*mp0 != NULL)
2003				n->m_next = m;
2004			else
2005				*mp0 = m;
2006			if (*mp0 == NULL) {
2007				error = ENOBUFS;
2008				goto out;
2009			}
2010		}
2011	} else {
2012		/* NB: Must unlock socket buffer as uiomove may sleep. */
2013		SOCKBUF_UNLOCK(sb);
2014		error = m_mbuftouio(uio, sb->sb_mb, len);
2015		SOCKBUF_LOCK(sb);
2016		if (error)
2017			goto out;
2018	}
2019	SBLASTRECORDCHK(sb);
2020	SBLASTMBUFCHK(sb);
2021
2022	/*
2023	 * Remove the delivered data from the socket buffer unless we
2024	 * were only peeking.
2025	 */
2026	if (!(flags & MSG_PEEK)) {
2027		if (len > 0)
2028			sbdrop_locked(sb, len);
2029
2030		/* Notify protocol that we drained some data. */
2031		if ((so->so_proto->pr_flags & PR_WANTRCVD) &&
2032		    (((flags & MSG_WAITALL) && uio->uio_resid > 0) ||
2033		     !(flags & MSG_SOCALLBCK))) {
2034			SOCKBUF_UNLOCK(sb);
2035			(*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags);
2036			SOCKBUF_LOCK(sb);
2037		}
2038	}
2039
2040	/*
2041	 * For MSG_WAITALL we may have to loop again and wait for
2042	 * more data to come in.
2043	 */
2044	if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
2045		goto restart;
2046out:
2047	SOCKBUF_LOCK_ASSERT(sb);
2048	SBLASTRECORDCHK(sb);
2049	SBLASTMBUFCHK(sb);
2050	SOCKBUF_UNLOCK(sb);
2051	sbunlock(sb);
2052	return (error);
2053}
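
/*
 * A protocol opts into this fast path by pointing its pru_soreceive entry at
 * soreceive_stream(); soreceive() below dispatches through
 * so->so_proto->pr_usrreqs->pru_soreceive.  A sketch of the relevant wiring
 * (the surrounding initializer and protocol name are illustrative only):
 *
 *	struct pr_usrreqs foo_usrreqs = {
 *		...
 *		.pru_soreceive =	soreceive_stream,
 *		...
 *	};
 */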
2054
2055/*
2056 * Optimized version of soreceive() for simple datagram cases from userspace.
2057 * Unlike in the stream case, we're able to drop a datagram if copyout()
2058 * fails, and because we handle datagrams atomically, we don't need to use a
2059 * sleep lock to prevent I/O interlacing.
2060 */
2061int
2062soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio,
2063    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2064{
2065	struct mbuf *m, *m2;
2066	int flags, len, error;
2067	struct protosw *pr = so->so_proto;
2068	struct mbuf *nextrecord;
2069
2070	if (psa != NULL)
2071		*psa = NULL;
2072	if (controlp != NULL)
2073		*controlp = NULL;
2074	if (flagsp != NULL)
2075		flags = *flagsp &~ MSG_EOR;
2076	else
2077		flags = 0;
2078
2079	/*
2080	 * For any complicated cases, fall back to the full
2081	 * soreceive_generic().
2082	 */
2083	if (mp0 != NULL || (flags & MSG_PEEK) || (flags & MSG_OOB))
2084		return (soreceive_generic(so, psa, uio, mp0, controlp,
2085		    flagsp));
2086
2087	/*
2088	 * Enforce restrictions on use.
2089	 */
2090	KASSERT((pr->pr_flags & PR_WANTRCVD) == 0,
2091	    ("soreceive_dgram: wantrcvd"));
2092	KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic"));
2093	KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0,
2094	    ("soreceive_dgram: SBS_RCVATMARK"));
2095	KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0,
2096	    ("soreceive_dgram: PR_CONNREQUIRED"));
2097
2098	/*
2099	 * Loop blocking while waiting for a datagram.
2100	 */
2101	SOCKBUF_LOCK(&so->so_rcv);
2102	while ((m = so->so_rcv.sb_mb) == NULL) {
2103		KASSERT(so->so_rcv.sb_cc == 0,
2104		    ("soreceive_dgram: sb_mb NULL but sb_cc %u",
2105		    so->so_rcv.sb_cc));
2106		if (so->so_error) {
2107			error = so->so_error;
2108			so->so_error = 0;
2109			SOCKBUF_UNLOCK(&so->so_rcv);
2110			return (error);
2111		}
2112		if (so->so_rcv.sb_state & SBS_CANTRCVMORE ||
2113		    uio->uio_resid == 0) {
2114			SOCKBUF_UNLOCK(&so->so_rcv);
2115			return (0);
2116		}
2117		if ((so->so_state & SS_NBIO) ||
2118		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
2119			SOCKBUF_UNLOCK(&so->so_rcv);
2120			return (EWOULDBLOCK);
2121		}
2122		SBLASTRECORDCHK(&so->so_rcv);
2123		SBLASTMBUFCHK(&so->so_rcv);
2124		error = sbwait(&so->so_rcv);
2125		if (error) {
2126			SOCKBUF_UNLOCK(&so->so_rcv);
2127			return (error);
2128		}
2129	}
2130	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2131
2132	if (uio->uio_td)
2133		uio->uio_td->td_ru.ru_msgrcv++;
2134	SBLASTRECORDCHK(&so->so_rcv);
2135	SBLASTMBUFCHK(&so->so_rcv);
2136	nextrecord = m->m_nextpkt;
2137	if (nextrecord == NULL) {
2138		KASSERT(so->so_rcv.sb_lastrecord == m,
2139		    ("soreceive_dgram: lastrecord != m"));
2140	}
2141
2142	KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord,
2143	    ("soreceive_dgram: m_nextpkt != nextrecord"));
2144
2145	/*
2146	 * Pull 'm' and its chain off the front of the packet queue.
2147	 */
2148	so->so_rcv.sb_mb = NULL;
2149	sockbuf_pushsync(&so->so_rcv, nextrecord);
2150
2151	/*
2152	 * Walk 'm's chain and free that many bytes from the socket buffer.
2153	 */
2154	for (m2 = m; m2 != NULL; m2 = m2->m_next)
2155		sbfree(&so->so_rcv, m2);
2156
2157	/*
2158	 * Do a few last checks before we let go of the lock.
2159	 */
2160	SBLASTRECORDCHK(&so->so_rcv);
2161	SBLASTMBUFCHK(&so->so_rcv);
2162	SOCKBUF_UNLOCK(&so->so_rcv);
2163
2164	if (pr->pr_flags & PR_ADDR) {
2165		KASSERT(m->m_type == MT_SONAME,
2166		    ("m->m_type == %d", m->m_type));
2167		if (psa != NULL)
2168			*psa = sodupsockaddr(mtod(m, struct sockaddr *),
2169			    M_NOWAIT);
2170		m = m_free(m);
2171	}
2172	if (m == NULL) {
2173		/* XXXRW: Can this happen? */
2174		return (0);
2175	}
2176
2177	/*
2178	 * Packet to copyout() is now in 'm' and it is disconnected from the
2179	 * queue.
2180	 *
2181	 * Process one or more MT_CONTROL mbufs present before any data mbufs
2182	 * in the first mbuf chain on the socket buffer.  We call into the
2183	 * protocol to perform externalization (or freeing if controlp ==
2184	 * NULL).
2185	 */
2186	if (m->m_type == MT_CONTROL) {
2187		struct mbuf *cm = NULL, *cmn;
2188		struct mbuf **cme = &cm;
2189
2190		do {
2191			m2 = m->m_next;
2192			m->m_next = NULL;
2193			*cme = m;
2194			cme = &(*cme)->m_next;
2195			m = m2;
2196		} while (m != NULL && m->m_type == MT_CONTROL);
2197		while (cm != NULL) {
2198			cmn = cm->m_next;
2199			cm->m_next = NULL;
2200			if (pr->pr_domain->dom_externalize != NULL) {
2201				error = (*pr->pr_domain->dom_externalize)
2202				    (cm, controlp);
2203			} else if (controlp != NULL)
2204				*controlp = cm;
2205			else
2206				m_freem(cm);
2207			if (controlp != NULL) {
2208				while (*controlp != NULL)
2209					controlp = &(*controlp)->m_next;
2210			}
2211			cm = cmn;
2212		}
2213	}
2214	KASSERT(m->m_type == MT_DATA, ("soreceive_dgram: !data"));
2215
2216	while (m != NULL && uio->uio_resid > 0) {
2217		len = uio->uio_resid;
2218		if (len > m->m_len)
2219			len = m->m_len;
2220		error = uiomove(mtod(m, char *), (int)len, uio);
2221		if (error) {
2222			m_freem(m);
2223			return (error);
2224		}
2225		m = m_free(m);
2226	}
2227	if (m != NULL)
2228		flags |= MSG_TRUNC;
2229	m_freem(m);
2230	if (flagsp != NULL)
2231		*flagsp |= flags;
2232	return (0);
2233}
2234
2235int
2236soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
2237    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2238{
2239
2240	return (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio, mp0,
2241	    controlp, flagsp));
2242}
2243
2244int
2245soshutdown(struct socket *so, int how)
2246{
2247	struct protosw *pr = so->so_proto;
2248	int error;
2249
2250	if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
2251		return (EINVAL);
2252	if (pr->pr_usrreqs->pru_flush != NULL) {
2253		(*pr->pr_usrreqs->pru_flush)(so, how);
2254	}
2255	if (how != SHUT_WR)
2256		sorflush(so);
2257	if (how != SHUT_RD) {
2258		CURVNET_SET(so->so_vnet);
2259		error = (*pr->pr_usrreqs->pru_shutdown)(so);
2260		CURVNET_RESTORE();
2261		return (error);
2262	}
2263	return (0);
2264}
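
/*
 * A minimal usage sketch, assuming 'so' is a valid, attached socket: shutting
 * down both directions flushes the receive side via sorflush() and asks the
 * protocol to shut down the send side via pru_shutdown():
 *
 *	error = soshutdown(so, SHUT_RDWR);
 */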
2265
2266void
2267sorflush(struct socket *so)
2268{
2269	struct sockbuf *sb = &so->so_rcv;
2270	struct protosw *pr = so->so_proto;
2271	struct sockbuf asb;
2272
2273	/*
2274	 * In order to avoid calling dom_dispose with the socket buffer mutex
2275	 * held, and in order to generally avoid holding the lock for a long
2276	 * time, we make a copy of the socket buffer and clear the original
2277	 * (except locks, state).  The new socket buffer copy won't have
2278	 * initialized locks so we can only call routines that won't use or
2279	 * assert those locks.
2280	 *
2281	 * Dislodge threads currently blocked in receive and wait to acquire
2282	 * a lock against other simultaneous readers before clearing the
2283	 * socket buffer.  Don't let our acquire be interrupted by a signal
2284	 * despite any existing socket disposition on interruptable waiting.
2285	 * despite any existing socket disposition on interruptible waiting.
2286	CURVNET_SET(so->so_vnet);
2287	socantrcvmore(so);
2288	(void) sblock(sb, SBL_WAIT | SBL_NOINTR);
2289
2290	/*
2291	 * Invalidate/clear most of the sockbuf structure, but leave selinfo
2292	 * and mutex data unchanged.
2293	 */
2294	SOCKBUF_LOCK(sb);
2295	bzero(&asb, offsetof(struct sockbuf, sb_startzero));
2296	bcopy(&sb->sb_startzero, &asb.sb_startzero,
2297	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
2298	bzero(&sb->sb_startzero,
2299	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
2300	SOCKBUF_UNLOCK(sb);
2301	sbunlock(sb);
2302
2303	/*
2304	 * Dispose of special rights and flush the socket buffer.  Don't call
2305	 * any unsafe routines (that rely on locks being initialized) on asb.
2306	 */
2307	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
2308		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
2309	sbrelease_internal(&asb, so);
2310	CURVNET_RESTORE();
2311}
2312
2313/*
2314 * Perhaps this routine, and sooptcopyout(), below, ought to come in an
2315 * additional variant to handle the case where the option value needs to be
2316 * some kind of integer, but not a specific size.  In addition to their use
2317 * here, these functions are also called by the protocol-level pr_ctloutput()
2318 * routines.
2319 */
2320int
2321sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
2322{
2323	size_t	valsize;
2324
2325	/*
2326	 * If the user gives us more than we wanted, we ignore it, but if we
2327	 * don't get the minimum length the caller wants, we return EINVAL.
2328	 * On success, sopt->sopt_valsize is set to however much we actually
2329	 * retrieved.
2330	 */
2331	if ((valsize = sopt->sopt_valsize) < minlen)
2332		return EINVAL;
2333	if (valsize > len)
2334		sopt->sopt_valsize = valsize = len;
2335
2336	if (sopt->sopt_td != NULL)
2337		return (copyin(sopt->sopt_val, buf, valsize));
2338
2339	bcopy(sopt->sopt_val, buf, valsize);
2340	return (0);
2341}
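
/*
 * A sketch of how a protocol-level pr_ctloutput() SET handler might use
 * sooptcopyin() to fetch a fixed-size integer argument, mirroring the
 * pattern in sosetopt() below; the option name FOO_LIMIT and the helper
 * foo_setlimit() are hypothetical:
 *
 *	case FOO_LIMIT:
 *		error = sooptcopyin(sopt, &optval, sizeof optval,
 *		    sizeof optval);
 *		if (error)
 *			break;
 *		error = foo_setlimit(so, optval);
 *		break;
 */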
2342
2343/*
2344 * Kernel version of setsockopt(2).
2345 *
2346 * XXX: optlen is size_t, not socklen_t
2347 */
2348int
2349so_setsockopt(struct socket *so, int level, int optname, void *optval,
2350    size_t optlen)
2351{
2352	struct sockopt sopt;
2353
2354	sopt.sopt_level = level;
2355	sopt.sopt_name = optname;
2356	sopt.sopt_dir = SOPT_SET;
2357	sopt.sopt_val = optval;
2358	sopt.sopt_valsize = optlen;
2359	sopt.sopt_td = NULL;
2360	return (sosetopt(so, &sopt));
2361}
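
/*
 * Usage sketch for in-kernel consumers, assuming 'so' is a valid socket:
 * because sopt_td is left NULL, the option value is taken directly from
 * kernel memory rather than via copyin():
 *
 *	int one = 1;
 *
 *	error = so_setsockopt(so, SOL_SOCKET, SO_KEEPALIVE, &one, sizeof(one));
 */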
2362
2363int
2364sosetopt(struct socket *so, struct sockopt *sopt)
2365{
2366	int	error, optval;
2367	struct	linger l;
2368	struct	timeval tv;
2369	u_long  val;
2370#ifdef MAC
2371	struct mac extmac;
2372#endif
2373
2374	error = 0;
2375	if (sopt->sopt_level != SOL_SOCKET) {
2376		if (so->so_proto && so->so_proto->pr_ctloutput)
2377			return ((*so->so_proto->pr_ctloutput)
2378				  (so, sopt));
2379		error = ENOPROTOOPT;
2380	} else {
2381		switch (sopt->sopt_name) {
2382#ifdef INET
2383		case SO_ACCEPTFILTER:
2384			error = do_setopt_accept_filter(so, sopt);
2385			if (error)
2386				goto bad;
2387			break;
2388#endif
2389		case SO_LINGER:
2390			error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
2391			if (error)
2392				goto bad;
2393
2394			SOCK_LOCK(so);
2395			so->so_linger = l.l_linger;
2396			if (l.l_onoff)
2397				so->so_options |= SO_LINGER;
2398			else
2399				so->so_options &= ~SO_LINGER;
2400			SOCK_UNLOCK(so);
2401			break;
2402
2403		case SO_DEBUG:
2404		case SO_KEEPALIVE:
2405		case SO_DONTROUTE:
2406		case SO_USELOOPBACK:
2407		case SO_BROADCAST:
2408		case SO_REUSEADDR:
2409		case SO_REUSEPORT:
2410		case SO_OOBINLINE:
2411		case SO_TIMESTAMP:
2412		case SO_BINTIME:
2413		case SO_NOSIGPIPE:
2414		case SO_NO_DDP:
2415		case SO_NO_OFFLOAD:
2416			error = sooptcopyin(sopt, &optval, sizeof optval,
2417					    sizeof optval);
2418			if (error)
2419				goto bad;
2420			SOCK_LOCK(so);
2421			if (optval)
2422				so->so_options |= sopt->sopt_name;
2423			else
2424				so->so_options &= ~sopt->sopt_name;
2425			SOCK_UNLOCK(so);
2426			break;
2427
2428		case SO_SETFIB:
2429			error = sooptcopyin(sopt, &optval, sizeof optval,
2430					    sizeof optval);
			if (error)
				goto bad;
2431			if (optval < 1 || optval > rt_numfibs) {
2432				error = EINVAL;
2433				goto bad;
2434			}
2435			if ((so->so_proto->pr_domain->dom_family == PF_INET) ||
2436			    (so->so_proto->pr_domain->dom_family == PF_ROUTE)) {
2437				so->so_fibnum = optval;
2438				/* Note: ignore error */
2439				if (so->so_proto && so->so_proto->pr_ctloutput)
2440					(*so->so_proto->pr_ctloutput)(so, sopt);
2441			} else {
2442				so->so_fibnum = 0;
2443			}
2444			break;
2445		case SO_SNDBUF:
2446		case SO_RCVBUF:
2447		case SO_SNDLOWAT:
2448		case SO_RCVLOWAT:
2449			error = sooptcopyin(sopt, &optval, sizeof optval,
2450					    sizeof optval);
2451			if (error)
2452				goto bad;
2453
2454			/*
2455			 * Values < 1 make no sense for any of these options,
2456			 * so disallow them.
2457			 */
2458			if (optval < 1) {
2459				error = EINVAL;
2460				goto bad;
2461			}
2462
2463			switch (sopt->sopt_name) {
2464			case SO_SNDBUF:
2465			case SO_RCVBUF:
2466				if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
2467				    &so->so_snd : &so->so_rcv, (u_long)optval,
2468				    so, curthread) == 0) {
2469					error = ENOBUFS;
2470					goto bad;
2471				}
2472				(sopt->sopt_name == SO_SNDBUF ? &so->so_snd :
2473				    &so->so_rcv)->sb_flags &= ~SB_AUTOSIZE;
2474				break;
2475
2476			/*
2477			 * Make sure the low-water is never greater than the
2478			 * high-water.
2479			 */
2480			case SO_SNDLOWAT:
2481				SOCKBUF_LOCK(&so->so_snd);
2482				so->so_snd.sb_lowat =
2483				    (optval > so->so_snd.sb_hiwat) ?
2484				    so->so_snd.sb_hiwat : optval;
2485				SOCKBUF_UNLOCK(&so->so_snd);
2486				break;
2487			case SO_RCVLOWAT:
2488				SOCKBUF_LOCK(&so->so_rcv);
2489				so->so_rcv.sb_lowat =
2490				    (optval > so->so_rcv.sb_hiwat) ?
2491				    so->so_rcv.sb_hiwat : optval;
2492				SOCKBUF_UNLOCK(&so->so_rcv);
2493				break;
2494			}
2495			break;
2496
2497		case SO_SNDTIMEO:
2498		case SO_RCVTIMEO:
2499#ifdef COMPAT_IA32
2500			if (SV_CURPROC_FLAG(SV_ILP32)) {
2501				struct timeval32 tv32;
2502
2503				error = sooptcopyin(sopt, &tv32, sizeof tv32,
2504				    sizeof tv32);
2505				CP(tv32, tv, tv_sec);
2506				CP(tv32, tv, tv_usec);
2507			} else
2508#endif
2509				error = sooptcopyin(sopt, &tv, sizeof tv,
2510				    sizeof tv);
2511			if (error)
2512				goto bad;
2513
2514			/* assert(hz > 0); */
2515			if (tv.tv_sec < 0 || tv.tv_sec > INT_MAX / hz ||
2516			    tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
2517				error = EDOM;
2518				goto bad;
2519			}
2520			/* assert(tick > 0); */
2521			/* assert(ULONG_MAX - INT_MAX >= 1000000); */
2522			val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick;
2523			if (val > INT_MAX) {
2524				error = EDOM;
2525				goto bad;
2526			}
2527			if (val == 0 && tv.tv_usec != 0)
2528				val = 1;
2529
2530			switch (sopt->sopt_name) {
2531			case SO_SNDTIMEO:
2532				so->so_snd.sb_timeo = val;
2533				break;
2534			case SO_RCVTIMEO:
2535				so->so_rcv.sb_timeo = val;
2536				break;
2537			}
2538			break;
2539
2540		case SO_LABEL:
2541#ifdef MAC
2542			error = sooptcopyin(sopt, &extmac, sizeof extmac,
2543			    sizeof extmac);
2544			if (error)
2545				goto bad;
2546			error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
2547			    so, &extmac);
2548#else
2549			error = EOPNOTSUPP;
2550#endif
2551			break;
2552
2553		default:
2554			error = ENOPROTOOPT;
2555			break;
2556		}
2557		if (error == 0 && so->so_proto != NULL &&
2558		    so->so_proto->pr_ctloutput != NULL) {
2559			(void) ((*so->so_proto->pr_ctloutput)
2560				  (so, sopt));
2561		}
2562	}
2563bad:
2564	return (error);
2565}
2566
2567/*
2568 * Helper routine for getsockopt.
2569 */
2570int
2571sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
2572{
2573	int	error;
2574	size_t	valsize;
2575
2576	error = 0;
2577
2578	/*
2579	 * Documented get behavior is that we always return a value, possibly
2580	 * truncated to fit in the user's buffer.  Traditional behavior is
2581	 * that we always tell the user precisely how much we copied, rather
2582	 * than something useful like the total amount we had available for
2583	 * her.  Note that this interface is not idempotent; the entire
2584	 * answer must be generated ahead of time.
2585	 */
2586	valsize = min(len, sopt->sopt_valsize);
2587	sopt->sopt_valsize = valsize;
2588	if (sopt->sopt_val != NULL) {
2589		if (sopt->sopt_td != NULL)
2590			error = copyout(buf, sopt->sopt_val, valsize);
2591		else
2592			bcopy(buf, sopt->sopt_val, valsize);
2593	}
2594	return (error);
2595}
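
/*
 * The matching GET-side sketch for a protocol pr_ctloutput() handler: build
 * the full answer first, then hand it to sooptcopyout(), which truncates it
 * to the caller's buffer; the option name FOO_LIMIT and the per-socket value
 * 'sc->foo_limit' are hypothetical:
 *
 *	case FOO_LIMIT:
 *		optval = sc->foo_limit;
 *		error = sooptcopyout(sopt, &optval, sizeof optval);
 *		break;
 */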
2596
2597int
2598sogetopt(struct socket *so, struct sockopt *sopt)
2599{
2600	int	error, optval;
2601	struct	linger l;
2602	struct	timeval tv;
2603#ifdef MAC
2604	struct mac extmac;
2605#endif
2606
2607	error = 0;
2608	if (sopt->sopt_level != SOL_SOCKET) {
2609		if (so->so_proto && so->so_proto->pr_ctloutput) {
2610			return ((*so->so_proto->pr_ctloutput)
2611				  (so, sopt));
2612		} else
2613			return (ENOPROTOOPT);
2614	} else {
2615		switch (sopt->sopt_name) {
2616#ifdef INET
2617		case SO_ACCEPTFILTER:
2618			error = do_getopt_accept_filter(so, sopt);
2619			break;
2620#endif
2621		case SO_LINGER:
2622			SOCK_LOCK(so);
2623			l.l_onoff = so->so_options & SO_LINGER;
2624			l.l_linger = so->so_linger;
2625			SOCK_UNLOCK(so);
2626			error = sooptcopyout(sopt, &l, sizeof l);
2627			break;
2628
2629		case SO_USELOOPBACK:
2630		case SO_DONTROUTE:
2631		case SO_DEBUG:
2632		case SO_KEEPALIVE:
2633		case SO_REUSEADDR:
2634		case SO_REUSEPORT:
2635		case SO_BROADCAST:
2636		case SO_OOBINLINE:
2637		case SO_ACCEPTCONN:
2638		case SO_TIMESTAMP:
2639		case SO_BINTIME:
2640		case SO_NOSIGPIPE:
2641			optval = so->so_options & sopt->sopt_name;
2642integer:
2643			error = sooptcopyout(sopt, &optval, sizeof optval);
2644			break;
2645
2646		case SO_TYPE:
2647			optval = so->so_type;
2648			goto integer;
2649
2650		case SO_ERROR:
2651			SOCK_LOCK(so);
2652			optval = so->so_error;
2653			so->so_error = 0;
2654			SOCK_UNLOCK(so);
2655			goto integer;
2656
2657		case SO_SNDBUF:
2658			optval = so->so_snd.sb_hiwat;
2659			goto integer;
2660
2661		case SO_RCVBUF:
2662			optval = so->so_rcv.sb_hiwat;
2663			goto integer;
2664
2665		case SO_SNDLOWAT:
2666			optval = so->so_snd.sb_lowat;
2667			goto integer;
2668
2669		case SO_RCVLOWAT:
2670			optval = so->so_rcv.sb_lowat;
2671			goto integer;
2672
2673		case SO_SNDTIMEO:
2674		case SO_RCVTIMEO:
2675			optval = (sopt->sopt_name == SO_SNDTIMEO ?
2676				  so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
2677
2678			tv.tv_sec = optval / hz;
2679			tv.tv_usec = (optval % hz) * tick;
2680#ifdef COMPAT_IA32
2681			if (SV_CURPROC_FLAG(SV_ILP32)) {
2682				struct timeval32 tv32;
2683
2684				CP(tv, tv32, tv_sec);
2685				CP(tv, tv32, tv_usec);
2686				error = sooptcopyout(sopt, &tv32, sizeof tv32);
2687			} else
2688#endif
2689				error = sooptcopyout(sopt, &tv, sizeof tv);
2690			break;
2691
2692		case SO_LABEL:
2693#ifdef MAC
2694			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2695			    sizeof(extmac));
2696			if (error)
2697				return (error);
2698			error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
2699			    so, &extmac);
2700			if (error)
2701				return (error);
2702			error = sooptcopyout(sopt, &extmac, sizeof extmac);
2703#else
2704			error = EOPNOTSUPP;
2705#endif
2706			break;
2707
2708		case SO_PEERLABEL:
2709#ifdef MAC
2710			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2711			    sizeof(extmac));
2712			if (error)
2713				return (error);
2714			error = mac_getsockopt_peerlabel(
2715			    sopt->sopt_td->td_ucred, so, &extmac);
2716			if (error)
2717				return (error);
2718			error = sooptcopyout(sopt, &extmac, sizeof extmac);
2719#else
2720			error = EOPNOTSUPP;
2721#endif
2722			break;
2723
2724		case SO_LISTENQLIMIT:
2725			optval = so->so_qlimit;
2726			goto integer;
2727
2728		case SO_LISTENQLEN:
2729			optval = so->so_qlen;
2730			goto integer;
2731
2732		case SO_LISTENINCQLEN:
2733			optval = so->so_incqlen;
2734			goto integer;
2735
2736		default:
2737			error = ENOPROTOOPT;
2738			break;
2739		}
2740		return (error);
2741	}
2742}
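
/*
 * A kernel-side getsockopt sketch mirroring so_setsockopt() above: fill in a
 * struct sockopt with SOPT_GET and a kernel buffer (sopt_td == NULL), then
 * call sogetopt(); here SO_ERROR is read into 'optval':
 *
 *	struct sockopt sopt;
 *	int optval;
 *
 *	sopt.sopt_dir = SOPT_GET;
 *	sopt.sopt_level = SOL_SOCKET;
 *	sopt.sopt_name = SO_ERROR;
 *	sopt.sopt_val = &optval;
 *	sopt.sopt_valsize = sizeof(optval);
 *	sopt.sopt_td = NULL;
 *	error = sogetopt(so, &sopt);
 */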
2743
2744/* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */
2745int
2746soopt_getm(struct sockopt *sopt, struct mbuf **mp)
2747{
2748	struct mbuf *m, *m_prev;
2749	int sopt_size = sopt->sopt_valsize;
2750
2751	MGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT, MT_DATA);
2752	if (m == NULL)
2753		return ENOBUFS;
2754	if (sopt_size > MLEN) {
2755		MCLGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT);
2756		if ((m->m_flags & M_EXT) == 0) {
2757			m_free(m);
2758			return ENOBUFS;
2759		}
2760		m->m_len = min(MCLBYTES, sopt_size);
2761	} else {
2762		m->m_len = min(MLEN, sopt_size);
2763	}
2764	sopt_size -= m->m_len;
2765	*mp = m;
2766	m_prev = m;
2767
2768	while (sopt_size) {
2769		MGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT, MT_DATA);
2770		if (m == NULL) {
2771			m_freem(*mp);
2772			return ENOBUFS;
2773		}
2774		if (sopt_size > MLEN) {
2775			MCLGET(m, sopt->sopt_td != NULL ? M_WAIT :
2776			    M_DONTWAIT);
2777			if ((m->m_flags & M_EXT) == 0) {
2778				m_freem(m);
2779				m_freem(*mp);
2780				return ENOBUFS;
2781			}
2782			m->m_len = min(MCLBYTES, sopt_size);
2783		} else {
2784			m->m_len = min(MLEN, sopt_size);
2785		}
2786		sopt_size -= m->m_len;
2787		m_prev->m_next = m;
2788		m_prev = m;
2789	}
2790	return (0);
2791}
2792
2793/* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
2794int
2795soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
2796{
2797	struct mbuf *m0 = m;
2798
2799	if (sopt->sopt_val == NULL)
2800		return (0);
2801	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2802		if (sopt->sopt_td != NULL) {
2803			int error;
2804
2805			error = copyin(sopt->sopt_val, mtod(m, char *),
2806				       m->m_len);
2807			if (error != 0) {
2808				m_freem(m0);
2809				return(error);
2810			}
2811		} else
2812			bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
2813		sopt->sopt_valsize -= m->m_len;
2814		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2815		m = m->m_next;
2816	}
2817	if (m != NULL) /* enough space should have been allocated at ip6_sooptmcopyin() */
2818		panic("ip6_sooptmcopyin");
2819	return (0);
2820}
2821
2822/* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
2823int
2824soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
2825{
2826	struct mbuf *m0 = m;
2827	size_t valsize = 0;
2828
2829	if (sopt->sopt_val == NULL)
2830		return (0);
2831	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2832		if (sopt->sopt_td != NULL) {
2833			int error;
2834
2835			error = copyout(mtod(m, char *), sopt->sopt_val,
2836				       m->m_len);
2837			if (error != 0) {
2838				m_freem(m0);
2839				return(error);
2840			}
2841		} else
2842			bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
2843	       sopt->sopt_valsize -= m->m_len;
2844		sopt->sopt_valsize -= m->m_len;
2845		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2846		valsize += m->m_len;
2847		m = m->m_next;
2848	if (m != NULL) {
2849		/* a large enough soopt buffer should have been supplied from user-land */
2850		m_freem(m0);
2851		return(EINVAL);
2852	}
2853	sopt->sopt_valsize = valsize;
2854	return (0);
2855}
2856
2857/*
2858 * sohasoutofband(): protocol notifies socket layer of the arrival of new
2859 * out-of-band data, which will then notify socket consumers.
2860 */
2861void
2862sohasoutofband(struct socket *so)
2863{
2864
2865	if (so->so_sigio != NULL)
2866		pgsigio(&so->so_sigio, SIGURG, 0);
2867	selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
2868}
2869
2870int
2871sopoll(struct socket *so, int events, struct ucred *active_cred,
2872    struct thread *td)
2873{
2874
2875	return (so->so_proto->pr_usrreqs->pru_sopoll(so, events, active_cred,
2876	    td));
2877}
2878
2879int
2880sopoll_generic(struct socket *so, int events, struct ucred *active_cred,
2881    struct thread *td)
2882{
2883	int revents = 0;
2884
2885	SOCKBUF_LOCK(&so->so_snd);
2886	SOCKBUF_LOCK(&so->so_rcv);
2887	if (events & (POLLIN | POLLRDNORM))
2888		if (soreadable(so))
2889			revents |= events & (POLLIN | POLLRDNORM);
2890
2891	if (events & POLLINIGNEOF)
2892		if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat ||
2893		    !TAILQ_EMPTY(&so->so_comp) || so->so_error)
2894			revents |= POLLINIGNEOF;
2895
2896	if (events & (POLLOUT | POLLWRNORM))
2897		if (sowriteable(so))
2898			revents |= events & (POLLOUT | POLLWRNORM);
2899
2900	if (events & (POLLPRI | POLLRDBAND))
2901		if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK))
2902			revents |= events & (POLLPRI | POLLRDBAND);
2903
2904	if (revents == 0) {
2905		if (events &
2906		    (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM |
2907		     POLLRDBAND)) {
2908			selrecord(td, &so->so_rcv.sb_sel);
2909			so->so_rcv.sb_flags |= SB_SEL;
2910		}
2911
2912		if (events & (POLLOUT | POLLWRNORM)) {
2913			selrecord(td, &so->so_snd.sb_sel);
2914			so->so_snd.sb_flags |= SB_SEL;
2915		}
2916	}
2917
2918	SOCKBUF_UNLOCK(&so->so_rcv);
2919	SOCKBUF_UNLOCK(&so->so_snd);
2920	return (revents);
2921}
2922
2923int
2924soo_kqfilter(struct file *fp, struct knote *kn)
2925{
2926	struct socket *so = kn->kn_fp->f_data;
2927	struct sockbuf *sb;
2928
2929	switch (kn->kn_filter) {
2930	case EVFILT_READ:
2931		if (so->so_options & SO_ACCEPTCONN)
2932			kn->kn_fop = &solisten_filtops;
2933		else
2934			kn->kn_fop = &soread_filtops;
2935		sb = &so->so_rcv;
2936		break;
2937	case EVFILT_WRITE:
2938		kn->kn_fop = &sowrite_filtops;
2939		sb = &so->so_snd;
2940		break;
2941	default:
2942		return (EINVAL);
2943	}
2944
2945	SOCKBUF_LOCK(sb);
2946	knlist_add(&sb->sb_sel.si_note, kn, 1);
2947	sb->sb_flags |= SB_KNOTE;
2948	SOCKBUF_UNLOCK(sb);
2949	return (0);
2950}
2951
2952/*
2953 * Some routines that return EOPNOTSUPP for entry points that are not
2954 * supported by a protocol.  Fill in as needed.
2955 */
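
/*
 * A sketch of how a minimal protocol might wire these stubs into its
 * pr_usrreqs for the entry points it does not implement (field and function
 * names as used in this file; the protocol itself is hypothetical):
 *
 *	struct pr_usrreqs foo_usrreqs = {
 *		.pru_accept =		pru_accept_notsupp,
 *		.pru_connect2 =		pru_connect2_notsupp,
 *		.pru_control =		pru_control_notsupp,
 *		.pru_listen =		pru_listen_notsupp,
 *		.pru_rcvoob =		pru_rcvoob_notsupp,
 *		.pru_sense =		pru_sense_null,
 *		...
 *	};
 */
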
2956int
2957pru_accept_notsupp(struct socket *so, struct sockaddr **nam)
2958{
2959
2960	return EOPNOTSUPP;
2961}
2962
2963int
2964pru_attach_notsupp(struct socket *so, int proto, struct thread *td)
2965{
2966
2967	return EOPNOTSUPP;
2968}
2969
2970int
2971pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
2972{
2973
2974	return EOPNOTSUPP;
2975}
2976
2977int
2978pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
2979{
2980
2981	return EOPNOTSUPP;
2982}
2983
2984int
2985pru_connect2_notsupp(struct socket *so1, struct socket *so2)
2986{
2987
2988	return EOPNOTSUPP;
2989}
2990
2991int
2992pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data,
2993    struct ifnet *ifp, struct thread *td)
2994{
2995
2996	return EOPNOTSUPP;
2997}
2998
2999int
3000pru_disconnect_notsupp(struct socket *so)
3001{
3002
3003	return EOPNOTSUPP;
3004}
3005
3006int
3007pru_listen_notsupp(struct socket *so, int backlog, struct thread *td)
3008{
3009
3010	return EOPNOTSUPP;
3011}
3012
3013int
3014pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam)
3015{
3016
3017	return EOPNOTSUPP;
3018}
3019
3020int
3021pru_rcvd_notsupp(struct socket *so, int flags)
3022{
3023
3024	return EOPNOTSUPP;
3025}
3026
3027int
3028pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags)
3029{
3030
3031	return EOPNOTSUPP;
3032}
3033
3034int
3035pru_send_notsupp(struct socket *so, int flags, struct mbuf *m,
3036    struct sockaddr *addr, struct mbuf *control, struct thread *td)
3037{
3038
3039	return EOPNOTSUPP;
3040}
3041
3042/*
3043 * This isn't really a ``null'' operation, but it's the default one and
3044 * doesn't do anything destructive.
3045 */
3046int
3047pru_sense_null(struct socket *so, struct stat *sb)
3048{
3049
3050	sb->st_blksize = so->so_snd.sb_hiwat;
3051	return 0;
3052}
3053
3054int
3055pru_shutdown_notsupp(struct socket *so)
3056{
3057
3058	return EOPNOTSUPP;
3059}
3060
3061int
3062pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam)
3063{
3064
3065	return EOPNOTSUPP;
3066}
3067
3068int
3069pru_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio,
3070    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
3071{
3072
3073	return EOPNOTSUPP;
3074}
3075
3076int
3077pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr,
3078    struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
3079{
3080
3081	return EOPNOTSUPP;
3082}
3083
3084int
3085pru_sopoll_notsupp(struct socket *so, int events, struct ucred *cred,
3086    struct thread *td)
3087{
3088
3089	return EOPNOTSUPP;
3090}
3091
3092static void
3093filt_sordetach(struct knote *kn)
3094{
3095	struct socket *so = kn->kn_fp->f_data;
3096
3097	SOCKBUF_LOCK(&so->so_rcv);
3098	knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1);
3099	if (knlist_empty(&so->so_rcv.sb_sel.si_note))
3100		so->so_rcv.sb_flags &= ~SB_KNOTE;
3101	SOCKBUF_UNLOCK(&so->so_rcv);
3102}
3103
3104/*ARGSUSED*/
3105static int
3106filt_soread(struct knote *kn, long hint)
3107{
3108	struct socket *so;
3109
3110	so = kn->kn_fp->f_data;
3111	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
3112
3113	kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
3114	if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
3115		kn->kn_flags |= EV_EOF;
3116		kn->kn_fflags = so->so_error;
3117		return (1);
3118	} else if (so->so_error)	/* temporary udp error */
3119		return (1);
3120	else if (kn->kn_sfflags & NOTE_LOWAT)
3121		return (kn->kn_data >= kn->kn_sdata);
3122	else
3123		return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat);
3124}
3125
3126static void
3127filt_sowdetach(struct knote *kn)
3128{
3129	struct socket *so = kn->kn_fp->f_data;
3130
3131	SOCKBUF_LOCK(&so->so_snd);
3132	knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1);
3133	if (knlist_empty(&so->so_snd.sb_sel.si_note))
3134		so->so_snd.sb_flags &= ~SB_KNOTE;
3135	SOCKBUF_UNLOCK(&so->so_snd);
3136}
3137
3138/*ARGSUSED*/
3139static int
3140filt_sowrite(struct knote *kn, long hint)
3141{
3142	struct socket *so;
3143
3144	so = kn->kn_fp->f_data;
3145	SOCKBUF_LOCK_ASSERT(&so->so_snd);
3146	kn->kn_data = sbspace(&so->so_snd);
3147	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
3148		kn->kn_flags |= EV_EOF;
3149		kn->kn_fflags = so->so_error;
3150		return (1);
3151	} else if (so->so_error)	/* temporary udp error */
3152		return (1);
3153	else if (((so->so_state & SS_ISCONNECTED) == 0) &&
3154	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
3155		return (0);
3156	else if (kn->kn_sfflags & NOTE_LOWAT)
3157		return (kn->kn_data >= kn->kn_sdata);
3158	else
3159		return (kn->kn_data >= so->so_snd.sb_lowat);
3160}
3161
3162/*ARGSUSED*/
3163static int
3164filt_solisten(struct knote *kn, long hint)
3165{
3166	struct socket *so = kn->kn_fp->f_data;
3167
3168	kn->kn_data = so->so_qlen;
3169	return (! TAILQ_EMPTY(&so->so_comp));
3170}
3171
3172int
3173socheckuid(struct socket *so, uid_t uid)
3174{
3175
3176	if (so == NULL)
3177		return (EPERM);
3178	if (so->so_cred->cr_uid != uid)
3179		return (EPERM);
3180	return (0);
3181}
3182
3183static int
3184sysctl_somaxconn(SYSCTL_HANDLER_ARGS)
3185{
3186	int error;
3187	int val;
3188
3189	val = somaxconn;
3190	error = sysctl_handle_int(oidp, &val, 0, req);
3191	if (error || !req->newptr)
3192		return (error);
3193
3194	if (val < 1 || val > USHRT_MAX)
3195		return (EINVAL);
3196
3197	somaxconn = val;
3198	return (0);
3199}
3200
3201/*
3202 * These functions are used by protocols to notify the socket layer (and its
3203 * consumers) of state changes in the sockets driven by protocol-side events.
3204 */
3205
3206/*
3207 * Procedures to manipulate state flags of socket and do appropriate wakeups.
3208 *
3209 * Normal sequence from the active (originating) side is that
3210 * soisconnecting() is called during processing of connect() call, resulting
3211 * in an eventual call to soisconnected() if/when the connection is
3212 * established.  When the connection is torn down soisdisconnecting() is
3213 * called during processing of disconnect() call, and soisdisconnected() is
3214 * called when the connection to the peer is totally severed.  The semantics
3215 * of these routines are such that connectionless protocols can call
3216 * soisconnected() and soisdisconnected() only, bypassing the in-progress
3217 * calls when setting up a ``connection'' takes no time.
3218 *
3219 * From the passive side, a socket is created with two queues of sockets:
3220 * so_incomp for connections in progress and so_comp for connections already
3221 * made and awaiting user acceptance.  As a protocol is preparing incoming
3222 * connections, it creates a socket structure queued on so_incomp by calling
3223 * sonewconn().  When the connection is established, soisconnected() is
3224 * called, and transfers the socket structure to so_comp, making it available
3225 * to accept().
3226 *
3227 * If a socket is closed with sockets on either so_incomp or so_comp, these
3228 * sockets are dropped.
3229 *
3230 * If higher-level protocols are implemented in the kernel, the wakeups done
3231 * here will sometimes cause software-interrupt process scheduling.
3232 */
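
/*
 * A sketch of the sequence a connection-oriented protocol typically drives
 * from the active side (everything other than the so*() calls is
 * illustrative):
 *
 *	foo_connect()        -> soisconnecting(so);
 *	handshake completes  -> soisconnected(so);
 *	foo_disconnect()     -> soisdisconnecting(so);
 *	teardown completes   -> soisdisconnected(so);
 *
 * A connectionless protocol may call only soisconnected() and
 * soisdisconnected(), as noted above.
 */
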
3233void
3234soisconnecting(struct socket *so)
3235{
3236
3237	SOCK_LOCK(so);
3238	so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
3239	so->so_state |= SS_ISCONNECTING;
3240	SOCK_UNLOCK(so);
3241}
3242
3243void
3244soisconnected(struct socket *so)
3245{
3246	struct socket *head;
3247	int ret;
3248
3249restart:
3250	ACCEPT_LOCK();
3251	SOCK_LOCK(so);
3252	so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
3253	so->so_state |= SS_ISCONNECTED;
3254	head = so->so_head;
3255	if (head != NULL && (so->so_qstate & SQ_INCOMP)) {
3256		if ((so->so_options & SO_ACCEPTFILTER) == 0) {
3257			SOCK_UNLOCK(so);
3258			TAILQ_REMOVE(&head->so_incomp, so, so_list);
3259			head->so_incqlen--;
3260			so->so_qstate &= ~SQ_INCOMP;
3261			TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
3262			head->so_qlen++;
3263			so->so_qstate |= SQ_COMP;
3264			ACCEPT_UNLOCK();
3265			sorwakeup(head);
3266			wakeup_one(&head->so_timeo);
3267		} else {
3268			ACCEPT_UNLOCK();
3269			soupcall_set(so, SO_RCV,
3270			    head->so_accf->so_accept_filter->accf_callback,
3271			    head->so_accf->so_accept_filter_arg);
3272			so->so_options &= ~SO_ACCEPTFILTER;
3273			ret = head->so_accf->so_accept_filter->accf_callback(so,
3274			    head->so_accf->so_accept_filter_arg, M_DONTWAIT);
3275			if (ret == SU_ISCONNECTED)
3276				soupcall_clear(so, SO_RCV);
3277			SOCK_UNLOCK(so);
3278			if (ret == SU_ISCONNECTED)
3279				goto restart;
3280		}
3281		return;
3282	}
3283	SOCK_UNLOCK(so);
3284	ACCEPT_UNLOCK();
3285	wakeup(&so->so_timeo);
3286	sorwakeup(so);
3287	sowwakeup(so);
3288}
3289
3290void
3291soisdisconnecting(struct socket *so)
3292{
3293
3294	/*
3295	 * Note: This code assumes that SOCK_LOCK(so) and
3296	 * SOCKBUF_LOCK(&so->so_rcv) are the same.
3297	 */
3298	SOCKBUF_LOCK(&so->so_rcv);
3299	so->so_state &= ~SS_ISCONNECTING;
3300	so->so_state |= SS_ISDISCONNECTING;
3301	so->so_rcv.sb_state |= SBS_CANTRCVMORE;
3302	sorwakeup_locked(so);
3303	SOCKBUF_LOCK(&so->so_snd);
3304	so->so_snd.sb_state |= SBS_CANTSENDMORE;
3305	sowwakeup_locked(so);
3306	wakeup(&so->so_timeo);
3307}
3308
3309void
3310soisdisconnected(struct socket *so)
3311{
3312
3313	/*
3314	 * Note: This code assumes that SOCK_LOCK(so) and
3315	 * SOCKBUF_LOCK(&so->so_rcv) are the same.
3316	 */
3317	SOCKBUF_LOCK(&so->so_rcv);
3318	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
3319	so->so_state |= SS_ISDISCONNECTED;
3320	so->so_rcv.sb_state |= SBS_CANTRCVMORE;
3321	sorwakeup_locked(so);
3322	SOCKBUF_LOCK(&so->so_snd);
3323	so->so_snd.sb_state |= SBS_CANTSENDMORE;
3324	sbdrop_locked(&so->so_snd, so->so_snd.sb_cc);
3325	sowwakeup_locked(so);
3326	wakeup(&so->so_timeo);
3327}
3328
3329/*
3330 * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
3331 */
3332struct sockaddr *
3333sodupsockaddr(const struct sockaddr *sa, int mflags)
3334{
3335	struct sockaddr *sa2;
3336
3337	sa2 = malloc(sa->sa_len, M_SONAME, mflags);
3338	if (sa2)
3339		bcopy(sa, sa2, sa->sa_len);
3340	return sa2;
3341}
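
/*
 * Typical usage, as in the datagram receive path above: duplicate the
 * contents of an MT_SONAME mbuf for return to the caller ('m' is assumed to
 * hold a struct sockaddr):
 *
 *	*psa = sodupsockaddr(mtod(m, struct sockaddr *), M_NOWAIT);
 */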
3342
3343/*
3344 * Register per-socket buffer upcalls.
3345 */
3346void
3347soupcall_set(struct socket *so, int which,
3348    int (*func)(struct socket *, void *, int), void *arg)
3349{
3350	struct sockbuf *sb;
3351
3352	switch (which) {
3353	case SO_RCV:
3354		sb = &so->so_rcv;
3355		break;
3356	case SO_SND:
3357		sb = &so->so_snd;
3358		break;
3359	default:
3360		panic("soupcall_set: bad which");
3361	}
3362	SOCKBUF_LOCK_ASSERT(sb);
3363#if 0
3364	/* XXX: accf_http actually wants to do this on purpose. */
3365	KASSERT(sb->sb_upcall == NULL, ("soupcall_set: overwriting upcall"));
3366#endif
3367	sb->sb_upcall = func;
3368	sb->sb_upcallarg = arg;
3369	sb->sb_flags |= SB_UPCALL;
3370}
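
/*
 * A sketch of registering a receive upcall; the sockbuf lock must be held
 * across soupcall_set(), per the assertion above.  The upcall function and
 * its argument are hypothetical, and SU_OK is assumed to be the "no further
 * action" return value (SU_ISCONNECTED is used by accept filters, as in
 * soisconnected() above):
 *
 *	static int
 *	foo_rcv_upcall(struct socket *so, void *arg, int waitflag)
 *	{
 *
 *		wakeup(arg);
 *		return (SU_OK);
 *	}
 *
 *	SOCKBUF_LOCK(&so->so_rcv);
 *	soupcall_set(so, SO_RCV, foo_rcv_upcall, arg);
 *	SOCKBUF_UNLOCK(&so->so_rcv);
 */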
3371
3372void
3373soupcall_clear(struct socket *so, int which)
3374{
3375	struct sockbuf *sb;
3376
3377	switch (which) {
3378	case SO_RCV:
3379		sb = &so->so_rcv;
3380		break;
3381	case SO_SND:
3382		sb = &so->so_snd;
3383		break;
3384	default:
3385		panic("soupcall_clear: bad which");
3386	}
3387	SOCKBUF_LOCK_ASSERT(sb);
3388	KASSERT(sb->sb_upcall != NULL, ("soupcall_clear: no upcall to clear"));
3389	sb->sb_upcall = NULL;
3390	sb->sb_upcallarg = NULL;
3391	sb->sb_flags &= ~SB_UPCALL;
3392}
3393
3394/*
3395 * Create an external-format (``xsocket'') structure using the information in
3396 * the kernel-format socket structure pointed to by so.  This is done to
3397 * reduce the spew of irrelevant information over this interface, to isolate
3398 * user code from changes in the kernel structure, and potentially to provide
3399 * information-hiding if we decide that some of this information should be
3400 * hidden from users.
3401 */
3402void
3403sotoxsocket(struct socket *so, struct xsocket *xso)
3404{
3405
3406	xso->xso_len = sizeof *xso;
3407	xso->xso_so = so;
3408	xso->so_type = so->so_type;
3409	xso->so_options = so->so_options;
3410	xso->so_linger = so->so_linger;
3411	xso->so_state = so->so_state;
3412	xso->so_pcb = so->so_pcb;
3413	xso->xso_protocol = so->so_proto->pr_protocol;
3414	xso->xso_family = so->so_proto->pr_domain->dom_family;
3415	xso->so_qlen = so->so_qlen;
3416	xso->so_incqlen = so->so_incqlen;
3417	xso->so_qlimit = so->so_qlimit;
3418	xso->so_timeo = so->so_timeo;
3419	xso->so_error = so->so_error;
3420	xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0;
3421	xso->so_oobmark = so->so_oobmark;
3422	sbtoxsockbuf(&so->so_snd, &xso->so_snd);
3423	sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
3424	xso->so_uid = so->so_cred->cr_uid;
3425}
3426
3427
3428/*
3429 * Socket accessor functions to provide external consumers with
3430 * a safe interface to socket state.
3432 */
3433
3434void
3435so_listeners_apply_all(struct socket *so, void (*func)(struct socket *, void *), void *arg)
3436{
3437
3438	TAILQ_FOREACH(so, &so->so_comp, so_list)
3439		func(so, arg);
3440}
3441
3442struct sockbuf *
3443so_sockbuf_rcv(struct socket *so)
3444{
3445
3446	return (&so->so_rcv);
3447}
3448
3449struct sockbuf *
3450so_sockbuf_snd(struct socket *so)
3451{
3452
3453	return (&so->so_snd);
3454}
3455
3456int
3457so_state_get(const struct socket *so)
3458{
3459
3460	return (so->so_state);
3461}
3462
3463void
3464so_state_set(struct socket *so, int val)
3465{
3466
3467	so->so_state = val;
3468}
3469
3470int
3471so_options_get(const struct socket *so)
3472{
3473
3474	return (so->so_options);
3475}
3476
3477void
3478so_options_set(struct socket *so, int val)
3479{
3480
3481	so->so_options = val;
3482}
3483
3484int
3485so_error_get(const struct socket *so)
3486{
3487
3488	return (so->so_error);
3489}
3490
3491void
3492so_error_set(struct socket *so, int val)
3493{
3494
3495	so->so_error = val;
3496}
3497
3498int
3499so_linger_get(const struct socket *so)
3500{
3501
3502	return (so->so_linger);
3503}
3504
3505void
3506so_linger_set(struct socket *so, int val)
3507{
3508
3509	so->so_linger = val;
3510}
3511
3512struct protosw *
3513so_protosw_get(const struct socket *so)
3514{
3515
3516	return (so->so_proto);
3517}
3518
3519void
3520so_protosw_set(struct socket *so, struct protosw *val)
3521{
3522
3523	so->so_proto = val;
3524}
3525
3526void
3527so_sorwakeup(struct socket *so)
3528{
3529
3530	sorwakeup(so);
3531}
3532
3533void
3534so_sowwakeup(struct socket *so)
3535{
3536
3537	sowwakeup(so);
3538}
3539
3540void
3541so_sorwakeup_locked(struct socket *so)
3542{
3543
3544	sorwakeup_locked(so);
3545}
3546
3547void
3548so_sowwakeup_locked(struct socket *so)
3549{
3550
3551	sowwakeup_locked(so);
3552}
3553
3554void
3555so_lock(struct socket *so)
3556{
3557	SOCK_LOCK(so);
3558}
3559
3560void
3561so_unlock(struct socket *so)
3562{
3563	SOCK_UNLOCK(so);
3564}
3565