uipc_socket.c revision 185435
/*-
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.
 * Copyright (c) 2004 The FreeBSD Foundation
 * Copyright (c) 2004-2008 Robert N. M. Watson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 */

/*
 * Comments on the socket life cycle:
 *
 * soalloc() sets up socket layer state for a socket, called only by
 * socreate() and sonewconn().  Socket layer private.
 *
 * sodealloc() tears down socket layer state for a socket, called only by
 * sofree() and sonewconn().  Socket layer private.
 *
 * pru_attach() associates protocol layer state with an allocated socket;
 * called only once, may fail, aborting socket allocation.  This is called
 * from socreate() and sonewconn().  Socket layer private.
 *
 * pru_detach() disassociates protocol layer state from an attached socket,
 * and will be called exactly once for sockets in which pru_attach() has
 * been successfully called.  If pru_attach() returned an error,
 * pru_detach() will not be called.  Socket layer private.
 *
 * pru_abort() and pru_close() notify the protocol layer that the last
 * consumer of a socket is starting to tear down the socket, and that the
 * protocol should terminate the connection.  Historically, pru_abort() also
 * detached protocol state from the socket state, but this is no longer the
 * case.
 *
 * socreate() creates a socket and attaches protocol state.  This is a public
 * interface that may be used by socket layer consumers to create new
 * sockets.
 *
 * sonewconn() creates a socket and attaches protocol state.  This is a
 * public interface that may be used by protocols to create a new socket when
 * a connection is received; the new socket will be available for accept() on
 * the listen socket.
 *
 * soclose() destroys a socket after possibly waiting for it to disconnect.
 * This is a public interface that socket consumers should use to close and
 * release a socket when done with it.
 *
 * soabort() destroys a socket without waiting for it to disconnect (used
 * only for incoming connections that are already partially or fully
 * connected).  This is used internally by the socket layer when clearing
 * listen socket queues (due to overflow or close on the listen socket), but
 * is also a public interface protocols may use to abort connections in
 * their incomplete listen queues should they no longer be required.  Sockets
 * placed in completed connection listen queues should not be aborted for
 * reasons described in the comment above the soclose() implementation.  This
 * is not a general purpose close routine, and except in the specific
 * circumstances described here, should not be used.
 *
 * sofree() will free a socket and its protocol state if all references on
 * the socket have been released, and is the interface used by the socket
 * layer to attempt to free a socket when a reference is removed.  This is a
 * socket layer private interface.
 *
 * NOTE: In addition to socreate() and soclose(), which provide a single
 * socket reference to the consumer to be managed as required, there are two
 * calls to manage socket references explicitly: soref() and sorele().
 * Currently, these are generally required only when transitioning a socket
 * from a listen queue to a file descriptor, in order to prevent garbage
 * collection of the socket at an untimely moment.  For a number of reasons,
 * these interfaces are not preferred, and should be avoided.
 */
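
/*
 * Example (an illustrative sketch, not code from this file): the simplest
 * consumer life cycle uses only the public interfaces described above,
 * assuming a calling thread "td":
 *
 *	struct socket *so;
 *	int error;
 *
 *	error = socreate(PF_INET, &so, SOCK_STREAM, IPPROTO_TCP,
 *	    td->td_ucred, td);
 *	if (error)
 *		return (error);
 *	... use the socket via sobind(), soconnect(), sosend(),
 *	soreceive(), etc. ...
 *	error = soclose(so);
 */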

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/uipc_socket.c 185435 2008-11-29 14:32:14Z bz $");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_mac.h"
#include "opt_zero.h"
#include "opt_compat.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mac.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/domain.h>
#include <sys/file.h>			/* for struct knote */
#include <sys/kernel.h>
#include <sys/event.h>
#include <sys/eventhandler.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/resourcevar.h>
#include <net/route.h>
#include <sys/signalvar.h>
#include <sys/stat.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <sys/jail.h>

#include <security/mac/mac_framework.h>

#include <vm/uma.h>

#ifdef COMPAT_IA32
#include <sys/mount.h>
#include <sys/sysent.h>
#include <compat/freebsd32/freebsd32.h>
#endif

static int	soreceive_rcvoob(struct socket *so, struct uio *uio,
		    int flags);

static void	filt_sordetach(struct knote *kn);
static int	filt_soread(struct knote *kn, long hint);
static void	filt_sowdetach(struct knote *kn);
static int	filt_sowrite(struct knote *kn, long hint);
static int	filt_solisten(struct knote *kn, long hint);

static struct filterops solisten_filtops =
	{ 1, NULL, filt_sordetach, filt_solisten };
static struct filterops soread_filtops =
	{ 1, NULL, filt_sordetach, filt_soread };
static struct filterops sowrite_filtops =
	{ 1, NULL, filt_sowdetach, filt_sowrite };

uma_zone_t socket_zone;
so_gen_t	so_gencnt;	/* generation count for sockets */

int	maxsockets;

MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

static int somaxconn = SOMAXCONN;
static int sysctl_somaxconn(SYSCTL_HANDLER_ARGS);
/* XXX: we don't have SYSCTL_USHORT */
SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLTYPE_UINT | CTLFLAG_RW,
    0, sizeof(int), sysctl_somaxconn, "I", "Maximum pending socket connection "
    "queue size");
static int numopensockets;
SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
    &numopensockets, 0, "Number of open sockets");
#ifdef ZERO_COPY_SOCKETS
/* These aren't static because they're used in other files. */
int so_zero_copy_send = 1;
int so_zero_copy_receive = 1;
SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0,
    "Zero copy controls");
SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW,
    &so_zero_copy_receive, 0, "Enable zero copy receive");
SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW,
    &so_zero_copy_send, 0, "Enable zero copy send");
#endif /* ZERO_COPY_SOCKETS */

/*
 * accept_mtx locks down per-socket fields relating to accept queues.  See
 * socketvar.h for an annotation of the protected fields of struct socket.
 */
struct mtx accept_mtx;
MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);

/*
 * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
 * so_gencnt field.
 */
static struct mtx so_global_mtx;
MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_global", MTX_DEF);

/*
 * General IPC sysctl name space, used by sockets and a variety of other IPC
 * types.
 */
SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");

/*
 * Sysctl to get and set the maximum global sockets limit.  Notify protocols
 * of the change so that they can update their dependent limits as required.
 */
static int
sysctl_maxsockets(SYSCTL_HANDLER_ARGS)
{
	int error, newmaxsockets;

	newmaxsockets = maxsockets;
	error = sysctl_handle_int(oidp, &newmaxsockets, 0, req);
	if (error == 0 && req->newptr) {
		if (newmaxsockets > maxsockets) {
			maxsockets = newmaxsockets;
			if (maxsockets > ((maxfiles / 4) * 3)) {
				maxfiles = (maxsockets * 5) / 4;
				maxfilesperproc = (maxfiles * 9) / 10;
			}
			EVENTHANDLER_INVOKE(maxsockets_change);
		} else
			error = EINVAL;
	}
	return (error);
}

SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT|CTLFLAG_RW,
    &maxsockets, 0, sysctl_maxsockets, "IU",
    "Maximum number of sockets available");
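
/*
 * Example (illustrative; the value shown is arbitrary): the limit can only
 * be raised at runtime, e.g.:
 *
 *	# sysctl kern.ipc.maxsockets=262144
 *
 * Attempts to lower (or merely restate) the limit fail with EINVAL, since
 * the handler above only accepts values greater than the current one.
 */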

/*
 * Initialize maxsockets.
 */
static void
init_maxsockets(void *ignored)
{
	TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
	maxsockets = imax(maxsockets, imax(maxfiles, nmbclusters));
}
SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);

/*
 * Socket operation routines.  These routines are called by the routines in
 * sys_socket.c or from a system process, and implement the semantics of
 * socket operations by switching out to the protocol specific routines.
 */

/*
 * Get a socket structure from our zone, and initialize it.  Note that it
 * would probably be better to allocate socket and PCB at the same time, but
 * I'm not convinced that all the protocols can be easily modified to do
 * this.
 *
 * soalloc() returns a socket with a ref count of 0.
 */
static struct socket *
soalloc(void)
{
	struct socket *so;

	so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO);
	if (so == NULL)
		return (NULL);
#ifdef MAC
	if (mac_socket_init(so, M_NOWAIT) != 0) {
		uma_zfree(socket_zone, so);
		return (NULL);
	}
#endif
	SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
	SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
	sx_init(&so->so_snd.sb_sx, "so_snd_sx");
	sx_init(&so->so_rcv.sb_sx, "so_rcv_sx");
	TAILQ_INIT(&so->so_aiojobq);
	mtx_lock(&so_global_mtx);
	so->so_gencnt = ++so_gencnt;
	++numopensockets;
	mtx_unlock(&so_global_mtx);
	return (so);
}

/*
 * Free the storage associated with a socket at the socket layer, tear down
 * locks, labels, etc.  All protocol state is assumed already to have been
 * torn down (and possibly never set up) by the caller.
 */
static void
sodealloc(struct socket *so)
{

	KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
	KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL"));

	mtx_lock(&so_global_mtx);
	so->so_gencnt = ++so_gencnt;
	--numopensockets;	/* Could be below, but faster here. */
	mtx_unlock(&so_global_mtx);
	if (so->so_rcv.sb_hiwat)
		(void)chgsbsize(so->so_cred->cr_uidinfo,
		    &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
	if (so->so_snd.sb_hiwat)
		(void)chgsbsize(so->so_cred->cr_uidinfo,
		    &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
#ifdef INET
	/* Remove accept filter if one is present. */
	if (so->so_accf != NULL)
		do_setopt_accept_filter(so, NULL);
#endif
#ifdef MAC
	mac_socket_destroy(so);
#endif
	crfree(so->so_cred);
	sx_destroy(&so->so_snd.sb_sx);
	sx_destroy(&so->so_rcv.sb_sx);
	SOCKBUF_LOCK_DESTROY(&so->so_snd);
	SOCKBUF_LOCK_DESTROY(&so->so_rcv);
	uma_zfree(socket_zone, so);
}

/*
 * socreate returns a socket with a ref count of 1.  The socket should be
 * closed with soclose().
 */
int
socreate(int dom, struct socket **aso, int type, int proto,
    struct ucred *cred, struct thread *td)
{
	struct protosw *prp;
	struct socket *so;
	int error;

	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);

	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL ||
	    prp->pr_usrreqs->pru_attach == pru_attach_notsupp)
		return (EPROTONOSUPPORT);

	if (jailed(cred) && jail_socket_unixiproute_only &&
	    prp->pr_domain->dom_family != PF_LOCAL &&
	    prp->pr_domain->dom_family != PF_INET &&
#ifdef INET6
	    prp->pr_domain->dom_family != PF_INET6 &&
#endif
	    prp->pr_domain->dom_family != PF_ROUTE) {
		return (EPROTONOSUPPORT);
	}

	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = soalloc();
	if (so == NULL)
		return (ENOBUFS);

	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = type;
	so->so_cred = crhold(cred);
	if ((prp->pr_domain->dom_family == PF_INET) ||
	    (prp->pr_domain->dom_family == PF_ROUTE))
		so->so_fibnum = td->td_proc->p_fibnum;
	else
		so->so_fibnum = 0;
	so->so_proto = prp;
#ifdef MAC
	mac_socket_create(cred, so);
#endif
	knlist_init(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv),
	    NULL, NULL, NULL);
	knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd),
	    NULL, NULL, NULL);
	so->so_count = 1;
	/*
	 * Auto-sizing of socket buffers is managed by the protocols and
	 * the appropriate flags must be set in the pru_attach function.
	 */
	error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
	if (error) {
		KASSERT(so->so_count == 1, ("socreate: so_count %d",
		    so->so_count));
		so->so_count = 0;
		sodealloc(so);
		return (error);
	}
	*aso = so;
	return (0);
}
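
/*
 * Example (an illustrative sketch): in-kernel creation of a UNIX domain
 * stream socket, analogous to socket(PF_LOCAL, SOCK_STREAM, 0) from user
 * space.  With proto == 0, the protocol is selected by pffindtype():
 *
 *	struct socket *so;
 *	int error;
 *
 *	error = socreate(PF_LOCAL, &so, SOCK_STREAM, 0, td->td_ucred, td);
 *	if (error == 0) {
 *		... the socket holds one reference; release it with
 *		soclose(so) when done ...
 *	}
 */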

#ifdef REGRESSION
static int regression_sonewconn_earlytest = 1;
SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW,
    &regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test");
#endif

/*
 * When an attempt at a new connection is noted on a socket which accepts
 * connections, sonewconn is called.  If the connection is possible (subject
 * to space constraints, etc.) then we allocate a new structure, properly
 * linked into the data structure of the original socket, and return this.
 * connstatus may be 0, SS_ISCONFIRMING, or SS_ISCONNECTED.
 *
 * Note: the ref count on the socket is 0 on return.
 */
struct socket *
sonewconn(struct socket *head, int connstatus)
{
	struct socket *so;
	int over;

	ACCEPT_LOCK();
	over = (head->so_qlen > 3 * head->so_qlimit / 2);
	ACCEPT_UNLOCK();
#ifdef REGRESSION
	if (regression_sonewconn_earlytest && over)
#else
	if (over)
#endif
		return (NULL);
	so = soalloc();
	if (so == NULL)
		return (NULL);
	if ((head->so_options & SO_ACCEPTFILTER) != 0)
		connstatus = 0;
	so->so_head = head;
	so->so_type = head->so_type;
	so->so_options = head->so_options &~ SO_ACCEPTCONN;
	so->so_linger = head->so_linger;
	so->so_state = head->so_state | SS_NOFDREF;
	so->so_proto = head->so_proto;
	so->so_cred = crhold(head->so_cred);
#ifdef MAC
	SOCK_LOCK(head);
	mac_socket_newconn(head, so);
	SOCK_UNLOCK(head);
#endif
	knlist_init(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv),
	    NULL, NULL, NULL);
	knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd),
	    NULL, NULL, NULL);
	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat) ||
	    (*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
		sodealloc(so);
		return (NULL);
	}
	so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
	so->so_snd.sb_lowat = head->so_snd.sb_lowat;
	so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
	so->so_snd.sb_timeo = head->so_snd.sb_timeo;
	so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
	so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
	so->so_state |= connstatus;
	ACCEPT_LOCK();
	if (connstatus) {
		TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
		so->so_qstate |= SQ_COMP;
		head->so_qlen++;
	} else {
		/*
		 * Keep removing sockets from the head until there's room for
		 * us to insert on the tail.  In pre-locking revisions, this
		 * was a simple if(), but as we could be racing with other
		 * threads and soabort() requires dropping locks, we must
		 * loop waiting for the condition to be true.
		 */
		while (head->so_incqlen > head->so_qlimit) {
			struct socket *sp;
			sp = TAILQ_FIRST(&head->so_incomp);
			TAILQ_REMOVE(&head->so_incomp, sp, so_list);
			head->so_incqlen--;
			sp->so_qstate &= ~SQ_INCOMP;
			sp->so_head = NULL;
			ACCEPT_UNLOCK();
			soabort(sp);
			ACCEPT_LOCK();
		}
		TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
		so->so_qstate |= SQ_INCOMP;
		head->so_incqlen++;
	}
	ACCEPT_UNLOCK();
	if (connstatus) {
		sorwakeup(head);
		wakeup_one(&head->so_timeo);
	}
	return (so);
}
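
/*
 * Example (an illustrative sketch): a protocol's input path typically calls
 * sonewconn() on the listen socket when a connection request arrives, and
 * marks the child connected once its handshake completes:
 *
 *	so = sonewconn(head, 0);
 *	if (so == NULL) {
 *		... drop the request: queue overflow or allocation failure ...
 *	}
 *	... protocol completes its handshake ...
 *	soisconnected(so);
 *
 * Passing connstatus == SS_ISCONNECTED instead places the new socket
 * directly on the completed queue, ready for accept().
 */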

int
sobind(struct socket *so, struct sockaddr *nam, struct thread *td)
{

	return ((*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td));
}

/*
 * solisten() transitions a socket from a non-listening state to a listening
 * state, but can also be used to update the listen queue depth on an
 * existing listen socket.  The protocol will call back into the sockets
 * layer using solisten_proto_check() and solisten_proto() to check and set
 * socket-layer listen state.  Callbacks are used so that the protocol can
 * acquire both protocol and socket layer locks in whatever order is required
 * by the protocol.
 *
 * Protocol implementors are advised to hold the socket lock across the
 * socket-layer test and set to avoid races at the socket layer.
 */
int
solisten(struct socket *so, int backlog, struct thread *td)
{

	return ((*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td));
}

int
solisten_proto_check(struct socket *so)
{

	SOCK_LOCK_ASSERT(so);

	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
	    SS_ISDISCONNECTING))
		return (EINVAL);
	return (0);
}

void
solisten_proto(struct socket *so, int backlog)
{

	SOCK_LOCK_ASSERT(so);

	if (backlog < 0 || backlog > somaxconn)
		backlog = somaxconn;
	so->so_qlimit = backlog;
	so->so_options |= SO_ACCEPTCONN;
}
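
/*
 * Example (an illustrative sketch; "foo" is a hypothetical protocol): a
 * pru_listen method using the two callbacks, holding the socket lock across
 * the test and set as advised above:
 *
 *	static int
 *	foo_listen(struct socket *so, int backlog, struct thread *td)
 *	{
 *		int error;
 *
 *		... acquire protocol lock(s) ...
 *		SOCK_LOCK(so);
 *		error = solisten_proto_check(so);
 *		if (error == 0)
 *			solisten_proto(so, backlog);
 *		SOCK_UNLOCK(so);
 *		... release protocol lock(s) ...
 *		return (error);
 *	}
 */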

/*
 * Attempt to free a socket.  This should really be sotryfree().
 *
 * sofree() will succeed if:
 *
 * - There are no outstanding file descriptor references or related consumers
 *   (so_count == 0).
 *
 * - The socket has been closed by user space, if ever open (SS_NOFDREF).
 *
 * - The protocol does not have an outstanding strong reference on the socket
 *   (SS_PROTOREF).
 *
 * - The socket is not in a completed connection queue, since a process may
 *   already have been notified that it is present.  If it were removed, the
 *   user process could block in accept() despite select() saying the socket
 *   was ready.
 *
 * Otherwise, it will quietly abort so that a future call to sofree(), when
 * conditions are right, can succeed.
 */
void
sofree(struct socket *so)
{
	struct protosw *pr = so->so_proto;
	struct socket *head;

	ACCEPT_LOCK_ASSERT();
	SOCK_LOCK_ASSERT(so);

	if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 ||
	    (so->so_state & SS_PROTOREF) || (so->so_qstate & SQ_COMP)) {
		SOCK_UNLOCK(so);
		ACCEPT_UNLOCK();
		return;
	}

	head = so->so_head;
	if (head != NULL) {
		KASSERT((so->so_qstate & SQ_COMP) != 0 ||
		    (so->so_qstate & SQ_INCOMP) != 0,
		    ("sofree: so_head != NULL, but neither SQ_COMP nor "
		    "SQ_INCOMP"));
		KASSERT((so->so_qstate & SQ_COMP) == 0 ||
		    (so->so_qstate & SQ_INCOMP) == 0,
		    ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
		TAILQ_REMOVE(&head->so_incomp, so, so_list);
		head->so_incqlen--;
		so->so_qstate &= ~SQ_INCOMP;
		so->so_head = NULL;
	}
	KASSERT((so->so_qstate & SQ_COMP) == 0 &&
	    (so->so_qstate & SQ_INCOMP) == 0,
	    ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
	    so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
	if (so->so_options & SO_ACCEPTCONN) {
		KASSERT(TAILQ_EMPTY(&so->so_comp),
		    ("sofree: so_comp populated"));
		KASSERT(TAILQ_EMPTY(&so->so_incomp),
		    ("sofree: so_incomp populated"));
	}
	SOCK_UNLOCK(so);
	ACCEPT_UNLOCK();

	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
		(*pr->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
	if (pr->pr_usrreqs->pru_detach != NULL)
		(*pr->pr_usrreqs->pru_detach)(so);

	/*
	 * From this point on, we assume that no other references to this
	 * socket exist anywhere else in the stack.  Therefore, no locks need
	 * to be acquired or held.
	 *
	 * We used to do a lot of socket buffer and socket locking here, as
	 * well as invoke sorflush() and perform wakeups.  The direct calls to
	 * dom_dispose() and sbrelease_internal() are an inlining of what was
	 * necessary from sorflush().
	 *
	 * Notice that the socket buffer and kqueue state are torn down
	 * immediately after pru_detach is called.  This means that protocols
	 * should not assume they can perform socket wakeups, etc., in their
	 * detach code.
	 */
	sbdestroy(&so->so_snd, so);
	sbdestroy(&so->so_rcv, so);
	knlist_destroy(&so->so_rcv.sb_sel.si_note);
	knlist_destroy(&so->so_snd.sb_sel.si_note);
	sodealloc(so);
}

/*
 * Close a socket on last file table reference removal.  Initiate disconnect
 * if connected.  Free socket when disconnect complete.
 *
 * This function will sorele() the socket.  Note that soclose() may be called
 * prior to the ref count reaching zero.  The actual socket structure will
 * not be freed until the ref count reaches zero.
 */
int
soclose(struct socket *so)
{
	int error = 0;

	KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));

	funsetown(&so->so_sigio);
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO))
				goto drop;
			while (so->so_state & SS_ISCONNECTED) {
				error = tsleep(&so->so_timeo,
				    PSOCK | PCATCH, "soclos", so->so_linger * hz);
				if (error)
					break;
			}
		}
	}

drop:
	if (so->so_proto->pr_usrreqs->pru_close != NULL)
		(*so->so_proto->pr_usrreqs->pru_close)(so);
	if (so->so_options & SO_ACCEPTCONN) {
		struct socket *sp;
		ACCEPT_LOCK();
		while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
			TAILQ_REMOVE(&so->so_incomp, sp, so_list);
			so->so_incqlen--;
			sp->so_qstate &= ~SQ_INCOMP;
			sp->so_head = NULL;
			ACCEPT_UNLOCK();
			soabort(sp);
			ACCEPT_LOCK();
		}
		while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
			TAILQ_REMOVE(&so->so_comp, sp, so_list);
			so->so_qlen--;
			sp->so_qstate &= ~SQ_COMP;
			sp->so_head = NULL;
			ACCEPT_UNLOCK();
			soabort(sp);
			ACCEPT_LOCK();
		}
		ACCEPT_UNLOCK();
	}
	ACCEPT_LOCK();
	SOCK_LOCK(so);
	KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
	so->so_state |= SS_NOFDREF;
	sorele(so);
	return (error);
}

/*
 * soabort() is used to abruptly tear down a connection, such as when a
 * resource limit is reached (listen queue depth exceeded), or if a listen
 * socket is closed while there are sockets waiting to be accepted.
 *
 * This interface is tricky, because it is called on an unreferenced socket,
 * and must be called only by a thread that has actually removed the socket
 * from the listen queue it was on, or races with other threads are risked.
 *
 * This interface will call into the protocol code, so must not be called
 * with any socket locks held.  Protocols do call it while holding their own
 * recursible protocol mutexes, but this is something that should be subject
 * to review in the future.
 */
void
soabort(struct socket *so)
{

	/*
	 * In as much as is possible, assert that no references to this
	 * socket are held.  This is not quite the same as asserting that the
	 * current thread is responsible for arranging for no references, but
	 * is as close as we can get for now.
	 */
	KASSERT(so->so_count == 0, ("soabort: so_count"));
	KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF"));
	KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF"));
	KASSERT((so->so_qstate & SQ_COMP) == 0, ("soabort: SQ_COMP"));
	KASSERT((so->so_qstate & SQ_INCOMP) == 0, ("soabort: SQ_INCOMP"));

	if (so->so_proto->pr_usrreqs->pru_abort != NULL)
		(*so->so_proto->pr_usrreqs->pru_abort)(so);
	ACCEPT_LOCK();
	SOCK_LOCK(so);
	sofree(so);
}
738
739int
740soaccept(struct socket *so, struct sockaddr **nam)
741{
742	int error;
743
744	SOCK_LOCK(so);
745	KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
746	so->so_state &= ~SS_NOFDREF;
747	SOCK_UNLOCK(so);
748	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
749	return (error);
750}
751
752int
753soconnect(struct socket *so, struct sockaddr *nam, struct thread *td)
754{
755	int error;
756
757	if (so->so_options & SO_ACCEPTCONN)
758		return (EOPNOTSUPP);
759	/*
760	 * If protocol is connection-based, can only connect once.
761	 * Otherwise, if connected, try to disconnect first.  This allows
762	 * user to disconnect by connecting to, e.g., a null address.
763	 */
764	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
765	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
766	    (error = sodisconnect(so)))) {
767		error = EISCONN;
768	} else {
769		/*
770		 * Prevent accumulated error from previous connection from
771		 * biting us.
772		 */
773		so->so_error = 0;
774		error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td);
775	}
776
777	return (error);
778}
779
780int
781soconnect2(struct socket *so1, struct socket *so2)
782{
783
784	return ((*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2));
785}
786
787int
788sodisconnect(struct socket *so)
789{
790	int error;
791
792	if ((so->so_state & SS_ISCONNECTED) == 0)
793		return (ENOTCONN);
794	if (so->so_state & SS_ISDISCONNECTING)
795		return (EALREADY);
796	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
797	return (error);
798}

#ifdef ZERO_COPY_SOCKETS
struct so_zerocopy_stats {
	int size_ok;
	int align_ok;
	int found_ifp;
};
struct so_zerocopy_stats so_zerocp_stats = {0, 0, 0};
#include <netinet/in.h>
#include <net/route.h>
#include <netinet/in_pcb.h>
#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>

/*
 * sosend_copyin() is only used if zero copy sockets are enabled.  Otherwise
 * sosend_dgram() and sosend_generic() use m_uiotombuf().
 *
 * sosend_copyin() accepts a uio and prepares an mbuf chain holding part or
 * all of the data referenced by the uio.  If desired, it uses zero-copy.
 * *space will be updated to reflect data copied in.
 *
 * NB: If atomic I/O is requested, the caller must already have checked that
 * space can hold resid bytes.
 *
 * NB: In the event of an error, the caller may need to free the partial
 * chain pointed to by *retmp.  The contents of both *uio and *space may be
 * modified even in the case of an error.
 */
static int
sosend_copyin(struct uio *uio, struct mbuf **retmp, int atomic, long *space,
    int flags)
{
	struct mbuf *m, **mp, *top;
	long len, resid;
	int error;
#ifdef ZERO_COPY_SOCKETS
	int cow_send;
#endif

	*retmp = top = NULL;
	mp = &top;
	len = 0;
	resid = uio->uio_resid;
	error = 0;
	do {
#ifdef ZERO_COPY_SOCKETS
		cow_send = 0;
#endif /* ZERO_COPY_SOCKETS */
		if (resid >= MINCLSIZE) {
#ifdef ZERO_COPY_SOCKETS
			if (top == NULL) {
				m = m_gethdr(M_WAITOK, MT_DATA);
				m->m_pkthdr.len = 0;
				m->m_pkthdr.rcvif = NULL;
			} else
				m = m_get(M_WAITOK, MT_DATA);
			if (so_zero_copy_send &&
			    resid >= PAGE_SIZE &&
			    *space >= PAGE_SIZE &&
			    uio->uio_iov->iov_len >= PAGE_SIZE) {
				so_zerocp_stats.size_ok++;
				so_zerocp_stats.align_ok++;
				cow_send = socow_setup(m, uio);
				len = cow_send;
			}
			if (!cow_send) {
				m_clget(m, M_WAITOK);
				len = min(min(MCLBYTES, resid), *space);
			}
#else /* ZERO_COPY_SOCKETS */
			if (top == NULL) {
				m = m_getcl(M_WAIT, MT_DATA, M_PKTHDR);
				m->m_pkthdr.len = 0;
				m->m_pkthdr.rcvif = NULL;
			} else
				m = m_getcl(M_WAIT, MT_DATA, 0);
			len = min(min(MCLBYTES, resid), *space);
#endif /* ZERO_COPY_SOCKETS */
		} else {
			if (top == NULL) {
				m = m_gethdr(M_WAIT, MT_DATA);
				m->m_pkthdr.len = 0;
				m->m_pkthdr.rcvif = NULL;

				len = min(min(MHLEN, resid), *space);
				/*
				 * For datagram protocols, leave room
				 * for protocol headers in first mbuf.
				 */
				if (atomic && m && len < MHLEN)
					MH_ALIGN(m, len);
			} else {
				m = m_get(M_WAIT, MT_DATA);
				len = min(min(MLEN, resid), *space);
			}
		}
		if (m == NULL) {
			error = ENOBUFS;
			goto out;
		}

		*space -= len;
#ifdef ZERO_COPY_SOCKETS
		if (cow_send)
			error = 0;
		else
#endif /* ZERO_COPY_SOCKETS */
		error = uiomove(mtod(m, void *), (int)len, uio);
		resid = uio->uio_resid;
		m->m_len = len;
		*mp = m;
		top->m_pkthdr.len += len;
		if (error)
			goto out;
		mp = &m->m_next;
		if (resid <= 0) {
			if (flags & MSG_EOR)
				top->m_flags |= M_EOR;
			break;
		}
	} while (*space > 0 && atomic);
out:
	*retmp = top;
	return (error);
}
#endif /* ZERO_COPY_SOCKETS */

#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)

int
sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
	long space, resid;
	int clen = 0, error, dontroute;
#ifdef ZERO_COPY_SOCKETS
	int atomic = sosendallatonce(so) || top;
#endif

	KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM"));
	KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
	    ("sosend_dgram: !PR_ATOMIC"));

	if (uio != NULL)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned.  However, space must be
	 * signed, as it might be less than 0 if we over-committed, and we
	 * must use a signed comparison of space and resid.  On the other
	 * hand, a negative resid causes us to loop sending 0-length
	 * segments to the protocol.
	 *
	 * The MSG_EOR/SOCK_STREAM check made by sosend_generic() is not
	 * needed here, as this path handles only SOCK_DGRAM sockets.
	 */
	if (resid < 0) {
		error = EINVAL;
		goto out;
	}

	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0;
	if (td != NULL)
		td->td_ru.ru_msgsnd++;
	if (control != NULL)
		clen = control->m_len;

	SOCKBUF_LOCK(&so->so_snd);
	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
		SOCKBUF_UNLOCK(&so->so_snd);
		error = EPIPE;
		goto out;
	}
	if (so->so_error) {
		error = so->so_error;
		so->so_error = 0;
		SOCKBUF_UNLOCK(&so->so_snd);
		goto out;
	}
	if ((so->so_state & SS_ISCONNECTED) == 0) {
		/*
		 * `sendto' and `sendmsg' are allowed on a connection-based
		 * socket if it supports implied connect.  Return ENOTCONN if
		 * not connected and no address is supplied.
		 */
		if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
		    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
			if ((so->so_state & SS_ISCONFIRMING) == 0 &&
			    !(resid == 0 && clen != 0)) {
				SOCKBUF_UNLOCK(&so->so_snd);
				error = ENOTCONN;
				goto out;
			}
		} else if (addr == NULL) {
			if (so->so_proto->pr_flags & PR_CONNREQUIRED)
				error = ENOTCONN;
			else
				error = EDESTADDRREQ;
			SOCKBUF_UNLOCK(&so->so_snd);
			goto out;
		}
	}

	/*
	 * Do we need MSG_OOB support in SOCK_DGRAM?  Signs here are that it
	 * may be a problem in need of fixing.
	 */
	space = sbspace(&so->so_snd);
	if (flags & MSG_OOB)
		space += 1024;
	space -= clen;
	SOCKBUF_UNLOCK(&so->so_snd);
	if (resid > space) {
		error = EMSGSIZE;
		goto out;
	}
	if (uio == NULL) {
		resid = 0;
		if (flags & MSG_EOR)
			top->m_flags |= M_EOR;
	} else {
#ifdef ZERO_COPY_SOCKETS
		error = sosend_copyin(uio, &top, atomic, &space, flags);
		if (error)
			goto out;
#else
		/*
		 * Copy the data from userland into a mbuf chain.
		 * If no data is to be copied in, a single empty mbuf
		 * is returned.
		 */
		top = m_uiotombuf(uio, M_WAITOK, space, max_hdr,
		    (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0)));
		if (top == NULL) {
			error = EFAULT;	/* only possible error */
			goto out;
		}
		space -= resid - uio->uio_resid;
#endif
		resid = uio->uio_resid;
	}
	KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
	/*
	 * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock
	 * than with.
	 */
	if (dontroute) {
		SOCK_LOCK(so);
		so->so_options |= SO_DONTROUTE;
		SOCK_UNLOCK(so);
	}
	/*
	 * XXX all the SBS_CANTSENDMORE checks previously done could be out
	 * of date.  We could have received a reset packet in an interrupt or
	 * maybe we slept while doing page faults in uiomove() etc.  We could
	 * probably recheck again inside the locking protection here, but
	 * there are probably other places that this also happens.  We must
	 * rethink this.
	 */
	error = (*so->so_proto->pr_usrreqs->pru_send)(so,
	    (flags & MSG_OOB) ? PRUS_OOB :
	/*
	 * If the user set MSG_EOF, the protocol understands this flag, and
	 * there is nothing left to send, then use PRUS_EOF instead of a
	 * plain send.
	 */
	    ((flags & MSG_EOF) &&
	     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
	     (resid <= 0)) ?
		PRUS_EOF :
		/* If there is more to send set PRUS_MORETOCOME */
		(resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
		top, addr, control, td);
	if (dontroute) {
		SOCK_LOCK(so);
		so->so_options &= ~SO_DONTROUTE;
		SOCK_UNLOCK(so);
	}
	clen = 0;
	control = NULL;
	top = NULL;
out:
	if (top != NULL)
		m_freem(top);
	if (control != NULL)
		m_freem(control);
	return (error);
}

/*
 * Send on a socket.  If send must go all at once and message is larger than
 * send buffering, then hard error.  Lock against other senders.  If must go
 * all at once and not enough room now, then inform user that this would
 * block and do nothing.  Otherwise, if nonblocking, send as much as
 * possible.  The data to be sent is described by "uio" if nonzero, otherwise
 * by the mbuf chain "top" (which must be null if uio is not).  Data provided
 * in mbuf chain must be small enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers must check for short
 * counts if EINTR/ERESTART are returned.  Data and control buffers are freed
 * on return.
 */
int
sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
	long space, resid;
	int clen = 0, error, dontroute;
	int atomic = sosendallatonce(so) || top;

	if (uio != NULL)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned.  However, space must be
	 * signed, as it might be less than 0 if we over-committed, and we
	 * must use a signed comparison of space and resid.  On the other
	 * hand, a negative resid causes us to loop sending 0-length
	 * segments to the protocol.
	 *
	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
	 * type sockets since that's an error.
	 */
	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
		error = EINVAL;
		goto out;
	}

	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	if (td != NULL)
		td->td_ru.ru_msgsnd++;
	if (control != NULL)
		clen = control->m_len;

	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out;

restart:
	do {
		SOCKBUF_LOCK(&so->so_snd);
		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = EPIPE;
			goto release;
		}
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			SOCKBUF_UNLOCK(&so->so_snd);
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			/*
			 * `sendto' and `sendmsg' are allowed on a connection-
			 * based socket if it supports implied connect.
			 * Return ENOTCONN if not connected and no address is
			 * supplied.
			 */
			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
				    !(resid == 0 && clen != 0)) {
					SOCKBUF_UNLOCK(&so->so_snd);
					error = ENOTCONN;
					goto release;
				}
			} else if (addr == NULL) {
				SOCKBUF_UNLOCK(&so->so_snd);
				if (so->so_proto->pr_flags & PR_CONNREQUIRED)
					error = ENOTCONN;
				else
					error = EDESTADDRREQ;
				goto release;
			}
		}
		space = sbspace(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if ((atomic && resid > so->so_snd.sb_hiwat) ||
		    clen > so->so_snd.sb_hiwat) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = EMSGSIZE;
			goto release;
		}
		if (space < resid + clen &&
		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
			if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) {
				SOCKBUF_UNLOCK(&so->so_snd);
				error = EWOULDBLOCK;
				goto release;
			}
			error = sbwait(&so->so_snd);
			SOCKBUF_UNLOCK(&so->so_snd);
			if (error)
				goto release;
			goto restart;
		}
		SOCKBUF_UNLOCK(&so->so_snd);
		space -= clen;
		do {
			if (uio == NULL) {
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else {
#ifdef ZERO_COPY_SOCKETS
				error = sosend_copyin(uio, &top, atomic,
				    &space, flags);
				if (error != 0)
					goto release;
#else
				/*
				 * Copy the data from userland into a mbuf
				 * chain.  If no data is to be copied in,
				 * a single empty mbuf is returned.
				 */
				top = m_uiotombuf(uio, M_WAITOK, space,
				    (atomic ? max_hdr : 0),
				    (atomic ? M_PKTHDR : 0) |
				    ((flags & MSG_EOR) ? M_EOR : 0));
				if (top == NULL) {
					error = EFAULT; /* only possible error */
					goto release;
				}
				space -= resid - uio->uio_resid;
#endif
				resid = uio->uio_resid;
			}
			if (dontroute) {
				SOCK_LOCK(so);
				so->so_options |= SO_DONTROUTE;
				SOCK_UNLOCK(so);
			}
			/*
			 * XXX all the SBS_CANTSENDMORE checks previously
			 * done could be out of date.  We could have received
			 * a reset packet in an interrupt or maybe we slept
			 * while doing page faults in uiomove() etc.  We
			 * could probably recheck again inside the locking
			 * protection here, but there are probably other
			 * places that this also happens.  We must rethink
			 * this.
			 */
			error = (*so->so_proto->pr_usrreqs->pru_send)(so,
			    (flags & MSG_OOB) ? PRUS_OOB :
			/*
			 * If the user set MSG_EOF, the protocol understands
			 * this flag, and there is nothing left to send, then
			 * use PRUS_EOF instead of a plain send.
			 */
			    ((flags & MSG_EOF) &&
			     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
			     (resid <= 0)) ?
				PRUS_EOF :
			/* If there is more to send set PRUS_MORETOCOME. */
			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
			    top, addr, control, td);
			if (dontroute) {
				SOCK_LOCK(so);
				so->so_options &= ~SO_DONTROUTE;
				SOCK_UNLOCK(so);
			}
			clen = 0;
			control = NULL;
			top = NULL;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	sbunlock(&so->so_snd);
out:
	if (top != NULL)
		m_freem(top);
	if (control != NULL)
		m_freem(control);
	return (error);
}

int
sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{

	return (so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top,
	    control, flags, td));
}
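
/*
 * Example (an illustrative sketch): an in-kernel caller can send a buffer
 * through sosend() by describing it with a uio.  This assumes a kernel-space
 * buffer "buf" of length "len" and the calling thread "td":
 *
 *	struct uio auio;
 *	struct iovec aiov;
 *
 *	aiov.iov_base = buf;
 *	aiov.iov_len = len;
 *	auio.uio_iov = &aiov;
 *	auio.uio_iovcnt = 1;
 *	auio.uio_offset = 0;
 *	auio.uio_resid = len;
 *	auio.uio_segflg = UIO_SYSSPACE;
 *	auio.uio_rw = UIO_WRITE;
 *	auio.uio_td = td;
 *	error = sosend(so, NULL, &auio, NULL, NULL, 0, td);
 *
 * On return, len - auio.uio_resid bytes were handed to the protocol.
 */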

/*
 * The part of soreceive() that implements reading non-inline out-of-band
 * data from a socket.  For more complete comments, see soreceive(), from
 * which this code originated.
 *
 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
 * unable to return an mbuf chain to the caller.
 */
static int
soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
{
	struct protosw *pr = so->so_proto;
	struct mbuf *m;
	int error;

	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));

	m = m_get(M_WAIT, MT_DATA);
	error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
	if (error)
		goto bad;
	do {
#ifdef ZERO_COPY_SOCKETS
		if (so_zero_copy_receive) {
			int disposable;

			if ((m->m_flags & M_EXT) &&
			    (m->m_ext.ext_type == EXT_DISPOSABLE))
				disposable = 1;
			else
				disposable = 0;

			error = uiomoveco(mtod(m, void *),
			    min(uio->uio_resid, m->m_len), uio, disposable);
		} else
#endif /* ZERO_COPY_SOCKETS */
		error = uiomove(mtod(m, void *),
		    (int) min(uio->uio_resid, m->m_len), uio);
		m = m_free(m);
	} while (uio->uio_resid && error == 0 && m);
bad:
	if (m != NULL)
		m_freem(m);
	return (error);
}

/*
 * Following replacement or removal of the first mbuf on the first mbuf chain
 * of a socket buffer, push necessary state changes back into the socket
 * buffer so that other consumers see the values consistently.  'nextrecord'
 * is the caller's locally stored value of the original value of
 * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
 * NOTE: 'nextrecord' may be NULL.
 */
static __inline void
sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
{

	SOCKBUF_LOCK_ASSERT(sb);
	/*
	 * First, update for the new value of nextrecord.  If necessary, make
	 * it the first record.
	 */
	if (sb->sb_mb != NULL)
		sb->sb_mb->m_nextpkt = nextrecord;
	else
		sb->sb_mb = nextrecord;

	/*
	 * Now update any dependent socket buffer fields to reflect the new
	 * state.  This is an expanded inline of SB_EMPTY_FIXUP(), with the
	 * addition of a second clause that takes care of the case where
	 * sb_mb has been updated, but remains the last record.
	 */
	if (sb->sb_mb == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (sb->sb_mb->m_nextpkt == NULL)
		sb->sb_lastrecord = sb->sb_mb;
}

/*
 * Implement receive operations on a socket.  We depend on the way that
 * records are added to the sockbuf by sbappend.  In particular, each record
 * (mbufs linked through m_next) must begin with an address if the protocol
 * so specifies, followed by an optional mbuf or mbufs containing ancillary
 * data, and then zero or more mbufs of data.  In order to allow parallelism
 * between network receive and copying to user space, as well as avoid
 * sleeping with a mutex held, we release the socket buffer mutex during the
 * user space copy.  Although the sockbuf is locked, new data may still be
 * appended, and thus we must maintain consistency of the sockbuf during that
 * time.
 *
 * The caller may receive the data as a single mbuf chain by supplying an
 * mbuf **mp0 for use in returning the chain.  The uio is then used only for
 * the count in uio_resid.
 */
int
soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	struct mbuf *m, **mp;
	int flags, len, error, offset;
	struct protosw *pr = so->so_proto;
	struct mbuf *nextrecord;
	int moff, type = 0;
	int orig_resid = uio->uio_resid;

	mp = mp0;
	if (psa != NULL)
		*psa = NULL;
	if (controlp != NULL)
		*controlp = NULL;
	if (flagsp != NULL)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB)
		return (soreceive_rcvoob(so, uio, flags));
	if (mp != NULL)
		*mp = NULL;
	if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING)
	    && uio->uio_resid)
		(*pr->pr_usrreqs->pru_rcvd)(so, 0);

	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
	if (error)
		return (error);

restart:
	SOCKBUF_LOCK(&so->so_rcv);
	m = so->so_rcv.sb_mb;
	/*
	 * If we have less data than requested, block awaiting more (subject
	 * to any timeout) if:
	 *   1. the current count is less than the low water mark, or
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat),
	 * and in either case only if:
	 *   3. MSG_DONTWAIT is not set.
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning a
	 * short count if a timeout or signal occurs after we start.
	 */
	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < uio->uio_resid) &&
	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
		KASSERT(m != NULL || !so->so_rcv.sb_cc,
		    ("receive: m == %p so->so_rcv.sb_cc == %u",
		    m, so->so_rcv.sb_cc));
		if (so->so_error) {
			if (m != NULL)
				goto dontblock;
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			SOCKBUF_UNLOCK(&so->so_rcv);
			goto release;
		}
		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
		if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
			if (m == NULL) {
				SOCKBUF_UNLOCK(&so->so_rcv);
				goto release;
			} else
				goto dontblock;
		}
		for (; m != NULL; m = m->m_next)
			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			SOCKBUF_UNLOCK(&so->so_rcv);
			error = ENOTCONN;
			goto release;
		}
		if (uio->uio_resid == 0) {
			SOCKBUF_UNLOCK(&so->so_rcv);
			goto release;
		}
		if ((so->so_state & SS_NBIO) ||
		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
			SOCKBUF_UNLOCK(&so->so_rcv);
			error = EWOULDBLOCK;
			goto release;
		}
		SBLASTRECORDCHK(&so->so_rcv);
		SBLASTMBUFCHK(&so->so_rcv);
		error = sbwait(&so->so_rcv);
		SOCKBUF_UNLOCK(&so->so_rcv);
		if (error)
			goto release;
		goto restart;
	}
dontblock:
	/*
	 * From this point onward, we maintain 'nextrecord' as a cache of the
	 * pointer to the next record in the socket buffer.  We must keep the
	 * various socket buffer pointers and local stack versions of the
	 * pointers in sync, pushing out modifications before dropping the
	 * socket buffer mutex, and re-reading them when picking it up.
	 *
	 * Otherwise, we will race with the network stack appending new data
	 * or records onto the socket buffer by using inconsistent/stale
	 * versions of the field, possibly resulting in socket buffer
	 * corruption.
	 *
	 * By holding the high-level sblock(), we prevent simultaneous
	 * readers from pulling off the front of the socket buffer.
	 */
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	if (uio->uio_td)
		uio->uio_td->td_ru.ru_msgrcv++;
	KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
	SBLASTRECORDCHK(&so->so_rcv);
	SBLASTMBUFCHK(&so->so_rcv);
	nextrecord = m->m_nextpkt;
	if (pr->pr_flags & PR_ADDR) {
		KASSERT(m->m_type == MT_SONAME,
		    ("m->m_type == %d", m->m_type));
		orig_resid = 0;
		if (psa != NULL)
			*psa = sodupsockaddr(mtod(m, struct sockaddr *),
			    M_NOWAIT);
		if (flags & MSG_PEEK) {
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			so->so_rcv.sb_mb = m_free(m);
			m = so->so_rcv.sb_mb;
			sockbuf_pushsync(&so->so_rcv, nextrecord);
		}
	}

	/*
	 * Process one or more MT_CONTROL mbufs present before any data mbufs
	 * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
	 * just copy the data; if !MSG_PEEK, we call into the protocol to
	 * perform externalization (or freeing if controlp == NULL).
	 */
	if (m != NULL && m->m_type == MT_CONTROL) {
		struct mbuf *cm = NULL, *cmn;
		struct mbuf **cme = &cm;

		do {
			if (flags & MSG_PEEK) {
				if (controlp != NULL) {
					*controlp = m_copy(m, 0, m->m_len);
					controlp = &(*controlp)->m_next;
				}
				m = m->m_next;
			} else {
				sbfree(&so->so_rcv, m);
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = NULL;
				*cme = m;
				cme = &(*cme)->m_next;
				m = so->so_rcv.sb_mb;
			}
		} while (m != NULL && m->m_type == MT_CONTROL);
		if ((flags & MSG_PEEK) == 0)
			sockbuf_pushsync(&so->so_rcv, nextrecord);
		while (cm != NULL) {
			cmn = cm->m_next;
			cm->m_next = NULL;
			if (pr->pr_domain->dom_externalize != NULL) {
				SOCKBUF_UNLOCK(&so->so_rcv);
				error = (*pr->pr_domain->dom_externalize)
				    (cm, controlp);
				SOCKBUF_LOCK(&so->so_rcv);
			} else if (controlp != NULL)
				*controlp = cm;
			else
				m_freem(cm);
			if (controlp != NULL) {
				orig_resid = 0;
				while (*controlp != NULL)
					controlp = &(*controlp)->m_next;
			}
			cm = cmn;
		}
		if (m != NULL)
			nextrecord = so->so_rcv.sb_mb->m_nextpkt;
		else
			nextrecord = so->so_rcv.sb_mb;
		orig_resid = 0;
	}
	if (m != NULL) {
		if ((flags & MSG_PEEK) == 0) {
			KASSERT(m->m_nextpkt == nextrecord,
			    ("soreceive: post-control, nextrecord !sync"));
			if (nextrecord == NULL) {
				KASSERT(so->so_rcv.sb_mb == m,
				    ("soreceive: post-control, sb_mb!=m"));
				KASSERT(so->so_rcv.sb_lastrecord == m,
				    ("soreceive: post-control, lastrecord!=m"));
			}
		}
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
	} else {
		if ((flags & MSG_PEEK) == 0) {
			KASSERT(so->so_rcv.sb_mb == nextrecord,
			    ("soreceive: sb_mb != nextrecord"));
			if (so->so_rcv.sb_mb == NULL) {
				KASSERT(so->so_rcv.sb_lastrecord == NULL,
				    ("soreceive: sb_lastrecord != NULL"));
			}
		}
	}
1611	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1612	SBLASTRECORDCHK(&so->so_rcv);
1613	SBLASTMBUFCHK(&so->so_rcv);
1614
1615	/*
1616	 * Now continue to read any data mbufs off of the head of the socket
1617	 * buffer until the read request is satisfied.  Note that 'type' is
1618	 * used to store the type of any mbuf reads that have happened so far
1619	 * such that soreceive() can stop reading if the type changes, which
1620	 * causes soreceive() to return only one of regular data and inline
1621	 * out-of-band data in a single socket receive operation.
1622	 */
1623	moff = 0;
1624	offset = 0;
1625	while (m != NULL && uio->uio_resid > 0 && error == 0) {
1626		/*
1627		 * If the type of mbuf has changed since the last mbuf
1628		 * examined ('type'), end the receive operation.
1629	 	 */
1630		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1631		if (m->m_type == MT_OOBDATA) {
1632			if (type != MT_OOBDATA)
1633				break;
1634		} else if (type == MT_OOBDATA)
1635			break;
1636		else
1637		    KASSERT(m->m_type == MT_DATA,
1638			("m->m_type == %d", m->m_type));
1639		so->so_rcv.sb_state &= ~SBS_RCVATMARK;
1640		len = uio->uio_resid;
1641		if (so->so_oobmark && len > so->so_oobmark - offset)
1642			len = so->so_oobmark - offset;
1643		if (len > m->m_len - moff)
1644			len = m->m_len - moff;
1645		/*
1646		 * If mp is set, just pass back the mbufs.  Otherwise copy
1647		 * them out via the uio, then free.  Sockbuf must be
1648		 * consistent here (points to current mbuf, it points to next
1649		 * record) when we drop priority; we must note any additions
1650		 * to the sockbuf when we block interrupts again.
1651		 */
1652		if (mp == NULL) {
1653			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1654			SBLASTRECORDCHK(&so->so_rcv);
1655			SBLASTMBUFCHK(&so->so_rcv);
1656			SOCKBUF_UNLOCK(&so->so_rcv);
1657#ifdef ZERO_COPY_SOCKETS
1658			if (so_zero_copy_receive) {
1659				int disposable;
1660
1661				if ((m->m_flags & M_EXT)
1662				 && (m->m_ext.ext_type == EXT_DISPOSABLE))
1663					disposable = 1;
1664				else
1665					disposable = 0;
1666
1667				error = uiomoveco(mtod(m, char *) + moff,
1668						  (int)len, uio,
1669						  disposable);
1670			} else
1671#endif /* ZERO_COPY_SOCKETS */
1672			error = uiomove(mtod(m, char *) + moff, (int)len, uio);
1673			SOCKBUF_LOCK(&so->so_rcv);
1674			if (error) {
1675				/*
1676				 * The MT_SONAME mbuf has already been removed
1677				 * from the record, so it is necessary to
1678				 * remove the data mbufs, if any, to preserve
1679				 * the invariant in the case of PR_ADDR that
1680				 * requires MT_SONAME mbufs at the head of
1681				 * each record.
1682				 */
1683				if (m && pr->pr_flags & PR_ATOMIC &&
1684				    ((flags & MSG_PEEK) == 0))
1685					(void)sbdroprecord_locked(&so->so_rcv);
1686				SOCKBUF_UNLOCK(&so->so_rcv);
1687				goto release;
1688			}
1689		} else
1690			uio->uio_resid -= len;
1691		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1692		if (len == m->m_len - moff) {
1693			if (m->m_flags & M_EOR)
1694				flags |= MSG_EOR;
1695			if (flags & MSG_PEEK) {
1696				m = m->m_next;
1697				moff = 0;
1698			} else {
1699				nextrecord = m->m_nextpkt;
1700				sbfree(&so->so_rcv, m);
1701				if (mp != NULL) {
1702					*mp = m;
1703					mp = &m->m_next;
1704					so->so_rcv.sb_mb = m = m->m_next;
1705					*mp = NULL;
1706				} else {
1707					so->so_rcv.sb_mb = m_free(m);
1708					m = so->so_rcv.sb_mb;
1709				}
1710				sockbuf_pushsync(&so->so_rcv, nextrecord);
1711				SBLASTRECORDCHK(&so->so_rcv);
1712				SBLASTMBUFCHK(&so->so_rcv);
1713			}
1714		} else {
1715			if (flags & MSG_PEEK)
1716				moff += len;
1717			else {
1718				if (mp != NULL) {
1719					int copy_flag;
1720
1721					if (flags & MSG_DONTWAIT)
1722						copy_flag = M_DONTWAIT;
1723					else
1724						copy_flag = M_WAIT;
1725					if (copy_flag == M_WAIT)
1726						SOCKBUF_UNLOCK(&so->so_rcv);
1727					*mp = m_copym(m, 0, len, copy_flag);
1728					if (copy_flag == M_WAIT)
1729						SOCKBUF_LOCK(&so->so_rcv);
1730					if (*mp == NULL) {
1731						/*
1732						 * m_copym() couldn't
1733						 * allocate an mbuf.  Adjust
1734						 * uio_resid back (it was
1735						 * adjusted down by len
1736						 * bytes, which we didn't end
1737						 * up "copying" over).
1738						 */
1739						uio->uio_resid += len;
1740						break;
1741					}
1742				}
1743				m->m_data += len;
1744				m->m_len -= len;
1745				so->so_rcv.sb_cc -= len;
1746			}
1747		}
1748		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1749		if (so->so_oobmark) {
1750			if ((flags & MSG_PEEK) == 0) {
1751				so->so_oobmark -= len;
1752				if (so->so_oobmark == 0) {
1753					so->so_rcv.sb_state |= SBS_RCVATMARK;
1754					break;
1755				}
1756			} else {
1757				offset += len;
1758				if (offset == so->so_oobmark)
1759					break;
1760			}
1761		}
1762		if (flags & MSG_EOR)
1763			break;
1764		/*
1765		 * If the MSG_WAITALL flag is set (for non-atomic socket), we
1766		 * must not quit until "uio->uio_resid == 0" or an error
1767		 * termination.  If a signal/timeout occurs, return with a
1768		 * short count but without error.  Keep sockbuf locked
1769		 * against other readers.
1770		 */
1771		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
1772		    !sosendallatonce(so) && nextrecord == NULL) {
1773			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1774			if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE)
1775				break;
1776			/*
1777			 * Notify the protocol that some data has been
1778			 * drained before blocking.
1779			 */
1780			if (pr->pr_flags & PR_WANTRCVD) {
1781				SOCKBUF_UNLOCK(&so->so_rcv);
1782				(*pr->pr_usrreqs->pru_rcvd)(so, flags);
1783				SOCKBUF_LOCK(&so->so_rcv);
1784			}
1785			SBLASTRECORDCHK(&so->so_rcv);
1786			SBLASTMBUFCHK(&so->so_rcv);
1787			error = sbwait(&so->so_rcv);
1788			if (error) {
1789				SOCKBUF_UNLOCK(&so->so_rcv);
1790				goto release;
1791			}
1792			m = so->so_rcv.sb_mb;
1793			if (m != NULL)
1794				nextrecord = m->m_nextpkt;
1795		}
1796	}
1797
1798	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1799	if (m != NULL && pr->pr_flags & PR_ATOMIC) {
1800		flags |= MSG_TRUNC;
1801		if ((flags & MSG_PEEK) == 0)
1802			(void) sbdroprecord_locked(&so->so_rcv);
1803	}
1804	if ((flags & MSG_PEEK) == 0) {
1805		if (m == NULL) {
1806			/*
1807			 * First part is an inline SB_EMPTY_FIXUP().  Second
1808			 * part makes sure sb_lastrecord is up-to-date if
1809			 * there is still data in the socket buffer.
1810			 */
1811			so->so_rcv.sb_mb = nextrecord;
1812			if (so->so_rcv.sb_mb == NULL) {
1813				so->so_rcv.sb_mbtail = NULL;
1814				so->so_rcv.sb_lastrecord = NULL;
1815			} else if (nextrecord->m_nextpkt == NULL)
1816				so->so_rcv.sb_lastrecord = nextrecord;
1817		}
1818		SBLASTRECORDCHK(&so->so_rcv);
1819		SBLASTMBUFCHK(&so->so_rcv);
1820		/*
1821		 * If soreceive() is being done from the socket callback, we
1822		 * don't need to generate an ACK to the peer to update the
1823		 * window, since an ACK will be generated on return to TCP.
1824		 */
1825		if (!(flags & MSG_SOCALLBCK) &&
1826		    (pr->pr_flags & PR_WANTRCVD)) {
1827			SOCKBUF_UNLOCK(&so->so_rcv);
1828			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
1829			SOCKBUF_LOCK(&so->so_rcv);
1830		}
1831	}
1832	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1833	if (orig_resid == uio->uio_resid && orig_resid &&
1834	    (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
1835		SOCKBUF_UNLOCK(&so->so_rcv);
1836		goto restart;
1837	}
1838	SOCKBUF_UNLOCK(&so->so_rcv);
1839
1840	if (flagsp != NULL)
1841		*flagsp |= flags;
1842release:
1843	sbunlock(&so->so_rcv);
1844	return (error);
1845}
1846
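/*
 * Editorial example (not part of the original file): a minimal userspace
 * sketch of the MSG_WAITALL and MSG_PEEK semantics implemented by
 * soreceive_generic() above.  'fd' is assumed to be a connected stream
 * socket; error handling is abbreviated.
 */
#if 0
#include <sys/types.h>
#include <sys/socket.h>

static ssize_t
read_exact(int fd, void *buf, size_t len)
{
	/*
	 * MSG_WAITALL keeps soreceive() looping until the full request
	 * is satisfied; a short count is still possible on a signal,
	 * timeout, EOF, or error.
	 */
	return (recv(fd, buf, len, MSG_WAITALL));
}

static ssize_t
peek_header(int fd, void *hdr, size_t hdrlen)
{
	/*
	 * MSG_PEEK copies data out without removing it from the socket
	 * buffer, so a subsequent recv() sees the same bytes again.
	 */
	return (recv(fd, hdr, hdrlen, MSG_PEEK));
}
#endif
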
1847/*
1848 * Optimized version of soreceive() for simple datagram cases from userspace.
1849 * Unlike in the stream case, we're able to drop a datagram if copyout()
1850 * fails, and because we handle datagrams atomically, we don't need to use a
1851 * sleep lock to prevent I/O interlacing.
1852 */
1853int
1854soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio,
1855    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1856{
1857	struct mbuf *m, *m2;
1858	int flags, len, error, offset;
1859	struct protosw *pr = so->so_proto;
1860	struct mbuf *nextrecord;
1861
1862	if (psa != NULL)
1863		*psa = NULL;
1864	if (controlp != NULL)
1865		*controlp = NULL;
1866	if (flagsp != NULL)
1867		flags = *flagsp &~ MSG_EOR;
1868	else
1869		flags = 0;
1870
1871	/*
1872	 * For any complicated cases, fall back to the full
1873	 * soreceive_generic().
1874	 */
1875	if (mp0 != NULL || (flags & MSG_PEEK) || (flags & MSG_OOB))
1876		return (soreceive_generic(so, psa, uio, mp0, controlp,
1877		    flagsp));
1878
1879	/*
1880	 * Enforce restrictions on use.
1881	 */
1882	KASSERT((pr->pr_flags & PR_WANTRCVD) == 0,
1883	    ("soreceive_dgram: wantrcvd"));
1884	KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic"));
1885	KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0,
1886	    ("soreceive_dgram: SBS_RCVATMARK"));
1887	KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0,
1888	    ("soreceive_dgram: PR_CONNREQUIRED"));
1889
1890	/*
1891	 * Loop blocking while waiting for a datagram.
1892	 */
1893	SOCKBUF_LOCK(&so->so_rcv);
1894	while ((m = so->so_rcv.sb_mb) == NULL) {
1895		KASSERT(so->so_rcv.sb_cc == 0,
1896		    ("soreceive_dgram: sb_mb NULL but sb_cc %u",
1897		    so->so_rcv.sb_cc));
1898		if (so->so_error) {
1899			error = so->so_error;
1900			so->so_error = 0;
1901			SOCKBUF_UNLOCK(&so->so_rcv);
1902			return (error);
1903		}
1904		if (so->so_rcv.sb_state & SBS_CANTRCVMORE ||
1905		    uio->uio_resid == 0) {
1906			SOCKBUF_UNLOCK(&so->so_rcv);
1907			return (0);
1908		}
1909		if ((so->so_state & SS_NBIO) ||
1910		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
1911			SOCKBUF_UNLOCK(&so->so_rcv);
1912			return (EWOULDBLOCK);
1913		}
1914		SBLASTRECORDCHK(&so->so_rcv);
1915		SBLASTMBUFCHK(&so->so_rcv);
1916		error = sbwait(&so->so_rcv);
1917		if (error) {
1918			SOCKBUF_UNLOCK(&so->so_rcv);
1919			return (error);
1920		}
1921	}
1922	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1923
1924	if (uio->uio_td)
1925		uio->uio_td->td_ru.ru_msgrcv++;
1926	SBLASTRECORDCHK(&so->so_rcv);
1927	SBLASTMBUFCHK(&so->so_rcv);
1928	nextrecord = m->m_nextpkt;
1929	if (nextrecord == NULL) {
1930		KASSERT(so->so_rcv.sb_lastrecord == m,
1931		    ("soreceive_dgram: lastrecord != m"));
1932	}
1933
1934	KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord,
1935	    ("soreceive_dgram: m_nextpkt != nextrecord"));
1936
1937	/*
1938	 * Pull 'm' and its chain off the front of the packet queue.
1939	 */
1940	so->so_rcv.sb_mb = NULL;
1941	sockbuf_pushsync(&so->so_rcv, nextrecord);
1942
1943	/*
1944	 * Walk 'm's chain and free that many bytes from the socket buffer.
1945	 */
1946	for (m2 = m; m2 != NULL; m2 = m2->m_next)
1947		sbfree(&so->so_rcv, m2);
1948
1949	/*
1950	 * Do a few last checks before we let go of the lock.
1951	 */
1952	SBLASTRECORDCHK(&so->so_rcv);
1953	SBLASTMBUFCHK(&so->so_rcv);
1954	SOCKBUF_UNLOCK(&so->so_rcv);
1955
1956	if (pr->pr_flags & PR_ADDR) {
1957		KASSERT(m->m_type == MT_SONAME,
1958		    ("m->m_type == %d", m->m_type));
1959		if (psa != NULL)
1960			*psa = sodupsockaddr(mtod(m, struct sockaddr *),
1961			    M_NOWAIT);
1962		m = m_free(m);
1963	}
1964	if (m == NULL) {
1965		/* XXXRW: Can this happen? */
1966		return (0);
1967	}
1968
1969	/*
1970	 * Packet to copyout() is now in 'm' and it is disconnected from the
1971	 * queue.
1972	 *
1973	 * Process one or more MT_CONTROL mbufs present before any data mbufs
1974	 * in the first mbuf chain on the socket buffer.  We call into the
1975	 * protocol to perform externalization (or freeing if controlp ==
1976	 * NULL).
1977	 */
1978	if (m->m_type == MT_CONTROL) {
1979		struct mbuf *cm = NULL, *cmn;
1980		struct mbuf **cme = &cm;
1981
1982		do {
1983			m2 = m->m_next;
1984			m->m_next = NULL;
1985			*cme = m;
1986			cme = &(*cme)->m_next;
1987			m = m2;
1988		} while (m != NULL && m->m_type == MT_CONTROL);
1989		while (cm != NULL) {
1990			cmn = cm->m_next;
1991			cm->m_next = NULL;
1992			if (pr->pr_domain->dom_externalize != NULL) {
1993				error = (*pr->pr_domain->dom_externalize)
1994				    (cm, controlp);
1995			} else if (controlp != NULL)
1996				*controlp = cm;
1997			else
1998				m_freem(cm);
1999			if (controlp != NULL) {
2000				while (*controlp != NULL)
2001					controlp = &(*controlp)->m_next;
2002			}
2003			cm = cmn;
2004		}
2005	}
2006	KASSERT(m->m_type == MT_DATA, ("soreceive_dgram: !data"));
2007
2008	offset = 0;
2009	while (m != NULL && uio->uio_resid > 0) {
2010		len = uio->uio_resid;
2011		if (len > m->m_len)
2012			len = m->m_len;
2013		error = uiomove(mtod(m, char *), (int)len, uio);
2014		if (error) {
2015			m_freem(m);
2016			return (error);
2017		}
2018		m = m_free(m);
2019	}
2020	if (m != NULL)
2021		flags |= MSG_TRUNC;
2022	m_freem(m);
2023	if (flagsp != NULL)
2024		*flagsp |= flags;
2025	return (0);
2026}
2027
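/*
 * Editorial example (not part of the original file): observing the
 * MSG_TRUNC flag that soreceive_dgram() sets when a datagram does not fit
 * in the caller's buffer.  'fd' is assumed to be a bound datagram socket.
 */
#if 0
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <stdio.h>

static void
recv_one_datagram(int fd)
{
	char buf[512];
	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
	struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1 };
	ssize_t n;

	n = recvmsg(fd, &msg, 0);
	if (n >= 0 && (msg.msg_flags & MSG_TRUNC))
		/* The tail of the datagram was discarded, as above. */
		printf("datagram truncated to %zd bytes\n", n);
}
#endif
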
2028int
2029soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
2030    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2031{
2032
2033	return (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio, mp0,
2034	    controlp, flagsp));
2035}
2036
2037int
2038soshutdown(struct socket *so, int how)
2039{
2040	struct protosw *pr = so->so_proto;
2041
2042	if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
2043		return (EINVAL);
2044	if (pr->pr_usrreqs->pru_flush != NULL) {
2045		(*pr->pr_usrreqs->pru_flush)(so, how);
2046	}
2047	if (how != SHUT_WR)
2048		sorflush(so);
2049	if (how != SHUT_RD)
2050		return ((*pr->pr_usrreqs->pru_shutdown)(so));
2051	return (0);
2052}
2053
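/*
 * Editorial example (not part of the original file): the half-close
 * pattern served by soshutdown().  SHUT_WR invokes pru_shutdown() (a FIN
 * for TCP) while the receive side stays readable until EOF.
 */
#if 0
#include <sys/types.h>
#include <sys/socket.h>
#include <unistd.h>

static void
request_and_drain(int fd, const void *req, size_t reqlen)
{
	char buf[1024];

	(void)send(fd, req, reqlen, 0);
	(void)shutdown(fd, SHUT_WR);	/* no more writes; reads still OK */
	while (read(fd, buf, sizeof(buf)) > 0)
		;			/* consume until the peer closes */
}
#endif
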
2054void
2055sorflush(struct socket *so)
2056{
2057	struct sockbuf *sb = &so->so_rcv;
2058	struct protosw *pr = so->so_proto;
2059	struct sockbuf asb;
2060
2061	/*
2062	 * In order to avoid calling dom_dispose with the socket buffer mutex
2063	 * held, and in order to generally avoid holding the lock for a long
2064	 * time, we make a copy of the socket buffer and clear the original
2065	 * (except locks, state).  The new socket buffer copy won't have
2066	 * initialized locks so we can only call routines that won't use or
2067	 * assert those locks.
2068	 *
2069	 * Dislodge threads currently blocked in receive and wait to acquire
2070	 * a lock against other simultaneous readers before clearing the
2071	 * socket buffer.  Don't let our acquire be interrupted by a signal
2072	 * despite any existing socket disposition on interruptable waiting.
2073	 */
2074	socantrcvmore(so);
2075	(void) sblock(sb, SBL_WAIT | SBL_NOINTR);
2076
2077	/*
2078	 * Invalidate/clear most of the sockbuf structure, but leave selinfo
2079	 * and mutex data unchanged.
2080	 */
2081	SOCKBUF_LOCK(sb);
2082	bzero(&asb, offsetof(struct sockbuf, sb_startzero));
2083	bcopy(&sb->sb_startzero, &asb.sb_startzero,
2084	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
2085	bzero(&sb->sb_startzero,
2086	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
2087	SOCKBUF_UNLOCK(sb);
2088	sbunlock(sb);
2089
2090	/*
2091	 * Dispose of special rights and flush the socket buffer.  Don't call
2092	 * any unsafe routines (that rely on locks being initialized) on asb.
2093	 */
2094	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
2095		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
2096	sbrelease_internal(&asb, so);
2097}
2098
2099/*
2100 * Perhaps this routine, and sooptcopyout(), below, ought to come in an
2101 * additional variant to handle the case where the option value needs to be
2102 * some kind of integer, but not a specific size.  In addition to their use
2103 * here, these functions are also called by the protocol-level pr_ctloutput()
2104 * routines.
2105 */
2106int
2107sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
2108{
2109	size_t	valsize;
2110
2111	/*
2112	 * If the user gives us more than we wanted, we ignore it, but if we
2113	 * don't get the minimum length the caller wants, we return EINVAL.
2114	 * On success, sopt->sopt_valsize is set to however much we actually
2115	 * retrieved.
2116	 */
2117	if ((valsize = sopt->sopt_valsize) < minlen)
2118		return (EINVAL);
2119	if (valsize > len)
2120		sopt->sopt_valsize = valsize = len;
2121
2122	if (sopt->sopt_td != NULL)
2123		return (copyin(sopt->sopt_val, buf, valsize));
2124
2125	bcopy(sopt->sopt_val, buf, valsize);
2126	return (0);
2127}
2128
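/*
 * Editorial example (not part of the original file): a sketch of how a
 * protocol's pr_ctloutput() handler typically uses sooptcopyin() under
 * the conventions above.  'foo_setflag' is a hypothetical name invented
 * for illustration.
 */
#if 0
static int
foo_setflag(struct socket *so, struct sockopt *sopt)
{
	int error, optval;

	/* Require at least sizeof(int); extra input bytes are ignored. */
	error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
	if (error)
		return (error);
	/* ... apply optval to protocol state under the proper lock ... */
	return (0);
}
#endif
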
2129/*
2130 * Kernel version of setsockopt(2).
2131 *
2132 * XXX: optlen is size_t, not socklen_t
2133 */
2134int
2135so_setsockopt(struct socket *so, int level, int optname, void *optval,
2136    size_t optlen)
2137{
2138	struct sockopt sopt;
2139
2140	sopt.sopt_level = level;
2141	sopt.sopt_name = optname;
2142	sopt.sopt_dir = SOPT_SET;
2143	sopt.sopt_val = optval;
2144	sopt.sopt_valsize = optlen;
2145	sopt.sopt_td = NULL;
2146	return (sosetopt(so, &sopt));
2147}
2148
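/*
 * Editorial example (not part of the original file): with
 * so_setsockopt(), in-kernel consumers need not build a struct sockopt
 * by hand.  'enable_keepalive' is a hypothetical helper.
 */
#if 0
static int
enable_keepalive(struct socket *so)
{
	int one = 1;

	/* The kernel analogue of setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE). */
	return (so_setsockopt(so, SOL_SOCKET, SO_KEEPALIVE, &one,
	    sizeof(one)));
}
#endif
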
2149int
2150sosetopt(struct socket *so, struct sockopt *sopt)
2151{
2152	int	error, optval;
2153	struct	linger l;
2154	struct	timeval tv;
2155	u_long  val;
2156#ifdef MAC
2157	struct mac extmac;
2158#endif
2159
2160	error = 0;
2161	if (sopt->sopt_level != SOL_SOCKET) {
2162		if (so->so_proto && so->so_proto->pr_ctloutput)
2163			return ((*so->so_proto->pr_ctloutput)
2164				  (so, sopt));
2165		error = ENOPROTOOPT;
2166	} else {
2167		switch (sopt->sopt_name) {
2168#ifdef INET
2169		case SO_ACCEPTFILTER:
2170			error = do_setopt_accept_filter(so, sopt);
2171			if (error)
2172				goto bad;
2173			break;
2174#endif
2175		case SO_LINGER:
2176			error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
2177			if (error)
2178				goto bad;
2179
2180			SOCK_LOCK(so);
2181			so->so_linger = l.l_linger;
2182			if (l.l_onoff)
2183				so->so_options |= SO_LINGER;
2184			else
2185				so->so_options &= ~SO_LINGER;
2186			SOCK_UNLOCK(so);
2187			break;
2188
2189		case SO_DEBUG:
2190		case SO_KEEPALIVE:
2191		case SO_DONTROUTE:
2192		case SO_USELOOPBACK:
2193		case SO_BROADCAST:
2194		case SO_REUSEADDR:
2195		case SO_REUSEPORT:
2196		case SO_OOBINLINE:
2197		case SO_TIMESTAMP:
2198		case SO_BINTIME:
2199		case SO_NOSIGPIPE:
2200		case SO_NO_DDP:
2201		case SO_NO_OFFLOAD:
2202			error = sooptcopyin(sopt, &optval, sizeof optval,
2203					    sizeof optval);
2204			if (error)
2205				goto bad;
2206			SOCK_LOCK(so);
2207			if (optval)
2208				so->so_options |= sopt->sopt_name;
2209			else
2210				so->so_options &= ~sopt->sopt_name;
2211			SOCK_UNLOCK(so);
2212			break;
2213
2214		case SO_SETFIB:
2215			error = sooptcopyin(sopt, &optval, sizeof optval,
2216					    sizeof optval);
			if (error)
				goto bad;
2217			if (optval < 1 || optval > rt_numfibs) {
2218				error = EINVAL;
2219				goto bad;
2220			}
2221			if ((so->so_proto->pr_domain->dom_family == PF_INET) ||
2222			    (so->so_proto->pr_domain->dom_family == PF_ROUTE)) {
2223				so->so_fibnum = optval;
2224				/* Note: ignore error */
2225				if (so->so_proto && so->so_proto->pr_ctloutput)
2226					(*so->so_proto->pr_ctloutput)(so, sopt);
2227			} else {
2228				so->so_fibnum = 0;
2229			}
2230			break;
2231		case SO_SNDBUF:
2232		case SO_RCVBUF:
2233		case SO_SNDLOWAT:
2234		case SO_RCVLOWAT:
2235			error = sooptcopyin(sopt, &optval, sizeof optval,
2236					    sizeof optval);
2237			if (error)
2238				goto bad;
2239
2240			/*
2241			 * Values < 1 make no sense for any of these options,
2242			 * so disallow them.
2243			 */
2244			if (optval < 1) {
2245				error = EINVAL;
2246				goto bad;
2247			}
2248
2249			switch (sopt->sopt_name) {
2250			case SO_SNDBUF:
2251			case SO_RCVBUF:
2252				if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
2253				    &so->so_snd : &so->so_rcv, (u_long)optval,
2254				    so, curthread) == 0) {
2255					error = ENOBUFS;
2256					goto bad;
2257				}
2258				(sopt->sopt_name == SO_SNDBUF ? &so->so_snd :
2259				    &so->so_rcv)->sb_flags &= ~SB_AUTOSIZE;
2260				break;
2261
2262			/*
2263			 * Make sure the low-water is never greater than the
2264			 * high-water.
2265			 */
2266			case SO_SNDLOWAT:
2267				SOCKBUF_LOCK(&so->so_snd);
2268				so->so_snd.sb_lowat =
2269				    (optval > so->so_snd.sb_hiwat) ?
2270				    so->so_snd.sb_hiwat : optval;
2271				SOCKBUF_UNLOCK(&so->so_snd);
2272				break;
2273			case SO_RCVLOWAT:
2274				SOCKBUF_LOCK(&so->so_rcv);
2275				so->so_rcv.sb_lowat =
2276				    (optval > so->so_rcv.sb_hiwat) ?
2277				    so->so_rcv.sb_hiwat : optval;
2278				SOCKBUF_UNLOCK(&so->so_rcv);
2279				break;
2280			}
2281			break;
2282
2283		case SO_SNDTIMEO:
2284		case SO_RCVTIMEO:
2285#ifdef COMPAT_IA32
2286			if (SV_CURPROC_FLAG(SV_ILP32)) {
2287				struct timeval32 tv32;
2288
2289				error = sooptcopyin(sopt, &tv32, sizeof tv32,
2290				    sizeof tv32);
2291				CP(tv32, tv, tv_sec);
2292				CP(tv32, tv, tv_usec);
2293			} else
2294#endif
2295				error = sooptcopyin(sopt, &tv, sizeof tv,
2296				    sizeof tv);
2297			if (error)
2298				goto bad;
2299
2300			/* assert(hz > 0); */
2301			if (tv.tv_sec < 0 || tv.tv_sec > INT_MAX / hz ||
2302			    tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
2303				error = EDOM;
2304				goto bad;
2305			}
2306			/* assert(tick > 0); */
2307			/* assert(ULONG_MAX - INT_MAX >= 1000000); */
2308			val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick;
2309			if (val > INT_MAX) {
2310				error = EDOM;
2311				goto bad;
2312			}
2313			if (val == 0 && tv.tv_usec != 0)
2314				val = 1;
2315
2316			switch (sopt->sopt_name) {
2317			case SO_SNDTIMEO:
2318				so->so_snd.sb_timeo = val;
2319				break;
2320			case SO_RCVTIMEO:
2321				so->so_rcv.sb_timeo = val;
2322				break;
2323			}
2324			break;
2325
2326		case SO_LABEL:
2327#ifdef MAC
2328			error = sooptcopyin(sopt, &extmac, sizeof extmac,
2329			    sizeof extmac);
2330			if (error)
2331				goto bad;
2332			error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
2333			    so, &extmac);
2334#else
2335			error = EOPNOTSUPP;
2336#endif
2337			break;
2338
2339		default:
2340			error = ENOPROTOOPT;
2341			break;
2342		}
2343		if (error == 0 && so->so_proto != NULL &&
2344		    so->so_proto->pr_ctloutput != NULL) {
2345			(void) ((*so->so_proto->pr_ctloutput)
2346				  (so, sopt));
2347		}
2348	}
2349bad:
2350	return (error);
2351}
2352
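/*
 * Editorial example (not part of the original file): setting a receive
 * timeout from userspace.  As the SO_RCVTIMEO case above shows, the
 * timeval is converted to clock ticks, and values that do not fit that
 * representation are rejected with EDOM.
 */
#if 0
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/time.h>

static int
set_recv_timeout(int fd, time_t sec)
{
	struct timeval tv = { .tv_sec = sec, .tv_usec = 0 };

	return (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)));
}
#endif
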
2353/*
2354 * Helper routine for getsockopt.
2355 */
2356int
2357sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
2358{
2359	int	error;
2360	size_t	valsize;
2361
2362	error = 0;
2363
2364	/*
2365	 * Documented get behavior is that we always return a value, possibly
2366	 * truncated to fit in the user's buffer.  Traditional behavior is
2367	 * that we always tell the user precisely how much we copied, rather
2368	 * than something useful like the total amount we had available for
2369	 * her.  Note that this interface is not idempotent; the entire
2370	 * answer must generated ahead of time.
2371	 * answer must be generated ahead of time.
2372	valsize = min(len, sopt->sopt_valsize);
2373	sopt->sopt_valsize = valsize;
2374	if (sopt->sopt_val != NULL) {
2375		if (sopt->sopt_td != NULL)
2376			error = copyout(buf, sopt->sopt_val, valsize);
2377		else
2378			bcopy(buf, sopt->sopt_val, valsize);
2379	}
2380	return (error);
2381}
2382
2383int
2384sogetopt(struct socket *so, struct sockopt *sopt)
2385{
2386	int	error, optval;
2387	struct	linger l;
2388	struct	timeval tv;
2389#ifdef MAC
2390	struct mac extmac;
2391#endif
2392
2393	error = 0;
2394	if (sopt->sopt_level != SOL_SOCKET) {
2395		if (so->so_proto && so->so_proto->pr_ctloutput) {
2396			return ((*so->so_proto->pr_ctloutput)
2397				  (so, sopt));
2398		} else
2399			return (ENOPROTOOPT);
2400	} else {
2401		switch (sopt->sopt_name) {
2402#ifdef INET
2403		case SO_ACCEPTFILTER:
2404			error = do_getopt_accept_filter(so, sopt);
2405			break;
2406#endif
2407		case SO_LINGER:
2408			SOCK_LOCK(so);
2409			l.l_onoff = so->so_options & SO_LINGER;
2410			l.l_linger = so->so_linger;
2411			SOCK_UNLOCK(so);
2412			error = sooptcopyout(sopt, &l, sizeof l);
2413			break;
2414
2415		case SO_USELOOPBACK:
2416		case SO_DONTROUTE:
2417		case SO_DEBUG:
2418		case SO_KEEPALIVE:
2419		case SO_REUSEADDR:
2420		case SO_REUSEPORT:
2421		case SO_BROADCAST:
2422		case SO_OOBINLINE:
2423		case SO_ACCEPTCONN:
2424		case SO_TIMESTAMP:
2425		case SO_BINTIME:
2426		case SO_NOSIGPIPE:
2427			optval = so->so_options & sopt->sopt_name;
2428integer:
2429			error = sooptcopyout(sopt, &optval, sizeof optval);
2430			break;
2431
2432		case SO_TYPE:
2433			optval = so->so_type;
2434			goto integer;
2435
2436		case SO_ERROR:
2437			SOCK_LOCK(so);
2438			optval = so->so_error;
2439			so->so_error = 0;
2440			SOCK_UNLOCK(so);
2441			goto integer;
2442
2443		case SO_SNDBUF:
2444			optval = so->so_snd.sb_hiwat;
2445			goto integer;
2446
2447		case SO_RCVBUF:
2448			optval = so->so_rcv.sb_hiwat;
2449			goto integer;
2450
2451		case SO_SNDLOWAT:
2452			optval = so->so_snd.sb_lowat;
2453			goto integer;
2454
2455		case SO_RCVLOWAT:
2456			optval = so->so_rcv.sb_lowat;
2457			goto integer;
2458
2459		case SO_SNDTIMEO:
2460		case SO_RCVTIMEO:
2461			optval = (sopt->sopt_name == SO_SNDTIMEO ?
2462				  so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
2463
2464			tv.tv_sec = optval / hz;
2465			tv.tv_usec = (optval % hz) * tick;
2466#ifdef COMPAT_IA32
2467			if (SV_CURPROC_FLAG(SV_ILP32)) {
2468				struct timeval32 tv32;
2469
2470				CP(tv, tv32, tv_sec);
2471				CP(tv, tv32, tv_usec);
2472				error = sooptcopyout(sopt, &tv32, sizeof tv32);
2473			} else
2474#endif
2475				error = sooptcopyout(sopt, &tv, sizeof tv);
2476			break;
2477
2478		case SO_LABEL:
2479#ifdef MAC
2480			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2481			    sizeof(extmac));
2482			if (error)
2483				return (error);
2484			error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
2485			    so, &extmac);
2486			if (error)
2487				return (error);
2488			error = sooptcopyout(sopt, &extmac, sizeof extmac);
2489#else
2490			error = EOPNOTSUPP;
2491#endif
2492			break;
2493
2494		case SO_PEERLABEL:
2495#ifdef MAC
2496			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2497			    sizeof(extmac));
2498			if (error)
2499				return (error);
2500			error = mac_getsockopt_peerlabel(
2501			    sopt->sopt_td->td_ucred, so, &extmac);
2502			if (error)
2503				return (error);
2504			error = sooptcopyout(sopt, &extmac, sizeof extmac);
2505#else
2506			error = EOPNOTSUPP;
2507#endif
2508			break;
2509
2510		case SO_LISTENQLIMIT:
2511			optval = so->so_qlimit;
2512			goto integer;
2513
2514		case SO_LISTENQLEN:
2515			optval = so->so_qlen;
2516			goto integer;
2517
2518		case SO_LISTENINCQLEN:
2519			optval = so->so_incqlen;
2520			goto integer;
2521
2522		default:
2523			error = ENOPROTOOPT;
2524			break;
2525		}
2526		return (error);
2527	}
2528}
2529
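/*
 * Editorial example (not part of the original file): the SO_ERROR case
 * above both returns and clears so_error, which is how userspace fetches
 * the result of a non-blocking connect() once poll() reports the socket
 * writable.
 */
#if 0
#include <sys/types.h>
#include <sys/socket.h>

static int
connect_result(int fd)
{
	int err;
	socklen_t len = sizeof(err);

	if (getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len) == -1)
		return (-1);
	return (err);	/* 0 on success, otherwise the saved errno */
}
#endif
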
2530/* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */
2531int
2532soopt_getm(struct sockopt *sopt, struct mbuf **mp)
2533{
2534	struct mbuf *m, *m_prev;
2535	int sopt_size = sopt->sopt_valsize;
2536
2537	MGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT, MT_DATA);
2538	if (m == NULL)
2539		return ENOBUFS;
2540	if (sopt_size > MLEN) {
2541		MCLGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT);
2542		if ((m->m_flags & M_EXT) == 0) {
2543			m_free(m);
2544			return ENOBUFS;
2545		}
2546		m->m_len = min(MCLBYTES, sopt_size);
2547	} else {
2548		m->m_len = min(MLEN, sopt_size);
2549	}
2550	sopt_size -= m->m_len;
2551	*mp = m;
2552	m_prev = m;
2553
2554	while (sopt_size) {
2555		MGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT, MT_DATA);
2556		if (m == NULL) {
2557			m_freem(*mp);
2558			return ENOBUFS;
2559		}
2560		if (sopt_size > MLEN) {
2561			MCLGET(m, sopt->sopt_td != NULL ? M_WAIT :
2562			    M_DONTWAIT);
2563			if ((m->m_flags & M_EXT) == 0) {
2564				m_freem(m);
2565				m_freem(*mp);
2566				return ENOBUFS;
2567			}
2568			m->m_len = min(MCLBYTES, sopt_size);
2569		} else {
2570			m->m_len = min(MLEN, sopt_size);
2571		}
2572		sopt_size -= m->m_len;
2573		m_prev->m_next = m;
2574		m_prev = m;
2575	}
2576	return (0);
2577}
2578
2579/* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
2580int
2581soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
2582{
2583	struct mbuf *m0 = m;
2584
2585	if (sopt->sopt_val == NULL)
2586		return (0);
2587	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2588		if (sopt->sopt_td != NULL) {
2589			int error;
2590
2591			error = copyin(sopt->sopt_val, mtod(m, char *),
2592				       m->m_len);
2593			if (error != 0) {
2594				m_freem(m0);
2595				return (error);
2596			}
2597		} else
2598			bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
2599		sopt->sopt_valsize -= m->m_len;
2600		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2601		m = m->m_next;
2602	}
2603	if (m != NULL) /* enough space should be allocated by ip6_sooptmcopyin() */
2604		panic("ip6_sooptmcopyin");
2605	return (0);
2606}
2607
2608/* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
2609int
2610soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
2611{
2612	struct mbuf *m0 = m;
2613	size_t valsize = 0;
2614
2615	if (sopt->sopt_val == NULL)
2616		return (0);
2617	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2618		if (sopt->sopt_td != NULL) {
2619			int error;
2620
2621			error = copyout(mtod(m, char *), sopt->sopt_val,
2622				       m->m_len);
2623			if (error != 0) {
2624				m_freem(m0);
2625				return (error);
2626			}
2627		} else
2628			bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
2629		sopt->sopt_valsize -= m->m_len;
2630		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2631		valsize += m->m_len;
2632		m = m->m_next;
2633	}
2634	if (m != NULL) {
2635		/* a large enough buffer should have been supplied from userland */
2636		m_freem(m0);
2637		return (EINVAL);
2638	}
2639	sopt->sopt_valsize = valsize;
2640	return (0);
2641}
2642
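/*
 * Editorial example (not part of the original file): the usual pairing of
 * the two copyin-side helpers above, as on the IPv6 option path; a sketch
 * only, with 'opt_to_mbufs' a hypothetical wrapper.
 */
#if 0
static int
opt_to_mbufs(struct sockopt *sopt, struct mbuf **mp)
{
	int error;

	error = soopt_getm(sopt, mp);	/* size a chain to sopt_valsize */
	if (error != 0)
		return (error);
	/* Note: advances sopt->sopt_val past the bytes consumed. */
	error = soopt_mcopyin(sopt, *mp);	/* frees the chain on error */
	return (error);
}
#endif
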
2643/*
2644 * sohasoutofband(): protocol notifies socket layer of the arrival of new
2645 * out-of-band data, which will then notify socket consumers.
2646 */
2647void
2648sohasoutofband(struct socket *so)
2649{
2650
2651	if (so->so_sigio != NULL)
2652		pgsigio(&so->so_sigio, SIGURG, 0);
2653	selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
2654}
2655
2656int
2657sopoll(struct socket *so, int events, struct ucred *active_cred,
2658    struct thread *td)
2659{
2660
2661	return (so->so_proto->pr_usrreqs->pru_sopoll(so, events, active_cred,
2662	    td));
2663}
2664
2665int
2666sopoll_generic(struct socket *so, int events, struct ucred *active_cred,
2667    struct thread *td)
2668{
2669	int revents = 0;
2670
2671	SOCKBUF_LOCK(&so->so_snd);
2672	SOCKBUF_LOCK(&so->so_rcv);
2673	if (events & (POLLIN | POLLRDNORM))
2674		if (soreadable(so))
2675			revents |= events & (POLLIN | POLLRDNORM);
2676
2677	if (events & POLLINIGNEOF)
2678		if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat ||
2679		    !TAILQ_EMPTY(&so->so_comp) || so->so_error)
2680			revents |= POLLINIGNEOF;
2681
2682	if (events & (POLLOUT | POLLWRNORM))
2683		if (sowriteable(so))
2684			revents |= events & (POLLOUT | POLLWRNORM);
2685
2686	if (events & (POLLPRI | POLLRDBAND))
2687		if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK))
2688			revents |= events & (POLLPRI | POLLRDBAND);
2689
2690	if (revents == 0) {
2691		if (events &
2692		    (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM |
2693		     POLLRDBAND)) {
2694			selrecord(td, &so->so_rcv.sb_sel);
2695			so->so_rcv.sb_flags |= SB_SEL;
2696		}
2697
2698		if (events & (POLLOUT | POLLWRNORM)) {
2699			selrecord(td, &so->so_snd.sb_sel);
2700			so->so_snd.sb_flags |= SB_SEL;
2701		}
2702	}
2703
2704	SOCKBUF_UNLOCK(&so->so_rcv);
2705	SOCKBUF_UNLOCK(&so->so_snd);
2706	return (revents);
2707}
2708
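/*
 * Editorial example (not part of the original file): the userspace view
 * of sopoll_generic().  POLLIN tests readability against sb_lowat via
 * soreadable(), while POLLPRI reflects out-of-band state.
 */
#if 0
#include <poll.h>

static int
wait_readable(int fd, int timeout_ms)
{
	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLPRI };
	int n;

	n = poll(&pfd, 1, timeout_ms);
	if (n > 0 && (pfd.revents & POLLPRI))
		return (2);	/* at or before the OOB mark */
	return (n > 0 && (pfd.revents & POLLIN));
}
#endif
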
2709int
2710soo_kqfilter(struct file *fp, struct knote *kn)
2711{
2712	struct socket *so = kn->kn_fp->f_data;
2713	struct sockbuf *sb;
2714
2715	switch (kn->kn_filter) {
2716	case EVFILT_READ:
2717		if (so->so_options & SO_ACCEPTCONN)
2718			kn->kn_fop = &solisten_filtops;
2719		else
2720			kn->kn_fop = &soread_filtops;
2721		sb = &so->so_rcv;
2722		break;
2723	case EVFILT_WRITE:
2724		kn->kn_fop = &sowrite_filtops;
2725		sb = &so->so_snd;
2726		break;
2727	default:
2728		return (EINVAL);
2729	}
2730
2731	SOCKBUF_LOCK(sb);
2732	knlist_add(&sb->sb_sel.si_note, kn, 1);
2733	sb->sb_flags |= SB_KNOTE;
2734	SOCKBUF_UNLOCK(sb);
2735	return (0);
2736}
2737
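/*
 * Editorial example (not part of the original file): registering the read
 * filter installed by soo_kqfilter(), with NOTE_LOWAT so the event fires
 * only once kn_data reaches a caller-chosen threshold (see filt_soread()
 * below).
 */
#if 0
#include <sys/types.h>
#include <sys/event.h>

static int
watch_socket_read(int kq, int fd, int lowat)
{
	struct kevent kev;

	EV_SET(&kev, fd, EVFILT_READ, EV_ADD, NOTE_LOWAT, lowat, NULL);
	return (kevent(kq, &kev, 1, NULL, 0, NULL));
}
#endif
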
2738/*
2739 * Some routines that return EOPNOTSUPP for entry points that are not
2740 * supported by a protocol.  Fill in as needed.
2741 */
2742int
2743pru_accept_notsupp(struct socket *so, struct sockaddr **nam)
2744{
2745
2746	return EOPNOTSUPP;
2747}
2748
2749int
2750pru_attach_notsupp(struct socket *so, int proto, struct thread *td)
2751{
2752
2753	return EOPNOTSUPP;
2754}
2755
2756int
2757pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
2758{
2759
2760	return EOPNOTSUPP;
2761}
2762
2763int
2764pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
2765{
2766
2767	return EOPNOTSUPP;
2768}
2769
2770int
2771pru_connect2_notsupp(struct socket *so1, struct socket *so2)
2772{
2773
2774	return EOPNOTSUPP;
2775}
2776
2777int
2778pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data,
2779    struct ifnet *ifp, struct thread *td)
2780{
2781
2782	return EOPNOTSUPP;
2783}
2784
2785int
2786pru_disconnect_notsupp(struct socket *so)
2787{
2788
2789	return EOPNOTSUPP;
2790}
2791
2792int
2793pru_listen_notsupp(struct socket *so, int backlog, struct thread *td)
2794{
2795
2796	return EOPNOTSUPP;
2797}
2798
2799int
2800pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam)
2801{
2802
2803	return EOPNOTSUPP;
2804}
2805
2806int
2807pru_rcvd_notsupp(struct socket *so, int flags)
2808{
2809
2810	return EOPNOTSUPP;
2811}
2812
2813int
2814pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags)
2815{
2816
2817	return EOPNOTSUPP;
2818}
2819
2820int
2821pru_send_notsupp(struct socket *so, int flags, struct mbuf *m,
2822    struct sockaddr *addr, struct mbuf *control, struct thread *td)
2823{
2824
2825	return EOPNOTSUPP;
2826}
2827
2828/*
2829 * This isn't really a ``null'' operation, but it's the default one and
2830 * doesn't do anything destructive.
2831 */
2832int
2833pru_sense_null(struct socket *so, struct stat *sb)
2834{
2835
2836	sb->st_blksize = so->so_snd.sb_hiwat;
2837	return 0;
2838}
2839
2840int
2841pru_shutdown_notsupp(struct socket *so)
2842{
2843
2844	return EOPNOTSUPP;
2845}
2846
2847int
2848pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam)
2849{
2850
2851	return EOPNOTSUPP;
2852}
2853
2854int
2855pru_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio,
2856    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
2857{
2858
2859	return EOPNOTSUPP;
2860}
2861
2862int
2863pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr,
2864    struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2865{
2866
2867	return EOPNOTSUPP;
2868}
2869
2870int
2871pru_sopoll_notsupp(struct socket *so, int events, struct ucred *cred,
2872    struct thread *td)
2873{
2874
2875	return EOPNOTSUPP;
2876}
2877
2878static void
2879filt_sordetach(struct knote *kn)
2880{
2881	struct socket *so = kn->kn_fp->f_data;
2882
2883	SOCKBUF_LOCK(&so->so_rcv);
2884	knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1);
2885	if (knlist_empty(&so->so_rcv.sb_sel.si_note))
2886		so->so_rcv.sb_flags &= ~SB_KNOTE;
2887	SOCKBUF_UNLOCK(&so->so_rcv);
2888}
2889
2890/*ARGSUSED*/
2891static int
2892filt_soread(struct knote *kn, long hint)
2893{
2894	struct socket *so;
2895
2896	so = kn->kn_fp->f_data;
2897	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2898
2899	kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
2900	if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
2901		kn->kn_flags |= EV_EOF;
2902		kn->kn_fflags = so->so_error;
2903		return (1);
2904	} else if (so->so_error)	/* temporary udp error */
2905		return (1);
2906	else if (kn->kn_sfflags & NOTE_LOWAT)
2907		return (kn->kn_data >= kn->kn_sdata);
2908	else
2909		return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat);
2910}
2911
2912static void
2913filt_sowdetach(struct knote *kn)
2914{
2915	struct socket *so = kn->kn_fp->f_data;
2916
2917	SOCKBUF_LOCK(&so->so_snd);
2918	knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1);
2919	if (knlist_empty(&so->so_snd.sb_sel.si_note))
2920		so->so_snd.sb_flags &= ~SB_KNOTE;
2921	SOCKBUF_UNLOCK(&so->so_snd);
2922}
2923
2924/*ARGSUSED*/
2925static int
2926filt_sowrite(struct knote *kn, long hint)
2927{
2928	struct socket *so;
2929
2930	so = kn->kn_fp->f_data;
2931	SOCKBUF_LOCK_ASSERT(&so->so_snd);
2932	kn->kn_data = sbspace(&so->so_snd);
2933	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
2934		kn->kn_flags |= EV_EOF;
2935		kn->kn_fflags = so->so_error;
2936		return (1);
2937	} else if (so->so_error)	/* temporary udp error */
2938		return (1);
2939	else if (((so->so_state & SS_ISCONNECTED) == 0) &&
2940	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
2941		return (0);
2942	else if (kn->kn_sfflags & NOTE_LOWAT)
2943		return (kn->kn_data >= kn->kn_sdata);
2944	else
2945		return (kn->kn_data >= so->so_snd.sb_lowat);
2946}
2947
2948/*ARGSUSED*/
2949static int
2950filt_solisten(struct knote *kn, long hint)
2951{
2952	struct socket *so = kn->kn_fp->f_data;
2953
2954	kn->kn_data = so->so_qlen;
2955	return (!TAILQ_EMPTY(&so->so_comp));
2956}
2957
2958int
2959socheckuid(struct socket *so, uid_t uid)
2960{
2961
2962	if (so == NULL)
2963		return (EPERM);
2964	if (so->so_cred->cr_uid != uid)
2965		return (EPERM);
2966	return (0);
2967}
2968
2969static int
2970sysctl_somaxconn(SYSCTL_HANDLER_ARGS)
2971{
2972	int error;
2973	int val;
2974
2975	val = somaxconn;
2976	error = sysctl_handle_int(oidp, &val, 0, req);
2977	if (error || !req->newptr)
2978		return (error);
2979
2980	if (val < 1 || val > USHRT_MAX)
2981		return (EINVAL);
2982
2983	somaxconn = val;
2984	return (0);
2985}
2986
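/*
 * Editorial example (not part of the original file): the handler above
 * backs the kern.ipc.somaxconn sysctl, which userspace can adjust through
 * sysctlbyname(3).
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>

static int
set_somaxconn(int val)
{
	/* Rejected with EINVAL unless 1 <= val <= USHRT_MAX. */
	return (sysctlbyname("kern.ipc.somaxconn", NULL, NULL, &val,
	    sizeof(val)));
}
#endif
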
2987/*
2988 * These functions are used by protocols to notify the socket layer (and its
2989 * consumers) of state changes in the sockets driven by protocol-side events.
2990 */
2991
2992/*
2993 * Procedures to manipulate state flags of socket and do appropriate wakeups.
2994 *
2995 * Normal sequence from the active (originating) side is that
2996 * soisconnecting() is called during processing of connect() call, resulting
2997 * in an eventual call to soisconnected() if/when the connection is
2998 * established.  When the connection is torn down soisdisconnecting() is
2999 * called during processing of disconnect() call, and soisdisconnected() is
3000 * called when the connection to the peer is totally severed.  The semantics
3001 * of these routines are such that connectionless protocols can call
3002 * soisconnected() and soisdisconnected() only, bypassing the in-progress
3003 * calls when setting up a ``connection'' takes no time.
3004 *
3005 * From the passive side, a socket is created with two queues of sockets:
3006 * so_incomp for connections in progress and so_comp for connections already
3007 * made and awaiting user acceptance.  As a protocol is preparing incoming
3008 * connections, it creates a socket structure queued on so_incomp by calling
3009 * sonewconn().  When the connection is established, soisconnected() is
3010 * called, and transfers the socket structure to so_comp, making it available
3011 * to accept().
3012 *
3013 * If a socket is closed with sockets on either so_incomp or so_comp, these
3014 * sockets are dropped.
3015 *
3016 * If higher-level protocols are implemented in the kernel, the wakeups done
3017 * here will sometimes cause software-interrupt process scheduling.
3018 */
3019void
3020soisconnecting(struct socket *so)
3021{
3022
3023	SOCK_LOCK(so);
3024	so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
3025	so->so_state |= SS_ISCONNECTING;
3026	SOCK_UNLOCK(so);
3027}
3028
3029void
3030soisconnected(struct socket *so)
3031{
3032	struct socket *head;
3033
3034	ACCEPT_LOCK();
3035	SOCK_LOCK(so);
3036	so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
3037	so->so_state |= SS_ISCONNECTED;
3038	head = so->so_head;
3039	if (head != NULL && (so->so_qstate & SQ_INCOMP)) {
3040		if ((so->so_options & SO_ACCEPTFILTER) == 0) {
3041			SOCK_UNLOCK(so);
3042			TAILQ_REMOVE(&head->so_incomp, so, so_list);
3043			head->so_incqlen--;
3044			so->so_qstate &= ~SQ_INCOMP;
3045			TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
3046			head->so_qlen++;
3047			so->so_qstate |= SQ_COMP;
3048			ACCEPT_UNLOCK();
3049			sorwakeup(head);
3050			wakeup_one(&head->so_timeo);
3051		} else {
3052			ACCEPT_UNLOCK();
3053			so->so_upcall =
3054			    head->so_accf->so_accept_filter->accf_callback;
3055			so->so_upcallarg = head->so_accf->so_accept_filter_arg;
3056			so->so_rcv.sb_flags |= SB_UPCALL;
3057			so->so_options &= ~SO_ACCEPTFILTER;
3058			SOCK_UNLOCK(so);
3059			so->so_upcall(so, so->so_upcallarg, M_DONTWAIT);
3060		}
3061		return;
3062	}
3063	SOCK_UNLOCK(so);
3064	ACCEPT_UNLOCK();
3065	wakeup(&so->so_timeo);
3066	sorwakeup(so);
3067	sowwakeup(so);
3068}
3069
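/*
 * Editorial example (not part of the original file): the passive-side
 * life cycle described above, seen from userspace.  The sockets that
 * soisconnected() moves onto so_comp are exactly what accept() returns.
 */
#if 0
#include <sys/types.h>
#include <sys/socket.h>

static int
accept_next(int lfd)
{
	struct sockaddr_storage ss;
	socklen_t sslen = sizeof(ss);

	/* Blocks until soisconnected() has queued a socket on so_comp. */
	return (accept(lfd, (struct sockaddr *)&ss, &sslen));
}
#endif
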
3070void
3071soisdisconnecting(struct socket *so)
3072{
3073
3074	/*
3075	 * Note: This code assumes that SOCK_LOCK(so) and
3076	 * SOCKBUF_LOCK(&so->so_rcv) are the same.
3077	 */
3078	SOCKBUF_LOCK(&so->so_rcv);
3079	so->so_state &= ~SS_ISCONNECTING;
3080	so->so_state |= SS_ISDISCONNECTING;
3081	so->so_rcv.sb_state |= SBS_CANTRCVMORE;
3082	sorwakeup_locked(so);
3083	SOCKBUF_LOCK(&so->so_snd);
3084	so->so_snd.sb_state |= SBS_CANTSENDMORE;
3085	sowwakeup_locked(so);
3086	wakeup(&so->so_timeo);
3087}
3088
3089void
3090soisdisconnected(struct socket *so)
3091{
3092
3093	/*
3094	 * Note: This code assumes that SOCK_LOCK(so) and
3095	 * SOCKBUF_LOCK(&so->so_rcv) are the same.
3096	 */
3097	SOCKBUF_LOCK(&so->so_rcv);
3098	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
3099	so->so_state |= SS_ISDISCONNECTED;
3100	so->so_rcv.sb_state |= SBS_CANTRCVMORE;
3101	sorwakeup_locked(so);
3102	SOCKBUF_LOCK(&so->so_snd);
3103	so->so_snd.sb_state |= SBS_CANTSENDMORE;
3104	sbdrop_locked(&so->so_snd, so->so_snd.sb_cc);
3105	sowwakeup_locked(so);
3106	wakeup(&so->so_timeo);
3107}
3108
3109/*
3110 * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
3111 */
3112struct sockaddr *
3113sodupsockaddr(const struct sockaddr *sa, int mflags)
3114{
3115	struct sockaddr *sa2;
3116
3117	sa2 = malloc(sa->sa_len, M_SONAME, mflags);
3118	if (sa2)
3119		bcopy(sa, sa2, sa->sa_len);
3120	return sa2;
3121}
3122
3123/*
3124 * Create an external-format (``xsocket'') structure using the information in
3125 * the kernel-format socket structure pointed to by so.  This is done to
3126 * reduce the spew of irrelevant information over this interface, to isolate
3127 * user code from changes in the kernel structure, and potentially to provide
3128 * information-hiding if we decide that some of this information should be
3129 * hidden from users.
3130 */
3131void
3132sotoxsocket(struct socket *so, struct xsocket *xso)
3133{
3134
3135	xso->xso_len = sizeof *xso;
3136	xso->xso_so = so;
3137	xso->so_type = so->so_type;
3138	xso->so_options = so->so_options;
3139	xso->so_linger = so->so_linger;
3140	xso->so_state = so->so_state;
3141	xso->so_pcb = so->so_pcb;
3142	xso->xso_protocol = so->so_proto->pr_protocol;
3143	xso->xso_family = so->so_proto->pr_domain->dom_family;
3144	xso->so_qlen = so->so_qlen;
3145	xso->so_incqlen = so->so_incqlen;
3146	xso->so_qlimit = so->so_qlimit;
3147	xso->so_timeo = so->so_timeo;
3148	xso->so_error = so->so_error;
3149	xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0;
3150	xso->so_oobmark = so->so_oobmark;
3151	sbtoxsockbuf(&so->so_snd, &xso->so_snd);
3152	sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
3153	xso->so_uid = so->so_cred->cr_uid;
3154}
3155
3157/*
3158 * Socket accessor functions to provide external consumers with a safe
3159 * interface to socket state.
3160 */
3162
3163void
3164so_listeners_apply_all(struct socket *so, void (*func)(struct socket *, void *), void *arg)
3165{
3166
3167	TAILQ_FOREACH(so, &so->so_comp, so_list)
3168		func(so, arg);
3169}
3170
3171struct sockbuf *
3172so_sockbuf_rcv(struct socket *so)
3173{
3174
3175	return (&so->so_rcv);
3176}
3177
3178struct sockbuf *
3179so_sockbuf_snd(struct socket *so)
3180{
3181
3182	return (&so->so_snd);
3183}
3184
3185int
3186so_state_get(const struct socket *so)
3187{
3188
3189	return (so->so_state);
3190}
3191
3192void
3193so_state_set(struct socket *so, int val)
3194{
3195
3196	so->so_state = val;
3197}
3198
3199int
3200so_options_get(const struct socket *so)
3201{
3202
3203	return (so->so_options);
3204}
3205
3206void
3207so_options_set(struct socket *so, int val)
3208{
3209
3210	so->so_options = val;
3211}
3212
3213int
3214so_error_get(const struct socket *so)
3215{
3216
3217	return (so->so_error);
3218}
3219
3220void
3221so_error_set(struct socket *so, int val)
3222{
3223
3224	so->so_error = val;
3225}
3226
3227int
3228so_linger_get(const struct socket *so)
3229{
3230
3231	return (so->so_linger);
3232}
3233
3234void
3235so_linger_set(struct socket *so, int val)
3236{
3237
3238	so->so_linger = val;
3239}
3240
3241struct protosw *
3242so_protosw_get(const struct socket *so)
3243{
3244
3245	return (so->so_proto);
3246}
3247
3248void
3249so_protosw_set(struct socket *so, struct protosw *val)
3250{
3251
3252	so->so_proto = val;
3253}
3254
3255void
3256so_sorwakeup(struct socket *so)
3257{
3258
3259	sorwakeup(so);
3260}
3261
3262void
3263so_sowwakeup(struct socket *so)
3264{
3265
3266	sowwakeup(so);
3267}
3268
3269void
3270so_sorwakeup_locked(struct socket *so)
3271{
3272
3273	sorwakeup_locked(so);
3274}
3275
3276void
3277so_sowwakeup_locked(struct socket *so)
3278{
3279
3280	sowwakeup_locked(so);
3281}
3282
3283void
3284so_lock(struct socket *so)
3285{
3286	SOCK_LOCK(so);
3287}
3288
3289void
3290so_unlock(struct socket *so)
3291{
3292	SOCK_UNLOCK(so);
3293}
3294