uipc_socket.c revision 185892
1/*-
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993
3 *	The Regents of the University of California.
4 * Copyright (c) 2004 The FreeBSD Foundation
5 * Copyright (c) 2004-2008 Robert N. M. Watson
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 * 4. Neither the name of the University nor the names of its contributors
17 *    may be used to endorse or promote products derived from this software
18 *    without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 *
32 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
33 */
34
35/*
36 * Comments on the socket life cycle:
37 *
38 * soalloc() sets up socket layer state for a socket, called only by
39 * socreate() and sonewconn().  Socket layer private.
40 *
41 * sodealloc() tears down socket layer state for a socket, called only from
42 * sofree() and the attach failure paths of socreate() and sonewconn().  Socket layer private.
43 *
44 * pru_attach() associates protocol layer state with an allocated socket;
45 * called only once, may fail, aborting socket allocation.  This is called
46 * from socreate() and sonewconn().  Socket layer private.
47 *
48 * pru_detach() disassociates protocol layer state from an attached socket,
49 * and will be called exactly once for sockets on which pru_attach() has
50 * been successfully called.  If pru_attach() returned an error,
51 * pru_detach() will not be called.  Socket layer private.
52 *
53 * pru_abort() and pru_close() notify the protocol layer that the last
54 * consumer of a socket is starting to tear down the socket, and that the
55 * protocol should terminate the connection.  Historically, pru_abort() also
56 * detached protocol state from the socket state, but this is no longer the
57 * case.
58 *
59 * socreate() creates a socket and attaches protocol state.  This is a public
60 * interface that may be used by socket layer consumers to create new
61 * sockets.
62 *
63 * sonewconn() creates a socket and attaches protocol state.  This is a
64 * public interface that may be used by protocols to create new sockets when
65 * a new connection is received and will be available for accept() on a
66 * listen socket.
67 *
68 * soclose() destroys a socket after possibly waiting for it to disconnect.
69 * This is a public interface that socket consumers should use to close and
70 * release a socket when done with it.
71 *
72 * soabort() destroys a socket without waiting for it to disconnect (used
73 * only for incoming connections that are already partially or fully
74 * connected).  This is used internally by the socket layer when clearing
75 * listen socket queues (due to overflow or close on the listen socket), but
76 * is also a public interface protocols may use to abort connections in
77 * their incomplete listen queues should they no longer be required.  Sockets
78 * placed in completed connection listen queues should not be aborted for
79 * reasons described in the comment above the soclose() implementation.  This
80 * is not a general purpose close routine, and except in the specific
81 * circumstances described here, should not be used.
82 *
83 * sofree() will free a socket and its protocol state if all references on
84 * the socket have been released, and is called to attempt to free a socket
85 * when a reference is removed.  This is a socket layer private
86 * interface.
87 *
88 * NOTE: In addition to socreate() and soclose(), which provide a single
89 * socket reference to the consumer to be managed as required, there are two
90 * calls to explicitly manage socket references: soref() and sorele().
91 * Currently, these are generally required only when transitioning a socket
92 * from a listen queue to a file descriptor, in order to prevent garbage
93 * collection of the socket at an untimely moment.  For a number of reasons,
94 * these interfaces are not preferred, and should be avoided.
95 */
96
97#include <sys/cdefs.h>
98__FBSDID("$FreeBSD: head/sys/kern/uipc_socket.c 185892 2008-12-10 22:10:37Z bz $");
99
100#include "opt_inet.h"
101#include "opt_inet6.h"
102#include "opt_mac.h"
103#include "opt_zero.h"
104#include "opt_compat.h"
105
106#include <sys/param.h>
107#include <sys/systm.h>
108#include <sys/fcntl.h>
109#include <sys/limits.h>
110#include <sys/lock.h>
111#include <sys/mac.h>
112#include <sys/malloc.h>
113#include <sys/mbuf.h>
114#include <sys/mutex.h>
115#include <sys/domain.h>
116#include <sys/file.h>			/* for struct knote */
117#include <sys/kernel.h>
118#include <sys/event.h>
119#include <sys/eventhandler.h>
120#include <sys/poll.h>
121#include <sys/proc.h>
122#include <sys/protosw.h>
123#include <sys/socket.h>
124#include <sys/socketvar.h>
125#include <sys/resourcevar.h>
126#include <net/route.h>
127#include <sys/signalvar.h>
128#include <sys/stat.h>
129#include <sys/sx.h>
130#include <sys/sysctl.h>
131#include <sys/uio.h>
132#include <sys/jail.h>
133
134#include <security/mac/mac_framework.h>
135
136#include <vm/uma.h>
137
138#ifdef COMPAT_IA32
139#include <sys/mount.h>
140#include <sys/sysent.h>
141#include <compat/freebsd32/freebsd32.h>
142#endif
143
144static int	soreceive_rcvoob(struct socket *so, struct uio *uio,
145		    int flags);
146
147static void	filt_sordetach(struct knote *kn);
148static int	filt_soread(struct knote *kn, long hint);
149static void	filt_sowdetach(struct knote *kn);
150static int	filt_sowrite(struct knote *kn, long hint);
151static int	filt_solisten(struct knote *kn, long hint);
152
153static struct filterops solisten_filtops =
154	{ 1, NULL, filt_sordetach, filt_solisten };
155static struct filterops soread_filtops =
156	{ 1, NULL, filt_sordetach, filt_soread };
157static struct filterops sowrite_filtops =
158	{ 1, NULL, filt_sowdetach, filt_sowrite };
159
160uma_zone_t socket_zone;
161so_gen_t	so_gencnt;	/* generation count for sockets */
162
163int	maxsockets;
164
165MALLOC_DEFINE(M_SONAME, "soname", "socket name");
166MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
167
168static int somaxconn = SOMAXCONN;
169static int sysctl_somaxconn(SYSCTL_HANDLER_ARGS);
170/* XXX: we don't have SYSCTL_USHORT */
171SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLTYPE_UINT | CTLFLAG_RW,
172    0, sizeof(int), sysctl_somaxconn, "I", "Maximum pending socket connection "
173    "queue size");
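/*
 * Editorial note (added): somaxconn caps the backlog argument accepted by
 * listen(2); solisten_proto() below clamps negative or larger values to it.
 * It can be raised at runtime, e.g.:
 *
 *	# sysctl kern.ipc.somaxconn=1024
 */
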
174static int numopensockets;
175SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
176    &numopensockets, 0, "Number of open sockets");
177#ifdef ZERO_COPY_SOCKETS
178/* These aren't static because they're used in other files. */
179int so_zero_copy_send = 1;
180int so_zero_copy_receive = 1;
181SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0,
182    "Zero copy controls");
183SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW,
184    &so_zero_copy_receive, 0, "Enable zero copy receive");
185SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW,
186    &so_zero_copy_send, 0, "Enable zero copy send");
187#endif /* ZERO_COPY_SOCKETS */
188
189/*
190 * accept_mtx locks down per-socket fields relating to accept queues.  See
191 * socketvar.h for an annotation of the protected fields of struct socket.
192 */
193struct mtx accept_mtx;
194MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);
195
196/*
197 * so_global_mtx protects the global so_gencnt counter, numopensockets, and
198 * the per-socket so_gencnt field.
199 */
200static struct mtx so_global_mtx;
201MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF);
202
203/*
204 * General IPC sysctl name space, used by sockets and a variety of other IPC
205 * types.
206 */
207SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");
208
209/*
210 * Sysctl to get and set the maximum global sockets limit.  Notify protocols
211 * of the change so that they can update their dependent limits as required.
212 */
213static int
214sysctl_maxsockets(SYSCTL_HANDLER_ARGS)
215{
216	int error, newmaxsockets;
217
218	newmaxsockets = maxsockets;
219	error = sysctl_handle_int(oidp, &newmaxsockets, 0, req);
220	if (error == 0 && req->newptr) {
221		if (newmaxsockets > maxsockets) {
222			maxsockets = newmaxsockets;
223			if (maxsockets > ((maxfiles / 4) * 3)) {
224				maxfiles = (maxsockets * 5) / 4;
225				maxfilesperproc = (maxfiles * 9) / 10;
226			}
227			EVENTHANDLER_INVOKE(maxsockets_change);
228		} else
229			error = EINVAL;
230	}
231	return (error);
232}
233
234SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT|CTLFLAG_RW,
235    &maxsockets, 0, sysctl_maxsockets, "IU",
236    "Maximum number of sockets available");
237
238/*
239 * Initialise maxsockets.
240 */
241static void
242init_maxsockets(void *ignored)
243{
244
245	TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
246	maxsockets = imax(maxsockets, imax(maxfiles, nmbclusters));
247}
248SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);
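
/*
 * Editorial note (added): because maxsockets is fetched as a tunable, it can
 * be set at boot via loader.conf and grown (never shrunk) at runtime through
 * the read-write sysctl above, e.g.:
 *
 *	kern.ipc.maxsockets="131072"		(in /boot/loader.conf)
 *	# sysctl kern.ipc.maxsockets=262144
 */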
249
250/*
251 * Socket operation routines.  These routines are called by the routines in
252 * sys_socket.c or from a system process, and implement the semantics of
253 * socket operations by switching out to the protocol specific routines.
254 */
255
256/*
257 * Get a socket structure from our zone, and initialize it.  Note that it
258 * would probably be better to allocate socket and PCB at the same time, but
259 * I'm not convinced that all the protocols can be easily modified to do
260 * this.
261 *
262 * soalloc() returns a socket with a ref count of 0.
263 */
264static struct socket *
265soalloc(void)
266{
267	struct socket *so;
268
269	so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO);
270	if (so == NULL)
271		return (NULL);
272#ifdef MAC
273	if (mac_socket_init(so, M_NOWAIT) != 0) {
274		uma_zfree(socket_zone, so);
275		return (NULL);
276	}
277#endif
278	SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
279	SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
280	sx_init(&so->so_snd.sb_sx, "so_snd_sx");
281	sx_init(&so->so_rcv.sb_sx, "so_rcv_sx");
282	TAILQ_INIT(&so->so_aiojobq);
283	mtx_lock(&so_global_mtx);
284	so->so_gencnt = ++so_gencnt;
285	++numopensockets;
286	mtx_unlock(&so_global_mtx);
287	return (so);
288}
289
290/*
291 * Free the storage associated with a socket at the socket layer, tear down
292 * locks, labels, etc.  All protocol state is assumed already to have been
293 * torn down (and possibly never set up) by the caller.
294 */
295static void
296sodealloc(struct socket *so)
297{
298
299	KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
300	KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL"));
301
302	mtx_lock(&so_global_mtx);
303	so->so_gencnt = ++so_gencnt;
304	--numopensockets;	/* Could be below, but faster here. */
305	mtx_unlock(&so_global_mtx);
306	if (so->so_rcv.sb_hiwat)
307		(void)chgsbsize(so->so_cred->cr_uidinfo,
308		    &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
309	if (so->so_snd.sb_hiwat)
310		(void)chgsbsize(so->so_cred->cr_uidinfo,
311		    &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
312#ifdef INET
313	/* Remove accept filter if one is present. */
314	if (so->so_accf != NULL)
315		do_setopt_accept_filter(so, NULL);
316#endif
317#ifdef MAC
318	mac_socket_destroy(so);
319#endif
320	crfree(so->so_cred);
321	sx_destroy(&so->so_snd.sb_sx);
322	sx_destroy(&so->so_rcv.sb_sx);
323	SOCKBUF_LOCK_DESTROY(&so->so_snd);
324	SOCKBUF_LOCK_DESTROY(&so->so_rcv);
325	uma_zfree(socket_zone, so);
326}
327
328/*
329 * socreate returns a socket with a ref count of 1.  The socket should be
330 * closed with soclose().
331 */
332int
333socreate(int dom, struct socket **aso, int type, int proto,
334    struct ucred *cred, struct thread *td)
335{
336	struct protosw *prp;
337	struct socket *so;
338	int error;
339
340	if (proto)
341		prp = pffindproto(dom, proto, type);
342	else
343		prp = pffindtype(dom, type);
344
345	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL ||
346	    prp->pr_usrreqs->pru_attach == pru_attach_notsupp)
347		return (EPROTONOSUPPORT);
348
349	if (jailed(cred) && jail_socket_unixiproute_only &&
350	    prp->pr_domain->dom_family != PF_LOCAL &&
351	    prp->pr_domain->dom_family != PF_INET &&
352#ifdef INET6
353	    prp->pr_domain->dom_family != PF_INET6 &&
354#endif
355	    prp->pr_domain->dom_family != PF_ROUTE) {
356		return (EPROTONOSUPPORT);
357	}
358
359	if (prp->pr_type != type)
360		return (EPROTOTYPE);
361	so = soalloc();
362	if (so == NULL)
363		return (ENOBUFS);
364
365	TAILQ_INIT(&so->so_incomp);
366	TAILQ_INIT(&so->so_comp);
367	so->so_type = type;
368	so->so_cred = crhold(cred);
369	if ((prp->pr_domain->dom_family == PF_INET) ||
370	    (prp->pr_domain->dom_family == PF_ROUTE))
371		so->so_fibnum = td->td_proc->p_fibnum;
372	else
373		so->so_fibnum = 0;
374	so->so_proto = prp;
375#ifdef MAC
376	mac_socket_create(cred, so);
377#endif
378	knlist_init(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv),
379	    NULL, NULL, NULL);
380	knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd),
381	    NULL, NULL, NULL);
382	so->so_count = 1;
383	/*
384	 * Auto-sizing of socket buffers is managed by the protocols and
385	 * the appropriate flags must be set in the pru_attach function.
386	 */
387	error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
388	if (error) {
389		KASSERT(so->so_count == 1, ("socreate: so_count %d",
390		    so->so_count));
391		so->so_count = 0;
392		sodealloc(so);
393		return (error);
394	}
395	*aso = so;
396	return (0);
397}
398
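/*
 * Editorial sketch (added; not part of the original file): a minimal
 * in-kernel consumer of socreate()/soclose(), showing the single reference
 * returned by socreate().  Hypothetical helper, guarded out of the build;
 * error handling abbreviated.
 */
#if 0
static int
example_socket_lifecycle(struct thread *td)
{
	struct socket *so;
	int error;

	/* socreate() hands back a socket with so_count == 1. */
	error = socreate(PF_INET, &so, SOCK_DGRAM, IPPROTO_UDP,
	    td->td_ucred, td);
	if (error != 0)
		return (error);

	/* ... sobind(), sosend(), soreceive() as required ... */

	/* soclose() drops that reference; sofree() runs when it hits 0. */
	return (soclose(so));
}
#endif
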
399#ifdef REGRESSION
400static int regression_sonewconn_earlytest = 1;
401SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW,
402    &regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test");
403#endif
404
405/*
406 * When an attempt at a new connection is noted on a socket which accepts
407 * connections, sonewconn is called.  If the connection is possible (subject
408 * to space constraints, etc.) then we allocate a new structure, properly
409 * linked into the data structure of the original socket, and return this.
410 * Connstatus may be 0, SS_ISCONFIRMING, or SS_ISCONNECTED.
411 *
412 * Note: the ref count on the socket is 0 on return.
413 */
414struct socket *
415sonewconn(struct socket *head, int connstatus)
416{
417	struct socket *so;
418	int over;
419
420	ACCEPT_LOCK();
421	over = (head->so_qlen > 3 * head->so_qlimit / 2);
422	ACCEPT_UNLOCK();
423#ifdef REGRESSION
424	if (regression_sonewconn_earlytest && over)
425#else
426	if (over)
427#endif
428		return (NULL);
429	so = soalloc();
430	if (so == NULL)
431		return (NULL);
432	if ((head->so_options & SO_ACCEPTFILTER) != 0)
433		connstatus = 0;
434	so->so_head = head;
435	so->so_type = head->so_type;
436	so->so_options = head->so_options &~ SO_ACCEPTCONN;
437	so->so_linger = head->so_linger;
438	so->so_state = head->so_state | SS_NOFDREF;
439	so->so_proto = head->so_proto;
440	so->so_cred = crhold(head->so_cred);
441#ifdef MAC
442	SOCK_LOCK(head);
443	mac_socket_newconn(head, so);
444	SOCK_UNLOCK(head);
445#endif
446	knlist_init(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv),
447	    NULL, NULL, NULL);
448	knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd),
449	    NULL, NULL, NULL);
450	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat) ||
451	    (*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
452		sodealloc(so);
453		return (NULL);
454	}
455	so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
456	so->so_snd.sb_lowat = head->so_snd.sb_lowat;
457	so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
458	so->so_snd.sb_timeo = head->so_snd.sb_timeo;
459	so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
460	so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
461	so->so_state |= connstatus;
462	ACCEPT_LOCK();
463	if (connstatus) {
464		TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
465		so->so_qstate |= SQ_COMP;
466		head->so_qlen++;
467	} else {
468		/*
469		 * Keep removing sockets from the head until there's room for
470		 * us to insert on the tail.  In pre-locking revisions, this
471		 * was a simple if(), but as we could be racing with other
472		 * threads and soabort() requires dropping locks, we must
473		 * loop waiting for the condition to be true.
474		 */
475		while (head->so_incqlen > head->so_qlimit) {
476			struct socket *sp;
477			sp = TAILQ_FIRST(&head->so_incomp);
478			TAILQ_REMOVE(&head->so_incomp, sp, so_list);
479			head->so_incqlen--;
480			sp->so_qstate &= ~SQ_INCOMP;
481			sp->so_head = NULL;
482			ACCEPT_UNLOCK();
483			soabort(sp);
484			ACCEPT_LOCK();
485		}
486		TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
487		so->so_qstate |= SQ_INCOMP;
488		head->so_incqlen++;
489	}
490	ACCEPT_UNLOCK();
491	if (connstatus) {
492		sorwakeup(head);
493		wakeup_one(&head->so_timeo);
494	}
495	return (so);
496}
497
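/*
 * Editorial sketch (added): how a protocol typically consumes sonewconn()
 * when a connection request arrives on a listening socket; modeled loosely
 * on the UNIX-domain and TCP patterns, with "head" the listening socket.
 */
#if 0
	so = sonewconn(head, 0);
	if (so == NULL)
		goto drop;		/* listen queue overflow */
	/* ... attach and initialize protocol state for so ... */
	soisconnected(so);		/* moves so to the completed queue */
#endif
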
498int
499sobind(struct socket *so, struct sockaddr *nam, struct thread *td)
500{
501
502	return ((*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td));
503}
504
505/*
506 * solisten() transitions a socket from a non-listening state to a listening
507 * state, but can also be used to update the listen queue depth on an
508 * existing listen socket.  The protocol will call back into the sockets
509 * layer using solisten_proto_check() and solisten_proto() to check and set
510 * socket-layer listen state.  Call backs are used so that the protocol can
511 * acquire both protocol and socket layer locks in whatever order is required
512 * by the protocol.
513 *
514 * Protocol implementors are advised to hold the socket lock across the
515 * socket-layer test and set to avoid races at the socket layer.
516 */
517int
518solisten(struct socket *so, int backlog, struct thread *td)
519{
520
521	return ((*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td));
522}
523
524int
525solisten_proto_check(struct socket *so)
526{
527
528	SOCK_LOCK_ASSERT(so);
529
530	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
531	    SS_ISDISCONNECTING))
532		return (EINVAL);
533	return (0);
534}
535
536void
537solisten_proto(struct socket *so, int backlog)
538{
539
540	SOCK_LOCK_ASSERT(so);
541
542	if (backlog < 0 || backlog > somaxconn)
543		backlog = somaxconn;
544	so->so_qlimit = backlog;
545	so->so_options |= SO_ACCEPTCONN;
546}
547
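/*
 * Editorial sketch (added): the call-back pattern described above, as a
 * protocol's pru_listen method might use it.  Hypothetical example; real
 * protocols also take their own pcb lock around this sequence.
 */
#if 0
static int
example_pru_listen(struct socket *so, int backlog, struct thread *td)
{
	int error;

	SOCK_LOCK(so);
	error = solisten_proto_check(so);
	if (error == 0)
		solisten_proto(so, backlog);
	SOCK_UNLOCK(so);
	return (error);
}
#endif
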
548/*
549 * Attempt to free a socket.  This should really be sotryfree().
550 *
551 * sofree() will succeed if:
552 *
553 * - There are no outstanding file descriptor references or related consumers
554 *   (so_count == 0).
555 *
556 * - The socket has been closed by user space, if ever open (SS_NOFDREF).
557 *
558 * - The protocol does not have an outstanding strong reference on the socket
559 *   (SS_PROTOREF).
560 *
561 * - The socket is not in a completed connection queue, where a process may
562 *   have been notified that it is present.  If it were removed, the user
563 *   process could block in accept() despite select() saying it was ready.
564 *
565 * Otherwise, it will quietly abort so that a future call to sofree(), when
566 * conditions are right, can succeed.
567 */
568void
569sofree(struct socket *so)
570{
571	struct protosw *pr = so->so_proto;
572	struct socket *head;
573
574	ACCEPT_LOCK_ASSERT();
575	SOCK_LOCK_ASSERT(so);
576
577	if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 ||
578	    (so->so_state & SS_PROTOREF) || (so->so_qstate & SQ_COMP)) {
579		SOCK_UNLOCK(so);
580		ACCEPT_UNLOCK();
581		return;
582	}
583
584	head = so->so_head;
585	if (head != NULL) {
586		KASSERT((so->so_qstate & SQ_COMP) != 0 ||
587		    (so->so_qstate & SQ_INCOMP) != 0,
588		    ("sofree: so_head != NULL, but neither SQ_COMP nor "
589		    "SQ_INCOMP"));
590		KASSERT((so->so_qstate & SQ_COMP) == 0 ||
591		    (so->so_qstate & SQ_INCOMP) == 0,
592		    ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
593		TAILQ_REMOVE(&head->so_incomp, so, so_list);
594		head->so_incqlen--;
595		so->so_qstate &= ~SQ_INCOMP;
596		so->so_head = NULL;
597	}
598	KASSERT((so->so_qstate & SQ_COMP) == 0 &&
599	    (so->so_qstate & SQ_INCOMP) == 0,
600	    ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
601	    so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
602	if (so->so_options & SO_ACCEPTCONN) {
603		KASSERT((TAILQ_EMPTY(&so->so_comp)), ("sofree: so_comp populated"));
604		KASSERT((TAILQ_EMPTY(&so->so_incomp)), ("sofree: so_incomp populated"));
605	}
606	SOCK_UNLOCK(so);
607	ACCEPT_UNLOCK();
608
609	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
610		(*pr->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
611	if (pr->pr_usrreqs->pru_detach != NULL)
612		(*pr->pr_usrreqs->pru_detach)(so);
613
614	/*
615	 * From this point on, we assume that no other references to this
616	 * socket exist anywhere else in the stack.  Therefore, no locks need
617	 * to be acquired or held.
618	 *
619	 * We used to do a lot of socket buffer and socket locking here, as
620	 * well as invoke sorflush() and perform wakeups.  The direct calls to
621	 * dom_dispose() and sbrelease_internal() are an inlining of what was
622	 * necessary from sorflush().
623	 *
624	 * Notice that the socket buffer and kqueue state are torn down
625	 * before calling pru_detach.  This means that protocols should not
626	 * assume they can perform socket wakeups, etc, in their detach code.
627	 */
628	sbdestroy(&so->so_snd, so);
629	sbdestroy(&so->so_rcv, so);
630	knlist_destroy(&so->so_rcv.sb_sel.si_note);
631	knlist_destroy(&so->so_snd.sb_sel.si_note);
632	sodealloc(so);
633}
634
635/*
636 * Close a socket on last file table reference removal.  Initiate disconnect
637 * if connected.  Free socket when disconnect complete.
638 *
639 * This function will sorele() the socket.  Note that soclose() may be called
640 * prior to the ref count reaching zero.  The actual socket structure will
641 * not be freed until the ref count reaches zero.
642 */
643int
644soclose(struct socket *so)
645{
646	int error = 0;
647
648	KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));
649
650	funsetown(&so->so_sigio);
651	if (so->so_state & SS_ISCONNECTED) {
652		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
653			error = sodisconnect(so);
654			if (error)
655				goto drop;
656		}
657		if (so->so_options & SO_LINGER) {
658			if ((so->so_state & SS_ISDISCONNECTING) &&
659			    (so->so_state & SS_NBIO))
660				goto drop;
661			while (so->so_state & SS_ISCONNECTED) {
662				error = tsleep(&so->so_timeo,
663				    PSOCK | PCATCH, "soclos", so->so_linger * hz);
664				if (error)
665					break;
666			}
667		}
668	}
669
670drop:
671	if (so->so_proto->pr_usrreqs->pru_close != NULL)
672		(*so->so_proto->pr_usrreqs->pru_close)(so);
673	if (so->so_options & SO_ACCEPTCONN) {
674		struct socket *sp;
675		ACCEPT_LOCK();
676		while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
677			TAILQ_REMOVE(&so->so_incomp, sp, so_list);
678			so->so_incqlen--;
679			sp->so_qstate &= ~SQ_INCOMP;
680			sp->so_head = NULL;
681			ACCEPT_UNLOCK();
682			soabort(sp);
683			ACCEPT_LOCK();
684		}
685		while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
686			TAILQ_REMOVE(&so->so_comp, sp, so_list);
687			so->so_qlen--;
688			sp->so_qstate &= ~SQ_COMP;
689			sp->so_head = NULL;
690			ACCEPT_UNLOCK();
691			soabort(sp);
692			ACCEPT_LOCK();
693		}
694		ACCEPT_UNLOCK();
695	}
696	ACCEPT_LOCK();
697	SOCK_LOCK(so);
698	KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
699	so->so_state |= SS_NOFDREF;
700	sorele(so);
701	return (error);
702}
703
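/*
 * Editorial note (added): the SO_LINGER branch above is what user space
 * arms with, e.g.:
 *
 *	struct linger l = { .l_onoff = 1, .l_linger = 5 };
 *	setsockopt(s, SOL_SOCKET, SO_LINGER, &l, sizeof(l));
 *	close(s);	- may sleep up to 5 seconds in soclose()
 *
 * On a non-blocking socket (SS_NBIO) that is still disconnecting, the
 * linger sleep is skipped and the close proceeds immediately.
 */
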
704/*
705 * soabort() is used to abruptly tear down a connection, such as when a
706 * resource limit is reached (listen queue depth exceeded), or if a listen
707 * socket is closed while there are sockets waiting to be accepted.
708 *
709 * This interface is tricky, because it is called on an unreferenced socket,
710 * and must be called only by a thread that has actually removed the socket
711 * from the listen queue it was on, or races with other threads are risked.
712 *
713 * This interface will call into the protocol code, so must not be called
714 * with any socket locks held.  Protocols do call it while holding their own
715 * recursible protocol mutexes, but this is something that should be subject
716 * to review in the future.
717 */
718void
719soabort(struct socket *so)
720{
721
722	/*
723	 * As far as possible, assert that no references to this
724	 * socket are held.  This is not quite the same as asserting that the
725	 * current thread is responsible for arranging for no references, but
726	 * is as close as we can get for now.
727	 */
728	KASSERT(so->so_count == 0, ("soabort: so_count"));
729	KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF"));
730	KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF"));
731	KASSERT((so->so_qstate & SQ_COMP) == 0, ("soabort: SQ_COMP"));
732	KASSERT((so->so_qstate & SQ_INCOMP) == 0, ("soabort: SQ_INCOMP"));
733
734	if (so->so_proto->pr_usrreqs->pru_abort != NULL)
735		(*so->so_proto->pr_usrreqs->pru_abort)(so);
736	ACCEPT_LOCK();
737	SOCK_LOCK(so);
738	sofree(so);
739}
740
741int
742soaccept(struct socket *so, struct sockaddr **nam)
743{
744	int error;
745
746	SOCK_LOCK(so);
747	KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
748	so->so_state &= ~SS_NOFDREF;
749	SOCK_UNLOCK(so);
750	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
751	return (error);
752}
753
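/*
 * Editorial sketch (added): the accept(2) dequeue sequence that soaccept()
 * completes, condensed from kern_accept() with error paths omitted.  It
 * shows the soref()/sorele() reference management noted in the life cycle
 * comment at the top of this file.
 */
#if 0
	ACCEPT_LOCK();
	so = TAILQ_FIRST(&head->so_comp);
	SOCK_LOCK(so);
	soref(so);			/* reference for the new fd */
	TAILQ_REMOVE(&head->so_comp, so, so_list);
	head->so_qlen--;
	so->so_qstate &= ~SQ_COMP;
	so->so_head = NULL;
	SOCK_UNLOCK(so);
	ACCEPT_UNLOCK();
	/* ... install so in the fd table, then call soaccept() ... */
#endif
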
754int
755soconnect(struct socket *so, struct sockaddr *nam, struct thread *td)
756{
757	int error;
758
759	if (so->so_options & SO_ACCEPTCONN)
760		return (EOPNOTSUPP);
761	/*
762	 * If protocol is connection-based, can only connect once.
763	 * Otherwise, if connected, try to disconnect first.  This allows
764	 * user to disconnect by connecting to, e.g., a null address.
765	 */
766	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
767	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
768	    (error = sodisconnect(so)))) {
769		error = EISCONN;
770	} else {
771		/*
772		 * Prevent accumulated error from previous connection from
773		 * biting us.
774		 */
775		so->so_error = 0;
776		error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td);
777	}
778
779	return (error);
780}
781
782int
783soconnect2(struct socket *so1, struct socket *so2)
784{
785
786	return ((*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2));
787}
788
789int
790sodisconnect(struct socket *so)
791{
792	int error;
793
794	if ((so->so_state & SS_ISCONNECTED) == 0)
795		return (ENOTCONN);
796	if (so->so_state & SS_ISDISCONNECTING)
797		return (EALREADY);
798	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
799	return (error);
800}
801
802#ifdef ZERO_COPY_SOCKETS
803struct so_zerocopy_stats {
804	int size_ok;
805	int align_ok;
806	int found_ifp;
807};
808struct so_zerocopy_stats so_zerocp_stats = {0,0,0};
809#include <netinet/in.h>
810#include <net/route.h>
811#include <netinet/in_pcb.h>
812#include <vm/vm.h>
813#include <vm/vm_page.h>
814#include <vm/vm_object.h>
815
816/*
817 * sosend_copyin() is only used if zero copy sockets are enabled.  Otherwise
818 * sosend_dgram() and sosend_generic() use m_uiotombuf().
819 *
820 * sosend_copyin() accepts a uio and prepares an mbuf chain holding part or
821 * all of the data referenced by the uio.  If desired, it uses zero-copy.
822 * *space will be updated to reflect data copied in.
823 *
824 * NB: If atomic I/O is requested, the caller must already have checked that
825 * space can hold resid bytes.
826 *
827 * NB: In the event of an error, the caller may need to free the partial
828 * chain pointed to by *retmp.  The contents of both *uio and *space may be
829 * modified even in the case of an error.
830 */
831static int
832sosend_copyin(struct uio *uio, struct mbuf **retmp, int atomic, long *space,
833    int flags)
834{
835	struct mbuf *m, **mp, *top;
836	long len, resid;
837	int error;
838#ifdef ZERO_COPY_SOCKETS
839	int cow_send;
840#endif
841
842	*retmp = top = NULL;
843	mp = &top;
844	len = 0;
845	resid = uio->uio_resid;
846	error = 0;
847	do {
848#ifdef ZERO_COPY_SOCKETS
849		cow_send = 0;
850#endif /* ZERO_COPY_SOCKETS */
851		if (resid >= MINCLSIZE) {
852#ifdef ZERO_COPY_SOCKETS
853			if (top == NULL) {
854				m = m_gethdr(M_WAITOK, MT_DATA);
855				m->m_pkthdr.len = 0;
856				m->m_pkthdr.rcvif = NULL;
857			} else
858				m = m_get(M_WAITOK, MT_DATA);
859			if (so_zero_copy_send &&
860			    resid>=PAGE_SIZE &&
861			    *space>=PAGE_SIZE &&
862			    uio->uio_iov->iov_len>=PAGE_SIZE) {
863				so_zerocp_stats.size_ok++;
864				so_zerocp_stats.align_ok++;
865				cow_send = socow_setup(m, uio);
866				len = cow_send;
867			}
868			if (!cow_send) {
869				m_clget(m, M_WAITOK);
870				len = min(min(MCLBYTES, resid), *space);
871			}
872#else /* ZERO_COPY_SOCKETS */
873			if (top == NULL) {
874				m = m_getcl(M_WAIT, MT_DATA, M_PKTHDR);
875				m->m_pkthdr.len = 0;
876				m->m_pkthdr.rcvif = NULL;
877			} else
878				m = m_getcl(M_WAIT, MT_DATA, 0);
879			len = min(min(MCLBYTES, resid), *space);
880#endif /* ZERO_COPY_SOCKETS */
881		} else {
882			if (top == NULL) {
883				m = m_gethdr(M_WAIT, MT_DATA);
884				m->m_pkthdr.len = 0;
885				m->m_pkthdr.rcvif = NULL;
886
887				len = min(min(MHLEN, resid), *space);
888				/*
889				 * For datagram protocols, leave room
890				 * for protocol headers in first mbuf.
891				 */
892				if (atomic && m && len < MHLEN)
893					MH_ALIGN(m, len);
894			} else {
895				m = m_get(M_WAIT, MT_DATA);
896				len = min(min(MLEN, resid), *space);
897			}
898		}
899		if (m == NULL) {
900			error = ENOBUFS;
901			goto out;
902		}
903
904		*space -= len;
905#ifdef ZERO_COPY_SOCKETS
906		if (cow_send)
907			error = 0;
908		else
909#endif /* ZERO_COPY_SOCKETS */
910		error = uiomove(mtod(m, void *), (int)len, uio);
911		resid = uio->uio_resid;
912		m->m_len = len;
913		*mp = m;
914		top->m_pkthdr.len += len;
915		if (error)
916			goto out;
917		mp = &m->m_next;
918		if (resid <= 0) {
919			if (flags & MSG_EOR)
920				top->m_flags |= M_EOR;
921			break;
922		}
923	} while (*space > 0 && atomic);
924out:
925	*retmp = top;
926	return (error);
927}
928#endif /*ZERO_COPY_SOCKETS*/
929
930#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
931
932int
933sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio,
934    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
935{
936	long space, resid;
937	int clen = 0, error, dontroute;
938#ifdef ZERO_COPY_SOCKETS
939	int atomic = sosendallatonce(so) || top;
940#endif
941
942	KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM"));
943	KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
944	    ("sosend_dgram: !PR_ATOMIC"));
945
946	if (uio != NULL)
947		resid = uio->uio_resid;
948	else
949		resid = top->m_pkthdr.len;
950	/*
951	 * In theory resid should be unsigned.  However, space must be
952	 * signed, as it might be less than 0 if we over-committed, and we
953	 * must use a signed comparison of space and resid.  On the other
954	 * hand, a negative resid causes us to loop sending 0-length
955	 * segments to the protocol.
956	 *
957	 * The SOCK_STREAM/MSG_EOR check done in sosend_generic() is not
958	 * needed here: this routine asserts SOCK_DGRAM above.
959	 */
960	if (resid < 0) {
961		error = EINVAL;
962		goto out;
963	}
964
965	dontroute =
966	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0;
967	if (td != NULL)
968		td->td_ru.ru_msgsnd++;
969	if (control != NULL)
970		clen = control->m_len;
971
972	SOCKBUF_LOCK(&so->so_snd);
973	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
974		SOCKBUF_UNLOCK(&so->so_snd);
975		error = EPIPE;
976		goto out;
977	}
978	if (so->so_error) {
979		error = so->so_error;
980		so->so_error = 0;
981		SOCKBUF_UNLOCK(&so->so_snd);
982		goto out;
983	}
984	if ((so->so_state & SS_ISCONNECTED) == 0) {
985		/*
986		 * `sendto' and `sendmsg' are allowed on a connection-based
987		 * socket if it supports implied connect.  Return ENOTCONN if
988		 * not connected and no address is supplied.
989		 */
990		if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
991		    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
992			if ((so->so_state & SS_ISCONFIRMING) == 0 &&
993			    !(resid == 0 && clen != 0)) {
994				SOCKBUF_UNLOCK(&so->so_snd);
995				error = ENOTCONN;
996				goto out;
997			}
998		} else if (addr == NULL) {
999			if (so->so_proto->pr_flags & PR_CONNREQUIRED)
1000				error = ENOTCONN;
1001			else
1002				error = EDESTADDRREQ;
1003			SOCKBUF_UNLOCK(&so->so_snd);
1004			goto out;
1005		}
1006	}
1007
1008	/*
1009	 * Do we need MSG_OOB support in SOCK_DGRAM?  Signs here may be a
1010	 * problem and need fixing.
1011	 */
1012	space = sbspace(&so->so_snd);
1013	if (flags & MSG_OOB)
1014		space += 1024;
1015	space -= clen;
1016	SOCKBUF_UNLOCK(&so->so_snd);
1017	if (resid > space) {
1018		error = EMSGSIZE;
1019		goto out;
1020	}
1021	if (uio == NULL) {
1022		resid = 0;
1023		if (flags & MSG_EOR)
1024			top->m_flags |= M_EOR;
1025	} else {
1026#ifdef ZERO_COPY_SOCKETS
1027		error = sosend_copyin(uio, &top, atomic, &space, flags);
1028		if (error)
1029			goto out;
1030#else
1031		/*
1032		 * Copy the data from userland into an mbuf chain.
1033		 * If no data is to be copied in, a single empty mbuf
1034		 * is returned.
1035		 */
1036		top = m_uiotombuf(uio, M_WAITOK, space, max_hdr,
1037		    (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0)));
1038		if (top == NULL) {
1039			error = EFAULT;	/* only possible error */
1040			goto out;
1041		}
1042		space -= resid - uio->uio_resid;
1043#endif
1044		resid = uio->uio_resid;
1045	}
1046	KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
1047	/*
1048	 * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock
1049	 * than with.
1050	 */
1051	if (dontroute) {
1052		SOCK_LOCK(so);
1053		so->so_options |= SO_DONTROUTE;
1054		SOCK_UNLOCK(so);
1055	}
1056	/*
1057	 * XXX all the SBS_CANTSENDMORE checks previously done could be out
1058	 * of date.  We could have received a reset packet in an interrupt or
1059	 * maybe we slept while doing page faults in uiomove() etc.  We could
1060	 * probably recheck again inside the locking protection here, but
1061	 * there are probably other places that this also happens.  We must
1062	 * rethink this.
1063	 */
1064	error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1065	    (flags & MSG_OOB) ? PRUS_OOB :
1066	/*
1067	 * If the user set MSG_EOF, the protocol understands this flag, and
1068	 * nothing is left to send, then use PRU_SEND_EOF instead of PRU_SEND.
1069	 */
1070	    ((flags & MSG_EOF) &&
1071	     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1072	     (resid <= 0)) ?
1073		PRUS_EOF :
1074		/* If there is more to send set PRUS_MORETOCOME */
1075		(resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
1076		top, addr, control, td);
1077	if (dontroute) {
1078		SOCK_LOCK(so);
1079		so->so_options &= ~SO_DONTROUTE;
1080		SOCK_UNLOCK(so);
1081	}
1082	clen = 0;
1083	control = NULL;
1084	top = NULL;
1085out:
1086	if (top != NULL)
1087		m_freem(top);
1088	if (control != NULL)
1089		m_freem(control);
1090	return (error);
1091}
1092
1093/*
1094 * Send on a socket.  If send must go all at once and message is larger than
1095 * send buffering, then hard error.  Lock against other senders.  If must go
1096 * all at once and not enough room now, then inform user that this would
1097 * block and do nothing.  Otherwise, if nonblocking, send as much as
1098 * possible.  The data to be sent is described by "uio" if nonzero, otherwise
1099 * by the mbuf chain "top" (which must be null if uio is not).  Data provided
1100 * in mbuf chain must be small enough to send all at once.
1101 *
1102 * Returns nonzero on error, timeout or signal; callers must check for short
1103 * counts if EINTR/ERESTART are returned.  Data and control buffers are freed
1104 * on return.
1105 */
1106int
1107sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio,
1108    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
1109{
1110	long space, resid;
1111	int clen = 0, error, dontroute;
1112	int atomic = sosendallatonce(so) || top;
1113
1114	if (uio != NULL)
1115		resid = uio->uio_resid;
1116	else
1117		resid = top->m_pkthdr.len;
1118	/*
1119	 * In theory resid should be unsigned.  However, space must be
1120	 * signed, as it might be less than 0 if we over-committed, and we
1121	 * must use a signed comparison of space and resid.  On the other
1122	 * hand, a negative resid causes us to loop sending 0-length
1123	 * segments to the protocol.
1124	 *
1125	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
1126	 * type sockets since that's an error.
1127	 */
1128	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
1129		error = EINVAL;
1130		goto out;
1131	}
1132
1133	dontroute =
1134	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
1135	    (so->so_proto->pr_flags & PR_ATOMIC);
1136	if (td != NULL)
1137		td->td_ru.ru_msgsnd++;
1138	if (control != NULL)
1139		clen = control->m_len;
1140
1141	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1142	if (error)
1143		goto out;
1144
1145restart:
1146	do {
1147		SOCKBUF_LOCK(&so->so_snd);
1148		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
1149			SOCKBUF_UNLOCK(&so->so_snd);
1150			error = EPIPE;
1151			goto release;
1152		}
1153		if (so->so_error) {
1154			error = so->so_error;
1155			so->so_error = 0;
1156			SOCKBUF_UNLOCK(&so->so_snd);
1157			goto release;
1158		}
1159		if ((so->so_state & SS_ISCONNECTED) == 0) {
1160			/*
1161			 * `sendto' and `sendmsg' are allowed on a connection-
1162			 * based socket if it supports implied connect.
1163			 * Return ENOTCONN if not connected and no address is
1164			 * supplied.
1165			 */
1166			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
1167			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
1168				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1169				    !(resid == 0 && clen != 0)) {
1170					SOCKBUF_UNLOCK(&so->so_snd);
1171					error = ENOTCONN;
1172					goto release;
1173				}
1174			} else if (addr == NULL) {
1175				SOCKBUF_UNLOCK(&so->so_snd);
1176				if (so->so_proto->pr_flags & PR_CONNREQUIRED)
1177					error = ENOTCONN;
1178				else
1179					error = EDESTADDRREQ;
1180				goto release;
1181			}
1182		}
1183		space = sbspace(&so->so_snd);
1184		if (flags & MSG_OOB)
1185			space += 1024;
1186		if ((atomic && resid > so->so_snd.sb_hiwat) ||
1187		    clen > so->so_snd.sb_hiwat) {
1188			SOCKBUF_UNLOCK(&so->so_snd);
1189			error = EMSGSIZE;
1190			goto release;
1191		}
1192		if (space < resid + clen &&
1193		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
1194			if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) {
1195				SOCKBUF_UNLOCK(&so->so_snd);
1196				error = EWOULDBLOCK;
1197				goto release;
1198			}
1199			error = sbwait(&so->so_snd);
1200			SOCKBUF_UNLOCK(&so->so_snd);
1201			if (error)
1202				goto release;
1203			goto restart;
1204		}
1205		SOCKBUF_UNLOCK(&so->so_snd);
1206		space -= clen;
1207		do {
1208			if (uio == NULL) {
1209				resid = 0;
1210				if (flags & MSG_EOR)
1211					top->m_flags |= M_EOR;
1212			} else {
1213#ifdef ZERO_COPY_SOCKETS
1214				error = sosend_copyin(uio, &top, atomic,
1215				    &space, flags);
1216				if (error != 0)
1217					goto release;
1218#else
1219				/*
1220				 * Copy the data from userland into an mbuf
1221				 * chain.  If no data is to be copied in,
1222				 * a single empty mbuf is returned.
1223				 */
1224				top = m_uiotombuf(uio, M_WAITOK, space,
1225				    (atomic ? max_hdr : 0),
1226				    (atomic ? M_PKTHDR : 0) |
1227				    ((flags & MSG_EOR) ? M_EOR : 0));
1228				if (top == NULL) {
1229					error = EFAULT; /* only possible error */
1230					goto release;
1231				}
1232				space -= resid - uio->uio_resid;
1233#endif
1234				resid = uio->uio_resid;
1235			}
1236			if (dontroute) {
1237				SOCK_LOCK(so);
1238				so->so_options |= SO_DONTROUTE;
1239				SOCK_UNLOCK(so);
1240			}
1241			/*
1242			 * XXX all the SBS_CANTSENDMORE checks previously
1243			 * done could be out of date.  We could have received
1244			 * a reset packet in an interrupt or maybe we slept
1245			 * while doing page faults in uiomove() etc.  We
1246			 * could probably recheck again inside the locking
1247			 * protection here, but there are probably other
1248			 * places that this also happens.  We must rethink
1249			 * this.
1250			 */
1251			error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1252			    (flags & MSG_OOB) ? PRUS_OOB :
1253			/*
1254			 * If the user set MSG_EOF, the protocol understands
1255			 * this flag, and nothing is left to send, then use
1256			 * PRU_SEND_EOF instead of PRU_SEND.
1257			 */
1258			    ((flags & MSG_EOF) &&
1259			     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1260			     (resid <= 0)) ?
1261				PRUS_EOF :
1262			/* If there is more to send set PRUS_MORETOCOME. */
1263			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
1264			    top, addr, control, td);
1265			if (dontroute) {
1266				SOCK_LOCK(so);
1267				so->so_options &= ~SO_DONTROUTE;
1268				SOCK_UNLOCK(so);
1269			}
1270			clen = 0;
1271			control = NULL;
1272			top = NULL;
1273			if (error)
1274				goto release;
1275		} while (resid && space > 0);
1276	} while (resid);
1277
1278release:
1279	sbunlock(&so->so_snd);
1280out:
1281	if (top != NULL)
1282		m_freem(top);
1283	if (control != NULL)
1284		m_freem(control);
1285	return (error);
1286}
1287
1288int
1289sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
1290    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
1291{
1292
1293	return (so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top,
1294	    control, flags, td));
1295}
1296
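/*
 * Editorial sketch (added): driving sosend() from kernel code with a
 * UIO_SYSSPACE uio describing a plain buffer.  Hypothetical helper, not
 * part of this file.
 */
#if 0
static int
example_sosend(struct socket *so, void *buf, size_t len, struct thread *td)
{
	struct uio auio;
	struct iovec aiov;

	aiov.iov_base = buf;
	aiov.iov_len = len;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = 0;
	auio.uio_resid = len;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_rw = UIO_WRITE;
	auio.uio_td = td;
	return (sosend(so, NULL, &auio, NULL, NULL, 0, td));
}
#endif
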
1297/*
1298 * The part of soreceive() that implements reading non-inline out-of-band
1299 * data from a socket.  For more complete comments, see soreceive(), from
1300 * which this code originated.
1301 *
1302 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
1303 * unable to return an mbuf chain to the caller.
1304 */
1305static int
1306soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
1307{
1308	struct protosw *pr = so->so_proto;
1309	struct mbuf *m;
1310	int error;
1311
1312	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
1313
1314	m = m_get(M_WAIT, MT_DATA);
1315	error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
1316	if (error)
1317		goto bad;
1318	do {
1319#ifdef ZERO_COPY_SOCKETS
1320		if (so_zero_copy_receive) {
1321			int disposable;
1322
1323			if ((m->m_flags & M_EXT)
1324			 && (m->m_ext.ext_type == EXT_DISPOSABLE))
1325				disposable = 1;
1326			else
1327				disposable = 0;
1328
1329			error = uiomoveco(mtod(m, void *),
1330					  min(uio->uio_resid, m->m_len),
1331					  uio, disposable);
1332		} else
1333#endif /* ZERO_COPY_SOCKETS */
1334		error = uiomove(mtod(m, void *),
1335		    (int) min(uio->uio_resid, m->m_len), uio);
1336		m = m_free(m);
1337	} while (uio->uio_resid && error == 0 && m);
1338bad:
1339	if (m != NULL)
1340		m_freem(m);
1341	return (error);
1342}
1343
1344/*
1345 * Following replacement or removal of the first mbuf on the first mbuf chain
1346 * of a socket buffer, push necessary state changes back into the socket
1347 * buffer so that other consumers see the values consistently.  'nextrecord'
1348 * is the caller's locally stored copy of the original value of
1349 * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
1350 * NOTE: 'nextrecord' may be NULL.
1351 */
1352static __inline void
1353sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
1354{
1355
1356	SOCKBUF_LOCK_ASSERT(sb);
1357	/*
1358	 * First, update for the new value of nextrecord.  If necessary, make
1359	 * it the first record.
1360	 */
1361	if (sb->sb_mb != NULL)
1362		sb->sb_mb->m_nextpkt = nextrecord;
1363	else
1364		sb->sb_mb = nextrecord;
1365
1366	/*
1367	 * Now update any dependent socket buffer fields to reflect the new
1368	 * state.  This is an expanded inline of SB_EMPTY_FIXUP(), with the
1369	 * addition of a second clause that takes care of the case where
1370	 * sb_mb has been updated, but remains the last record.
1371	 */
1372	if (sb->sb_mb == NULL) {
1373		sb->sb_mbtail = NULL;
1374		sb->sb_lastrecord = NULL;
1375	} else if (sb->sb_mb->m_nextpkt == NULL)
1376		sb->sb_lastrecord = sb->sb_mb;
1377}
1378
1379
1380/*
1381 * Implement receive operations on a socket.  We depend on the way that
1382 * records are added to the sockbuf by sbappend.  In particular, each record
1383 * (mbufs linked through m_next) must begin with an address if the protocol
1384 * so specifies, followed by an optional mbuf or mbufs containing ancillary
1385 * data, and then zero or more mbufs of data.  In order to allow parallelism
1386 * between network receive and copying to user space, as well as avoid
1387 * sleeping with a mutex held, we release the socket buffer mutex during the
1388 * user space copy.  Although the sockbuf is locked, new data may still be
1389 * appended, and thus we must maintain consistency of the sockbuf during that
1390 * time.
1391 *
1392 * The caller may receive the data as a single mbuf chain by supplying an
1393 * mbuf **mp0 for use in returning the chain.  The uio is then used only for
1394 * the count in uio_resid.
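 *
 * Editorial sketch (added): the record layout this depends on, with
 * records chained through m_nextpkt and the mbufs of a single record
 * chained through m_next:
 *
 *	sb_mb -> [MT_SONAME] -> [MT_CONTROL] -> [MT_DATA] -> ...
 *	              |                          (m_next links)
 *	          m_nextpkt
 *	              v
 *	         [next record] -> ...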
1395 */
1396int
1397soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio,
1398    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1399{
1400	struct mbuf *m, **mp;
1401	int flags, len, error, offset;
1402	struct protosw *pr = so->so_proto;
1403	struct mbuf *nextrecord;
1404	int moff, type = 0;
1405	int orig_resid = uio->uio_resid;
1406
1407	mp = mp0;
1408	if (psa != NULL)
1409		*psa = NULL;
1410	if (controlp != NULL)
1411		*controlp = NULL;
1412	if (flagsp != NULL)
1413		flags = *flagsp &~ MSG_EOR;
1414	else
1415		flags = 0;
1416	if (flags & MSG_OOB)
1417		return (soreceive_rcvoob(so, uio, flags));
1418	if (mp != NULL)
1419		*mp = NULL;
1420	if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING)
1421	    && uio->uio_resid)
1422		(*pr->pr_usrreqs->pru_rcvd)(so, 0);
1423
1424	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
1425	if (error)
1426		return (error);
1427
1428restart:
1429	SOCKBUF_LOCK(&so->so_rcv);
1430	m = so->so_rcv.sb_mb;
1431	/*
1432	 * If we have less data than requested, block awaiting more (subject
1433	 * to any timeout) if:
1434	 *   1. the current count is less than the low water mark, or
1435	 *   2. MSG_WAITALL is set, and it is possible to do the entire
1436	 *	receive operation at once if we block (resid <= hiwat).
1437	 *   3. MSG_DONTWAIT is not set
1438	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
1439	 * we have to do the receive in sections, and thus risk returning a
1440	 * short count if a timeout or signal occurs after we start.
1441	 */
1442	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
1443	    so->so_rcv.sb_cc < uio->uio_resid) &&
1444	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
1445	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
1446	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
1447		KASSERT(m != NULL || !so->so_rcv.sb_cc,
1448		    ("receive: m == %p so->so_rcv.sb_cc == %u",
1449		    m, so->so_rcv.sb_cc));
1450		if (so->so_error) {
1451			if (m != NULL)
1452				goto dontblock;
1453			error = so->so_error;
1454			if ((flags & MSG_PEEK) == 0)
1455				so->so_error = 0;
1456			SOCKBUF_UNLOCK(&so->so_rcv);
1457			goto release;
1458		}
1459		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1460		if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
1461			if (m == NULL) {
1462				SOCKBUF_UNLOCK(&so->so_rcv);
1463				goto release;
1464			} else
1465				goto dontblock;
1466		}
1467		for (; m != NULL; m = m->m_next)
1468			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
1469				m = so->so_rcv.sb_mb;
1470				goto dontblock;
1471			}
1472		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
1473		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
1474			SOCKBUF_UNLOCK(&so->so_rcv);
1475			error = ENOTCONN;
1476			goto release;
1477		}
1478		if (uio->uio_resid == 0) {
1479			SOCKBUF_UNLOCK(&so->so_rcv);
1480			goto release;
1481		}
1482		if ((so->so_state & SS_NBIO) ||
1483		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
1484			SOCKBUF_UNLOCK(&so->so_rcv);
1485			error = EWOULDBLOCK;
1486			goto release;
1487		}
1488		SBLASTRECORDCHK(&so->so_rcv);
1489		SBLASTMBUFCHK(&so->so_rcv);
1490		error = sbwait(&so->so_rcv);
1491		SOCKBUF_UNLOCK(&so->so_rcv);
1492		if (error)
1493			goto release;
1494		goto restart;
1495	}
1496dontblock:
1497	/*
1498	 * From this point onward, we maintain 'nextrecord' as a cache of the
1499	 * pointer to the next record in the socket buffer.  We must keep the
1500	 * various socket buffer pointers and local stack versions of the
1501	 * pointers in sync, pushing out modifications before dropping the
1502	 * socket buffer mutex, and re-reading them when picking it up.
1503	 *
1504	 * Otherwise, we will race with the network stack appending new data
1505	 * or records onto the socket buffer by using inconsistent/stale
1506	 * versions of the field, possibly resulting in socket buffer
1507	 * corruption.
1508	 *
1509	 * By holding the high-level sblock(), we prevent simultaneous
1510	 * readers from pulling off the front of the socket buffer.
1511	 */
1512	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1513	if (uio->uio_td)
1514		uio->uio_td->td_ru.ru_msgrcv++;
1515	KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
1516	SBLASTRECORDCHK(&so->so_rcv);
1517	SBLASTMBUFCHK(&so->so_rcv);
1518	nextrecord = m->m_nextpkt;
1519	if (pr->pr_flags & PR_ADDR) {
1520		KASSERT(m->m_type == MT_SONAME,
1521		    ("m->m_type == %d", m->m_type));
1522		orig_resid = 0;
1523		if (psa != NULL)
1524			*psa = sodupsockaddr(mtod(m, struct sockaddr *),
1525			    M_NOWAIT);
1526		if (flags & MSG_PEEK) {
1527			m = m->m_next;
1528		} else {
1529			sbfree(&so->so_rcv, m);
1530			so->so_rcv.sb_mb = m_free(m);
1531			m = so->so_rcv.sb_mb;
1532			sockbuf_pushsync(&so->so_rcv, nextrecord);
1533		}
1534	}
1535
1536	/*
1537	 * Process one or more MT_CONTROL mbufs present before any data mbufs
1538	 * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
1539	 * just copy the data; if !MSG_PEEK, we call into the protocol to
1540	 * perform externalization (or freeing if controlp == NULL).
1541	 */
1542	if (m != NULL && m->m_type == MT_CONTROL) {
1543		struct mbuf *cm = NULL, *cmn;
1544		struct mbuf **cme = &cm;
1545
1546		do {
1547			if (flags & MSG_PEEK) {
1548				if (controlp != NULL) {
1549					*controlp = m_copy(m, 0, m->m_len);
1550					controlp = &(*controlp)->m_next;
1551				}
1552				m = m->m_next;
1553			} else {
1554				sbfree(&so->so_rcv, m);
1555				so->so_rcv.sb_mb = m->m_next;
1556				m->m_next = NULL;
1557				*cme = m;
1558				cme = &(*cme)->m_next;
1559				m = so->so_rcv.sb_mb;
1560			}
1561		} while (m != NULL && m->m_type == MT_CONTROL);
1562		if ((flags & MSG_PEEK) == 0)
1563			sockbuf_pushsync(&so->so_rcv, nextrecord);
1564		while (cm != NULL) {
1565			cmn = cm->m_next;
1566			cm->m_next = NULL;
1567			if (pr->pr_domain->dom_externalize != NULL) {
1568				SOCKBUF_UNLOCK(&so->so_rcv);
1569				error = (*pr->pr_domain->dom_externalize)
1570				    (cm, controlp);
1571				SOCKBUF_LOCK(&so->so_rcv);
1572			} else if (controlp != NULL)
1573				*controlp = cm;
1574			else
1575				m_freem(cm);
1576			if (controlp != NULL) {
1577				orig_resid = 0;
1578				while (*controlp != NULL)
1579					controlp = &(*controlp)->m_next;
1580			}
1581			cm = cmn;
1582		}
1583		if (m != NULL)
1584			nextrecord = so->so_rcv.sb_mb->m_nextpkt;
1585		else
1586			nextrecord = so->so_rcv.sb_mb;
1587		orig_resid = 0;
1588	}
1589	if (m != NULL) {
1590		if ((flags & MSG_PEEK) == 0) {
1591			KASSERT(m->m_nextpkt == nextrecord,
1592			    ("soreceive: post-control, nextrecord !sync"));
1593			if (nextrecord == NULL) {
1594				KASSERT(so->so_rcv.sb_mb == m,
1595				    ("soreceive: post-control, sb_mb!=m"));
1596				KASSERT(so->so_rcv.sb_lastrecord == m,
1597				    ("soreceive: post-control, lastrecord!=m"));
1598			}
1599		}
1600		type = m->m_type;
1601		if (type == MT_OOBDATA)
1602			flags |= MSG_OOB;
1603	} else {
1604		if ((flags & MSG_PEEK) == 0) {
1605			KASSERT(so->so_rcv.sb_mb == nextrecord,
1606			    ("soreceive: sb_mb != nextrecord"));
1607			if (so->so_rcv.sb_mb == NULL) {
1608				KASSERT(so->so_rcv.sb_lastrecord == NULL,
1609				    ("soreceive: sb_lastrecord != NULL"));
1610			}
1611		}
1612	}
1613	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1614	SBLASTRECORDCHK(&so->so_rcv);
1615	SBLASTMBUFCHK(&so->so_rcv);
1616
1617	/*
1618	 * Now continue to read any data mbufs off of the head of the socket
1619	 * buffer until the read request is satisfied.  Note that 'type' is
1620	 * used to store the type of any mbuf reads that have happened so far
1621	 * such that soreceive() can stop reading if the type changes, which
1622	 * causes soreceive() to return only one of regular data and inline
1623	 * out-of-band data in a single socket receive operation.
1624	 */
1625	moff = 0;
1626	offset = 0;
1627	while (m != NULL && uio->uio_resid > 0 && error == 0) {
1628		/*
1629		 * If the type of mbuf has changed since the last mbuf
1630		 * examined ('type'), end the receive operation.
1631		 */
1632		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1633		if (m->m_type == MT_OOBDATA) {
1634			if (type != MT_OOBDATA)
1635				break;
1636		} else if (type == MT_OOBDATA)
1637			break;
1638		else
1639		    KASSERT(m->m_type == MT_DATA,
1640			("m->m_type == %d", m->m_type));
1641		so->so_rcv.sb_state &= ~SBS_RCVATMARK;
1642		len = uio->uio_resid;
1643		if (so->so_oobmark && len > so->so_oobmark - offset)
1644			len = so->so_oobmark - offset;
1645		if (len > m->m_len - moff)
1646			len = m->m_len - moff;
1647		/*
1648		 * If mp is set, just pass back the mbufs.  Otherwise copy
1649		 * them out via the uio, then free.  The sockbuf must be
1650		 * consistent here (it points to the current mbuf and to the
1651		 * next record) when we drop the lock; we must note any
1652		 * additions to the sockbuf when we reacquire it.
1653		 */
1654		if (mp == NULL) {
1655			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1656			SBLASTRECORDCHK(&so->so_rcv);
1657			SBLASTMBUFCHK(&so->so_rcv);
1658			SOCKBUF_UNLOCK(&so->so_rcv);
1659#ifdef ZERO_COPY_SOCKETS
1660			if (so_zero_copy_receive) {
1661				int disposable;
1662
1663				if ((m->m_flags & M_EXT)
1664				 && (m->m_ext.ext_type == EXT_DISPOSABLE))
1665					disposable = 1;
1666				else
1667					disposable = 0;
1668
1669				error = uiomoveco(mtod(m, char *) + moff,
1670						  (int)len, uio,
1671						  disposable);
1672			} else
1673#endif /* ZERO_COPY_SOCKETS */
1674			error = uiomove(mtod(m, char *) + moff, (int)len, uio);
1675			SOCKBUF_LOCK(&so->so_rcv);
1676			if (error) {
1677				/*
1678				 * The MT_SONAME mbuf has already been removed
1679				 * from the record, so it is necessary to
1680				 * remove the data mbufs, if any, to preserve
1681				 * the invariant in the case of PR_ADDR that
1682				 * requires MT_SONAME mbufs at the head of
1683				 * each record.
1684				 */
1685				if (m && pr->pr_flags & PR_ATOMIC &&
1686				    ((flags & MSG_PEEK) == 0))
1687					(void)sbdroprecord_locked(&so->so_rcv);
1688				SOCKBUF_UNLOCK(&so->so_rcv);
1689				goto release;
1690			}
1691		} else
1692			uio->uio_resid -= len;
1693		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1694		if (len == m->m_len - moff) {
1695			if (m->m_flags & M_EOR)
1696				flags |= MSG_EOR;
1697			if (flags & MSG_PEEK) {
1698				m = m->m_next;
1699				moff = 0;
1700			} else {
1701				nextrecord = m->m_nextpkt;
1702				sbfree(&so->so_rcv, m);
1703				if (mp != NULL) {
1704					*mp = m;
1705					mp = &m->m_next;
1706					so->so_rcv.sb_mb = m = m->m_next;
1707					*mp = NULL;
1708				} else {
1709					so->so_rcv.sb_mb = m_free(m);
1710					m = so->so_rcv.sb_mb;
1711				}
1712				sockbuf_pushsync(&so->so_rcv, nextrecord);
1713				SBLASTRECORDCHK(&so->so_rcv);
1714				SBLASTMBUFCHK(&so->so_rcv);
1715			}
1716		} else {
1717			if (flags & MSG_PEEK)
1718				moff += len;
1719			else {
1720				if (mp != NULL) {
1721					int copy_flag;
1722
1723					if (flags & MSG_DONTWAIT)
1724						copy_flag = M_DONTWAIT;
1725					else
1726						copy_flag = M_WAIT;
1727					if (copy_flag == M_WAIT)
1728						SOCKBUF_UNLOCK(&so->so_rcv);
1729					*mp = m_copym(m, 0, len, copy_flag);
1730					if (copy_flag == M_WAIT)
1731						SOCKBUF_LOCK(&so->so_rcv);
1732					if (*mp == NULL) {
1733						/*
1734						 * m_copym() couldn't
1735						 * allocate an mbuf.  Adjust
1736						 * uio_resid back (it was
1737						 * adjusted down by len
1738						 * bytes, which we didn't end
1739						 * up "copying" over).
1740						 */
1741						uio->uio_resid += len;
1742						break;
1743					}
1744				}
1745				m->m_data += len;
1746				m->m_len -= len;
1747				so->so_rcv.sb_cc -= len;
1748			}
1749		}
1750		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1751		if (so->so_oobmark) {
1752			if ((flags & MSG_PEEK) == 0) {
1753				so->so_oobmark -= len;
1754				if (so->so_oobmark == 0) {
1755					so->so_rcv.sb_state |= SBS_RCVATMARK;
1756					break;
1757				}
1758			} else {
1759				offset += len;
1760				if (offset == so->so_oobmark)
1761					break;
1762			}
1763		}
1764		if (flags & MSG_EOR)
1765			break;
1766		/*
1767		 * If the MSG_WAITALL flag is set (for a non-atomic socket), we
1768		 * must not quit until "uio->uio_resid == 0" or an error
1769		 * termination.  If a signal/timeout occurs, return with a
1770		 * short count but without error.  Keep sockbuf locked
1771		 * against other readers.
1772		 */
1773		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
1774		    !sosendallatonce(so) && nextrecord == NULL) {
1775			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1776			if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE)
1777				break;
1778			/*
1779			 * Notify the protocol that some data has been
1780			 * drained before blocking.
1781			 */
1782			if (pr->pr_flags & PR_WANTRCVD) {
1783				SOCKBUF_UNLOCK(&so->so_rcv);
1784				(*pr->pr_usrreqs->pru_rcvd)(so, flags);
1785				SOCKBUF_LOCK(&so->so_rcv);
1786			}
1787			SBLASTRECORDCHK(&so->so_rcv);
1788			SBLASTMBUFCHK(&so->so_rcv);
1789			error = sbwait(&so->so_rcv);
1790			if (error) {
1791				SOCKBUF_UNLOCK(&so->so_rcv);
1792				goto release;
1793			}
1794			m = so->so_rcv.sb_mb;
1795			if (m != NULL)
1796				nextrecord = m->m_nextpkt;
1797		}
1798	}
1799
1800	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1801	if (m != NULL && pr->pr_flags & PR_ATOMIC) {
1802		flags |= MSG_TRUNC;
1803		if ((flags & MSG_PEEK) == 0)
1804			(void) sbdroprecord_locked(&so->so_rcv);
1805	}
1806	if ((flags & MSG_PEEK) == 0) {
1807		if (m == NULL) {
1808			/*
1809			 * First part is an inline SB_EMPTY_FIXUP().  Second
1810			 * part makes sure sb_lastrecord is up-to-date if
1811			 * there is still data in the socket buffer.
1812			 */
1813			so->so_rcv.sb_mb = nextrecord;
1814			if (so->so_rcv.sb_mb == NULL) {
1815				so->so_rcv.sb_mbtail = NULL;
1816				so->so_rcv.sb_lastrecord = NULL;
1817			} else if (nextrecord->m_nextpkt == NULL)
1818				so->so_rcv.sb_lastrecord = nextrecord;
1819		}
1820		SBLASTRECORDCHK(&so->so_rcv);
1821		SBLASTMBUFCHK(&so->so_rcv);
1822		/*
1823		 * If soreceive() is being done from the socket callback,
1824		 * we don't need to generate an ACK to the peer to update the
1825		 * window, since an ACK will be generated on return to TCP.
1826		 */
1827		if (!(flags & MSG_SOCALLBCK) &&
1828		    (pr->pr_flags & PR_WANTRCVD)) {
1829			SOCKBUF_UNLOCK(&so->so_rcv);
1830			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
1831			SOCKBUF_LOCK(&so->so_rcv);
1832		}
1833	}
1834	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1835	if (orig_resid == uio->uio_resid && orig_resid &&
1836	    (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
1837		SOCKBUF_UNLOCK(&so->so_rcv);
1838		goto restart;
1839	}
1840	SOCKBUF_UNLOCK(&so->so_rcv);
1841
1842	if (flagsp != NULL)
1843		*flagsp |= flags;
1844release:
1845	sbunlock(&so->so_rcv);
1846	return (error);
1847}
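
/*
 * Example: a minimal sketch of an in-kernel read through soreceive(),
 * assuming 'so' is a connected stream socket and 'buf'/'buflen' describe a
 * caller-owned kernel buffer.  Real consumers (NFS, for instance) wrap this
 * pattern with timeout and error handling.
 *
 *	struct uio auio;
 *	struct iovec aiov;
 *	int error, flags = 0;
 *
 *	aiov.iov_base = buf;
 *	aiov.iov_len = buflen;
 *	auio.uio_iov = &aiov;
 *	auio.uio_iovcnt = 1;
 *	auio.uio_offset = 0;
 *	auio.uio_resid = buflen;
 *	auio.uio_segflg = UIO_SYSSPACE;
 *	auio.uio_rw = UIO_READ;
 *	auio.uio_td = curthread;
 *	error = soreceive(so, NULL, &auio, NULL, NULL, &flags);
 */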
1848
1849/*
1850 * Optimized version of soreceive() for simple datagram cases from userspace.
1851 * Unlike in the stream case, we're able to drop a datagram if copyout()
1852 * fails, and because we handle datagrams atomically, we don't need to use a
1853 * sleep lock to prevent I/O interlacing.
1854 */
1855int
1856soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio,
1857    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1858{
1859	struct mbuf *m, *m2;
1860	int flags, len, error, offset;
1861	struct protosw *pr = so->so_proto;
1862	struct mbuf *nextrecord;
1863
1864	if (psa != NULL)
1865		*psa = NULL;
1866	if (controlp != NULL)
1867		*controlp = NULL;
1868	if (flagsp != NULL)
1869		flags = *flagsp &~ MSG_EOR;
1870	else
1871		flags = 0;
1872
1873	/*
1874	 * For any complicated cases, fall back to the full
1875	 * soreceive_generic().
1876	 */
1877	if (mp0 != NULL || (flags & MSG_PEEK) || (flags & MSG_OOB))
1878		return (soreceive_generic(so, psa, uio, mp0, controlp,
1879		    flagsp));
1880
1881	/*
1882	 * Enforce restrictions on use.
1883	 */
1884	KASSERT((pr->pr_flags & PR_WANTRCVD) == 0,
1885	    ("soreceive_dgram: wantrcvd"));
1886	KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic"));
1887	KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0,
1888	    ("soreceive_dgram: SBS_RCVATMARK"));
1889	KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0,
1890	    ("soreceive_dgram: PR_CONNREQUIRED"));
1891
1892	/*
1893	 * Loop blocking while waiting for a datagram.
1894	 */
1895	SOCKBUF_LOCK(&so->so_rcv);
1896	while ((m = so->so_rcv.sb_mb) == NULL) {
1897		KASSERT(so->so_rcv.sb_cc == 0,
1898		    ("soreceive_dgram: sb_mb NULL but sb_cc %u",
1899		    so->so_rcv.sb_cc));
1900		if (so->so_error) {
1901			error = so->so_error;
1902			so->so_error = 0;
1903			SOCKBUF_UNLOCK(&so->so_rcv);
1904			return (error);
1905		}
1906		if (so->so_rcv.sb_state & SBS_CANTRCVMORE ||
1907		    uio->uio_resid == 0) {
1908			SOCKBUF_UNLOCK(&so->so_rcv);
1909			return (0);
1910		}
1911		if ((so->so_state & SS_NBIO) ||
1912		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
1913			SOCKBUF_UNLOCK(&so->so_rcv);
1914			return (EWOULDBLOCK);
1915		}
1916		SBLASTRECORDCHK(&so->so_rcv);
1917		SBLASTMBUFCHK(&so->so_rcv);
1918		error = sbwait(&so->so_rcv);
1919		if (error) {
1920			SOCKBUF_UNLOCK(&so->so_rcv);
1921			return (error);
1922		}
1923	}
1924	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1925
1926	if (uio->uio_td)
1927		uio->uio_td->td_ru.ru_msgrcv++;
1928	SBLASTRECORDCHK(&so->so_rcv);
1929	SBLASTMBUFCHK(&so->so_rcv);
1930	nextrecord = m->m_nextpkt;
1931	if (nextrecord == NULL) {
1932		KASSERT(so->so_rcv.sb_lastrecord == m,
1933		    ("soreceive_dgram: lastrecord != m"));
1934	}
1935
1936	KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord,
1937	    ("soreceive_dgram: m_nextpkt != nextrecord"));
1938
1939	/*
1940	 * Pull 'm' and its chain off the front of the packet queue.
1941	 */
1942	so->so_rcv.sb_mb = NULL;
1943	sockbuf_pushsync(&so->so_rcv, nextrecord);
1944
1945	/*
1946	 * Walk 'm's chain and free that many bytes from the socket buffer.
1947	 */
1948	for (m2 = m; m2 != NULL; m2 = m2->m_next)
1949		sbfree(&so->so_rcv, m2);
1950
1951	/*
1952	 * Do a few last checks before we let go of the lock.
1953	 */
1954	SBLASTRECORDCHK(&so->so_rcv);
1955	SBLASTMBUFCHK(&so->so_rcv);
1956	SOCKBUF_UNLOCK(&so->so_rcv);
1957
1958	if (pr->pr_flags & PR_ADDR) {
1959		KASSERT(m->m_type == MT_SONAME,
1960		    ("m->m_type == %d", m->m_type));
1961		if (psa != NULL)
1962			*psa = sodupsockaddr(mtod(m, struct sockaddr *),
1963			    M_NOWAIT);
1964		m = m_free(m);
1965	}
1966	if (m == NULL) {
1967		/* XXXRW: Can this happen? */
1968		return (0);
1969	}
1970
1971	/*
1972	 * Packet to copyout() is now in 'm' and it is disconnected from the
1973	 * queue.
1974	 *
1975	 * Process one or more MT_CONTROL mbufs present before any data mbufs
1976	 * in the first mbuf chain on the socket buffer.  We call into the
1977	 * protocol to perform externalization (or freeing if controlp ==
1978	 * NULL).
1979	 */
1980	if (m->m_type == MT_CONTROL) {
1981		struct mbuf *cm = NULL, *cmn;
1982		struct mbuf **cme = &cm;
1983
1984		do {
1985			m2 = m->m_next;
1986			m->m_next = NULL;
1987			*cme = m;
1988			cme = &(*cme)->m_next;
1989			m = m2;
1990		} while (m != NULL && m->m_type == MT_CONTROL);
1991		while (cm != NULL) {
1992			cmn = cm->m_next;
1993			cm->m_next = NULL;
1994			if (pr->pr_domain->dom_externalize != NULL) {
1995				error = (*pr->pr_domain->dom_externalize)
1996				    (cm, controlp);
1997			} else if (controlp != NULL)
1998				*controlp = cm;
1999			else
2000				m_freem(cm);
2001			if (controlp != NULL) {
2002				while (*controlp != NULL)
2003					controlp = &(*controlp)->m_next;
2004			}
2005			cm = cmn;
2006		}
2007	}
2008	KASSERT(m->m_type == MT_DATA, ("soreceive_dgram: !data"));
2009
2010	offset = 0;
2011	while (m != NULL && uio->uio_resid > 0) {
2012		len = uio->uio_resid;
2013		if (len > m->m_len)
2014			len = m->m_len;
2015		error = uiomove(mtod(m, char *), (int)len, uio);
2016		if (error) {
2017			m_freem(m);
2018			return (error);
2019		}
2020		m = m_free(m);
2021	}
2022	if (m != NULL)
2023		flags |= MSG_TRUNC;
2024	m_freem(m);
2025	if (flagsp != NULL)
2026		*flagsp |= flags;
2027	return (0);
2028}
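
/*
 * Example: a datagram protocol can plug this optimized path directly into
 * its user-request switch; the fallback to soreceive_generic() above keeps
 * the complicated cases correct.  A sketch for a hypothetical 'foo'
 * protocol (the in-tree UDP code uses this same hookup):
 *
 *	struct pr_usrreqs foo_usrreqs = {
 *		...
 *		.pru_soreceive =	soreceive_dgram,
 *	};
 */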
2029
2030int
2031soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
2032    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2033{
2034
2035	return (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio, mp0,
2036	    controlp, flagsp));
2037}
2038
2039int
2040soshutdown(struct socket *so, int how)
2041{
2042	struct protosw *pr = so->so_proto;
2043
2044	if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
2045		return (EINVAL);
2046	if (pr->pr_usrreqs->pru_flush != NULL) {
2047		(*pr->pr_usrreqs->pru_flush)(so, how);
2048	}
2049	if (how != SHUT_WR)
2050		sorflush(so);
2051	if (how != SHUT_RD)
2052		return ((*pr->pr_usrreqs->pru_shutdown)(so));
2053	return (0);
2054}
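
/*
 * Example: a kernel consumer asking for half-close semantics, assuming 'so'
 * is a connected socket.  SHUT_WR skips sorflush() and only asks the
 * protocol to send its end-of-stream indication:
 *
 *	error = soshutdown(so, SHUT_WR);
 *	if (error != 0)
 *		printf("soshutdown failed: %d\n", error);
 */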
2055
2056void
2057sorflush(struct socket *so)
2058{
2059	struct sockbuf *sb = &so->so_rcv;
2060	struct protosw *pr = so->so_proto;
2061	struct sockbuf asb;
2062
2063	/*
2064	 * In order to avoid calling dom_dispose with the socket buffer mutex
2065	 * held, and in order to generally avoid holding the lock for a long
2066	 * time, we make a copy of the socket buffer and clear the original
2067	 * (except locks, state).  The new socket buffer copy won't have
2068	 * initialized locks so we can only call routines that won't use or
2069	 * assert those locks.
2070	 *
2071	 * Dislodge threads currently blocked in receive and wait to acquire
2072	 * a lock against other simultaneous readers before clearing the
2073	 * socket buffer.  Don't let our acquire be interrupted by a signal
2074	 * despite any existing socket disposition on interruptible waiting.
2075	 */
2076	socantrcvmore(so);
2077	(void) sblock(sb, SBL_WAIT | SBL_NOINTR);
2078
2079	/*
2080	 * Invalidate/clear most of the sockbuf structure, but leave selinfo
2081	 * and mutex data unchanged.
2082	 */
2083	SOCKBUF_LOCK(sb);
2084	bzero(&asb, offsetof(struct sockbuf, sb_startzero));
2085	bcopy(&sb->sb_startzero, &asb.sb_startzero,
2086	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
2087	bzero(&sb->sb_startzero,
2088	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
2089	SOCKBUF_UNLOCK(sb);
2090	sbunlock(sb);
2091
2092	/*
2093	 * Dispose of special rights and flush the socket buffer.  Don't call
2094	 * any unsafe routines (that rely on locks being initialized) on asb.
2095	 */
2096	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
2097		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
2098	sbrelease_internal(&asb, so);
2099}
2100
2101/*
2102 * Perhaps this routine, and sooptcopyout(), below, ought to come in an
2103 * additional variant to handle the case where the option value needs to be
2104 * some kind of integer, but not a specific size.  In addition to their use
2105 * here, these functions are also called by the protocol-level pr_ctloutput()
2106 * routines.
2107 */
2108int
2109sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
2110{
2111	size_t	valsize;
2112
2113	/*
2114	 * If the user gives us more than we wanted, we ignore it, but if we
2115	 * don't get the minimum length the caller wants, we return EINVAL.
2116	 * On success, sopt->sopt_valsize is set to however much we actually
2117	 * retrieved.
2118	 */
2119	if ((valsize = sopt->sopt_valsize) < minlen)
2120		return EINVAL;
2121	if (valsize > len)
2122		sopt->sopt_valsize = valsize = len;
2123
2124	if (sopt->sopt_td != NULL)
2125		return (copyin(sopt->sopt_val, buf, valsize));
2126
2127	bcopy(sopt->sopt_val, buf, valsize);
2128	return (0);
2129}
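
/*
 * Example: a protocol-level pr_ctloutput() handler would consume a
 * fixed-size integer option with this helper roughly as follows (sketch;
 * 'inp->inp_foo' is a hypothetical per-connection field):
 *
 *	int optval;
 *
 *	error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
 *	if (error)
 *		return (error);
 *	inp->inp_foo = optval;
 */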
2130
2131/*
2132 * Kernel version of setsockopt(2).
2133 *
2134 * XXX: optlen is size_t, not socklen_t
2135 */
2136int
2137so_setsockopt(struct socket *so, int level, int optname, void *optval,
2138    size_t optlen)
2139{
2140	struct sockopt sopt;
2141
2142	sopt.sopt_level = level;
2143	sopt.sopt_name = optname;
2144	sopt.sopt_dir = SOPT_SET;
2145	sopt.sopt_val = optval;
2146	sopt.sopt_valsize = optlen;
2147	sopt.sopt_td = NULL;
2148	return (sosetopt(so, &sopt));
2149}
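
/*
 * Example: setting SO_RCVBUF from kernel code via the wrapper above,
 * assuming 'so' is a valid attached socket (sketch; only the return value
 * is checked):
 *
 *	int error, optval = 128 * 1024;
 *
 *	error = so_setsockopt(so, SOL_SOCKET, SO_RCVBUF, &optval,
 *	    sizeof(optval));
 */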
2150
2151int
2152sosetopt(struct socket *so, struct sockopt *sopt)
2153{
2154	int	error, optval;
2155	struct	linger l;
2156	struct	timeval tv;
2157	u_long  val;
2158#ifdef MAC
2159	struct mac extmac;
2160#endif
2161
2162	error = 0;
2163	if (sopt->sopt_level != SOL_SOCKET) {
2164		if (so->so_proto && so->so_proto->pr_ctloutput)
2165			return ((*so->so_proto->pr_ctloutput)
2166				  (so, sopt));
2167		error = ENOPROTOOPT;
2168	} else {
2169		switch (sopt->sopt_name) {
2170#ifdef INET
2171		case SO_ACCEPTFILTER:
2172			error = do_setopt_accept_filter(so, sopt);
2173			if (error)
2174				goto bad;
2175			break;
2176#endif
2177		case SO_LINGER:
2178			error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
2179			if (error)
2180				goto bad;
2181
2182			SOCK_LOCK(so);
2183			so->so_linger = l.l_linger;
2184			if (l.l_onoff)
2185				so->so_options |= SO_LINGER;
2186			else
2187				so->so_options &= ~SO_LINGER;
2188			SOCK_UNLOCK(so);
2189			break;
2190
2191		case SO_DEBUG:
2192		case SO_KEEPALIVE:
2193		case SO_DONTROUTE:
2194		case SO_USELOOPBACK:
2195		case SO_BROADCAST:
2196		case SO_REUSEADDR:
2197		case SO_REUSEPORT:
2198		case SO_OOBINLINE:
2199		case SO_TIMESTAMP:
2200		case SO_BINTIME:
2201		case SO_NOSIGPIPE:
2202		case SO_NO_DDP:
2203		case SO_NO_OFFLOAD:
2204			error = sooptcopyin(sopt, &optval, sizeof optval,
2205					    sizeof optval);
2206			if (error)
2207				goto bad;
2208			SOCK_LOCK(so);
2209			if (optval)
2210				so->so_options |= sopt->sopt_name;
2211			else
2212				so->so_options &= ~sopt->sopt_name;
2213			SOCK_UNLOCK(so);
2214			break;
2215
2216		case SO_SETFIB:
2217			error = sooptcopyin(sopt, &optval, sizeof optval,
2218					    sizeof optval);
			if (error)
				goto bad;
2219			if (optval < 1 || optval > rt_numfibs) {
2220				error = EINVAL;
2221				goto bad;
2222			}
2223			if ((so->so_proto->pr_domain->dom_family == PF_INET) ||
2224			    (so->so_proto->pr_domain->dom_family == PF_ROUTE)) {
2225				so->so_fibnum = optval;
2226				/* Note: ignore error */
2227				if (so->so_proto && so->so_proto->pr_ctloutput)
2228					(*so->so_proto->pr_ctloutput)(so, sopt);
2229			} else {
2230				so->so_fibnum = 0;
2231			}
2232			break;
2233		case SO_SNDBUF:
2234		case SO_RCVBUF:
2235		case SO_SNDLOWAT:
2236		case SO_RCVLOWAT:
2237			error = sooptcopyin(sopt, &optval, sizeof optval,
2238					    sizeof optval);
2239			if (error)
2240				goto bad;
2241
2242			/*
2243			 * Values < 1 make no sense for any of these options,
2244			 * so disallow them.
2245			 */
2246			if (optval < 1) {
2247				error = EINVAL;
2248				goto bad;
2249			}
2250
2251			switch (sopt->sopt_name) {
2252			case SO_SNDBUF:
2253			case SO_RCVBUF:
2254				if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
2255				    &so->so_snd : &so->so_rcv, (u_long)optval,
2256				    so, curthread) == 0) {
2257					error = ENOBUFS;
2258					goto bad;
2259				}
2260				(sopt->sopt_name == SO_SNDBUF ? &so->so_snd :
2261				    &so->so_rcv)->sb_flags &= ~SB_AUTOSIZE;
2262				break;
2263
2264			/*
2265			 * Make sure the low-water is never greater than the
2266			 * high-water.
2267			 */
2268			case SO_SNDLOWAT:
2269				SOCKBUF_LOCK(&so->so_snd);
2270				so->so_snd.sb_lowat =
2271				    (optval > so->so_snd.sb_hiwat) ?
2272				    so->so_snd.sb_hiwat : optval;
2273				SOCKBUF_UNLOCK(&so->so_snd);
2274				break;
2275			case SO_RCVLOWAT:
2276				SOCKBUF_LOCK(&so->so_rcv);
2277				so->so_rcv.sb_lowat =
2278				    (optval > so->so_rcv.sb_hiwat) ?
2279				    so->so_rcv.sb_hiwat : optval;
2280				SOCKBUF_UNLOCK(&so->so_rcv);
2281				break;
2282			}
2283			break;
2284
2285		case SO_SNDTIMEO:
2286		case SO_RCVTIMEO:
2287#ifdef COMPAT_IA32
2288			if (SV_CURPROC_FLAG(SV_ILP32)) {
2289				struct timeval32 tv32;
2290
2291				error = sooptcopyin(sopt, &tv32, sizeof tv32,
2292				    sizeof tv32);
2293				CP(tv32, tv, tv_sec);
2294				CP(tv32, tv, tv_usec);
2295			} else
2296#endif
2297				error = sooptcopyin(sopt, &tv, sizeof tv,
2298				    sizeof tv);
2299			if (error)
2300				goto bad;
2301
2302			/* assert(hz > 0); */
2303			if (tv.tv_sec < 0 || tv.tv_sec > INT_MAX / hz ||
2304			    tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
2305				error = EDOM;
2306				goto bad;
2307			}
2308			/* assert(tick > 0); */
2309			/* assert(ULONG_MAX - INT_MAX >= 1000000); */
2310			val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick;
2311			if (val > INT_MAX) {
2312				error = EDOM;
2313				goto bad;
2314			}
2315			if (val == 0 && tv.tv_usec != 0)
2316				val = 1;
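
			/*
			 * Worked example: with hz = 1000 a tick is 1000us,
			 * so tv = { 2, 500000 } converts to
			 * val = 2 * 1000 + 500000 / 1000 = 2500 ticks.
			 */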
2317
2318			switch (sopt->sopt_name) {
2319			case SO_SNDTIMEO:
2320				so->so_snd.sb_timeo = val;
2321				break;
2322			case SO_RCVTIMEO:
2323				so->so_rcv.sb_timeo = val;
2324				break;
2325			}
2326			break;
2327
2328		case SO_LABEL:
2329#ifdef MAC
2330			error = sooptcopyin(sopt, &extmac, sizeof extmac,
2331			    sizeof extmac);
2332			if (error)
2333				goto bad;
2334			error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
2335			    so, &extmac);
2336#else
2337			error = EOPNOTSUPP;
2338#endif
2339			break;
2340
2341		default:
2342			error = ENOPROTOOPT;
2343			break;
2344		}
2345		if (error == 0 && so->so_proto != NULL &&
2346		    so->so_proto->pr_ctloutput != NULL) {
2347			(void) ((*so->so_proto->pr_ctloutput)
2348				  (so, sopt));
2349		}
2350	}
2351bad:
2352	return (error);
2353}
2354
2355/*
2356 * Helper routine for getsockopt.
2357 */
2358int
2359sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
2360{
2361	int	error;
2362	size_t	valsize;
2363
2364	error = 0;
2365
2366	/*
2367	 * Documented get behavior is that we always return a value, possibly
2368	 * truncated to fit in the user's buffer.  Traditional behavior is
2369	 * that we always tell the user precisely how much we copied, rather
2370	 * than something useful like the total amount we had available for
2371	 * her.  Note that this interface is not idempotent; the entire
2372	 * answer must be generated ahead of time.
2373	 */
2374	valsize = min(len, sopt->sopt_valsize);
2375	sopt->sopt_valsize = valsize;
2376	if (sopt->sopt_val != NULL) {
2377		if (sopt->sopt_td != NULL)
2378			error = copyout(buf, sopt->sopt_val, valsize);
2379		else
2380			bcopy(buf, sopt->sopt_val, valsize);
2381	}
2382	return (error);
2383}
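
/*
 * Example: the SOPT_GET side of a protocol pr_ctloutput() handler pairs
 * with this helper (sketch; 'inp->inp_foo' is hypothetical):
 *
 *	optval = inp->inp_foo;
 *	error = sooptcopyout(sopt, &optval, sizeof(optval));
 */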
2384
2385int
2386sogetopt(struct socket *so, struct sockopt *sopt)
2387{
2388	int	error, optval;
2389	struct	linger l;
2390	struct	timeval tv;
2391#ifdef MAC
2392	struct mac extmac;
2393#endif
2394
2395	error = 0;
2396	if (sopt->sopt_level != SOL_SOCKET) {
2397		if (so->so_proto && so->so_proto->pr_ctloutput) {
2398			return ((*so->so_proto->pr_ctloutput)
2399				  (so, sopt));
2400		} else
2401			return (ENOPROTOOPT);
2402	} else {
2403		switch (sopt->sopt_name) {
2404#ifdef INET
2405		case SO_ACCEPTFILTER:
2406			error = do_getopt_accept_filter(so, sopt);
2407			break;
2408#endif
2409		case SO_LINGER:
2410			SOCK_LOCK(so);
2411			l.l_onoff = so->so_options & SO_LINGER;
2412			l.l_linger = so->so_linger;
2413			SOCK_UNLOCK(so);
2414			error = sooptcopyout(sopt, &l, sizeof l);
2415			break;
2416
2417		case SO_USELOOPBACK:
2418		case SO_DONTROUTE:
2419		case SO_DEBUG:
2420		case SO_KEEPALIVE:
2421		case SO_REUSEADDR:
2422		case SO_REUSEPORT:
2423		case SO_BROADCAST:
2424		case SO_OOBINLINE:
2425		case SO_ACCEPTCONN:
2426		case SO_TIMESTAMP:
2427		case SO_BINTIME:
2428		case SO_NOSIGPIPE:
2429			optval = so->so_options & sopt->sopt_name;
2430integer:
2431			error = sooptcopyout(sopt, &optval, sizeof optval);
2432			break;
2433
2434		case SO_TYPE:
2435			optval = so->so_type;
2436			goto integer;
2437
2438		case SO_ERROR:
2439			SOCK_LOCK(so);
2440			optval = so->so_error;
2441			so->so_error = 0;
2442			SOCK_UNLOCK(so);
2443			goto integer;
2444
2445		case SO_SNDBUF:
2446			optval = so->so_snd.sb_hiwat;
2447			goto integer;
2448
2449		case SO_RCVBUF:
2450			optval = so->so_rcv.sb_hiwat;
2451			goto integer;
2452
2453		case SO_SNDLOWAT:
2454			optval = so->so_snd.sb_lowat;
2455			goto integer;
2456
2457		case SO_RCVLOWAT:
2458			optval = so->so_rcv.sb_lowat;
2459			goto integer;
2460
2461		case SO_SNDTIMEO:
2462		case SO_RCVTIMEO:
2463			optval = (sopt->sopt_name == SO_SNDTIMEO ?
2464				  so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
2465
2466			tv.tv_sec = optval / hz;
2467			tv.tv_usec = (optval % hz) * tick;
2468#ifdef COMPAT_IA32
2469			if (SV_CURPROC_FLAG(SV_ILP32)) {
2470				struct timeval32 tv32;
2471
2472				CP(tv, tv32, tv_sec);
2473				CP(tv, tv32, tv_usec);
2474				error = sooptcopyout(sopt, &tv32, sizeof tv32);
2475			} else
2476#endif
2477				error = sooptcopyout(sopt, &tv, sizeof tv);
2478			break;
2479
2480		case SO_LABEL:
2481#ifdef MAC
2482			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2483			    sizeof(extmac));
2484			if (error)
2485				return (error);
2486			error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
2487			    so, &extmac);
2488			if (error)
2489				return (error);
2490			error = sooptcopyout(sopt, &extmac, sizeof extmac);
2491#else
2492			error = EOPNOTSUPP;
2493#endif
2494			break;
2495
2496		case SO_PEERLABEL:
2497#ifdef MAC
2498			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2499			    sizeof(extmac));
2500			if (error)
2501				return (error);
2502			error = mac_getsockopt_peerlabel(
2503			    sopt->sopt_td->td_ucred, so, &extmac);
2504			if (error)
2505				return (error);
2506			error = sooptcopyout(sopt, &extmac, sizeof extmac);
2507#else
2508			error = EOPNOTSUPP;
2509#endif
2510			break;
2511
2512		case SO_LISTENQLIMIT:
2513			optval = so->so_qlimit;
2514			goto integer;
2515
2516		case SO_LISTENQLEN:
2517			optval = so->so_qlen;
2518			goto integer;
2519
2520		case SO_LISTENINCQLEN:
2521			optval = so->so_incqlen;
2522			goto integer;
2523
2524		default:
2525			error = ENOPROTOOPT;
2526			break;
2527		}
2528		return (error);
2529	}
2530}
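
/*
 * Example: kernel code has no so_getsockopt() wrapper in this file, but it
 * can fill in a struct sockopt and call sogetopt() directly; a minimal
 * sketch for SO_ERROR:
 *
 *	struct sockopt sopt;
 *	int error, optval;
 *
 *	sopt.sopt_dir = SOPT_GET;
 *	sopt.sopt_level = SOL_SOCKET;
 *	sopt.sopt_name = SO_ERROR;
 *	sopt.sopt_val = &optval;
 *	sopt.sopt_valsize = sizeof(optval);
 *	sopt.sopt_td = NULL;
 *	error = sogetopt(so, &sopt);
 */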
2531
2532/* XXX: prepare mbuf for (__FreeBSD__ < 3) routines. */
2533int
2534soopt_getm(struct sockopt *sopt, struct mbuf **mp)
2535{
2536	struct mbuf *m, *m_prev;
2537	int sopt_size = sopt->sopt_valsize;
2538
2539	MGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT, MT_DATA);
2540	if (m == NULL)
2541		return ENOBUFS;
2542	if (sopt_size > MLEN) {
2543		MCLGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT);
2544		if ((m->m_flags & M_EXT) == 0) {
2545			m_free(m);
2546			return ENOBUFS;
2547		}
2548		m->m_len = min(MCLBYTES, sopt_size);
2549	} else {
2550		m->m_len = min(MLEN, sopt_size);
2551	}
2552	sopt_size -= m->m_len;
2553	*mp = m;
2554	m_prev = m;
2555
2556	while (sopt_size) {
2557		MGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT, MT_DATA);
2558		if (m == NULL) {
2559			m_freem(*mp);
2560			return ENOBUFS;
2561		}
2562		if (sopt_size > MLEN) {
2563			MCLGET(m, sopt->sopt_td != NULL ? M_WAIT :
2564			    M_DONTWAIT);
2565			if ((m->m_flags & M_EXT) == 0) {
2566				m_freem(m);
2567				m_freem(*mp);
2568				return ENOBUFS;
2569			}
2570			m->m_len = min(MCLBYTES, sopt_size);
2571		} else {
2572			m->m_len = min(MLEN, sopt_size);
2573		}
2574		sopt_size -= m->m_len;
2575		m_prev->m_next = m;
2576		m_prev = m;
2577	}
2578	return (0);
2579}
2580
2581/* XXX: copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
2582int
2583soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
2584{
2585	struct mbuf *m0 = m;
2586
2587	if (sopt->sopt_val == NULL)
2588		return (0);
2589	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2590		if (sopt->sopt_td != NULL) {
2591			int error;
2592
2593			error = copyin(sopt->sopt_val, mtod(m, char *),
2594				       m->m_len);
2595			if (error != 0) {
2596				m_freem(m0);
2597				return (error);
2598			}
2599		} else
2600			bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
2601		sopt->sopt_valsize -= m->m_len;
2602		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2603		m = m->m_next;
2604	}
2605	if (m != NULL) /* should have been allocated large enough by soopt_getm() */
2606		panic("ip6_sooptmcopyin");
2607	return (0);
2608}
2609
2610/* XXX: copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
2611int
2612soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
2613{
2614	struct mbuf *m0 = m;
2615	size_t valsize = 0;
2616
2617	if (sopt->sopt_val == NULL)
2618		return (0);
2619	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2620		if (sopt->sopt_td != NULL) {
2621			int error;
2622
2623			error = copyout(mtod(m, char *), sopt->sopt_val,
2624				       m->m_len);
2625			if (error != 0) {
2626				m_freem(m0);
2627				return (error);
2628			}
2629		} else
2630			bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
2631		sopt->sopt_valsize -= m->m_len;
2632		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2633		valsize += m->m_len;
2634		m = m->m_next;
2635	}
2636	if (m != NULL) {
2637		/* a large enough soopt buffer should be provided by user-land */
2638		m_freem(m0);
2639		return (EINVAL);
2640	}
2641	sopt->sopt_valsize = valsize;
2642	return (0);
2643}
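
/*
 * Example: these three helpers shuttle option data through an mbuf chain
 * for the older, pre-struct-sockopt entry points (notably IPv6).  A sketch
 * of the copyin direction: soopt_getm() sizes a chain to sopt_valsize,
 * then soopt_mcopyin() fills it from the caller's buffer.
 *
 *	struct mbuf *m = NULL;
 *
 *	error = soopt_getm(sopt, &m);
 *	if (error == 0)
 *		error = soopt_mcopyin(sopt, m);
 */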
2644
2645/*
2646 * sohasoutofband(): protocol notifies socket layer of the arrival of new
2647 * out-of-band data, which will then notify socket consumers.
2648 */
2649void
2650sohasoutofband(struct socket *so)
2651{
2652
2653	if (so->so_sigio != NULL)
2654		pgsigio(&so->so_sigio, SIGURG, 0);
2655	selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
2656}
2657
2658int
2659sopoll(struct socket *so, int events, struct ucred *active_cred,
2660    struct thread *td)
2661{
2662
2663	return (so->so_proto->pr_usrreqs->pru_sopoll(so, events, active_cred,
2664	    td));
2665}
2666
2667int
2668sopoll_generic(struct socket *so, int events, struct ucred *active_cred,
2669    struct thread *td)
2670{
2671	int revents = 0;
2672
2673	SOCKBUF_LOCK(&so->so_snd);
2674	SOCKBUF_LOCK(&so->so_rcv);
2675	if (events & (POLLIN | POLLRDNORM))
2676		if (soreadable(so))
2677			revents |= events & (POLLIN | POLLRDNORM);
2678
2679	if (events & POLLINIGNEOF)
2680		if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat ||
2681		    !TAILQ_EMPTY(&so->so_comp) || so->so_error)
2682			revents |= POLLINIGNEOF;
2683
2684	if (events & (POLLOUT | POLLWRNORM))
2685		if (sowriteable(so))
2686			revents |= events & (POLLOUT | POLLWRNORM);
2687
2688	if (events & (POLLPRI | POLLRDBAND))
2689		if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK))
2690			revents |= events & (POLLPRI | POLLRDBAND);
2691
2692	if (revents == 0) {
2693		if (events &
2694		    (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM |
2695		     POLLRDBAND)) {
2696			selrecord(td, &so->so_rcv.sb_sel);
2697			so->so_rcv.sb_flags |= SB_SEL;
2698		}
2699
2700		if (events & (POLLOUT | POLLWRNORM)) {
2701			selrecord(td, &so->so_snd.sb_sel);
2702			so->so_snd.sb_flags |= SB_SEL;
2703		}
2704	}
2705
2706	SOCKBUF_UNLOCK(&so->so_rcv);
2707	SOCKBUF_UNLOCK(&so->so_snd);
2708	return (revents);
2709}
2710
2711int
2712soo_kqfilter(struct file *fp, struct knote *kn)
2713{
2714	struct socket *so = kn->kn_fp->f_data;
2715	struct sockbuf *sb;
2716
2717	switch (kn->kn_filter) {
2718	case EVFILT_READ:
2719		if (so->so_options & SO_ACCEPTCONN)
2720			kn->kn_fop = &solisten_filtops;
2721		else
2722			kn->kn_fop = &soread_filtops;
2723		sb = &so->so_rcv;
2724		break;
2725	case EVFILT_WRITE:
2726		kn->kn_fop = &sowrite_filtops;
2727		sb = &so->so_snd;
2728		break;
2729	default:
2730		return (EINVAL);
2731	}
2732
2733	SOCKBUF_LOCK(sb);
2734	knlist_add(&sb->sb_sel.si_note, kn, 1);
2735	sb->sb_flags |= SB_KNOTE;
2736	SOCKBUF_UNLOCK(sb);
2737	return (0);
2738}
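
/*
 * Example: the filters installed above back a userland kevent()
 * registration such as the following sketch ('kq' is a kqueue() descriptor
 * and 'sockfd' a socket; error handling elided):
 *
 *	struct kevent kev;
 *
 *	EV_SET(&kev, sockfd, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 *
 * On a listening socket this selects solisten_filtops; otherwise
 * soread_filtops.
 */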
2739
2740/*
2741 * Some routines that return EOPNOTSUPP for entry points that are not
2742 * supported by a protocol.  Fill in as needed.
2743 */
2744int
2745pru_accept_notsupp(struct socket *so, struct sockaddr **nam)
2746{
2747
2748	return EOPNOTSUPP;
2749}
2750
2751int
2752pru_attach_notsupp(struct socket *so, int proto, struct thread *td)
2753{
2754
2755	return EOPNOTSUPP;
2756}
2757
2758int
2759pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
2760{
2761
2762	return EOPNOTSUPP;
2763}
2764
2765int
2766pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
2767{
2768
2769	return EOPNOTSUPP;
2770}
2771
2772int
2773pru_connect2_notsupp(struct socket *so1, struct socket *so2)
2774{
2775
2776	return EOPNOTSUPP;
2777}
2778
2779int
2780pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data,
2781    struct ifnet *ifp, struct thread *td)
2782{
2783
2784	return EOPNOTSUPP;
2785}
2786
2787int
2788pru_disconnect_notsupp(struct socket *so)
2789{
2790
2791	return EOPNOTSUPP;
2792}
2793
2794int
2795pru_listen_notsupp(struct socket *so, int backlog, struct thread *td)
2796{
2797
2798	return EOPNOTSUPP;
2799}
2800
2801int
2802pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam)
2803{
2804
2805	return EOPNOTSUPP;
2806}
2807
2808int
2809pru_rcvd_notsupp(struct socket *so, int flags)
2810{
2811
2812	return EOPNOTSUPP;
2813}
2814
2815int
2816pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags)
2817{
2818
2819	return EOPNOTSUPP;
2820}
2821
2822int
2823pru_send_notsupp(struct socket *so, int flags, struct mbuf *m,
2824    struct sockaddr *addr, struct mbuf *control, struct thread *td)
2825{
2826
2827	return EOPNOTSUPP;
2828}
2829
2830/*
2831 * This isn't really a ``null'' operation, but it's the default one and
2832 * doesn't do anything destructive.
2833 */
2834int
2835pru_sense_null(struct socket *so, struct stat *sb)
2836{
2837
2838	sb->st_blksize = so->so_snd.sb_hiwat;
2839	return 0;
2840}
2841
2842int
2843pru_shutdown_notsupp(struct socket *so)
2844{
2845
2846	return EOPNOTSUPP;
2847}
2848
2849int
2850pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam)
2851{
2852
2853	return EOPNOTSUPP;
2854}
2855
2856int
2857pru_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio,
2858    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
2859{
2860
2861	return EOPNOTSUPP;
2862}
2863
2864int
2865pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr,
2866    struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2867{
2868
2869	return EOPNOTSUPP;
2870}
2871
2872int
2873pru_sopoll_notsupp(struct socket *so, int events, struct ucred *cred,
2874    struct thread *td)
2875{
2876
2877	return EOPNOTSUPP;
2878}
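
/*
 * Example: a protocol that implements only some entry points can fill the
 * remainder of its switch table from these stubs; a sketch for a
 * hypothetical 'foo' protocol:
 *
 *	struct pr_usrreqs foo_usrreqs = {
 *		.pru_attach =		foo_attach,
 *		.pru_detach =		foo_detach,
 *		.pru_accept =		pru_accept_notsupp,
 *		.pru_connect2 =		pru_connect2_notsupp,
 *		.pru_sense =		pru_sense_null,
 *	};
 */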
2879
2880static void
2881filt_sordetach(struct knote *kn)
2882{
2883	struct socket *so = kn->kn_fp->f_data;
2884
2885	SOCKBUF_LOCK(&so->so_rcv);
2886	knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1);
2887	if (knlist_empty(&so->so_rcv.sb_sel.si_note))
2888		so->so_rcv.sb_flags &= ~SB_KNOTE;
2889	SOCKBUF_UNLOCK(&so->so_rcv);
2890}
2891
2892/*ARGSUSED*/
2893static int
2894filt_soread(struct knote *kn, long hint)
2895{
2896	struct socket *so;
2897
2898	so = kn->kn_fp->f_data;
2899	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2900
2901	kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
2902	if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
2903		kn->kn_flags |= EV_EOF;
2904		kn->kn_fflags = so->so_error;
2905		return (1);
2906	} else if (so->so_error)	/* temporary udp error */
2907		return (1);
2908	else if (kn->kn_sfflags & NOTE_LOWAT)
2909		return (kn->kn_data >= kn->kn_sdata);
2910	else
2911		return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat);
2912}
2913
2914static void
2915filt_sowdetach(struct knote *kn)
2916{
2917	struct socket *so = kn->kn_fp->f_data;
2918
2919	SOCKBUF_LOCK(&so->so_snd);
2920	knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1);
2921	if (knlist_empty(&so->so_snd.sb_sel.si_note))
2922		so->so_snd.sb_flags &= ~SB_KNOTE;
2923	SOCKBUF_UNLOCK(&so->so_snd);
2924}
2925
2926/*ARGSUSED*/
2927static int
2928filt_sowrite(struct knote *kn, long hint)
2929{
2930	struct socket *so;
2931
2932	so = kn->kn_fp->f_data;
2933	SOCKBUF_LOCK_ASSERT(&so->so_snd);
2934	kn->kn_data = sbspace(&so->so_snd);
2935	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
2936		kn->kn_flags |= EV_EOF;
2937		kn->kn_fflags = so->so_error;
2938		return (1);
2939	} else if (so->so_error)	/* temporary udp error */
2940		return (1);
2941	else if (((so->so_state & SS_ISCONNECTED) == 0) &&
2942	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
2943		return (0);
2944	else if (kn->kn_sfflags & NOTE_LOWAT)
2945		return (kn->kn_data >= kn->kn_sdata);
2946	else
2947		return (kn->kn_data >= so->so_snd.sb_lowat);
2948}
2949
2950/*ARGSUSED*/
2951static int
2952filt_solisten(struct knote *kn, long hint)
2953{
2954	struct socket *so = kn->kn_fp->f_data;
2955
2956	kn->kn_data = so->so_qlen;
2957	return (! TAILQ_EMPTY(&so->so_comp));
2958}
2959
2960int
2961socheckuid(struct socket *so, uid_t uid)
2962{
2963
2964	if (so == NULL)
2965		return (EPERM);
2966	if (so->so_cred->cr_uid != uid)
2967		return (EPERM);
2968	return (0);
2969}
2970
2971static int
2972sysctl_somaxconn(SYSCTL_HANDLER_ARGS)
2973{
2974	int error;
2975	int val;
2976
2977	val = somaxconn;
2978	error = sysctl_handle_int(oidp, &val, 0, req);
2979	if (error || !req->newptr)
2980		return (error);
2981
2982	if (val < 1 || val > USHRT_MAX)
2983		return (EINVAL);
2984
2985	somaxconn = val;
2986	return (0);
2987}
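
/*
 * Example: assuming this handler is attached as kern.ipc.somaxconn (as in
 * stock kernels), an administrator raises the listen queue ceiling with:
 *
 *	sysctl kern.ipc.somaxconn=1024
 *
 * Values outside [1, USHRT_MAX] are rejected with EINVAL above.
 */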
2988
2989/*
2990 * These functions are used by protocols to notify the socket layer (and its
2991 * consumers) of state changes in the sockets driven by protocol-side events.
2992 */
2993
2994/*
2995 * Procedures to manipulate state flags of socket and do appropriate wakeups.
2996 *
2997 * Normal sequence from the active (originating) side is that
2998 * soisconnecting() is called during processing of connect() call, resulting
2999 * in an eventual call to soisconnected() if/when the connection is
3000 * established.  When the connection is torn down soisdisconnecting() is
3001 * called during processing of disconnect() call, and soisdisconnected() is
3002 * called when the connection to the peer is totally severed.  The semantics
3003 * of these routines are such that connectionless protocols can call
3004 * soisconnected() and soisdisconnected() only, bypassing the in-progress
3005 * calls when setting up a ``connection'' takes no time.
3006 *
3007 * From the passive side, a socket is created with two queues of sockets:
3008 * so_incomp for connections in progress and so_comp for connections already
3009 * made and awaiting user acceptance.  As a protocol is preparing incoming
3010 * connections, it creates a socket structure queued on so_incomp by calling
3011 * sonewconn().  When the connection is established, soisconnected() is
3012 * called, and transfers the socket structure to so_comp, making it available
3013 * to accept().
3014 *
3015 * If a socket is closed with sockets on either so_incomp or so_comp, these
3016 * sockets are dropped.
3017 *
3018 * If higher-level protocols are implemented in the kernel, the wakeups done
3019 * here will sometimes cause software-interrupt process scheduling.
3020 */
3021void
3022soisconnecting(struct socket *so)
3023{
3024
3025	SOCK_LOCK(so);
3026	so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
3027	so->so_state |= SS_ISCONNECTING;
3028	SOCK_UNLOCK(so);
3029}
3030
3031void
3032soisconnected(struct socket *so)
3033{
3034	struct socket *head;
3035
3036	ACCEPT_LOCK();
3037	SOCK_LOCK(so);
3038	so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
3039	so->so_state |= SS_ISCONNECTED;
3040	head = so->so_head;
3041	if (head != NULL && (so->so_qstate & SQ_INCOMP)) {
3042		if ((so->so_options & SO_ACCEPTFILTER) == 0) {
3043			SOCK_UNLOCK(so);
3044			TAILQ_REMOVE(&head->so_incomp, so, so_list);
3045			head->so_incqlen--;
3046			so->so_qstate &= ~SQ_INCOMP;
3047			TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
3048			head->so_qlen++;
3049			so->so_qstate |= SQ_COMP;
3050			ACCEPT_UNLOCK();
3051			sorwakeup(head);
3052			wakeup_one(&head->so_timeo);
3053		} else {
3054			ACCEPT_UNLOCK();
3055			so->so_upcall =
3056			    head->so_accf->so_accept_filter->accf_callback;
3057			so->so_upcallarg = head->so_accf->so_accept_filter_arg;
3058			so->so_rcv.sb_flags |= SB_UPCALL;
3059			so->so_options &= ~SO_ACCEPTFILTER;
3060			SOCK_UNLOCK(so);
3061			so->so_upcall(so, so->so_upcallarg, M_DONTWAIT);
3062		}
3063		return;
3064	}
3065	SOCK_UNLOCK(so);
3066	ACCEPT_UNLOCK();
3067	wakeup(&so->so_timeo);
3068	sorwakeup(so);
3069	sowwakeup(so);
3070}
3071
3072void
3073soisdisconnecting(struct socket *so)
3074{
3075
3076	/*
3077	 * Note: This code assumes that SOCK_LOCK(so) and
3078	 * SOCKBUF_LOCK(&so->so_rcv) are the same.
3079	 */
3080	SOCKBUF_LOCK(&so->so_rcv);
3081	so->so_state &= ~SS_ISCONNECTING;
3082	so->so_state |= SS_ISDISCONNECTING;
3083	so->so_rcv.sb_state |= SBS_CANTRCVMORE;
3084	sorwakeup_locked(so);
3085	SOCKBUF_LOCK(&so->so_snd);
3086	so->so_snd.sb_state |= SBS_CANTSENDMORE;
3087	sowwakeup_locked(so);
3088	wakeup(&so->so_timeo);
3089}
3090
3091void
3092soisdisconnected(struct socket *so)
3093{
3094
3095	/*
3096	 * Note: This code assumes that SOCK_LOCK(so) and
3097	 * SOCKBUF_LOCK(&so->so_rcv) are the same.
3098	 */
3099	SOCKBUF_LOCK(&so->so_rcv);
3100	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
3101	so->so_state |= SS_ISDISCONNECTED;
3102	so->so_rcv.sb_state |= SBS_CANTRCVMORE;
3103	sorwakeup_locked(so);
3104	SOCKBUF_LOCK(&so->so_snd);
3105	so->so_snd.sb_state |= SBS_CANTSENDMORE;
3106	sbdrop_locked(&so->so_snd, so->so_snd.sb_cc);
3107	sowwakeup_locked(so);
3108	wakeup(&so->so_timeo);
3109}
3110
3111/*
3112 * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
3113 */
3114struct sockaddr *
3115sodupsockaddr(const struct sockaddr *sa, int mflags)
3116{
3117	struct sockaddr *sa2;
3118
3119	sa2 = malloc(sa->sa_len, M_SONAME, mflags);
3120	if (sa2)
3121		bcopy(sa, sa2, sa->sa_len);
3122	return sa2;
3123}
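
/*
 * Example: lock-holding callers pass M_NOWAIT and must tolerate a NULL
 * return; soreceive_dgram() above simply leaves *psa NULL when the
 * allocation fails:
 *
 *	if (psa != NULL)
 *		*psa = sodupsockaddr(mtod(m, struct sockaddr *), M_NOWAIT);
 */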
3124
3125/*
3126 * Create an external-format (``xsocket'') structure using the information in
3127 * the kernel-format socket structure pointed to by so.  This is done to
3128 * reduce the spew of irrelevant information over this interface, to isolate
3129 * user code from changes in the kernel structure, and potentially to provide
3130 * information-hiding if we decide that some of this information should be
3131 * hidden from users.
3132 */
3133void
3134sotoxsocket(struct socket *so, struct xsocket *xso)
3135{
3136
3137	xso->xso_len = sizeof *xso;
3138	xso->xso_so = so;
3139	xso->so_type = so->so_type;
3140	xso->so_options = so->so_options;
3141	xso->so_linger = so->so_linger;
3142	xso->so_state = so->so_state;
3143	xso->so_pcb = so->so_pcb;
3144	xso->xso_protocol = so->so_proto->pr_protocol;
3145	xso->xso_family = so->so_proto->pr_domain->dom_family;
3146	xso->so_qlen = so->so_qlen;
3147	xso->so_incqlen = so->so_incqlen;
3148	xso->so_qlimit = so->so_qlimit;
3149	xso->so_timeo = so->so_timeo;
3150	xso->so_error = so->so_error;
3151	xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0;
3152	xso->so_oobmark = so->so_oobmark;
3153	sbtoxsockbuf(&so->so_snd, &xso->so_snd);
3154	sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
3155	xso->so_uid = so->so_cred->cr_uid;
3156}
3157
3159/*
3160 * Socket accessor functions to provide external consumers with a safe
3161 * interface to socket state.
3162 */
3164
3165void
3166so_listeners_apply_all(struct socket *so,
    void (*func)(struct socket *, void *), void *arg)
3167{
3168
3169	TAILQ_FOREACH(so, &so->so_comp, so_list)
3170		func(so, arg);
3171}
3172
3173struct sockbuf *
3174so_sockbuf_rcv(struct socket *so)
3175{
3176
3177	return (&so->so_rcv);
3178}
3179
3180struct sockbuf *
3181so_sockbuf_snd(struct socket *so)
3182{
3183
3184	return (&so->so_snd);
3185}
3186
3187int
3188so_state_get(const struct socket *so)
3189{
3190
3191	return (so->so_state);
3192}
3193
3194void
3195so_state_set(struct socket *so, int val)
3196{
3197
3198	so->so_state = val;
3199}
3200
3201int
3202so_options_get(const struct socket *so)
3203{
3204
3205	return (so->so_options);
3206}
3207
3208void
3209so_options_set(struct socket *so, int val)
3210{
3211
3212	so->so_options = val;
3213}
3214
3215int
3216so_error_get(const struct socket *so)
3217{
3218
3219	return (so->so_error);
3220}
3221
3222void
3223so_error_set(struct socket *so, int val)
3224{
3225
3226	so->so_error = val;
3227}
3228
3229int
3230so_linger_get(const struct socket *so)
3231{
3232
3233	return (so->so_linger);
3234}
3235
3236void
3237so_linger_set(struct socket *so, int val)
3238{
3239
3240	so->so_linger = val;
3241}
3242
3243struct protosw *
3244so_protosw_get(const struct socket *so)
3245{
3246
3247	return (so->so_proto);
3248}
3249
3250void
3251so_protosw_set(struct socket *so, struct protosw *val)
3252{
3253
3254	so->so_proto = val;
3255}
3256
3257void
3258so_sorwakeup(struct socket *so)
3259{
3260
3261	sorwakeup(so);
3262}
3263
3264void
3265so_sowwakeup(struct socket *so)
3266{
3267
3268	sowwakeup(so);
3269}
3270
3271void
3272so_sorwakeup_locked(struct socket *so)
3273{
3274
3275	sorwakeup_locked(so);
3276}
3277
3278void
3279so_sowwakeup_locked(struct socket *so)
3280{
3281
3282	sowwakeup_locked(so);
3283}
3284
3285void
3286so_lock(struct socket *so)
3287{
3288	SOCK_LOCK(so);
3289}
3290
3291void
3292so_unlock(struct socket *so)
3293{
3294	SOCK_UNLOCK(so);
3295}
3296