/*-
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.
 * Copyright (c) 2004 The FreeBSD Foundation
 * Copyright (c) 2004-2006 Robert N. M. Watson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 */

/*
 * Comments on the socket life cycle:
 *
 * soalloc() sets up socket layer state for a socket, called only by
 * socreate() and sonewconn().  Socket layer private.
 *
 * sodealloc() tears down socket layer state for a socket, called only by
 * socreate(), sonewconn(), and sofree().  Socket layer private.
 *
 * pru_attach() associates protocol layer state with an allocated socket;
 * called only once, may fail, aborting socket allocation.  This is called
 * from socreate() and sonewconn().  Socket layer private.
 *
 * pru_detach() disassociates protocol layer state from an attached socket,
 * and will be called exactly once for sockets on which pru_attach() has
 * been called successfully.  If pru_attach() returned an error,
 * pru_detach() will not be called.  Socket layer private.
 *
 * pru_abort() and pru_close() notify the protocol layer that the last
 * consumer of a socket is starting to tear down the socket, and that the
 * protocol should terminate the connection.  Historically, pru_abort() also
 * detached protocol state from the socket state, but this is no longer the
 * case.
 *
 * socreate() creates a socket and attaches protocol state.  This is a public
 * interface that may be used by socket layer consumers to create new
 * sockets.
 *
 * sonewconn() creates a socket and attaches protocol state.  This is a
 * public interface that may be used by protocols to create new sockets when
 * a new connection is received and will be available for accept() on a
 * listen socket.
 *
 * soclose() destroys a socket after possibly waiting for it to disconnect.
 * This is a public interface that socket consumers should use to close and
 * release a socket when done with it.
 *
 * soabort() destroys a socket without waiting for it to disconnect (used
 * only for incoming connections that are already partially or fully
 * connected).  This is used internally by the socket layer when clearing
 * listen socket queues (due to overflow or close on the listen socket), but
 * is also a public interface protocols may use to abort connections in
 * their incomplete listen queues should they no longer be required.  Sockets
 * placed in completed connection listen queues should not be aborted for
 * reasons described in the comment above the soclose() implementation.  This
 * is not a general purpose close routine, and except in the specific
 * circumstances described here, should not be used.
 *
 * sofree() will free a socket and its protocol state if all references on
 * the socket have been released, and is the interface used to attempt to
 * free a socket when a reference is removed.  This is a socket layer private
 * interface.
 *
 * NOTE: In addition to socreate() and soclose(), which provide a single
 * socket reference to the consumer to be managed as required, there are two
 * calls to explicitly manage socket references, soref() and sorele().
 * Currently, these are generally required only when transitioning a socket
 * from a listen queue to a file descriptor, in order to prevent garbage
 * collection of the socket at an untimely moment.  For a number of reasons,
 * these interfaces are not preferred, and should be avoided.
 */
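
/*
 * As an illustration of the life cycle above, a kernel consumer might
 * drive a socket roughly as follows.  This is a hedged sketch rather than
 * code from this file; error handling is abbreviated, and "td" is assumed
 * to be the calling thread:
 *
 *	struct socket *so;
 *	int error;
 *
 *	error = socreate(AF_INET, &so, SOCK_STREAM, IPPROTO_TCP,
 *	    td->td_ucred, td);
 *	if (error)
 *		return (error);
 *	... use the socket via sosend()/soreceive() ...
 *	error = soclose(so);
 *
 * socreate() returns the single reference that soclose() later consumes;
 * any additional references taken with soref() must be released with
 * sorele() before the socket can be freed.
 */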

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/uipc_socket.c 166171 2007-01-22 14:50:28Z andre $");

#include "opt_inet.h"
#include "opt_mac.h"
#include "opt_zero.h"
#include "opt_compat.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mac.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/domain.h>
#include <sys/file.h>			/* for struct knote */
#include <sys/kernel.h>
#include <sys/event.h>
#include <sys/eventhandler.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <sys/jail.h>

#include <security/mac/mac_framework.h>

#include <vm/uma.h>

#ifdef COMPAT_IA32
#include <sys/mount.h>
#include <compat/freebsd32/freebsd32.h>

extern struct sysentvec ia32_freebsd_sysvec;
#endif

static int	soreceive_rcvoob(struct socket *so, struct uio *uio,
		    int flags);

static void	filt_sordetach(struct knote *kn);
static int	filt_soread(struct knote *kn, long hint);
static void	filt_sowdetach(struct knote *kn);
static int	filt_sowrite(struct knote *kn, long hint);
static int	filt_solisten(struct knote *kn, long hint);

static struct filterops solisten_filtops =
	{ 1, NULL, filt_sordetach, filt_solisten };
static struct filterops soread_filtops =
	{ 1, NULL, filt_sordetach, filt_soread };
static struct filterops sowrite_filtops =
	{ 1, NULL, filt_sowdetach, filt_sowrite };

uma_zone_t socket_zone;
so_gen_t	so_gencnt;	/* generation count for sockets */

int	maxsockets;

MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

static int somaxconn = SOMAXCONN;
static int somaxconn_sysctl(SYSCTL_HANDLER_ARGS);
/* XXX: we don't have SYSCTL_USHORT */
SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLTYPE_UINT | CTLFLAG_RW,
    0, sizeof(int), somaxconn_sysctl, "I", "Maximum pending socket connection "
    "queue size");
static int numopensockets;
SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
    &numopensockets, 0, "Number of open sockets");
#ifdef ZERO_COPY_SOCKETS
/* These aren't static because they're used in other files. */
int so_zero_copy_send = 1;
int so_zero_copy_receive = 1;
SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0,
    "Zero copy controls");
SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW,
    &so_zero_copy_receive, 0, "Enable zero copy receive");
SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW,
    &so_zero_copy_send, 0, "Enable zero copy send");
#endif /* ZERO_COPY_SOCKETS */

/*
 * accept_mtx locks down per-socket fields relating to accept queues.  See
 * socketvar.h for an annotation of the protected fields of struct socket.
 */
struct mtx accept_mtx;
MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);

/*
 * so_global_mtx protects the global so_gencnt and numopensockets variables,
 * as well as each socket's so_gencnt field.
 */
static struct mtx so_global_mtx;
MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF);

/*
 * General IPC sysctl name space, used by sockets and a variety of other IPC
 * types.
 */
SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");

/*
 * Sysctl to get and set the maximum global sockets limit.  Notify protocols
 * of the change so that they can update their dependent limits as required.
 */
static int
sysctl_maxsockets(SYSCTL_HANDLER_ARGS)
{
	int error, newmaxsockets;

	newmaxsockets = maxsockets;
	error = sysctl_handle_int(oidp, &newmaxsockets, sizeof(int), req);
	if (error == 0 && req->newptr) {
		if (newmaxsockets > maxsockets) {
			maxsockets = newmaxsockets;
			if (maxsockets > ((maxfiles / 4) * 3)) {
				maxfiles = (maxsockets * 5) / 4;
				maxfilesperproc = (maxfiles * 9) / 10;
			}
			EVENTHANDLER_INVOKE(maxsockets_change);
		} else
			error = EINVAL;
	}
	return (error);
}

SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT|CTLFLAG_RW,
    &maxsockets, 0, sysctl_maxsockets, "IU",
    "Maximum number of sockets available");
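
/*
 * The limit can be raised while the system is running; for example (a
 * hypothetical invocation from userland, with a site-specific value):
 *
 *	# sysctl kern.ipc.maxsockets=65536
 *
 * The handler above only allows the limit to grow, and scales maxfiles
 * and maxfilesperproc up when sockets would otherwise crowd out file
 * descriptors.
 */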

/*
 * Initialise maxsockets.
 */
static void
init_maxsockets(void *ignored)
{
	TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
	maxsockets = imax(maxsockets, imax(maxfiles, nmbclusters));
}
SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);

/*
 * Socket operation routines.  These routines are called by the routines in
 * sys_socket.c or from a system process, and implement the semantics of
 * socket operations by switching out to the protocol specific routines.
 */

/*
 * Get a socket structure from our zone, and initialize it.  Note that it
 * would probably be better to allocate socket and PCB at the same time, but
 * I'm not convinced that all the protocols can be easily modified to do
 * this.
 *
 * soalloc() returns a socket with a ref count of 0.
 */
static struct socket *
soalloc(int mflags)
{
	struct socket *so;

	so = uma_zalloc(socket_zone, mflags | M_ZERO);
	if (so == NULL)
		return (NULL);
#ifdef MAC
	if (mac_init_socket(so, mflags) != 0) {
		uma_zfree(socket_zone, so);
		return (NULL);
	}
#endif
	SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
	SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
	TAILQ_INIT(&so->so_aiojobq);
	mtx_lock(&so_global_mtx);
	so->so_gencnt = ++so_gencnt;
	++numopensockets;
	mtx_unlock(&so_global_mtx);
	return (so);
}

/*
 * Free the storage associated with a socket at the socket layer, tear down
 * locks, labels, etc.  All protocol state is assumed already to have been
 * torn down (and possibly never set up) by the caller.
 */
static void
sodealloc(struct socket *so)
{

	KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
	KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL"));

	mtx_lock(&so_global_mtx);
	so->so_gencnt = ++so_gencnt;
	--numopensockets;	/* Could be below, but faster here. */
	mtx_unlock(&so_global_mtx);
	if (so->so_rcv.sb_hiwat)
		(void)chgsbsize(so->so_cred->cr_uidinfo,
		    &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
	if (so->so_snd.sb_hiwat)
		(void)chgsbsize(so->so_cred->cr_uidinfo,
		    &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
#ifdef INET
	/* Remove accept filter if one is present. */
	if (so->so_accf != NULL)
		do_setopt_accept_filter(so, NULL);
#endif
#ifdef MAC
	mac_destroy_socket(so);
#endif
	crfree(so->so_cred);
	SOCKBUF_LOCK_DESTROY(&so->so_snd);
	SOCKBUF_LOCK_DESTROY(&so->so_rcv);
	uma_zfree(socket_zone, so);
}

/*
 * socreate returns a socket with a ref count of 1.  The socket should be
 * closed with soclose().
 */
int
socreate(dom, aso, type, proto, cred, td)
	int dom;
	struct socket **aso;
	int type;
	int proto;
	struct ucred *cred;
	struct thread *td;
{
	struct protosw *prp;
	struct socket *so;
	int error;

	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);

	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL ||
	    prp->pr_usrreqs->pru_attach == pru_attach_notsupp)
		return (EPROTONOSUPPORT);

	if (jailed(cred) && jail_socket_unixiproute_only &&
	    prp->pr_domain->dom_family != PF_LOCAL &&
	    prp->pr_domain->dom_family != PF_INET &&
	    prp->pr_domain->dom_family != PF_ROUTE) {
		return (EPROTONOSUPPORT);
	}

	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = soalloc(M_WAITOK);
	if (so == NULL)
		return (ENOBUFS);

	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = type;
	so->so_cred = crhold(cred);
	so->so_proto = prp;
#ifdef MAC
	mac_create_socket(cred, so);
#endif
	knlist_init(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv),
	    NULL, NULL, NULL);
	knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd),
	    NULL, NULL, NULL);
	so->so_count = 1;
	error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
	if (error) {
		KASSERT(so->so_count == 1, ("socreate: so_count %d",
		    so->so_count));
		so->so_count = 0;
		sodealloc(so);
		return (error);
	}
	*aso = so;
	return (0);
}

#ifdef REGRESSION
static int regression_sonewconn_earlytest = 1;
SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW,
    &regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test");
#endif

/*
 * When an attempt at a new connection is noted on a socket which accepts
 * connections, sonewconn is called.  If the connection is possible (subject
 * to space constraints, etc.) then we allocate a new structure, properly
 * linked into the data structure of the original socket, and return it.
 * Connstatus may be 0, SS_ISCONFIRMING, or SS_ISCONNECTED.
 *
 * Note: the ref count on the socket is 0 on return.
 */
struct socket *
sonewconn(head, connstatus)
	register struct socket *head;
	int connstatus;
{
	register struct socket *so;
	int over;

	ACCEPT_LOCK();
	over = (head->so_qlen > 3 * head->so_qlimit / 2);
	ACCEPT_UNLOCK();
#ifdef REGRESSION
	if (regression_sonewconn_earlytest && over)
#else
	if (over)
#endif
		return (NULL);
	so = soalloc(M_NOWAIT);
	if (so == NULL)
		return (NULL);
	if ((head->so_options & SO_ACCEPTFILTER) != 0)
		connstatus = 0;
	so->so_head = head;
	so->so_type = head->so_type;
	so->so_options = head->so_options &~ SO_ACCEPTCONN;
	so->so_linger = head->so_linger;
	so->so_state = head->so_state | SS_NOFDREF;
	so->so_proto = head->so_proto;
	so->so_cred = crhold(head->so_cred);
#ifdef MAC
	SOCK_LOCK(head);
	mac_create_socket_from_socket(head, so);
	SOCK_UNLOCK(head);
#endif
	knlist_init(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv),
	    NULL, NULL, NULL);
	knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd),
	    NULL, NULL, NULL);
	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat) ||
	    (*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
		sodealloc(so);
		return (NULL);
	}
	so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
	so->so_snd.sb_lowat = head->so_snd.sb_lowat;
	so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
	so->so_snd.sb_timeo = head->so_snd.sb_timeo;
	so->so_state |= connstatus;
	ACCEPT_LOCK();
	if (connstatus) {
		TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
		so->so_qstate |= SQ_COMP;
		head->so_qlen++;
	} else {
		/*
		 * Keep removing sockets from the head until there's room for
		 * us to insert on the tail.  In pre-locking revisions, this
		 * was a simple if(), but as we could be racing with other
		 * threads and soabort() requires dropping locks, we must
		 * loop waiting for the condition to be true.
		 */
		while (head->so_incqlen > head->so_qlimit) {
			struct socket *sp;
			sp = TAILQ_FIRST(&head->so_incomp);
			TAILQ_REMOVE(&head->so_incomp, sp, so_list);
			head->so_incqlen--;
			sp->so_qstate &= ~SQ_INCOMP;
			sp->so_head = NULL;
			ACCEPT_UNLOCK();
			soabort(sp);
			ACCEPT_LOCK();
		}
		TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
		so->so_qstate |= SQ_INCOMP;
		head->so_incqlen++;
	}
	ACCEPT_UNLOCK();
	if (connstatus) {
		sorwakeup(head);
		wakeup_one(&head->so_timeo);
	}
	return (so);
}
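
/*
 * Protocols typically call sonewconn() from their input path when a
 * connection request arrives for a listening socket.  A hedged sketch of
 * the convention (not code from this file; "head" is the listening socket
 * located by the protocol's own lookup):
 *
 *	struct socket *so;
 *
 *	so = sonewconn(head, 0);
 *	if (so == NULL)
 *		return;		(queue is full, drop the request)
 *
 * Passing a connstatus of 0 places the new socket on the incomplete queue;
 * once the protocol handshake completes, soisconnected() moves it to the
 * completed queue, where accept() can find it.
 */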

int
sobind(so, nam, td)
	struct socket *so;
	struct sockaddr *nam;
	struct thread *td;
{

	return ((*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td));
}

/*
 * solisten() transitions a socket from a non-listening state to a listening
 * state, but can also be used to update the listen queue depth on an
 * existing listen socket.  The protocol will call back into the sockets
 * layer using solisten_proto_check() and solisten_proto() to check and set
 * socket-layer listen state.  Callbacks are used so that the protocol can
 * acquire both protocol and socket layer locks in whatever order is required
 * by the protocol.
 *
 * Protocol implementors are advised to hold the socket lock across the
 * socket-layer test and set to avoid races at the socket layer.
 */
int
solisten(so, backlog, td)
	struct socket *so;
	int backlog;
	struct thread *td;
{

	return ((*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td));
}

int
solisten_proto_check(so)
	struct socket *so;
{

	SOCK_LOCK_ASSERT(so);

	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
	    SS_ISDISCONNECTING))
		return (EINVAL);
	return (0);
}

void
solisten_proto(so, backlog)
	struct socket *so;
	int backlog;
{

	SOCK_LOCK_ASSERT(so);

	if (backlog < 0 || backlog > somaxconn)
		backlog = somaxconn;
	so->so_qlimit = backlog;
	so->so_options |= SO_ACCEPTCONN;
}
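
/*
 * A protocol's pru_listen method is expected to bracket these two
 * callbacks with its own locking.  A hedged sketch of the pattern,
 * loosely modeled on the TCP implementation ("inp" and its lock stand in
 * for whatever protocol state applies):
 *
 *	INP_LOCK(inp);
 *	SOCK_LOCK(so);
 *	error = solisten_proto_check(so);
 *	if (error == 0)
 *		solisten_proto(so, backlog);
 *	SOCK_UNLOCK(so);
 *	INP_UNLOCK(inp);
 *
 * Holding the socket lock across both calls closes the race between the
 * state test and setting SO_ACCEPTCONN.
 */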

/*
 * Attempt to free a socket.  This should really be sotryfree().
 *
 * sofree() will succeed if:
 *
 * - There are no outstanding file descriptor references or related consumers
 *   (so_count == 0).
 *
 * - The socket has been closed by user space, if ever open (SS_NOFDREF).
 *
 * - The protocol does not have an outstanding strong reference on the socket
 *   (SS_PROTOREF).
 *
 * - The socket is not in a completed connection queue, where a process has
 *   been notified that it is present.  If it were removed, the user process
 *   might block in accept() despite select() saying the socket was ready.
 *
 * Otherwise, it will quietly abort so that a future call to sofree(), when
 * conditions are right, can succeed.
 */
void
sofree(so)
	struct socket *so;
{
	struct protosw *pr = so->so_proto;
	struct socket *head;

	ACCEPT_LOCK_ASSERT();
	SOCK_LOCK_ASSERT(so);

	if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 ||
	    (so->so_state & SS_PROTOREF) || (so->so_qstate & SQ_COMP)) {
		SOCK_UNLOCK(so);
		ACCEPT_UNLOCK();
		return;
	}

	head = so->so_head;
	if (head != NULL) {
		KASSERT((so->so_qstate & SQ_COMP) != 0 ||
		    (so->so_qstate & SQ_INCOMP) != 0,
		    ("sofree: so_head != NULL, but neither SQ_COMP nor "
		    "SQ_INCOMP"));
		KASSERT((so->so_qstate & SQ_COMP) == 0 ||
		    (so->so_qstate & SQ_INCOMP) == 0,
		    ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
		TAILQ_REMOVE(&head->so_incomp, so, so_list);
		head->so_incqlen--;
		so->so_qstate &= ~SQ_INCOMP;
		so->so_head = NULL;
	}
	KASSERT((so->so_qstate & SQ_COMP) == 0 &&
	    (so->so_qstate & SQ_INCOMP) == 0,
	    ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
	    so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
	if (so->so_options & SO_ACCEPTCONN) {
		KASSERT((TAILQ_EMPTY(&so->so_comp)), ("sofree: so_comp populated"));
		KASSERT((TAILQ_EMPTY(&so->so_incomp)), ("sofree: so_incomp populated"));
	}
	SOCK_UNLOCK(so);
	ACCEPT_UNLOCK();

	/*
	 * From this point on, we assume that no other references to this
	 * socket exist anywhere else in the stack.  Therefore, no locks need
	 * to be acquired or held.
	 *
	 * We used to do a lot of socket buffer and socket locking here, as
	 * well as invoke sorflush() and perform wakeups.  The direct call to
	 * dom_dispose() and sbrelease_internal() are an inlining of what was
	 * necessary from sorflush().
	 *
	 * Notice that the socket buffer and kqueue state are torn down
	 * before calling pru_detach.  This means that protocols should not
	 * assume they can perform socket wakeups, etc., in their detach
	 * code.
	 */
	KASSERT((so->so_snd.sb_flags & SB_LOCK) == 0, ("sofree: snd sblock"));
	KASSERT((so->so_rcv.sb_flags & SB_LOCK) == 0, ("sofree: rcv sblock"));
	sbdestroy(&so->so_snd, so);
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
		(*pr->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
	sbdestroy(&so->so_rcv, so);
	if (pr->pr_usrreqs->pru_detach != NULL)
		(*pr->pr_usrreqs->pru_detach)(so);
	knlist_destroy(&so->so_rcv.sb_sel.si_note);
	knlist_destroy(&so->so_snd.sb_sel.si_note);
	sodealloc(so);
}

/*
 * Close a socket on last file table reference removal.  Initiate disconnect
 * if connected.  Free socket when disconnect complete.
 *
 * This function will sorele() the socket.  Note that soclose() may be called
 * prior to the ref count reaching zero.  The actual socket structure will
 * not be freed until the ref count reaches zero.
 */
int
soclose(so)
	struct socket *so;
{
	int error = 0;

	KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));

	funsetown(&so->so_sigio);
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO))
				goto drop;
			while (so->so_state & SS_ISCONNECTED) {
				error = tsleep(&so->so_timeo,
				    PSOCK | PCATCH, "soclos", so->so_linger * hz);
				if (error)
					break;
			}
		}
	}

drop:
	if (so->so_proto->pr_usrreqs->pru_close != NULL)
		(*so->so_proto->pr_usrreqs->pru_close)(so);
	if (so->so_options & SO_ACCEPTCONN) {
		struct socket *sp;
		ACCEPT_LOCK();
		while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
			TAILQ_REMOVE(&so->so_incomp, sp, so_list);
			so->so_incqlen--;
			sp->so_qstate &= ~SQ_INCOMP;
			sp->so_head = NULL;
			ACCEPT_UNLOCK();
			soabort(sp);
			ACCEPT_LOCK();
		}
		while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
			TAILQ_REMOVE(&so->so_comp, sp, so_list);
			so->so_qlen--;
			sp->so_qstate &= ~SQ_COMP;
			sp->so_head = NULL;
			ACCEPT_UNLOCK();
			soabort(sp);
			ACCEPT_LOCK();
		}
		ACCEPT_UNLOCK();
	}
	ACCEPT_LOCK();
	SOCK_LOCK(so);
	KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
	so->so_state |= SS_NOFDREF;
	sorele(so);
	return (error);
}

/*
 * soabort() is used to abruptly tear down a connection, such as when a
 * resource limit is reached (listen queue depth exceeded), or if a listen
 * socket is closed while there are sockets waiting to be accepted.
 *
 * This interface is tricky, because it is called on an unreferenced socket,
 * and must be called only by a thread that has actually removed the socket
 * from the listen queue it was on, or races with other threads are risked.
 *
 * This interface will call into the protocol code, so must not be called
 * with any socket locks held.  Protocols do call it while holding their own
 * recursible protocol mutexes, but this is something that should be subject
 * to review in the future.
 */
void
soabort(so)
	struct socket *so;
{

	/*
	 * As far as is possible, assert that no references to this socket
	 * are held.  This is not quite the same as asserting that the
	 * current thread is responsible for arranging for no references, but
	 * is as close as we can get for now.
	 */
	KASSERT(so->so_count == 0, ("soabort: so_count"));
	KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF"));
	KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF"));
	KASSERT((so->so_qstate & SQ_COMP) == 0, ("soabort: SQ_COMP"));
	KASSERT((so->so_qstate & SQ_INCOMP) == 0, ("soabort: SQ_INCOMP"));

	if (so->so_proto->pr_usrreqs->pru_abort != NULL)
		(*so->so_proto->pr_usrreqs->pru_abort)(so);
	ACCEPT_LOCK();
	SOCK_LOCK(so);
	sofree(so);
}

int
soaccept(so, nam)
	struct socket *so;
	struct sockaddr **nam;
{
	int error;

	SOCK_LOCK(so);
	KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
	so->so_state &= ~SS_NOFDREF;
	SOCK_UNLOCK(so);
	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
	return (error);
}

int
soconnect(so, nam, td)
	struct socket *so;
	struct sockaddr *nam;
	struct thread *td;
{
	int error;

	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	/*
	 * If the protocol is connection-based, we can only connect once.
	 * Otherwise, if connected, try to disconnect first.  This allows
	 * the user to disconnect by connecting to, e.g., a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnect(so)))) {
		error = EISCONN;
	} else {
		/*
		 * Prevent accumulated error from previous connection from
		 * biting us.
		 */
		so->so_error = 0;
		error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td);
	}

	return (error);
}

int
soconnect2(so1, so2)
	struct socket *so1;
	struct socket *so2;
{

	return ((*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2));
}

int
sodisconnect(so)
	struct socket *so;
{
	int error;

	if ((so->so_state & SS_ISCONNECTED) == 0)
		return (ENOTCONN);
	if (so->so_state & SS_ISDISCONNECTING)
		return (EALREADY);
	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
	return (error);
}

#ifdef ZERO_COPY_SOCKETS
struct so_zerocopy_stats {
	int size_ok;
	int align_ok;
	int found_ifp;
};
struct so_zerocopy_stats so_zerocp_stats = {0,0,0};
#include <netinet/in.h>
#include <net/route.h>
#include <netinet/in_pcb.h>
#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>

/*
 * sosend_copyin() is only used if zero copy sockets are enabled.  Otherwise
 * sosend_dgram() and sosend_generic() use m_uiotombuf().
 *
 * sosend_copyin() accepts a uio and prepares an mbuf chain holding part or
 * all of the data referenced by the uio.  If desired, it uses zero-copy.
 * *space will be updated to reflect data copied in.
 *
 * NB: If atomic I/O is requested, the caller must already have checked that
 * space can hold resid bytes.
 *
 * NB: In the event of an error, the caller may need to free the partial
 * chain pointed to by *retmp.  The contents of both *uio and *space may be
 * modified even in the case of an error.
 */
static int
sosend_copyin(struct uio *uio, struct mbuf **retmp, int atomic, long *space,
    int flags)
{
	struct mbuf *m, **mp, *top;
	long len, resid;
	int error;
#ifdef ZERO_COPY_SOCKETS
	int cow_send;
#endif

	*retmp = top = NULL;
	mp = &top;
	len = 0;
	resid = uio->uio_resid;
	error = 0;
	do {
#ifdef ZERO_COPY_SOCKETS
		cow_send = 0;
#endif /* ZERO_COPY_SOCKETS */
		if (resid >= MINCLSIZE) {
#ifdef ZERO_COPY_SOCKETS
			if (top == NULL) {
				MGETHDR(m, M_TRYWAIT, MT_DATA);
				if (m == NULL) {
					error = ENOBUFS;
					goto out;
				}
				m->m_pkthdr.len = 0;
				m->m_pkthdr.rcvif = NULL;
			} else {
				MGET(m, M_TRYWAIT, MT_DATA);
				if (m == NULL) {
					error = ENOBUFS;
					goto out;
				}
			}
			if (so_zero_copy_send &&
			    resid >= PAGE_SIZE &&
			    *space >= PAGE_SIZE &&
			    uio->uio_iov->iov_len >= PAGE_SIZE) {
				so_zerocp_stats.size_ok++;
				so_zerocp_stats.align_ok++;
				cow_send = socow_setup(m, uio);
				len = cow_send;
			}
			if (!cow_send) {
				MCLGET(m, M_TRYWAIT);
				if ((m->m_flags & M_EXT) == 0) {
					m_free(m);
					m = NULL;
				} else {
					len = min(min(MCLBYTES, resid),
					    *space);
				}
			}
#else /* ZERO_COPY_SOCKETS */
			if (top == NULL) {
				m = m_getcl(M_TRYWAIT, MT_DATA, M_PKTHDR);
				m->m_pkthdr.len = 0;
				m->m_pkthdr.rcvif = NULL;
			} else
				m = m_getcl(M_TRYWAIT, MT_DATA, 0);
			len = min(min(MCLBYTES, resid), *space);
#endif /* ZERO_COPY_SOCKETS */
		} else {
			if (top == NULL) {
				m = m_gethdr(M_TRYWAIT, MT_DATA);
				m->m_pkthdr.len = 0;
				m->m_pkthdr.rcvif = NULL;

				len = min(min(MHLEN, resid), *space);
				/*
				 * For datagram protocols, leave room
				 * for protocol headers in first mbuf.
				 */
				if (atomic && m && len < MHLEN)
					MH_ALIGN(m, len);
			} else {
				m = m_get(M_TRYWAIT, MT_DATA);
				len = min(min(MLEN, resid), *space);
			}
		}
		if (m == NULL) {
			error = ENOBUFS;
			goto out;
		}

		*space -= len;
#ifdef ZERO_COPY_SOCKETS
		if (cow_send)
			error = 0;
		else
#endif /* ZERO_COPY_SOCKETS */
		error = uiomove(mtod(m, void *), (int)len, uio);
		resid = uio->uio_resid;
		m->m_len = len;
		*mp = m;
		top->m_pkthdr.len += len;
		if (error)
			goto out;
		mp = &m->m_next;
		if (resid <= 0) {
			if (flags & MSG_EOR)
				top->m_flags |= M_EOR;
			break;
		}
	} while (*space > 0 && atomic);
out:
	*retmp = top;
	return (error);
}
#endif /* ZERO_COPY_SOCKETS */

#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)

int
sosend_dgram(so, addr, uio, top, control, flags, td)
	struct socket *so;
	struct sockaddr *addr;
	struct uio *uio;
	struct mbuf *top;
	struct mbuf *control;
	int flags;
	struct thread *td;
{
	long space, resid;
	int clen = 0, error, dontroute;
#ifdef ZERO_COPY_SOCKETS
	int atomic = sosendallatonce(so) || top;
#endif

	KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM"));
	KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
	    ("sosend_dgram: !PR_ATOMIC"));

	if (uio != NULL)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned.  However, space must be
	 * signed, as it might be less than 0 if we over-committed, and we
	 * must use a signed comparison of space and resid.  On the other
	 * hand, a negative resid causes us to loop sending 0-length
	 * segments to the protocol.
	 */
	if (resid < 0) {
		error = EINVAL;
		goto out;
	}

	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0;
	if (td != NULL)
		td->td_proc->p_stats->p_ru.ru_msgsnd++;
	if (control != NULL)
		clen = control->m_len;

	SOCKBUF_LOCK(&so->so_snd);
	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
		SOCKBUF_UNLOCK(&so->so_snd);
		error = EPIPE;
		goto out;
	}
	if (so->so_error) {
		error = so->so_error;
		so->so_error = 0;
		SOCKBUF_UNLOCK(&so->so_snd);
		goto out;
	}
	if ((so->so_state & SS_ISCONNECTED) == 0) {
		/*
		 * `sendto' and `sendmsg' are allowed on a connection-based
		 * socket if it supports implied connect.  Return ENOTCONN if
		 * not connected and no address is supplied.
		 */
		if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
		    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
			if ((so->so_state & SS_ISCONFIRMING) == 0 &&
			    !(resid == 0 && clen != 0)) {
				SOCKBUF_UNLOCK(&so->so_snd);
				error = ENOTCONN;
				goto out;
			}
		} else if (addr == NULL) {
			if (so->so_proto->pr_flags & PR_CONNREQUIRED)
				error = ENOTCONN;
			else
				error = EDESTADDRREQ;
			SOCKBUF_UNLOCK(&so->so_snd);
			goto out;
		}
	}

	/*
	 * Do we need MSG_OOB support in SOCK_DGRAM?  Signs here may be a
	 * problem and need fixing.
	 */
	space = sbspace(&so->so_snd);
	if (flags & MSG_OOB)
		space += 1024;
	space -= clen;
	SOCKBUF_UNLOCK(&so->so_snd);
	if (resid > space) {
		error = EMSGSIZE;
		goto out;
	}
	if (uio == NULL) {
		resid = 0;
		if (flags & MSG_EOR)
			top->m_flags |= M_EOR;
	} else {
#ifdef ZERO_COPY_SOCKETS
		error = sosend_copyin(uio, &top, atomic, &space, flags);
		if (error)
			goto out;
#else
		/*
		 * Copy the data from userland into a mbuf chain.
		 * If no data is to be copied in, a single empty mbuf
		 * is returned.
		 */
		top = m_uiotombuf(uio, M_WAITOK, space, max_hdr,
		    (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0)));
		if (top == NULL) {
			error = EFAULT;	/* only possible error */
			goto out;
		}
		space -= resid - uio->uio_resid;
#endif
		resid = uio->uio_resid;
	}
	KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
	/*
	 * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock
	 * than with.
	 */
	if (dontroute) {
		SOCK_LOCK(so);
		so->so_options |= SO_DONTROUTE;
		SOCK_UNLOCK(so);
	}
	/*
	 * XXX all the SBS_CANTSENDMORE checks previously done could be out
	 * of date.  We could have received a reset packet in an interrupt or
	 * maybe we slept while doing page faults in uiomove() etc.  We could
	 * probably recheck again inside the locking protection here, but
	 * there are probably other places that this also happens.  We must
	 * rethink this.
	 */
	error = (*so->so_proto->pr_usrreqs->pru_send)(so,
	    (flags & MSG_OOB) ? PRUS_OOB :
	/*
	 * If the user set MSG_EOF, the protocol understands this flag, and
	 * there is nothing left to send, then use PRU_SEND_EOF instead of
	 * PRU_SEND.
	 */
	    ((flags & MSG_EOF) &&
	     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
	     (resid <= 0)) ?
		PRUS_EOF :
		/* If there is more to send, set PRUS_MORETOCOME. */
		(resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
		top, addr, control, td);
	if (dontroute) {
		SOCK_LOCK(so);
		so->so_options &= ~SO_DONTROUTE;
		SOCK_UNLOCK(so);
	}
	clen = 0;
	control = NULL;
	top = NULL;
out:
	if (top != NULL)
		m_freem(top);
	if (control != NULL)
		m_freem(control);
	return (error);
}

/*
 * Send on a socket.  If send must go all at once and message is larger than
 * send buffering, then hard error.  Lock against other senders.  If must go
 * all at once and not enough room now, then inform user that this would
 * block and do nothing.  Otherwise, if nonblocking, send as much as
 * possible.  The data to be sent is described by "uio" if nonzero, otherwise
 * by the mbuf chain "top" (which must be null if uio is not).  Data provided
 * in mbuf chain must be small enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers must check for short
 * counts if EINTR/ERESTART are returned.  Data and control buffers are freed
 * on return.
 */
#define	snderr(errno)	{ error = (errno); goto release; }
int
sosend_generic(so, addr, uio, top, control, flags, td)
	struct socket *so;
	struct sockaddr *addr;
	struct uio *uio;
	struct mbuf *top;
	struct mbuf *control;
	int flags;
	struct thread *td;
{
	long space, resid;
	int clen = 0, error, dontroute;
	int atomic = sosendallatonce(so) || top;

	if (uio != NULL)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned.  However, space must be
	 * signed, as it might be less than 0 if we over-committed, and we
	 * must use a signed comparison of space and resid.  On the other
	 * hand, a negative resid causes us to loop sending 0-length
	 * segments to the protocol.
	 *
	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
	 * type sockets since that's an error.
	 */
	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
		error = EINVAL;
		goto out;
	}

	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	if (td != NULL)
		td->td_proc->p_stats->p_ru.ru_msgsnd++;
	if (control != NULL)
		clen = control->m_len;

	SOCKBUF_LOCK(&so->so_snd);
restart:
	SOCKBUF_LOCK_ASSERT(&so->so_snd);
	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out_locked;
	do {
		SOCKBUF_LOCK_ASSERT(&so->so_snd);
		if (so->so_snd.sb_state & SBS_CANTSENDMORE)
			snderr(EPIPE);
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			/*
			 * `sendto' and `sendmsg' are allowed on a connection-
			 * based socket if it supports implied connect.
			 * Return ENOTCONN if not connected and no address is
			 * supplied.
			 */
			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
				    !(resid == 0 && clen != 0))
					snderr(ENOTCONN);
			} else if (addr == NULL)
				snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
				    ENOTCONN : EDESTADDRREQ);
		}
		space = sbspace(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if ((atomic && resid > so->so_snd.sb_hiwat) ||
		    clen > so->so_snd.sb_hiwat)
			snderr(EMSGSIZE);
		if (space < resid + clen &&
		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
			if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO))
				snderr(EWOULDBLOCK);
			sbunlock(&so->so_snd);
			error = sbwait(&so->so_snd);
			if (error)
				goto out_locked;
			goto restart;
		}
		SOCKBUF_UNLOCK(&so->so_snd);
		space -= clen;
		do {
			if (uio == NULL) {
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else {
#ifdef ZERO_COPY_SOCKETS
				error = sosend_copyin(uio, &top, atomic,
				    &space, flags);
				if (error != 0) {
					SOCKBUF_LOCK(&so->so_snd);
					goto release;
				}
#else
				/*
				 * Copy the data from userland into a mbuf
				 * chain.  If no data is to be copied in,
				 * a single empty mbuf is returned.
				 */
				top = m_uiotombuf(uio, M_WAITOK, space,
				    (atomic ? max_hdr : 0),
				    (atomic ? M_PKTHDR : 0) |
				    ((flags & MSG_EOR) ? M_EOR : 0));
				if (top == NULL) {
					SOCKBUF_LOCK(&so->so_snd);
					error = EFAULT; /* only possible error */
					goto release;
				}
				space -= resid - uio->uio_resid;
#endif
				resid = uio->uio_resid;
			}
			if (dontroute) {
				SOCK_LOCK(so);
				so->so_options |= SO_DONTROUTE;
				SOCK_UNLOCK(so);
			}
			/*
			 * XXX all the SBS_CANTSENDMORE checks previously
			 * done could be out of date.  We could have received
			 * a reset packet in an interrupt or maybe we slept
			 * while doing page faults in uiomove() etc.  We
			 * could probably recheck again inside the locking
			 * protection here, but there are probably other
			 * places that this also happens.  We must rethink
			 * this.
			 */
			error = (*so->so_proto->pr_usrreqs->pru_send)(so,
			    (flags & MSG_OOB) ? PRUS_OOB :
			/*
			 * If the user set MSG_EOF, the protocol understands
			 * this flag, and there is nothing left to send, then
			 * use PRU_SEND_EOF instead of PRU_SEND.
			 */
			    ((flags & MSG_EOF) &&
			     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
			     (resid <= 0)) ?
				PRUS_EOF :
			/* If there is more to send, set PRUS_MORETOCOME. */
			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
			    top, addr, control, td);
			if (dontroute) {
				SOCK_LOCK(so);
				so->so_options &= ~SO_DONTROUTE;
				SOCK_UNLOCK(so);
			}
			clen = 0;
			control = NULL;
			top = NULL;
			if (error) {
				SOCKBUF_LOCK(&so->so_snd);
				goto release;
			}
		} while (resid && space > 0);
		SOCKBUF_LOCK(&so->so_snd);
	} while (resid);

release:
	SOCKBUF_LOCK_ASSERT(&so->so_snd);
	sbunlock(&so->so_snd);
out_locked:
	SOCKBUF_LOCK_ASSERT(&so->so_snd);
	SOCKBUF_UNLOCK(&so->so_snd);
out:
	if (top != NULL)
		m_freem(top);
	if (control != NULL)
		m_freem(control);
	return (error);
}
#undef snderr

int
sosend(so, addr, uio, top, control, flags, td)
	struct socket *so;
	struct sockaddr *addr;
	struct uio *uio;
	struct mbuf *top;
	struct mbuf *control;
	int flags;
	struct thread *td;
{

	/* XXXRW: Temporary debugging. */
	KASSERT(so->so_proto->pr_usrreqs->pru_sosend != sosend,
	    ("sosend: protocol calls sosend"));

	return (so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top,
	    control, flags, td));
}
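
/*
 * Protocols select a send implementation by pointing pru_sosend at either
 * sosend_generic() or a specialized routine.  A sketch of the convention
 * (the actual tables live in the protocol code; for instance, UDP uses
 * sosend_dgram()):
 *
 *	struct pr_usrreqs udp_usrreqs = {
 *		...
 *		.pru_sosend =	sosend_dgram,
 *		...
 *	};
 */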

/*
 * The part of soreceive() that implements reading non-inline out-of-band
 * data from a socket.  For more complete comments, see soreceive(), from
 * which this code originated.
 *
 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
 * unable to return an mbuf chain to the caller.
 */
static int
soreceive_rcvoob(so, uio, flags)
	struct socket *so;
	struct uio *uio;
	int flags;
{
	struct protosw *pr = so->so_proto;
	struct mbuf *m;
	int error;

	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));

	m = m_get(M_TRYWAIT, MT_DATA);
	if (m == NULL)
		return (ENOBUFS);
	error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
	if (error)
		goto bad;
	do {
#ifdef ZERO_COPY_SOCKETS
		if (so_zero_copy_receive) {
			int disposable;

			if ((m->m_flags & M_EXT)
			 && (m->m_ext.ext_type == EXT_DISPOSABLE))
				disposable = 1;
			else
				disposable = 0;

			error = uiomoveco(mtod(m, void *),
					  min(uio->uio_resid, m->m_len),
					  uio, disposable);
		} else
#endif /* ZERO_COPY_SOCKETS */
		error = uiomove(mtod(m, void *),
		    (int) min(uio->uio_resid, m->m_len), uio);
		m = m_free(m);
	} while (uio->uio_resid && error == 0 && m);
bad:
	if (m != NULL)
		m_freem(m);
	return (error);
}

/*
 * Following replacement or removal of the first mbuf on the first mbuf chain
 * of a socket buffer, push necessary state changes back into the socket
 * buffer so that other consumers see the values consistently.  'nextrecord'
 * is the caller's locally stored value of the original value of
 * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
 * NOTE: 'nextrecord' may be NULL.
 */
static __inline void
sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
{

	SOCKBUF_LOCK_ASSERT(sb);
	/*
	 * First, update for the new value of nextrecord.  If necessary, make
	 * it the first record.
	 */
	if (sb->sb_mb != NULL)
		sb->sb_mb->m_nextpkt = nextrecord;
	else
		sb->sb_mb = nextrecord;

	/*
	 * Now update any dependent socket buffer fields to reflect the new
	 * state.  This is an expanded inline of SB_EMPTY_FIXUP(), with the
	 * addition of a second clause that takes care of the case where
	 * sb_mb has been updated, but remains the last record.
	 */
	if (sb->sb_mb == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (sb->sb_mb->m_nextpkt == NULL)
		sb->sb_lastrecord = sb->sb_mb;
}
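
/*
 * Callers of sockbuf_pushsync() follow a common pattern: cache the record
 * link, unlink or replace the first mbuf, and then resynchronize.  A
 * condensed sketch of the sequence as it appears in soreceive_generic()
 * below:
 *
 *	nextrecord = m->m_nextpkt;
 *	sbfree(&so->so_rcv, m);
 *	so->so_rcv.sb_mb = m_free(m);
 *	m = so->so_rcv.sb_mb;
 *	sockbuf_pushsync(&so->so_rcv, nextrecord);
 */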

/*
 * Implement receive operations on a socket.  We depend on the way that
 * records are added to the sockbuf by sbappend.  In particular, each record
 * (mbufs linked through m_next) must begin with an address if the protocol
 * so specifies, followed by an optional mbuf or mbufs containing ancillary
 * data, and then zero or more mbufs of data.  In order to allow parallelism
 * between network receive and copying to user space, as well as avoid
 * sleeping with a mutex held, we release the socket buffer mutex during the
 * user space copy.  Although the sockbuf is locked, new data may still be
 * appended, and thus we must maintain consistency of the sockbuf during that
 * time.
 *
 * The caller may receive the data as a single mbuf chain by supplying an
 * mbuf **mp0 for use in returning the chain.  The uio is then used only for
 * the count in uio_resid.
 */
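
/*
 * For a PR_ADDR/PR_ATOMIC protocol such as UDP, a single record in the
 * receive buffer therefore has the following shape (an illustrative
 * diagram, not a structure defined in this file):
 *
 *	sb_mb -> MT_SONAME -> [MT_CONTROL ...] -> MT_DATA -> ...   (m_next)
 *	           |
 *	       (m_nextpkt)
 *	           v
 *	       next record
 *
 * Mbufs within a record are linked through m_next; records are linked
 * through m_nextpkt on the first mbuf of each record.
 */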
int
soreceive_generic(so, psa, uio, mp0, controlp, flagsp)
	struct socket *so;
	struct sockaddr **psa;
	struct uio *uio;
	struct mbuf **mp0;
	struct mbuf **controlp;
	int *flagsp;
{
	struct mbuf *m, **mp;
	int flags, len, error, offset;
	struct protosw *pr = so->so_proto;
	struct mbuf *nextrecord;
	int moff, type = 0;
	int orig_resid = uio->uio_resid;

	mp = mp0;
	if (psa != NULL)
		*psa = NULL;
	if (controlp != NULL)
		*controlp = NULL;
	if (flagsp != NULL)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB)
		return (soreceive_rcvoob(so, uio, flags));
	if (mp != NULL)
		*mp = NULL;
	if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING)
	    && uio->uio_resid)
		(*pr->pr_usrreqs->pru_rcvd)(so, 0);

	SOCKBUF_LOCK(&so->so_rcv);
restart:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
	if (error)
		goto out;

	m = so->so_rcv.sb_mb;
	/*
	 * If we have less data than requested, block awaiting more (subject
	 * to any timeout) if:
	 *   1. the current count is less than the low water mark, or
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat), and
	 *   3. MSG_DONTWAIT is not set.
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning a
	 * short count if a timeout or signal occurs after we start.
	 */
	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < uio->uio_resid) &&
	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
		KASSERT(m != NULL || !so->so_rcv.sb_cc,
		    ("receive: m == %p so->so_rcv.sb_cc == %u",
		    m, so->so_rcv.sb_cc));
		if (so->so_error) {
			if (m != NULL)
				goto dontblock;
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
		if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
			if (m)
				goto dontblock;
			else
				goto release;
		}
		for (; m != NULL; m = m->m_next)
			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio->uio_resid == 0)
			goto release;
		if ((so->so_state & SS_NBIO) ||
		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
			error = EWOULDBLOCK;
			goto release;
		}
		SBLASTRECORDCHK(&so->so_rcv);
		SBLASTMBUFCHK(&so->so_rcv);
		sbunlock(&so->so_rcv);
		error = sbwait(&so->so_rcv);
		if (error)
			goto out;
		goto restart;
	}
dontblock:
	/*
	 * From this point onward, we maintain 'nextrecord' as a cache of the
	 * pointer to the next record in the socket buffer.  We must keep the
	 * various socket buffer pointers and local stack versions of the
	 * pointers in sync, pushing out modifications before dropping the
	 * socket buffer mutex, and re-reading them when picking it up.
	 *
	 * Otherwise, we will race with the network stack appending new data
	 * or records onto the socket buffer by using inconsistent/stale
	 * versions of the field, possibly resulting in socket buffer
	 * corruption.
	 *
	 * By holding the high-level sblock(), we prevent simultaneous
	 * readers from pulling off the front of the socket buffer.
	 */
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	if (uio->uio_td)
		uio->uio_td->td_proc->p_stats->p_ru.ru_msgrcv++;
	KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
	SBLASTRECORDCHK(&so->so_rcv);
	SBLASTMBUFCHK(&so->so_rcv);
	nextrecord = m->m_nextpkt;
	if (pr->pr_flags & PR_ADDR) {
		KASSERT(m->m_type == MT_SONAME,
		    ("m->m_type == %d", m->m_type));
		orig_resid = 0;
		if (psa != NULL)
			*psa = sodupsockaddr(mtod(m, struct sockaddr *),
			    M_NOWAIT);
		if (flags & MSG_PEEK) {
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			so->so_rcv.sb_mb = m_free(m);
			m = so->so_rcv.sb_mb;
			sockbuf_pushsync(&so->so_rcv, nextrecord);
		}
	}

	/*
	 * Process one or more MT_CONTROL mbufs present before any data mbufs
	 * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
	 * just copy the data; if !MSG_PEEK, we call into the protocol to
	 * perform externalization (or freeing if controlp == NULL).
	 */
	if (m != NULL && m->m_type == MT_CONTROL) {
		struct mbuf *cm = NULL, *cmn;
		struct mbuf **cme = &cm;

		do {
			if (flags & MSG_PEEK) {
				if (controlp != NULL) {
					*controlp = m_copy(m, 0, m->m_len);
					controlp = &(*controlp)->m_next;
				}
				m = m->m_next;
			} else {
				sbfree(&so->so_rcv, m);
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = NULL;
				*cme = m;
				cme = &(*cme)->m_next;
				m = so->so_rcv.sb_mb;
			}
		} while (m != NULL && m->m_type == MT_CONTROL);
		if ((flags & MSG_PEEK) == 0)
			sockbuf_pushsync(&so->so_rcv, nextrecord);
		while (cm != NULL) {
			cmn = cm->m_next;
			cm->m_next = NULL;
			if (pr->pr_domain->dom_externalize != NULL) {
				SOCKBUF_UNLOCK(&so->so_rcv);
				error = (*pr->pr_domain->dom_externalize)
				    (cm, controlp);
				SOCKBUF_LOCK(&so->so_rcv);
			} else if (controlp != NULL)
				*controlp = cm;
			else
				m_freem(cm);
			if (controlp != NULL) {
				orig_resid = 0;
				while (*controlp != NULL)
					controlp = &(*controlp)->m_next;
			}
			cm = cmn;
		}
		if (m != NULL)
			nextrecord = so->so_rcv.sb_mb->m_nextpkt;
		else
			nextrecord = so->so_rcv.sb_mb;
		orig_resid = 0;
	}
	if (m != NULL) {
		if ((flags & MSG_PEEK) == 0) {
			KASSERT(m->m_nextpkt == nextrecord,
			    ("soreceive: post-control, nextrecord !sync"));
			if (nextrecord == NULL) {
				KASSERT(so->so_rcv.sb_mb == m,
				    ("soreceive: post-control, sb_mb!=m"));
				KASSERT(so->so_rcv.sb_lastrecord == m,
				    ("soreceive: post-control, lastrecord!=m"));
			}
		}
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
	} else {
		if ((flags & MSG_PEEK) == 0) {
			KASSERT(so->so_rcv.sb_mb == nextrecord,
			    ("soreceive: sb_mb != nextrecord"));
			if (so->so_rcv.sb_mb == NULL) {
				KASSERT(so->so_rcv.sb_lastrecord == NULL,
				    ("soreceive: sb_lastrecord != NULL"));
			}
		}
	}
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	SBLASTRECORDCHK(&so->so_rcv);
	SBLASTMBUFCHK(&so->so_rcv);
1659
1660	/*
1661	 * Now continue to read any data mbufs off of the head of the socket
1662	 * buffer until the read request is satisfied.  Note that 'type' is
1663	 * used to store the type of any mbuf reads that have happened so far
1664	 * such that soreceive() can stop reading if the type changes, which
1665	 * causes soreceive() to return only one of regular data and inline
1666	 * out-of-band data in a single socket receive operation.
1667	 */
1668	moff = 0;
1669	offset = 0;
1670	while (m != NULL && uio->uio_resid > 0 && error == 0) {
1671		/*
1672		 * If the type of mbuf has changed since the last mbuf
1673		 * examined ('type'), end the receive operation.
1674	 	 */
1675		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1676		if (m->m_type == MT_OOBDATA) {
1677			if (type != MT_OOBDATA)
1678				break;
1679		} else if (type == MT_OOBDATA)
1680			break;
1681		else
1682		    KASSERT(m->m_type == MT_DATA,
1683			("m->m_type == %d", m->m_type));
1684		so->so_rcv.sb_state &= ~SBS_RCVATMARK;
1685		len = uio->uio_resid;
1686		if (so->so_oobmark && len > so->so_oobmark - offset)
1687			len = so->so_oobmark - offset;
1688		if (len > m->m_len - moff)
1689			len = m->m_len - moff;
1690		/*
1691		 * If mp is set, just pass back the mbufs.  Otherwise copy
1692		 * them out via the uio, then free.  Sockbuf must be
1693		 * consistent here (points to current mbuf, it points to next
1694		 * record) when we drop priority; we must note any additions
1695		 * to the sockbuf when we block interrupts again.
1696		 */
1697		if (mp == NULL) {
1698			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1699			SBLASTRECORDCHK(&so->so_rcv);
1700			SBLASTMBUFCHK(&so->so_rcv);
1701			SOCKBUF_UNLOCK(&so->so_rcv);
1702#ifdef ZERO_COPY_SOCKETS
1703			if (so_zero_copy_receive) {
1704				int disposable;
1705
1706				if ((m->m_flags & M_EXT)
1707				 && (m->m_ext.ext_type == EXT_DISPOSABLE))
1708					disposable = 1;
1709				else
1710					disposable = 0;
1711
1712				error = uiomoveco(mtod(m, char *) + moff,
1713						  (int)len, uio,
1714						  disposable);
1715			} else
1716#endif /* ZERO_COPY_SOCKETS */
1717			error = uiomove(mtod(m, char *) + moff, (int)len, uio);
1718			SOCKBUF_LOCK(&so->so_rcv);
1719			if (error) {
1720				/*
1721				 * The MT_SONAME mbuf has already been removed
1722				 * from the record, so it is necessary to
1723				 * remove the data mbufs, if any, to preserve
1724				 * the invariant in the case of PR_ADDR that
1725				 * requires MT_SONAME mbufs at the head of
1726				 * each record.
1727				 */
1728				if (m && pr->pr_flags & PR_ATOMIC
1729				    && ((flags & MSG_PEEK) == 0)) {
1730					(void)sbdroprecord_locked(&so->so_rcv);
1731				}
1732				goto release;
1733			}
1734		} else
1735			uio->uio_resid -= len;
1736		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1737		if (len == m->m_len - moff) {
1738			if (m->m_flags & M_EOR)
1739				flags |= MSG_EOR;
1740			if (flags & MSG_PEEK) {
1741				m = m->m_next;
1742				moff = 0;
1743			} else {
1744				nextrecord = m->m_nextpkt;
1745				sbfree(&so->so_rcv, m);
1746				if (mp != NULL) {
1747					*mp = m;
1748					mp = &m->m_next;
1749					so->so_rcv.sb_mb = m = m->m_next;
1750					*mp = NULL;
1751				} else {
1752					so->so_rcv.sb_mb = m_free(m);
1753					m = so->so_rcv.sb_mb;
1754				}
1755				sockbuf_pushsync(&so->so_rcv, nextrecord);
1756				SBLASTRECORDCHK(&so->so_rcv);
1757				SBLASTMBUFCHK(&so->so_rcv);
1758			}
1759		} else {
1760			if (flags & MSG_PEEK)
1761				moff += len;
1762			else {
1763				if (mp != NULL) {
1764					int copy_flag;
1765
1766					if (flags & MSG_DONTWAIT)
1767						copy_flag = M_DONTWAIT;
1768					else
1769						copy_flag = M_TRYWAIT;
1770					if (copy_flag == M_TRYWAIT)
1771						SOCKBUF_UNLOCK(&so->so_rcv);
1772					*mp = m_copym(m, 0, len, copy_flag);
1773					if (copy_flag == M_TRYWAIT)
1774						SOCKBUF_LOCK(&so->so_rcv);
1775					if (*mp == NULL) {
1776						/*
1777						 * m_copym() couldn't
1778						 * allocate an mbuf.  Adjust
1779						 * uio_resid back (it was
1780						 * adjusted down by len
1781						 * bytes, which we didn't end
1782						 * up "copying" over).
1783						 */
1784						uio->uio_resid += len;
1785						break;
1786					}
1787				}
1788				m->m_data += len;
1789				m->m_len -= len;
1790				so->so_rcv.sb_cc -= len;
1791			}
1792		}
1793		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1794		if (so->so_oobmark) {
1795			if ((flags & MSG_PEEK) == 0) {
1796				so->so_oobmark -= len;
1797				if (so->so_oobmark == 0) {
1798					so->so_rcv.sb_state |= SBS_RCVATMARK;
1799					break;
1800				}
1801			} else {
1802				offset += len;
1803				if (offset == so->so_oobmark)
1804					break;
1805			}
1806		}
1807		if (flags & MSG_EOR)
1808			break;
1809		/*
1810		 * If the MSG_WAITALL flag is set (for a non-atomic socket), we
1811		 * must not quit until "uio->uio_resid == 0" or an error
1812		 * termination.  If a signal/timeout occurs, return with a
1813		 * short count but without error.  Keep sockbuf locked
1814		 * against other readers.
1815		 */
1816		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
1817		    !sosendallatonce(so) && nextrecord == NULL) {
1818			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1819			if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE)
1820				break;
1821			/*
1822			 * Notify the protocol that some data has been
1823			 * drained before blocking.
1824			 */
1825			if (pr->pr_flags & PR_WANTRCVD) {
1826				SOCKBUF_UNLOCK(&so->so_rcv);
1827				(*pr->pr_usrreqs->pru_rcvd)(so, flags);
1828				SOCKBUF_LOCK(&so->so_rcv);
1829			}
1830			SBLASTRECORDCHK(&so->so_rcv);
1831			SBLASTMBUFCHK(&so->so_rcv);
1832			error = sbwait(&so->so_rcv);
1833			if (error)
1834				goto release;
1835			m = so->so_rcv.sb_mb;
1836			if (m != NULL)
1837				nextrecord = m->m_nextpkt;
1838		}
1839	}
1840
1841	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1842	if (m != NULL && pr->pr_flags & PR_ATOMIC) {
1843		flags |= MSG_TRUNC;
1844		if ((flags & MSG_PEEK) == 0)
1845			(void) sbdroprecord_locked(&so->so_rcv);
1846	}
1847	if ((flags & MSG_PEEK) == 0) {
1848		if (m == NULL) {
1849			/*
1850			 * First part is an inline SB_EMPTY_FIXUP().  Second
1851			 * part makes sure sb_lastrecord is up-to-date if
1852			 * there is still data in the socket buffer.
1853			 */
1854			so->so_rcv.sb_mb = nextrecord;
1855			if (so->so_rcv.sb_mb == NULL) {
1856				so->so_rcv.sb_mbtail = NULL;
1857				so->so_rcv.sb_lastrecord = NULL;
1858			} else if (nextrecord->m_nextpkt == NULL)
1859				so->so_rcv.sb_lastrecord = nextrecord;
1860		}
1861		SBLASTRECORDCHK(&so->so_rcv);
1862		SBLASTMBUFCHK(&so->so_rcv);
1863		/*
1864		 * If soreceive() is being done from the socket callback,
1865		 * then we need not generate an ACK to the peer to update the
1866		 * window, since the ACK will be generated on return to TCP.
1867		 */
1868		if (!(flags & MSG_SOCALLBCK) &&
1869		    (pr->pr_flags & PR_WANTRCVD)) {
1870			SOCKBUF_UNLOCK(&so->so_rcv);
1871			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
1872			SOCKBUF_LOCK(&so->so_rcv);
1873		}
1874	}
1875	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1876	if (orig_resid == uio->uio_resid && orig_resid &&
1877	    (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
1878		sbunlock(&so->so_rcv);
1879		goto restart;
1880	}
1881
1882	if (flagsp != NULL)
1883		*flagsp |= flags;
1884release:
1885	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1886	sbunlock(&so->so_rcv);
1887out:
1888	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1889	SOCKBUF_UNLOCK(&so->so_rcv);
1890	return (error);
1891}
1892
1893int
1894soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
1895    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1896{
1902
1903	/* XXXRW: Temporary debugging. */
1904	KASSERT(so->so_proto->pr_usrreqs->pru_soreceive != soreceive,
1905	    ("soreceive: protocol calls soreceive"));
1906
1907	return (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio, mp0,
1908	    controlp, flagsp));
1909}
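
/*
 * Illustrative sketch (editor's addition, not part of the original
 * revision): how an in-kernel consumer might read into a kernel buffer
 * through the public soreceive() interface above.  Guarded by a
 * hypothetical macro so it has no effect on compilation.
 */
#ifdef SORECEIVE_EXAMPLE
static int
soreceive_example(struct socket *so, void *buf, int buflen,
    struct thread *td)
{
	struct uio auio;
	struct iovec aiov;
	int flags, error;

	aiov.iov_base = buf;
	aiov.iov_len = buflen;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = 0;
	auio.uio_resid = buflen;
	auio.uio_segflg = UIO_SYSSPACE;	/* kernel, not user, buffer */
	auio.uio_rw = UIO_READ;
	auio.uio_td = td;
	flags = 0;			/* no MSG_* modifiers */
	error = soreceive(so, NULL, &auio, NULL, NULL, &flags);
	return (error);
}
#endif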
1910
1911int
1912soshutdown(struct socket *so, int how)
1913{
1916	struct protosw *pr = so->so_proto;
1917
1918	if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
1919		return (EINVAL);
1920
1921	if (how != SHUT_WR)
1922		sorflush(so);
1923	if (how != SHUT_RD)
1924		return ((*pr->pr_usrreqs->pru_shutdown)(so));
1925	return (0);
1926}
1927
1928void
1929sorflush(struct socket *so)
1930{
1932	struct sockbuf *sb = &so->so_rcv;
1933	struct protosw *pr = so->so_proto;
1934	struct sockbuf asb;
1935
1936	/*
1937	 * XXXRW: This is quite ugly.  Previously, this code made a copy of
1938	 * the socket buffer, then zeroed the original to clear the buffer
1939	 * fields.  However, with mutexes in the socket buffer, this causes
1940	 * problems.  We only clear the zeroable fields of the original;
1941	 * however, we have to initialize and destroy the mutex in the copy
1942	 * so that dom_dispose() and sbrelease() can lock it as needed.
1943	 */
1944	SOCKBUF_LOCK(sb);
1945	sb->sb_flags |= SB_NOINTR;
1946	(void) sblock(sb, M_WAITOK);
1947	/*
1948	 * socantrcvmore_locked() drops the socket buffer mutex so that it
1949	 * can safely perform wakeups.  Re-acquire the mutex before
1950	 * continuing.
1951	 */
1952	socantrcvmore_locked(so);
1953	SOCKBUF_LOCK(sb);
1954	sbunlock(sb);
1955	/*
1956	 * Invalidate/clear most of the sockbuf structure, but leave selinfo
1957	 * and mutex data unchanged.
1958	 */
1959	bzero(&asb, offsetof(struct sockbuf, sb_startzero));
1960	bcopy(&sb->sb_startzero, &asb.sb_startzero,
1961	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
1962	bzero(&sb->sb_startzero,
1963	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
1964	SOCKBUF_UNLOCK(sb);
1965
1966	SOCKBUF_LOCK_INIT(&asb, "so_rcv");
1967	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
1968		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
1969	sbrelease(&asb, so);
1970	SOCKBUF_LOCK_DESTROY(&asb);
1971}
1972
1973/*
1974 * Perhaps this routine, and sooptcopyout(), below, ought to come in an
1975 * additional variant to handle the case where the option value needs to be
1976 * some kind of integer, but not a specific size.  In addition to their use
1977 * here, these functions are also called by the protocol-level pr_ctloutput()
1978 * routines.
1979 */
1980int
1981sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
1982{
1987	size_t	valsize;
1988
1989	/*
1990	 * If the user gives us more than we wanted, we ignore it, but if we
1991	 * don't get the minimum length the caller wants, we return EINVAL.
1992	 * On success, sopt->sopt_valsize is set to however much we actually
1993	 * retrieved.
1994	 */
1995	if ((valsize = sopt->sopt_valsize) < minlen)
1996		return (EINVAL);
1997	if (valsize > len)
1998		sopt->sopt_valsize = valsize = len;
1999
2000	if (sopt->sopt_td != NULL)
2001		return (copyin(sopt->sopt_val, buf, valsize));
2002
2003	bcopy(sopt->sopt_val, buf, valsize);
2004	return (0);
2005}
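
/*
 * Illustrative sketch (editor's addition, not part of the original
 * revision): how a protocol-level pr_ctloutput() routine typically
 * uses sooptcopyin() on the SOPT_SET path to fetch a fixed-size
 * integer option.  The function name and the validation shown are
 * hypothetical.
 */
#ifdef SOOPTCOPYIN_EXAMPLE
static int
example_ctloutput_set(struct socket *so, struct sockopt *sopt)
{
	int error, optval;

	error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
	if (error)
		return (error);
	if (optval < 0)			/* validate before applying */
		return (EINVAL);
	/* ... apply optval to protocol state here ... */
	return (0);
}
#endif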
2006
2007/*
2008 * Kernel version of setsockopt(2).
2009 *
2010 * XXX: optlen is size_t, not socklen_t
2011 */
2012int
2013so_setsockopt(struct socket *so, int level, int optname, void *optval,
2014    size_t optlen)
2015{
2016	struct sockopt sopt;
2017
2018	sopt.sopt_level = level;
2019	sopt.sopt_name = optname;
2020	sopt.sopt_dir = SOPT_SET;
2021	sopt.sopt_val = optval;
2022	sopt.sopt_valsize = optlen;
2023	sopt.sopt_td = NULL;
2024	return (sosetopt(so, &sopt));
2025}
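
/*
 * Illustrative sketch (editor's addition, not part of the original
 * revision): an in-kernel consumer can set a boolean socket option on
 * a socket it owns with so_setsockopt().  Because sopt_td is left
 * NULL, sooptcopyin() takes the bcopy() path rather than copyin().
 * The function name is hypothetical.
 */
#ifdef SO_SETSOCKOPT_EXAMPLE
static int
example_enable_keepalive(struct socket *so)
{
	int on = 1;

	return (so_setsockopt(so, SOL_SOCKET, SO_KEEPALIVE, &on,
	    sizeof(on)));
}
#endif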
2026
2027int
2028sosetopt(struct socket *so, struct sockopt *sopt)
2029{
2032	int	error, optval;
2033	struct	linger l;
2034	struct	timeval tv;
2035	u_long  val;
2036#ifdef MAC
2037	struct mac extmac;
2038#endif
2039
2040	error = 0;
2041	if (sopt->sopt_level != SOL_SOCKET) {
2042		if (so->so_proto && so->so_proto->pr_ctloutput)
2043			return ((*so->so_proto->pr_ctloutput)
2044			return ((*so->so_proto->pr_ctloutput)(so, sopt));
2046	} else {
2047		switch (sopt->sopt_name) {
2048#ifdef INET
2049		case SO_ACCEPTFILTER:
2050			error = do_setopt_accept_filter(so, sopt);
2051			if (error)
2052				goto bad;
2053			break;
2054#endif
2055		case SO_LINGER:
2056			error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
2057			if (error)
2058				goto bad;
2059
2060			SOCK_LOCK(so);
2061			so->so_linger = l.l_linger;
2062			if (l.l_onoff)
2063				so->so_options |= SO_LINGER;
2064			else
2065				so->so_options &= ~SO_LINGER;
2066			SOCK_UNLOCK(so);
2067			break;
2068
2069		case SO_DEBUG:
2070		case SO_KEEPALIVE:
2071		case SO_DONTROUTE:
2072		case SO_USELOOPBACK:
2073		case SO_BROADCAST:
2074		case SO_REUSEADDR:
2075		case SO_REUSEPORT:
2076		case SO_OOBINLINE:
2077		case SO_TIMESTAMP:
2078		case SO_BINTIME:
2079		case SO_NOSIGPIPE:
2080			error = sooptcopyin(sopt, &optval, sizeof optval,
2081					    sizeof optval);
2082			if (error)
2083				goto bad;
2084			SOCK_LOCK(so);
2085			if (optval)
2086				so->so_options |= sopt->sopt_name;
2087			else
2088				so->so_options &= ~sopt->sopt_name;
2089			SOCK_UNLOCK(so);
2090			break;
2091
2092		case SO_SNDBUF:
2093		case SO_RCVBUF:
2094		case SO_SNDLOWAT:
2095		case SO_RCVLOWAT:
2096			error = sooptcopyin(sopt, &optval, sizeof optval,
2097					    sizeof optval);
2098			if (error)
2099				goto bad;
2100
2101			/*
2102			 * Values < 1 make no sense for any of these options,
2103			 * so disallow them.
2104			 */
2105			if (optval < 1) {
2106				error = EINVAL;
2107				goto bad;
2108			}
2109
2110			switch (sopt->sopt_name) {
2111			case SO_SNDBUF:
2112			case SO_RCVBUF:
2113				if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
2114				    &so->so_snd : &so->so_rcv, (u_long)optval,
2115				    so, curthread) == 0) {
2116					error = ENOBUFS;
2117					goto bad;
2118				}
2119				break;
2120
2121			/*
2122			 * Make sure the low-water is never greater than the
2123			 * high-water.
2124			 */
2125			case SO_SNDLOWAT:
2126				SOCKBUF_LOCK(&so->so_snd);
2127				so->so_snd.sb_lowat =
2128				    (optval > so->so_snd.sb_hiwat) ?
2129				    so->so_snd.sb_hiwat : optval;
2130				SOCKBUF_UNLOCK(&so->so_snd);
2131				break;
2132			case SO_RCVLOWAT:
2133				SOCKBUF_LOCK(&so->so_rcv);
2134				so->so_rcv.sb_lowat =
2135				    (optval > so->so_rcv.sb_hiwat) ?
2136				    so->so_rcv.sb_hiwat : optval;
2137				SOCKBUF_UNLOCK(&so->so_rcv);
2138				break;
2139			}
2140			break;
2141
2142		case SO_SNDTIMEO:
2143		case SO_RCVTIMEO:
2144#ifdef COMPAT_IA32
2145			if (curthread->td_proc->p_sysent == &ia32_freebsd_sysvec) {
2146				struct timeval32 tv32;
2147
2148				error = sooptcopyin(sopt, &tv32, sizeof tv32,
2149				    sizeof tv32);
2150				CP(tv32, tv, tv_sec);
2151				CP(tv32, tv, tv_usec);
2152			} else
2153#endif
2154				error = sooptcopyin(sopt, &tv, sizeof tv,
2155				    sizeof tv);
2156			if (error)
2157				goto bad;
2158
2159			/* assert(hz > 0); */
2160			if (tv.tv_sec < 0 || tv.tv_sec > INT_MAX / hz ||
2161			    tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
2162				error = EDOM;
2163				goto bad;
2164			}
2165			/* assert(tick > 0); */
2166			/* assert(ULONG_MAX - INT_MAX >= 1000000); */
2167			val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick;
2168			if (val > INT_MAX) {
2169				error = EDOM;
2170				goto bad;
2171			}
2172			if (val == 0 && tv.tv_usec != 0)
2173				val = 1;
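		/*
		 * Worked example (editor's note, assuming hz = 1000 and
		 * thus tick = 1000): tv = { 2, 500000 } gives
		 * val = 2 * 1000 + 500000 / 1000 = 2500 ticks, i.e. a
		 * 2.5 second timeout stored in sb_timeo below.
		 */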
2174
2175			switch (sopt->sopt_name) {
2176			case SO_SNDTIMEO:
2177				so->so_snd.sb_timeo = val;
2178				break;
2179			case SO_RCVTIMEO:
2180				so->so_rcv.sb_timeo = val;
2181				break;
2182			}
2183			break;
2184
2185		case SO_LABEL:
2186#ifdef MAC
2187			error = sooptcopyin(sopt, &extmac, sizeof extmac,
2188			    sizeof extmac);
2189			if (error)
2190				goto bad;
2191			error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
2192			    so, &extmac);
2193#else
2194			error = EOPNOTSUPP;
2195#endif
2196			break;
2197
2198		default:
2199			error = ENOPROTOOPT;
2200			break;
2201		}
2202		if (error == 0 && so->so_proto != NULL &&
2203		    so->so_proto->pr_ctloutput != NULL) {
2204			(void)((*so->so_proto->pr_ctloutput)(so, sopt));
2206		}
2207	}
2208bad:
2209	return (error);
2210}
2211
2212/*
2213 * Helper routine for getsockopt.
2214 */
2215int
2216sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
2217{
2218	int	error;
2219	size_t	valsize;
2220
2221	error = 0;
2222
2223	/*
2224	 * Documented get behavior is that we always return a value, possibly
2225	 * truncated to fit in the user's buffer.  Traditional behavior is
2226	 * that we always tell the user precisely how much we copied, rather
2227	 * than something useful like the total amount we had available for
2228	 * her.  Note that this interface is not idempotent; the entire
2229	 * answer must be generated ahead of time.
2230	 */
2231	valsize = min(len, sopt->sopt_valsize);
2232	sopt->sopt_valsize = valsize;
2233	if (sopt->sopt_val != NULL) {
2234		if (sopt->sopt_td != NULL)
2235			error = copyout(buf, sopt->sopt_val, valsize);
2236		else
2237			bcopy(buf, sopt->sopt_val, valsize);
2238	}
2239	return (error);
2240}
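
/*
 * Illustrative sketch (editor's addition, not part of the original
 * revision): the matching SOPT_GET path of a protocol's pr_ctloutput()
 * routine returns an integer through sooptcopyout(), which truncates
 * to the caller's buffer as described above.  The function name and
 * the value's source are hypothetical.
 */
#ifdef SOOPTCOPYOUT_EXAMPLE
static int
example_ctloutput_get(struct socket *so, struct sockopt *sopt)
{
	int optval;

	optval = 0;	/* ... read the value from protocol state ... */
	return (sooptcopyout(sopt, &optval, sizeof(optval)));
}
#endif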
2241
2242int
2243sogetopt(struct socket *so, struct sockopt *sopt)
2244{
2247	int	error, optval;
2248	struct	linger l;
2249	struct	timeval tv;
2250#ifdef MAC
2251	struct mac extmac;
2252#endif
2253
2254	error = 0;
2255	if (sopt->sopt_level != SOL_SOCKET) {
2256		if (so->so_proto && so->so_proto->pr_ctloutput) {
2257			return ((*so->so_proto->pr_ctloutput)(so, sopt));
2259		} else
2260			return (ENOPROTOOPT);
2261	} else {
2262		switch (sopt->sopt_name) {
2263#ifdef INET
2264		case SO_ACCEPTFILTER:
2265			error = do_getopt_accept_filter(so, sopt);
2266			break;
2267#endif
2268		case SO_LINGER:
2269			SOCK_LOCK(so);
2270			l.l_onoff = so->so_options & SO_LINGER;
2271			l.l_linger = so->so_linger;
2272			SOCK_UNLOCK(so);
2273			error = sooptcopyout(sopt, &l, sizeof l);
2274			break;
2275
2276		case SO_USELOOPBACK:
2277		case SO_DONTROUTE:
2278		case SO_DEBUG:
2279		case SO_KEEPALIVE:
2280		case SO_REUSEADDR:
2281		case SO_REUSEPORT:
2282		case SO_BROADCAST:
2283		case SO_OOBINLINE:
2284		case SO_ACCEPTCONN:
2285		case SO_TIMESTAMP:
2286		case SO_BINTIME:
2287		case SO_NOSIGPIPE:
2288			optval = so->so_options & sopt->sopt_name;
2289integer:
2290			error = sooptcopyout(sopt, &optval, sizeof optval);
2291			break;
2292
2293		case SO_TYPE:
2294			optval = so->so_type;
2295			goto integer;
2296
2297		case SO_ERROR:
2298			SOCK_LOCK(so);
2299			optval = so->so_error;
2300			so->so_error = 0;
2301			SOCK_UNLOCK(so);
2302			goto integer;
2303
2304		case SO_SNDBUF:
2305			optval = so->so_snd.sb_hiwat;
2306			goto integer;
2307
2308		case SO_RCVBUF:
2309			optval = so->so_rcv.sb_hiwat;
2310			goto integer;
2311
2312		case SO_SNDLOWAT:
2313			optval = so->so_snd.sb_lowat;
2314			goto integer;
2315
2316		case SO_RCVLOWAT:
2317			optval = so->so_rcv.sb_lowat;
2318			goto integer;
2319
2320		case SO_SNDTIMEO:
2321		case SO_RCVTIMEO:
2322			optval = (sopt->sopt_name == SO_SNDTIMEO ?
2323				  so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
2324
2325			tv.tv_sec = optval / hz;
2326			tv.tv_usec = (optval % hz) * tick;
2327#ifdef COMPAT_IA32
2328			if (curthread->td_proc->p_sysent == &ia32_freebsd_sysvec) {
2329				struct timeval32 tv32;
2330
2331				CP(tv, tv32, tv_sec);
2332				CP(tv, tv32, tv_usec);
2333				error = sooptcopyout(sopt, &tv32, sizeof tv32);
2334			} else
2335#endif
2336				error = sooptcopyout(sopt, &tv, sizeof tv);
2337			break;
2338
2339		case SO_LABEL:
2340#ifdef MAC
2341			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2342			    sizeof(extmac));
2343			if (error)
2344				return (error);
2345			error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
2346			    so, &extmac);
2347			if (error)
2348				return (error);
2349			error = sooptcopyout(sopt, &extmac, sizeof extmac);
2350#else
2351			error = EOPNOTSUPP;
2352#endif
2353			break;
2354
2355		case SO_PEERLABEL:
2356#ifdef MAC
2357			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2358			    sizeof(extmac));
2359			if (error)
2360				return (error);
2361			error = mac_getsockopt_peerlabel(
2362			    sopt->sopt_td->td_ucred, so, &extmac);
2363			if (error)
2364				return (error);
2365			error = sooptcopyout(sopt, &extmac, sizeof extmac);
2366#else
2367			error = EOPNOTSUPP;
2368#endif
2369			break;
2370
2371		case SO_LISTENQLIMIT:
2372			optval = so->so_qlimit;
2373			goto integer;
2374
2375		case SO_LISTENQLEN:
2376			optval = so->so_qlen;
2377			goto integer;
2378
2379		case SO_LISTENINCQLEN:
2380			optval = so->so_incqlen;
2381			goto integer;
2382
2383		default:
2384			error = ENOPROTOOPT;
2385			break;
2386		}
2387		return (error);
2388	}
2389}
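
/*
 * Illustrative sketch (editor's addition, not part of the original
 * revision): this file provides no getsockopt(2) twin for
 * so_setsockopt(), but an in-kernel caller can drive sogetopt() by
 * building the sockopt by hand.  so_getsockopt_example() is a
 * hypothetical helper, not an existing kernel interface.
 */
#ifdef SO_GETSOCKOPT_EXAMPLE
static int
so_getsockopt_example(struct socket *so, int level, int optname,
    void *optval, size_t *optlen)
{
	struct sockopt sopt;
	int error;

	sopt.sopt_level = level;
	sopt.sopt_name = optname;
	sopt.sopt_dir = SOPT_GET;
	sopt.sopt_val = optval;
	sopt.sopt_valsize = *optlen;
	sopt.sopt_td = NULL;		/* kernel buffer: bcopy() path */
	error = sogetopt(so, &sopt);
	if (error == 0)
		*optlen = sopt.sopt_valsize;	/* amount actually copied */
	return (error);
}
#endif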
2390
2391/* XXX: prepare mbuf for (__FreeBSD__ < 3) routines. */
2392int
2393soopt_getm(struct sockopt *sopt, struct mbuf **mp)
2394{
2395	struct mbuf *m, *m_prev;
2396	int sopt_size = sopt->sopt_valsize;
2397
2398	MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
2399	if (m == NULL)
2400		return (ENOBUFS);
2401	if (sopt_size > MLEN) {
2402		MCLGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT);
2403		if ((m->m_flags & M_EXT) == 0) {
2404			m_free(m);
2405			return (ENOBUFS);
2406		}
2407		m->m_len = min(MCLBYTES, sopt_size);
2408	} else {
2409		m->m_len = min(MLEN, sopt_size);
2410	}
2411	sopt_size -= m->m_len;
2412	*mp = m;
2413	m_prev = m;
2414
2415	while (sopt_size) {
2416		MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
2417		if (m == NULL) {
2418			m_freem(*mp);
2419			return (ENOBUFS);
2420		}
2421		if (sopt_size > MLEN) {
2422			MCLGET(m, sopt->sopt_td != NULL ? M_TRYWAIT :
2423			    M_DONTWAIT);
2424			if ((m->m_flags & M_EXT) == 0) {
2425				m_freem(m);
2426				m_freem(*mp);
2427				return (ENOBUFS);
2428			}
2429			m->m_len = min(MCLBYTES, sopt_size);
2430		} else {
2431			m->m_len = min(MLEN, sopt_size);
2432		}
2433		sopt_size -= m->m_len;
2434		m_prev->m_next = m;
2435		m_prev = m;
2436	}
2437	return (0);
2438}
2439
2440/* XXX: copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
2441int
2442soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
2443{
2444	struct mbuf *m0 = m;
2445
2446	if (sopt->sopt_val == NULL)
2447		return (0);
2448	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2449		if (sopt->sopt_td != NULL) {
2450			int error;
2451
2452			error = copyin(sopt->sopt_val, mtod(m, char *),
2453				       m->m_len);
2454			if (error != 0) {
2455				m_freem(m0);
2456				return (error);
2457			}
2458		} else
2459			bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
2460		sopt->sopt_valsize -= m->m_len;
2461		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2462		m = m->m_next;
2463	}
2464	if (m != NULL) /* chain should have been sized by soopt_getm() */
2465		panic("ip6_sooptmcopyin");
2466	return (0);
2467}
2468
2469/* XXX: copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
2470int
2471soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
2472{
2473	struct mbuf *m0 = m;
2474	size_t valsize = 0;
2475
2476	if (sopt->sopt_val == NULL)
2477		return (0);
2478	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2479		if (sopt->sopt_td != NULL) {
2480			int error;
2481
2482			error = copyout(mtod(m, char *), sopt->sopt_val,
2483				       m->m_len);
2484			if (error != 0) {
2485				m_freem(m0);
2486				return (error);
2487			}
2488		} else
2489			bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
2490		sopt->sopt_valsize -= m->m_len;
2491		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2492		valsize += m->m_len;
2493		m = m->m_next;
2494	}
2495	if (m != NULL) {
2496		/* a large enough buffer should have been supplied from user-land */
2497		m_freem(m0);
2498		return (EINVAL);
2499	}
2500	sopt->sopt_valsize = valsize;
2501	return (0);
2502}
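
/*
 * Illustrative sketch (editor's addition, not part of the original
 * revision): the intended calling pattern for the three mbuf-based
 * helpers above on the set path is to size a chain from the sockopt
 * with soopt_getm() and then fill it with soopt_mcopyin(); the get
 * path instead fills the chain from protocol state and copies it back
 * with soopt_mcopyout().  The function name is hypothetical.
 */
#ifdef SOOPT_MBUF_EXAMPLE
static int
example_soopt_set(struct sockopt *sopt, struct mbuf **mp)
{
	int error;

	error = soopt_getm(sopt, mp);	/* chain sized from sopt_valsize */
	if (error != 0)
		return (error);
	/* copies option bytes in; frees the chain itself on failure */
	error = soopt_mcopyin(sopt, *mp);
	return (error);
}
#endif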
2503
2504/*
2505 * sohasoutofband(): protocol notifies socket layer of the arrival of new
2506 * out-of-band data, which will then notify socket consumers.
2507 */
2508void
2509sohasoutofband(struct socket *so)
2510{
2512	if (so->so_sigio != NULL)
2513		pgsigio(&so->so_sigio, SIGURG, 0);
2514	selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
2515}
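
/*
 * Illustrative sketch (editor's addition, not part of the original
 * revision): a protocol's input path typically records the urgent
 * mark and then notifies consumers, roughly:
 *
 *	so->so_oobmark = so->so_rcv.sb_cc + urgent_offset;
 *	sohasoutofband(so);
 *
 * 'urgent_offset' is a hypothetical name for the offset of the urgent
 * byte within the newly arrived data.
 */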
2516
2517int
2518sopoll(struct socket *so, int events, struct ucred *active_cred,
2519    struct thread *td)
2520{
2521
2522	/* XXXRW: Temporary debugging. */
2523	KASSERT(so->so_proto->pr_usrreqs->pru_sopoll != sopoll,
2524	    ("sopoll: protocol calls sopoll"));
2525
2526	return (so->so_proto->pr_usrreqs->pru_sopoll(so, events, active_cred,
2527	    td));
2528}
2529
2530int
2531sopoll_generic(struct socket *so, int events, struct ucred *active_cred,
2532    struct thread *td)
2533{
2534	int revents = 0;
2535
2536	SOCKBUF_LOCK(&so->so_snd);
2537	SOCKBUF_LOCK(&so->so_rcv);
2538	if (events & (POLLIN | POLLRDNORM))
2539		if (soreadable(so))
2540			revents |= events & (POLLIN | POLLRDNORM);
2541
2542	if (events & POLLINIGNEOF)
2543		if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat ||
2544		    !TAILQ_EMPTY(&so->so_comp) || so->so_error)
2545			revents |= POLLINIGNEOF;
2546
2547	if (events & (POLLOUT | POLLWRNORM))
2548		if (sowriteable(so))
2549			revents |= events & (POLLOUT | POLLWRNORM);
2550
2551	if (events & (POLLPRI | POLLRDBAND))
2552		if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK))
2553			revents |= events & (POLLPRI | POLLRDBAND);
2554
2555	if (revents == 0) {
2556		if (events &
2557		    (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM |
2558		     POLLRDBAND)) {
2559			selrecord(td, &so->so_rcv.sb_sel);
2560			so->so_rcv.sb_flags |= SB_SEL;
2561		}
2562
2563		if (events & (POLLOUT | POLLWRNORM)) {
2564			selrecord(td, &so->so_snd.sb_sel);
2565			so->so_snd.sb_flags |= SB_SEL;
2566		}
2567	}
2568
2569	SOCKBUF_UNLOCK(&so->so_rcv);
2570	SOCKBUF_UNLOCK(&so->so_snd);
2571	return (revents);
2572}
2573
2574int
2575soo_kqfilter(struct file *fp, struct knote *kn)
2576{
2577	struct socket *so = kn->kn_fp->f_data;
2578	struct sockbuf *sb;
2579
2580	switch (kn->kn_filter) {
2581	case EVFILT_READ:
2582		if (so->so_options & SO_ACCEPTCONN)
2583			kn->kn_fop = &solisten_filtops;
2584		else
2585			kn->kn_fop = &soread_filtops;
2586		sb = &so->so_rcv;
2587		break;
2588	case EVFILT_WRITE:
2589		kn->kn_fop = &sowrite_filtops;
2590		sb = &so->so_snd;
2591		break;
2592	default:
2593		return (EINVAL);
2594	}
2595
2596	SOCKBUF_LOCK(sb);
2597	knlist_add(&sb->sb_sel.si_note, kn, 1);
2598	sb->sb_flags |= SB_KNOTE;
2599	SOCKBUF_UNLOCK(sb);
2600	return (0);
2601}
2602
2603static void
2604filt_sordetach(struct knote *kn)
2605{
2606	struct socket *so = kn->kn_fp->f_data;
2607
2608	SOCKBUF_LOCK(&so->so_rcv);
2609	knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1);
2610	if (knlist_empty(&so->so_rcv.sb_sel.si_note))
2611		so->so_rcv.sb_flags &= ~SB_KNOTE;
2612	SOCKBUF_UNLOCK(&so->so_rcv);
2613}
2614
2615/*ARGSUSED*/
2616static int
2617filt_soread(struct knote *kn, long hint)
2618{
2619	struct socket *so;
2620
2621	so = kn->kn_fp->f_data;
2622	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2623
2624	kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
2625	if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
2626		kn->kn_flags |= EV_EOF;
2627		kn->kn_fflags = so->so_error;
2628		return (1);
2629	} else if (so->so_error)	/* temporary udp error */
2630		return (1);
2631	else if (kn->kn_sfflags & NOTE_LOWAT)
2632		return (kn->kn_data >= kn->kn_sdata);
2633	else
2634		return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat);
2635}
2636
2637static void
2638filt_sowdetach(struct knote *kn)
2639{
2640	struct socket *so = kn->kn_fp->f_data;
2641
2642	SOCKBUF_LOCK(&so->so_snd);
2643	knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1);
2644	if (knlist_empty(&so->so_snd.sb_sel.si_note))
2645		so->so_snd.sb_flags &= ~SB_KNOTE;
2646	SOCKBUF_UNLOCK(&so->so_snd);
2647}
2648
2649/*ARGSUSED*/
2650static int
2651filt_sowrite(struct knote *kn, long hint)
2652{
2653	struct socket *so;
2654
2655	so = kn->kn_fp->f_data;
2656	SOCKBUF_LOCK_ASSERT(&so->so_snd);
2657	kn->kn_data = sbspace(&so->so_snd);
2658	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
2659		kn->kn_flags |= EV_EOF;
2660		kn->kn_fflags = so->so_error;
2661		return (1);
2662	} else if (so->so_error)	/* temporary udp error */
2663		return (1);
2664	else if (((so->so_state & SS_ISCONNECTED) == 0) &&
2665	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
2666		return (0);
2667	else if (kn->kn_sfflags & NOTE_LOWAT)
2668		return (kn->kn_data >= kn->kn_sdata);
2669	else
2670		return (kn->kn_data >= so->so_snd.sb_lowat);
2671}
2672
2673/*ARGSUSED*/
2674static int
2675filt_solisten(struct knote *kn, long hint)
2676{
2677	struct socket *so = kn->kn_fp->f_data;
2678
2679	kn->kn_data = so->so_qlen;
2680	return (! TAILQ_EMPTY(&so->so_comp));
2681	return (!TAILQ_EMPTY(&so->so_comp));
2682
2683int
2684socheckuid(struct socket *so, uid_t uid)
2685{
2686
2687	if (so == NULL)
2688		return (EPERM);
2689	if (so->so_cred->cr_uid != uid)
2690		return (EPERM);
2691	return (0);
2692}
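
/*
 * Illustrative sketch (editor's addition, not part of the original
 * revision): a typical caller uses socheckuid() to confirm socket
 * ownership before operating on a socket on behalf of a credential,
 * e.g.:
 *
 *	if (socheckuid(so, td->td_ucred->cr_uid) != 0)
 *		return (EPERM);
 */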
2693
2694static int
2695somaxconn_sysctl(SYSCTL_HANDLER_ARGS)
2696{
2697	int error;
2698	int val;
2699
2700	val = somaxconn;
2701	error = sysctl_handle_int(oidp, &val, sizeof(int), req);
2702	if (error || !req->newptr)
2703		return (error);
2704
2705	if (val < 1 || val > USHRT_MAX)
2706		return (EINVAL);
2707
2708	somaxconn = val;
2709	return (0);
2710}
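
/*
 * Illustrative sketch (editor's addition, not part of the excerpt): a
 * handler such as the one above is attached to the sysctl tree with
 * SYSCTL_PROC(); the exact arguments below are indicative only.
 */
#ifdef SOMAXCONN_SYSCTL_EXAMPLE
SYSCTL_PROC(_kern_ipc, OID_AUTO, somaxconn_example,
    CTLTYPE_INT | CTLFLAG_RW, 0, 0, somaxconn_sysctl, "I",
    "Maximum listen socket pending connection accept queue size");
#endif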
2711