uipc_socket.c revision 131005
1/*
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 4. Neither the name of the University nor the names of its contributors
14 *    may be used to endorse or promote products derived from this software
15 *    without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 *
29 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
30 */
31
32#include <sys/cdefs.h>
33__FBSDID("$FreeBSD: head/sys/kern/uipc_socket.c 131005 2004-06-24 00:54:26Z rwatson $");
34
35#include "opt_inet.h"
36#include "opt_mac.h"
37#include "opt_zero.h"
38
39#include <sys/param.h>
40#include <sys/systm.h>
41#include <sys/fcntl.h>
42#include <sys/limits.h>
43#include <sys/lock.h>
44#include <sys/mac.h>
45#include <sys/malloc.h>
46#include <sys/mbuf.h>
47#include <sys/mutex.h>
48#include <sys/domain.h>
49#include <sys/file.h>			/* for struct knote */
50#include <sys/kernel.h>
51#include <sys/event.h>
52#include <sys/poll.h>
53#include <sys/proc.h>
54#include <sys/protosw.h>
55#include <sys/socket.h>
56#include <sys/socketvar.h>
57#include <sys/resourcevar.h>
58#include <sys/signalvar.h>
59#include <sys/sysctl.h>
60#include <sys/uio.h>
61#include <sys/jail.h>
62
63#include <vm/uma.h>
64
65
66#ifdef INET
67static int	 do_setopt_accept_filter(struct socket *so, struct sockopt *sopt);
68#endif
69
70static void	filt_sordetach(struct knote *kn);
71static int	filt_soread(struct knote *kn, long hint);
72static void	filt_sowdetach(struct knote *kn);
73static int	filt_sowrite(struct knote *kn, long hint);
74static int	filt_solisten(struct knote *kn, long hint);
75
76static struct filterops solisten_filtops =
77	{ 1, NULL, filt_sordetach, filt_solisten };
78static struct filterops soread_filtops =
79	{ 1, NULL, filt_sordetach, filt_soread };
80static struct filterops sowrite_filtops =
81	{ 1, NULL, filt_sowdetach, filt_sowrite };
82
83uma_zone_t socket_zone;
84so_gen_t	so_gencnt;	/* generation count for sockets */
85
86MALLOC_DEFINE(M_SONAME, "soname", "socket name");
87MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
88
89SYSCTL_DECL(_kern_ipc);
90
91static int somaxconn = SOMAXCONN;
92SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW,
93    &somaxconn, 0, "Maximum pending socket connection queue size");
94static int numopensockets;
95SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
96    &numopensockets, 0, "Number of open sockets");
97#ifdef ZERO_COPY_SOCKETS
98/* These aren't static because they're used in other files. */
99int so_zero_copy_send = 1;
100int so_zero_copy_receive = 1;
101SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0,
102    "Zero copy controls");
103SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW,
104    &so_zero_copy_receive, 0, "Enable zero copy receive");
105SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW,
106    &so_zero_copy_send, 0, "Enable zero copy send");
107#endif /* ZERO_COPY_SOCKETS */
108
109struct mtx accept_mtx;
110MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);
111
112
113/*
114 * Socket operation routines.
115 * These routines are called by the routines in
116 * sys_socket.c or from a system process, and
117 * implement the semantics of socket operations by
118 * switching out to the protocol specific routines.
119 */
120
121/*
122 * Get a socket structure from our zone, and initialize it.
123 * Note that it would probably be better to allocate socket
124 * and PCB at the same time, but I'm not convinced that all
125 * the protocols can be easily modified to do this.
126 *
127 * soalloc() returns a socket with a ref count of 0.
128 */
129struct socket *
130soalloc(int mflags)
131{
132	struct socket *so;
133#ifdef MAC
134	int error;
135#endif
136
137	so = uma_zalloc(socket_zone, mflags | M_ZERO);
138	if (so != NULL) {
139#ifdef MAC
140		error = mac_init_socket(so, mflags);
141		if (error != 0) {
142			uma_zfree(socket_zone, so);
143			so = NULL;
144			return so;
145		}
146#endif
147		SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
148		SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
149		/* XXX race condition for reentrant kernel */
150		so->so_gencnt = ++so_gencnt;
151		/* sx_init(&so->so_sxlock, "socket sxlock"); */
152		TAILQ_INIT(&so->so_aiojobq);
153		++numopensockets;
154	}
155	return so;
156}
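
/*
 * Illustrative sketch of the reference protocol these routines
 * implement (not compiled; see socreate() and soclose() below):
 *
 *	so = soalloc(M_WAITOK);		so_count == 0
 *	SOCK_LOCK(so);
 *	soref(so);			so_count == 1
 *	SOCK_UNLOCK(so);
 *	...
 *	SOCK_LOCK(so);
 *	sorele(so);			drops the reference; the socket
 *					is freed once so_count reaches 0
 */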
157
158/*
159 * socreate returns a socket with a ref count of 1.  The socket should be
160 * closed with soclose().
161 */
162int
163socreate(dom, aso, type, proto, cred, td)
164	int dom;
165	struct socket **aso;
166	int type;
167	int proto;
168	struct ucred *cred;
169	struct thread *td;
170{
171	struct protosw *prp;
172	struct socket *so;
173	int error;
174
175	if (proto)
176		prp = pffindproto(dom, proto, type);
177	else
178		prp = pffindtype(dom, type);
179
180	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL)
181		return (EPROTONOSUPPORT);
182
183	if (jailed(cred) && jail_socket_unixiproute_only &&
184	    prp->pr_domain->dom_family != PF_LOCAL &&
185	    prp->pr_domain->dom_family != PF_INET &&
186	    prp->pr_domain->dom_family != PF_ROUTE) {
187		return (EPROTONOSUPPORT);
188	}
189
190	if (prp->pr_type != type)
191		return (EPROTOTYPE);
192	so = soalloc(M_WAITOK);
193	if (so == NULL)
194		return (ENOBUFS);
195
196	TAILQ_INIT(&so->so_incomp);
197	TAILQ_INIT(&so->so_comp);
198	so->so_type = type;
199	so->so_cred = crhold(cred);
200	so->so_proto = prp;
201#ifdef MAC
202	mac_create_socket(cred, so);
203#endif
204	SOCK_LOCK(so);
205	soref(so);
206	SOCK_UNLOCK(so);
207	error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
208	if (error) {
209		SOCK_LOCK(so);
210		so->so_state |= SS_NOFDREF;
211		sorele(so);
212		return (error);
213	}
214	*aso = so;
215	return (0);
216}
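
/*
 * For reference, a typical userland call that arrives here through the
 * socket(2) system call (illustrative only):
 *
 *	int s = socket(PF_INET, SOCK_STREAM, 0);
 *
 * maps to dom = PF_INET, type = SOCK_STREAM, proto = 0, so the protocol
 * switch entry is found with pffindtype() rather than pffindproto().
 */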
217
218int
219sobind(so, nam, td)
220	struct socket *so;
221	struct sockaddr *nam;
222	struct thread *td;
223{
224
225	return ((*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td));
226}
227
228void
229sodealloc(struct socket *so)
230{
231
232	KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
233	so->so_gencnt = ++so_gencnt;
234	if (so->so_rcv.sb_hiwat)
235		(void)chgsbsize(so->so_cred->cr_uidinfo,
236		    &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
237	if (so->so_snd.sb_hiwat)
238		(void)chgsbsize(so->so_cred->cr_uidinfo,
239		    &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
240#ifdef INET
241	/* remove accept filter if one is present. */
242	if (so->so_accf != NULL)
243		do_setopt_accept_filter(so, NULL);
244#endif
245#ifdef MAC
246	mac_destroy_socket(so);
247#endif
248	crfree(so->so_cred);
249	SOCKBUF_LOCK_DESTROY(&so->so_snd);
250	SOCKBUF_LOCK_DESTROY(&so->so_rcv);
251	/* sx_destroy(&so->so_sxlock); */
252	uma_zfree(socket_zone, so);
253	--numopensockets;
254}
255
256int
257solisten(so, backlog, td)
258	struct socket *so;
259	int backlog;
260	struct thread *td;
261{
262	int error;
263
264	/*
265	 * XXXRW: Ordering issue here -- perhaps we need to set
266	 * SO_ACCEPTCONN before the call to pru_listen()?
267	 * XXXRW: General atomic test-and-set concerns here also.
268	 */
269	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
270			    SS_ISDISCONNECTING))
271		return (EINVAL);
272	error = (*so->so_proto->pr_usrreqs->pru_listen)(so, td);
273	if (error)
274		return (error);
275	ACCEPT_LOCK();
276	if (TAILQ_EMPTY(&so->so_comp)) {
277		SOCK_LOCK(so);
278		so->so_options |= SO_ACCEPTCONN;
279		SOCK_UNLOCK(so);
280	}
281	if (backlog < 0 || backlog > somaxconn)
282		backlog = somaxconn;
283	so->so_qlimit = backlog;
284	ACCEPT_UNLOCK();
285	return (0);
286}
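
/*
 * Worked example of the backlog clamping above (illustrative, assuming
 * the default kern.ipc.somaxconn of 128):
 *
 *	listen(s, 32);		so_qlimit = 32
 *	listen(s, 1024);	so_qlimit = 128 (capped at somaxconn)
 *	listen(s, -1);		so_qlimit = 128 (negative values clamped)
 */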
287
288void
289sofree(so)
290	struct socket *so;
291{
292	struct socket *head;
293
294	KASSERT(so->so_count == 0, ("socket %p so_count not 0", so));
295	SOCK_LOCK_ASSERT(so);
296
297	if (so->so_pcb != NULL || (so->so_state & SS_NOFDREF) == 0) {
298		SOCK_UNLOCK(so);
299		return;
300	}
301
302	SOCK_UNLOCK(so);
303	ACCEPT_LOCK();
304	head = so->so_head;
305	if (head != NULL) {
306		KASSERT((so->so_qstate & SQ_COMP) != 0 ||
307		    (so->so_qstate & SQ_INCOMP) != 0,
308		    ("sofree: so_head != NULL, but neither SQ_COMP nor "
309		    "SQ_INCOMP"));
310		KASSERT((so->so_qstate & SQ_COMP) == 0 ||
311		    (so->so_qstate & SQ_INCOMP) == 0,
312		    ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
313		/*
314		 * accept(2) is responsible for draining the completed
315		 * connection queue and freeing those sockets, so
316		 * we just return here if this socket is currently
317		 * on the completed connection queue.  Otherwise,
318		 * accept(2) may hang after select(2) has indicated
319		 * that a listening socket was ready.  If it's an
320		 * incomplete connection, we remove it from the queue
321		 * and free it; otherwise, it won't be released until
322		 * the listening socket is closed.
323		 */
324		if ((so->so_qstate & SQ_COMP) != 0) {
325			ACCEPT_UNLOCK();
326			return;
327		}
328		TAILQ_REMOVE(&head->so_incomp, so, so_list);
329		head->so_incqlen--;
330		so->so_qstate &= ~SQ_INCOMP;
331		so->so_head = NULL;
332	}
333	KASSERT((so->so_qstate & SQ_COMP) == 0 &&
334	    (so->so_qstate & SQ_INCOMP) == 0,
335	    ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
336	    so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
337	ACCEPT_UNLOCK();
338	SOCKBUF_LOCK(&so->so_snd);
339	so->so_snd.sb_flags |= SB_NOINTR;
340	(void)sblock(&so->so_snd, M_WAITOK);
341	/*
342	 * socantsendmore_locked() drops the socket buffer mutex so that it
343	 * can safely perform wakeups.  Re-acquire the mutex before
344	 * continuing.
345	 */
346	socantsendmore_locked(so);
347	SOCKBUF_LOCK(&so->so_snd);
348	sbunlock(&so->so_snd);
349	sbrelease_locked(&so->so_snd, so);
350	SOCKBUF_UNLOCK(&so->so_snd);
351	sorflush(so);
352	sodealloc(so);
353}
354
355/*
356 * Close a socket on last file table reference removal.
357 * Initiate disconnect if connected.
358 * Free socket when disconnect complete.
359 *
360 * This function will sorele() the socket.  Note that soclose() may be
361 * called prior to the ref count reaching zero.  The actual socket
362 * structure will not be freed until the ref count reaches zero.
363 */
364int
365soclose(so)
366	struct socket *so;
367{
368	int error = 0;
369
370	funsetown(&so->so_sigio);
371	if (so->so_options & SO_ACCEPTCONN) {
372		struct socket *sp;
373		ACCEPT_LOCK();
374		while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
375			TAILQ_REMOVE(&so->so_incomp, sp, so_list);
376			so->so_incqlen--;
377			sp->so_qstate &= ~SQ_INCOMP;
378			sp->so_head = NULL;
379			ACCEPT_UNLOCK();
380			(void) soabort(sp);
381			ACCEPT_LOCK();
382		}
383		while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
384			TAILQ_REMOVE(&so->so_comp, sp, so_list);
385			so->so_qlen--;
386			sp->so_qstate &= ~SQ_COMP;
387			sp->so_head = NULL;
388			ACCEPT_UNLOCK();
389			(void) soabort(sp);
390			ACCEPT_LOCK();
391		}
392		ACCEPT_UNLOCK();
393	}
394	if (so->so_pcb == NULL)
395		goto discard;
396	if (so->so_state & SS_ISCONNECTED) {
397		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
398			error = sodisconnect(so);
399			if (error)
400				goto drop;
401		}
402		if (so->so_options & SO_LINGER) {
403			if ((so->so_state & SS_ISDISCONNECTING) &&
404			    (so->so_state & SS_NBIO))
405				goto drop;
406			while (so->so_state & SS_ISCONNECTED) {
407				error = tsleep(&so->so_timeo,
408				    PSOCK | PCATCH, "soclos", so->so_linger * hz);
409				if (error)
410					break;
411			}
412		}
413	}
414drop:
415	if (so->so_pcb != NULL) {
416		int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
417		if (error == 0)
418			error = error2;
419	}
420discard:
421	SOCK_LOCK(so);
422	KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
423	so->so_state |= SS_NOFDREF;
424	sorele(so);
425	return (error);
426}
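
/*
 * Example of the linger handling above (illustrative): with SO_LINGER
 * set and l_linger = 5, a blocking close(2) sleeps in the "soclos"
 * loop for up to 5 seconds waiting for the disconnect to complete; if
 * the socket is non-blocking (SS_NBIO) and already disconnecting,
 * close falls straight through to the drop label instead.
 */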
427
428/*
429 * Must be called at splnet...
430 */
431int
432soabort(so)
433	struct socket *so;
434{
435	int error;
436
437	error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
438	if (error) {
439		SOCK_LOCK(so);
440		sotryfree(so);	/* note: does not decrement the ref count */
441		return error;
442	}
443	return (0);
444}
445
446int
447soaccept(so, nam)
448	struct socket *so;
449	struct sockaddr **nam;
450{
451	int error;
452
453	SOCK_LOCK(so);
454	KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
455	so->so_state &= ~SS_NOFDREF;
456	SOCK_UNLOCK(so);
457	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
458	return (error);
459}
460
461int
462soconnect(so, nam, td)
463	struct socket *so;
464	struct sockaddr *nam;
465	struct thread *td;
466{
467	int error;
468
469	if (so->so_options & SO_ACCEPTCONN)
470		return (EOPNOTSUPP);
471	/*
472	 * If protocol is connection-based, can only connect once.
473	 * Otherwise, if connected, try to disconnect first.
474	 * This allows user to disconnect by connecting to, e.g.,
475	 * a null address.
476	 */
477	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
478	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
479	    (error = sodisconnect(so))))
480		error = EISCONN;
481	else
482		error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td);
483	return (error);
484}
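
/*
 * Example (illustrative): on a datagram socket such as UDP, a second
 * connect(2) to a new address first dissolves the existing association
 * via sodisconnect() above; connection-oriented protocols
 * (PR_CONNREQUIRED) instead fail with EISCONN.
 */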
485
486int
487soconnect2(so1, so2)
488	struct socket *so1;
489	struct socket *so2;
490{
491
492	return ((*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2));
493}
494
495int
496sodisconnect(so)
497	struct socket *so;
498{
499	int error;
500
501	if ((so->so_state & SS_ISCONNECTED) == 0)
502		return (ENOTCONN);
503	if (so->so_state & SS_ISDISCONNECTING)
504		return (EALREADY);
505	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
506	return (error);
507}
508
509#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
510/*
511 * Send on a socket.
512 * If send must go all at once and message is larger than
513 * send buffering, then hard error.
514 * Lock against other senders.
515 * If must go all at once and not enough room now, then
516 * inform user that this would block and do nothing.
517 * Otherwise, if nonblocking, send as much as possible.
518 * The data to be sent is described by "uio" if nonzero,
519 * otherwise by the mbuf chain "top" (which must be null
520 * if uio is not).  Data provided in mbuf chain must be small
521 * enough to send all at once.
522 *
523 * Returns nonzero on error, timeout or signal; callers
524 * must check for short counts if EINTR/ERESTART are returned.
525 * Data and control buffers are freed on return.
526 */
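
/*
 * Userland sketch of the cases described above (illustrative only;
 * default buffer sizes assumed, error handling elided):
 *
 *	write(s, buf, 2048);		stream socket: sleeps in sbwait()
 *					until buffer space is available
 *	fcntl(s, F_SETFL, O_NONBLOCK);
 *	write(s, buf, 2048);		EWOULDBLOCK if the send buffer
 *					cannot take the data now
 *	sendto(u, big, 70000, 0, sa, salen);
 *					atomic protocol (e.g. UDP):
 *					EMSGSIZE, larger than sb_hiwat
 */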
527
528#ifdef ZERO_COPY_SOCKETS
529struct so_zerocopy_stats{
530	int size_ok;
531	int align_ok;
532	int found_ifp;
533};
534struct so_zerocopy_stats so_zerocp_stats = {0,0,0};
535#include <netinet/in.h>
536#include <net/route.h>
537#include <netinet/in_pcb.h>
538#include <vm/vm.h>
539#include <vm/vm_page.h>
540#include <vm/vm_object.h>
541#endif /*ZERO_COPY_SOCKETS*/
542
543int
544sosend(so, addr, uio, top, control, flags, td)
545	struct socket *so;
546	struct sockaddr *addr;
547	struct uio *uio;
548	struct mbuf *top;
549	struct mbuf *control;
550	int flags;
551	struct thread *td;
552{
553	struct mbuf **mp;
554	struct mbuf *m;
555	long space, len = 0, resid;
556	int clen = 0, error, dontroute;
557	int atomic = sosendallatonce(so) || top;
558#ifdef ZERO_COPY_SOCKETS
559	int cow_send;
560#endif /* ZERO_COPY_SOCKETS */
561
562	if (uio != NULL)
563		resid = uio->uio_resid;
564	else
565		resid = top->m_pkthdr.len;
566	/*
567	 * In theory resid should be unsigned.
568	 * However, space must be signed, as it might be less than 0
569	 * if we over-committed, and we must use a signed comparison
570	 * of space and resid.  On the other hand, a negative resid
571	 * causes us to loop sending 0-length segments to the protocol.
572	 *
573	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
574	 * type sockets since that's an error.
575	 */
576	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
577		error = EINVAL;
578		goto out;
579	}
580
581	dontroute =
582	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
583	    (so->so_proto->pr_flags & PR_ATOMIC);
584	if (td != NULL)
585		td->td_proc->p_stats->p_ru.ru_msgsnd++;
586	if (control != NULL)
587		clen = control->m_len;
588#define	snderr(errno)	{ error = (errno); goto release; }
589
590	SOCKBUF_LOCK(&so->so_snd);
591restart:
592	SOCKBUF_LOCK_ASSERT(&so->so_snd);
593	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
594	if (error)
595		goto out_locked;
596	do {
597		SOCKBUF_LOCK_ASSERT(&so->so_snd);
598		if (so->so_snd.sb_state & SBS_CANTSENDMORE)
599			snderr(EPIPE);
600		if (so->so_error) {
601			error = so->so_error;
602			so->so_error = 0;
603			goto release;
604		}
605		if ((so->so_state & SS_ISCONNECTED) == 0) {
606			/*
607			 * `sendto' and `sendmsg' are allowed on a connection-
608			 * based socket if it supports implied connect.
609			 * Return ENOTCONN if not connected and no address is
610			 * supplied.
611			 */
612			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
613			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
614				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
615				    !(resid == 0 && clen != 0))
616					snderr(ENOTCONN);
617			} else if (addr == NULL)
618			    snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
619				   ENOTCONN : EDESTADDRREQ);
620		}
621		space = sbspace(&so->so_snd);
622		if (flags & MSG_OOB)
623			space += 1024;
624		if ((atomic && resid > so->so_snd.sb_hiwat) ||
625		    clen > so->so_snd.sb_hiwat)
626			snderr(EMSGSIZE);
627		if (space < resid + clen &&
628		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
629			if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO))
630				snderr(EWOULDBLOCK);
631			sbunlock(&so->so_snd);
632			error = sbwait(&so->so_snd);
633			if (error)
634				goto out_locked;
635			goto restart;
636		}
637		SOCKBUF_UNLOCK(&so->so_snd);
638		mp = &top;
639		space -= clen;
640		do {
641		    if (uio == NULL) {
642			/*
643			 * Data is prepackaged in "top".
644			 */
645			resid = 0;
646			if (flags & MSG_EOR)
647				top->m_flags |= M_EOR;
648		    } else do {
649#ifdef ZERO_COPY_SOCKETS
650			cow_send = 0;
651#endif /* ZERO_COPY_SOCKETS */
652			if (resid >= MINCLSIZE) {
653#ifdef ZERO_COPY_SOCKETS
654				if (top == NULL) {
655					MGETHDR(m, M_TRYWAIT, MT_DATA);
656					if (m == NULL) {
657						error = ENOBUFS;
658						SOCKBUF_LOCK(&so->so_snd);
659						goto release;
660					}
661					m->m_pkthdr.len = 0;
662					m->m_pkthdr.rcvif = (struct ifnet *)0;
663				} else {
664					MGET(m, M_TRYWAIT, MT_DATA);
665					if (m == NULL) {
666						error = ENOBUFS;
667						SOCKBUF_LOCK(&so->so_snd);
668						goto release;
669					}
670				}
671				if (so_zero_copy_send &&
672				    resid>=PAGE_SIZE &&
673				    space>=PAGE_SIZE &&
674				    uio->uio_iov->iov_len>=PAGE_SIZE) {
675					so_zerocp_stats.size_ok++;
676					if (!((vm_offset_t)
677					  uio->uio_iov->iov_base & PAGE_MASK)){
678						so_zerocp_stats.align_ok++;
679						cow_send = socow_setup(m, uio);
680					}
681				}
682				if (!cow_send) {
683					MCLGET(m, M_TRYWAIT);
684					if ((m->m_flags & M_EXT) == 0) {
685						m_free(m);
686						m = NULL;
687					} else {
688						len = min(min(MCLBYTES, resid), space);
689					}
690				} else
691					len = PAGE_SIZE;
692#else /* ZERO_COPY_SOCKETS */
693				if (top == NULL) {
694					m = m_getcl(M_TRYWAIT, MT_DATA, M_PKTHDR);
695					m->m_pkthdr.len = 0;
696					m->m_pkthdr.rcvif = (struct ifnet *)0;
697				} else
698					m = m_getcl(M_TRYWAIT, MT_DATA, 0);
699				len = min(min(MCLBYTES, resid), space);
700#endif /* ZERO_COPY_SOCKETS */
701			} else {
702				if (top == NULL) {
703					m = m_gethdr(M_TRYWAIT, MT_DATA);
704					m->m_pkthdr.len = 0;
705					m->m_pkthdr.rcvif = (struct ifnet *)0;
706
707					len = min(min(MHLEN, resid), space);
708					/*
709					 * For datagram protocols, leave room
710					 * for protocol headers in first mbuf.
711					 */
712					if (atomic && m && len < MHLEN)
713						MH_ALIGN(m, len);
714				} else {
715					m = m_get(M_TRYWAIT, MT_DATA);
716					len = min(min(MLEN, resid), space);
717				}
718			}
719			if (m == NULL) {
720				error = ENOBUFS;
721				SOCKBUF_LOCK(&so->so_snd);
722				goto release;
723			}
724
725			space -= len;
726#ifdef ZERO_COPY_SOCKETS
727			if (cow_send)
728				error = 0;
729			else
730#endif /* ZERO_COPY_SOCKETS */
731			error = uiomove(mtod(m, void *), (int)len, uio);
732			resid = uio->uio_resid;
733			m->m_len = len;
734			*mp = m;
735			top->m_pkthdr.len += len;
736			if (error) {
737				SOCKBUF_LOCK(&so->so_snd);
738				goto release;
739			}
740			mp = &m->m_next;
741			if (resid <= 0) {
742				if (flags & MSG_EOR)
743					top->m_flags |= M_EOR;
744				break;
745			}
746		    } while (space > 0 && atomic);
747		    if (dontroute) {
748			    SOCK_LOCK(so);
749			    so->so_options |= SO_DONTROUTE;
750			    SOCK_UNLOCK(so);
751		    }
752		    /*
753		     * XXX all the SBS_CANTSENDMORE checks previously
754		     * done could be out of date.  We could have received
755		     * a reset packet in an interrupt or maybe we slept
756		     * while doing page faults in uiomove() etc. We could
757		     * probably recheck again inside the splnet() protection
758		     * here, but there are probably other places that this
759		     * also happens.  We must rethink this.
760		     */
761		    error = (*so->so_proto->pr_usrreqs->pru_send)(so,
762			(flags & MSG_OOB) ? PRUS_OOB :
763			/*
764			 * If the user set MSG_EOF, the protocol
765			 * understands this flag, and there is nothing left
766			 * to send, then use PRUS_EOF instead of PRUS_SEND.
767			 */
768			((flags & MSG_EOF) &&
769			 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
770			 (resid <= 0)) ?
771				PRUS_EOF :
772			/* If there is more to send set PRUS_MORETOCOME */
773			(resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
774			top, addr, control, td);
775		    if (dontroute) {
776			    SOCK_LOCK(so);
777			    so->so_options &= ~SO_DONTROUTE;
778			    SOCK_UNLOCK(so);
779		    }
780		    clen = 0;
781		    control = NULL;
782		    top = NULL;
783		    mp = &top;
784		    if (error) {
785			SOCKBUF_LOCK(&so->so_snd);
786			goto release;
787		    }
788		} while (resid && space > 0);
789		SOCKBUF_LOCK(&so->so_snd);
790	} while (resid);
791
792release:
793	SOCKBUF_LOCK_ASSERT(&so->so_snd);
794	sbunlock(&so->so_snd);
795out_locked:
796	SOCKBUF_LOCK_ASSERT(&so->so_snd);
797	SOCKBUF_UNLOCK(&so->so_snd);
798out:
799	if (top != NULL)
800		m_freem(top);
801	if (control != NULL)
802		m_freem(control);
803	return (error);
804}
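
/*
 * Note on the PRUS_* selection above: for a stream socket, each mbuf
 * chain is handed to pru_send() as it is filled, with PRUS_MORETOCOME
 * set whenever more user data remains and buffer space is still
 * available.  This lets a protocol such as TCP defer transmission of a
 * small segment because another pru_send() call is known to follow
 * immediately.
 */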
805
806/*
807 * Implement receive operations on a socket.
808 * We depend on the way that records are added to the sockbuf
809 * by sbappend*.  In particular, each record (mbufs linked through m_next)
810 * must begin with an address if the protocol so specifies,
811 * followed by an optional mbuf or mbufs containing ancillary data,
812 * and then zero or more mbufs of data.
813 * To avoid holding the socket buffer lock for the entire time here, we
814 * release it while doing the actual copy to user space.
815 * Although the sockbuf is locked, new data may still be appended,
816 * and thus we must maintain consistency of the sockbuf during that time.
817 *
818 * The caller may receive the data as a single mbuf chain by supplying
819 * an mbuf **mp0 for use in returning the chain.  The uio is then used
820 * only for the count in uio_resid.
821 */
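
/*
 * Userland sketch of the receive flags handled below (illustrative
 * only):
 *
 *	recv(s, buf, len, MSG_PEEK);	 copy data without consuming it
 *	recv(s, buf, len, MSG_WAITALL);	 block until len bytes arrive,
 *					 EOF, or an error occurs
 *	recv(s, buf, len, MSG_DONTWAIT); EWOULDBLOCK instead of sleeping
 */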
822int
823soreceive(so, psa, uio, mp0, controlp, flagsp)
824	struct socket *so;
825	struct sockaddr **psa;
826	struct uio *uio;
827	struct mbuf **mp0;
828	struct mbuf **controlp;
829	int *flagsp;
830{
831	struct mbuf *m, **mp;
832	int flags, len, error, offset;
833	struct protosw *pr = so->so_proto;
834	struct mbuf *nextrecord;
835	int moff, type = 0;
836	int orig_resid = uio->uio_resid;
837
838	mp = mp0;
839	if (psa != NULL)
840		*psa = 0;
841	if (controlp != NULL)
842		*controlp = 0;
843	if (flagsp != NULL)
844		flags = *flagsp &~ MSG_EOR;
845	else
846		flags = 0;
847	if (flags & MSG_OOB) {
848		m = m_get(M_TRYWAIT, MT_DATA);
849		if (m == NULL)
850			return (ENOBUFS);
851		error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
852		if (error)
853			goto bad;
854		do {
855#ifdef ZERO_COPY_SOCKETS
856			if (so_zero_copy_receive) {
857				vm_page_t pg;
858				int disposable;
859
860				if ((m->m_flags & M_EXT)
861				 && (m->m_ext.ext_type == EXT_DISPOSABLE))
862					disposable = 1;
863				else
864					disposable = 0;
865
866				pg = PHYS_TO_VM_PAGE(vtophys(mtod(m, caddr_t)));
867				if (uio->uio_offset == -1)
868					uio->uio_offset =IDX_TO_OFF(pg->pindex);
869
870				error = uiomoveco(mtod(m, void *),
871						  min(uio->uio_resid, m->m_len),
872						  uio, pg->object,
873						  disposable);
874			} else
875#endif /* ZERO_COPY_SOCKETS */
876			error = uiomove(mtod(m, void *),
877			    (int) min(uio->uio_resid, m->m_len), uio);
878			m = m_free(m);
879		} while (uio->uio_resid && error == 0 && m);
880bad:
881		if (m != NULL)
882			m_freem(m);
883		return (error);
884	}
885	if (mp != NULL)
886		*mp = NULL;
887	if (so->so_state & SS_ISCONFIRMING && uio->uio_resid)
888		(*pr->pr_usrreqs->pru_rcvd)(so, 0);
889
890	SOCKBUF_LOCK(&so->so_rcv);
891restart:
892	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
893	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
894	if (error)
895		goto out;
896
897	m = so->so_rcv.sb_mb;
898	/*
899	 * If we have less data than requested, block awaiting more
900	 * (subject to any timeout) if:
901	 *   1. the current count is less than the low water mark, or
902	 *   2. MSG_WAITALL is set, and it is possible to do the entire
903	 *	receive operation at once if we block (resid <= hiwat), and
904	 *   3. MSG_DONTWAIT is not set.
905	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
906	 * we have to do the receive in sections, and thus risk returning
907	 * a short count if a timeout or signal occurs after we start.
908	 */
909	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
910	    so->so_rcv.sb_cc < uio->uio_resid) &&
911	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
912	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
913	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
914		KASSERT(m != NULL || !so->so_rcv.sb_cc,
915		    ("receive: m == %p so->so_rcv.sb_cc == %u",
916		    m, so->so_rcv.sb_cc));
917		if (so->so_error) {
918			if (m != NULL)
919				goto dontblock;
920			error = so->so_error;
921			if ((flags & MSG_PEEK) == 0)
922				so->so_error = 0;
923			goto release;
924		}
925		if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
926			if (m)
927				goto dontblock;
928			else
929				goto release;
930		}
931		for (; m != NULL; m = m->m_next)
932		if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
933				m = so->so_rcv.sb_mb;
934				goto dontblock;
935			}
936		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
937		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
938			error = ENOTCONN;
939			goto release;
940		}
941		if (uio->uio_resid == 0)
942			goto release;
943		if ((so->so_state & SS_NBIO) ||
944		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
945			error = EWOULDBLOCK;
946			goto release;
947		}
948		SBLASTRECORDCHK(&so->so_rcv);
949		SBLASTMBUFCHK(&so->so_rcv);
950		sbunlock(&so->so_rcv);
951		error = sbwait(&so->so_rcv);
952		if (error)
953			goto out;
954		goto restart;
955	}
956dontblock:
957	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
958	if (uio->uio_td)
959		uio->uio_td->td_proc->p_stats->p_ru.ru_msgrcv++;
960	SBLASTRECORDCHK(&so->so_rcv);
961	SBLASTMBUFCHK(&so->so_rcv);
962	nextrecord = m->m_nextpkt;
963	if (pr->pr_flags & PR_ADDR) {
964		KASSERT(m->m_type == MT_SONAME,
965		    ("m->m_type == %d", m->m_type));
966		orig_resid = 0;
967		if (psa != NULL)
968			*psa = sodupsockaddr(mtod(m, struct sockaddr *),
969			    M_NOWAIT);
970		if (flags & MSG_PEEK) {
971			m = m->m_next;
972		} else {
973			sbfree(&so->so_rcv, m);
974			so->so_rcv.sb_mb = m_free(m);
975			m = so->so_rcv.sb_mb;
976		}
977	}
978	while (m != NULL && m->m_type == MT_CONTROL && error == 0) {
979		if (flags & MSG_PEEK) {
980			if (controlp != NULL)
981				*controlp = m_copy(m, 0, m->m_len);
982			m = m->m_next;
983		} else {
984			sbfree(&so->so_rcv, m);
985			so->so_rcv.sb_mb = m->m_next;
986			m->m_next = NULL;
987			if (pr->pr_domain->dom_externalize) {
988				SOCKBUF_UNLOCK(&so->so_rcv);
989				error = (*pr->pr_domain->dom_externalize)
990				    (m, controlp);
991				SOCKBUF_LOCK(&so->so_rcv);
992			} else if (controlp != NULL)
993				*controlp = m;
994			else
995				m_freem(m);
996			m = so->so_rcv.sb_mb;
997		}
998		if (controlp != NULL) {
999			orig_resid = 0;
1000			while (*controlp != NULL)
1001				controlp = &(*controlp)->m_next;
1002		}
1003	}
1004	if (m != NULL) {
1005		if ((flags & MSG_PEEK) == 0) {
1006			m->m_nextpkt = nextrecord;
1007			/*
1008			 * If nextrecord == NULL (this is a single chain),
1009			 * then sb_lastrecord may not be valid here if m
1010			 * was changed earlier.
1011			 */
1012			if (nextrecord == NULL) {
1013				KASSERT(so->so_rcv.sb_mb == m,
1014					("receive tailq 1"));
1015				so->so_rcv.sb_lastrecord = m;
1016			}
1017		}
1018		type = m->m_type;
1019		if (type == MT_OOBDATA)
1020			flags |= MSG_OOB;
1021	} else {
1022		if ((flags & MSG_PEEK) == 0) {
1023			KASSERT(so->so_rcv.sb_mb == m,("receive tailq 2"));
1024			so->so_rcv.sb_mb = nextrecord;
1025			SB_EMPTY_FIXUP(&so->so_rcv);
1026		}
1027	}
1028	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1029	SBLASTRECORDCHK(&so->so_rcv);
1030	SBLASTMBUFCHK(&so->so_rcv);
1031
1032	moff = 0;
1033	offset = 0;
1034	while (m != NULL && uio->uio_resid > 0 && error == 0) {
1035		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1036		if (m->m_type == MT_OOBDATA) {
1037			if (type != MT_OOBDATA)
1038				break;
1039		} else if (type == MT_OOBDATA)
1040			break;
1041		else
1042		    KASSERT(m->m_type == MT_DATA || m->m_type == MT_HEADER,
1043			("m->m_type == %d", m->m_type));
1044		so->so_rcv.sb_state &= ~SBS_RCVATMARK;
1045		len = uio->uio_resid;
1046		if (so->so_oobmark && len > so->so_oobmark - offset)
1047			len = so->so_oobmark - offset;
1048		if (len > m->m_len - moff)
1049			len = m->m_len - moff;
1050		/*
1051		 * If mp is set, just pass back the mbufs.
1052		 * Otherwise copy them out via the uio, then free.
1053		 * The sockbuf must be consistent here (sb_mb points to the
1054		 * current mbuf, m_nextpkt to the next record) when we drop
1055		 * the socket buffer lock; we must note any additions to the
1056		 * sockbuf when we reacquire it.
1057		 */
1058		if (mp == NULL) {
1059			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1060			SBLASTRECORDCHK(&so->so_rcv);
1061			SBLASTMBUFCHK(&so->so_rcv);
1062			SOCKBUF_UNLOCK(&so->so_rcv);
1063#ifdef ZERO_COPY_SOCKETS
1064			if (so_zero_copy_receive) {
1065				vm_page_t pg;
1066				int disposable;
1067
1068				if ((m->m_flags & M_EXT)
1069				 && (m->m_ext.ext_type == EXT_DISPOSABLE))
1070					disposable = 1;
1071				else
1072					disposable = 0;
1073
1074				pg = PHYS_TO_VM_PAGE(vtophys(mtod(m, caddr_t) +
1075					moff));
1076
1077				if (uio->uio_offset == -1)
1078					uio->uio_offset =IDX_TO_OFF(pg->pindex);
1079
1080				error = uiomoveco(mtod(m, char *) + moff,
1081						  (int)len, uio,pg->object,
1082						  disposable);
1083			} else
1084#endif /* ZERO_COPY_SOCKETS */
1085			error = uiomove(mtod(m, char *) + moff, (int)len, uio);
1086			SOCKBUF_LOCK(&so->so_rcv);
1087			if (error)
1088				goto release;
1089		} else
1090			uio->uio_resid -= len;
1091		if (len == m->m_len - moff) {
1092			if (m->m_flags & M_EOR)
1093				flags |= MSG_EOR;
1094			if (flags & MSG_PEEK) {
1095				m = m->m_next;
1096				moff = 0;
1097			} else {
1098				nextrecord = m->m_nextpkt;
1099				sbfree(&so->so_rcv, m);
1100				if (mp != NULL) {
1101					*mp = m;
1102					mp = &m->m_next;
1103					so->so_rcv.sb_mb = m = m->m_next;
1104					*mp = NULL;
1105				} else {
1106					so->so_rcv.sb_mb = m_free(m);
1107					m = so->so_rcv.sb_mb;
1108				}
1109				if (m != NULL) {
1110					m->m_nextpkt = nextrecord;
1111					if (nextrecord == NULL)
1112						so->so_rcv.sb_lastrecord = m;
1113				} else {
1114					so->so_rcv.sb_mb = nextrecord;
1115					SB_EMPTY_FIXUP(&so->so_rcv);
1116				}
1117				SBLASTRECORDCHK(&so->so_rcv);
1118				SBLASTMBUFCHK(&so->so_rcv);
1119			}
1120		} else {
1121			if (flags & MSG_PEEK)
1122				moff += len;
1123			else {
1124				if (mp != NULL)
1125					*mp = m_copym(m, 0, len, M_TRYWAIT);
1126				m->m_data += len;
1127				m->m_len -= len;
1128				so->so_rcv.sb_cc -= len;
1129			}
1130		}
1131		if (so->so_oobmark) {
1132			if ((flags & MSG_PEEK) == 0) {
1133				so->so_oobmark -= len;
1134				if (so->so_oobmark == 0) {
1135					SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1136					so->so_rcv.sb_state |= SBS_RCVATMARK;
1137					break;
1138				}
1139			} else {
1140				offset += len;
1141				if (offset == so->so_oobmark)
1142					break;
1143			}
1144		}
1145		if (flags & MSG_EOR)
1146			break;
1147		/*
1148		 * If the MSG_WAITALL flag is set (for a non-atomic socket),
1149		 * we must not quit until "uio->uio_resid == 0" or an error
1150		 * occurs.  If a signal/timeout occurs, return
1151		 * with a short count but without error.
1152		 * Keep sockbuf locked against other readers.
1153		 */
1154		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
1155		    !sosendallatonce(so) && nextrecord == NULL) {
1156			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1157			if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE)
1158				break;
1159			/*
1160			 * Notify the protocol that some data has been
1161			 * drained before blocking.
1162			 */
1163			if (pr->pr_flags & PR_WANTRCVD && so->so_pcb != NULL) {
1164				SOCKBUF_UNLOCK(&so->so_rcv);
1165				(*pr->pr_usrreqs->pru_rcvd)(so, flags);
1166				SOCKBUF_LOCK(&so->so_rcv);
1167			}
1168			SBLASTRECORDCHK(&so->so_rcv);
1169			SBLASTMBUFCHK(&so->so_rcv);
1170			error = sbwait(&so->so_rcv);
1171			if (error)
1172				goto release;
1173			m = so->so_rcv.sb_mb;
1174			if (m != NULL)
1175				nextrecord = m->m_nextpkt;
1176		}
1177	}
1178
1179	if (m != NULL && pr->pr_flags & PR_ATOMIC) {
1180		flags |= MSG_TRUNC;
1181		if ((flags & MSG_PEEK) == 0) {
1182			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1183			(void) sbdroprecord_locked(&so->so_rcv);
1184		}
1185	}
1186	if ((flags & MSG_PEEK) == 0) {
1187		if (m == NULL) {
1188			/*
1189			 * First part is an inline SB_EMPTY_FIXUP().  Second
1190			 * part makes sure sb_lastrecord is up-to-date if
1191			 * there is still data in the socket buffer.
1192			 */
1193			so->so_rcv.sb_mb = nextrecord;
1194			if (so->so_rcv.sb_mb == NULL) {
1195				so->so_rcv.sb_mbtail = NULL;
1196				so->so_rcv.sb_lastrecord = NULL;
1197			} else if (nextrecord->m_nextpkt == NULL)
1198				so->so_rcv.sb_lastrecord = nextrecord;
1199		}
1200		SBLASTRECORDCHK(&so->so_rcv);
1201		SBLASTMBUFCHK(&so->so_rcv);
1202		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
1203			SOCKBUF_UNLOCK(&so->so_rcv);
1204			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
1205			SOCKBUF_LOCK(&so->so_rcv);
1206		}
1207	}
1208	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1209	if (orig_resid == uio->uio_resid && orig_resid &&
1210	    (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
1211		sbunlock(&so->so_rcv);
1212		goto restart;
1213	}
1214
1215	if (flagsp != NULL)
1216		*flagsp |= flags;
1217release:
1218	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1219	sbunlock(&so->so_rcv);
1220out:
1221	SOCKBUF_UNLOCK(&so->so_rcv);
1222	return (error);
1223}
1224
1225int
1226soshutdown(so, how)
1227	struct socket *so;
1228	int how;
1229{
1230	struct protosw *pr = so->so_proto;
1231
1232	if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
1233		return (EINVAL);
1234
1235	if (how != SHUT_WR)
1236		sorflush(so);
1237	if (how != SHUT_RD)
1238		return ((*pr->pr_usrreqs->pru_shutdown)(so));
1239	return (0);
1240}
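
/*
 * Example (illustrative): shutdown(s, SHUT_WR) skips sorflush() and
 * calls the protocol's pru_shutdown(), so the peer sees EOF;
 * shutdown(s, SHUT_RD) only flushes the receive side and returns 0
 * without notifying the protocol; SHUT_RDWR does both.
 */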
1241
1242void
1243sorflush(so)
1244	struct socket *so;
1245{
1246	struct sockbuf *sb = &so->so_rcv;
1247	struct protosw *pr = so->so_proto;
1248	struct sockbuf asb;
1249
1250	/*
1251	 * XXXRW: This is quite ugly.  The existing code made a copy of the
1252	 * socket buffer, then zero'd the original to clear the buffer
1253	 * fields.  However, with mutexes in the socket buffer, this causes
1254	 * problems.  We only clear the zeroable bits of the original;
1255	 * however, we have to initialize and destroy the mutex in the copy
1256	 * so that dom_dispose() and sbrelease() can lock it as needed.
1257	 */
1258	SOCKBUF_LOCK(sb);
1259	sb->sb_flags |= SB_NOINTR;
1260	(void) sblock(sb, M_WAITOK);
1261	/*
1262	 * socantrcvmore_locked() drops the socket buffer mutex so that it
1263	 * can safely perform wakeups.  Re-acquire the mutex before
1264	 * continuing.
1265	 */
1266	socantrcvmore_locked(so);
1267	SOCKBUF_LOCK(sb);
1268	sbunlock(sb);
1269	/*
1270	 * Invalidate/clear most of the sockbuf structure, but leave
1271	 * selinfo and mutex data unchanged.
1272	 */
1273	bzero(&asb, offsetof(struct sockbuf, sb_startzero));
1274	bcopy(&sb->sb_startzero, &asb.sb_startzero,
1275	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
1276	bzero(&sb->sb_startzero,
1277	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
1278	SOCKBUF_UNLOCK(sb);
1279
1280	SOCKBUF_LOCK_INIT(&asb, "so_rcv");
1281	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
1282		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
1283	sbrelease(&asb, so);
1284	SOCKBUF_LOCK_DESTROY(&asb);
1285}
1286
1287#ifdef INET
1288static int
1289do_setopt_accept_filter(so, sopt)
1290	struct	socket *so;
1291	struct	sockopt *sopt;
1292{
1293	struct accept_filter_arg	*afap = NULL;
1294	struct accept_filter	*afp;
1295	struct so_accf	*af = so->so_accf;
1296	int	error = 0;
1297
1298	/* do not set/remove accept filters on non-listening sockets */
1299	if ((so->so_options & SO_ACCEPTCONN) == 0) {
1300		error = EINVAL;
1301		goto out;
1302	}
1303
1304	/* removing the filter */
1305	if (sopt == NULL) {
1306		if (af != NULL) {
1307			if (af->so_accept_filter != NULL &&
1308				af->so_accept_filter->accf_destroy != NULL) {
1309				af->so_accept_filter->accf_destroy(so);
1310			}
1311			if (af->so_accept_filter_str != NULL) {
1312				FREE(af->so_accept_filter_str, M_ACCF);
1313			}
1314			FREE(af, M_ACCF);
1315			so->so_accf = NULL;
1316		}
1317		so->so_options &= ~SO_ACCEPTFILTER;
1318		return (0);
1319	}
1320	/* adding a filter */
1321	/* must remove previous filter first */
1322	if (af != NULL) {
1323		error = EINVAL;
1324		goto out;
1325	}
1326	/* don't put large objects on the kernel stack */
1327	MALLOC(afap, struct accept_filter_arg *, sizeof(*afap), M_TEMP, M_WAITOK);
1328	error = sooptcopyin(sopt, afap, sizeof *afap, sizeof *afap);
1329	afap->af_name[sizeof(afap->af_name)-1] = '\0';
1330	afap->af_arg[sizeof(afap->af_arg)-1] = '\0';
1331	if (error)
1332		goto out;
1333	afp = accept_filt_get(afap->af_name);
1334	if (afp == NULL) {
1335		error = ENOENT;
1336		goto out;
1337	}
1338	MALLOC(af, struct so_accf *, sizeof(*af), M_ACCF, M_WAITOK | M_ZERO);
1339	if (afp->accf_create != NULL) {
1340		if (afap->af_name[0] != '\0') {
1341			int len = strlen(afap->af_name) + 1;
1342
1343			MALLOC(af->so_accept_filter_str, char *, len, M_ACCF, M_WAITOK);
1344			strcpy(af->so_accept_filter_str, afap->af_name);
1345		}
1346		af->so_accept_filter_arg = afp->accf_create(so, afap->af_arg);
1347		if (af->so_accept_filter_arg == NULL) {
1348			FREE(af->so_accept_filter_str, M_ACCF);
1349			FREE(af, M_ACCF);
1350			so->so_accf = NULL;
1351			error = EINVAL;
1352			goto out;
1353		}
1354	}
1355	af->so_accept_filter = afp;
1356	so->so_accf = af;
1357	so->so_options |= SO_ACCEPTFILTER;
1358out:
1359	if (afap != NULL)
1360		FREE(afap, M_TEMP);
1361	return (error);
1362}
1363#endif /* INET */
1364
1365/*
1366 * Perhaps this routine, and sooptcopyout(), below, ought to come in
1367 * an additional variant to handle the case where the option value needs
1368 * to be some kind of integer, but not a specific size.
1369 * In addition to their use here, these functions are also called by the
1370 * protocol-level pr_ctloutput() routines.
1371 */
1372int
1373sooptcopyin(sopt, buf, len, minlen)
1374	struct	sockopt *sopt;
1375	void	*buf;
1376	size_t	len;
1377	size_t	minlen;
1378{
1379	size_t	valsize;
1380
1381	/*
1382	 * If the user gives us more than we wanted, we ignore it,
1383	 * but if we don't get the minimum length the caller
1384	 * wants, we return EINVAL.  On success, sopt->sopt_valsize
1385	 * is set to however much we actually retrieved.
1386	 */
1387	if ((valsize = sopt->sopt_valsize) < minlen)
1388		return EINVAL;
1389	if (valsize > len)
1390		sopt->sopt_valsize = valsize = len;
1391
1392	if (sopt->sopt_td != NULL)
1393		return (copyin(sopt->sopt_val, buf, valsize));
1394
1395	bcopy(sopt->sopt_val, buf, valsize);
1396	return 0;
1397}
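
/*
 * Worked example (illustrative): for SO_LINGER, sosetopt() calls
 * sooptcopyin(sopt, &l, sizeof l, sizeof l).  A caller passing
 * optlen = 2 fails with EINVAL (less than minlen); a caller passing
 * optlen = 64 succeeds, but only sizeof(struct linger) bytes are
 * copied in and sopt_valsize is trimmed to match.
 */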
1398
1399int
1400sosetopt(so, sopt)
1401	struct socket *so;
1402	struct sockopt *sopt;
1403{
1404	int	error, optval;
1405	struct	linger l;
1406	struct	timeval tv;
1407	u_long  val;
1408#ifdef MAC
1409	struct mac extmac;
1410#endif
1411
1412	error = 0;
1413	if (sopt->sopt_level != SOL_SOCKET) {
1414		if (so->so_proto && so->so_proto->pr_ctloutput)
1415			return ((*so->so_proto->pr_ctloutput)
1416				  (so, sopt));
1417		error = ENOPROTOOPT;
1418	} else {
1419		switch (sopt->sopt_name) {
1420#ifdef INET
1421		case SO_ACCEPTFILTER:
1422			error = do_setopt_accept_filter(so, sopt);
1423			if (error)
1424				goto bad;
1425			break;
1426#endif
1427		case SO_LINGER:
1428			error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
1429			if (error)
1430				goto bad;
1431
1432			SOCK_LOCK(so);
1433			so->so_linger = l.l_linger;
1434			if (l.l_onoff)
1435				so->so_options |= SO_LINGER;
1436			else
1437				so->so_options &= ~SO_LINGER;
1438			SOCK_UNLOCK(so);
1439			break;
1440
1441		case SO_DEBUG:
1442		case SO_KEEPALIVE:
1443		case SO_DONTROUTE:
1444		case SO_USELOOPBACK:
1445		case SO_BROADCAST:
1446		case SO_REUSEADDR:
1447		case SO_REUSEPORT:
1448		case SO_OOBINLINE:
1449		case SO_TIMESTAMP:
1450		case SO_BINTIME:
1451		case SO_NOSIGPIPE:
1452			error = sooptcopyin(sopt, &optval, sizeof optval,
1453					    sizeof optval);
1454			if (error)
1455				goto bad;
1456			SOCK_LOCK(so);
1457			if (optval)
1458				so->so_options |= sopt->sopt_name;
1459			else
1460				so->so_options &= ~sopt->sopt_name;
1461			SOCK_UNLOCK(so);
1462			break;
1463
1464		case SO_SNDBUF:
1465		case SO_RCVBUF:
1466		case SO_SNDLOWAT:
1467		case SO_RCVLOWAT:
1468			error = sooptcopyin(sopt, &optval, sizeof optval,
1469					    sizeof optval);
1470			if (error)
1471				goto bad;
1472
1473			/*
1474			 * Values < 1 make no sense for any of these
1475			 * options, so disallow them.
1476			 */
1477			if (optval < 1) {
1478				error = EINVAL;
1479				goto bad;
1480			}
1481
1482			switch (sopt->sopt_name) {
1483			case SO_SNDBUF:
1484			case SO_RCVBUF:
1485				if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
1486				    &so->so_snd : &so->so_rcv, (u_long)optval,
1487				    so, curthread) == 0) {
1488					error = ENOBUFS;
1489					goto bad;
1490				}
1491				break;
1492
1493			/*
1494			 * Make sure the low-water is never greater than
1495			 * the high-water.
1496			 */
1497			case SO_SNDLOWAT:
1498				so->so_snd.sb_lowat =
1499				    (optval > so->so_snd.sb_hiwat) ?
1500				    so->so_snd.sb_hiwat : optval;
1501				break;
1502			case SO_RCVLOWAT:
1503				so->so_rcv.sb_lowat =
1504				    (optval > so->so_rcv.sb_hiwat) ?
1505				    so->so_rcv.sb_hiwat : optval;
1506				break;
1507			}
1508			break;
1509
1510		case SO_SNDTIMEO:
1511		case SO_RCVTIMEO:
1512			error = sooptcopyin(sopt, &tv, sizeof tv,
1513					    sizeof tv);
1514			if (error)
1515				goto bad;
1516
1517			/* assert(hz > 0); */
1518			if (tv.tv_sec < 0 || tv.tv_sec > SHRT_MAX / hz ||
1519			    tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
1520				error = EDOM;
1521				goto bad;
1522			}
1523			/* assert(tick > 0); */
1524			/* assert(ULONG_MAX - SHRT_MAX >= 1000000); */
1525			val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick;
1526			if (val > SHRT_MAX) {
1527				error = EDOM;
1528				goto bad;
1529			}
1530			if (val == 0 && tv.tv_usec != 0)
1531				val = 1;
1532
1533			switch (sopt->sopt_name) {
1534			case SO_SNDTIMEO:
1535				so->so_snd.sb_timeo = val;
1536				break;
1537			case SO_RCVTIMEO:
1538				so->so_rcv.sb_timeo = val;
1539				break;
1540			}
1541			break;
1542		case SO_LABEL:
1543#ifdef MAC
1544			error = sooptcopyin(sopt, &extmac, sizeof extmac,
1545			    sizeof extmac);
1546			if (error)
1547				goto bad;
1548			error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
1549			    so, &extmac);
1550#else
1551			error = EOPNOTSUPP;
1552#endif
1553			break;
1554		default:
1555			error = ENOPROTOOPT;
1556			break;
1557		}
1558		if (error == 0 && so->so_proto != NULL &&
1559		    so->so_proto->pr_ctloutput != NULL) {
1560			(void) ((*so->so_proto->pr_ctloutput)
1561				  (so, sopt));
1562		}
1563	}
1564bad:
1565	return (error);
1566}
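
/*
 * Worked example of the SO_SNDTIMEO/SO_RCVTIMEO conversion above
 * (illustrative, assuming hz = 100 and tick = 10000): tv = {2, 500000}
 * yields val = 2 * 100 + 500000 / 10000 = 250 ticks; a tv_sec above
 * SHRT_MAX / hz (327 here) fails with EDOM; and a nonzero timeout that
 * would round down to 0 ticks is bumped to 1 so it is not mistaken for
 * "no timeout".
 */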
1567
1568/* Helper routine for getsockopt */
1569int
1570sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
1571{
1572	int	error;
1573	size_t	valsize;
1574
1575	error = 0;
1576
1577	/*
1578	 * Documented get behavior is that we always return a value,
1579	 * possibly truncated to fit in the user's buffer.
1580	 * Traditional behavior is that we always tell the user
1581	 * precisely how much we copied, rather than something useful
1582	 * like the total amount we had available for the caller.
1583	 * Note that this interface is not idempotent; the entire answer must
1584	 * be generated ahead of time.
1585	 */
1586	valsize = min(len, sopt->sopt_valsize);
1587	sopt->sopt_valsize = valsize;
1588	if (sopt->sopt_val != NULL) {
1589		if (sopt->sopt_td != NULL)
1590			error = copyout(buf, sopt->sopt_val, valsize);
1591		else
1592			bcopy(buf, sopt->sopt_val, valsize);
1593	}
1594	return error;
1595}
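
/*
 * Example (illustrative): getsockopt(s, SOL_SOCKET, SO_RCVBUF, &val,
 * &len) with len = 2 copies out only the first two bytes of the
 * integer and sets len = 2; the call still succeeds, so callers must
 * check the returned length if truncation matters.
 */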
1596
1597int
1598sogetopt(so, sopt)
1599	struct socket *so;
1600	struct sockopt *sopt;
1601{
1602	int	error, optval;
1603	struct	linger l;
1604	struct	timeval tv;
1605#ifdef INET
1606	struct accept_filter_arg *afap;
1607#endif
1608#ifdef MAC
1609	struct mac extmac;
1610#endif
1611
1612	error = 0;
1613	if (sopt->sopt_level != SOL_SOCKET) {
1614		if (so->so_proto && so->so_proto->pr_ctloutput) {
1615			return ((*so->so_proto->pr_ctloutput)
1616				  (so, sopt));
1617		} else
1618			return (ENOPROTOOPT);
1619	} else {
1620		switch (sopt->sopt_name) {
1621#ifdef INET
1622		case SO_ACCEPTFILTER:
1623			if ((so->so_options & SO_ACCEPTCONN) == 0)
1624				return (EINVAL);
1625			MALLOC(afap, struct accept_filter_arg *, sizeof(*afap),
1626				M_TEMP, M_WAITOK | M_ZERO);
1627			if ((so->so_options & SO_ACCEPTFILTER) != 0) {
1628				strcpy(afap->af_name, so->so_accf->so_accept_filter->accf_name);
1629				if (so->so_accf->so_accept_filter_str != NULL)
1630					strcpy(afap->af_arg, so->so_accf->so_accept_filter_str);
1631			}
1632			error = sooptcopyout(sopt, afap, sizeof(*afap));
1633			FREE(afap, M_TEMP);
1634			break;
1635#endif
1636
1637		case SO_LINGER:
1638			/*
1639			 * XXXRW: We grab the lock here to get a consistent
1640			 * snapshot of both fields.  This may not really
1641			 * be necessary.
1642			 */
1643			SOCK_LOCK(so);
1644			l.l_onoff = so->so_options & SO_LINGER;
1645			l.l_linger = so->so_linger;
1646			SOCK_UNLOCK(so);
1647			error = sooptcopyout(sopt, &l, sizeof l);
1648			break;
1649
1650		case SO_USELOOPBACK:
1651		case SO_DONTROUTE:
1652		case SO_DEBUG:
1653		case SO_KEEPALIVE:
1654		case SO_REUSEADDR:
1655		case SO_REUSEPORT:
1656		case SO_BROADCAST:
1657		case SO_OOBINLINE:
1658		case SO_TIMESTAMP:
1659		case SO_BINTIME:
1660		case SO_NOSIGPIPE:
1661			optval = so->so_options & sopt->sopt_name;
1662integer:
1663			error = sooptcopyout(sopt, &optval, sizeof optval);
1664			break;
1665
1666		case SO_TYPE:
1667			optval = so->so_type;
1668			goto integer;
1669
1670		case SO_ERROR:
1671			optval = so->so_error;
1672			so->so_error = 0;
1673			goto integer;
1674
1675		case SO_SNDBUF:
1676			optval = so->so_snd.sb_hiwat;
1677			goto integer;
1678
1679		case SO_RCVBUF:
1680			optval = so->so_rcv.sb_hiwat;
1681			goto integer;
1682
1683		case SO_SNDLOWAT:
1684			optval = so->so_snd.sb_lowat;
1685			goto integer;
1686
1687		case SO_RCVLOWAT:
1688			optval = so->so_rcv.sb_lowat;
1689			goto integer;
1690
1691		case SO_SNDTIMEO:
1692		case SO_RCVTIMEO:
1693			optval = (sopt->sopt_name == SO_SNDTIMEO ?
1694				  so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
1695
1696			tv.tv_sec = optval / hz;
1697			tv.tv_usec = (optval % hz) * tick;
1698			error = sooptcopyout(sopt, &tv, sizeof tv);
1699			break;
1700		case SO_LABEL:
1701#ifdef MAC
1702			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
1703			    sizeof(extmac));
1704			if (error)
1705				return (error);
1706			error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
1707			    so, &extmac);
1708			if (error)
1709				return (error);
1710			error = sooptcopyout(sopt, &extmac, sizeof extmac);
1711#else
1712			error = EOPNOTSUPP;
1713#endif
1714			break;
1715		case SO_PEERLABEL:
1716#ifdef MAC
1717			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
1718			    sizeof(extmac));
1719			if (error)
1720				return (error);
1721			error = mac_getsockopt_peerlabel(
1722			    sopt->sopt_td->td_ucred, so, &extmac);
1723			if (error)
1724				return (error);
1725			error = sooptcopyout(sopt, &extmac, sizeof extmac);
1726#else
1727			error = EOPNOTSUPP;
1728#endif
1729			break;
1730		default:
1731			error = ENOPROTOOPT;
1732			break;
1733		}
1734		return (error);
1735	}
1736}
1737
1738/* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */
1739int
1740soopt_getm(struct sockopt *sopt, struct mbuf **mp)
1741{
1742	struct mbuf *m, *m_prev;
1743	int sopt_size = sopt->sopt_valsize;
1744
1745	MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
1746	if (m == NULL)
1747		return ENOBUFS;
1748	if (sopt_size > MLEN) {
1749		MCLGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT);
1750		if ((m->m_flags & M_EXT) == 0) {
1751			m_free(m);
1752			return ENOBUFS;
1753		}
1754		m->m_len = min(MCLBYTES, sopt_size);
1755	} else {
1756		m->m_len = min(MLEN, sopt_size);
1757	}
1758	sopt_size -= m->m_len;
1759	*mp = m;
1760	m_prev = m;
1761
1762	while (sopt_size) {
1763		MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
1764		if (m == NULL) {
1765			m_freem(*mp);
1766			return ENOBUFS;
1767		}
1768		if (sopt_size > MLEN) {
1769			MCLGET(m, sopt->sopt_td != NULL ? M_TRYWAIT :
1770			    M_DONTWAIT);
1771			if ((m->m_flags & M_EXT) == 0) {
1772				m_freem(m);
1773				m_freem(*mp);
1774				return ENOBUFS;
1775			}
1776			m->m_len = min(MCLBYTES, sopt_size);
1777		} else {
1778			m->m_len = min(MLEN, sopt_size);
1779		}
1780		sopt_size -= m->m_len;
1781		m_prev->m_next = m;
1782		m_prev = m;
1783	}
1784	return 0;
1785}
1786
1787/* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
1788int
1789soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
1790{
1791	struct mbuf *m0 = m;
1792
1793	if (sopt->sopt_val == NULL)
1794		return 0;
1795	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
1796		if (sopt->sopt_td != NULL) {
1797			int error;
1798
1799			error = copyin(sopt->sopt_val, mtod(m, char *),
1800				       m->m_len);
1801			if (error != 0) {
1802				m_freem(m0);
1803				return(error);
1804			}
1805		} else
1806			bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
1807		sopt->sopt_valsize -= m->m_len;
1808		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
1809		m = m->m_next;
1810	}
1811	if (m != NULL) /* should have been allocated large enough at ip6_sooptmcopyin() */
1812		panic("ip6_sooptmcopyin");
1813	return 0;
1814}
1815
1816/* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
1817int
1818soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
1819{
1820	struct mbuf *m0 = m;
1821	size_t valsize = 0;
1822
1823	if (sopt->sopt_val == NULL)
1824		return 0;
1825	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
1826		if (sopt->sopt_td != NULL) {
1827			int error;
1828
1829			error = copyout(mtod(m, char *), sopt->sopt_val,
1830				       m->m_len);
1831			if (error != 0) {
1832				m_freem(m0);
1833				return(error);
1834			}
1835		} else
1836			bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
1837		sopt->sopt_valsize -= m->m_len;
1838		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
1839		valsize += m->m_len;
1840		m = m->m_next;
1841	}
1842	if (m != NULL) {
1843		/* a large enough soopt buffer should have been supplied from userland */
1844		m_freem(m0);
1845		return(EINVAL);
1846	}
1847	sopt->sopt_valsize = valsize;
1848	return 0;
1849}
1850
1851void
1852sohasoutofband(so)
1853	struct socket *so;
1854{
1855	if (so->so_sigio != NULL)
1856		pgsigio(&so->so_sigio, SIGURG, 0);
1857	selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
1858}
1859
1860int
1861sopoll(struct socket *so, int events, struct ucred *active_cred,
1862    struct thread *td)
1863{
1864	int revents = 0;
1865
1866	if (events & (POLLIN | POLLRDNORM))
1867		if (soreadable(so))
1868			revents |= events & (POLLIN | POLLRDNORM);
1869
1870	if (events & POLLINIGNEOF)
1871		if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat ||
1872		    !TAILQ_EMPTY(&so->so_comp) || so->so_error)
1873			revents |= POLLINIGNEOF;
1874
1875	if (events & (POLLOUT | POLLWRNORM))
1876		if (sowriteable(so))
1877			revents |= events & (POLLOUT | POLLWRNORM);
1878
1879	if (events & (POLLPRI | POLLRDBAND))
1880		if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK))
1881			revents |= events & (POLLPRI | POLLRDBAND);
1882
1883	if (revents == 0) {
1884		if (events &
1885		    (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM |
1886		     POLLRDBAND)) {
1887			SOCKBUF_LOCK(&so->so_rcv);
1888			selrecord(td, &so->so_rcv.sb_sel);
1889			so->so_rcv.sb_flags |= SB_SEL;
1890			SOCKBUF_UNLOCK(&so->so_rcv);
1891		}
1892
1893		if (events & (POLLOUT | POLLWRNORM)) {
1894			SOCKBUF_LOCK(&so->so_snd);
1895			selrecord(td, &so->so_snd.sb_sel);
1896			so->so_snd.sb_flags |= SB_SEL;
1897			SOCKBUF_UNLOCK(&so->so_snd);
1898		}
1899	}
1900
1901	return (revents);
1902}
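
/*
 * Userland sketch (illustrative):
 *
 *	struct pollfd pfd = { .fd = s, .events = POLLIN | POLLPRI };
 *	poll(&pfd, 1, timeout);
 *
 * POLLIN is reported once the socket is readable (sb_cc >= sb_lowat,
 * EOF, a completed connection, or a pending error); POLLPRI when
 * out-of-band data is pending or the stream is at the OOB mark.
 */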
1903
1904int
1905soo_kqfilter(struct file *fp, struct knote *kn)
1906{
1907	struct socket *so = kn->kn_fp->f_data;
1908	struct sockbuf *sb;
1909
1910	switch (kn->kn_filter) {
1911	case EVFILT_READ:
1912		if (so->so_options & SO_ACCEPTCONN)
1913			kn->kn_fop = &solisten_filtops;
1914		else
1915			kn->kn_fop = &soread_filtops;
1916		sb = &so->so_rcv;
1917		break;
1918	case EVFILT_WRITE:
1919		kn->kn_fop = &sowrite_filtops;
1920		sb = &so->so_snd;
1921		break;
1922	default:
1923		return (1);
1924	}
1925
1926	SOCKBUF_LOCK(sb);
1927	SLIST_INSERT_HEAD(&sb->sb_sel.si_note, kn, kn_selnext);
1928	sb->sb_flags |= SB_KNOTE;
1929	SOCKBUF_UNLOCK(sb);
1930	return (0);
1931}
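
/*
 * Userland sketch (illustrative):
 *
 *	struct kevent kev;
 *	EV_SET(&kev, s, EVFILT_READ, EV_ADD, NOTE_LOWAT, 512, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 *
 * With NOTE_LOWAT, filt_soread() below fires only once at least 512
 * bytes are queued; on a listening socket the same EVFILT_READ filter
 * reports completed connections through filt_solisten() instead.
 */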
1932
1933static void
1934filt_sordetach(struct knote *kn)
1935{
1936	struct socket *so = kn->kn_fp->f_data;
1937
1938	SOCKBUF_LOCK(&so->so_rcv);
1939	SLIST_REMOVE(&so->so_rcv.sb_sel.si_note, kn, knote, kn_selnext);
1940	if (SLIST_EMPTY(&so->so_rcv.sb_sel.si_note))
1941		so->so_rcv.sb_flags &= ~SB_KNOTE;
1942	SOCKBUF_UNLOCK(&so->so_rcv);
1943}
1944
1945/*ARGSUSED*/
1946static int
1947filt_soread(struct knote *kn, long hint)
1948{
1949	struct socket *so = kn->kn_fp->f_data;
1950	int need_lock, result;
1951
1952	/*
1953	 * XXXRW: Conditional locking because filt_soread() can be called
1954	 * either from KNOTE() in the socket context where the socket buffer
1955	 * lock is already held, or from kqueue() itself.
1956	 */
1957	need_lock = !SOCKBUF_OWNED(&so->so_rcv);
1958	if (need_lock)
1959		SOCKBUF_LOCK(&so->so_rcv);
1960	kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
1961	if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
1962		kn->kn_flags |= EV_EOF;
1963		kn->kn_fflags = so->so_error;
1964		result = 1;
1965	} else if (so->so_error)	/* temporary udp error */
1966		result = 1;
1967	else if (kn->kn_sfflags & NOTE_LOWAT)
1968		result = (kn->kn_data >= kn->kn_sdata);
1969	else
1970		result = (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat);
1971	if (need_lock)
1972		SOCKBUF_UNLOCK(&so->so_rcv);
1973	return (result);
1974}
1975
1976static void
1977filt_sowdetach(struct knote *kn)
1978{
1979	struct socket *so = kn->kn_fp->f_data;
1980
1981	SOCKBUF_LOCK(&so->so_snd);
1982	SLIST_REMOVE(&so->so_snd.sb_sel.si_note, kn, knote, kn_selnext);
1983	if (SLIST_EMPTY(&so->so_snd.sb_sel.si_note))
1984		so->so_snd.sb_flags &= ~SB_KNOTE;
1985	SOCKBUF_UNLOCK(&so->so_snd);
1986}
1987
1988/*ARGSUSED*/
1989static int
1990filt_sowrite(struct knote *kn, long hint)
1991{
1992	struct socket *so = kn->kn_fp->f_data;
1993	int need_lock, result;
1994
1995	/*
1996	 * XXXRW: Conditional locking because filt_sowrite() can be called
1997	 * either from KNOTE() in the socket context where the socket buffer
1998	 * lock is already held, or from kqueue() itself.
1999	 */
2000	need_lock = !SOCKBUF_OWNED(&so->so_snd);
2001	if (need_lock)
2002		SOCKBUF_LOCK(&so->so_snd);
2003	kn->kn_data = sbspace(&so->so_snd);
2004	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
2005		kn->kn_flags |= EV_EOF;
2006		kn->kn_fflags = so->so_error;
2007		result = 1;
2008	} else if (so->so_error)	/* temporary udp error */
2009		result = 1;
2010	else if (((so->so_state & SS_ISCONNECTED) == 0) &&
2011	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
2012		result = 0;
2013	else if (kn->kn_sfflags & NOTE_LOWAT)
2014		result = (kn->kn_data >= kn->kn_sdata);
2015	else
2016		result = (kn->kn_data >= so->so_snd.sb_lowat);
2017	if (need_lock)
2018		SOCKBUF_UNLOCK(&so->so_snd);
2019	return (result);
2020}
2021
2022/*ARGSUSED*/
2023static int
2024filt_solisten(struct knote *kn, long hint)
2025{
2026	struct socket *so = kn->kn_fp->f_data;
2027
2028	kn->kn_data = so->so_qlen;
2029	return (! TAILQ_EMPTY(&so->so_comp));
2030}
2031
2032int
2033socheckuid(struct socket *so, uid_t uid)
2034{
2035
2036	if (so == NULL)
2037		return (EPERM);
2038	if (so->so_cred->cr_uid == uid)
2039		return (0);
2040	return (EPERM);
2041}
2042