/*
 * Copyright (c) 2004 The FreeBSD Foundation
 * Copyright (c) 2004 Robert Watson
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/uipc_socket.c 134240 2004-08-24 05:28:18Z rwatson $");

#include "opt_inet.h"
#include "opt_mac.h"
#include "opt_zero.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mac.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/domain.h>
#include <sys/file.h>			/* for struct knote */
#include <sys/kernel.h>
#include <sys/event.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <sys/jail.h>

#include <vm/uma.h>


static int	soreceive_rcvoob(struct socket *so, struct uio *uio,
		    int flags);

#ifdef INET
static int	 do_setopt_accept_filter(struct socket *so, struct sockopt *sopt);
#endif

static void	filt_sordetach(struct knote *kn);
static int	filt_soread(struct knote *kn, long hint);
static void	filt_sowdetach(struct knote *kn);
static int	filt_sowrite(struct knote *kn, long hint);
static int	filt_solisten(struct knote *kn, long hint);

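/*
 * Filter operations backing the socket kqueue filters declared above.
 * The leading '1' initializes f_isfd, indicating that these filters are
 * attached via a file descriptor.
 */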
static struct filterops solisten_filtops =
	{ 1, NULL, filt_sordetach, filt_solisten };
static struct filterops soread_filtops =
	{ 1, NULL, filt_sordetach, filt_soread };
static struct filterops sowrite_filtops =
	{ 1, NULL, filt_sowdetach, filt_sowrite };

uma_zone_t socket_zone;
so_gen_t	so_gencnt;	/* generation count for sockets */

MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

SYSCTL_DECL(_kern_ipc);

static int somaxconn = SOMAXCONN;
SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW,
    &somaxconn, 0, "Maximum pending socket connection queue size");
static int numopensockets;
SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
    &numopensockets, 0, "Number of open sockets");
#ifdef ZERO_COPY_SOCKETS
/* These aren't static because they're used in other files. */
int so_zero_copy_send = 1;
int so_zero_copy_receive = 1;
SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0,
    "Zero copy controls");
SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW,
    &so_zero_copy_receive, 0, "Enable zero copy receive");
SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW,
    &so_zero_copy_send, 0, "Enable zero copy send");
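/* These knobs are exported as kern.ipc.zero_copy.send and .receive. */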
#endif /* ZERO_COPY_SOCKETS */

/*
 * accept_mtx locks down per-socket fields relating to accept queues.  See
 * socketvar.h for an annotation of the protected fields of struct socket.
 */
struct mtx accept_mtx;
MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);

/*
 * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
 * so_gencnt field.
 *
 * XXXRW: These variables might be better manipulated using atomic operations
 * for improved efficiency.
 */
static struct mtx so_global_mtx;
MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF);

/*
 * Socket operation routines.
 * These routines are called by the routines in
 * sys_socket.c or from a system process, and
 * implement the semantics of socket operations by
 * switching out to the protocol specific routines.
 */

/*
 * Get a socket structure from our zone, and initialize it.
 * Note that it would probably be better to allocate socket
 * and PCB at the same time, but I'm not convinced that all
 * the protocols can be easily modified to do this.
 *
 * soalloc() returns a socket with a ref count of 0.
 */
struct socket *
soalloc(int mflags)
{
	struct socket *so;
#ifdef MAC
	int error;
#endif

	so = uma_zalloc(socket_zone, mflags | M_ZERO);
	if (so != NULL) {
#ifdef MAC
		error = mac_init_socket(so, mflags);
		if (error != 0) {
			uma_zfree(socket_zone, so);
			so = NULL;
			return so;
		}
#endif
		SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
		SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
		/* sx_init(&so->so_sxlock, "socket sxlock"); */
		TAILQ_INIT(&so->so_aiojobq);
		mtx_lock(&so_global_mtx);
		so->so_gencnt = ++so_gencnt;
		++numopensockets;
		mtx_unlock(&so_global_mtx);
	}
	return so;
}

/*
 * socreate returns a socket with a ref count of 1.  The socket should be
 * closed with soclose().
 */
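/*
 * For example (sketch only), an in-kernel consumer might create and later
 * close a TCP socket as follows:
 *
 *	error = socreate(AF_INET, &so, SOCK_STREAM, IPPROTO_TCP,
 *	    td->td_ucred, td);
 *	if (error == 0) {
 *		...
 *		(void)soclose(so);
 *	}
 */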
int
socreate(dom, aso, type, proto, cred, td)
	int dom;
	struct socket **aso;
	int type;
	int proto;
	struct ucred *cred;
	struct thread *td;
{
	struct protosw *prp;
	struct socket *so;
	int error;

	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);

	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL)
		return (EPROTONOSUPPORT);

	if (jailed(cred) && jail_socket_unixiproute_only &&
	    prp->pr_domain->dom_family != PF_LOCAL &&
	    prp->pr_domain->dom_family != PF_INET &&
	    prp->pr_domain->dom_family != PF_ROUTE) {
		return (EPROTONOSUPPORT);
	}

	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = soalloc(M_WAITOK);
	if (so == NULL)
		return (ENOBUFS);

	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = type;
	so->so_cred = crhold(cred);
	so->so_proto = prp;
#ifdef MAC
	mac_create_socket(cred, so);
#endif
	SOCK_LOCK(so);
	knlist_init(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv));
	knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd));
	soref(so);
	SOCK_UNLOCK(so);
	error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
	if (error) {
		SOCK_LOCK(so);
		so->so_state |= SS_NOFDREF;
		sorele(so);
		return (error);
	}
	*aso = so;
	return (0);
}

int
sobind(so, nam, td)
	struct socket *so;
	struct sockaddr *nam;
	struct thread *td;
{

	return ((*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td));
}

void
sodealloc(struct socket *so)
{

	KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
	mtx_lock(&so_global_mtx);
	so->so_gencnt = ++so_gencnt;
	mtx_unlock(&so_global_mtx);
	if (so->so_rcv.sb_hiwat)
		(void)chgsbsize(so->so_cred->cr_uidinfo,
		    &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
	if (so->so_snd.sb_hiwat)
		(void)chgsbsize(so->so_cred->cr_uidinfo,
		    &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
#ifdef INET
	/* remove accept filter if one is present. */
	if (so->so_accf != NULL)
		do_setopt_accept_filter(so, NULL);
#endif
#ifdef MAC
	mac_destroy_socket(so);
#endif
	crfree(so->so_cred);
	SOCKBUF_LOCK_DESTROY(&so->so_snd);
	SOCKBUF_LOCK_DESTROY(&so->so_rcv);
	/* sx_destroy(&so->so_sxlock); */
	uma_zfree(socket_zone, so);
	/*
	 * XXXRW: Seems like a shame to grab the mutex again down here, but
	 * we don't want to decrement the socket count until after we free
	 * the socket, and we can't increment the gencnt on the socket after
	 * we free it, so...
	 */
	mtx_lock(&so_global_mtx);
	--numopensockets;
	mtx_unlock(&so_global_mtx);
}

int
solisten(so, backlog, td)
	struct socket *so;
	int backlog;
	struct thread *td;
{
	int error;

	/*
	 * XXXRW: Ordering issue here -- perhaps we need to set
	 * SO_ACCEPTCONN before the call to pru_listen()?
	 * XXXRW: General atomic test-and-set concerns here also.
	 */
	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
			    SS_ISDISCONNECTING))
		return (EINVAL);
	error = (*so->so_proto->pr_usrreqs->pru_listen)(so, td);
	if (error)
		return (error);
	ACCEPT_LOCK();
	if (TAILQ_EMPTY(&so->so_comp)) {
		SOCK_LOCK(so);
		so->so_options |= SO_ACCEPTCONN;
		SOCK_UNLOCK(so);
	}
	if (backlog < 0 || backlog > somaxconn)
		backlog = somaxconn;
	so->so_qlimit = backlog;
	ACCEPT_UNLOCK();
	return (0);
}

void
sofree(so)
	struct socket *so;
{
	struct socket *head;

	KASSERT(so->so_count == 0, ("socket %p so_count not 0", so));
	SOCK_LOCK_ASSERT(so);

	if (so->so_pcb != NULL || (so->so_state & SS_NOFDREF) == 0) {
		SOCK_UNLOCK(so);
		return;
	}

	SOCK_UNLOCK(so);
	ACCEPT_LOCK();
	head = so->so_head;
	if (head != NULL) {
		KASSERT((so->so_qstate & SQ_COMP) != 0 ||
		    (so->so_qstate & SQ_INCOMP) != 0,
		    ("sofree: so_head != NULL, but neither SQ_COMP nor "
		    "SQ_INCOMP"));
		KASSERT((so->so_qstate & SQ_COMP) == 0 ||
		    (so->so_qstate & SQ_INCOMP) == 0,
		    ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
		/*
		 * accept(2) is responsible for draining the completed
		 * connection queue and freeing those sockets, so
		 * we just return here if this socket is currently
		 * on the completed connection queue.  Otherwise,
		 * accept(2) may hang after select(2) has indicated
		 * that a listening socket was ready.  If it's an
		 * incomplete connection, we remove it from the queue
		 * and free it; otherwise, it won't be released until
		 * the listening socket is closed.
		 */
		if ((so->so_qstate & SQ_COMP) != 0) {
			ACCEPT_UNLOCK();
			return;
		}
		TAILQ_REMOVE(&head->so_incomp, so, so_list);
		head->so_incqlen--;
		so->so_qstate &= ~SQ_INCOMP;
		so->so_head = NULL;
	}
	KASSERT((so->so_qstate & SQ_COMP) == 0 &&
	    (so->so_qstate & SQ_INCOMP) == 0,
	    ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
	    so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
	ACCEPT_UNLOCK();
	SOCKBUF_LOCK(&so->so_snd);
	so->so_snd.sb_flags |= SB_NOINTR;
	(void)sblock(&so->so_snd, M_WAITOK);
	/*
	 * socantsendmore_locked() drops the socket buffer mutex so that it
	 * can safely perform wakeups.  Re-acquire the mutex before
	 * continuing.
	 */
	socantsendmore_locked(so);
	SOCKBUF_LOCK(&so->so_snd);
	sbunlock(&so->so_snd);
	sbrelease_locked(&so->so_snd, so);
	SOCKBUF_UNLOCK(&so->so_snd);
	sorflush(so);
	knlist_destroy(&so->so_rcv.sb_sel.si_note);
	knlist_destroy(&so->so_snd.sb_sel.si_note);
	sodealloc(so);
}

/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 *
 * This function will sorele() the socket.  Note that soclose() may be
 * called prior to the ref count reaching zero.  The actual socket
 * structure will not be freed until the ref count reaches zero.
 */
int
soclose(so)
	struct socket *so;
{
	int error = 0;

	KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));

	funsetown(&so->so_sigio);
	if (so->so_options & SO_ACCEPTCONN) {
		struct socket *sp;
		ACCEPT_LOCK();
		while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
			TAILQ_REMOVE(&so->so_incomp, sp, so_list);
			so->so_incqlen--;
			sp->so_qstate &= ~SQ_INCOMP;
			sp->so_head = NULL;
			ACCEPT_UNLOCK();
			(void) soabort(sp);
			ACCEPT_LOCK();
		}
		while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
			TAILQ_REMOVE(&so->so_comp, sp, so_list);
			so->so_qlen--;
			sp->so_qstate &= ~SQ_COMP;
			sp->so_head = NULL;
			ACCEPT_UNLOCK();
			(void) soabort(sp);
			ACCEPT_LOCK();
		}
		ACCEPT_UNLOCK();
	}
	if (so->so_pcb == NULL)
		goto discard;
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO))
				goto drop;
			while (so->so_state & SS_ISCONNECTED) {
				error = tsleep(&so->so_timeo,
				    PSOCK | PCATCH, "soclos", so->so_linger * hz);
				if (error)
					break;
			}
		}
	}
drop:
	if (so->so_pcb != NULL) {
		int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
		if (error == 0)
			error = error2;
	}
discard:
	SOCK_LOCK(so);
	KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
	so->so_state |= SS_NOFDREF;
	sorele(so);
	return (error);
}

/*
 * soabort() must not be called with any socket locks held, as it calls
 * into the protocol, which will call back into the socket code causing
 * it to acquire additional socket locks that may cause recursion or lock
 * order reversals.
 */
int
soabort(so)
	struct socket *so;
{
	int error;

	error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
	if (error) {
		SOCK_LOCK(so);
		sotryfree(so);	/* note: does not decrement the ref count */
		return error;
	}
	return (0);
}

int
soaccept(so, nam)
	struct socket *so;
	struct sockaddr **nam;
{
	int error;

	SOCK_LOCK(so);
	KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
	so->so_state &= ~SS_NOFDREF;
	SOCK_UNLOCK(so);
	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
	return (error);
}

int
soconnect(so, nam, td)
	struct socket *so;
	struct sockaddr *nam;
	struct thread *td;
{
	int error;

	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnect(so))))
		error = EISCONN;
	else
		error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td);
	return (error);
}

int
soconnect2(so1, so2)
	struct socket *so1;
	struct socket *so2;
{

	return ((*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2));
}

int
sodisconnect(so)
	struct socket *so;
{
	int error;

	if ((so->so_state & SS_ISCONNECTED) == 0)
		return (ENOTCONN);
	if (so->so_state & SS_ISDISCONNECTING)
		return (EALREADY);
	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
	return (error);
}

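/*
 * SBLOCKWAIT() passes M_NOWAIT to sblock() when the caller set MSG_DONTWAIT,
 * so that acquiring the sockbuf lock does not sleep.
 */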
#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
/*
 * Send on a socket.
 * If send must go all at once and message is larger than
 * send buffering, then hard error.
 * Lock against other senders.
 * If must go all at once and not enough room now, then
 * inform user that this would block and do nothing.
 * Otherwise, if nonblocking, send as much as possible.
 * The data to be sent is described by "uio" if nonzero,
 * otherwise by the mbuf chain "top" (which must be null
 * if uio is not).  Data provided in mbuf chain must be small
 * enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers
 * must check for short counts if EINTR/ERESTART are returned.
 * Data and control buffers are freed on return.
 */

#ifdef ZERO_COPY_SOCKETS
struct so_zerocopy_stats{
	int size_ok;
	int align_ok;
	int found_ifp;
};
struct so_zerocopy_stats so_zerocp_stats = {0,0,0};
#include <netinet/in.h>
#include <net/route.h>
#include <netinet/in_pcb.h>
#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>
#endif /*ZERO_COPY_SOCKETS*/

int
sosend(so, addr, uio, top, control, flags, td)
	struct socket *so;
	struct sockaddr *addr;
	struct uio *uio;
	struct mbuf *top;
	struct mbuf *control;
	int flags;
	struct thread *td;
{
	struct mbuf **mp;
	struct mbuf *m;
	long space, len = 0, resid;
	int clen = 0, error, dontroute;
	int atomic = sosendallatonce(so) || top;
#ifdef ZERO_COPY_SOCKETS
	int cow_send;
#endif /* ZERO_COPY_SOCKETS */

	if (uio != NULL)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned.
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid.  On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
	 *
	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
	 * type sockets since that's an error.
	 */
	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
		error = EINVAL;
		goto out;
	}

	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	if (td != NULL)
		td->td_proc->p_stats->p_ru.ru_msgsnd++;
	if (control != NULL)
		clen = control->m_len;
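/*
 * snderr() records an error and bails out to the 'release' label, which
 * expects the send sockbuf lock to be held.
 */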
#define	snderr(errno)	{ error = (errno); goto release; }

	SOCKBUF_LOCK(&so->so_snd);
restart:
	SOCKBUF_LOCK_ASSERT(&so->so_snd);
	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out_locked;
	do {
		SOCKBUF_LOCK_ASSERT(&so->so_snd);
		if (so->so_snd.sb_state & SBS_CANTSENDMORE)
			snderr(EPIPE);
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			/*
			 * `sendto' and `sendmsg' are allowed on a connection-
			 * based socket if it supports implied connect.
			 * Return ENOTCONN if not connected and no address is
			 * supplied.
			 */
			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
				    !(resid == 0 && clen != 0))
					snderr(ENOTCONN);
			} else if (addr == NULL)
			    snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
				   ENOTCONN : EDESTADDRREQ);
		}
		space = sbspace(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if ((atomic && resid > so->so_snd.sb_hiwat) ||
		    clen > so->so_snd.sb_hiwat)
			snderr(EMSGSIZE);
		if (space < resid + clen &&
		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
			if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO))
				snderr(EWOULDBLOCK);
			sbunlock(&so->so_snd);
			error = sbwait(&so->so_snd);
			if (error)
				goto out_locked;
			goto restart;
		}
		SOCKBUF_UNLOCK(&so->so_snd);
		mp = &top;
		space -= clen;
		do {
		    if (uio == NULL) {
			/*
			 * Data is prepackaged in "top".
			 */
			resid = 0;
			if (flags & MSG_EOR)
				top->m_flags |= M_EOR;
		    } else do {
#ifdef ZERO_COPY_SOCKETS
			cow_send = 0;
#endif /* ZERO_COPY_SOCKETS */
			if (resid >= MINCLSIZE) {
#ifdef ZERO_COPY_SOCKETS
				if (top == NULL) {
					MGETHDR(m, M_TRYWAIT, MT_DATA);
					if (m == NULL) {
						error = ENOBUFS;
						SOCKBUF_LOCK(&so->so_snd);
						goto release;
					}
					m->m_pkthdr.len = 0;
					m->m_pkthdr.rcvif = (struct ifnet *)0;
				} else {
					MGET(m, M_TRYWAIT, MT_DATA);
					if (m == NULL) {
						error = ENOBUFS;
						SOCKBUF_LOCK(&so->so_snd);
						goto release;
					}
				}
				if (so_zero_copy_send &&
				    resid>=PAGE_SIZE &&
				    space>=PAGE_SIZE &&
				    uio->uio_iov->iov_len>=PAGE_SIZE) {
					so_zerocp_stats.size_ok++;
					if (!((vm_offset_t)
					  uio->uio_iov->iov_base & PAGE_MASK)){
						so_zerocp_stats.align_ok++;
						cow_send = socow_setup(m, uio);
					}
				}
				if (!cow_send) {
					MCLGET(m, M_TRYWAIT);
					if ((m->m_flags & M_EXT) == 0) {
						m_free(m);
						m = NULL;
					} else {
						len = min(min(MCLBYTES, resid), space);
					}
				} else
					len = PAGE_SIZE;
#else /* ZERO_COPY_SOCKETS */
				if (top == NULL) {
					m = m_getcl(M_TRYWAIT, MT_DATA, M_PKTHDR);
					m->m_pkthdr.len = 0;
					m->m_pkthdr.rcvif = (struct ifnet *)0;
				} else
					m = m_getcl(M_TRYWAIT, MT_DATA, 0);
				len = min(min(MCLBYTES, resid), space);
#endif /* ZERO_COPY_SOCKETS */
			} else {
				if (top == NULL) {
					m = m_gethdr(M_TRYWAIT, MT_DATA);
					m->m_pkthdr.len = 0;
					m->m_pkthdr.rcvif = (struct ifnet *)0;

					len = min(min(MHLEN, resid), space);
					/*
					 * For datagram protocols, leave room
					 * for protocol headers in first mbuf.
					 */
					if (atomic && m && len < MHLEN)
						MH_ALIGN(m, len);
				} else {
					m = m_get(M_TRYWAIT, MT_DATA);
					len = min(min(MLEN, resid), space);
				}
			}
			if (m == NULL) {
				error = ENOBUFS;
				SOCKBUF_LOCK(&so->so_snd);
				goto release;
			}

			space -= len;
#ifdef ZERO_COPY_SOCKETS
			if (cow_send)
				error = 0;
			else
#endif /* ZERO_COPY_SOCKETS */
			error = uiomove(mtod(m, void *), (int)len, uio);
			resid = uio->uio_resid;
			m->m_len = len;
			*mp = m;
			top->m_pkthdr.len += len;
			if (error) {
				SOCKBUF_LOCK(&so->so_snd);
				goto release;
			}
			mp = &m->m_next;
			if (resid <= 0) {
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
				break;
			}
		    } while (space > 0 && atomic);
		    if (dontroute) {
			    SOCK_LOCK(so);
			    so->so_options |= SO_DONTROUTE;
			    SOCK_UNLOCK(so);
		    }
		    /*
		     * XXX all the SBS_CANTSENDMORE checks previously
		     * done could be out of date.  We could have received
		     * a reset packet in an interrupt or maybe we slept
		     * while doing page faults in uiomove() etc.  We could
		     * probably recheck again inside the locking protection
		     * here, but there are probably other places that this
		     * also happens.  We must rethink this.
		     */
		    error = (*so->so_proto->pr_usrreqs->pru_send)(so,
			(flags & MSG_OOB) ? PRUS_OOB :
			/*
			 * If the user set MSG_EOF, the protocol
			 * understands this flag, and there is nothing
			 * left to send, then use PRU_SEND_EOF instead
			 * of PRU_SEND.
			 */
			((flags & MSG_EOF) &&
			 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
			 (resid <= 0)) ?
				PRUS_EOF :
			/* If there is more to send set PRUS_MORETOCOME */
			(resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
			top, addr, control, td);
		    if (dontroute) {
			    SOCK_LOCK(so);
			    so->so_options &= ~SO_DONTROUTE;
			    SOCK_UNLOCK(so);
		    }
		    clen = 0;
		    control = NULL;
		    top = NULL;
		    mp = &top;
		    if (error) {
			SOCKBUF_LOCK(&so->so_snd);
			goto release;
		    }
		} while (resid && space > 0);
		SOCKBUF_LOCK(&so->so_snd);
	} while (resid);

release:
	SOCKBUF_LOCK_ASSERT(&so->so_snd);
	sbunlock(&so->so_snd);
out_locked:
	SOCKBUF_LOCK_ASSERT(&so->so_snd);
	SOCKBUF_UNLOCK(&so->so_snd);
out:
	if (top != NULL)
		m_freem(top);
	if (control != NULL)
		m_freem(control);
	return (error);
}

/*
 * The part of soreceive() that implements reading non-inline out-of-band
 * data from a socket.  For more complete comments, see soreceive(), from
 * which this code originated.
 *
 * XXXRW: Note that soreceive_rcvoob(), unlike the remainder of soreceive(),
 * is unable to return an mbuf chain to the caller.
 */
static int
soreceive_rcvoob(so, uio, flags)
	struct socket *so;
	struct uio *uio;
	int flags;
{
	struct protosw *pr = so->so_proto;
	struct mbuf *m;
	int error;

	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));

	m = m_get(M_TRYWAIT, MT_DATA);
	if (m == NULL)
		return (ENOBUFS);
	error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
	if (error)
		goto bad;
	do {
#ifdef ZERO_COPY_SOCKETS
		if (so_zero_copy_receive) {
			vm_page_t pg;
			int disposable;

			if ((m->m_flags & M_EXT)
			 && (m->m_ext.ext_type == EXT_DISPOSABLE))
				disposable = 1;
			else
				disposable = 0;

			pg = PHYS_TO_VM_PAGE(vtophys(mtod(m, caddr_t)));
			if (uio->uio_offset == -1)
				uio->uio_offset =IDX_TO_OFF(pg->pindex);

			error = uiomoveco(mtod(m, void *),
					  min(uio->uio_resid, m->m_len),
					  uio, pg->object,
					  disposable);
		} else
#endif /* ZERO_COPY_SOCKETS */
		error = uiomove(mtod(m, void *),
		    (int) min(uio->uio_resid, m->m_len), uio);
		m = m_free(m);
	} while (uio->uio_resid && error == 0 && m);
bad:
	if (m != NULL)
		m_freem(m);
	return (error);
}

/*
 * Following replacement or removal of the first mbuf on the first mbuf chain
 * of a socket buffer, push necessary state changes back into the socket
 * buffer so that other consumers see the values consistently.  'nextrecord'
 * is the caller's locally stored value of the original value of
 * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
 * NOTE: 'nextrecord' may be NULL.
 */
static __inline void
sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
{

	SOCKBUF_LOCK_ASSERT(sb);
	/*
	 * First, update for the new value of nextrecord.  If necessary, make
	 * it the first record.
	 */
	if (sb->sb_mb != NULL)
		sb->sb_mb->m_nextpkt = nextrecord;
	else
		sb->sb_mb = nextrecord;

	/*
	 * Now update any dependent socket buffer fields to reflect the new
	 * state.  This is an expanded inline of SB_EMPTY_FIXUP(), with the
	 * addition of a second clause that takes care of the case where
	 * sb_mb has been updated, but remains the last record.
	 */
	if (sb->sb_mb == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (sb->sb_mb->m_nextpkt == NULL)
		sb->sb_lastrecord = sb->sb_mb;
}


/*
 * Implement receive operations on a socket.
 * We depend on the way that records are added to the sockbuf
 * by sbappend*.  In particular, each record (mbufs linked through m_next)
 * must begin with an address if the protocol so specifies,
 * followed by an optional mbuf or mbufs containing ancillary data,
 * and then zero or more mbufs of data.
 * In order to avoid blocking network interrupts for the entire time here,
 * we splx() while doing the actual copy to user space.
 * Although the sockbuf is locked, new data may still be appended,
 * and thus we must maintain consistency of the sockbuf during that time.
 *
 * The caller may receive the data as a single mbuf chain by supplying
 * an mbuf **mp0 for use in returning the chain.  The uio is then used
 * only for the count in uio_resid.
 */
int
soreceive(so, psa, uio, mp0, controlp, flagsp)
	struct socket *so;
	struct sockaddr **psa;
	struct uio *uio;
	struct mbuf **mp0;
	struct mbuf **controlp;
	int *flagsp;
{
	struct mbuf *m, **mp;
	int flags, len, error, offset;
	struct protosw *pr = so->so_proto;
	struct mbuf *nextrecord;
	int moff, type = 0;
	int orig_resid = uio->uio_resid;

	mp = mp0;
	if (psa != NULL)
		*psa = NULL;
	if (controlp != NULL)
		*controlp = NULL;
	if (flagsp != NULL)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB)
		return (soreceive_rcvoob(so, uio, flags));
	if (mp != NULL)
		*mp = NULL;
	if (so->so_state & SS_ISCONFIRMING && uio->uio_resid)
		(*pr->pr_usrreqs->pru_rcvd)(so, 0);

	SOCKBUF_LOCK(&so->so_rcv);
restart:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
	if (error)
		goto out;

	m = so->so_rcv.sb_mb;
	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark, or
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat).
	 *   3. MSG_DONTWAIT is not set
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
	 */
	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < uio->uio_resid) &&
	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
		KASSERT(m != NULL || !so->so_rcv.sb_cc,
		    ("receive: m == %p so->so_rcv.sb_cc == %u",
		    m, so->so_rcv.sb_cc));
		if (so->so_error) {
			if (m != NULL)
				goto dontblock;
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
		if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
			if (m)
				goto dontblock;
			else
				goto release;
		}
		for (; m != NULL; m = m->m_next)
			if (m->m_type == MT_OOBDATA  || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio->uio_resid == 0)
			goto release;
		if ((so->so_state & SS_NBIO) ||
		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
			error = EWOULDBLOCK;
			goto release;
		}
		SBLASTRECORDCHK(&so->so_rcv);
		SBLASTMBUFCHK(&so->so_rcv);
		sbunlock(&so->so_rcv);
		error = sbwait(&so->so_rcv);
		if (error)
			goto out;
		goto restart;
	}
dontblock:
	/*
	 * From this point onward, we maintain 'nextrecord' as a cache of the
	 * pointer to the next record in the socket buffer.  We must keep the
	 * various socket buffer pointers and local stack versions of the
	 * pointers in sync, pushing out modifications before dropping the
	 * socket buffer mutex, and re-reading them when picking it up.
	 *
	 * Otherwise, we will race with the network stack appending new data
	 * or records onto the socket buffer by using inconsistent/stale
	 * versions of the field, possibly resulting in socket buffer
	 * corruption.
	 *
	 * By holding the high-level sblock(), we prevent simultaneous
	 * readers from pulling off the front of the socket buffer.
	 */
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	if (uio->uio_td)
		uio->uio_td->td_proc->p_stats->p_ru.ru_msgrcv++;
	KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
	SBLASTRECORDCHK(&so->so_rcv);
	SBLASTMBUFCHK(&so->so_rcv);
	nextrecord = m->m_nextpkt;
	if (pr->pr_flags & PR_ADDR) {
		KASSERT(m->m_type == MT_SONAME,
		    ("m->m_type == %d", m->m_type));
		orig_resid = 0;
		if (psa != NULL)
			*psa = sodupsockaddr(mtod(m, struct sockaddr *),
			    M_NOWAIT);
		if (flags & MSG_PEEK) {
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			so->so_rcv.sb_mb = m_free(m);
			m = so->so_rcv.sb_mb;
			sockbuf_pushsync(&so->so_rcv, nextrecord);
		}
	}

	/*
	 * Process one or more MT_CONTROL mbufs present before any data mbufs
	 * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
	 * just copy the data; if !MSG_PEEK, we call into the protocol to
	 * perform externalization (or freeing if controlp == NULL).
	 */
	if (m != NULL && m->m_type == MT_CONTROL) {
		struct mbuf *cm = NULL, *cmn;
		struct mbuf **cme = &cm;

		do {
			if (flags & MSG_PEEK) {
				if (controlp != NULL) {
					*controlp = m_copy(m, 0, m->m_len);
					controlp = &(*controlp)->m_next;
				}
				m = m->m_next;
			} else {
				sbfree(&so->so_rcv, m);
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = NULL;
				*cme = m;
				cme = &(*cme)->m_next;
				m = so->so_rcv.sb_mb;
			}
		} while (m != NULL && m->m_type == MT_CONTROL);
		if ((flags & MSG_PEEK) == 0)
			sockbuf_pushsync(&so->so_rcv, nextrecord);
		while (cm != NULL) {
			cmn = cm->m_next;
			cm->m_next = NULL;
			if (pr->pr_domain->dom_externalize != NULL) {
				SOCKBUF_UNLOCK(&so->so_rcv);
				error = (*pr->pr_domain->dom_externalize)
				    (cm, controlp);
				SOCKBUF_LOCK(&so->so_rcv);
			} else if (controlp != NULL)
				*controlp = cm;
			else
				m_freem(cm);
			if (controlp != NULL) {
				orig_resid = 0;
				while (*controlp != NULL)
					controlp = &(*controlp)->m_next;
			}
			cm = cmn;
		}
		nextrecord = so->so_rcv.sb_mb->m_nextpkt;
		orig_resid = 0;
	}
	if (m != NULL) {
		if ((flags & MSG_PEEK) == 0) {
			KASSERT(m->m_nextpkt == nextrecord,
			    ("soreceive: post-control, nextrecord !sync"));
			if (nextrecord == NULL) {
				KASSERT(so->so_rcv.sb_mb == m,
				    ("soreceive: post-control, sb_mb!=m"));
				KASSERT(so->so_rcv.sb_lastrecord == m,
				    ("soreceive: post-control, lastrecord!=m"));
			}
		}
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
	} else {
		if ((flags & MSG_PEEK) == 0) {
			KASSERT(so->so_rcv.sb_mb == nextrecord,
			    ("soreceive: sb_mb != nextrecord"));
			if (so->so_rcv.sb_mb == NULL) {
				KASSERT(so->so_rcv.sb_lastrecord == NULL,
				    ("soreceive: sb_lastrecord != NULL"));
			}
		}
	}
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	SBLASTRECORDCHK(&so->so_rcv);
	SBLASTMBUFCHK(&so->so_rcv);

	/*
	 * Now continue to read any data mbufs off of the head of the socket
	 * buffer until the read request is satisfied.  Note that 'type' is
	 * used to store the type of any mbuf reads that have happened so far
	 * such that soreceive() can stop reading if the type changes, which
	 * causes soreceive() to return only one of regular data and inline
	 * out-of-band data in a single socket receive operation.
	 */
	moff = 0;
	offset = 0;
	while (m != NULL && uio->uio_resid > 0 && error == 0) {
		/*
		 * If the type of mbuf has changed since the last mbuf
		 * examined ('type'), end the receive operation.
		 */
		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
				break;
		} else if (type == MT_OOBDATA)
			break;
		else
		    KASSERT(m->m_type == MT_DATA || m->m_type == MT_HEADER,
			("m->m_type == %d", m->m_type));
		so->so_rcv.sb_state &= ~SBS_RCVATMARK;
		len = uio->uio_resid;
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * Sockbuf must be consistent here (points to current mbuf,
		 * it points to next record) when we drop priority;
		 * we must note any additions to the sockbuf when we
		 * block interrupts again.
		 */
		if (mp == NULL) {
			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
			SBLASTRECORDCHK(&so->so_rcv);
			SBLASTMBUFCHK(&so->so_rcv);
			SOCKBUF_UNLOCK(&so->so_rcv);
#ifdef ZERO_COPY_SOCKETS
			if (so_zero_copy_receive) {
				vm_page_t pg;
				int disposable;

				if ((m->m_flags & M_EXT)
				 && (m->m_ext.ext_type == EXT_DISPOSABLE))
					disposable = 1;
				else
					disposable = 0;

				pg = PHYS_TO_VM_PAGE(vtophys(mtod(m, caddr_t) +
					moff));

				if (uio->uio_offset == -1)
					uio->uio_offset =IDX_TO_OFF(pg->pindex);

				error = uiomoveco(mtod(m, char *) + moff,
						  (int)len, uio,pg->object,
						  disposable);
			} else
#endif /* ZERO_COPY_SOCKETS */
			error = uiomove(mtod(m, char *) + moff, (int)len, uio);
			SOCKBUF_LOCK(&so->so_rcv);
			if (error)
				goto release;
		} else
			uio->uio_resid -= len;
		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
		if (len == m->m_len - moff) {
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(&so->so_rcv, m);
				if (mp != NULL) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = NULL;
				} else {
					so->so_rcv.sb_mb = m_free(m);
					m = so->so_rcv.sb_mb;
				}
				if (m != NULL) {
					m->m_nextpkt = nextrecord;
					if (nextrecord == NULL)
						so->so_rcv.sb_lastrecord = m;
				} else {
					so->so_rcv.sb_mb = nextrecord;
					SB_EMPTY_FIXUP(&so->so_rcv);
				}
				SBLASTRECORDCHK(&so->so_rcv);
				SBLASTMBUFCHK(&so->so_rcv);
			}
		} else {
			if (flags & MSG_PEEK)
				moff += len;
			else {
				if (mp != NULL) {
					SOCKBUF_UNLOCK(&so->so_rcv);
					*mp = m_copym(m, 0, len, M_TRYWAIT);
					SOCKBUF_LOCK(&so->so_rcv);
				}
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
			}
		}
		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_rcv.sb_state |= SBS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket),
		 * we must not quit until "uio->uio_resid == 0" or an error
		 * termination.  If a signal/timeout occurs, return
		 * with a short count but without error.
		 * Keep sockbuf locked against other readers.
		 */
		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
		    !sosendallatonce(so) && nextrecord == NULL) {
			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
			if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE)
				break;
			/*
			 * Notify the protocol that some data has been
			 * drained before blocking.
			 */
			if (pr->pr_flags & PR_WANTRCVD && so->so_pcb != NULL) {
				SOCKBUF_UNLOCK(&so->so_rcv);
				(*pr->pr_usrreqs->pru_rcvd)(so, flags);
				SOCKBUF_LOCK(&so->so_rcv);
			}
			SBLASTRECORDCHK(&so->so_rcv);
			SBLASTMBUFCHK(&so->so_rcv);
			error = sbwait(&so->so_rcv);
			if (error)
				goto release;
			m = so->so_rcv.sb_mb;
			if (m != NULL)
				nextrecord = m->m_nextpkt;
		}
	}

	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	if (m != NULL && pr->pr_flags & PR_ATOMIC) {
		flags |= MSG_TRUNC;
		if ((flags & MSG_PEEK) == 0)
			(void) sbdroprecord_locked(&so->so_rcv);
	}
	if ((flags & MSG_PEEK) == 0) {
		if (m == NULL) {
			/*
			 * First part is an inline SB_EMPTY_FIXUP().  Second
			 * part makes sure sb_lastrecord is up-to-date if
			 * there is still data in the socket buffer.
			 */
			so->so_rcv.sb_mb = nextrecord;
			if (so->so_rcv.sb_mb == NULL) {
				so->so_rcv.sb_mbtail = NULL;
				so->so_rcv.sb_lastrecord = NULL;
			} else if (nextrecord->m_nextpkt == NULL)
				so->so_rcv.sb_lastrecord = nextrecord;
		}
		SBLASTRECORDCHK(&so->so_rcv);
		SBLASTMBUFCHK(&so->so_rcv);
		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
			SOCKBUF_UNLOCK(&so->so_rcv);
			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
			SOCKBUF_LOCK(&so->so_rcv);
		}
	}
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	if (orig_resid == uio->uio_resid && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
		sbunlock(&so->so_rcv);
		goto restart;
	}

	if (flagsp != NULL)
		*flagsp |= flags;
release:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	sbunlock(&so->so_rcv);
out:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	SOCKBUF_UNLOCK(&so->so_rcv);
	return (error);
}

int
soshutdown(so, how)
	struct socket *so;
	int how;
{
	struct protosw *pr = so->so_proto;

	if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
		return (EINVAL);

	if (how != SHUT_WR)
		sorflush(so);
	if (how != SHUT_RD)
		return ((*pr->pr_usrreqs->pru_shutdown)(so));
	return (0);
}

void
sorflush(so)
	struct socket *so;
{
	struct sockbuf *sb = &so->so_rcv;
	struct protosw *pr = so->so_proto;
	struct sockbuf asb;

	/*
	 * XXXRW: This is quite ugly.  The existing code made a copy of the
	 * socket buffer, then zero'd the original to clear the buffer
	 * fields.  However, with mutexes in the socket buffer, this causes
	 * problems.  We only clear the zeroable bits of the original;
	 * however, we have to initialize and destroy the mutex in the copy
	 * so that dom_dispose() and sbrelease() can lock it as needed.
	 */
	SOCKBUF_LOCK(sb);
	sb->sb_flags |= SB_NOINTR;
	(void) sblock(sb, M_WAITOK);
	/*
	 * socantrcvmore_locked() drops the socket buffer mutex so that it
	 * can safely perform wakeups.  Re-acquire the mutex before
	 * continuing.
	 */
	socantrcvmore_locked(so);
	SOCKBUF_LOCK(sb);
	sbunlock(sb);
	/*
	 * Invalidate/clear most of the sockbuf structure, but leave
	 * selinfo and mutex data unchanged.
	 */
	bzero(&asb, offsetof(struct sockbuf, sb_startzero));
	bcopy(&sb->sb_startzero, &asb.sb_startzero,
	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
	bzero(&sb->sb_startzero,
	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
	SOCKBUF_UNLOCK(sb);

	SOCKBUF_LOCK_INIT(&asb, "so_rcv");
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
	sbrelease(&asb, so);
	SOCKBUF_LOCK_DESTROY(&asb);
}

144233965Sjdp#ifdef INET
144333965Sjdpstatic int
144433965Sjdpdo_setopt_accept_filter(so, sopt)
144533965Sjdp	struct	socket *so;
144633965Sjdp	struct	sockopt *sopt;
144733965Sjdp{
144833965Sjdp	struct accept_filter_arg	*afap;
144933965Sjdp	struct accept_filter	*afp;
145033965Sjdp	struct so_accf	*newaf;
145133965Sjdp	int	error = 0;
145233965Sjdp
145333965Sjdp	newaf = NULL;
145433965Sjdp	afap = NULL;
145533965Sjdp
145633965Sjdp	/*
145733965Sjdp	 * XXXRW: Configuring accept filters should be an atomic test-and-set
145833965Sjdp	 * operation to prevent races during setup and attach.  There may be
145933965Sjdp	 * more general issues of racing and ordering here that are not yet
146033965Sjdp	 * addressed by locking.
146133965Sjdp	 */
146233965Sjdp	/* do not set/remove accept filters on non-listening sockets */
146333965Sjdp	SOCK_LOCK(so);
146433965Sjdp	if ((so->so_options & SO_ACCEPTCONN) == 0) {
146533965Sjdp		SOCK_UNLOCK(so);
146633965Sjdp		return (EINVAL);
146733965Sjdp	}
146833965Sjdp
146933965Sjdp	/* removing the filter */
147033965Sjdp	if (sopt == NULL) {
147133965Sjdp		if (so->so_accf != NULL) {
147233965Sjdp			struct so_accf *af = so->so_accf;
147333965Sjdp			if (af->so_accept_filter != NULL &&
147433965Sjdp				af->so_accept_filter->accf_destroy != NULL) {
147533965Sjdp				af->so_accept_filter->accf_destroy(so);
147633965Sjdp			}
147733965Sjdp			if (af->so_accept_filter_str != NULL) {
147833965Sjdp				FREE(af->so_accept_filter_str, M_ACCF);
147933965Sjdp			}
148033965Sjdp			FREE(af, M_ACCF);
148133965Sjdp			so->so_accf = NULL;
148233965Sjdp		}
148333965Sjdp		so->so_options &= ~SO_ACCEPTFILTER;
148433965Sjdp		SOCK_UNLOCK(so);
148533965Sjdp		return (0);
148633965Sjdp	}
148733965Sjdp	SOCK_UNLOCK(so);
148833965Sjdp
148933965Sjdp	/*-
149033965Sjdp	 * Adding a filter.
149133965Sjdp	 *
149233965Sjdp	 * Do memory allocation, copyin, and filter lookup now while we're
149333965Sjdp	 * not holding any locks.  Avoids sleeping with a mutex, as well as
149433965Sjdp	 * introducing a lock order between accept filter locks and socket
149533965Sjdp	 * locks here.
149633965Sjdp	 */
149733965Sjdp	MALLOC(afap, struct accept_filter_arg *, sizeof(*afap), M_TEMP,
149833965Sjdp	    M_WAITOK);
149933965Sjdp	/* don't put large objects on the kernel stack */
150033965Sjdp	error = sooptcopyin(sopt, afap, sizeof *afap, sizeof *afap);
150133965Sjdp	afap->af_name[sizeof(afap->af_name)-1] = '\0';
150233965Sjdp	afap->af_arg[sizeof(afap->af_arg)-1] = '\0';
150333965Sjdp	if (error) {
150433965Sjdp		FREE(afap, M_TEMP);
150533965Sjdp		return (error);
150633965Sjdp	}
150733965Sjdp	afp = accept_filt_get(afap->af_name);
150833965Sjdp	if (afp == NULL) {
150933965Sjdp		FREE(afap, M_TEMP);
151033965Sjdp		return (ENOENT);
151133965Sjdp	}
151233965Sjdp
151333965Sjdp	/*
151433965Sjdp	 * Allocate the new accept filter instance storage.  We may have to
151533965Sjdp	 * free it again later if we fail to attach it.  If attached
151633965Sjdp	 * properly, 'newaf' is NULLed to avoid a free() while in use.
151733965Sjdp	 */
151833965Sjdp	MALLOC(newaf, struct so_accf *, sizeof(*newaf), M_ACCF, M_WAITOK |
151933965Sjdp	    M_ZERO);
152033965Sjdp	if (afp->accf_create != NULL && afap->af_name[0] != '\0') {
152133965Sjdp		int len = strlen(afap->af_name) + 1;
152233965Sjdp		MALLOC(newaf->so_accept_filter_str, char *, len, M_ACCF,
152333965Sjdp		    M_WAITOK);
152433965Sjdp		strcpy(newaf->so_accept_filter_str, afap->af_name);
152533965Sjdp	}
152633965Sjdp
152733965Sjdp	SOCK_LOCK(so);
152833965Sjdp	/* must remove previous filter first */
152933965Sjdp	if (so->so_accf != NULL) {
153033965Sjdp		error = EINVAL;
153133965Sjdp		goto out;
153233965Sjdp	}
153333965Sjdp	/*
153433965Sjdp	 * Invoke the accf_create() method of the filter if required.
153533965Sjdp	 * XXXRW: the socket mutex is held over this call, so the create
153633965Sjdp	 * method cannot block.  This may be something we have to change, but
153733965Sjdp	 * it would require addressing possible races.
153833965Sjdp	 */
153933965Sjdp	if (afp->accf_create != NULL) {
154033965Sjdp		newaf->so_accept_filter_arg =
154133965Sjdp		    afp->accf_create(so, afap->af_arg);
154233965Sjdp		if (newaf->so_accept_filter_arg == NULL) {
154333965Sjdp			error = EINVAL;
154433965Sjdp			goto out;
154533965Sjdp		}
154633965Sjdp	}
154733965Sjdp	newaf->so_accept_filter = afp;
154833965Sjdp	so->so_accf = newaf;
154933965Sjdp	so->so_options |= SO_ACCEPTFILTER;
155033965Sjdp	newaf = NULL;
155133965Sjdpout:
155233965Sjdp	SOCK_UNLOCK(so);
155333965Sjdp	if (newaf != NULL) {
155433965Sjdp		if (newaf->so_accept_filter_str != NULL)
155533965Sjdp			FREE(newaf->so_accept_filter_str, M_ACCF);
155633965Sjdp		FREE(newaf, M_ACCF);
155733965Sjdp	}
155833965Sjdp	if (afap != NULL)
155933965Sjdp		FREE(afap, M_TEMP);
156033965Sjdp	return (error);
156133965Sjdp}
156233965Sjdp#endif /* INET */
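
/*
 * Usage sketch (userland side): SO_ACCEPTFILTER is normally set on a
 * listening socket with setsockopt(2), for example to defer accept(2)
 * until a complete HTTP request has arrived, assuming the accf_http(9)
 * module is loaded and registers a filter named "httpready":
 *
 *	struct accept_filter_arg afa;
 *
 *	bzero(&afa, sizeof(afa));
 *	strcpy(afa.af_name, "httpready");
 *	setsockopt(s, SOL_SOCKET, SO_ACCEPTFILTER, &afa, sizeof(afa));
 *
 * The sopt == NULL branch above handles detaching an existing filter.
 */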
156333965Sjdp
156433965Sjdp/*
156533965Sjdp * Perhaps this routine, and sooptcopyout(), below, ought to come in
156633965Sjdp * an additional variant to handle the case where the option value needs
156733965Sjdp * to be some kind of integer, but not a specific size.
156833965Sjdp * In addition to their use here, these functions are also called by the
156933965Sjdp * protocol-level pr_ctloutput() routines.
157033965Sjdp */
157133965Sjdpint
157233965Sjdpsooptcopyin(sopt, buf, len, minlen)
157333965Sjdp	struct	sockopt *sopt;
157433965Sjdp	void	*buf;
157533965Sjdp	size_t	len;
157633965Sjdp	size_t	minlen;
157733965Sjdp{
157833965Sjdp	size_t	valsize;
157933965Sjdp
158033965Sjdp	/*
158133965Sjdp	 * If the user gives us more than we wanted, we ignore it,
158233965Sjdp	 * but if we don't get the minimum length the caller
158333965Sjdp	 * wants, we return EINVAL.  On success, sopt->sopt_valsize
158433965Sjdp	 * is set to however much we actually retrieved.
158533965Sjdp	 */
158633965Sjdp	if ((valsize = sopt->sopt_valsize) < minlen)
158733965Sjdp		return EINVAL;
158833965Sjdp	if (valsize > len)
158933965Sjdp		sopt->sopt_valsize = valsize = len;
159033965Sjdp
159133965Sjdp	if (sopt->sopt_td != NULL)
159233965Sjdp		return (copyin(sopt->sopt_val, buf, valsize));
159333965Sjdp
159433965Sjdp	bcopy(sopt->sopt_val, buf, valsize);
159533965Sjdp	return 0;
159633965Sjdp}
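
/*
 * Usage sketch: a protocol's pr_ctloutput() handler typically pulls a
 * fixed-size integer argument out of the sockopt much as the SOL_SOCKET
 * cases in sosetopt() below do:
 *
 *	int optval;
 *
 *	error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
 *	if (error)
 *		return (error);
 *
 * Passing sizeof(optval) for both len and minlen rejects short option
 * values while silently ignoring any excess bytes the caller supplied.
 */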
159733965Sjdp
159833965Sjdp/*
159933965Sjdp * Kernel version of setsockopt(2).
160033965Sjdp * XXX: optlen is size_t, not socklen_t
160133965Sjdp */
160233965Sjdpint
160333965Sjdpso_setsockopt(struct socket *so, int level, int optname, void *optval,
160433965Sjdp    size_t optlen)
160533965Sjdp{
160633965Sjdp	struct sockopt sopt;
160733965Sjdp
160833965Sjdp	sopt.sopt_level = level;
160933965Sjdp	sopt.sopt_name = optname;
161033965Sjdp	sopt.sopt_dir = SOPT_SET;
161133965Sjdp	sopt.sopt_val = optval;
161233965Sjdp	sopt.sopt_valsize = optlen;
161333965Sjdp	sopt.sopt_td = NULL;
161433965Sjdp	return (sosetopt(so, &sopt));
161533965Sjdp}
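
/*
 * Usage sketch: an in-kernel consumer of sockets can set options without
 * building a struct sockopt by hand; for example, enabling keep-alives on
 * a kernel-owned socket might look like
 *
 *	int one = 1;
 *
 *	error = so_setsockopt(so, SOL_SOCKET, SO_KEEPALIVE, &one,
 *	    sizeof(one));
 *
 * Because sopt_td is left NULL, sosetopt() copies the value with bcopy()
 * rather than copyin().
 */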
161633965Sjdp
161733965Sjdpint
161833965Sjdpsosetopt(so, sopt)
161933965Sjdp	struct socket *so;
162033965Sjdp	struct sockopt *sopt;
162133965Sjdp{
162233965Sjdp	int	error, optval;
162333965Sjdp	struct	linger l;
162433965Sjdp	struct	timeval tv;
162533965Sjdp	u_long  val;
162633965Sjdp#ifdef MAC
162733965Sjdp	struct mac extmac;
162833965Sjdp#endif
162933965Sjdp
163033965Sjdp	error = 0;
163133965Sjdp	if (sopt->sopt_level != SOL_SOCKET) {
163233965Sjdp		if (so->so_proto && so->so_proto->pr_ctloutput)
163333965Sjdp			return ((*so->so_proto->pr_ctloutput)
163433965Sjdp				  (so, sopt));
163533965Sjdp		error = ENOPROTOOPT;
163633965Sjdp	} else {
163733965Sjdp		switch (sopt->sopt_name) {
163833965Sjdp#ifdef INET
163933965Sjdp		case SO_ACCEPTFILTER:
164033965Sjdp			error = do_setopt_accept_filter(so, sopt);
164133965Sjdp			if (error)
164233965Sjdp				goto bad;
164333965Sjdp			break;
164433965Sjdp#endif
164533965Sjdp		case SO_LINGER:
164633965Sjdp			error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
164733965Sjdp			if (error)
164833965Sjdp				goto bad;
164933965Sjdp
165033965Sjdp			SOCK_LOCK(so);
165133965Sjdp			so->so_linger = l.l_linger;
165233965Sjdp			if (l.l_onoff)
165333965Sjdp				so->so_options |= SO_LINGER;
165433965Sjdp			else
165533965Sjdp				so->so_options &= ~SO_LINGER;
165633965Sjdp			SOCK_UNLOCK(so);
165733965Sjdp			break;
165833965Sjdp
165933965Sjdp		case SO_DEBUG:
166033965Sjdp		case SO_KEEPALIVE:
166133965Sjdp		case SO_DONTROUTE:
166233965Sjdp		case SO_USELOOPBACK:
166333965Sjdp		case SO_BROADCAST:
166433965Sjdp		case SO_REUSEADDR:
166533965Sjdp		case SO_REUSEPORT:
166633965Sjdp		case SO_OOBINLINE:
166733965Sjdp		case SO_TIMESTAMP:
166833965Sjdp		case SO_BINTIME:
166933965Sjdp		case SO_NOSIGPIPE:
167033965Sjdp			error = sooptcopyin(sopt, &optval, sizeof optval,
167133965Sjdp					    sizeof optval);
167233965Sjdp			if (error)
167333965Sjdp				goto bad;
167433965Sjdp			SOCK_LOCK(so);
167533965Sjdp			if (optval)
167633965Sjdp				so->so_options |= sopt->sopt_name;
167733965Sjdp			else
167833965Sjdp				so->so_options &= ~sopt->sopt_name;
167933965Sjdp			SOCK_UNLOCK(so);
168033965Sjdp			break;
168133965Sjdp
168233965Sjdp		case SO_SNDBUF:
168333965Sjdp		case SO_RCVBUF:
168433965Sjdp		case SO_SNDLOWAT:
168533965Sjdp		case SO_RCVLOWAT:
168633965Sjdp			error = sooptcopyin(sopt, &optval, sizeof optval,
168733965Sjdp					    sizeof optval);
168833965Sjdp			if (error)
168933965Sjdp				goto bad;
169033965Sjdp
169133965Sjdp			/*
169233965Sjdp			 * Values < 1 make no sense for any of these
169333965Sjdp			 * options, so disallow them.
169433965Sjdp			 */
169533965Sjdp			if (optval < 1) {
169633965Sjdp				error = EINVAL;
169733965Sjdp				goto bad;
169833965Sjdp			}
169933965Sjdp
170033965Sjdp			switch (sopt->sopt_name) {
170133965Sjdp			case SO_SNDBUF:
170233965Sjdp			case SO_RCVBUF:
170333965Sjdp				if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
170433965Sjdp				    &so->so_snd : &so->so_rcv, (u_long)optval,
170533965Sjdp				    so, curthread) == 0) {
170633965Sjdp					error = ENOBUFS;
170733965Sjdp					goto bad;
170833965Sjdp				}
170933965Sjdp				break;
171033965Sjdp
171133965Sjdp			/*
171233965Sjdp			 * Make sure the low-water is never greater than
171333965Sjdp			 * the high-water.
171433965Sjdp			 */
171533965Sjdp			case SO_SNDLOWAT:
171633965Sjdp				SOCKBUF_LOCK(&so->so_snd);
171733965Sjdp				so->so_snd.sb_lowat =
171833965Sjdp				    (optval > so->so_snd.sb_hiwat) ?
171933965Sjdp				    so->so_snd.sb_hiwat : optval;
172033965Sjdp				SOCKBUF_UNLOCK(&so->so_snd);
172133965Sjdp				break;
172233965Sjdp			case SO_RCVLOWAT:
172333965Sjdp				SOCKBUF_LOCK(&so->so_rcv);
172433965Sjdp				so->so_rcv.sb_lowat =
172533965Sjdp				    (optval > so->so_rcv.sb_hiwat) ?
172633965Sjdp				    so->so_rcv.sb_hiwat : optval;
172733965Sjdp				SOCKBUF_UNLOCK(&so->so_rcv);
172833965Sjdp				break;
172933965Sjdp			}
173033965Sjdp			break;
173133965Sjdp
173233965Sjdp		case SO_SNDTIMEO:
173333965Sjdp		case SO_RCVTIMEO:
173433965Sjdp			error = sooptcopyin(sopt, &tv, sizeof tv,
173533965Sjdp					    sizeof tv);
173633965Sjdp			if (error)
173733965Sjdp				goto bad;
173833965Sjdp
173933965Sjdp			/* assert(hz > 0); */
174033965Sjdp			if (tv.tv_sec < 0 || tv.tv_sec > SHRT_MAX / hz ||
174133965Sjdp			    tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
174233965Sjdp				error = EDOM;
174333965Sjdp				goto bad;
174433965Sjdp			}
174533965Sjdp			/* assert(tick > 0); */
174633965Sjdp			/* assert(ULONG_MAX - SHRT_MAX >= 1000000); */
174733965Sjdp			val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick;
174833965Sjdp			if (val > SHRT_MAX) {
174933965Sjdp				error = EDOM;
175033965Sjdp				goto bad;
175133965Sjdp			}
175233965Sjdp			if (val == 0 && tv.tv_usec != 0)
175333965Sjdp				val = 1;
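			/*
			 * Worked example: with hz = 1000 (tick = 1000
			 * microseconds), a request for 2.5 seconds
			 * (tv_sec = 2, tv_usec = 500000) becomes
			 * 2 * 1000 + 500000 / 1000 = 2500 ticks; the
			 * SHRT_MAX check above then caps timeouts at
			 * roughly 32 seconds for that hz.
			 */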
175433965Sjdp
175533965Sjdp			switch (sopt->sopt_name) {
175633965Sjdp			case SO_SNDTIMEO:
175733965Sjdp				so->so_snd.sb_timeo = val;
175833965Sjdp				break;
175933965Sjdp			case SO_RCVTIMEO:
176033965Sjdp				so->so_rcv.sb_timeo = val;
176133965Sjdp				break;
176233965Sjdp			}
176333965Sjdp			break;
176433965Sjdp		case SO_LABEL:
176533965Sjdp#ifdef MAC
176633965Sjdp			error = sooptcopyin(sopt, &extmac, sizeof extmac,
176733965Sjdp			    sizeof extmac);
176833965Sjdp			if (error)
176933965Sjdp				goto bad;
177033965Sjdp			error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
177133965Sjdp			    so, &extmac);
177233965Sjdp#else
177333965Sjdp			error = EOPNOTSUPP;
177433965Sjdp#endif
177533965Sjdp			break;
177633965Sjdp		default:
177733965Sjdp			error = ENOPROTOOPT;
177833965Sjdp			break;
177933965Sjdp		}
178033965Sjdp		if (error == 0 && so->so_proto != NULL &&
178133965Sjdp		    so->so_proto->pr_ctloutput != NULL) {
178233965Sjdp			(void) ((*so->so_proto->pr_ctloutput)
178333965Sjdp				  (so, sopt));
178433965Sjdp		}
178533965Sjdp	}
178633965Sjdpbad:
178733965Sjdp	return (error);
178833965Sjdp}
178933965Sjdp
179033965Sjdp/* Helper routine for getsockopt */
179133965Sjdpint
179233965Sjdpsooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
179333965Sjdp{
179433965Sjdp	int	error;
179533965Sjdp	size_t	valsize;
179633965Sjdp
179733965Sjdp	error = 0;
179833965Sjdp
179933965Sjdp	/*
180033965Sjdp	 * Documented get behavior is that we always return a value,
180133965Sjdp	 * possibly truncated to fit in the user's buffer.
180233965Sjdp	 * Traditional behavior is that we always tell the user
180333965Sjdp	 * precisely how much we copied, rather than something useful
180433965Sjdp	 * like the total amount we had available for her.
180533965Sjdp	 * Note that this interface is not idempotent; the entire answer must
180633965Sjdp	 * be generated ahead of time.
180733965Sjdp	 */
180833965Sjdp	valsize = min(len, sopt->sopt_valsize);
180933965Sjdp	sopt->sopt_valsize = valsize;
181033965Sjdp	if (sopt->sopt_val != NULL) {
181133965Sjdp		if (sopt->sopt_td != NULL)
181233965Sjdp			error = copyout(buf, sopt->sopt_val, valsize);
181333965Sjdp		else
181433965Sjdp			bcopy(buf, sopt->sopt_val, valsize);
181533965Sjdp	}
181633965Sjdp	return error;
181733965Sjdp}
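
/*
 * Usage sketch: the truncation behavior matters when the caller's buffer
 * is smaller than the full answer.  If a handler returns a 4-byte integer
 * but the user passed optlen = 2, only the first two bytes are copied out
 * and sopt_valsize is set to 2, which getsockopt(2) then reports back as
 * the option length.
 */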
181833965Sjdp
181933965Sjdpint
182033965Sjdpsogetopt(so, sopt)
182133965Sjdp	struct socket *so;
182233965Sjdp	struct sockopt *sopt;
182333965Sjdp{
182433965Sjdp	int	error, optval;
182533965Sjdp	struct	linger l;
182633965Sjdp	struct	timeval tv;
182733965Sjdp#ifdef INET
182833965Sjdp	struct accept_filter_arg *afap;
182933965Sjdp#endif
183033965Sjdp#ifdef MAC
183133965Sjdp	struct mac extmac;
183233965Sjdp#endif
183333965Sjdp
183433965Sjdp	error = 0;
183533965Sjdp	if (sopt->sopt_level != SOL_SOCKET) {
183633965Sjdp		if (so->so_proto && so->so_proto->pr_ctloutput) {
183733965Sjdp			return ((*so->so_proto->pr_ctloutput)
183833965Sjdp				  (so, sopt));
183933965Sjdp		} else
184033965Sjdp			return (ENOPROTOOPT);
184133965Sjdp	} else {
184233965Sjdp		switch (sopt->sopt_name) {
184333965Sjdp#ifdef INET
184433965Sjdp		case SO_ACCEPTFILTER:
184533965Sjdp			/* Unlocked read. */
184633965Sjdp			if ((so->so_options & SO_ACCEPTCONN) == 0)
184733965Sjdp				return (EINVAL);
184833965Sjdp			MALLOC(afap, struct accept_filter_arg *, sizeof(*afap),
184933965Sjdp				M_TEMP, M_WAITOK | M_ZERO);
185033965Sjdp			SOCK_LOCK(so);
185133965Sjdp			if ((so->so_options & SO_ACCEPTFILTER) != 0) {
185233965Sjdp				strcpy(afap->af_name, so->so_accf->so_accept_filter->accf_name);
185333965Sjdp				if (so->so_accf->so_accept_filter_str != NULL)
185433965Sjdp					strcpy(afap->af_arg, so->so_accf->so_accept_filter_str);
185533965Sjdp			}
185633965Sjdp			SOCK_UNLOCK(so);
185733965Sjdp			error = sooptcopyout(sopt, afap, sizeof(*afap));
185833965Sjdp			FREE(afap, M_TEMP);
185933965Sjdp			break;
186033965Sjdp#endif
186133965Sjdp
186233965Sjdp		case SO_LINGER:
186333965Sjdp			/*
186433965Sjdp			 * XXXRW: We grab the lock here to get a consistent
186533965Sjdp			 * snapshot of both fields.  This may not really
186633965Sjdp			 * be necessary.
186733965Sjdp			 */
186833965Sjdp			SOCK_LOCK(so);
186933965Sjdp			l.l_onoff = so->so_options & SO_LINGER;
187033965Sjdp			l.l_linger = so->so_linger;
187133965Sjdp			SOCK_UNLOCK(so);
187233965Sjdp			error = sooptcopyout(sopt, &l, sizeof l);
187333965Sjdp			break;
187433965Sjdp
187533965Sjdp		case SO_USELOOPBACK:
187633965Sjdp		case SO_DONTROUTE:
187733965Sjdp		case SO_DEBUG:
187833965Sjdp		case SO_KEEPALIVE:
187933965Sjdp		case SO_REUSEADDR:
188033965Sjdp		case SO_REUSEPORT:
188133965Sjdp		case SO_BROADCAST:
188233965Sjdp		case SO_OOBINLINE:
188333965Sjdp		case SO_TIMESTAMP:
188433965Sjdp		case SO_BINTIME:
188533965Sjdp		case SO_NOSIGPIPE:
188633965Sjdp			optval = so->so_options & sopt->sopt_name;
188733965Sjdpinteger:
188833965Sjdp			error = sooptcopyout(sopt, &optval, sizeof optval);
188933965Sjdp			break;
189033965Sjdp
189133965Sjdp		case SO_TYPE:
189233965Sjdp			optval = so->so_type;
189333965Sjdp			goto integer;
189433965Sjdp
189533965Sjdp		case SO_ERROR:
189633965Sjdp			optval = so->so_error;
189733965Sjdp			so->so_error = 0;
189833965Sjdp			goto integer;
189933965Sjdp
190033965Sjdp		case SO_SNDBUF:
190133965Sjdp			optval = so->so_snd.sb_hiwat;
190233965Sjdp			goto integer;
190333965Sjdp
190433965Sjdp		case SO_RCVBUF:
190533965Sjdp			optval = so->so_rcv.sb_hiwat;
190633965Sjdp			goto integer;
190733965Sjdp
190833965Sjdp		case SO_SNDLOWAT:
190933965Sjdp			optval = so->so_snd.sb_lowat;
191033965Sjdp			goto integer;
191133965Sjdp
191233965Sjdp		case SO_RCVLOWAT:
191333965Sjdp			optval = so->so_rcv.sb_lowat;
191433965Sjdp			goto integer;
191533965Sjdp
191633965Sjdp		case SO_SNDTIMEO:
191733965Sjdp		case SO_RCVTIMEO:
191833965Sjdp			optval = (sopt->sopt_name == SO_SNDTIMEO ?
191933965Sjdp				  so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
192033965Sjdp
192133965Sjdp			tv.tv_sec = optval / hz;
192233965Sjdp			tv.tv_usec = (optval % hz) * tick;
192333965Sjdp			error = sooptcopyout(sopt, &tv, sizeof tv);
192433965Sjdp			break;
192533965Sjdp		case SO_LABEL:
192633965Sjdp#ifdef MAC
192733965Sjdp			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
192833965Sjdp			    sizeof(extmac));
192933965Sjdp			if (error)
193033965Sjdp				return (error);
193133965Sjdp			error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
193233965Sjdp			    so, &extmac);
193333965Sjdp			if (error)
193433965Sjdp				return (error);
193533965Sjdp			error = sooptcopyout(sopt, &extmac, sizeof extmac);
193633965Sjdp#else
193733965Sjdp			error = EOPNOTSUPP;
193833965Sjdp#endif
193933965Sjdp			break;
194033965Sjdp		case SO_PEERLABEL:
194133965Sjdp#ifdef MAC
194233965Sjdp			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
194333965Sjdp			    sizeof(extmac));
194433965Sjdp			if (error)
194533965Sjdp				return (error);
194633965Sjdp			error = mac_getsockopt_peerlabel(
194733965Sjdp			    sopt->sopt_td->td_ucred, so, &extmac);
194833965Sjdp			if (error)
194933965Sjdp				return (error);
195033965Sjdp			error = sooptcopyout(sopt, &extmac, sizeof extmac);
195133965Sjdp#else
195233965Sjdp			error = EOPNOTSUPP;
195333965Sjdp#endif
195433965Sjdp			break;
195533965Sjdp		default:
195633965Sjdp			error = ENOPROTOOPT;
195733965Sjdp			break;
195833965Sjdp		}
195933965Sjdp		return (error);
196033965Sjdp	}
196133965Sjdp}
196233965Sjdp
196333965Sjdp/* XXX: prepare mbuf for (__FreeBSD__ < 3) routines. */
196433965Sjdpint
196533965Sjdpsoopt_getm(struct sockopt *sopt, struct mbuf **mp)
196633965Sjdp{
196733965Sjdp	struct mbuf *m, *m_prev;
196833965Sjdp	int sopt_size = sopt->sopt_valsize;
196933965Sjdp
197033965Sjdp	MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
197133965Sjdp	if (m == NULL)
197233965Sjdp		return ENOBUFS;
197333965Sjdp	if (sopt_size > MLEN) {
197433965Sjdp		MCLGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT);
197533965Sjdp		if ((m->m_flags & M_EXT) == 0) {
197633965Sjdp			m_free(m);
197733965Sjdp			return ENOBUFS;
197833965Sjdp		}
197933965Sjdp		m->m_len = min(MCLBYTES, sopt_size);
198033965Sjdp	} else {
198133965Sjdp		m->m_len = min(MLEN, sopt_size);
198233965Sjdp	}
198333965Sjdp	sopt_size -= m->m_len;
198433965Sjdp	*mp = m;
198533965Sjdp	m_prev = m;
198633965Sjdp
198733965Sjdp	while (sopt_size) {
198833965Sjdp		MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
198933965Sjdp		if (m == NULL) {
199033965Sjdp			m_freem(*mp);
199133965Sjdp			return ENOBUFS;
199233965Sjdp		}
199333965Sjdp		if (sopt_size > MLEN) {
199433965Sjdp			MCLGET(m, sopt->sopt_td != NULL ? M_TRYWAIT :
199533965Sjdp			    M_DONTWAIT);
199633965Sjdp			if ((m->m_flags & M_EXT) == 0) {
199733965Sjdp				m_freem(m);
199833965Sjdp				m_freem(*mp);
199933965Sjdp				return ENOBUFS;
200033965Sjdp			}
200133965Sjdp			m->m_len = min(MCLBYTES, sopt_size);
200233965Sjdp		} else {
200333965Sjdp			m->m_len = min(MLEN, sopt_size);
200433965Sjdp		}
200533965Sjdp		sopt_size -= m->m_len;
200633965Sjdp		m_prev->m_next = m;
200733965Sjdp		m_prev = m;
200833965Sjdp	}
200933965Sjdp	return 0;
201033965Sjdp}
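
/*
 * Sizing sketch: for a 3000-byte option value with a typical MCLBYTES of
 * 2048, the code above builds a two-mbuf chain: the first mbuf gets a
 * cluster holding 2048 bytes and the second a cluster holding the
 * remaining 952 (952 still exceeds MLEN, so a cluster is used again).
 */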
201133965Sjdp
201233965Sjdp/* XXX: copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
201333965Sjdpint
201433965Sjdpsoopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
201533965Sjdp{
201633965Sjdp	struct mbuf *m0 = m;
201733965Sjdp
201833965Sjdp	if (sopt->sopt_val == NULL)
201933965Sjdp		return 0;
202033965Sjdp	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
202133965Sjdp		if (sopt->sopt_td != NULL) {
202233965Sjdp			int error;
202333965Sjdp
202433965Sjdp			error = copyin(sopt->sopt_val, mtod(m, char *),
202533965Sjdp				       m->m_len);
202633965Sjdp			if (error != 0) {
202733965Sjdp				m_freem(m0);
202833965Sjdp				return(error);
202933965Sjdp			}
203033965Sjdp		} else
203133965Sjdp			bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
203233965Sjdp		sopt->sopt_valsize -= m->m_len;
203333965Sjdp		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
203433965Sjdp		m = m->m_next;
203533965Sjdp	}
203633965Sjdp	if (m != NULL) /* enough mbufs should have been allocated by ip6_sooptmcopyin() */
203733965Sjdp		panic("ip6_sooptmcopyin");
203833965Sjdp	return 0;
203933965Sjdp}
204033965Sjdp
204133965Sjdp/* XXX: copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
204233965Sjdpint
204333965Sjdpsoopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
204433965Sjdp{
204533965Sjdp	struct mbuf *m0 = m;
204633965Sjdp	size_t valsize = 0;
204733965Sjdp
204833965Sjdp	if (sopt->sopt_val == NULL)
204933965Sjdp		return 0;
205033965Sjdp	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
205133965Sjdp		if (sopt->sopt_td != NULL) {
205233965Sjdp			int error;
205333965Sjdp
205433965Sjdp			error = copyout(mtod(m, char *), sopt->sopt_val,
205533965Sjdp				       m->m_len);
205633965Sjdp			if (error != 0) {
205733965Sjdp				m_freem(m0);
205833965Sjdp				return(error);
205933965Sjdp			}
206033965Sjdp		} else
206133965Sjdp			bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
206233965Sjdp		sopt->sopt_valsize -= m->m_len;
206333965Sjdp		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
206433965Sjdp		valsize += m->m_len;
206533965Sjdp		m = m->m_next;
206633965Sjdp	}
206733965Sjdp	if (m != NULL) {
206833965Sjdp		/* enough soopt buffer should be given from user-land */
206933965Sjdp		m_freem(m0);
207033965Sjdp		return(EINVAL);
207133965Sjdp	}
207233965Sjdp	sopt->sopt_valsize = valsize;
207333965Sjdp	return 0;
207433965Sjdp}
207533965Sjdp
207633965Sjdpvoid
207733965Sjdpsohasoutofband(so)
207833965Sjdp	struct socket *so;
207933965Sjdp{
208033965Sjdp	if (so->so_sigio != NULL)
208133965Sjdp		pgsigio(&so->so_sigio, SIGURG, 0);
208233965Sjdp	selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
208333965Sjdp}
208433965Sjdp
208533965Sjdpint
208633965Sjdpsopoll(struct socket *so, int events, struct ucred *active_cred,
208733965Sjdp    struct thread *td)
208833965Sjdp{
208933965Sjdp	int revents = 0;
209033965Sjdp
209133965Sjdp	if (events & (POLLIN | POLLRDNORM))
209233965Sjdp		if (soreadable(so))
209333965Sjdp			revents |= events & (POLLIN | POLLRDNORM);
209433965Sjdp
209533965Sjdp	if (events & POLLINIGNEOF)
209633965Sjdp		if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat ||
209733965Sjdp		    !TAILQ_EMPTY(&so->so_comp) || so->so_error)
209833965Sjdp			revents |= POLLINIGNEOF;
209933965Sjdp
210033965Sjdp	if (events & (POLLOUT | POLLWRNORM))
210133965Sjdp		if (sowriteable(so))
210233965Sjdp			revents |= events & (POLLOUT | POLLWRNORM);
210333965Sjdp
210433965Sjdp	if (events & (POLLPRI | POLLRDBAND))
210533965Sjdp		if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK))
210633965Sjdp			revents |= events & (POLLPRI | POLLRDBAND);
210733965Sjdp
210833965Sjdp	if (revents == 0) {
210933965Sjdp		if (events &
211033965Sjdp		    (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM |
211133965Sjdp		     POLLRDBAND)) {
211233965Sjdp			SOCKBUF_LOCK(&so->so_rcv);
211333965Sjdp			selrecord(td, &so->so_rcv.sb_sel);
211433965Sjdp			so->so_rcv.sb_flags |= SB_SEL;
211533965Sjdp			SOCKBUF_UNLOCK(&so->so_rcv);
211633965Sjdp		}
211733965Sjdp
211833965Sjdp		if (events & (POLLOUT | POLLWRNORM)) {
211933965Sjdp			SOCKBUF_LOCK(&so->so_snd);
212033965Sjdp			selrecord(td, &so->so_snd.sb_sel);
212133965Sjdp			so->so_snd.sb_flags |= SB_SEL;
212233965Sjdp			SOCKBUF_UNLOCK(&so->so_snd);
212333965Sjdp		}
212433965Sjdp	}
212533965Sjdp
212633965Sjdp	return (revents);
212733965Sjdp}
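
/*
 * Behavior sketch: a poll(2) for POLLIN | POLLOUT on a socket that is
 * readable but not yet writable returns POLLIN immediately; only when none
 * of the requested conditions is true does the routine selrecord() the
 * polling thread on the relevant socket buffers so it can be woken later.
 */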
212833965Sjdp
212933965Sjdpint
213033965Sjdpsoo_kqfilter(struct file *fp, struct knote *kn)
213133965Sjdp{
213233965Sjdp	struct socket *so = kn->kn_fp->f_data;
213333965Sjdp	struct sockbuf *sb;
213433965Sjdp
213533965Sjdp	switch (kn->kn_filter) {
213633965Sjdp	case EVFILT_READ:
213733965Sjdp		if (so->so_options & SO_ACCEPTCONN)
213833965Sjdp			kn->kn_fop = &solisten_filtops;
213933965Sjdp		else
214033965Sjdp			kn->kn_fop = &soread_filtops;
214133965Sjdp		sb = &so->so_rcv;
214233965Sjdp		break;
214333965Sjdp	case EVFILT_WRITE:
214433965Sjdp		kn->kn_fop = &sowrite_filtops;
214533965Sjdp		sb = &so->so_snd;
214633965Sjdp		break;
214733965Sjdp	default:
214833965Sjdp		return (EINVAL);
214933965Sjdp	}
215033965Sjdp
215133965Sjdp	SOCKBUF_LOCK(sb);
215233965Sjdp	knlist_add(&sb->sb_sel.si_note, kn, 1);
215333965Sjdp	sb->sb_flags |= SB_KNOTE;
215433965Sjdp	SOCKBUF_UNLOCK(sb);
215533965Sjdp	return (0);
215633965Sjdp}
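
/*
 * Usage sketch (userland side): a kevent(2) consumer attaches one of the
 * filters chosen above, optionally with a low-water mark, e.g.:
 *
 *	struct kevent kev;
 *
 *	EV_SET(&kev, s, EVFILT_READ, EV_ADD, NOTE_LOWAT, 128, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 *
 * With NOTE_LOWAT set, filt_soread() below only reports the socket ready
 * once at least 128 bytes are queued (or an EOF/error condition is
 * pending).
 */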
215733965Sjdp
215833965Sjdpstatic void
215933965Sjdpfilt_sordetach(struct knote *kn)
216033965Sjdp{
216133965Sjdp	struct socket *so = kn->kn_fp->f_data;
216233965Sjdp
216333965Sjdp	SOCKBUF_LOCK(&so->so_rcv);
216433965Sjdp	knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1);
216533965Sjdp	if (knlist_empty(&so->so_rcv.sb_sel.si_note))
216633965Sjdp		so->so_rcv.sb_flags &= ~SB_KNOTE;
216733965Sjdp	SOCKBUF_UNLOCK(&so->so_rcv);
216833965Sjdp}
216933965Sjdp
217033965Sjdp/*ARGSUSED*/
217133965Sjdpstatic int
217233965Sjdpfilt_soread(struct knote *kn, long hint)
217333965Sjdp{
217433965Sjdp	struct socket *so;
217533965Sjdp
217633965Sjdp	so = kn->kn_fp->f_data;
217733965Sjdp	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
217833965Sjdp
217933965Sjdp	kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
218033965Sjdp	if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
218133965Sjdp		kn->kn_flags |= EV_EOF;
218233965Sjdp		kn->kn_fflags = so->so_error;
218333965Sjdp		return (1);
218433965Sjdp	} else if (so->so_error)	/* temporary udp error */
218533965Sjdp		return (1);
218633965Sjdp	else if (kn->kn_sfflags & NOTE_LOWAT)
218733965Sjdp		return (kn->kn_data >= kn->kn_sdata);
218833965Sjdp	else
218933965Sjdp		return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat);
219033965Sjdp}
219133965Sjdp
219233965Sjdpstatic void
219333965Sjdpfilt_sowdetach(struct knote *kn)
219433965Sjdp{
219533965Sjdp	struct socket *so = kn->kn_fp->f_data;
219633965Sjdp
219733965Sjdp	SOCKBUF_LOCK(&so->so_snd);
219833965Sjdp	knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1);
219933965Sjdp	if (knlist_empty(&so->so_snd.sb_sel.si_note))
220033965Sjdp		so->so_snd.sb_flags &= ~SB_KNOTE;
220133965Sjdp	SOCKBUF_UNLOCK(&so->so_snd);
220233965Sjdp}
220333965Sjdp
220433965Sjdp/*ARGSUSED*/
220533965Sjdpstatic int
220633965Sjdpfilt_sowrite(struct knote *kn, long hint)
220733965Sjdp{
220833965Sjdp	struct socket *so;
220933965Sjdp
221033965Sjdp	so = kn->kn_fp->f_data;
221133965Sjdp	SOCKBUF_LOCK_ASSERT(&so->so_snd);
221233965Sjdp	kn->kn_data = sbspace(&so->so_snd);
221333965Sjdp	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
221433965Sjdp		kn->kn_flags |= EV_EOF;
221533965Sjdp		kn->kn_fflags = so->so_error;
221633965Sjdp		return (1);
221733965Sjdp	} else if (so->so_error)	/* temporary udp error */
221833965Sjdp		return (1);
221933965Sjdp	else if (((so->so_state & SS_ISCONNECTED) == 0) &&
222033965Sjdp	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
222133965Sjdp		return (0);
222233965Sjdp	else if (kn->kn_sfflags & NOTE_LOWAT)
222333965Sjdp		return (kn->kn_data >= kn->kn_sdata);
222433965Sjdp	else
222533965Sjdp		return (kn->kn_data >= so->so_snd.sb_lowat);
222633965Sjdp}
222733965Sjdp
222833965Sjdp/*ARGSUSED*/
222933965Sjdpstatic int
223033965Sjdpfilt_solisten(struct knote *kn, long hint)
223133965Sjdp{
223233965Sjdp	struct socket *so = kn->kn_fp->f_data;
223333965Sjdp
223433965Sjdp	kn->kn_data = so->so_qlen;
223533965Sjdp	return (! TAILQ_EMPTY(&so->so_comp));
223633965Sjdp}
223733965Sjdp
223833965Sjdpint
223933965Sjdpsocheckuid(struct socket *so, uid_t uid)
224033965Sjdp{
224133965Sjdp
224233965Sjdp	if (so == NULL)
224333965Sjdp		return (EPERM);
224433965Sjdp	if (so->so_cred->cr_uid == uid)
224533965Sjdp		return (0);
224633965Sjdp	return (EPERM);
224733965Sjdp}
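
/*
 * Usage sketch: consumers that match on socket ownership (packet filter
 * style rules, for instance) can use this as a simple credential check,
 * where rule_uid is whatever uid the caller wants to match:
 *
 *	if (socheckuid(so, rule_uid) == 0)
 *		rule_matches = 1;
 *
 * A NULL socket or a non-matching cr_uid yields EPERM.
 */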
224833965Sjdp