/*
 * Copyright (c) 2004 The FreeBSD Foundation
 * Copyright (c) 2004 Robert Watson
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/uipc_socket.c 138647 2004-12-10 04:49:13Z alc $");

#include "opt_inet.h"
#include "opt_mac.h"
#include "opt_zero.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mac.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/domain.h>
#include <sys/file.h>			/* for struct knote */
#include <sys/kernel.h>
#include <sys/event.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <sys/jail.h>

#include <vm/uma.h>


static int	soreceive_rcvoob(struct socket *so, struct uio *uio,
		    int flags);

#ifdef INET
static int	 do_setopt_accept_filter(struct socket *so, struct sockopt *sopt);
#endif

static void	filt_sordetach(struct knote *kn);
static int	filt_soread(struct knote *kn, long hint);
static void	filt_sowdetach(struct knote *kn);
static int	filt_sowrite(struct knote *kn, long hint);
static int	filt_solisten(struct knote *kn, long hint);

static struct filterops solisten_filtops =
	{ 1, NULL, filt_sordetach, filt_solisten };
static struct filterops soread_filtops =
	{ 1, NULL, filt_sordetach, filt_soread };
static struct filterops sowrite_filtops =
	{ 1, NULL, filt_sowdetach, filt_sowrite };

uma_zone_t socket_zone;
so_gen_t	so_gencnt;	/* generation count for sockets */

MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

SYSCTL_DECL(_kern_ipc);

static int somaxconn = SOMAXCONN;
SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW,
    &somaxconn, 0, "Maximum pending socket connection queue size");
static int numopensockets;
SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
    &numopensockets, 0, "Number of open sockets");
#ifdef ZERO_COPY_SOCKETS
/* These aren't static because they're used in other files. */
int so_zero_copy_send = 1;
int so_zero_copy_receive = 1;
SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0,
    "Zero copy controls");
SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW,
    &so_zero_copy_receive, 0, "Enable zero copy receive");
SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW,
    &so_zero_copy_send, 0, "Enable zero copy send");
#endif /* ZERO_COPY_SOCKETS */

/*
 * accept_mtx locks down per-socket fields relating to accept queues.  See
 * socketvar.h for an annotation of the protected fields of struct socket.
 */
struct mtx accept_mtx;
MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);

/*
 * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
 * so_gencnt field.
 *
 * XXXRW: These variables might be better manipulated using atomic operations
 * for improved efficiency.
 */
static struct mtx so_global_mtx;
MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF);

/*
 * Socket operation routines.
 * These routines are called by the routines in
 * sys_socket.c or from a system process, and
 * implement the semantics of socket operations by
 * switching out to the protocol specific routines.
 */

/*
 * Get a socket structure from our zone, and initialize it.
 * Note that it would probably be better to allocate socket
 * and PCB at the same time, but I'm not convinced that all
 * the protocols can be easily modified to do this.
 *
 * soalloc() returns a socket with a ref count of 0.
 */
struct socket *
soalloc(int mflags)
{
	struct socket *so;
#ifdef MAC
	int error;
#endif

	so = uma_zalloc(socket_zone, mflags | M_ZERO);
	if (so != NULL) {
#ifdef MAC
		error = mac_init_socket(so, mflags);
		if (error != 0) {
			uma_zfree(socket_zone, so);
			so = NULL;
			return so;
		}
#endif
		SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
		SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
		/* sx_init(&so->so_sxlock, "socket sxlock"); */
		TAILQ_INIT(&so->so_aiojobq);
		mtx_lock(&so_global_mtx);
		so->so_gencnt = ++so_gencnt;
		++numopensockets;
		mtx_unlock(&so_global_mtx);
	}
	return so;
}

/*
 * socreate returns a socket with a ref count of 1.  The socket should be
 * closed with soclose().
 */
int
socreate(dom, aso, type, proto, cred, td)
	int dom;
	struct socket **aso;
	int type;
	int proto;
	struct ucred *cred;
	struct thread *td;
{
	struct protosw *prp;
	struct socket *so;
	int error;

	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);

	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL ||
	    prp->pr_usrreqs->pru_attach == pru_attach_notsupp)
		return (EPROTONOSUPPORT);

	if (jailed(cred) && jail_socket_unixiproute_only &&
	    prp->pr_domain->dom_family != PF_LOCAL &&
	    prp->pr_domain->dom_family != PF_INET &&
	    prp->pr_domain->dom_family != PF_ROUTE) {
		return (EPROTONOSUPPORT);
	}

	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = soalloc(M_WAITOK);
	if (so == NULL)
		return (ENOBUFS);

	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = type;
	so->so_cred = crhold(cred);
	so->so_proto = prp;
#ifdef MAC
	mac_create_socket(cred, so);
#endif
	SOCK_LOCK(so);
	knlist_init(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv));
	knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd));
	soref(so);
	SOCK_UNLOCK(so);
	error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
	if (error) {
		ACCEPT_LOCK();
		SOCK_LOCK(so);
		so->so_state |= SS_NOFDREF;
		sorele(so);
		return (error);
	}
	*aso = so;
	return (0);
}
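
/*
 * Illustrative sketch (not part of the original file): a kernel consumer
 * might use the socreate()/soclose() pair roughly as follows, with the
 * credential and thread taken from the calling context.  Error handling
 * is abbreviated:
 *
 *	struct socket *so;
 *	int error;
 *
 *	error = socreate(PF_INET, &so, SOCK_STREAM, IPPROTO_TCP,
 *	    td->td_ucred, td);
 *	if (error)
 *		return (error);
 *	...
 *	error = soclose(so);
 *
 * soclose() releases the single reference that socreate() returned.
 */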

int
sobind(so, nam, td)
	struct socket *so;
	struct sockaddr *nam;
	struct thread *td;
{

	return ((*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td));
}

void
sodealloc(struct socket *so)
{

	KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
	mtx_lock(&so_global_mtx);
	so->so_gencnt = ++so_gencnt;
	mtx_unlock(&so_global_mtx);
	if (so->so_rcv.sb_hiwat)
		(void)chgsbsize(so->so_cred->cr_uidinfo,
		    &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
	if (so->so_snd.sb_hiwat)
		(void)chgsbsize(so->so_cred->cr_uidinfo,
		    &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
#ifdef INET
	/* remove accept filter if one is present. */
	if (so->so_accf != NULL)
		do_setopt_accept_filter(so, NULL);
#endif
#ifdef MAC
	mac_destroy_socket(so);
#endif
	crfree(so->so_cred);
	SOCKBUF_LOCK_DESTROY(&so->so_snd);
	SOCKBUF_LOCK_DESTROY(&so->so_rcv);
	/* sx_destroy(&so->so_sxlock); */
	uma_zfree(socket_zone, so);
	/*
	 * XXXRW: Seems like a shame to grab the mutex again down here, but
	 * we don't want to decrement the socket count until after we free
	 * the socket, and we can't increment the gencnt on the socket after
	 * we free it, so...
	 */
	mtx_lock(&so_global_mtx);
	--numopensockets;
	mtx_unlock(&so_global_mtx);
}

int
solisten(so, backlog, td)
	struct socket *so;
	int backlog;
	struct thread *td;
{
	int error;

	/*
	 * XXXRW: Ordering issue here -- perhaps we need to set
	 * SO_ACCEPTCONN before the call to pru_listen()?
	 * XXXRW: General atomic test-and-set concerns here also.
	 */
	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
			    SS_ISDISCONNECTING))
		return (EINVAL);
	error = (*so->so_proto->pr_usrreqs->pru_listen)(so, td);
	if (error)
		return (error);
	ACCEPT_LOCK();
	if (TAILQ_EMPTY(&so->so_comp)) {
		SOCK_LOCK(so);
		so->so_options |= SO_ACCEPTCONN;
		SOCK_UNLOCK(so);
	}
	if (backlog < 0 || backlog > somaxconn)
		backlog = somaxconn;
	so->so_qlimit = backlog;
	ACCEPT_UNLOCK();
	return (0);
}
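
/*
 * For example (hypothetical values): with somaxconn at its default of
 * SOMAXCONN (128), both solisten(so, 1024, td) and solisten(so, -1, td)
 * leave so_qlimit at 128, since out-of-range backlogs are clamped above.
 */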

/*
 * Attempt to free a socket.  This should really be sotryfree().
 *
 * We free the socket if the protocol is no longer interested in the socket,
 * there's no file descriptor reference, and the refcount is 0.  While the
 * calling macro sotryfree() tests the refcount, sofree() has to test it
 * again as it's possible to race with an accept()ing thread if the socket is
 * in a listen queue of a listen socket, as being in the listen queue
 * doesn't elevate the reference count.  sofree() acquires the accept mutex
 * early for this test in order to avoid that race.
 */
void
sofree(so)
	struct socket *so;
{
	struct socket *head;

	ACCEPT_LOCK_ASSERT();
	SOCK_LOCK_ASSERT(so);

	if (so->so_pcb != NULL || (so->so_state & SS_NOFDREF) == 0 ||
	    so->so_count != 0) {
		SOCK_UNLOCK(so);
		ACCEPT_UNLOCK();
		return;
	}

	head = so->so_head;
	if (head != NULL) {
		KASSERT((so->so_qstate & SQ_COMP) != 0 ||
		    (so->so_qstate & SQ_INCOMP) != 0,
		    ("sofree: so_head != NULL, but neither SQ_COMP nor "
		    "SQ_INCOMP"));
		KASSERT((so->so_qstate & SQ_COMP) == 0 ||
		    (so->so_qstate & SQ_INCOMP) == 0,
		    ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
		/*
		 * accept(2) is responsible for draining the completed
		 * connection queue and freeing those sockets, so
		 * we just return here if this socket is currently
		 * on the completed connection queue.  Otherwise,
		 * accept(2) may hang after select(2) has indicated
		 * that a listening socket was ready.  If it's an
		 * incomplete connection, we remove it from the queue
		 * and free it; otherwise, it won't be released until
		 * the listening socket is closed.
		 */
		if ((so->so_qstate & SQ_COMP) != 0) {
			SOCK_UNLOCK(so);
			ACCEPT_UNLOCK();
			return;
		}
		TAILQ_REMOVE(&head->so_incomp, so, so_list);
		head->so_incqlen--;
		so->so_qstate &= ~SQ_INCOMP;
		so->so_head = NULL;
	}
	KASSERT((so->so_qstate & SQ_COMP) == 0 &&
	    (so->so_qstate & SQ_INCOMP) == 0,
	    ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
	    so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
	SOCK_UNLOCK(so);
	ACCEPT_UNLOCK();
	SOCKBUF_LOCK(&so->so_snd);
	so->so_snd.sb_flags |= SB_NOINTR;
	(void)sblock(&so->so_snd, M_WAITOK);
	/*
	 * socantsendmore_locked() drops the socket buffer mutex so that it
	 * can safely perform wakeups.  Re-acquire the mutex before
	 * continuing.
	 */
	socantsendmore_locked(so);
	SOCKBUF_LOCK(&so->so_snd);
	sbunlock(&so->so_snd);
	sbrelease_locked(&so->so_snd, so);
	SOCKBUF_UNLOCK(&so->so_snd);
	sorflush(so);
	knlist_destroy(&so->so_rcv.sb_sel.si_note);
	knlist_destroy(&so->so_snd.sb_sel.si_note);
	sodealloc(so);
}
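
/*
 * Illustrative sketch (assumed caller pattern, not part of the original
 * file): sofree() is reached through the sotryfree() macro with both the
 * accept mutex and the socket mutex held, and both locks are dropped on
 * every path out of sofree(), as in soabort() below:
 *
 *	ACCEPT_LOCK();
 *	SOCK_LOCK(so);
 *	sotryfree(so);		<-- frees "so", or unlocks and returns
 */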

/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 *
 * This function will sorele() the socket.  Note that soclose() may be
 * called prior to the ref count reaching zero.  The actual socket
 * structure will not be freed until the ref count reaches zero.
 */
int
soclose(so)
	struct socket *so;
{
	int error = 0;

	KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));

	funsetown(&so->so_sigio);
	if (so->so_options & SO_ACCEPTCONN) {
		struct socket *sp;
		ACCEPT_LOCK();
		while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
			TAILQ_REMOVE(&so->so_incomp, sp, so_list);
			so->so_incqlen--;
			sp->so_qstate &= ~SQ_INCOMP;
			sp->so_head = NULL;
			ACCEPT_UNLOCK();
			(void) soabort(sp);
			ACCEPT_LOCK();
		}
		while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
			TAILQ_REMOVE(&so->so_comp, sp, so_list);
			so->so_qlen--;
			sp->so_qstate &= ~SQ_COMP;
			sp->so_head = NULL;
			ACCEPT_UNLOCK();
			(void) soabort(sp);
			ACCEPT_LOCK();
		}
		ACCEPT_UNLOCK();
	}
	if (so->so_pcb == NULL)
		goto discard;
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO))
				goto drop;
			while (so->so_state & SS_ISCONNECTED) {
				error = tsleep(&so->so_timeo,
				    PSOCK | PCATCH, "soclos", so->so_linger * hz);
				if (error)
					break;
			}
		}
	}
drop:
	if (so->so_pcb != NULL) {
		int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
		if (error == 0)
			error = error2;
	}
discard:
	ACCEPT_LOCK();
	SOCK_LOCK(so);
	KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
	so->so_state |= SS_NOFDREF;
	sorele(so);
	return (error);
}

/*
 * soabort() must not be called with any socket locks held, as it calls
 * into the protocol, which will call back into the socket code causing
 * it to acquire additional socket locks that may cause recursion or lock
 * order reversals.
 */
int
soabort(so)
	struct socket *so;
{
	int error;

	error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
	if (error) {
		ACCEPT_LOCK();
		SOCK_LOCK(so);
		sotryfree(so);	/* note: does not decrement the ref count */
		return error;
	}
	return (0);
}

int
soaccept(so, nam)
	struct socket *so;
	struct sockaddr **nam;
{
	int error;

	SOCK_LOCK(so);
	KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
	so->so_state &= ~SS_NOFDREF;
	SOCK_UNLOCK(so);
	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
	return (error);
}

int
soconnect(so, nam, td)
	struct socket *so;
	struct sockaddr *nam;
	struct thread *td;
{
	int error;

	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnect(so))))
		error = EISCONN;
	else
		error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td);
	return (error);
}

int
soconnect2(so1, so2)
	struct socket *so1;
	struct socket *so2;
{

	return ((*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2));
}

int
sodisconnect(so)
	struct socket *so;
{
	int error;

	if ((so->so_state & SS_ISCONNECTED) == 0)
		return (ENOTCONN);
	if (so->so_state & SS_ISDISCONNECTING)
		return (EALREADY);
	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
	return (error);
}

#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
/*
 * Send on a socket.
 * If send must go all at once and message is larger than
 * send buffering, then hard error.
 * Lock against other senders.
 * If must go all at once and not enough room now, then
 * inform user that this would block and do nothing.
 * Otherwise, if nonblocking, send as much as possible.
 * The data to be sent is described by "uio" if nonzero,
 * otherwise by the mbuf chain "top" (which must be null
 * if uio is not).  Data provided in mbuf chain must be small
 * enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers
 * must check for short counts if EINTR/ERESTART are returned.
 * Data and control buffers are freed on return.
 */

#ifdef ZERO_COPY_SOCKETS
struct so_zerocopy_stats{
	int size_ok;
	int align_ok;
	int found_ifp;
};
struct so_zerocopy_stats so_zerocp_stats = {0,0,0};
#include <netinet/in.h>
#include <net/route.h>
#include <netinet/in_pcb.h>
#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>
#endif /*ZERO_COPY_SOCKETS*/

int
sosend(so, addr, uio, top, control, flags, td)
	struct socket *so;
	struct sockaddr *addr;
	struct uio *uio;
	struct mbuf *top;
	struct mbuf *control;
	int flags;
	struct thread *td;
{
	struct mbuf **mp;
	struct mbuf *m;
	long space, len = 0, resid;
	int clen = 0, error, dontroute;
	int atomic = sosendallatonce(so) || top;
#ifdef ZERO_COPY_SOCKETS
	int cow_send;
#endif /* ZERO_COPY_SOCKETS */

	if (uio != NULL)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned.
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid.  On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
	 *
	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
	 * type sockets since that's an error.
	 */
	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
		error = EINVAL;
		goto out;
	}

	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	if (td != NULL)
		td->td_proc->p_stats->p_ru.ru_msgsnd++;
	if (control != NULL)
		clen = control->m_len;
#define	snderr(errno)	{ error = (errno); goto release; }

	SOCKBUF_LOCK(&so->so_snd);
restart:
	SOCKBUF_LOCK_ASSERT(&so->so_snd);
	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out_locked;
	do {
		SOCKBUF_LOCK_ASSERT(&so->so_snd);
		if (so->so_snd.sb_state & SBS_CANTSENDMORE)
			snderr(EPIPE);
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			/*
			 * `sendto' and `sendmsg' are allowed on a connection-
			 * based socket if it supports implied connect.
			 * Return ENOTCONN if not connected and no address is
			 * supplied.
			 */
			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
				    !(resid == 0 && clen != 0))
					snderr(ENOTCONN);
			} else if (addr == NULL)
			    snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
				   ENOTCONN : EDESTADDRREQ);
		}
		space = sbspace(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if ((atomic && resid > so->so_snd.sb_hiwat) ||
		    clen > so->so_snd.sb_hiwat)
			snderr(EMSGSIZE);
		if (space < resid + clen &&
		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
			if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO))
				snderr(EWOULDBLOCK);
			sbunlock(&so->so_snd);
			error = sbwait(&so->so_snd);
			if (error)
				goto out_locked;
			goto restart;
		}
		SOCKBUF_UNLOCK(&so->so_snd);
		mp = &top;
		space -= clen;
		do {
		    if (uio == NULL) {
			/*
			 * Data is prepackaged in "top".
			 */
			resid = 0;
			if (flags & MSG_EOR)
				top->m_flags |= M_EOR;
		    } else do {
#ifdef ZERO_COPY_SOCKETS
			cow_send = 0;
#endif /* ZERO_COPY_SOCKETS */
			if (resid >= MINCLSIZE) {
#ifdef ZERO_COPY_SOCKETS
				if (top == NULL) {
					MGETHDR(m, M_TRYWAIT, MT_DATA);
					if (m == NULL) {
						error = ENOBUFS;
						SOCKBUF_LOCK(&so->so_snd);
						goto release;
					}
					m->m_pkthdr.len = 0;
					m->m_pkthdr.rcvif = (struct ifnet *)0;
				} else {
					MGET(m, M_TRYWAIT, MT_DATA);
					if (m == NULL) {
						error = ENOBUFS;
						SOCKBUF_LOCK(&so->so_snd);
						goto release;
					}
				}
				if (so_zero_copy_send &&
				    resid>=PAGE_SIZE &&
				    space>=PAGE_SIZE &&
				    uio->uio_iov->iov_len>=PAGE_SIZE) {
					so_zerocp_stats.size_ok++;
					if (!((vm_offset_t)
					  uio->uio_iov->iov_base & PAGE_MASK)){
						so_zerocp_stats.align_ok++;
						cow_send = socow_setup(m, uio);
					}
				}
				if (!cow_send) {
					MCLGET(m, M_TRYWAIT);
					if ((m->m_flags & M_EXT) == 0) {
						m_free(m);
						m = NULL;
					} else {
						len = min(min(MCLBYTES, resid), space);
					}
				} else
					len = PAGE_SIZE;
#else /* ZERO_COPY_SOCKETS */
				if (top == NULL) {
					m = m_getcl(M_TRYWAIT, MT_DATA, M_PKTHDR);
					m->m_pkthdr.len = 0;
					m->m_pkthdr.rcvif = (struct ifnet *)0;
				} else
					m = m_getcl(M_TRYWAIT, MT_DATA, 0);
				len = min(min(MCLBYTES, resid), space);
#endif /* ZERO_COPY_SOCKETS */
			} else {
				if (top == NULL) {
					m = m_gethdr(M_TRYWAIT, MT_DATA);
					m->m_pkthdr.len = 0;
					m->m_pkthdr.rcvif = (struct ifnet *)0;

					len = min(min(MHLEN, resid), space);
					/*
					 * For datagram protocols, leave room
					 * for protocol headers in first mbuf.
					 */
					if (atomic && m && len < MHLEN)
						MH_ALIGN(m, len);
				} else {
					m = m_get(M_TRYWAIT, MT_DATA);
					len = min(min(MLEN, resid), space);
				}
			}
			if (m == NULL) {
				error = ENOBUFS;
				SOCKBUF_LOCK(&so->so_snd);
				goto release;
			}

			space -= len;
#ifdef ZERO_COPY_SOCKETS
			if (cow_send)
				error = 0;
			else
#endif /* ZERO_COPY_SOCKETS */
			error = uiomove(mtod(m, void *), (int)len, uio);
			resid = uio->uio_resid;
			m->m_len = len;
			*mp = m;
			top->m_pkthdr.len += len;
			if (error) {
				SOCKBUF_LOCK(&so->so_snd);
				goto release;
			}
			mp = &m->m_next;
			if (resid <= 0) {
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
				break;
			}
		    } while (space > 0 && atomic);
		    if (dontroute) {
			    SOCK_LOCK(so);
			    so->so_options |= SO_DONTROUTE;
			    SOCK_UNLOCK(so);
		    }
		    /*
		     * XXX all the SBS_CANTSENDMORE checks previously
		     * done could be out of date.  We could have received
		     * a reset packet in an interrupt or maybe we slept
		     * while doing page faults in uiomove() etc.  We could
		     * probably recheck again inside the locking protection
		     * here, but there are probably other places that this
		     * also happens.  We must rethink this.
		     */
		    error = (*so->so_proto->pr_usrreqs->pru_send)(so,
			(flags & MSG_OOB) ? PRUS_OOB :
			/*
			 * If the user set MSG_EOF, the protocol
			 * understands this flag, and there is nothing left
			 * to send, then use PRU_SEND_EOF instead of PRU_SEND.
			 */
			((flags & MSG_EOF) &&
			 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
			 (resid <= 0)) ?
				PRUS_EOF :
			/* If there is more to send set PRUS_MORETOCOME */
			(resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
			top, addr, control, td);
		    if (dontroute) {
			    SOCK_LOCK(so);
			    so->so_options &= ~SO_DONTROUTE;
			    SOCK_UNLOCK(so);
		    }
		    clen = 0;
		    control = NULL;
		    top = NULL;
		    mp = &top;
		    if (error) {
			SOCKBUF_LOCK(&so->so_snd);
			goto release;
		    }
		} while (resid && space > 0);
		SOCKBUF_LOCK(&so->so_snd);
	} while (resid);

release:
	SOCKBUF_LOCK_ASSERT(&so->so_snd);
	sbunlock(&so->so_snd);
out_locked:
	SOCKBUF_LOCK_ASSERT(&so->so_snd);
	SOCKBUF_UNLOCK(&so->so_snd);
out:
	if (top != NULL)
		m_freem(top);
	if (control != NULL)
		m_freem(control);
	return (error);
}
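
/*
 * Illustrative sketch (not part of the original file): sending a
 * prepackaged mbuf chain on a connected socket from kernel code.  Per the
 * comment above sosend(), "top" must carry a packet header, uio must be
 * NULL, and the chain must be small enough to send all at once; both the
 * data and control mbufs are freed by sosend() on all return paths:
 *
 *	error = sosend(so, NULL, NULL, top, NULL, 0, curthread);
 */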

/*
 * The part of soreceive() that implements reading non-inline out-of-band
 * data from a socket.  For more complete comments, see soreceive(), from
 * which this code originated.
 *
 * XXXRW: Note that soreceive_rcvoob(), unlike the remainder of soreceive(),
 * is unable to return an mbuf chain to the caller.
 */
static int
soreceive_rcvoob(so, uio, flags)
	struct socket *so;
	struct uio *uio;
	int flags;
{
	struct protosw *pr = so->so_proto;
	struct mbuf *m;
	int error;

	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));

	m = m_get(M_TRYWAIT, MT_DATA);
	if (m == NULL)
		return (ENOBUFS);
	error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
	if (error)
		goto bad;
	do {
#ifdef ZERO_COPY_SOCKETS
		if (so_zero_copy_receive) {
			int disposable;

			if ((m->m_flags & M_EXT)
			 && (m->m_ext.ext_type == EXT_DISPOSABLE))
				disposable = 1;
			else
				disposable = 0;

			error = uiomoveco(mtod(m, void *),
					  min(uio->uio_resid, m->m_len),
					  uio, disposable);
		} else
#endif /* ZERO_COPY_SOCKETS */
		error = uiomove(mtod(m, void *),
		    (int) min(uio->uio_resid, m->m_len), uio);
		m = m_free(m);
	} while (uio->uio_resid && error == 0 && m);
bad:
	if (m != NULL)
		m_freem(m);
	return (error);
}

/*
 * Following replacement or removal of the first mbuf on the first mbuf chain
 * of a socket buffer, push necessary state changes back into the socket
 * buffer so that other consumers see the values consistently.  'nextrecord'
 * is the caller's locally stored value of the original value of
 * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
 * NOTE: 'nextrecord' may be NULL.
 */
static __inline void
sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
{

	SOCKBUF_LOCK_ASSERT(sb);
	/*
	 * First, update for the new value of nextrecord.  If necessary, make
	 * it the first record.
	 */
	if (sb->sb_mb != NULL)
		sb->sb_mb->m_nextpkt = nextrecord;
	else
		sb->sb_mb = nextrecord;

	/*
	 * Now update any dependent socket buffer fields to reflect the new
	 * state.  This is an expanded inline of SB_EMPTY_FIXUP(), with the
	 * addition of a second clause that takes care of the case where
	 * sb_mb has been updated, but remains the last record.
	 */
	if (sb->sb_mb == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (sb->sb_mb->m_nextpkt == NULL)
		sb->sb_lastrecord = sb->sb_mb;
}


/*
 * Implement receive operations on a socket.
 * We depend on the way that records are added to the sockbuf
 * by sbappend*.  In particular, each record (mbufs linked through m_next)
 * must begin with an address if the protocol so specifies,
 * followed by an optional mbuf or mbufs containing ancillary data,
 * and then zero or more mbufs of data.
 * In order to avoid blocking network interrupts for the entire time here,
 * we splx() while doing the actual copy to user space.
 * Although the sockbuf is locked, new data may still be appended,
 * and thus we must maintain consistency of the sockbuf during that time.
 *
 * The caller may receive the data as a single mbuf chain by supplying
 * an mbuf **mp0 for use in returning the chain.  The uio is then used
 * only for the count in uio_resid.
 */
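
/*
 * Illustrative layout of a single record (an addition to the comment
 * above; the MT_SONAME mbuf appears only for protocols with PR_ADDR, and
 * the MT_CONTROL mbufs are optional):
 *
 *	sb_mb -> [MT_SONAME] -> [MT_CONTROL] -> [MT_DATA] -> [MT_DATA]
 *	             |              (linked through m_next)
 *	             +-- m_nextpkt --> first mbuf of the next record
 */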
int
soreceive(so, psa, uio, mp0, controlp, flagsp)
	struct socket *so;
	struct sockaddr **psa;
	struct uio *uio;
	struct mbuf **mp0;
	struct mbuf **controlp;
	int *flagsp;
{
	struct mbuf *m, **mp;
	int flags, len, error, offset;
	struct protosw *pr = so->so_proto;
	struct mbuf *nextrecord;
	int moff, type = 0;
	int orig_resid = uio->uio_resid;

	mp = mp0;
	if (psa != NULL)
		*psa = NULL;
	if (controlp != NULL)
		*controlp = NULL;
	if (flagsp != NULL)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB)
		return (soreceive_rcvoob(so, uio, flags));
	if (mp != NULL)
		*mp = NULL;
	if (so->so_state & SS_ISCONFIRMING && uio->uio_resid)
		(*pr->pr_usrreqs->pru_rcvd)(so, 0);

	SOCKBUF_LOCK(&so->so_rcv);
restart:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
	if (error)
		goto out;

	m = so->so_rcv.sb_mb;
	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark, or
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat).
	 *   3. MSG_DONTWAIT is not set
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
	 */
	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < uio->uio_resid) &&
	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
		KASSERT(m != NULL || !so->so_rcv.sb_cc,
		    ("receive: m == %p so->so_rcv.sb_cc == %u",
		    m, so->so_rcv.sb_cc));
		if (so->so_error) {
			if (m != NULL)
				goto dontblock;
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
		if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
			if (m)
				goto dontblock;
			else
				goto release;
		}
		for (; m != NULL; m = m->m_next)
			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio->uio_resid == 0)
			goto release;
		if ((so->so_state & SS_NBIO) ||
		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
			error = EWOULDBLOCK;
			goto release;
		}
		SBLASTRECORDCHK(&so->so_rcv);
		SBLASTMBUFCHK(&so->so_rcv);
		sbunlock(&so->so_rcv);
		error = sbwait(&so->so_rcv);
		if (error)
			goto out;
		goto restart;
	}
dontblock:
	/*
	 * From this point onward, we maintain 'nextrecord' as a cache of the
	 * pointer to the next record in the socket buffer.  We must keep the
	 * various socket buffer pointers and local stack versions of the
	 * pointers in sync, pushing out modifications before dropping the
	 * socket buffer mutex, and re-reading them when picking it up.
	 *
	 * Otherwise, we will race with the network stack appending new data
	 * or records onto the socket buffer by using inconsistent/stale
	 * versions of the field, possibly resulting in socket buffer
	 * corruption.
	 *
	 * By holding the high-level sblock(), we prevent simultaneous
	 * readers from pulling off the front of the socket buffer.
	 */
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	if (uio->uio_td)
		uio->uio_td->td_proc->p_stats->p_ru.ru_msgrcv++;
	KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
	SBLASTRECORDCHK(&so->so_rcv);
	SBLASTMBUFCHK(&so->so_rcv);
	nextrecord = m->m_nextpkt;
	if (pr->pr_flags & PR_ADDR) {
		KASSERT(m->m_type == MT_SONAME,
		    ("m->m_type == %d", m->m_type));
		orig_resid = 0;
		if (psa != NULL)
			*psa = sodupsockaddr(mtod(m, struct sockaddr *),
			    M_NOWAIT);
		if (flags & MSG_PEEK) {
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			so->so_rcv.sb_mb = m_free(m);
			m = so->so_rcv.sb_mb;
			sockbuf_pushsync(&so->so_rcv, nextrecord);
		}
	}

	/*
	 * Process one or more MT_CONTROL mbufs present before any data mbufs
	 * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
	 * just copy the data; if !MSG_PEEK, we call into the protocol to
	 * perform externalization (or freeing if controlp == NULL).
	 */
	if (m != NULL && m->m_type == MT_CONTROL) {
		struct mbuf *cm = NULL, *cmn;
		struct mbuf **cme = &cm;

		do {
			if (flags & MSG_PEEK) {
				if (controlp != NULL) {
					*controlp = m_copy(m, 0, m->m_len);
					controlp = &(*controlp)->m_next;
				}
				m = m->m_next;
			} else {
				sbfree(&so->so_rcv, m);
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = NULL;
				*cme = m;
				cme = &(*cme)->m_next;
				m = so->so_rcv.sb_mb;
			}
		} while (m != NULL && m->m_type == MT_CONTROL);
		if ((flags & MSG_PEEK) == 0)
			sockbuf_pushsync(&so->so_rcv, nextrecord);
		while (cm != NULL) {
			cmn = cm->m_next;
			cm->m_next = NULL;
			if (pr->pr_domain->dom_externalize != NULL) {
				SOCKBUF_UNLOCK(&so->so_rcv);
				error = (*pr->pr_domain->dom_externalize)
				    (cm, controlp);
				SOCKBUF_LOCK(&so->so_rcv);
			} else if (controlp != NULL)
				*controlp = cm;
			else
				m_freem(cm);
			if (controlp != NULL) {
				orig_resid = 0;
				while (*controlp != NULL)
					controlp = &(*controlp)->m_next;
			}
			cm = cmn;
		}
		nextrecord = so->so_rcv.sb_mb->m_nextpkt;
		orig_resid = 0;
	}
	if (m != NULL) {
		if ((flags & MSG_PEEK) == 0) {
			KASSERT(m->m_nextpkt == nextrecord,
			    ("soreceive: post-control, nextrecord !sync"));
			if (nextrecord == NULL) {
				KASSERT(so->so_rcv.sb_mb == m,
				    ("soreceive: post-control, sb_mb!=m"));
				KASSERT(so->so_rcv.sb_lastrecord == m,
				    ("soreceive: post-control, lastrecord!=m"));
			}
		}
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
	} else {
		if ((flags & MSG_PEEK) == 0) {
			KASSERT(so->so_rcv.sb_mb == nextrecord,
			    ("soreceive: sb_mb != nextrecord"));
			if (so->so_rcv.sb_mb == NULL) {
				KASSERT(so->so_rcv.sb_lastrecord == NULL,
				    ("soreceive: sb_lastrecord != NULL"));
			}
		}
	}
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	SBLASTRECORDCHK(&so->so_rcv);
	SBLASTMBUFCHK(&so->so_rcv);

	/*
	 * Now continue to read any data mbufs off of the head of the socket
	 * buffer until the read request is satisfied.  Note that 'type' is
	 * used to store the type of any mbuf reads that have happened so far
	 * such that soreceive() can stop reading if the type changes, which
	 * causes soreceive() to return only one of regular data and inline
	 * out-of-band data in a single socket receive operation.
	 */
	moff = 0;
	offset = 0;
	while (m != NULL && uio->uio_resid > 0 && error == 0) {
		/*
		 * If the type of mbuf has changed since the last mbuf
		 * examined ('type'), end the receive operation.
		 */
		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
				break;
		} else if (type == MT_OOBDATA)
			break;
		else
		    KASSERT(m->m_type == MT_DATA || m->m_type == MT_HEADER,
			("m->m_type == %d", m->m_type));
		so->so_rcv.sb_state &= ~SBS_RCVATMARK;
		len = uio->uio_resid;
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * Sockbuf must be consistent here (points to current mbuf,
		 * it points to next record) when we drop priority;
		 * we must note any additions to the sockbuf when we
		 * block interrupts again.
		 */
		if (mp == NULL) {
			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
			SBLASTRECORDCHK(&so->so_rcv);
			SBLASTMBUFCHK(&so->so_rcv);
			SOCKBUF_UNLOCK(&so->so_rcv);
#ifdef ZERO_COPY_SOCKETS
			if (so_zero_copy_receive) {
				int disposable;

				if ((m->m_flags & M_EXT)
				 && (m->m_ext.ext_type == EXT_DISPOSABLE))
					disposable = 1;
				else
					disposable = 0;

				error = uiomoveco(mtod(m, char *) + moff,
						  (int)len, uio,
						  disposable);
			} else
#endif /* ZERO_COPY_SOCKETS */
			error = uiomove(mtod(m, char *) + moff, (int)len, uio);
			SOCKBUF_LOCK(&so->so_rcv);
			if (error)
				goto release;
		} else
			uio->uio_resid -= len;
		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
		if (len == m->m_len - moff) {
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(&so->so_rcv, m);
				if (mp != NULL) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = NULL;
				} else {
					so->so_rcv.sb_mb = m_free(m);
					m = so->so_rcv.sb_mb;
				}
				if (m != NULL) {
					m->m_nextpkt = nextrecord;
					if (nextrecord == NULL)
						so->so_rcv.sb_lastrecord = m;
				} else {
					so->so_rcv.sb_mb = nextrecord;
					SB_EMPTY_FIXUP(&so->so_rcv);
				}
				SBLASTRECORDCHK(&so->so_rcv);
				SBLASTMBUFCHK(&so->so_rcv);
			}
		} else {
			if (flags & MSG_PEEK)
				moff += len;
			else {
				if (mp != NULL) {
					int copy_flag;

					if (flags & MSG_DONTWAIT)
						copy_flag = M_DONTWAIT;
					else
						copy_flag = M_TRYWAIT;
					if (copy_flag == M_TRYWAIT)
						SOCKBUF_UNLOCK(&so->so_rcv);
					*mp = m_copym(m, 0, len, copy_flag);
					if (copy_flag == M_TRYWAIT)
						SOCKBUF_LOCK(&so->so_rcv);
					if (*mp == NULL) {
						/*
						 * m_copym() couldn't allocate an mbuf.
						 * Adjust uio_resid back (it was adjusted
						 * down by len bytes, which we didn't end
						 * up "copying" over).
						 */
						uio->uio_resid += len;
						break;
					}
				}
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
			}
		}
		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_rcv.sb_state |= SBS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket),
		 * we must not quit until "uio->uio_resid == 0" or an error
		 * termination.  If a signal/timeout occurs, return
		 * with a short count but without error.
		 * Keep sockbuf locked against other readers.
		 */
		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
		    !sosendallatonce(so) && nextrecord == NULL) {
			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
			if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE)
				break;
			/*
			 * Notify the protocol that some data has been
			 * drained before blocking.
			 */
			if (pr->pr_flags & PR_WANTRCVD && so->so_pcb != NULL) {
				SOCKBUF_UNLOCK(&so->so_rcv);
				(*pr->pr_usrreqs->pru_rcvd)(so, flags);
				SOCKBUF_LOCK(&so->so_rcv);
			}
			SBLASTRECORDCHK(&so->so_rcv);
			SBLASTMBUFCHK(&so->so_rcv);
			error = sbwait(&so->so_rcv);
			if (error)
				goto release;
			m = so->so_rcv.sb_mb;
			if (m != NULL)
				nextrecord = m->m_nextpkt;
		}
	}

	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	if (m != NULL && pr->pr_flags & PR_ATOMIC) {
		flags |= MSG_TRUNC;
		if ((flags & MSG_PEEK) == 0)
			(void) sbdroprecord_locked(&so->so_rcv);
	}
	if ((flags & MSG_PEEK) == 0) {
		if (m == NULL) {
			/*
			 * First part is an inline SB_EMPTY_FIXUP().  Second
			 * part makes sure sb_lastrecord is up-to-date if
			 * there is still data in the socket buffer.
			 */
			so->so_rcv.sb_mb = nextrecord;
			if (so->so_rcv.sb_mb == NULL) {
				so->so_rcv.sb_mbtail = NULL;
				so->so_rcv.sb_lastrecord = NULL;
			} else if (nextrecord->m_nextpkt == NULL)
				so->so_rcv.sb_lastrecord = nextrecord;
		}
		SBLASTRECORDCHK(&so->so_rcv);
		SBLASTMBUFCHK(&so->so_rcv);
		/*
		 * If soreceive() is being done from the socket callback, then
		 * we don't need to generate an ACK to the peer to update the
		 * window, since the ACK will be generated on return to TCP.
		 */
		if (!(flags & MSG_SOCALLBCK) &&
		    (pr->pr_flags & PR_WANTRCVD) && so->so_pcb) {
			SOCKBUF_UNLOCK(&so->so_rcv);
			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
			SOCKBUF_LOCK(&so->so_rcv);
		}
	}
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	if (orig_resid == uio->uio_resid && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
		sbunlock(&so->so_rcv);
		goto restart;
	}

	if (flagsp != NULL)
		*flagsp |= flags;
release:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	sbunlock(&so->so_rcv);
out:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	SOCKBUF_UNLOCK(&so->so_rcv);
	return (error);
}

int
soshutdown(so, how)
	struct socket *so;
	int how;
{
	struct protosw *pr = so->so_proto;

	if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
		return (EINVAL);

	if (how != SHUT_WR)
		sorflush(so);
	if (how != SHUT_RD)
		return ((*pr->pr_usrreqs->pru_shutdown)(so));
	return (0);
}

void
sorflush(so)
	struct socket *so;
{
	struct sockbuf *sb = &so->so_rcv;
	struct protosw *pr = so->so_proto;
	struct sockbuf asb;

	/*
	 * XXXRW: This is quite ugly.  The existing code made a copy of the
	 * socket buffer, then zero'd the original to clear the buffer
	 * fields.  However, with mutexes in the socket buffer, this causes
	 * problems.  We only clear the zeroable bits of the original;
	 * however, we have to initialize and destroy the mutex in the copy
	 * so that dom_dispose() and sbrelease() can lock it as needed.
	 */
	SOCKBUF_LOCK(sb);
	sb->sb_flags |= SB_NOINTR;
	(void) sblock(sb, M_WAITOK);
	/*
	 * socantrcvmore_locked() drops the socket buffer mutex so that it
	 * can safely perform wakeups.  Re-acquire the mutex before
	 * continuing.
	 */
	socantrcvmore_locked(so);
	SOCKBUF_LOCK(sb);
	sbunlock(sb);
	/*
	 * Invalidate/clear most of the sockbuf structure, but leave
	 * selinfo and mutex data unchanged.
	 */
	bzero(&asb, offsetof(struct sockbuf, sb_startzero));
	bcopy(&sb->sb_startzero, &asb.sb_startzero,
	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
	bzero(&sb->sb_startzero,
	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
	SOCKBUF_UNLOCK(sb);

	SOCKBUF_LOCK_INIT(&asb, "so_rcv");
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
	sbrelease(&asb, so);
	SOCKBUF_LOCK_DESTROY(&asb);
}

#ifdef INET
static int
do_setopt_accept_filter(so, sopt)
	struct	socket *so;
	struct	sockopt *sopt;
{
	struct accept_filter_arg	*afap;
	struct accept_filter	*afp;
	struct so_accf	*newaf;
	int	error = 0;

	newaf = NULL;
	afap = NULL;

	/*
	 * XXXRW: Configuring accept filters should be an atomic test-and-set
	 * operation to prevent races during setup and attach.  There may be
	 * more general issues of racing and ordering here that are not yet
	 * addressed by locking.
	 */
	/* do not set/remove accept filters on non-listen sockets */
	SOCK_LOCK(so);
	if ((so->so_options & SO_ACCEPTCONN) == 0) {
		SOCK_UNLOCK(so);
		return (EINVAL);
	}

	/* removing the filter */
	if (sopt == NULL) {
		if (so->so_accf != NULL) {
			struct so_accf *af = so->so_accf;
			if (af->so_accept_filter != NULL &&
				af->so_accept_filter->accf_destroy != NULL) {
				af->so_accept_filter->accf_destroy(so);
			}
			if (af->so_accept_filter_str != NULL) {
				FREE(af->so_accept_filter_str, M_ACCF);
			}
			FREE(af, M_ACCF);
			so->so_accf = NULL;
		}
		so->so_options &= ~SO_ACCEPTFILTER;
		SOCK_UNLOCK(so);
		return (0);
	}
	SOCK_UNLOCK(so);

	/*-
	 * Adding a filter.
	 *
	 * Do memory allocation, copyin, and filter lookup now while we're
	 * not holding any locks.  Avoids sleeping with a mutex, as well as
	 * introducing a lock order between accept filter locks and socket
	 * locks here.
	 */
	MALLOC(afap, struct accept_filter_arg *, sizeof(*afap), M_TEMP,
	    M_WAITOK);
	/* don't put large objects on the kernel stack */
	error = sooptcopyin(sopt, afap, sizeof *afap, sizeof *afap);
	afap->af_name[sizeof(afap->af_name)-1] = '\0';
	afap->af_arg[sizeof(afap->af_arg)-1] = '\0';
	if (error) {
		FREE(afap, M_TEMP);
		return (error);
	}
	afp = accept_filt_get(afap->af_name);
	if (afp == NULL) {
		FREE(afap, M_TEMP);
		return (ENOENT);
	}

	/*
	 * Allocate the new accept filter instance storage.  We may have to
	 * free it again later if we fail to attach it.  If attached
	 * properly, 'newaf' is NULLed to avoid a free() while in use.
	 */
	MALLOC(newaf, struct so_accf *, sizeof(*newaf), M_ACCF, M_WAITOK |
	    M_ZERO);
	if (afp->accf_create != NULL && afap->af_name[0] != '\0') {
		int len = strlen(afap->af_name) + 1;
		MALLOC(newaf->so_accept_filter_str, char *, len, M_ACCF,
		    M_WAITOK);
		strcpy(newaf->so_accept_filter_str, afap->af_name);
	}

	SOCK_LOCK(so);
	/* must remove previous filter first */
	if (so->so_accf != NULL) {
		error = EINVAL;
		goto out;
	}
	/*
	 * Invoke the accf_create() method of the filter if required.
	 * XXXRW: the socket mutex is held over this call, so the create
	 * method cannot block.  This may be something we have to change, but
	 * it would require addressing possible races.
	 */
	if (afp->accf_create != NULL) {
		newaf->so_accept_filter_arg =
		    afp->accf_create(so, afap->af_arg);
		if (newaf->so_accept_filter_arg == NULL) {
			error = EINVAL;
			goto out;
		}
	}
	newaf->so_accept_filter = afp;
	so->so_accf = newaf;
	so->so_options |= SO_ACCEPTFILTER;
	newaf = NULL;
out:
	SOCK_UNLOCK(so);
	if (newaf != NULL) {
		if (newaf->so_accept_filter_str != NULL)
			FREE(newaf->so_accept_filter_str, M_ACCF);
		FREE(newaf, M_ACCF);
	}
	if (afap != NULL)
		FREE(afap, M_TEMP);
	return (error);
}
#endif /* INET */
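
/*
 * Illustrative sketch (not part of the original file): installing the
 * "dataready" filter (from the accf_data module, assumed to be loaded)
 * on a listening socket through the option path above:
 *
 *	struct accept_filter_arg afa;
 *	int error;
 *
 *	bzero(&afa, sizeof(afa));
 *	strcpy(afa.af_name, "dataready");
 *	error = so_setsockopt(so, SOL_SOCKET, SO_ACCEPTFILTER,
 *	    &afa, sizeof(afa));
 */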

/*
 * Perhaps this routine, and sooptcopyout(), below, ought to come in
 * an additional variant to handle the case where the option value needs
 * to be some kind of integer, but not a specific size.
 * In addition to their use here, these functions are also called by the
 * protocol-level pr_ctloutput() routines.
 */
int
sooptcopyin(sopt, buf, len, minlen)
	struct	sockopt *sopt;
	void	*buf;
	size_t	len;
	size_t	minlen;
{
	size_t	valsize;

	/*
	 * If the user gives us more than we wanted, we ignore it,
	 * but if we don't get the minimum length the caller
	 * wants, we return EINVAL.  On success, sopt->sopt_valsize
	 * is set to however much we actually retrieved.
	 */
	if ((valsize = sopt->sopt_valsize) < minlen)
		return EINVAL;
	if (valsize > len)
		sopt->sopt_valsize = valsize = len;

	if (sopt->sopt_td != NULL)
		return (copyin(sopt->sopt_val, buf, valsize));

	bcopy(sopt->sopt_val, buf, valsize);
	return 0;
}

/*
 * Kernel version of setsockopt(2).
 * XXX: optlen is size_t, not socklen_t
 */
int
so_setsockopt(struct socket *so, int level, int optname, void *optval,
    size_t optlen)
{
	struct sockopt sopt;

	sopt.sopt_level = level;
	sopt.sopt_name = optname;
	sopt.sopt_dir = SOPT_SET;
	sopt.sopt_val = optval;
	sopt.sopt_valsize = optlen;
	sopt.sopt_td = NULL;
	return (sosetopt(so, &sopt));
}
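
/*
 * Illustrative sketch (not part of the original file): a kernel caller
 * enabling SO_REUSEADDR, equivalent to userland setsockopt(2) except
 * that sopt_td is NULL, so sooptcopyin() uses bcopy() rather than
 * copyin():
 *
 *	int on = 1;
 *
 *	error = so_setsockopt(so, SOL_SOCKET, SO_REUSEADDR,
 *	    &on, sizeof(on));
 */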
1644
1645int
1646sosetopt(so, sopt)
1647	struct socket *so;
1648	struct sockopt *sopt;
1649{
1650	int	error, optval;
1651	struct	linger l;
1652	struct	timeval tv;
1653	u_long  val;
1654#ifdef MAC
1655	struct mac extmac;
1656#endif
1657
1658	error = 0;
1659	if (sopt->sopt_level != SOL_SOCKET) {
1660		if (so->so_proto && so->so_proto->pr_ctloutput)
1661			return ((*so->so_proto->pr_ctloutput)
1662				  (so, sopt));
1663		error = ENOPROTOOPT;
1664	} else {
1665		switch (sopt->sopt_name) {
1666#ifdef INET
1667		case SO_ACCEPTFILTER:
1668			error = do_setopt_accept_filter(so, sopt);
1669			if (error)
1670				goto bad;
1671			break;
1672#endif
1673		case SO_LINGER:
1674			error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
1675			if (error)
1676				goto bad;
1677
1678			SOCK_LOCK(so);
1679			so->so_linger = l.l_linger;
1680			if (l.l_onoff)
1681				so->so_options |= SO_LINGER;
1682			else
1683				so->so_options &= ~SO_LINGER;
1684			SOCK_UNLOCK(so);
1685			break;
1686
1687		case SO_DEBUG:
1688		case SO_KEEPALIVE:
1689		case SO_DONTROUTE:
1690		case SO_USELOOPBACK:
1691		case SO_BROADCAST:
1692		case SO_REUSEADDR:
1693		case SO_REUSEPORT:
1694		case SO_OOBINLINE:
1695		case SO_TIMESTAMP:
1696		case SO_BINTIME:
1697		case SO_NOSIGPIPE:
1698			error = sooptcopyin(sopt, &optval, sizeof optval,
1699					    sizeof optval);
1700			if (error)
1701				goto bad;
1702			SOCK_LOCK(so);
1703			if (optval)
1704				so->so_options |= sopt->sopt_name;
1705			else
1706				so->so_options &= ~sopt->sopt_name;
1707			SOCK_UNLOCK(so);
1708			break;
1709
1710		case SO_SNDBUF:
1711		case SO_RCVBUF:
1712		case SO_SNDLOWAT:
1713		case SO_RCVLOWAT:
1714			error = sooptcopyin(sopt, &optval, sizeof optval,
1715					    sizeof optval);
1716			if (error)
1717				goto bad;
1718
1719			/*
1720			 * Values < 1 make no sense for any of these
1721			 * options, so disallow them.
1722			 */
1723			if (optval < 1) {
1724				error = EINVAL;
1725				goto bad;
1726			}
1727
1728			switch (sopt->sopt_name) {
1729			case SO_SNDBUF:
1730			case SO_RCVBUF:
1731				if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
1732				    &so->so_snd : &so->so_rcv, (u_long)optval,
1733				    so, curthread) == 0) {
1734					error = ENOBUFS;
1735					goto bad;
1736				}
1737				break;
1738
1739			/*
1740			 * Make sure the low-water is never greater than
1741			 * the high-water.
1742			 */
1743			case SO_SNDLOWAT:
1744				SOCKBUF_LOCK(&so->so_snd);
1745				so->so_snd.sb_lowat =
1746				    (optval > so->so_snd.sb_hiwat) ?
1747				    so->so_snd.sb_hiwat : optval;
1748				SOCKBUF_UNLOCK(&so->so_snd);
1749				break;
1750			case SO_RCVLOWAT:
1751				SOCKBUF_LOCK(&so->so_rcv);
1752				so->so_rcv.sb_lowat =
1753				    (optval > so->so_rcv.sb_hiwat) ?
1754				    so->so_rcv.sb_hiwat : optval;
1755				SOCKBUF_UNLOCK(&so->so_rcv);
1756				break;
1757			}
1758			break;
1759
1760		case SO_SNDTIMEO:
1761		case SO_RCVTIMEO:
1762			error = sooptcopyin(sopt, &tv, sizeof tv,
1763					    sizeof tv);
1764			if (error)
1765				goto bad;
1766
1767			/* assert(hz > 0); */
1768			if (tv.tv_sec < 0 || tv.tv_sec > INT_MAX / hz ||
1769			    tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
1770				error = EDOM;
1771				goto bad;
1772			}
1773			/* assert(tick > 0); */
1774			/* assert(ULONG_MAX - INT_MAX >= 1000000); */
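			/*
			 * Illustrative arithmetic: with hz = 100, tick is
			 * 10000 microseconds, so a timeout of tv = { 1, 500000 }
			 * converts to 1 * 100 + 500000 / 10000 = 150 ticks.
			 */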
1775			val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick;
1776			if (val > INT_MAX) {
1777				error = EDOM;
1778				goto bad;
1779			}
1780			if (val == 0 && tv.tv_usec != 0)
1781				val = 1;
1782
1783			switch (sopt->sopt_name) {
1784			case SO_SNDTIMEO:
1785				so->so_snd.sb_timeo = val;
1786				break;
1787			case SO_RCVTIMEO:
1788				so->so_rcv.sb_timeo = val;
1789				break;
1790			}
1791			break;
1792		case SO_LABEL:
1793#ifdef MAC
1794			error = sooptcopyin(sopt, &extmac, sizeof extmac,
1795			    sizeof extmac);
1796			if (error)
1797				goto bad;
1798			error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
1799			    so, &extmac);
1800#else
1801			error = EOPNOTSUPP;
1802#endif
1803			break;
1804		default:
1805			error = ENOPROTOOPT;
1806			break;
1807		}
1808		if (error == 0 && so->so_proto != NULL &&
1809		    so->so_proto->pr_ctloutput != NULL) {
1810			(void) ((*so->so_proto->pr_ctloutput)
1811				  (so, sopt));
1812		}
1813	}
1814bad:
1815	return (error);
1816}
1817
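/*
 * User-land counterpart (illustrative only): the SO_LINGER case above
 * services a call such as
 *
 *	struct linger l;
 *
 *	l.l_onoff = 1;
 *	l.l_linger = 5;
 *	(void)setsockopt(s, SOL_SOCKET, SO_LINGER, &l, sizeof(l));
 *
 * which makes close(2) linger for up to 5 seconds while unsent data
 * drains.
 */
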
1818/* Helper routine for getsockopt(2). */
1819int
1820sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
1821{
1822	int	error;
1823	size_t	valsize;
1824
1825	error = 0;
1826
1827	/*
1828	 * Documented get behavior is that we always return a value,
1829	 * possibly truncated to fit in the user's buffer.
1830	 * Traditional behavior is that we always tell the user
1831	 * precisely how much we copied, rather than something useful
1832	 * like the total amount we had available for her.
1833	 * Note that this interface is not idempotent; the entire answer must
1834	 * be generated ahead of time.
1835	 */
1836	valsize = min(len, sopt->sopt_valsize);
1837	sopt->sopt_valsize = valsize;
1838	if (sopt->sopt_val != NULL) {
1839		if (sopt->sopt_td != NULL)
1840			error = copyout(buf, sopt->sopt_val, valsize);
1841		else
1842			bcopy(buf, sopt->sopt_val, valsize);
1843	}
1844	return error;
1845}
1846
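/*
 * Truncation example (illustrative): if the caller supplies a 4-byte
 * integer but the user provided only a 2-byte buffer (sopt_valsize == 2),
 * sooptcopyout() copies the first two bytes and sets sopt_valsize to 2;
 * the short buffer is not reported as an error.
 */
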
1847int
1848sogetopt(struct socket *so, struct sockopt *sopt)
1851{
1852	int	error, optval;
1853	struct	linger l;
1854	struct	timeval tv;
1855#ifdef INET
1856	struct accept_filter_arg *afap;
1857#endif
1858#ifdef MAC
1859	struct mac extmac;
1860#endif
1861
1862	error = 0;
1863	if (sopt->sopt_level != SOL_SOCKET) {
1864		if (so->so_proto && so->so_proto->pr_ctloutput) {
1865			return ((*so->so_proto->pr_ctloutput)
1866				  (so, sopt));
1867		} else
1868			return (ENOPROTOOPT);
1869	} else {
1870		switch (sopt->sopt_name) {
1871#ifdef INET
1872		case SO_ACCEPTFILTER:
1873			/* Unlocked read. */
1874			if ((so->so_options & SO_ACCEPTCONN) == 0)
1875				return (EINVAL);
1876			MALLOC(afap, struct accept_filter_arg *, sizeof(*afap),
1877				M_TEMP, M_WAITOK | M_ZERO);
1878			SOCK_LOCK(so);
1879			if ((so->so_options & SO_ACCEPTFILTER) != 0) {
1880				strcpy(afap->af_name, so->so_accf->so_accept_filter->accf_name);
1881				if (so->so_accf->so_accept_filter_str != NULL)
1882					strcpy(afap->af_arg, so->so_accf->so_accept_filter_str);
1883			}
1884			SOCK_UNLOCK(so);
1885			error = sooptcopyout(sopt, afap, sizeof(*afap));
1886			FREE(afap, M_TEMP);
1887			break;
1888#endif
1889
1890		case SO_LINGER:
1891			/*
1892			 * XXXRW: We grab the lock here to get a consistent
1893			 * snapshot of both fields.  This may not really
1894			 * be necessary.
1895			 */
1896			SOCK_LOCK(so);
1897			l.l_onoff = so->so_options & SO_LINGER;
1898			l.l_linger = so->so_linger;
1899			SOCK_UNLOCK(so);
1900			error = sooptcopyout(sopt, &l, sizeof l);
1901			break;
1902
1903		case SO_USELOOPBACK:
1904		case SO_DONTROUTE:
1905		case SO_DEBUG:
1906		case SO_KEEPALIVE:
1907		case SO_REUSEADDR:
1908		case SO_REUSEPORT:
1909		case SO_BROADCAST:
1910		case SO_OOBINLINE:
1911		case SO_TIMESTAMP:
1912		case SO_BINTIME:
1913		case SO_NOSIGPIPE:
1914			optval = so->so_options & sopt->sopt_name;
1915integer:
1916			error = sooptcopyout(sopt, &optval, sizeof optval);
1917			break;
1918
1919		case SO_TYPE:
1920			optval = so->so_type;
1921			goto integer;
1922
1923		case SO_ERROR:
1924			optval = so->so_error;
1925			so->so_error = 0;
1926			goto integer;
1927
1928		case SO_SNDBUF:
1929			optval = so->so_snd.sb_hiwat;
1930			goto integer;
1931
1932		case SO_RCVBUF:
1933			optval = so->so_rcv.sb_hiwat;
1934			goto integer;
1935
1936		case SO_SNDLOWAT:
1937			optval = so->so_snd.sb_lowat;
1938			goto integer;
1939
1940		case SO_RCVLOWAT:
1941			optval = so->so_rcv.sb_lowat;
1942			goto integer;
1943
1944		case SO_SNDTIMEO:
1945		case SO_RCVTIMEO:
1946			optval = (sopt->sopt_name == SO_SNDTIMEO ?
1947				  so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
1948
1949			tv.tv_sec = optval / hz;
1950			tv.tv_usec = (optval % hz) * tick;
1951			error = sooptcopyout(sopt, &tv, sizeof tv);
1952			break;
1953		case SO_LABEL:
1954#ifdef MAC
1955			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
1956			    sizeof(extmac));
1957			if (error)
1958				return (error);
1959			error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
1960			    so, &extmac);
1961			if (error)
1962				return (error);
1963			error = sooptcopyout(sopt, &extmac, sizeof extmac);
1964#else
1965			error = EOPNOTSUPP;
1966#endif
1967			break;
1968		case SO_PEERLABEL:
1969#ifdef MAC
1970			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
1971			    sizeof(extmac));
1972			if (error)
1973				return (error);
1974			error = mac_getsockopt_peerlabel(
1975			    sopt->sopt_td->td_ucred, so, &extmac);
1976			if (error)
1977				return (error);
1978			error = sooptcopyout(sopt, &extmac, sizeof extmac);
1979#else
1980			error = EOPNOTSUPP;
1981#endif
1982			break;
1983		default:
1984			error = ENOPROTOOPT;
1985			break;
1986		}
1987		return (error);
1988	}
1989}
1990
1991/* XXX: prepare an mbuf chain for (__FreeBSD__ < 3) style routines. */
1992int
1993soopt_getm(struct sockopt *sopt, struct mbuf **mp)
1994{
1995	struct mbuf *m, *m_prev;
1996	int sopt_size = sopt->sopt_valsize;
1997
1998	MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
1999	if (m == NULL)
2000		return ENOBUFS;
2001	if (sopt_size > MLEN) {
2002		MCLGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT);
2003		if ((m->m_flags & M_EXT) == 0) {
2004			m_free(m);
2005			return ENOBUFS;
2006		}
2007		m->m_len = min(MCLBYTES, sopt_size);
2008	} else {
2009		m->m_len = min(MLEN, sopt_size);
2010	}
2011	sopt_size -= m->m_len;
2012	*mp = m;
2013	m_prev = m;
2014
2015	while (sopt_size) {
2016		MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
2017		if (m == NULL) {
2018			m_freem(*mp);
2019			return ENOBUFS;
2020		}
2021		if (sopt_size > MLEN) {
2022			MCLGET(m, sopt->sopt_td != NULL ? M_TRYWAIT :
2023			    M_DONTWAIT);
2024			if ((m->m_flags & M_EXT) == 0) {
2025				m_freem(m);
2026				m_freem(*mp);
2027				return ENOBUFS;
2028			}
2029			m->m_len = min(MCLBYTES, sopt_size);
2030		} else {
2031			m->m_len = min(MLEN, sopt_size);
2032		}
2033		sopt_size -= m->m_len;
2034		m_prev->m_next = m;
2035		m_prev = m;
2036	}
2037	return 0;
2038}
2039
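/*
 * Sizing example (illustrative, assuming MCLBYTES == 2048 and an MLEN
 * smaller than the remainders below): for sopt_valsize == 3000,
 * soopt_getm() allocates a first mbuf with a cluster holding
 * min(2048, 3000) == 2048 bytes, leaving 952, and a second cluster
 * mbuf holding the remaining 952 bytes, yielding a two-mbuf chain.
 */
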
2040/* XXX: copyin sopt data into an mbuf chain for (__FreeBSD__ < 3) style routines. */
2041int
2042soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
2043{
2044	struct mbuf *m0 = m;
2045
2046	if (sopt->sopt_val == NULL)
2047		return 0;
2048	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2049		if (sopt->sopt_td != NULL) {
2050			int error;
2051
2052			error = copyin(sopt->sopt_val, mtod(m, char *),
2053				       m->m_len);
2054			if (error != 0) {
2055				m_freem(m0);
2056				return (error);
2057			}
2058		} else
2059			bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
2060		sopt->sopt_valsize -= m->m_len;
2061		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2062		m = m->m_next;
2063	}
2064	if (m != NULL) /* should have been allocated large enough at ip6_sooptmcopyin() */
2065		panic("ip6_sooptmcopyin");
2066	return 0;
2067}
2068
2069/* XXX: copyout mbuf chain data into soopt for (__FreeBSD__ < 3) style routines. */
2070int
2071soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
2072{
2073	struct mbuf *m0 = m;
2074	size_t valsize = 0;
2075
2076	if (sopt->sopt_val == NULL)
2077		return 0;
2078	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2079		if (sopt->sopt_td != NULL) {
2080			int error;
2081
2082			error = copyout(mtod(m, char *), sopt->sopt_val,
2083				       m->m_len);
2084			if (error != 0) {
2085				m_freem(m0);
2086				return (error);
2087			}
2088		} else
2089			bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
2090		sopt->sopt_valsize -= m->m_len;
2091		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2092		valsize += m->m_len;
2093		m = m->m_next;
2094	}
2095	if (m != NULL) {
2096		/* a large enough soopt buffer should be provided by user-land */
2097		m_freem(m0);
2098		return (EINVAL);
2099	}
2100	sopt->sopt_valsize = valsize;
2101	return 0;
2102}
2103
2104void
2105sohasoutofband(struct socket *so)
2107{
2108	if (so->so_sigio != NULL)
2109		pgsigio(&so->so_sigio, SIGURG, 0);
2110	selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
2111}
2112
2113int
2114sopoll(struct socket *so, int events, struct ucred *active_cred,
2115    struct thread *td)
2116{
2117	int revents = 0;
2118
2119	SOCKBUF_LOCK(&so->so_snd);
2120	SOCKBUF_LOCK(&so->so_rcv);
2121	if (events & (POLLIN | POLLRDNORM))
2122		if (soreadable(so))
2123			revents |= events & (POLLIN | POLLRDNORM);
2124
2125	if (events & POLLINIGNEOF)
2126		if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat ||
2127		    !TAILQ_EMPTY(&so->so_comp) || so->so_error)
2128			revents |= POLLINIGNEOF;
2129
2130	if (events & (POLLOUT | POLLWRNORM))
2131		if (sowriteable(so))
2132			revents |= events & (POLLOUT | POLLWRNORM);
2133
2134	if (events & (POLLPRI | POLLRDBAND))
2135		if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK))
2136			revents |= events & (POLLPRI | POLLRDBAND);
2137
2138	if (revents == 0) {
2139		if (events &
2140		    (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM |
2141		     POLLRDBAND)) {
2142			selrecord(td, &so->so_rcv.sb_sel);
2143			so->so_rcv.sb_flags |= SB_SEL;
2144		}
2145
2146		if (events & (POLLOUT | POLLWRNORM)) {
2147			selrecord(td, &so->so_snd.sb_sel);
2148			so->so_snd.sb_flags |= SB_SEL;
2149		}
2150	}
2151
2152	SOCKBUF_UNLOCK(&so->so_rcv);
2153	SOCKBUF_UNLOCK(&so->so_snd);
2154	return (revents);
2155}
2156
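/*
 * Illustrative user-land view (not from the original source): sopoll()
 * backs poll(2) on a socket descriptor, e.g.
 *
 *	struct pollfd pfd;
 *
 *	pfd.fd = s;
 *	pfd.events = POLLIN | POLLPRI;
 *	if (poll(&pfd, 1, INFTIM) > 0 && (pfd.revents & POLLIN))
 *		... socket is readable ...
 */
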
2157int
2158soo_kqfilter(struct file *fp, struct knote *kn)
2159{
2160	struct socket *so = kn->kn_fp->f_data;
2161	struct sockbuf *sb;
2162
2163	switch (kn->kn_filter) {
2164	case EVFILT_READ:
2165		if (so->so_options & SO_ACCEPTCONN)
2166			kn->kn_fop = &solisten_filtops;
2167		else
2168			kn->kn_fop = &soread_filtops;
2169		sb = &so->so_rcv;
2170		break;
2171	case EVFILT_WRITE:
2172		kn->kn_fop = &sowrite_filtops;
2173		sb = &so->so_snd;
2174		break;
2175	default:
2176		return (EINVAL);
2177	}
2178
2179	SOCKBUF_LOCK(sb);
2180	knlist_add(&sb->sb_sel.si_note, kn, 1);
2181	sb->sb_flags |= SB_KNOTE;
2182	SOCKBUF_UNLOCK(sb);
2183	return (0);
2184}
2185
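/*
 * Illustrative user-land registration (not from the original source):
 * attaching a read filter with a low-water mark, which soo_kqfilter()
 * routes to soread_filtops (or solisten_filtops on a listening socket):
 *
 *	struct kevent kev;
 *
 *	EV_SET(&kev, s, EVFILT_READ, EV_ADD, NOTE_LOWAT, 128, NULL);
 *	(void)kevent(kq, &kev, 1, NULL, 0, NULL);
 */
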
2186static void
2187filt_sordetach(struct knote *kn)
2188{
2189	struct socket *so = kn->kn_fp->f_data;
2190
2191	SOCKBUF_LOCK(&so->so_rcv);
2192	knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1);
2193	if (knlist_empty(&so->so_rcv.sb_sel.si_note))
2194		so->so_rcv.sb_flags &= ~SB_KNOTE;
2195	SOCKBUF_UNLOCK(&so->so_rcv);
2196}
2197
2198/*ARGSUSED*/
2199static int
2200filt_soread(struct knote *kn, long hint)
2201{
2202	struct socket *so;
2203
2204	so = kn->kn_fp->f_data;
2205	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2206
2207	kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
2208	if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
2209		kn->kn_flags |= EV_EOF;
2210		kn->kn_fflags = so->so_error;
2211		return (1);
2212	} else if (so->so_error)	/* temporary udp error */
2213		return (1);
2214	else if (kn->kn_sfflags & NOTE_LOWAT)
2215		return (kn->kn_data >= kn->kn_sdata);
2216	else
2217		return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat);
2218}
2219
2220static void
2221filt_sowdetach(struct knote *kn)
2222{
2223	struct socket *so = kn->kn_fp->f_data;
2224
2225	SOCKBUF_LOCK(&so->so_snd);
2226	knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1);
2227	if (knlist_empty(&so->so_snd.sb_sel.si_note))
2228		so->so_snd.sb_flags &= ~SB_KNOTE;
2229	SOCKBUF_UNLOCK(&so->so_snd);
2230}
2231
2232/*ARGSUSED*/
2233static int
2234filt_sowrite(struct knote *kn, long hint)
2235{
2236	struct socket *so;
2237
2238	so = kn->kn_fp->f_data;
2239	SOCKBUF_LOCK_ASSERT(&so->so_snd);
2240	kn->kn_data = sbspace(&so->so_snd);
2241	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
2242		kn->kn_flags |= EV_EOF;
2243		kn->kn_fflags = so->so_error;
2244		return (1);
2245	} else if (so->so_error)	/* temporary udp error */
2246		return (1);
2247	else if (((so->so_state & SS_ISCONNECTED) == 0) &&
2248	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
2249		return (0);
2250	else if (kn->kn_sfflags & NOTE_LOWAT)
2251		return (kn->kn_data >= kn->kn_sdata);
2252	else
2253		return (kn->kn_data >= so->so_snd.sb_lowat);
2254}
2255
2256/*ARGSUSED*/
2257static int
2258filt_solisten(struct knote *kn, long hint)
2259{
2260	struct socket *so = kn->kn_fp->f_data;
2261
2262	kn->kn_data = so->so_qlen;
2263	return (!TAILQ_EMPTY(&so->so_comp));
2264}
2265
2266int
2267socheckuid(struct socket *so, uid_t uid)
2268{
2269
2270	if (so == NULL)
2271		return (EPERM);
2272	if (so->so_cred->cr_uid == uid)
2273		return (0);
2274	return (EPERM);
2275}
2276