1/*-
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993
3 *	The Regents of the University of California.
4 * Copyright (c) 2004 The FreeBSD Foundation
5 * Copyright (c) 2004-2008 Robert N. M. Watson
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 * 4. Neither the name of the University nor the names of its contributors
17 *    may be used to endorse or promote products derived from this software
18 *    without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 *
32 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
33 */
34
35/*
36 * Comments on the socket life cycle:
37 *
 * soalloc() sets up socket layer state for a socket, called only by
39 * socreate() and sonewconn().  Socket layer private.
40 *
41 * sodealloc() tears down socket layer state for a socket, called only by
 * sofree(), socreate() and sonewconn().  Socket layer private.
43 *
44 * pru_attach() associates protocol layer state with an allocated socket;
45 * called only once, may fail, aborting socket allocation.  This is called
46 * from socreate() and sonewconn().  Socket layer private.
47 *
48 * pru_detach() disassociates protocol layer state from an attached socket,
49 * and will be called exactly once for sockets in which pru_attach() has
50 * been successfully called.  If pru_attach() returned an error,
51 * pru_detach() will not be called.  Socket layer private.
52 *
53 * pru_abort() and pru_close() notify the protocol layer that the last
54 * consumer of a socket is starting to tear down the socket, and that the
55 * protocol should terminate the connection.  Historically, pru_abort() also
56 * detached protocol state from the socket state, but this is no longer the
57 * case.
58 *
59 * socreate() creates a socket and attaches protocol state.  This is a public
60 * interface that may be used by socket layer consumers to create new
61 * sockets.
62 *
63 * sonewconn() creates a socket and attaches protocol state.  This is a
 * public interface that may be used by protocols to create new sockets when
65 * a new connection is received and will be available for accept() on a
66 * listen socket.
67 *
68 * soclose() destroys a socket after possibly waiting for it to disconnect.
69 * This is a public interface that socket consumers should use to close and
70 * release a socket when done with it.
71 *
72 * soabort() destroys a socket without waiting for it to disconnect (used
73 * only for incoming connections that are already partially or fully
74 * connected).  This is used internally by the socket layer when clearing
75 * listen socket queues (due to overflow or close on the listen socket), but
76 * is also a public interface protocols may use to abort connections in
77 * their incomplete listen queues should they no longer be required.  Sockets
78 * placed in completed connection listen queues should not be aborted for
79 * reasons described in the comment above the soclose() implementation.  This
80 * is not a general purpose close routine, and except in the specific
81 * circumstances described here, should not be used.
82 *
 * sofree() will free a socket and its protocol state if all references on
 * the socket have been released, and is the interface through which the
 * socket layer attempts to free a socket when a reference is removed.  It
 * is a socket layer private interface.
87 *
88 * NOTE: In addition to socreate() and soclose(), which provide a single
89 * socket reference to the consumer to be managed as required, there are two
90 * calls to explicitly manage socket references, soref(), and sorele().
91 * Currently, these are generally required only when transitioning a socket
92 * from a listen queue to a file descriptor, in order to prevent garbage
93 * collection of the socket at an untimely moment.  For a number of reasons,
94 * these interfaces are not preferred, and should be avoided.
95 *
96 * NOTE: With regard to VNETs the general rule is that callers do not set
97 * curvnet. Exceptions to this rule include soabort(), sodisconnect(),
98 * sofree() (and with that sorele(), sotryfree()), as well as sonewconn()
99 * and sorflush(), which are usually called from a pre-set VNET context.
100 * sopoll() currently does not need a VNET context to be set.
101 */
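
/*
 * Illustrative sketch of the life cycle above as seen by an in-kernel
 * consumer (a hedged example, not compiled as part of this file; 'sin'
 * and 'td' are assumed to be a filled-in sockaddr_in and the current
 * thread):
 *
 *	struct socket *so;
 *	int error;
 *
 *	error = socreate(AF_INET, &so, SOCK_STREAM, IPPROTO_TCP,
 *	    td->td_ucred, td);
 *	if (error)
 *		return (error);
 *	error = soconnect(so, (struct sockaddr *)&sin, td);
 *	...
 *	soclose(so);
 *
 * socreate() hands back the single reference that soclose() later
 * releases; no explicit soref()/sorele() is needed on this path.
 */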
102
103#include <sys/cdefs.h>
104__FBSDID("$FreeBSD: head/sys/kern/uipc_socket.c 223863 2011-07-08 10:50:13Z andre $");
105
106#include "opt_inet.h"
107#include "opt_inet6.h"
108#include "opt_zero.h"
109#include "opt_compat.h"
110
111#include <sys/param.h>
112#include <sys/systm.h>
113#include <sys/fcntl.h>
114#include <sys/limits.h>
115#include <sys/lock.h>
116#include <sys/mac.h>
117#include <sys/malloc.h>
118#include <sys/mbuf.h>
119#include <sys/mutex.h>
120#include <sys/domain.h>
121#include <sys/file.h>			/* for struct knote */
122#include <sys/kernel.h>
123#include <sys/event.h>
124#include <sys/eventhandler.h>
125#include <sys/poll.h>
126#include <sys/proc.h>
127#include <sys/protosw.h>
128#include <sys/socket.h>
129#include <sys/socketvar.h>
130#include <sys/resourcevar.h>
131#include <net/route.h>
132#include <sys/signalvar.h>
133#include <sys/stat.h>
134#include <sys/sx.h>
135#include <sys/sysctl.h>
136#include <sys/uio.h>
137#include <sys/jail.h>
138
139#include <net/vnet.h>
140
141#include <security/mac/mac_framework.h>
142
143#include <vm/uma.h>
144
145#ifdef COMPAT_FREEBSD32
146#include <sys/mount.h>
147#include <sys/sysent.h>
148#include <compat/freebsd32/freebsd32.h>
149#endif
150
151static int	soreceive_rcvoob(struct socket *so, struct uio *uio,
152		    int flags);
153
154static void	filt_sordetach(struct knote *kn);
155static int	filt_soread(struct knote *kn, long hint);
156static void	filt_sowdetach(struct knote *kn);
157static int	filt_sowrite(struct knote *kn, long hint);
158static int	filt_solisten(struct knote *kn, long hint);
159
160static struct filterops solisten_filtops = {
161	.f_isfd = 1,
162	.f_detach = filt_sordetach,
163	.f_event = filt_solisten,
164};
165static struct filterops soread_filtops = {
166	.f_isfd = 1,
167	.f_detach = filt_sordetach,
168	.f_event = filt_soread,
169};
170static struct filterops sowrite_filtops = {
171	.f_isfd = 1,
172	.f_detach = filt_sowdetach,
173	.f_event = filt_sowrite,
174};
175
176uma_zone_t socket_zone;
177so_gen_t	so_gencnt;	/* generation count for sockets */
178
179int	maxsockets;
180
181MALLOC_DEFINE(M_SONAME, "soname", "socket name");
182MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
183
184#define	VNET_SO_ASSERT(so)						\
185	VNET_ASSERT(curvnet != NULL,					\
186	    ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so)));
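
/*
 * Illustrative (hedged) sketch of the VNET convention from the comment at
 * the top of this file: public entry points that call into the protocol
 * set the vnet context themselves, as sobind() below does:
 *
 *	CURVNET_SET(so->so_vnet);
 *	error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td);
 *	CURVNET_RESTORE();
 */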
187
188static int somaxconn = SOMAXCONN;
189static int sysctl_somaxconn(SYSCTL_HANDLER_ARGS);
/* XXX: we don't have SYSCTL_USHORT */
191SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLTYPE_UINT | CTLFLAG_RW,
192    0, sizeof(int), sysctl_somaxconn, "I", "Maximum pending socket connection "
193    "queue size");
194static int numopensockets;
195SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
196    &numopensockets, 0, "Number of open sockets");
197#ifdef ZERO_COPY_SOCKETS
198/* These aren't static because they're used in other files. */
199int so_zero_copy_send = 1;
200int so_zero_copy_receive = 1;
201SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0,
202    "Zero copy controls");
203SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW,
204    &so_zero_copy_receive, 0, "Enable zero copy receive");
205SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW,
206    &so_zero_copy_send, 0, "Enable zero copy send");
207#endif /* ZERO_COPY_SOCKETS */
208
209/*
210 * accept_mtx locks down per-socket fields relating to accept queues.  See
211 * socketvar.h for an annotation of the protected fields of struct socket.
212 */
213struct mtx accept_mtx;
214MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);
215
216/*
217 * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
218 * so_gencnt field.
219 */
220static struct mtx so_global_mtx;
221MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF);
222
223/*
224 * General IPC sysctl name space, used by sockets and a variety of other IPC
225 * types.
226 */
227SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");
228
229/*
230 * Sysctl to get and set the maximum global sockets limit.  Notify protocols
231 * of the change so that they can update their dependent limits as required.
232 */
233static int
234sysctl_maxsockets(SYSCTL_HANDLER_ARGS)
235{
236	int error, newmaxsockets;
237
238	newmaxsockets = maxsockets;
239	error = sysctl_handle_int(oidp, &newmaxsockets, 0, req);
240	if (error == 0 && req->newptr) {
241		if (newmaxsockets > maxsockets) {
242			maxsockets = newmaxsockets;
243			if (maxsockets > ((maxfiles / 4) * 3)) {
244				maxfiles = (maxsockets * 5) / 4;
245				maxfilesperproc = (maxfiles * 9) / 10;
246			}
247			EVENTHANDLER_INVOKE(maxsockets_change);
248		} else
249			error = EINVAL;
250	}
251	return (error);
252}
253
254SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT|CTLFLAG_RW,
255    &maxsockets, 0, sysctl_maxsockets, "IU",
    "Maximum number of sockets available");
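
/*
 * Example (illustrative): the limit may be read and raised, but never
 * lowered, from userland, e.g.:
 *
 *	sysctl kern.ipc.maxsockets
 *	sysctl kern.ipc.maxsockets=131072
 *
 * An attempt to decrease the value fails with EINVAL in the handler above.
 */
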
257
258/*
259 * Initialise maxsockets.  This SYSINIT must be run after
260 * tunable_mbinit().
261 */
262static void
263init_maxsockets(void *ignored)
264{
265
266	TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
267	maxsockets = imax(maxsockets, imax(maxfiles, nmbclusters));
268}
269SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);
270
271/*
272 * Socket operation routines.  These routines are called by the routines in
273 * sys_socket.c or from a system process, and implement the semantics of
274 * socket operations by switching out to the protocol specific routines.
275 */
276
277/*
278 * Get a socket structure from our zone, and initialize it.  Note that it
279 * would probably be better to allocate socket and PCB at the same time, but
280 * I'm not convinced that all the protocols can be easily modified to do
281 * this.
282 *
283 * soalloc() returns a socket with a ref count of 0.
284 */
285static struct socket *
286soalloc(struct vnet *vnet)
287{
288	struct socket *so;
289
290	so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO);
291	if (so == NULL)
292		return (NULL);
293#ifdef MAC
294	if (mac_socket_init(so, M_NOWAIT) != 0) {
295		uma_zfree(socket_zone, so);
296		return (NULL);
297	}
298#endif
299	SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
300	SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
301	sx_init(&so->so_snd.sb_sx, "so_snd_sx");
302	sx_init(&so->so_rcv.sb_sx, "so_rcv_sx");
303	TAILQ_INIT(&so->so_aiojobq);
304	mtx_lock(&so_global_mtx);
305	so->so_gencnt = ++so_gencnt;
306	++numopensockets;
307#ifdef VIMAGE
308	VNET_ASSERT(vnet != NULL, ("%s:%d vnet is NULL, so=%p",
309	    __func__, __LINE__, so));
310	vnet->vnet_sockcnt++;
311	so->so_vnet = vnet;
312#endif
313	mtx_unlock(&so_global_mtx);
314	return (so);
315}
316
317/*
318 * Free the storage associated with a socket at the socket layer, tear down
319 * locks, labels, etc.  All protocol state is assumed already to have been
320 * torn down (and possibly never set up) by the caller.
321 */
322static void
323sodealloc(struct socket *so)
324{
325
326	KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
327	KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL"));
328
329	mtx_lock(&so_global_mtx);
330	so->so_gencnt = ++so_gencnt;
331	--numopensockets;	/* Could be below, but faster here. */
332#ifdef VIMAGE
333	VNET_ASSERT(so->so_vnet != NULL, ("%s:%d so_vnet is NULL, so=%p",
334	    __func__, __LINE__, so));
335	so->so_vnet->vnet_sockcnt--;
336#endif
337	mtx_unlock(&so_global_mtx);
338	if (so->so_rcv.sb_hiwat)
339		(void)chgsbsize(so->so_cred->cr_uidinfo,
340		    &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
341	if (so->so_snd.sb_hiwat)
342		(void)chgsbsize(so->so_cred->cr_uidinfo,
343		    &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
344#ifdef INET
	/* remove accept filter if one is present. */
346	if (so->so_accf != NULL)
347		do_setopt_accept_filter(so, NULL);
348#endif
349#ifdef MAC
350	mac_socket_destroy(so);
351#endif
352	crfree(so->so_cred);
353	sx_destroy(&so->so_snd.sb_sx);
354	sx_destroy(&so->so_rcv.sb_sx);
355	SOCKBUF_LOCK_DESTROY(&so->so_snd);
356	SOCKBUF_LOCK_DESTROY(&so->so_rcv);
357	uma_zfree(socket_zone, so);
358}
359
360/*
361 * socreate returns a socket with a ref count of 1.  The socket should be
362 * closed with soclose().
363 */
364int
365socreate(int dom, struct socket **aso, int type, int proto,
366    struct ucred *cred, struct thread *td)
367{
368	struct protosw *prp;
369	struct socket *so;
370	int error;
371
372	if (proto)
373		prp = pffindproto(dom, proto, type);
374	else
375		prp = pffindtype(dom, type);
376
377	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL ||
378	    prp->pr_usrreqs->pru_attach == pru_attach_notsupp)
379		return (EPROTONOSUPPORT);
380
381	if (prison_check_af(cred, prp->pr_domain->dom_family) != 0)
382		return (EPROTONOSUPPORT);
383
384	if (prp->pr_type != type)
385		return (EPROTOTYPE);
386	so = soalloc(CRED_TO_VNET(cred));
387	if (so == NULL)
388		return (ENOBUFS);
389
390	TAILQ_INIT(&so->so_incomp);
391	TAILQ_INIT(&so->so_comp);
392	so->so_type = type;
393	so->so_cred = crhold(cred);
394	if ((prp->pr_domain->dom_family == PF_INET) ||
395	    (prp->pr_domain->dom_family == PF_ROUTE))
396		so->so_fibnum = td->td_proc->p_fibnum;
397	else
398		so->so_fibnum = 0;
399	so->so_proto = prp;
400#ifdef MAC
401	mac_socket_create(cred, so);
402#endif
403	knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv));
404	knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd));
405	so->so_count = 1;
406	/*
407	 * Auto-sizing of socket buffers is managed by the protocols and
408	 * the appropriate flags must be set in the pru_attach function.
409	 */
410	CURVNET_SET(so->so_vnet);
411	error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
412	CURVNET_RESTORE();
413	if (error) {
414		KASSERT(so->so_count == 1, ("socreate: so_count %d",
415		    so->so_count));
416		so->so_count = 0;
417		sodealloc(so);
418		return (error);
419	}
420	*aso = so;
421	return (0);
422}
423
424#ifdef REGRESSION
425static int regression_sonewconn_earlytest = 1;
426SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW,
427    &regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test");
428#endif
429
430/*
431 * When an attempt at a new connection is noted on a socket which accepts
432 * connections, sonewconn is called.  If the connection is possible (subject
 * to space constraints, etc.) then we allocate a new structure, properly
434 * linked into the data structure of the original socket, and return this.
435 * Connstatus may be 0, or SO_ISCONFIRMING, or SO_ISCONNECTED.
436 *
437 * Note: the ref count on the socket is 0 on return.
438 */
439struct socket *
440sonewconn(struct socket *head, int connstatus)
441{
442	struct socket *so;
443	int over;
444
445	ACCEPT_LOCK();
446	over = (head->so_qlen > 3 * head->so_qlimit / 2);
447	ACCEPT_UNLOCK();
448#ifdef REGRESSION
449	if (regression_sonewconn_earlytest && over)
450#else
451	if (over)
452#endif
453		return (NULL);
454	VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p",
455	    __func__, __LINE__, head));
456	so = soalloc(head->so_vnet);
457	if (so == NULL)
458		return (NULL);
459	if ((head->so_options & SO_ACCEPTFILTER) != 0)
460		connstatus = 0;
461	so->so_head = head;
462	so->so_type = head->so_type;
463	so->so_options = head->so_options &~ SO_ACCEPTCONN;
464	so->so_linger = head->so_linger;
465	so->so_state = head->so_state | SS_NOFDREF;
466	so->so_fibnum = head->so_fibnum;
467	so->so_proto = head->so_proto;
468	so->so_cred = crhold(head->so_cred);
469#ifdef MAC
470	mac_socket_newconn(head, so);
471#endif
472	knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv));
473	knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd));
474	VNET_SO_ASSERT(head);
475	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat) ||
476	    (*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
477		sodealloc(so);
478		return (NULL);
479	}
480	so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
481	so->so_snd.sb_lowat = head->so_snd.sb_lowat;
482	so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
483	so->so_snd.sb_timeo = head->so_snd.sb_timeo;
484	so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
485	so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
486	so->so_state |= connstatus;
487	ACCEPT_LOCK();
488	if (connstatus) {
489		TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
490		so->so_qstate |= SQ_COMP;
491		head->so_qlen++;
492	} else {
493		/*
494		 * Keep removing sockets from the head until there's room for
495		 * us to insert on the tail.  In pre-locking revisions, this
496		 * was a simple if(), but as we could be racing with other
497		 * threads and soabort() requires dropping locks, we must
498		 * loop waiting for the condition to be true.
499		 */
500		while (head->so_incqlen > head->so_qlimit) {
501			struct socket *sp;
502			sp = TAILQ_FIRST(&head->so_incomp);
503			TAILQ_REMOVE(&head->so_incomp, sp, so_list);
504			head->so_incqlen--;
505			sp->so_qstate &= ~SQ_INCOMP;
506			sp->so_head = NULL;
507			ACCEPT_UNLOCK();
508			soabort(sp);
509			ACCEPT_LOCK();
510		}
511		TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
512		so->so_qstate |= SQ_INCOMP;
513		head->so_incqlen++;
514	}
515	ACCEPT_UNLOCK();
516	if (connstatus) {
517		sorwakeup(head);
518		wakeup_one(&head->so_timeo);
519	}
520	return (so);
521}
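
/*
 * Illustrative protocol-side use of sonewconn() (a hedged sketch, loosely
 * modeled on how a TCP-like protocol might promote a completed handshake,
 * not any particular protocol's code):
 *
 *	struct socket *so;
 *
 *	so = sonewconn(head, SS_ISCONNECTED);
 *	if (so == NULL)
 *		goto drop;
 *
 * Passing SS_ISCONNECTED places 'so' directly on the completed queue and
 * wakes up accept(); passing 0 leaves it on the incomplete queue until
 * the protocol later calls soisconnected().
 */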
522
523int
524sobind(struct socket *so, struct sockaddr *nam, struct thread *td)
525{
526	int error;
527
528	CURVNET_SET(so->so_vnet);
529	error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td);
530	CURVNET_RESTORE();
531	return error;
532}
533
534/*
535 * solisten() transitions a socket from a non-listening state to a listening
536 * state, but can also be used to update the listen queue depth on an
537 * existing listen socket.  The protocol will call back into the sockets
538 * layer using solisten_proto_check() and solisten_proto() to check and set
539 * socket-layer listen state.  Call backs are used so that the protocol can
540 * acquire both protocol and socket layer locks in whatever order is required
541 * by the protocol.
542 *
543 * Protocol implementors are advised to hold the socket lock across the
544 * socket-layer test and set to avoid races at the socket layer.
545 */
546int
547solisten(struct socket *so, int backlog, struct thread *td)
548{
549	int error;
550
551	CURVNET_SET(so->so_vnet);
552	error = (*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td);
553	CURVNET_RESTORE();
554	return error;
555}
556
557int
558solisten_proto_check(struct socket *so)
559{
560
561	SOCK_LOCK_ASSERT(so);
562
563	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
564	    SS_ISDISCONNECTING))
565		return (EINVAL);
566	return (0);
567}
568
569void
570solisten_proto(struct socket *so, int backlog)
571{
572
573	SOCK_LOCK_ASSERT(so);
574
575	if (backlog < 0 || backlog > somaxconn)
576		backlog = somaxconn;
577	so->so_qlimit = backlog;
578	so->so_options |= SO_ACCEPTCONN;
579}
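
/*
 * Illustrative pru_listen skeleton using the two callbacks above (a
 * hedged sketch; "foo" and FOO_LOCK()/FOO_UNLOCK() are hypothetical, and
 * real protocols order their locks as their own invariants require):
 *
 *	static int
 *	foo_listen(struct socket *so, int backlog, struct thread *td)
 *	{
 *		int error;
 *
 *		FOO_LOCK();
 *		SOCK_LOCK(so);
 *		error = solisten_proto_check(so);
 *		if (error == 0)
 *			solisten_proto(so, backlog);
 *		SOCK_UNLOCK(so);
 *		FOO_UNLOCK();
 *		return (error);
 *	}
 */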
580
581/*
582 * Evaluate the reference count and named references on a socket; if no
583 * references remain, free it.  This should be called whenever a reference is
584 * released, such as in sorele(), but also when named reference flags are
585 * cleared in socket or protocol code.
586 *
587 * sofree() will free the socket if:
588 *
589 * - There are no outstanding file descriptor references or related consumers
590 *   (so_count == 0).
591 *
592 * - The socket has been closed by user space, if ever open (SS_NOFDREF).
593 *
594 * - The protocol does not have an outstanding strong reference on the socket
595 *   (SS_PROTOREF).
596 *
 * - The socket is not in a completed connection queue, where a process may
 *   already have been told it is present; removing it could leave the user
 *   process blocked in accept() despite select() saying the socket was ready.
600 */
601void
602sofree(struct socket *so)
603{
604	struct protosw *pr = so->so_proto;
605	struct socket *head;
606
607	ACCEPT_LOCK_ASSERT();
608	SOCK_LOCK_ASSERT(so);
609
610	if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 ||
611	    (so->so_state & SS_PROTOREF) || (so->so_qstate & SQ_COMP)) {
612		SOCK_UNLOCK(so);
613		ACCEPT_UNLOCK();
614		return;
615	}
616
617	head = so->so_head;
618	if (head != NULL) {
619		KASSERT((so->so_qstate & SQ_COMP) != 0 ||
620		    (so->so_qstate & SQ_INCOMP) != 0,
621		    ("sofree: so_head != NULL, but neither SQ_COMP nor "
622		    "SQ_INCOMP"));
623		KASSERT((so->so_qstate & SQ_COMP) == 0 ||
624		    (so->so_qstate & SQ_INCOMP) == 0,
625		    ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
626		TAILQ_REMOVE(&head->so_incomp, so, so_list);
627		head->so_incqlen--;
628		so->so_qstate &= ~SQ_INCOMP;
629		so->so_head = NULL;
630	}
631	KASSERT((so->so_qstate & SQ_COMP) == 0 &&
632	    (so->so_qstate & SQ_INCOMP) == 0,
633	    ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
634	    so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
635	if (so->so_options & SO_ACCEPTCONN) {
636		KASSERT((TAILQ_EMPTY(&so->so_comp)), ("sofree: so_comp populated"));
		KASSERT((TAILQ_EMPTY(&so->so_incomp)), ("sofree: so_incomp populated"));
638	}
639	SOCK_UNLOCK(so);
640	ACCEPT_UNLOCK();
641
642	VNET_SO_ASSERT(so);
643	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
644		(*pr->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
645	if (pr->pr_usrreqs->pru_detach != NULL)
646		(*pr->pr_usrreqs->pru_detach)(so);
647
648	/*
649	 * From this point on, we assume that no other references to this
650	 * socket exist anywhere else in the stack.  Therefore, no locks need
651	 * to be acquired or held.
652	 *
653	 * We used to do a lot of socket buffer and socket locking here, as
654	 * well as invoke sorflush() and perform wakeups.  The direct call to
655	 * dom_dispose() and sbrelease_internal() are an inlining of what was
656	 * necessary from sorflush().
657	 *
658	 * Notice that the socket buffer and kqueue state are torn down
 * before calling pru_detach.  This means that protocols should not
660	 * assume they can perform socket wakeups, etc, in their detach code.
661	 */
662	sbdestroy(&so->so_snd, so);
663	sbdestroy(&so->so_rcv, so);
664	knlist_destroy(&so->so_rcv.sb_sel.si_note);
665	knlist_destroy(&so->so_snd.sb_sel.si_note);
666	sodealloc(so);
667}
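
/*
 * Illustrative sketch of the soref()/sorele() pattern mentioned in the
 * life-cycle comment (hedged; the real accept path performs more steps):
 *
 *	SOCK_LOCK(so);
 *	soref(so);
 *	SOCK_UNLOCK(so);
 *	... remove 'so' from the listen queue, install it in a file ...
 *	ACCEPT_LOCK();
 *	SOCK_LOCK(so);
 *	sorele(so);
 *
 * sorele() may call sofree(); both locks are dropped by the time it
 * returns.
 */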
668
669/*
670 * Close a socket on last file table reference removal.  Initiate disconnect
671 * if connected.  Free socket when disconnect complete.
672 *
673 * This function will sorele() the socket.  Note that soclose() may be called
674 * prior to the ref count reaching zero.  The actual socket structure will
675 * not be freed until the ref count reaches zero.
676 */
677int
678soclose(struct socket *so)
679{
680	int error = 0;
681
682	KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));
683
684	CURVNET_SET(so->so_vnet);
685	funsetown(&so->so_sigio);
686	if (so->so_state & SS_ISCONNECTED) {
687		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
688			error = sodisconnect(so);
689			if (error) {
690				if (error == ENOTCONN)
691					error = 0;
692				goto drop;
693			}
694		}
695		if (so->so_options & SO_LINGER) {
696			if ((so->so_state & SS_ISDISCONNECTING) &&
697			    (so->so_state & SS_NBIO))
698				goto drop;
699			while (so->so_state & SS_ISCONNECTED) {
700				error = tsleep(&so->so_timeo,
701				    PSOCK | PCATCH, "soclos", so->so_linger * hz);
702				if (error)
703					break;
704			}
705		}
706	}
707
708drop:
709	if (so->so_proto->pr_usrreqs->pru_close != NULL)
710		(*so->so_proto->pr_usrreqs->pru_close)(so);
711	if (so->so_options & SO_ACCEPTCONN) {
712		struct socket *sp;
713		ACCEPT_LOCK();
714		while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
715			TAILQ_REMOVE(&so->so_incomp, sp, so_list);
716			so->so_incqlen--;
717			sp->so_qstate &= ~SQ_INCOMP;
718			sp->so_head = NULL;
719			ACCEPT_UNLOCK();
720			soabort(sp);
721			ACCEPT_LOCK();
722		}
723		while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
724			TAILQ_REMOVE(&so->so_comp, sp, so_list);
725			so->so_qlen--;
726			sp->so_qstate &= ~SQ_COMP;
727			sp->so_head = NULL;
728			ACCEPT_UNLOCK();
729			soabort(sp);
730			ACCEPT_LOCK();
731		}
732		ACCEPT_UNLOCK();
733	}
734	ACCEPT_LOCK();
735	SOCK_LOCK(so);
736	KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
737	so->so_state |= SS_NOFDREF;
738	sorele(so);
739	CURVNET_RESTORE();
740	return (error);
741}
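
/*
 * Illustrative userland counterpart (hedged) of the linger handling
 * above: the timeout that soclose() consults is configured with the
 * standard SO_LINGER socket option, e.g.:
 *
 *	struct linger l = { 1, 5 };	(l_onoff = 1, l_linger = 5 seconds)
 *
 *	setsockopt(s, SOL_SOCKET, SO_LINGER, &l, sizeof(l));
 *
 * With that set, a blocking close() sleeps in the loop above for up to
 * so_linger seconds waiting for the connection to drain.
 */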
742
743/*
744 * soabort() is used to abruptly tear down a connection, such as when a
745 * resource limit is reached (listen queue depth exceeded), or if a listen
746 * socket is closed while there are sockets waiting to be accepted.
747 *
748 * This interface is tricky, because it is called on an unreferenced socket,
749 * and must be called only by a thread that has actually removed the socket
750 * from the listen queue it was on, or races with other threads are risked.
751 *
752 * This interface will call into the protocol code, so must not be called
753 * with any socket locks held.  Protocols do call it while holding their own
754 * recursible protocol mutexes, but this is something that should be subject
755 * to review in the future.
756 */
757void
758soabort(struct socket *so)
759{
760
761	/*
762	 * In as much as is possible, assert that no references to this
763	 * socket are held.  This is not quite the same as asserting that the
764	 * current thread is responsible for arranging for no references, but
765	 * is as close as we can get for now.
766	 */
767	KASSERT(so->so_count == 0, ("soabort: so_count"));
768	KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF"));
769	KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF"));
	KASSERT((so->so_qstate & SQ_COMP) == 0, ("soabort: SQ_COMP"));
	KASSERT((so->so_qstate & SQ_INCOMP) == 0, ("soabort: SQ_INCOMP"));
772	VNET_SO_ASSERT(so);
773
774	if (so->so_proto->pr_usrreqs->pru_abort != NULL)
775		(*so->so_proto->pr_usrreqs->pru_abort)(so);
776	ACCEPT_LOCK();
777	SOCK_LOCK(so);
778	sofree(so);
779}
780
781int
782soaccept(struct socket *so, struct sockaddr **nam)
783{
784	int error;
785
786	SOCK_LOCK(so);
787	KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
788	so->so_state &= ~SS_NOFDREF;
789	SOCK_UNLOCK(so);
790
791	CURVNET_SET(so->so_vnet);
792	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
793	CURVNET_RESTORE();
794	return (error);
795}
796
797int
798soconnect(struct socket *so, struct sockaddr *nam, struct thread *td)
799{
800	int error;
801
802	if (so->so_options & SO_ACCEPTCONN)
803		return (EOPNOTSUPP);
804
805	CURVNET_SET(so->so_vnet);
806	/*
807	 * If protocol is connection-based, can only connect once.
808	 * Otherwise, if connected, try to disconnect first.  This allows
809	 * user to disconnect by connecting to, e.g., a null address.
810	 */
811	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
812	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
813	    (error = sodisconnect(so)))) {
814		error = EISCONN;
815	} else {
816		/*
817		 * Prevent accumulated error from previous connection from
818		 * biting us.
819		 */
820		so->so_error = 0;
821		error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td);
822	}
823	CURVNET_RESTORE();
824
825	return (error);
826}
827
828int
829soconnect2(struct socket *so1, struct socket *so2)
830{
831	int error;
832
833	CURVNET_SET(so1->so_vnet);
834	error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
835	CURVNET_RESTORE();
836	return (error);
837}
838
839int
840sodisconnect(struct socket *so)
841{
842	int error;
843
844	if ((so->so_state & SS_ISCONNECTED) == 0)
845		return (ENOTCONN);
846	if (so->so_state & SS_ISDISCONNECTING)
847		return (EALREADY);
848	VNET_SO_ASSERT(so);
849	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
850	return (error);
851}
852
853#ifdef ZERO_COPY_SOCKETS
struct so_zerocopy_stats {
855	int size_ok;
856	int align_ok;
857	int found_ifp;
858};
859struct so_zerocopy_stats so_zerocp_stats = {0,0,0};
860#include <netinet/in.h>
861#include <net/route.h>
862#include <netinet/in_pcb.h>
863#include <vm/vm.h>
864#include <vm/vm_page.h>
865#include <vm/vm_object.h>
866
867/*
868 * sosend_copyin() is only used if zero copy sockets are enabled.  Otherwise
869 * sosend_dgram() and sosend_generic() use m_uiotombuf().
870 *
871 * sosend_copyin() accepts a uio and prepares an mbuf chain holding part or
872 * all of the data referenced by the uio.  If desired, it uses zero-copy.
873 * *space will be updated to reflect data copied in.
874 *
875 * NB: If atomic I/O is requested, the caller must already have checked that
876 * space can hold resid bytes.
877 *
878 * NB: In the event of an error, the caller may need to free the partial
879 * chain pointed to by *mpp.  The contents of both *uio and *space may be
880 * modified even in the case of an error.
881 */
882static int
883sosend_copyin(struct uio *uio, struct mbuf **retmp, int atomic, long *space,
884    int flags)
885{
886	struct mbuf *m, **mp, *top;
887	long len, resid;
888	int error;
889#ifdef ZERO_COPY_SOCKETS
890	int cow_send;
891#endif
892
893	*retmp = top = NULL;
894	mp = &top;
895	len = 0;
896	resid = uio->uio_resid;
897	error = 0;
898	do {
899#ifdef ZERO_COPY_SOCKETS
900		cow_send = 0;
901#endif /* ZERO_COPY_SOCKETS */
902		if (resid >= MINCLSIZE) {
903#ifdef ZERO_COPY_SOCKETS
904			if (top == NULL) {
905				m = m_gethdr(M_WAITOK, MT_DATA);
906				m->m_pkthdr.len = 0;
907				m->m_pkthdr.rcvif = NULL;
908			} else
909				m = m_get(M_WAITOK, MT_DATA);
910			if (so_zero_copy_send &&
			    resid >= PAGE_SIZE &&
			    *space >= PAGE_SIZE &&
			    uio->uio_iov->iov_len >= PAGE_SIZE) {
914				so_zerocp_stats.size_ok++;
915				so_zerocp_stats.align_ok++;
916				cow_send = socow_setup(m, uio);
917				len = cow_send;
918			}
919			if (!cow_send) {
920				m_clget(m, M_WAITOK);
921				len = min(min(MCLBYTES, resid), *space);
922			}
923#else /* ZERO_COPY_SOCKETS */
924			if (top == NULL) {
925				m = m_getcl(M_WAIT, MT_DATA, M_PKTHDR);
926				m->m_pkthdr.len = 0;
927				m->m_pkthdr.rcvif = NULL;
928			} else
929				m = m_getcl(M_WAIT, MT_DATA, 0);
930			len = min(min(MCLBYTES, resid), *space);
931#endif /* ZERO_COPY_SOCKETS */
932		} else {
933			if (top == NULL) {
934				m = m_gethdr(M_WAIT, MT_DATA);
935				m->m_pkthdr.len = 0;
936				m->m_pkthdr.rcvif = NULL;
937
938				len = min(min(MHLEN, resid), *space);
939				/*
940				 * For datagram protocols, leave room
941				 * for protocol headers in first mbuf.
942				 */
943				if (atomic && m && len < MHLEN)
944					MH_ALIGN(m, len);
945			} else {
946				m = m_get(M_WAIT, MT_DATA);
947				len = min(min(MLEN, resid), *space);
948			}
949		}
950		if (m == NULL) {
951			error = ENOBUFS;
952			goto out;
953		}
954
955		*space -= len;
956#ifdef ZERO_COPY_SOCKETS
957		if (cow_send)
958			error = 0;
959		else
960#endif /* ZERO_COPY_SOCKETS */
961		error = uiomove(mtod(m, void *), (int)len, uio);
962		resid = uio->uio_resid;
963		m->m_len = len;
964		*mp = m;
965		top->m_pkthdr.len += len;
966		if (error)
967			goto out;
968		mp = &m->m_next;
969		if (resid <= 0) {
970			if (flags & MSG_EOR)
971				top->m_flags |= M_EOR;
972			break;
973		}
974	} while (*space > 0 && atomic);
975out:
976	*retmp = top;
977	return (error);
978}
979#endif /*ZERO_COPY_SOCKETS*/
980
981#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
982
983int
984sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio,
985    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
986{
987	long space, resid;
988	int clen = 0, error, dontroute;
989#ifdef ZERO_COPY_SOCKETS
990	int atomic = sosendallatonce(so) || top;
991#endif
992
	KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM"));
	KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
	    ("sosend_dgram: !PR_ATOMIC"));
996
997	if (uio != NULL)
998		resid = uio->uio_resid;
999	else
1000		resid = top->m_pkthdr.len;
1001	/*
1002	 * In theory resid should be unsigned.  However, space must be
1003	 * signed, as it might be less than 0 if we over-committed, and we
1004	 * must use a signed comparison of space and resid.  On the other
1005	 * hand, a negative resid causes us to loop sending 0-length
1006	 * segments to the protocol.
1007	 */
1008	if (resid < 0) {
1009		error = EINVAL;
1010		goto out;
1011	}
1012
1013	dontroute =
1014	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0;
1015	if (td != NULL)
1016		td->td_ru.ru_msgsnd++;
1017	if (control != NULL)
1018		clen = control->m_len;
1019
1020	SOCKBUF_LOCK(&so->so_snd);
1021	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
1022		SOCKBUF_UNLOCK(&so->so_snd);
1023		error = EPIPE;
1024		goto out;
1025	}
1026	if (so->so_error) {
1027		error = so->so_error;
1028		so->so_error = 0;
1029		SOCKBUF_UNLOCK(&so->so_snd);
1030		goto out;
1031	}
1032	if ((so->so_state & SS_ISCONNECTED) == 0) {
1033		/*
		 * `sendto' and `sendmsg' are allowed on a connection-based
1035		 * socket if it supports implied connect.  Return ENOTCONN if
1036		 * not connected and no address is supplied.
1037		 */
1038		if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
1039		    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
1040			if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1041			    !(resid == 0 && clen != 0)) {
1042				SOCKBUF_UNLOCK(&so->so_snd);
1043				error = ENOTCONN;
1044				goto out;
1045			}
1046		} else if (addr == NULL) {
1047			if (so->so_proto->pr_flags & PR_CONNREQUIRED)
1048				error = ENOTCONN;
1049			else
1050				error = EDESTADDRREQ;
1051			SOCKBUF_UNLOCK(&so->so_snd);
1052			goto out;
1053		}
1054	}
1055
1056	/*
1057	 * Do we need MSG_OOB support in SOCK_DGRAM?  Signs here may be a
1058	 * problem and need fixing.
1059	 */
1060	space = sbspace(&so->so_snd);
1061	if (flags & MSG_OOB)
1062		space += 1024;
1063	space -= clen;
1064	SOCKBUF_UNLOCK(&so->so_snd);
1065	if (resid > space) {
1066		error = EMSGSIZE;
1067		goto out;
1068	}
1069	if (uio == NULL) {
1070		resid = 0;
1071		if (flags & MSG_EOR)
1072			top->m_flags |= M_EOR;
1073	} else {
1074#ifdef ZERO_COPY_SOCKETS
1075		error = sosend_copyin(uio, &top, atomic, &space, flags);
1076		if (error)
1077			goto out;
1078#else
1079		/*
1080		 * Copy the data from userland into a mbuf chain.
1081		 * If no data is to be copied in, a single empty mbuf
1082		 * is returned.
1083		 */
1084		top = m_uiotombuf(uio, M_WAITOK, space, max_hdr,
1085		    (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0)));
1086		if (top == NULL) {
1087			error = EFAULT;	/* only possible error */
1088			goto out;
1089		}
1090		space -= resid - uio->uio_resid;
1091#endif
1092		resid = uio->uio_resid;
1093	}
1094	KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
1095	/*
1096	 * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock
1097	 * than with.
1098	 */
1099	if (dontroute) {
1100		SOCK_LOCK(so);
1101		so->so_options |= SO_DONTROUTE;
1102		SOCK_UNLOCK(so);
1103	}
1104	/*
1105	 * XXX all the SBS_CANTSENDMORE checks previously done could be out
	 * of date.  We could have received a reset packet in an interrupt or
1107	 * maybe we slept while doing page faults in uiomove() etc.  We could
1108	 * probably recheck again inside the locking protection here, but
1109	 * there are probably other places that this also happens.  We must
1110	 * rethink this.
1111	 */
1112	VNET_SO_ASSERT(so);
1113	error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1114	    (flags & MSG_OOB) ? PRUS_OOB :
1115	/*
	 * If the user set MSG_EOF, the protocol understands this flag, and there
	 * is nothing left to send, then use PRU_SEND_EOF instead of PRU_SEND.
1118	 */
1119	    ((flags & MSG_EOF) &&
1120	     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1121	     (resid <= 0)) ?
1122		PRUS_EOF :
1123		/* If there is more to send set PRUS_MORETOCOME */
1124		(resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
1125		top, addr, control, td);
1126	if (dontroute) {
1127		SOCK_LOCK(so);
1128		so->so_options &= ~SO_DONTROUTE;
1129		SOCK_UNLOCK(so);
1130	}
1131	clen = 0;
1132	control = NULL;
1133	top = NULL;
1134out:
1135	if (top != NULL)
1136		m_freem(top);
1137	if (control != NULL)
1138		m_freem(control);
1139	return (error);
1140}
1141
1142/*
1143 * Send on a socket.  If send must go all at once and message is larger than
1144 * send buffering, then hard error.  Lock against other senders.  If must go
1145 * all at once and not enough room now, then inform user that this would
1146 * block and do nothing.  Otherwise, if nonblocking, send as much as
1147 * possible.  The data to be sent is described by "uio" if nonzero, otherwise
1148 * by the mbuf chain "top" (which must be null if uio is not).  Data provided
1149 * in mbuf chain must be small enough to send all at once.
1150 *
1151 * Returns nonzero on error, timeout or signal; callers must check for short
1152 * counts if EINTR/ERESTART are returned.  Data and control buffers are freed
1153 * on return.
1154 */
1155int
1156sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio,
1157    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
1158{
1159	long space, resid;
1160	int clen = 0, error, dontroute;
1161	int atomic = sosendallatonce(so) || top;
1162
1163	if (uio != NULL)
1164		resid = uio->uio_resid;
1165	else
1166		resid = top->m_pkthdr.len;
1167	/*
1168	 * In theory resid should be unsigned.  However, space must be
1169	 * signed, as it might be less than 0 if we over-committed, and we
1170	 * must use a signed comparison of space and resid.  On the other
1171	 * hand, a negative resid causes us to loop sending 0-length
1172	 * segments to the protocol.
1173	 *
1174	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
1175	 * type sockets since that's an error.
1176	 */
1177	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
1178		error = EINVAL;
1179		goto out;
1180	}
1181
1182	dontroute =
1183	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
1184	    (so->so_proto->pr_flags & PR_ATOMIC);
1185	if (td != NULL)
1186		td->td_ru.ru_msgsnd++;
1187	if (control != NULL)
1188		clen = control->m_len;
1189
1190	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1191	if (error)
1192		goto out;
1193
1194restart:
1195	do {
1196		SOCKBUF_LOCK(&so->so_snd);
1197		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
1198			SOCKBUF_UNLOCK(&so->so_snd);
1199			error = EPIPE;
1200			goto release;
1201		}
1202		if (so->so_error) {
1203			error = so->so_error;
1204			so->so_error = 0;
1205			SOCKBUF_UNLOCK(&so->so_snd);
1206			goto release;
1207		}
1208		if ((so->so_state & SS_ISCONNECTED) == 0) {
1209			/*
			 * `sendto' and `sendmsg' are allowed on a connection-
1211			 * based socket if it supports implied connect.
1212			 * Return ENOTCONN if not connected and no address is
1213			 * supplied.
1214			 */
1215			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
1216			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
1217				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1218				    !(resid == 0 && clen != 0)) {
1219					SOCKBUF_UNLOCK(&so->so_snd);
1220					error = ENOTCONN;
1221					goto release;
1222				}
1223			} else if (addr == NULL) {
1224				SOCKBUF_UNLOCK(&so->so_snd);
1225				if (so->so_proto->pr_flags & PR_CONNREQUIRED)
1226					error = ENOTCONN;
1227				else
1228					error = EDESTADDRREQ;
1229				goto release;
1230			}
1231		}
1232		space = sbspace(&so->so_snd);
1233		if (flags & MSG_OOB)
1234			space += 1024;
1235		if ((atomic && resid > so->so_snd.sb_hiwat) ||
1236		    clen > so->so_snd.sb_hiwat) {
1237			SOCKBUF_UNLOCK(&so->so_snd);
1238			error = EMSGSIZE;
1239			goto release;
1240		}
1241		if (space < resid + clen &&
1242		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
1243			if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) {
1244				SOCKBUF_UNLOCK(&so->so_snd);
1245				error = EWOULDBLOCK;
1246				goto release;
1247			}
1248			error = sbwait(&so->so_snd);
1249			SOCKBUF_UNLOCK(&so->so_snd);
1250			if (error)
1251				goto release;
1252			goto restart;
1253		}
1254		SOCKBUF_UNLOCK(&so->so_snd);
1255		space -= clen;
1256		do {
1257			if (uio == NULL) {
1258				resid = 0;
1259				if (flags & MSG_EOR)
1260					top->m_flags |= M_EOR;
1261			} else {
1262#ifdef ZERO_COPY_SOCKETS
1263				error = sosend_copyin(uio, &top, atomic,
1264				    &space, flags);
1265				if (error != 0)
1266					goto release;
1267#else
1268				/*
1269				 * Copy the data from userland into a mbuf
1270				 * chain.  If no data is to be copied in,
1271				 * a single empty mbuf is returned.
1272				 */
1273				top = m_uiotombuf(uio, M_WAITOK, space,
1274				    (atomic ? max_hdr : 0),
1275				    (atomic ? M_PKTHDR : 0) |
1276				    ((flags & MSG_EOR) ? M_EOR : 0));
1277				if (top == NULL) {
1278					error = EFAULT; /* only possible error */
1279					goto release;
1280				}
1281				space -= resid - uio->uio_resid;
1282#endif
1283				resid = uio->uio_resid;
1284			}
1285			if (dontroute) {
1286				SOCK_LOCK(so);
1287				so->so_options |= SO_DONTROUTE;
1288				SOCK_UNLOCK(so);
1289			}
1290			/*
1291			 * XXX all the SBS_CANTSENDMORE checks previously
			 * done could be out of date.  We could have received
1293			 * a reset packet in an interrupt or maybe we slept
1294			 * while doing page faults in uiomove() etc.  We
1295			 * could probably recheck again inside the locking
1296			 * protection here, but there are probably other
1297			 * places that this also happens.  We must rethink
1298			 * this.
1299			 */
1300			VNET_SO_ASSERT(so);
1301			error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1302			    (flags & MSG_OOB) ? PRUS_OOB :
1303			/*
			 * If the user set MSG_EOF, the protocol understands
			 * this flag, and there is nothing left to send, then
			 * use PRU_SEND_EOF instead of PRU_SEND.
1307			 */
1308			    ((flags & MSG_EOF) &&
1309			     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1310			     (resid <= 0)) ?
1311				PRUS_EOF :
1312			/* If there is more to send set PRUS_MORETOCOME. */
1313			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
1314			    top, addr, control, td);
1315			if (dontroute) {
1316				SOCK_LOCK(so);
1317				so->so_options &= ~SO_DONTROUTE;
1318				SOCK_UNLOCK(so);
1319			}
1320			clen = 0;
1321			control = NULL;
1322			top = NULL;
1323			if (error)
1324				goto release;
1325		} while (resid && space > 0);
1326	} while (resid);
1327
1328release:
1329	sbunlock(&so->so_snd);
1330out:
1331	if (top != NULL)
1332		m_freem(top);
1333	if (control != NULL)
1334		m_freem(control);
1335	return (error);
1336}
1337
1338int
1339sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
1340    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
1341{
1342	int error;
1343
1344	CURVNET_SET(so->so_vnet);
1345	error = so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top,
1346	    control, flags, td);
1347	CURVNET_RESTORE();
1348	return (error);
1349}
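
/*
 * Illustrative in-kernel use of sosend() with a uio (a hedged sketch, not
 * taken from a real consumer; 'so', 'buf', 'len' and 'td' are assumed to
 * be supplied by the caller):
 *
 *	struct uio auio;
 *	struct iovec aiov;
 *	int error;
 *
 *	aiov.iov_base = buf;
 *	aiov.iov_len = len;
 *	auio.uio_iov = &aiov;
 *	auio.uio_iovcnt = 1;
 *	auio.uio_offset = 0;
 *	auio.uio_resid = len;
 *	auio.uio_segflg = UIO_SYSSPACE;
 *	auio.uio_rw = UIO_WRITE;
 *	auio.uio_td = td;
 *	error = sosend(so, NULL, &auio, NULL, NULL, 0, td);
 */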
1350
1351/*
1352 * The part of soreceive() that implements reading non-inline out-of-band
1353 * data from a socket.  For more complete comments, see soreceive(), from
1354 * which this code originated.
1355 *
1356 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
1357 * unable to return an mbuf chain to the caller.
1358 */
1359static int
1360soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
1361{
1362	struct protosw *pr = so->so_proto;
1363	struct mbuf *m;
1364	int error;
1365
1366	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
1367	VNET_SO_ASSERT(so);
1368
1369	m = m_get(M_WAIT, MT_DATA);
1370	error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
1371	if (error)
1372		goto bad;
1373	do {
1374#ifdef ZERO_COPY_SOCKETS
1375		if (so_zero_copy_receive) {
1376			int disposable;
1377
1378			if ((m->m_flags & M_EXT)
1379			 && (m->m_ext.ext_type == EXT_DISPOSABLE))
1380				disposable = 1;
1381			else
1382				disposable = 0;
1383
1384			error = uiomoveco(mtod(m, void *),
1385					  min(uio->uio_resid, m->m_len),
1386					  uio, disposable);
1387		} else
1388#endif /* ZERO_COPY_SOCKETS */
1389		error = uiomove(mtod(m, void *),
1390		    (int) min(uio->uio_resid, m->m_len), uio);
1391		m = m_free(m);
1392	} while (uio->uio_resid && error == 0 && m);
1393bad:
1394	if (m != NULL)
1395		m_freem(m);
1396	return (error);
1397}
1398
1399/*
1400 * Following replacement or removal of the first mbuf on the first mbuf chain
1401 * of a socket buffer, push necessary state changes back into the socket
1402 * buffer so that other consumers see the values consistently.  'nextrecord'
1403 * is the callers locally stored value of the original value of
1404 * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
1405 * NOTE: 'nextrecord' may be NULL.
1406 */
1407static __inline void
1408sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
1409{
1410
1411	SOCKBUF_LOCK_ASSERT(sb);
1412	/*
1413	 * First, update for the new value of nextrecord.  If necessary, make
1414	 * it the first record.
1415	 */
1416	if (sb->sb_mb != NULL)
1417		sb->sb_mb->m_nextpkt = nextrecord;
1418	else
1419		sb->sb_mb = nextrecord;
1420
	/*
	 * Now update any dependent socket buffer fields to reflect the new
	 * state.  This is an expanded inline of SB_EMPTY_FIXUP(), with the
	 * addition of a second clause that takes care of the case where
	 * sb_mb has been updated, but remains the last record.
	 */
	if (sb->sb_mb == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (sb->sb_mb->m_nextpkt == NULL)
		sb->sb_lastrecord = sb->sb_mb;
1432}
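
/*
 * For example (illustrative): after soreceive_generic() below frees a
 * leading MT_SONAME or MT_CONTROL mbuf, it calls sockbuf_pushsync() with
 * its saved nextrecord so that sb_mbtail and sb_lastrecord again match
 * what SB_EMPTY_FIXUP() would compute for the new sb_mb.
 */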
1433
1434
1435/*
1436 * Implement receive operations on a socket.  We depend on the way that
1437 * records are added to the sockbuf by sbappend.  In particular, each record
1438 * (mbufs linked through m_next) must begin with an address if the protocol
1439 * so specifies, followed by an optional mbuf or mbufs containing ancillary
1440 * data, and then zero or more mbufs of data.  In order to allow parallelism
1441 * between network receive and copying to user space, as well as avoid
1442 * sleeping with a mutex held, we release the socket buffer mutex during the
1443 * user space copy.  Although the sockbuf is locked, new data may still be
1444 * appended, and thus we must maintain consistency of the sockbuf during that
1445 * time.
1446 *
1447 * The caller may receive the data as a single mbuf chain by supplying an
1448 * mbuf **mp0 for use in returning the chain.  The uio is then used only for
1449 * the count in uio_resid.
1450 */
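
/*
 * Illustrative mbuf-chain receive via mp0 (a hedged sketch; 'so', 'len'
 * and 'td' are assumed caller-supplied, and soreceive() is the public
 * wrapper that dispatches here for most protocols):
 *
 *	struct uio auio;
 *	struct mbuf *m = NULL;
 *	int error, flags = 0;
 *
 *	bzero(&auio, sizeof(auio));
 *	auio.uio_resid = len;		(only the count is used with mp0)
 *	auio.uio_td = td;
 *	error = soreceive(so, NULL, &auio, &m, NULL, &flags);
 *
 * On success, 'm' holds the received chain and uio_resid reflects any
 * shortfall.
 */
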
1451int
1452soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio,
1453    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1454{
1455	struct mbuf *m, **mp;
1456	int flags, len, error, offset;
1457	struct protosw *pr = so->so_proto;
1458	struct mbuf *nextrecord;
1459	int moff, type = 0;
1460	int orig_resid = uio->uio_resid;
1461
1462	mp = mp0;
1463	if (psa != NULL)
1464		*psa = NULL;
1465	if (controlp != NULL)
1466		*controlp = NULL;
1467	if (flagsp != NULL)
1468		flags = *flagsp &~ MSG_EOR;
1469	else
1470		flags = 0;
1471	if (flags & MSG_OOB)
1472		return (soreceive_rcvoob(so, uio, flags));
1473	if (mp != NULL)
1474		*mp = NULL;
1475	if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING)
1476	    && uio->uio_resid) {
1477		VNET_SO_ASSERT(so);
1478		(*pr->pr_usrreqs->pru_rcvd)(so, 0);
1479	}
1480
1481	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
1482	if (error)
1483		return (error);
1484
1485restart:
1486	SOCKBUF_LOCK(&so->so_rcv);
1487	m = so->so_rcv.sb_mb;
1488	/*
1489	 * If we have less data than requested, block awaiting more (subject
1490	 * to any timeout) if:
1491	 *   1. the current count is less than the low water mark, or
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat), and
	 *   3. MSG_DONTWAIT is not set.
1495	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
1496	 * we have to do the receive in sections, and thus risk returning a
1497	 * short count if a timeout or signal occurs after we start.
1498	 */
1499	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
1500	    so->so_rcv.sb_cc < uio->uio_resid) &&
1501	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
1502	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
1503	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
1504		KASSERT(m != NULL || !so->so_rcv.sb_cc,
1505		    ("receive: m == %p so->so_rcv.sb_cc == %u",
1506		    m, so->so_rcv.sb_cc));
1507		if (so->so_error) {
1508			if (m != NULL)
1509				goto dontblock;
1510			error = so->so_error;
1511			if ((flags & MSG_PEEK) == 0)
1512				so->so_error = 0;
1513			SOCKBUF_UNLOCK(&so->so_rcv);
1514			goto release;
1515		}
1516		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1517		if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
1518			if (m == NULL) {
1519				SOCKBUF_UNLOCK(&so->so_rcv);
1520				goto release;
1521			} else
1522				goto dontblock;
1523		}
1524		for (; m != NULL; m = m->m_next)
			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
1526				m = so->so_rcv.sb_mb;
1527				goto dontblock;
1528			}
1529		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
1530		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
1531			SOCKBUF_UNLOCK(&so->so_rcv);
1532			error = ENOTCONN;
1533			goto release;
1534		}
1535		if (uio->uio_resid == 0) {
1536			SOCKBUF_UNLOCK(&so->so_rcv);
1537			goto release;
1538		}
1539		if ((so->so_state & SS_NBIO) ||
1540		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
1541			SOCKBUF_UNLOCK(&so->so_rcv);
1542			error = EWOULDBLOCK;
1543			goto release;
1544		}
1545		SBLASTRECORDCHK(&so->so_rcv);
1546		SBLASTMBUFCHK(&so->so_rcv);
1547		error = sbwait(&so->so_rcv);
1548		SOCKBUF_UNLOCK(&so->so_rcv);
1549		if (error)
1550			goto release;
1551		goto restart;
1552	}
1553dontblock:
1554	/*
1555	 * From this point onward, we maintain 'nextrecord' as a cache of the
1556	 * pointer to the next record in the socket buffer.  We must keep the
1557	 * various socket buffer pointers and local stack versions of the
1558	 * pointers in sync, pushing out modifications before dropping the
1559	 * socket buffer mutex, and re-reading them when picking it up.
1560	 *
1561	 * Otherwise, we will race with the network stack appending new data
1562	 * or records onto the socket buffer by using inconsistent/stale
1563	 * versions of the field, possibly resulting in socket buffer
1564	 * corruption.
1565	 *
1566	 * By holding the high-level sblock(), we prevent simultaneous
1567	 * readers from pulling off the front of the socket buffer.
1568	 */
1569	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1570	if (uio->uio_td)
1571		uio->uio_td->td_ru.ru_msgrcv++;
1572	KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
1573	SBLASTRECORDCHK(&so->so_rcv);
1574	SBLASTMBUFCHK(&so->so_rcv);
1575	nextrecord = m->m_nextpkt;
1576	if (pr->pr_flags & PR_ADDR) {
1577		KASSERT(m->m_type == MT_SONAME,
1578		    ("m->m_type == %d", m->m_type));
1579		orig_resid = 0;
1580		if (psa != NULL)
1581			*psa = sodupsockaddr(mtod(m, struct sockaddr *),
1582			    M_NOWAIT);
1583		if (flags & MSG_PEEK) {
1584			m = m->m_next;
1585		} else {
1586			sbfree(&so->so_rcv, m);
1587			so->so_rcv.sb_mb = m_free(m);
1588			m = so->so_rcv.sb_mb;
1589			sockbuf_pushsync(&so->so_rcv, nextrecord);
1590		}
1591	}
1592
1593	/*
1594	 * Process one or more MT_CONTROL mbufs present before any data mbufs
1595	 * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
1596	 * just copy the data; if !MSG_PEEK, we call into the protocol to
1597	 * perform externalization (or freeing if controlp == NULL).
1598	 */
1599	if (m != NULL && m->m_type == MT_CONTROL) {
1600		struct mbuf *cm = NULL, *cmn;
1601		struct mbuf **cme = &cm;
1602
1603		do {
1604			if (flags & MSG_PEEK) {
1605				if (controlp != NULL) {
1606					*controlp = m_copy(m, 0, m->m_len);
1607					controlp = &(*controlp)->m_next;
1608				}
1609				m = m->m_next;
1610			} else {
1611				sbfree(&so->so_rcv, m);
1612				so->so_rcv.sb_mb = m->m_next;
1613				m->m_next = NULL;
1614				*cme = m;
1615				cme = &(*cme)->m_next;
1616				m = so->so_rcv.sb_mb;
1617			}
1618		} while (m != NULL && m->m_type == MT_CONTROL);
1619		if ((flags & MSG_PEEK) == 0)
1620			sockbuf_pushsync(&so->so_rcv, nextrecord);
1621		while (cm != NULL) {
1622			cmn = cm->m_next;
1623			cm->m_next = NULL;
1624			if (pr->pr_domain->dom_externalize != NULL) {
1625				SOCKBUF_UNLOCK(&so->so_rcv);
1626				VNET_SO_ASSERT(so);
1627				error = (*pr->pr_domain->dom_externalize)
1628				    (cm, controlp);
1629				SOCKBUF_LOCK(&so->so_rcv);
1630			} else if (controlp != NULL)
1631				*controlp = cm;
1632			else
1633				m_freem(cm);
1634			if (controlp != NULL) {
1635				orig_resid = 0;
1636				while (*controlp != NULL)
1637					controlp = &(*controlp)->m_next;
1638			}
1639			cm = cmn;
1640		}
1641		if (m != NULL)
1642			nextrecord = so->so_rcv.sb_mb->m_nextpkt;
1643		else
1644			nextrecord = so->so_rcv.sb_mb;
1645		orig_resid = 0;
1646	}
1647	if (m != NULL) {
1648		if ((flags & MSG_PEEK) == 0) {
1649			KASSERT(m->m_nextpkt == nextrecord,
1650			    ("soreceive: post-control, nextrecord !sync"));
1651			if (nextrecord == NULL) {
1652				KASSERT(so->so_rcv.sb_mb == m,
1653				    ("soreceive: post-control, sb_mb!=m"));
1654				KASSERT(so->so_rcv.sb_lastrecord == m,
1655				    ("soreceive: post-control, lastrecord!=m"));
1656			}
1657		}
1658		type = m->m_type;
1659		if (type == MT_OOBDATA)
1660			flags |= MSG_OOB;
1661	} else {
1662		if ((flags & MSG_PEEK) == 0) {
1663			KASSERT(so->so_rcv.sb_mb == nextrecord,
1664			    ("soreceive: sb_mb != nextrecord"));
1665			if (so->so_rcv.sb_mb == NULL) {
1666				KASSERT(so->so_rcv.sb_lastrecord == NULL,
				    ("soreceive: sb_lastrecord != NULL"));
1668			}
1669		}
1670	}
1671	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1672	SBLASTRECORDCHK(&so->so_rcv);
1673	SBLASTMBUFCHK(&so->so_rcv);
1674
1675	/*
1676	 * Now continue to read any data mbufs off of the head of the socket
1677	 * buffer until the read request is satisfied.  Note that 'type' is
1678	 * used to store the type of any mbuf reads that have happened so far
1679	 * such that soreceive() can stop reading if the type changes, which
1680	 * causes soreceive() to return only one of regular data and inline
1681	 * out-of-band data in a single socket receive operation.
1682	 */
1683	moff = 0;
1684	offset = 0;
1685	while (m != NULL && uio->uio_resid > 0 && error == 0) {
1686		/*
1687		 * If the type of mbuf has changed since the last mbuf
1688		 * examined ('type'), end the receive operation.
1689	 	 */
1690		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1691		if (m->m_type == MT_OOBDATA) {
1692			if (type != MT_OOBDATA)
1693				break;
1694		} else if (type == MT_OOBDATA)
1695			break;
1696		else
1697		    KASSERT(m->m_type == MT_DATA,
1698			("m->m_type == %d", m->m_type));
1699		so->so_rcv.sb_state &= ~SBS_RCVATMARK;
1700		len = uio->uio_resid;
1701		if (so->so_oobmark && len > so->so_oobmark - offset)
1702			len = so->so_oobmark - offset;
1703		if (len > m->m_len - moff)
1704			len = m->m_len - moff;
1705		/*
		 * If mp is set, just pass back the mbufs.  Otherwise copy
		 * them out via the uio, then free.  The sockbuf must be
		 * consistent here (sb_mb points to the current mbuf and
		 * m_nextpkt to the next record) when we drop the lock; we
		 * must note any additions to the sockbuf when we reacquire it.
1711		 */
1712		if (mp == NULL) {
1713			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1714			SBLASTRECORDCHK(&so->so_rcv);
1715			SBLASTMBUFCHK(&so->so_rcv);
1716			SOCKBUF_UNLOCK(&so->so_rcv);
1717#ifdef ZERO_COPY_SOCKETS
1718			if (so_zero_copy_receive) {
1719				int disposable;
1720
1721				if ((m->m_flags & M_EXT)
1722				 && (m->m_ext.ext_type == EXT_DISPOSABLE))
1723					disposable = 1;
1724				else
1725					disposable = 0;
1726
1727				error = uiomoveco(mtod(m, char *) + moff,
1728						  (int)len, uio,
1729						  disposable);
1730			} else
1731#endif /* ZERO_COPY_SOCKETS */
1732			error = uiomove(mtod(m, char *) + moff, (int)len, uio);
1733			SOCKBUF_LOCK(&so->so_rcv);
1734			if (error) {
1735				/*
1736				 * The MT_SONAME mbuf has already been removed
1737				 * from the record, so it is necessary to
1738				 * remove the data mbufs, if any, to preserve
1739				 * the invariant in the case of PR_ADDR that
1740				 * requires MT_SONAME mbufs at the head of
1741				 * each record.
1742				 */
1743				if (m && pr->pr_flags & PR_ATOMIC &&
1744				    ((flags & MSG_PEEK) == 0))
1745					(void)sbdroprecord_locked(&so->so_rcv);
1746				SOCKBUF_UNLOCK(&so->so_rcv);
1747				goto release;
1748			}
1749		} else
1750			uio->uio_resid -= len;
1751		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1752		if (len == m->m_len - moff) {
1753			if (m->m_flags & M_EOR)
1754				flags |= MSG_EOR;
1755			if (flags & MSG_PEEK) {
1756				m = m->m_next;
1757				moff = 0;
1758			} else {
1759				nextrecord = m->m_nextpkt;
1760				sbfree(&so->so_rcv, m);
1761				if (mp != NULL) {
1762					*mp = m;
1763					mp = &m->m_next;
1764					so->so_rcv.sb_mb = m = m->m_next;
1765					*mp = NULL;
1766				} else {
1767					so->so_rcv.sb_mb = m_free(m);
1768					m = so->so_rcv.sb_mb;
1769				}
1770				sockbuf_pushsync(&so->so_rcv, nextrecord);
1771				SBLASTRECORDCHK(&so->so_rcv);
1772				SBLASTMBUFCHK(&so->so_rcv);
1773			}
1774		} else {
1775			if (flags & MSG_PEEK)
1776				moff += len;
1777			else {
1778				if (mp != NULL) {
1779					int copy_flag;
1780
1781					if (flags & MSG_DONTWAIT)
1782						copy_flag = M_DONTWAIT;
1783					else
1784						copy_flag = M_WAIT;
1785					if (copy_flag == M_WAIT)
1786						SOCKBUF_UNLOCK(&so->so_rcv);
1787					*mp = m_copym(m, 0, len, copy_flag);
1788					if (copy_flag == M_WAIT)
1789						SOCKBUF_LOCK(&so->so_rcv);
					if (*mp == NULL) {
						/*
						 * m_copym() couldn't
						 * allocate an mbuf.  Adjust
						 * uio_resid back (it was
						 * adjusted down by len
						 * bytes, which we didn't end
						 * up "copying" over).
						 */
						uio->uio_resid += len;
						break;
					}
1802				}
1803				m->m_data += len;
1804				m->m_len -= len;
1805				so->so_rcv.sb_cc -= len;
1806			}
1807		}
1808		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1809		if (so->so_oobmark) {
1810			if ((flags & MSG_PEEK) == 0) {
1811				so->so_oobmark -= len;
1812				if (so->so_oobmark == 0) {
1813					so->so_rcv.sb_state |= SBS_RCVATMARK;
1814					break;
1815				}
1816			} else {
1817				offset += len;
1818				if (offset == so->so_oobmark)
1819					break;
1820			}
1821		}
1822		if (flags & MSG_EOR)
1823			break;
1824		/*
		 * If the MSG_WAITALL flag is set (for a non-atomic socket), we
1826		 * must not quit until "uio->uio_resid == 0" or an error
1827		 * termination.  If a signal/timeout occurs, return with a
1828		 * short count but without error.  Keep sockbuf locked
1829		 * against other readers.
1830		 */
1831		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
1832		    !sosendallatonce(so) && nextrecord == NULL) {
1833			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1834			if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE)
1835				break;
1836			/*
1837			 * Notify the protocol that some data has been
1838			 * drained before blocking.
1839			 */
1840			if (pr->pr_flags & PR_WANTRCVD) {
1841				SOCKBUF_UNLOCK(&so->so_rcv);
1842				VNET_SO_ASSERT(so);
1843				(*pr->pr_usrreqs->pru_rcvd)(so, flags);
1844				SOCKBUF_LOCK(&so->so_rcv);
1845			}
1846			SBLASTRECORDCHK(&so->so_rcv);
1847			SBLASTMBUFCHK(&so->so_rcv);
1848			/*
			 * We could have received some data while we were
			 * notifying the protocol.  Skip blocking in this
			 * case.
1851			 */
1852			if (so->so_rcv.sb_mb == NULL) {
1853				error = sbwait(&so->so_rcv);
1854				if (error) {
1855					SOCKBUF_UNLOCK(&so->so_rcv);
1856					goto release;
1857				}
1858			}
1859			m = so->so_rcv.sb_mb;
1860			if (m != NULL)
1861				nextrecord = m->m_nextpkt;
1862		}
1863	}
1864
1865	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1866	if (m != NULL && pr->pr_flags & PR_ATOMIC) {
1867		flags |= MSG_TRUNC;
1868		if ((flags & MSG_PEEK) == 0)
1869			(void) sbdroprecord_locked(&so->so_rcv);
1870	}
1871	if ((flags & MSG_PEEK) == 0) {
1872		if (m == NULL) {
1873			/*
1874			 * First part is an inline SB_EMPTY_FIXUP().  Second
1875			 * part makes sure sb_lastrecord is up-to-date if
1876			 * there is still data in the socket buffer.
1877			 */
1878			so->so_rcv.sb_mb = nextrecord;
1879			if (so->so_rcv.sb_mb == NULL) {
1880				so->so_rcv.sb_mbtail = NULL;
1881				so->so_rcv.sb_lastrecord = NULL;
1882			} else if (nextrecord->m_nextpkt == NULL)
1883				so->so_rcv.sb_lastrecord = nextrecord;
1884		}
1885		SBLASTRECORDCHK(&so->so_rcv);
1886		SBLASTMBUFCHK(&so->so_rcv);
1887		/*
1888		 * If soreceive() is being done from the socket callback,
		 * then we don't need to generate an ACK to the peer to
		 * update the window, since an ACK will be generated on
		 * return to TCP.
1891		 */
1892		if (!(flags & MSG_SOCALLBCK) &&
1893		    (pr->pr_flags & PR_WANTRCVD)) {
1894			SOCKBUF_UNLOCK(&so->so_rcv);
1895			VNET_SO_ASSERT(so);
1896			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
1897			SOCKBUF_LOCK(&so->so_rcv);
1898		}
1899	}
1900	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1901	if (orig_resid == uio->uio_resid && orig_resid &&
1902	    (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
1903		SOCKBUF_UNLOCK(&so->so_rcv);
1904		goto restart;
1905	}
1906	SOCKBUF_UNLOCK(&so->so_rcv);
1907
1908	if (flagsp != NULL)
1909		*flagsp |= flags;
1910release:
1911	sbunlock(&so->so_rcv);
1912	return (error);
1913}
1914
1915/*
1916 * Optimized version of soreceive() for stream (TCP) sockets.
1917 */
1918int
1919soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio,
1920    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1921{
1922	int len = 0, error = 0, flags, oresid;
1923	struct sockbuf *sb;
1924	struct mbuf *m, *n = NULL;
1925
1926	/* We only do stream sockets. */
1927	if (so->so_type != SOCK_STREAM)
1928		return (EINVAL);
1929	if (psa != NULL)
1930		*psa = NULL;
1931	if (controlp != NULL)
1932		return (EINVAL);
1933	if (flagsp != NULL)
1934		flags = *flagsp &~ MSG_EOR;
1935	else
1936		flags = 0;
1937	if (flags & MSG_OOB)
1938		return (soreceive_rcvoob(so, uio, flags));
1939	if (mp0 != NULL)
1940		*mp0 = NULL;
1941
1942	sb = &so->so_rcv;
1943
1944	/* Prevent other readers from entering the socket. */
1945	error = sblock(sb, SBLOCKWAIT(flags));
1946	if (error)
1947		goto out;
1948	SOCKBUF_LOCK(sb);
1949
1950	/* Easy one, no space to copyout anything. */
1951	if (uio->uio_resid == 0) {
1952		error = EINVAL;
1953		goto out;
1954	}
1955	oresid = uio->uio_resid;
1956
1957	/* We will never ever get anything unless we are or were connected. */
1958	if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
1959		error = ENOTCONN;
1960		goto out;
1961	}
1962
1963restart:
1964	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1965
1966	/* Abort if socket has reported problems. */
1967	if (so->so_error) {
1968		if (sb->sb_cc > 0)
1969			goto deliver;
1970		if (oresid > uio->uio_resid)
1971			goto out;
1972		error = so->so_error;
1973		if (!(flags & MSG_PEEK))
1974			so->so_error = 0;
1975		goto out;
1976	}
1977
1978	/* Door is closed.  Deliver what is left, if any. */
1979	if (sb->sb_state & SBS_CANTRCVMORE) {
1980		if (sb->sb_cc > 0)
1981			goto deliver;
1982		else
1983			goto out;
1984	}
1985
1986	/* Socket buffer is empty and we shall not block. */
1987	if (sb->sb_cc == 0 &&
1988	    ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
1989		error = EAGAIN;
1990		goto out;
1991	}
1992
1993	/* Socket buffer got some data that we shall deliver now. */
	if (sb->sb_cc > 0 && !(flags & MSG_WAITALL) &&
	    ((so->so_state & SS_NBIO) ||
	     (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
	     sb->sb_cc >= sb->sb_lowat ||
	     sb->sb_cc >= uio->uio_resid ||
	     sb->sb_cc >= sb->sb_hiwat)) {
2000		goto deliver;
2001	}
2002
2003	/* On MSG_WAITALL we must wait until all data or error arrives. */
2004	if ((flags & MSG_WAITALL) &&
2005	    (sb->sb_cc >= uio->uio_resid || sb->sb_cc >= sb->sb_lowat))
2006		goto deliver;
2007
2008	/*
2009	 * Wait and block until (more) data comes in.
2010	 * NB: Drops the sockbuf lock during wait.
2011	 */
2012	error = sbwait(sb);
2013	if (error)
2014		goto out;
2015	goto restart;
2016
2017deliver:
2018	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2019	KASSERT(sb->sb_cc > 0, ("%s: sockbuf empty", __func__));
2020	KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));
2021
2022	/* Statistics. */
2023	if (uio->uio_td)
2024		uio->uio_td->td_ru.ru_msgrcv++;
2025
2026	/* Fill uio until full or current end of socket buffer is reached. */
2027	len = min(uio->uio_resid, sb->sb_cc);
2028	if (mp0 != NULL) {
2029		/* Dequeue as many mbufs as possible. */
2030		if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
2031			for (*mp0 = m = sb->sb_mb;
2032			     m != NULL && m->m_len <= len;
2033			     m = m->m_next) {
2034				len -= m->m_len;
2035				uio->uio_resid -= m->m_len;
2036				sbfree(sb, m);
2037				n = m;
2038			}
2039			sb->sb_mb = m;
2040			if (sb->sb_mb == NULL)
2041				SB_EMPTY_FIXUP(sb);
2042			n->m_next = NULL;
2043		}
2044		/* Copy the remainder. */
2045		if (len > 0) {
2046			KASSERT(sb->sb_mb != NULL,
2047			    ("%s: len > 0 && sb->sb_mb empty", __func__));
2048
2049			m = m_copym(sb->sb_mb, 0, len, M_DONTWAIT);
2050			if (m == NULL)
2051				len = 0;	/* Don't flush data from sockbuf. */
2052			else
2053				uio->uio_resid -= m->m_len;
2054			if (*mp0 != NULL)
2055				n->m_next = m;
2056			else
2057				*mp0 = m;
2058			if (*mp0 == NULL) {
2059				error = ENOBUFS;
2060				goto out;
2061			}
2062		}
2063	} else {
2064		/* NB: Must unlock socket buffer as uiomove may sleep. */
2065		SOCKBUF_UNLOCK(sb);
2066		error = m_mbuftouio(uio, sb->sb_mb, len);
2067		SOCKBUF_LOCK(sb);
2068		if (error)
2069			goto out;
2070	}
2071	SBLASTRECORDCHK(sb);
2072	SBLASTMBUFCHK(sb);
2073
2074	/*
2075	 * Remove the delivered data from the socket buffer unless we
2076	 * were only peeking.
2077	 */
2078	if (!(flags & MSG_PEEK)) {
2079		if (len > 0)
2080			sbdrop_locked(sb, len);
2081
2082		/* Notify protocol that we drained some data. */
2083		if ((so->so_proto->pr_flags & PR_WANTRCVD) &&
2084		    (((flags & MSG_WAITALL) && uio->uio_resid > 0) ||
2085		     !(flags & MSG_SOCALLBCK))) {
2086			SOCKBUF_UNLOCK(sb);
2087			VNET_SO_ASSERT(so);
2088			(*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags);
2089			SOCKBUF_LOCK(sb);
2090		}
2091	}
2092
2093	/*
2094	 * For MSG_WAITALL we may have to loop again and wait for
2095	 * more data to come in.
2096	 */
2097	if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
2098		goto restart;
2099out:
2100	SOCKBUF_LOCK_ASSERT(sb);
2101	SBLASTRECORDCHK(sb);
2102	SBLASTMBUFCHK(sb);
2103	SOCKBUF_UNLOCK(sb);
2104	sbunlock(sb);
2105	return (error);
2106}
2107
2108/*
2109 * Optimized version of soreceive() for simple datagram cases from userspace.
2110 * Unlike in the stream case, we're able to drop a datagram if copyout()
2111 * fails, and because we handle datagrams atomically, we don't need to use a
2112 * sleep lock to prevent I/O interlacing.
2113 */
2114int
2115soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio,
2116    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2117{
2118	struct mbuf *m, *m2;
2119	int flags, len, error;
2120	struct protosw *pr = so->so_proto;
2121	struct mbuf *nextrecord;
2122
2123	if (psa != NULL)
2124		*psa = NULL;
2125	if (controlp != NULL)
2126		*controlp = NULL;
2127	if (flagsp != NULL)
2128		flags = *flagsp &~ MSG_EOR;
2129	else
2130		flags = 0;
2131
2132	/*
2133	 * For any complicated cases, fall back to the full
2134	 * soreceive_generic().
2135	 */
2136	if (mp0 != NULL || (flags & MSG_PEEK) || (flags & MSG_OOB))
2137		return (soreceive_generic(so, psa, uio, mp0, controlp,
2138		    flagsp));
2139
2140	/*
2141	 * Enforce restrictions on use.
2142	 */
2143	KASSERT((pr->pr_flags & PR_WANTRCVD) == 0,
2144	    ("soreceive_dgram: wantrcvd"));
2145	KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic"));
2146	KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0,
2147	    ("soreceive_dgram: SBS_RCVATMARK"));
2148	KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0,
2149	    ("soreceive_dgram: P_CONNREQUIRED"));
2150
2151	/*
2152	 * Loop blocking while waiting for a datagram.
2153	 */
2154	SOCKBUF_LOCK(&so->so_rcv);
2155	while ((m = so->so_rcv.sb_mb) == NULL) {
2156		KASSERT(so->so_rcv.sb_cc == 0,
2157		    ("soreceive_dgram: sb_mb NULL but sb_cc %u",
2158		    so->so_rcv.sb_cc));
2159		if (so->so_error) {
2160			error = so->so_error;
2161			so->so_error = 0;
2162			SOCKBUF_UNLOCK(&so->so_rcv);
2163			return (error);
2164		}
2165		if (so->so_rcv.sb_state & SBS_CANTRCVMORE ||
2166		    uio->uio_resid == 0) {
2167			SOCKBUF_UNLOCK(&so->so_rcv);
2168			return (0);
2169		}
2170		if ((so->so_state & SS_NBIO) ||
2171		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
2172			SOCKBUF_UNLOCK(&so->so_rcv);
2173			return (EWOULDBLOCK);
2174		}
2175		SBLASTRECORDCHK(&so->so_rcv);
2176		SBLASTMBUFCHK(&so->so_rcv);
2177		error = sbwait(&so->so_rcv);
2178		if (error) {
2179			SOCKBUF_UNLOCK(&so->so_rcv);
2180			return (error);
2181		}
2182	}
2183	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2184
2185	if (uio->uio_td)
2186		uio->uio_td->td_ru.ru_msgrcv++;
2187	SBLASTRECORDCHK(&so->so_rcv);
2188	SBLASTMBUFCHK(&so->so_rcv);
2189	nextrecord = m->m_nextpkt;
2190	if (nextrecord == NULL) {
2191		KASSERT(so->so_rcv.sb_lastrecord == m,
2192		    ("soreceive_dgram: lastrecord != m"));
2193	}
2194
2195	KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord,
2196	    ("soreceive_dgram: m_nextpkt != nextrecord"));
2197
2198	/*
2199	 * Pull 'm' and its chain off the front of the packet queue.
2200	 */
2201	so->so_rcv.sb_mb = NULL;
2202	sockbuf_pushsync(&so->so_rcv, nextrecord);
2203
2204	/*
2205	 * Walk 'm's chain and free that many bytes from the socket buffer.
2206	 */
2207	for (m2 = m; m2 != NULL; m2 = m2->m_next)
2208		sbfree(&so->so_rcv, m2);
2209
2210	/*
2211	 * Do a few last checks before we let go of the lock.
2212	 */
2213	SBLASTRECORDCHK(&so->so_rcv);
2214	SBLASTMBUFCHK(&so->so_rcv);
2215	SOCKBUF_UNLOCK(&so->so_rcv);
2216
2217	if (pr->pr_flags & PR_ADDR) {
2218		KASSERT(m->m_type == MT_SONAME,
2219		    ("m->m_type == %d", m->m_type));
2220		if (psa != NULL)
2221			*psa = sodupsockaddr(mtod(m, struct sockaddr *),
2222			    M_NOWAIT);
2223		m = m_free(m);
2224	}
2225	if (m == NULL) {
2226		/* XXXRW: Can this happen? */
2227		return (0);
2228	}
2229
2230	/*
2231	 * Packet to copyout() is now in 'm' and it is disconnected from the
2232	 * queue.
2233	 *
2234	 * Process one or more MT_CONTROL mbufs present before any data mbufs
2235	 * in the first mbuf chain on the socket buffer.  We call into the
2236	 * protocol to perform externalization (or freeing if controlp ==
2237	 * NULL).
2238	 */
2239	if (m->m_type == MT_CONTROL) {
2240		struct mbuf *cm = NULL, *cmn;
2241		struct mbuf **cme = &cm;
2242
2243		do {
2244			m2 = m->m_next;
2245			m->m_next = NULL;
2246			*cme = m;
2247			cme = &(*cme)->m_next;
2248			m = m2;
2249		} while (m != NULL && m->m_type == MT_CONTROL);
2250		while (cm != NULL) {
2251			cmn = cm->m_next;
2252			cm->m_next = NULL;
2253			if (pr->pr_domain->dom_externalize != NULL) {
2254				error = (*pr->pr_domain->dom_externalize)
2255				    (cm, controlp);
2256			} else if (controlp != NULL)
2257				*controlp = cm;
2258			else
2259				m_freem(cm);
2260			if (controlp != NULL) {
2261				while (*controlp != NULL)
2262					controlp = &(*controlp)->m_next;
2263			}
2264			cm = cmn;
2265		}
2266	}
2267	KASSERT(m->m_type == MT_DATA, ("soreceive_dgram: !data"));
2268
2269	while (m != NULL && uio->uio_resid > 0) {
2270		len = uio->uio_resid;
2271		if (len > m->m_len)
2272			len = m->m_len;
2273		error = uiomove(mtod(m, char *), (int)len, uio);
2274		if (error) {
2275			m_freem(m);
2276			return (error);
2277		}
2278		if (len == m->m_len)
2279			m = m_free(m);
2280		else {
2281			m->m_data += len;
2282			m->m_len -= len;
2283		}
2284	}
2285	if (m != NULL)
2286		flags |= MSG_TRUNC;
2287	m_freem(m);
2288	if (flagsp != NULL)
2289		*flagsp |= flags;
2290	return (0);
2291}
2292
2293int
2294soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
2295    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2296{
2297	int error;
2298
2299	CURVNET_SET(so->so_vnet);
2300	error = (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio, mp0,
2301	    controlp, flagsp));
2302	CURVNET_RESTORE();
2303	return (error);
2304}
2305
2306int
2307soshutdown(struct socket *so, int how)
2308{
2309	struct protosw *pr = so->so_proto;
2310	int error;
2311
2312	if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
2313		return (EINVAL);
2314
2315	CURVNET_SET(so->so_vnet);
	if (pr->pr_usrreqs->pru_flush != NULL)
		(*pr->pr_usrreqs->pru_flush)(so, how);
2319	if (how != SHUT_WR)
2320		sorflush(so);
2321	if (how != SHUT_RD) {
2322		error = (*pr->pr_usrreqs->pru_shutdown)(so);
2323		CURVNET_RESTORE();
2324		return (error);
2325	}
2326	CURVNET_RESTORE();
2327	return (0);
2328}
2329
2330void
2331sorflush(struct socket *so)
2332{
2333	struct sockbuf *sb = &so->so_rcv;
2334	struct protosw *pr = so->so_proto;
2335	struct sockbuf asb;
2336
2337	VNET_SO_ASSERT(so);
2338
2339	/*
2340	 * In order to avoid calling dom_dispose with the socket buffer mutex
2341	 * held, and in order to generally avoid holding the lock for a long
2342	 * time, we make a copy of the socket buffer and clear the original
2343	 * (except locks, state).  The new socket buffer copy won't have
2344	 * initialized locks so we can only call routines that won't use or
2345	 * assert those locks.
2346	 *
2347	 * Dislodge threads currently blocked in receive and wait to acquire
2348	 * a lock against other simultaneous readers before clearing the
2349	 * socket buffer.  Don't let our acquire be interrupted by a signal
	 * despite any existing socket disposition on interruptible waiting.
2351	 */
2352	socantrcvmore(so);
2353	(void) sblock(sb, SBL_WAIT | SBL_NOINTR);
2354
2355	/*
2356	 * Invalidate/clear most of the sockbuf structure, but leave selinfo
2357	 * and mutex data unchanged.
2358	 */
2359	SOCKBUF_LOCK(sb);
2360	bzero(&asb, offsetof(struct sockbuf, sb_startzero));
2361	bcopy(&sb->sb_startzero, &asb.sb_startzero,
2362	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
2363	bzero(&sb->sb_startzero,
2364	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
2365	SOCKBUF_UNLOCK(sb);
2366	sbunlock(sb);
2367
2368	/*
2369	 * Dispose of special rights and flush the socket buffer.  Don't call
2370	 * any unsafe routines (that rely on locks being initialized) on asb.
2371	 */
2372	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
2373		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
2374	sbrelease_internal(&asb, so);
2375}
2376
2377/*
2378 * Perhaps this routine, and sooptcopyout(), below, ought to come in an
2379 * additional variant to handle the case where the option value needs to be
2380 * some kind of integer, but not a specific size.  In addition to their use
2381 * here, these functions are also called by the protocol-level pr_ctloutput()
2382 * routines.
2383 */
2384int
2385sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
2386{
2387	size_t	valsize;
2388
2389	/*
2390	 * If the user gives us more than we wanted, we ignore it, but if we
2391	 * don't get the minimum length the caller wants, we return EINVAL.
2392	 * On success, sopt->sopt_valsize is set to however much we actually
2393	 * retrieved.
2394	 */
2395	if ((valsize = sopt->sopt_valsize) < minlen)
2396		return EINVAL;
2397	if (valsize > len)
2398		sopt->sopt_valsize = valsize = len;
2399
2400	if (sopt->sopt_td != NULL)
2401		return (copyin(sopt->sopt_val, buf, valsize));
2402
2403	bcopy(sopt->sopt_val, buf, valsize);
2404	return (0);
2405}
2406
2407/*
2408 * Kernel version of setsockopt(2).
2409 *
2410 * XXX: optlen is size_t, not socklen_t
2411 */
2412int
2413so_setsockopt(struct socket *so, int level, int optname, void *optval,
2414    size_t optlen)
2415{
2416	struct sockopt sopt;
2417
2418	sopt.sopt_level = level;
2419	sopt.sopt_name = optname;
2420	sopt.sopt_dir = SOPT_SET;
2421	sopt.sopt_val = optval;
2422	sopt.sopt_valsize = optlen;
2423	sopt.sopt_td = NULL;
2424	return (sosetopt(so, &sopt));
2425}
2426
2427int
2428sosetopt(struct socket *so, struct sockopt *sopt)
2429{
2430	int	error, optval;
2431	struct	linger l;
2432	struct	timeval tv;
2433	u_long  val;
2434	uint32_t val32;
2435#ifdef MAC
2436	struct mac extmac;
2437#endif
2438
2439	CURVNET_SET(so->so_vnet);
2440	error = 0;
2441	if (sopt->sopt_level != SOL_SOCKET) {
2442		if (so->so_proto && so->so_proto->pr_ctloutput) {
2443			error = (*so->so_proto->pr_ctloutput)(so, sopt);
2444			CURVNET_RESTORE();
2445			return (error);
2446		}
2447		error = ENOPROTOOPT;
2448	} else {
2449		switch (sopt->sopt_name) {
2450#ifdef INET
2451		case SO_ACCEPTFILTER:
2452			error = do_setopt_accept_filter(so, sopt);
2453			if (error)
2454				goto bad;
2455			break;
2456#endif
2457		case SO_LINGER:
2458			error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
2459			if (error)
2460				goto bad;
2461
2462			SOCK_LOCK(so);
2463			so->so_linger = l.l_linger;
2464			if (l.l_onoff)
2465				so->so_options |= SO_LINGER;
2466			else
2467				so->so_options &= ~SO_LINGER;
2468			SOCK_UNLOCK(so);
2469			break;
2470
2471		case SO_DEBUG:
2472		case SO_KEEPALIVE:
2473		case SO_DONTROUTE:
2474		case SO_USELOOPBACK:
2475		case SO_BROADCAST:
2476		case SO_REUSEADDR:
2477		case SO_REUSEPORT:
2478		case SO_OOBINLINE:
2479		case SO_TIMESTAMP:
2480		case SO_BINTIME:
2481		case SO_NOSIGPIPE:
2482		case SO_NO_DDP:
2483		case SO_NO_OFFLOAD:
2484			error = sooptcopyin(sopt, &optval, sizeof optval,
2485					    sizeof optval);
2486			if (error)
2487				goto bad;
2488			SOCK_LOCK(so);
2489			if (optval)
2490				so->so_options |= sopt->sopt_name;
2491			else
2492				so->so_options &= ~sopt->sopt_name;
2493			SOCK_UNLOCK(so);
2494			break;
2495
		case SO_SETFIB:
			error = sooptcopyin(sopt, &optval, sizeof optval,
					    sizeof optval);
			if (error)
				goto bad;
			if (optval < 0 || optval >= rt_numfibs) {
				error = EINVAL;
				goto bad;
			}
2503			if (so->so_proto != NULL &&
2504			   ((so->so_proto->pr_domain->dom_family == PF_INET) ||
2505			   (so->so_proto->pr_domain->dom_family == PF_ROUTE))) {
2506				so->so_fibnum = optval;
2507				/* Note: ignore error */
2508				if (so->so_proto->pr_ctloutput)
2509					(*so->so_proto->pr_ctloutput)(so, sopt);
2510			} else {
2511				so->so_fibnum = 0;
2512			}
2513			break;
2514
2515		case SO_USER_COOKIE:
2516			error = sooptcopyin(sopt, &val32, sizeof val32,
2517					    sizeof val32);
2518			if (error)
2519				goto bad;
2520			so->so_user_cookie = val32;
2521			break;
2522
2523		case SO_SNDBUF:
2524		case SO_RCVBUF:
2525		case SO_SNDLOWAT:
2526		case SO_RCVLOWAT:
2527			error = sooptcopyin(sopt, &optval, sizeof optval,
2528					    sizeof optval);
2529			if (error)
2530				goto bad;
2531
2532			/*
2533			 * Values < 1 make no sense for any of these options,
2534			 * so disallow them.
2535			 */
2536			if (optval < 1) {
2537				error = EINVAL;
2538				goto bad;
2539			}
2540
2541			switch (sopt->sopt_name) {
2542			case SO_SNDBUF:
2543			case SO_RCVBUF:
2544				if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
2545				    &so->so_snd : &so->so_rcv, (u_long)optval,
2546				    so, curthread) == 0) {
2547					error = ENOBUFS;
2548					goto bad;
2549				}
2550				(sopt->sopt_name == SO_SNDBUF ? &so->so_snd :
2551				    &so->so_rcv)->sb_flags &= ~SB_AUTOSIZE;
2552				break;
2553
2554			/*
2555			 * Make sure the low-water is never greater than the
2556			 * high-water.
2557			 */
2558			case SO_SNDLOWAT:
2559				SOCKBUF_LOCK(&so->so_snd);
2560				so->so_snd.sb_lowat =
2561				    (optval > so->so_snd.sb_hiwat) ?
2562				    so->so_snd.sb_hiwat : optval;
2563				SOCKBUF_UNLOCK(&so->so_snd);
2564				break;
2565			case SO_RCVLOWAT:
2566				SOCKBUF_LOCK(&so->so_rcv);
2567				so->so_rcv.sb_lowat =
2568				    (optval > so->so_rcv.sb_hiwat) ?
2569				    so->so_rcv.sb_hiwat : optval;
2570				SOCKBUF_UNLOCK(&so->so_rcv);
2571				break;
2572			}
2573			break;
2574
2575		case SO_SNDTIMEO:
2576		case SO_RCVTIMEO:
2577#ifdef COMPAT_FREEBSD32
2578			if (SV_CURPROC_FLAG(SV_ILP32)) {
2579				struct timeval32 tv32;
2580
2581				error = sooptcopyin(sopt, &tv32, sizeof tv32,
2582				    sizeof tv32);
2583				CP(tv32, tv, tv_sec);
2584				CP(tv32, tv, tv_usec);
2585			} else
2586#endif
2587				error = sooptcopyin(sopt, &tv, sizeof tv,
2588				    sizeof tv);
2589			if (error)
2590				goto bad;
2591
2592			/* assert(hz > 0); */
2593			if (tv.tv_sec < 0 || tv.tv_sec > INT_MAX / hz ||
2594			    tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
2595				error = EDOM;
2596				goto bad;
2597			}
2598			/* assert(tick > 0); */
2599			/* assert(ULONG_MAX - INT_MAX >= 1000000); */
2600			val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick;
2601			if (val > INT_MAX) {
2602				error = EDOM;
2603				goto bad;
2604			}
2605			if (val == 0 && tv.tv_usec != 0)
2606				val = 1;
2607
2608			switch (sopt->sopt_name) {
2609			case SO_SNDTIMEO:
2610				so->so_snd.sb_timeo = val;
2611				break;
2612			case SO_RCVTIMEO:
2613				so->so_rcv.sb_timeo = val;
2614				break;
2615			}
2616			break;
2617
2618		case SO_LABEL:
2619#ifdef MAC
2620			error = sooptcopyin(sopt, &extmac, sizeof extmac,
2621			    sizeof extmac);
2622			if (error)
2623				goto bad;
2624			error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
2625			    so, &extmac);
2626#else
2627			error = EOPNOTSUPP;
2628#endif
2629			break;
2630
2631		default:
2632			error = ENOPROTOOPT;
2633			break;
2634		}
2635		if (error == 0 && so->so_proto != NULL &&
2636		    so->so_proto->pr_ctloutput != NULL) {
			(void)(*so->so_proto->pr_ctloutput)(so, sopt);
2639		}
2640	}
2641bad:
2642	CURVNET_RESTORE();
2643	return (error);
2644}
2645
2646/*
2647 * Helper routine for getsockopt.
2648 */
2649int
2650sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
2651{
2652	int	error;
2653	size_t	valsize;
2654
2655	error = 0;
2656
2657	/*
2658	 * Documented get behavior is that we always return a value, possibly
2659	 * truncated to fit in the user's buffer.  Traditional behavior is
2660	 * that we always tell the user precisely how much we copied, rather
2661	 * than something useful like the total amount we had available for
2662	 * her.  Note that this interface is not idempotent; the entire
	 * answer must be generated ahead of time.
2664	 */
2665	valsize = min(len, sopt->sopt_valsize);
2666	sopt->sopt_valsize = valsize;
2667	if (sopt->sopt_val != NULL) {
2668		if (sopt->sopt_td != NULL)
2669			error = copyout(buf, sopt->sopt_val, valsize);
2670		else
2671			bcopy(buf, sopt->sopt_val, valsize);
2672	}
2673	return (error);
2674}
2675
2676int
2677sogetopt(struct socket *so, struct sockopt *sopt)
2678{
2679	int	error, optval;
2680	struct	linger l;
2681	struct	timeval tv;
2682#ifdef MAC
2683	struct mac extmac;
2684#endif
2685
2686	CURVNET_SET(so->so_vnet);
2687	error = 0;
2688	if (sopt->sopt_level != SOL_SOCKET) {
2689		if (so->so_proto && so->so_proto->pr_ctloutput)
2690			error = (*so->so_proto->pr_ctloutput)(so, sopt);
2691		else
2692			error = ENOPROTOOPT;
2693		CURVNET_RESTORE();
2694		return (error);
2695	} else {
2696		switch (sopt->sopt_name) {
2697#ifdef INET
2698		case SO_ACCEPTFILTER:
2699			error = do_getopt_accept_filter(so, sopt);
2700			break;
2701#endif
2702		case SO_LINGER:
2703			SOCK_LOCK(so);
2704			l.l_onoff = so->so_options & SO_LINGER;
2705			l.l_linger = so->so_linger;
2706			SOCK_UNLOCK(so);
2707			error = sooptcopyout(sopt, &l, sizeof l);
2708			break;
2709
2710		case SO_USELOOPBACK:
2711		case SO_DONTROUTE:
2712		case SO_DEBUG:
2713		case SO_KEEPALIVE:
2714		case SO_REUSEADDR:
2715		case SO_REUSEPORT:
2716		case SO_BROADCAST:
2717		case SO_OOBINLINE:
2718		case SO_ACCEPTCONN:
2719		case SO_TIMESTAMP:
2720		case SO_BINTIME:
2721		case SO_NOSIGPIPE:
2722			optval = so->so_options & sopt->sopt_name;
2723integer:
2724			error = sooptcopyout(sopt, &optval, sizeof optval);
2725			break;
2726
2727		case SO_TYPE:
2728			optval = so->so_type;
2729			goto integer;
2730
2731		case SO_ERROR:
2732			SOCK_LOCK(so);
2733			optval = so->so_error;
2734			so->so_error = 0;
2735			SOCK_UNLOCK(so);
2736			goto integer;
2737
2738		case SO_SNDBUF:
2739			optval = so->so_snd.sb_hiwat;
2740			goto integer;
2741
2742		case SO_RCVBUF:
2743			optval = so->so_rcv.sb_hiwat;
2744			goto integer;
2745
2746		case SO_SNDLOWAT:
2747			optval = so->so_snd.sb_lowat;
2748			goto integer;
2749
2750		case SO_RCVLOWAT:
2751			optval = so->so_rcv.sb_lowat;
2752			goto integer;
2753
2754		case SO_SNDTIMEO:
2755		case SO_RCVTIMEO:
2756			optval = (sopt->sopt_name == SO_SNDTIMEO ?
2757				  so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
2758
2759			tv.tv_sec = optval / hz;
2760			tv.tv_usec = (optval % hz) * tick;
2761#ifdef COMPAT_FREEBSD32
2762			if (SV_CURPROC_FLAG(SV_ILP32)) {
2763				struct timeval32 tv32;
2764
2765				CP(tv, tv32, tv_sec);
2766				CP(tv, tv32, tv_usec);
2767				error = sooptcopyout(sopt, &tv32, sizeof tv32);
2768			} else
2769#endif
2770				error = sooptcopyout(sopt, &tv, sizeof tv);
2771			break;
2772
2773		case SO_LABEL:
2774#ifdef MAC
2775			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2776			    sizeof(extmac));
2777			if (error)
2778				goto bad;
2779			error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
2780			    so, &extmac);
2781			if (error)
2782				goto bad;
2783			error = sooptcopyout(sopt, &extmac, sizeof extmac);
2784#else
2785			error = EOPNOTSUPP;
2786#endif
2787			break;
2788
2789		case SO_PEERLABEL:
2790#ifdef MAC
2791			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2792			    sizeof(extmac));
2793			if (error)
2794				goto bad;
2795			error = mac_getsockopt_peerlabel(
2796			    sopt->sopt_td->td_ucred, so, &extmac);
2797			if (error)
2798				goto bad;
2799			error = sooptcopyout(sopt, &extmac, sizeof extmac);
2800#else
2801			error = EOPNOTSUPP;
2802#endif
2803			break;
2804
2805		case SO_LISTENQLIMIT:
2806			optval = so->so_qlimit;
2807			goto integer;
2808
2809		case SO_LISTENQLEN:
2810			optval = so->so_qlen;
2811			goto integer;
2812
2813		case SO_LISTENINCQLEN:
2814			optval = so->so_incqlen;
2815			goto integer;
2816
2817		default:
2818			error = ENOPROTOOPT;
2819			break;
2820		}
2821	}
2822#ifdef MAC
2823bad:
2824#endif
2825	CURVNET_RESTORE();
2826	return (error);
2827}
2828
/* XXX: prepare mbuf for (__FreeBSD__ < 3) routines. */
2830int
2831soopt_getm(struct sockopt *sopt, struct mbuf **mp)
2832{
2833	struct mbuf *m, *m_prev;
2834	int sopt_size = sopt->sopt_valsize;
2835
2836	MGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT, MT_DATA);
2837	if (m == NULL)
2838		return ENOBUFS;
2839	if (sopt_size > MLEN) {
2840		MCLGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT);
2841		if ((m->m_flags & M_EXT) == 0) {
2842			m_free(m);
2843			return ENOBUFS;
2844		}
2845		m->m_len = min(MCLBYTES, sopt_size);
2846	} else {
2847		m->m_len = min(MLEN, sopt_size);
2848	}
2849	sopt_size -= m->m_len;
2850	*mp = m;
2851	m_prev = m;
2852
2853	while (sopt_size) {
2854		MGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT, MT_DATA);
2855		if (m == NULL) {
2856			m_freem(*mp);
2857			return ENOBUFS;
2858		}
2859		if (sopt_size > MLEN) {
2860			MCLGET(m, sopt->sopt_td != NULL ? M_WAIT :
2861			    M_DONTWAIT);
2862			if ((m->m_flags & M_EXT) == 0) {
2863				m_freem(m);
2864				m_freem(*mp);
2865				return ENOBUFS;
2866			}
2867			m->m_len = min(MCLBYTES, sopt_size);
2868		} else {
2869			m->m_len = min(MLEN, sopt_size);
2870		}
2871		sopt_size -= m->m_len;
2872		m_prev->m_next = m;
2873		m_prev = m;
2874	}
2875	return (0);
2876}
2877
/* XXX: copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
2879int
2880soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
2881{
2882	struct mbuf *m0 = m;
2883
2884	if (sopt->sopt_val == NULL)
2885		return (0);
2886	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2887		if (sopt->sopt_td != NULL) {
2888			int error;
2889
2890			error = copyin(sopt->sopt_val, mtod(m, char *),
2891				       m->m_len);
2892			if (error != 0) {
2893				m_freem(m0);
2894				return(error);
2895			}
2896		} else
2897			bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
2898		sopt->sopt_valsize -= m->m_len;
2899		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2900		m = m->m_next;
2901	}
	if (m != NULL) /* soopt_getm() should have allocated enough mbufs */
2903		panic("ip6_sooptmcopyin");
2904	return (0);
2905}
2906
/* XXX: copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
2908int
2909soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
2910{
2911	struct mbuf *m0 = m;
2912	size_t valsize = 0;
2913
2914	if (sopt->sopt_val == NULL)
2915		return (0);
2916	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2917		if (sopt->sopt_td != NULL) {
2918			int error;
2919
2920			error = copyout(mtod(m, char *), sopt->sopt_val,
2921				       m->m_len);
2922			if (error != 0) {
2923				m_freem(m0);
2924				return(error);
2925			}
2926		} else
2927			bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
		sopt->sopt_valsize -= m->m_len;
		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
		valsize += m->m_len;
		m = m->m_next;
2932	}
2933	if (m != NULL) {
		/* the caller should have supplied a large enough buffer */
2935		m_freem(m0);
2936		return(EINVAL);
2937	}
2938	sopt->sopt_valsize = valsize;
2939	return (0);
2940}
2941
2942/*
2943 * sohasoutofband(): protocol notifies socket layer of the arrival of new
2944 * out-of-band data, which will then notify socket consumers.
2945 */
2946void
2947sohasoutofband(struct socket *so)
2948{
2949
2950	if (so->so_sigio != NULL)
2951		pgsigio(&so->so_sigio, SIGURG, 0);
2952	selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
2953}
2954
2955int
2956sopoll(struct socket *so, int events, struct ucred *active_cred,
2957    struct thread *td)
2958{
2959
2960	/*
2961	 * We do not need to set or assert curvnet as long as everyone uses
2962	 * sopoll_generic().
2963	 */
2964	return (so->so_proto->pr_usrreqs->pru_sopoll(so, events, active_cred,
2965	    td));
2966}
2967
2968int
2969sopoll_generic(struct socket *so, int events, struct ucred *active_cred,
2970    struct thread *td)
2971{
2972	int revents = 0;
2973
2974	SOCKBUF_LOCK(&so->so_snd);
2975	SOCKBUF_LOCK(&so->so_rcv);
2976	if (events & (POLLIN | POLLRDNORM))
2977		if (soreadabledata(so))
2978			revents |= events & (POLLIN | POLLRDNORM);
2979
2980	if (events & (POLLOUT | POLLWRNORM))
2981		if (sowriteable(so))
2982			revents |= events & (POLLOUT | POLLWRNORM);
2983
2984	if (events & (POLLPRI | POLLRDBAND))
2985		if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK))
2986			revents |= events & (POLLPRI | POLLRDBAND);
2987
2988	if ((events & POLLINIGNEOF) == 0) {
2989		if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
2990			revents |= events & (POLLIN | POLLRDNORM);
2991			if (so->so_snd.sb_state & SBS_CANTSENDMORE)
2992				revents |= POLLHUP;
2993		}
2994	}
2995
2996	if (revents == 0) {
2997		if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
2998			selrecord(td, &so->so_rcv.sb_sel);
2999			so->so_rcv.sb_flags |= SB_SEL;
3000		}
3001
3002		if (events & (POLLOUT | POLLWRNORM)) {
3003			selrecord(td, &so->so_snd.sb_sel);
3004			so->so_snd.sb_flags |= SB_SEL;
3005		}
3006	}
3007
3008	SOCKBUF_UNLOCK(&so->so_rcv);
3009	SOCKBUF_UNLOCK(&so->so_snd);
3010	return (revents);
3011}
3012
3013int
3014soo_kqfilter(struct file *fp, struct knote *kn)
3015{
3016	struct socket *so = kn->kn_fp->f_data;
3017	struct sockbuf *sb;
3018
3019	switch (kn->kn_filter) {
3020	case EVFILT_READ:
3021		if (so->so_options & SO_ACCEPTCONN)
3022			kn->kn_fop = &solisten_filtops;
3023		else
3024			kn->kn_fop = &soread_filtops;
3025		sb = &so->so_rcv;
3026		break;
3027	case EVFILT_WRITE:
3028		kn->kn_fop = &sowrite_filtops;
3029		sb = &so->so_snd;
3030		break;
3031	default:
3032		return (EINVAL);
3033	}
3034
3035	SOCKBUF_LOCK(sb);
3036	knlist_add(&sb->sb_sel.si_note, kn, 1);
3037	sb->sb_flags |= SB_KNOTE;
3038	SOCKBUF_UNLOCK(sb);
3039	return (0);
3040}
3041
3042/*
3043 * Some routines that return EOPNOTSUPP for entry points that are not
3044 * supported by a protocol.  Fill in as needed.
3045 */
3046int
3047pru_accept_notsupp(struct socket *so, struct sockaddr **nam)
3048{
3049
3050	return EOPNOTSUPP;
3051}
3052
3053int
3054pru_attach_notsupp(struct socket *so, int proto, struct thread *td)
3055{
3056
3057	return EOPNOTSUPP;
3058}
3059
3060int
3061pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
3062{
3063
3064	return EOPNOTSUPP;
3065}
3066
3067int
3068pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
3069{
3070
3071	return EOPNOTSUPP;
3072}
3073
3074int
3075pru_connect2_notsupp(struct socket *so1, struct socket *so2)
3076{
3077
3078	return EOPNOTSUPP;
3079}
3080
3081int
3082pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data,
3083    struct ifnet *ifp, struct thread *td)
3084{
3085
3086	return EOPNOTSUPP;
3087}
3088
3089int
3090pru_disconnect_notsupp(struct socket *so)
3091{
3092
3093	return EOPNOTSUPP;
3094}
3095
3096int
3097pru_listen_notsupp(struct socket *so, int backlog, struct thread *td)
3098{
3099
3100	return EOPNOTSUPP;
3101}
3102
3103int
3104pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam)
3105{
3106
3107	return EOPNOTSUPP;
3108}
3109
3110int
3111pru_rcvd_notsupp(struct socket *so, int flags)
3112{
3113
3114	return EOPNOTSUPP;
3115}
3116
3117int
3118pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags)
3119{
3120
3121	return EOPNOTSUPP;
3122}
3123
3124int
3125pru_send_notsupp(struct socket *so, int flags, struct mbuf *m,
3126    struct sockaddr *addr, struct mbuf *control, struct thread *td)
3127{
3128
3129	return EOPNOTSUPP;
3130}
3131
3132/*
3133 * This isn't really a ``null'' operation, but it's the default one and
3134 * doesn't do anything destructive.
3135 */
3136int
3137pru_sense_null(struct socket *so, struct stat *sb)
3138{
3139
3140	sb->st_blksize = so->so_snd.sb_hiwat;
3141	return 0;
3142}
3143
3144int
3145pru_shutdown_notsupp(struct socket *so)
3146{
3147
3148	return EOPNOTSUPP;
3149}
3150
3151int
3152pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam)
3153{
3154
3155	return EOPNOTSUPP;
3156}
3157
3158int
3159pru_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio,
3160    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
3161{
3162
3163	return EOPNOTSUPP;
3164}
3165
3166int
3167pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr,
3168    struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
3169{
3170
3171	return EOPNOTSUPP;
3172}
3173
3174int
3175pru_sopoll_notsupp(struct socket *so, int events, struct ucred *cred,
3176    struct thread *td)
3177{
3178
3179	return EOPNOTSUPP;
3180}
3181
3182static void
3183filt_sordetach(struct knote *kn)
3184{
3185	struct socket *so = kn->kn_fp->f_data;
3186
3187	SOCKBUF_LOCK(&so->so_rcv);
3188	knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1);
3189	if (knlist_empty(&so->so_rcv.sb_sel.si_note))
3190		so->so_rcv.sb_flags &= ~SB_KNOTE;
3191	SOCKBUF_UNLOCK(&so->so_rcv);
3192}
3193
3194/*ARGSUSED*/
3195static int
3196filt_soread(struct knote *kn, long hint)
3197{
3198	struct socket *so;
3199
3200	so = kn->kn_fp->f_data;
3201	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
3202
3203	kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
3204	if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
3205		kn->kn_flags |= EV_EOF;
3206		kn->kn_fflags = so->so_error;
3207		return (1);
3208	} else if (so->so_error)	/* temporary udp error */
3209		return (1);
3210	else if (kn->kn_sfflags & NOTE_LOWAT)
3211		return (kn->kn_data >= kn->kn_sdata);
3212	else
3213		return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat);
3214}
3215
3216static void
3217filt_sowdetach(struct knote *kn)
3218{
3219	struct socket *so = kn->kn_fp->f_data;
3220
3221	SOCKBUF_LOCK(&so->so_snd);
3222	knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1);
3223	if (knlist_empty(&so->so_snd.sb_sel.si_note))
3224		so->so_snd.sb_flags &= ~SB_KNOTE;
3225	SOCKBUF_UNLOCK(&so->so_snd);
3226}
3227
3228/*ARGSUSED*/
3229static int
3230filt_sowrite(struct knote *kn, long hint)
3231{
3232	struct socket *so;
3233
3234	so = kn->kn_fp->f_data;
3235	SOCKBUF_LOCK_ASSERT(&so->so_snd);
3236	kn->kn_data = sbspace(&so->so_snd);
3237	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
3238		kn->kn_flags |= EV_EOF;
3239		kn->kn_fflags = so->so_error;
3240		return (1);
3241	} else if (so->so_error)	/* temporary udp error */
3242		return (1);
3243	else if (((so->so_state & SS_ISCONNECTED) == 0) &&
3244	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
3245		return (0);
3246	else if (kn->kn_sfflags & NOTE_LOWAT)
3247		return (kn->kn_data >= kn->kn_sdata);
3248	else
3249		return (kn->kn_data >= so->so_snd.sb_lowat);
3250}
3251
3252/*ARGSUSED*/
3253static int
3254filt_solisten(struct knote *kn, long hint)
3255{
3256	struct socket *so = kn->kn_fp->f_data;
3257
3258	kn->kn_data = so->so_qlen;
	return (!TAILQ_EMPTY(&so->so_comp));
3260}
3261
3262int
3263socheckuid(struct socket *so, uid_t uid)
3264{
3265
3266	if (so == NULL)
3267		return (EPERM);
3268	if (so->so_cred->cr_uid != uid)
3269		return (EPERM);
3270	return (0);
3271}
3272
3273static int
3274sysctl_somaxconn(SYSCTL_HANDLER_ARGS)
3275{
3276	int error;
3277	int val;
3278
3279	val = somaxconn;
3280	error = sysctl_handle_int(oidp, &val, 0, req);
	if (error || !req->newptr)
3282		return (error);
3283
3284	if (val < 1 || val > USHRT_MAX)
3285		return (EINVAL);
3286
3287	somaxconn = val;
3288	return (0);
3289}
3290
3291/*
3292 * These functions are used by protocols to notify the socket layer (and its
3293 * consumers) of state changes in the sockets driven by protocol-side events.
3294 */
3295
3296/*
3297 * Procedures to manipulate state flags of socket and do appropriate wakeups.
3298 *
3299 * Normal sequence from the active (originating) side is that
3300 * soisconnecting() is called during processing of connect() call, resulting
3301 * in an eventual call to soisconnected() if/when the connection is
3302 * established.  When the connection is torn down soisdisconnecting() is
3303 * called during processing of disconnect() call, and soisdisconnected() is
3304 * called when the connection to the peer is totally severed.  The semantics
3305 * of these routines are such that connectionless protocols can call
3306 * soisconnected() and soisdisconnected() only, bypassing the in-progress
3307 * calls when setting up a ``connection'' takes no time.
3308 *
3309 * From the passive side, a socket is created with two queues of sockets:
3310 * so_incomp for connections in progress and so_comp for connections already
3311 * made and awaiting user acceptance.  As a protocol is preparing incoming
3312 * connections, it creates a socket structure queued on so_incomp by calling
3313 * sonewconn().  When the connection is established, soisconnected() is
3314 * called, and transfers the socket structure to so_comp, making it available
3315 * to accept().
3316 *
3317 * If a socket is closed with sockets on either so_incomp or so_comp, these
3318 * sockets are dropped.
3319 *
3320 * If higher-level protocols are implemented in the kernel, the wakeups done
3321 * here will sometimes cause software-interrupt process scheduling.
3322 */
3323void
3324soisconnecting(struct socket *so)
3325{
3326
3327	SOCK_LOCK(so);
3328	so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
3329	so->so_state |= SS_ISCONNECTING;
3330	SOCK_UNLOCK(so);
3331}
3332
3333void
3334soisconnected(struct socket *so)
3335{
3336	struct socket *head;
3337	int ret;
3338
3339restart:
3340	ACCEPT_LOCK();
3341	SOCK_LOCK(so);
3342	so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
3343	so->so_state |= SS_ISCONNECTED;
3344	head = so->so_head;
3345	if (head != NULL && (so->so_qstate & SQ_INCOMP)) {
3346		if ((so->so_options & SO_ACCEPTFILTER) == 0) {
3347			SOCK_UNLOCK(so);
3348			TAILQ_REMOVE(&head->so_incomp, so, so_list);
3349			head->so_incqlen--;
3350			so->so_qstate &= ~SQ_INCOMP;
3351			TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
3352			head->so_qlen++;
3353			so->so_qstate |= SQ_COMP;
3354			ACCEPT_UNLOCK();
3355			sorwakeup(head);
3356			wakeup_one(&head->so_timeo);
3357		} else {
3358			ACCEPT_UNLOCK();
3359			soupcall_set(so, SO_RCV,
3360			    head->so_accf->so_accept_filter->accf_callback,
3361			    head->so_accf->so_accept_filter_arg);
3362			so->so_options &= ~SO_ACCEPTFILTER;
3363			ret = head->so_accf->so_accept_filter->accf_callback(so,
3364			    head->so_accf->so_accept_filter_arg, M_DONTWAIT);
3365			if (ret == SU_ISCONNECTED)
3366				soupcall_clear(so, SO_RCV);
3367			SOCK_UNLOCK(so);
3368			if (ret == SU_ISCONNECTED)
3369				goto restart;
3370		}
3371		return;
3372	}
3373	SOCK_UNLOCK(so);
3374	ACCEPT_UNLOCK();
3375	wakeup(&so->so_timeo);
3376	sorwakeup(so);
3377	sowwakeup(so);
3378}
3379
3380void
3381soisdisconnecting(struct socket *so)
3382{
3383
3384	/*
3385	 * Note: This code assumes that SOCK_LOCK(so) and
3386	 * SOCKBUF_LOCK(&so->so_rcv) are the same.
3387	 */
3388	SOCKBUF_LOCK(&so->so_rcv);
3389	so->so_state &= ~SS_ISCONNECTING;
3390	so->so_state |= SS_ISDISCONNECTING;
3391	so->so_rcv.sb_state |= SBS_CANTRCVMORE;
3392	sorwakeup_locked(so);
3393	SOCKBUF_LOCK(&so->so_snd);
3394	so->so_snd.sb_state |= SBS_CANTSENDMORE;
3395	sowwakeup_locked(so);
3396	wakeup(&so->so_timeo);
3397}
3398
3399void
3400soisdisconnected(struct socket *so)
3401{
3402
3403	/*
3404	 * Note: This code assumes that SOCK_LOCK(so) and
3405	 * SOCKBUF_LOCK(&so->so_rcv) are the same.
3406	 */
3407	SOCKBUF_LOCK(&so->so_rcv);
3408	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
3409	so->so_state |= SS_ISDISCONNECTED;
3410	so->so_rcv.sb_state |= SBS_CANTRCVMORE;
3411	sorwakeup_locked(so);
3412	SOCKBUF_LOCK(&so->so_snd);
3413	so->so_snd.sb_state |= SBS_CANTSENDMORE;
3414	sbdrop_locked(&so->so_snd, so->so_snd.sb_cc);
3415	sowwakeup_locked(so);
3416	wakeup(&so->so_timeo);
3417}
3418
3419/*
3420 * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
3421 */
3422struct sockaddr *
3423sodupsockaddr(const struct sockaddr *sa, int mflags)
3424{
3425	struct sockaddr *sa2;
3426
3427	sa2 = malloc(sa->sa_len, M_SONAME, mflags);
3428	if (sa2)
3429		bcopy(sa, sa2, sa->sa_len);
3430	return sa2;
3431}
3432
3433/*
3434 * Register per-socket buffer upcalls.
3435 */
3436void
3437soupcall_set(struct socket *so, int which,
3438    int (*func)(struct socket *, void *, int), void *arg)
3439{
3440	struct sockbuf *sb;
3441
3442	switch (which) {
3443	case SO_RCV:
3444		sb = &so->so_rcv;
3445		break;
3446	case SO_SND:
3447		sb = &so->so_snd;
3448		break;
3449	default:
3450		panic("soupcall_set: bad which");
3451	}
3452	SOCKBUF_LOCK_ASSERT(sb);
3453#if 0
3454	/* XXX: accf_http actually wants to do this on purpose. */
3455	KASSERT(sb->sb_upcall == NULL, ("soupcall_set: overwriting upcall"));
3456#endif
3457	sb->sb_upcall = func;
3458	sb->sb_upcallarg = arg;
3459	sb->sb_flags |= SB_UPCALL;
3460}
3461
3462void
3463soupcall_clear(struct socket *so, int which)
3464{
3465	struct sockbuf *sb;
3466
3467	switch (which) {
3468	case SO_RCV:
3469		sb = &so->so_rcv;
3470		break;
3471	case SO_SND:
3472		sb = &so->so_snd;
3473		break;
3474	default:
3475		panic("soupcall_clear: bad which");
3476	}
3477	SOCKBUF_LOCK_ASSERT(sb);
3478	KASSERT(sb->sb_upcall != NULL, ("soupcall_clear: no upcall to clear"));
3479	sb->sb_upcall = NULL;
3480	sb->sb_upcallarg = NULL;
3481	sb->sb_flags &= ~SB_UPCALL;
3482}
3483
3484/*
3485 * Create an external-format (``xsocket'') structure using the information in
3486 * the kernel-format socket structure pointed to by so.  This is done to
3487 * reduce the spew of irrelevant information over this interface, to isolate
3488 * user code from changes in the kernel structure, and potentially to provide
3489 * information-hiding if we decide that some of this information should be
3490 * hidden from users.
3491 */
3492void
3493sotoxsocket(struct socket *so, struct xsocket *xso)
3494{
3495
3496	xso->xso_len = sizeof *xso;
3497	xso->xso_so = so;
3498	xso->so_type = so->so_type;
3499	xso->so_options = so->so_options;
3500	xso->so_linger = so->so_linger;
3501	xso->so_state = so->so_state;
3502	xso->so_pcb = so->so_pcb;
3503	xso->xso_protocol = so->so_proto->pr_protocol;
3504	xso->xso_family = so->so_proto->pr_domain->dom_family;
3505	xso->so_qlen = so->so_qlen;
3506	xso->so_incqlen = so->so_incqlen;
3507	xso->so_qlimit = so->so_qlimit;
3508	xso->so_timeo = so->so_timeo;
3509	xso->so_error = so->so_error;
3510	xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0;
3511	xso->so_oobmark = so->so_oobmark;
3512	sbtoxsockbuf(&so->so_snd, &xso->so_snd);
3513	sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
3514	xso->so_uid = so->so_cred->cr_uid;
3515}
3516
3517
3518/*
 * Socket accessor functions to provide external consumers with a safe
 * interface to socket state.
3522 */
3523
3524void
so_listeners_apply_all(struct socket *so,
    void (*func)(struct socket *, void *), void *arg)
3526{
3527
3528	TAILQ_FOREACH(so, &so->so_comp, so_list)
3529		func(so, arg);
3530}
3531
3532struct sockbuf *
3533so_sockbuf_rcv(struct socket *so)
3534{
3535
3536	return (&so->so_rcv);
3537}
3538
3539struct sockbuf *
3540so_sockbuf_snd(struct socket *so)
3541{
3542
3543	return (&so->so_snd);
3544}
3545
3546int
3547so_state_get(const struct socket *so)
3548{
3549
3550	return (so->so_state);
3551}
3552
3553void
3554so_state_set(struct socket *so, int val)
3555{
3556
3557	so->so_state = val;
3558}
3559
3560int
3561so_options_get(const struct socket *so)
3562{
3563
3564	return (so->so_options);
3565}
3566
3567void
3568so_options_set(struct socket *so, int val)
3569{
3570
3571	so->so_options = val;
3572}
3573
3574int
3575so_error_get(const struct socket *so)
3576{
3577
3578	return (so->so_error);
3579}
3580
3581void
3582so_error_set(struct socket *so, int val)
3583{
3584
3585	so->so_error = val;
3586}
3587
3588int
3589so_linger_get(const struct socket *so)
3590{
3591
3592	return (so->so_linger);
3593}
3594
3595void
3596so_linger_set(struct socket *so, int val)
3597{
3598
3599	so->so_linger = val;
3600}
3601
3602struct protosw *
3603so_protosw_get(const struct socket *so)
3604{
3605
3606	return (so->so_proto);
3607}
3608
3609void
3610so_protosw_set(struct socket *so, struct protosw *val)
3611{
3612
3613	so->so_proto = val;
3614}
3615
3616void
3617so_sorwakeup(struct socket *so)
3618{
3619
3620	sorwakeup(so);
3621}
3622
3623void
3624so_sowwakeup(struct socket *so)
3625{
3626
3627	sowwakeup(so);
3628}
3629
3630void
3631so_sorwakeup_locked(struct socket *so)
3632{
3633
3634	sorwakeup_locked(so);
3635}
3636
3637void
3638so_sowwakeup_locked(struct socket *so)
3639{
3640
3641	sowwakeup_locked(so);
3642}
3643
3644void
3645so_lock(struct socket *so)
3646{
3647	SOCK_LOCK(so);
3648}
3649
3650void
3651so_unlock(struct socket *so)
3652{
3653	SOCK_UNLOCK(so);
3654}
3655