1/*-
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993
3 *	The Regents of the University of California.
4 * Copyright (c) 2004 The FreeBSD Foundation
5 * Copyright (c) 2004-2008 Robert N. M. Watson
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 * 4. Neither the name of the University nor the names of its contributors
17 *    may be used to endorse or promote products derived from this software
18 *    without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 *
32 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
33 */
34
35/*
36 * Comments on the socket life cycle:
37 *
 * soalloc() sets up socket layer state for a socket, called only by
39 * socreate() and sonewconn().  Socket layer private.
40 *
 * sodealloc() tears down socket layer state for a socket; it is called by
 * sofree(), and by socreate() and sonewconn() on their failure paths.
 * Socket layer private.
43 *
44 * pru_attach() associates protocol layer state with an allocated socket;
45 * called only once, may fail, aborting socket allocation.  This is called
46 * from socreate() and sonewconn().  Socket layer private.
47 *
48 * pru_detach() disassociates protocol layer state from an attached socket,
49 * and will be called exactly once for sockets in which pru_attach() has
50 * been successfully called.  If pru_attach() returned an error,
51 * pru_detach() will not be called.  Socket layer private.
52 *
53 * pru_abort() and pru_close() notify the protocol layer that the last
54 * consumer of a socket is starting to tear down the socket, and that the
55 * protocol should terminate the connection.  Historically, pru_abort() also
56 * detached protocol state from the socket state, but this is no longer the
57 * case.
58 *
59 * socreate() creates a socket and attaches protocol state.  This is a public
60 * interface that may be used by socket layer consumers to create new
61 * sockets.
62 *
63 * sonewconn() creates a socket and attaches protocol state.  This is a
 * public interface that may be used by protocols to create new sockets when
 * a new connection is received that will be made available for accept() on a
 * listen socket.
67 *
68 * soclose() destroys a socket after possibly waiting for it to disconnect.
69 * This is a public interface that socket consumers should use to close and
70 * release a socket when done with it.
71 *
72 * soabort() destroys a socket without waiting for it to disconnect (used
73 * only for incoming connections that are already partially or fully
74 * connected).  This is used internally by the socket layer when clearing
75 * listen socket queues (due to overflow or close on the listen socket), but
76 * is also a public interface protocols may use to abort connections in
77 * their incomplete listen queues should they no longer be required.  Sockets
78 * placed in completed connection listen queues should not be aborted for
79 * reasons described in the comment above the soclose() implementation.  This
80 * is not a general purpose close routine, and except in the specific
81 * circumstances described here, should not be used.
82 *
 * sofree() will free a socket and its protocol state if all references on
 * the socket have been released, and is the public interface to attempt to
 * free a socket when a reference is removed; it is normally invoked through
 * sorele() rather than called directly.
87 *
88 * NOTE: In addition to socreate() and soclose(), which provide a single
89 * socket reference to the consumer to be managed as required, there are two
 * calls to explicitly manage socket references: soref() and sorele().
91 * Currently, these are generally required only when transitioning a socket
92 * from a listen queue to a file descriptor, in order to prevent garbage
93 * collection of the socket at an untimely moment.  For a number of reasons,
94 * these interfaces are not preferred, and should be avoided.
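 *
 * For illustration only, the listen queue to file descriptor transition
 * mentioned above looks roughly like the following sketch (abbreviated, not
 * the exact kern_accept() code):
 *
 *	SOCK_LOCK(so);
 *	soref(so);
 *	SOCK_UNLOCK(so);
 *	... install the socket in a file descriptor ...
 *	ACCEPT_LOCK();
 *	SOCK_LOCK(so);
 *	sorele(so);	(drops the reference and both locks; may free)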
95 *
96 * NOTE: With regard to VNETs the general rule is that callers do not set
97 * curvnet. Exceptions to this rule include soabort(), sodisconnect(),
98 * sofree() (and with that sorele(), sotryfree()), as well as sonewconn()
99 * and sorflush(), which are usually called from a pre-set VNET context.
100 * sopoll() currently does not need a VNET context to be set.
101 */
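
/*
 * For illustration, a typical in-kernel consumer of these interfaces looks
 * roughly like the following sketch (error handling abbreviated; the
 * AF_INET/TCP parameters are only an example, and nam is a pre-built
 * sockaddr):
 *
 *	struct socket *so;
 *	int error;
 *
 *	error = socreate(AF_INET, &so, SOCK_STREAM, IPPROTO_TCP,
 *	    td->td_ucred, td);
 *	if (error != 0)
 *		return (error);
 *	error = soconnect(so, nam, td);
 *	...
 *	error = soclose(so);
 */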
102
103#include <sys/cdefs.h>
104__FBSDID("$FreeBSD$");
105
106#include "opt_inet.h"
107#include "opt_inet6.h"
108#include "opt_zero.h"
109#include "opt_compat.h"
110
111#include <sys/param.h>
112#include <sys/systm.h>
113#include <sys/fcntl.h>
114#include <sys/limits.h>
115#include <sys/lock.h>
116#include <sys/mac.h>
117#include <sys/malloc.h>
118#include <sys/mbuf.h>
119#include <sys/mutex.h>
120#include <sys/domain.h>
121#include <sys/file.h>			/* for struct knote */
122#include <sys/kernel.h>
123#include <sys/event.h>
124#include <sys/eventhandler.h>
125#include <sys/poll.h>
126#include <sys/proc.h>
127#include <sys/protosw.h>
128#include <sys/socket.h>
129#include <sys/socketvar.h>
130#include <sys/resourcevar.h>
131#include <net/route.h>
132#include <sys/signalvar.h>
133#include <sys/stat.h>
134#include <sys/sx.h>
135#include <sys/sysctl.h>
136#include <sys/uio.h>
137#include <sys/jail.h>
138#include <sys/syslog.h>
139#include <netinet/in.h>
140
141#include <net/vnet.h>
142
143#include <security/mac/mac_framework.h>
144
145#include <vm/uma.h>
146
147#ifdef COMPAT_FREEBSD32
148#include <sys/mount.h>
149#include <sys/sysent.h>
150#include <compat/freebsd32/freebsd32.h>
151#endif
152
153static int	soreceive_rcvoob(struct socket *so, struct uio *uio,
154		    int flags);
155
156static void	filt_sordetach(struct knote *kn);
157static int	filt_soread(struct knote *kn, long hint);
158static void	filt_sowdetach(struct knote *kn);
159static int	filt_sowrite(struct knote *kn, long hint);
160static int	filt_solisten(struct knote *kn, long hint);
161
162static struct filterops solisten_filtops = {
163	.f_isfd = 1,
164	.f_detach = filt_sordetach,
165	.f_event = filt_solisten,
166};
167static struct filterops soread_filtops = {
168	.f_isfd = 1,
169	.f_detach = filt_sordetach,
170	.f_event = filt_soread,
171};
172static struct filterops sowrite_filtops = {
173	.f_isfd = 1,
174	.f_detach = filt_sowdetach,
175	.f_event = filt_sowrite,
176};
177
178so_gen_t	so_gencnt;	/* generation count for sockets */
179
180MALLOC_DEFINE(M_SONAME, "soname", "socket name");
181MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
182
183#define	VNET_SO_ASSERT(so)						\
184	VNET_ASSERT(curvnet != NULL,					\
185	    ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so)));
186
187/*
188 * Limit on the number of connections in the listen queue waiting
189 * for accept(2).
190 */
191static int somaxconn = SOMAXCONN;
192
193static int
194sysctl_somaxconn(SYSCTL_HANDLER_ARGS)
195{
196	int error;
197	int val;
198
199	val = somaxconn;
200	error = sysctl_handle_int(oidp, &val, 0, req);
	if (error || !req->newptr)
202		return (error);
203
204	if (val < 1 || val > USHRT_MAX)
205		return (EINVAL);
206
207	somaxconn = val;
208	return (0);
209}
210SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLTYPE_UINT | CTLFLAG_RW,
211    0, sizeof(int), sysctl_somaxconn, "I",
212    "Maximum listen socket pending connection accept queue size");
213
214static int numopensockets;
215SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
216    &numopensockets, 0, "Number of open sockets");
217
218#ifdef ZERO_COPY_SOCKETS
219/* These aren't static because they're used in other files. */
220int so_zero_copy_send = 1;
221int so_zero_copy_receive = 1;
222SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0,
223    "Zero copy controls");
224SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW,
225    &so_zero_copy_receive, 0, "Enable zero copy receive");
226SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW,
227    &so_zero_copy_send, 0, "Enable zero copy send");
228#endif /* ZERO_COPY_SOCKETS */
229
230/*
231 * accept_mtx locks down per-socket fields relating to accept queues.  See
232 * socketvar.h for an annotation of the protected fields of struct socket.
233 */
234struct mtx accept_mtx;
235MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);
236
237/*
 * so_global_mtx protects the global so_gencnt and numopensockets counters,
 * as well as the per-socket so_gencnt field.
240 */
241static struct mtx so_global_mtx;
242MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF);
243
244/*
245 * General IPC sysctl name space, used by sockets and a variety of other IPC
246 * types.
247 */
248SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");
249
250/*
251 * Initialize the socket subsystem and set up the socket
252 * memory allocator.
253 */
254uma_zone_t socket_zone;
255int	maxsockets;
256
257static void
258socket_zone_change(void *tag)
259{
260
261	uma_zone_set_max(socket_zone, maxsockets);
262}
263
264static void
265socket_init(void *tag)
266{
267
	socket_zone = uma_zcreate("socket", sizeof(struct socket), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	uma_zone_set_max(socket_zone, maxsockets);
	EVENTHANDLER_REGISTER(maxsockets_change, socket_zone_change, NULL,
	    EVENTHANDLER_PRI_FIRST);
273}
274SYSINIT(socket, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_init, NULL);
275
276/*
 * Initialize maxsockets.  This SYSINIT must be run after
278 * tunable_mbinit().
279 */
280static void
281init_maxsockets(void *ignored)
282{
283
284	TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
285	maxsockets = imax(maxsockets, maxfiles);
286}
287SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);
288
289/*
290 * Sysctl to get and set the maximum global sockets limit.  Notify protocols
291 * of the change so that they can update their dependent limits as required.
292 */
293static int
294sysctl_maxsockets(SYSCTL_HANDLER_ARGS)
295{
296	int error, newmaxsockets;
297
298	newmaxsockets = maxsockets;
299	error = sysctl_handle_int(oidp, &newmaxsockets, 0, req);
300	if (error == 0 && req->newptr) {
301		if (newmaxsockets > maxsockets &&
302		    newmaxsockets <= maxfiles) {
303			maxsockets = newmaxsockets;
304			EVENTHANDLER_INVOKE(maxsockets_change);
305		} else
306			error = EINVAL;
307	}
308	return (error);
309}
310SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT|CTLFLAG_RW,
311    &maxsockets, 0, sysctl_maxsockets, "IU",
    "Maximum number of sockets available");
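
/*
 * Protocols that size their own data structures from maxsockets can track
 * changes with the maxsockets_change eventhandler, much as socket_zone is
 * resized above.  A hedged sketch (proto_pcb_zone is a hypothetical zone
 * name, not a real symbol):
 *
 *	static void
 *	proto_zone_change(void *tag)
 *	{
 *
 *		uma_zone_set_max(proto_pcb_zone, maxsockets);
 *	}
 *
 *	and, from the protocol's init routine:
 *
 *	EVENTHANDLER_REGISTER(maxsockets_change, proto_zone_change, NULL,
 *	    EVENTHANDLER_PRI_ANY);
 */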
313
314/*
315 * Socket operation routines.  These routines are called by the routines in
316 * sys_socket.c or from a system process, and implement the semantics of
317 * socket operations by switching out to the protocol specific routines.
318 */
319
320/*
321 * Get a socket structure from our zone, and initialize it.  Note that it
322 * would probably be better to allocate socket and PCB at the same time, but
323 * I'm not convinced that all the protocols can be easily modified to do
324 * this.
325 *
326 * soalloc() returns a socket with a ref count of 0.
327 */
328static struct socket *
329soalloc(struct vnet *vnet)
330{
331	struct socket *so;
332
333	so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO);
334	if (so == NULL)
335		return (NULL);
336#ifdef MAC
337	if (mac_socket_init(so, M_NOWAIT) != 0) {
338		uma_zfree(socket_zone, so);
339		return (NULL);
340	}
341#endif
342	SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
343	SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
344	sx_init(&so->so_snd.sb_sx, "so_snd_sx");
345	sx_init(&so->so_rcv.sb_sx, "so_rcv_sx");
346	TAILQ_INIT(&so->so_aiojobq);
347	mtx_lock(&so_global_mtx);
348	so->so_gencnt = ++so_gencnt;
349	++numopensockets;
350#ifdef VIMAGE
351	VNET_ASSERT(vnet != NULL, ("%s:%d vnet is NULL, so=%p",
352	    __func__, __LINE__, so));
353	vnet->vnet_sockcnt++;
354	so->so_vnet = vnet;
355#endif
356	mtx_unlock(&so_global_mtx);
357	return (so);
358}
359
360/*
361 * Free the storage associated with a socket at the socket layer, tear down
362 * locks, labels, etc.  All protocol state is assumed already to have been
363 * torn down (and possibly never set up) by the caller.
364 */
365static void
366sodealloc(struct socket *so)
367{
368
369	KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
370	KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL"));
371
372	mtx_lock(&so_global_mtx);
373	so->so_gencnt = ++so_gencnt;
374	--numopensockets;	/* Could be below, but faster here. */
375#ifdef VIMAGE
376	VNET_ASSERT(so->so_vnet != NULL, ("%s:%d so_vnet is NULL, so=%p",
377	    __func__, __LINE__, so));
378	so->so_vnet->vnet_sockcnt--;
379#endif
380	mtx_unlock(&so_global_mtx);
381	if (so->so_rcv.sb_hiwat)
382		(void)chgsbsize(so->so_cred->cr_uidinfo,
383		    &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
384	if (so->so_snd.sb_hiwat)
385		(void)chgsbsize(so->so_cred->cr_uidinfo,
386		    &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
387#ifdef INET
	/* Remove accept filter if one is present. */
389	if (so->so_accf != NULL)
390		do_setopt_accept_filter(so, NULL);
391#endif
392#ifdef MAC
393	mac_socket_destroy(so);
394#endif
395	crfree(so->so_cred);
396	sx_destroy(&so->so_snd.sb_sx);
397	sx_destroy(&so->so_rcv.sb_sx);
398	SOCKBUF_LOCK_DESTROY(&so->so_snd);
399	SOCKBUF_LOCK_DESTROY(&so->so_rcv);
400	uma_zfree(socket_zone, so);
401}
402
403/*
404 * socreate returns a socket with a ref count of 1.  The socket should be
405 * closed with soclose().
406 */
407int
408socreate(int dom, struct socket **aso, int type, int proto,
409    struct ucred *cred, struct thread *td)
410{
411	struct protosw *prp;
412	struct socket *so;
413	int error;
414
415	if (proto)
416		prp = pffindproto(dom, proto, type);
417	else
418		prp = pffindtype(dom, type);
419
420	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL ||
421	    prp->pr_usrreqs->pru_attach == pru_attach_notsupp)
422		return (EPROTONOSUPPORT);
423
424	if (prison_check_af(cred, prp->pr_domain->dom_family) != 0)
425		return (EPROTONOSUPPORT);
426
427	if (prp->pr_type != type)
428		return (EPROTOTYPE);
429	so = soalloc(CRED_TO_VNET(cred));
430	if (so == NULL)
431		return (ENOBUFS);
432
433	TAILQ_INIT(&so->so_incomp);
434	TAILQ_INIT(&so->so_comp);
435	so->so_type = type;
436	so->so_cred = crhold(cred);
437	if ((prp->pr_domain->dom_family == PF_INET) ||
438	    (prp->pr_domain->dom_family == PF_INET6) ||
439	    (prp->pr_domain->dom_family == PF_ROUTE))
440		so->so_fibnum = td->td_proc->p_fibnum;
441	else
442		so->so_fibnum = 0;
443	so->so_proto = prp;
444#ifdef MAC
445	mac_socket_create(cred, so);
446#endif
447	knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv));
448	knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd));
449	so->so_count = 1;
450	/*
451	 * Auto-sizing of socket buffers is managed by the protocols and
452	 * the appropriate flags must be set in the pru_attach function.
453	 */
454	CURVNET_SET(so->so_vnet);
455	error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
456	CURVNET_RESTORE();
457	if (error) {
458		KASSERT(so->so_count == 1, ("socreate: so_count %d",
459		    so->so_count));
460		so->so_count = 0;
461		sodealloc(so);
462		return (error);
463	}
464	*aso = so;
465	return (0);
466}
467
468#ifdef REGRESSION
469static int regression_sonewconn_earlytest = 1;
470SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW,
471    &regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test");
472#endif
473
474/*
475 * When an attempt at a new connection is noted on a socket which accepts
476 * connections, sonewconn is called.  If the connection is possible (subject
 * to space constraints, etc.) then we allocate a new structure, properly
 * linked into the data structure of the original socket, and return this.
 * Connstatus may be 0, SS_ISCONFIRMING, or SS_ISCONNECTED.
480 *
481 * Note: the ref count on the socket is 0 on return.
482 */
483struct socket *
484sonewconn(struct socket *head, int connstatus)
485{
486	static struct timeval lastover;
487	static struct timeval overinterval = { 60, 0 };
488	static int overcount;
489
490	struct socket *so;
491	int over;
492
493	ACCEPT_LOCK();
494	over = (head->so_qlen > 3 * head->so_qlimit / 2);
495	ACCEPT_UNLOCK();
496#ifdef REGRESSION
497	if (regression_sonewconn_earlytest && over) {
498#else
499	if (over) {
500#endif
501		overcount++;
502
503		if (ratecheck(&lastover, &overinterval)) {
504			log(LOG_DEBUG, "%s: pcb %p: Listen queue overflow: "
505			    "%i already in queue awaiting acceptance "
506			    "(%d occurrences)\n",
507			    __func__, head->so_pcb, head->so_qlen, overcount);
508
509			overcount = 0;
510		}
511
512		return (NULL);
513	}
514	VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p",
515	    __func__, __LINE__, head));
516	so = soalloc(head->so_vnet);
517	if (so == NULL) {
518		log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: "
519		    "limit reached or out of memory\n",
520		    __func__, head->so_pcb);
521		return (NULL);
522	}
523	if ((head->so_options & SO_ACCEPTFILTER) != 0)
524		connstatus = 0;
525	so->so_head = head;
526	so->so_type = head->so_type;
527	so->so_options = head->so_options &~ SO_ACCEPTCONN;
528	so->so_linger = head->so_linger;
529	so->so_state = head->so_state | SS_NOFDREF;
530	so->so_fibnum = head->so_fibnum;
531	so->so_proto = head->so_proto;
532	so->so_cred = crhold(head->so_cred);
533#ifdef MAC
534	mac_socket_newconn(head, so);
535#endif
536	knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv));
537	knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd));
538	VNET_SO_ASSERT(head);
539	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
540		sodealloc(so);
541		log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n",
542		    __func__, head->so_pcb);
543		return (NULL);
544	}
545	if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
546		sodealloc(so);
547		log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n",
548		    __func__, head->so_pcb);
549		return (NULL);
550	}
551	so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
552	so->so_snd.sb_lowat = head->so_snd.sb_lowat;
553	so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
554	so->so_snd.sb_timeo = head->so_snd.sb_timeo;
555	so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
556	so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
557	so->so_state |= connstatus;
558	ACCEPT_LOCK();
559	/*
560	 * The accept socket may be tearing down but we just
561	 * won a race on the ACCEPT_LOCK.
562	 * However, if sctp_peeloff() is called on a 1-to-many
	 * style socket, SO_ACCEPTCONN need not be set.
564	 */
565	if (!(head->so_options & SO_ACCEPTCONN) &&
566	    ((head->so_proto->pr_protocol != IPPROTO_SCTP) ||
567	     (head->so_type != SOCK_SEQPACKET))) {
568		SOCK_LOCK(so);
569		so->so_head = NULL;
570		sofree(so);		/* NB: returns ACCEPT_UNLOCK'ed. */
571		return (NULL);
572	}
573	if (connstatus) {
574		TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
575		so->so_qstate |= SQ_COMP;
576		head->so_qlen++;
577	} else {
578		/*
579		 * Keep removing sockets from the head until there's room for
580		 * us to insert on the tail.  In pre-locking revisions, this
581		 * was a simple if(), but as we could be racing with other
582		 * threads and soabort() requires dropping locks, we must
583		 * loop waiting for the condition to be true.
584		 */
585		while (head->so_incqlen > head->so_qlimit) {
586			struct socket *sp;
587			sp = TAILQ_FIRST(&head->so_incomp);
588			TAILQ_REMOVE(&head->so_incomp, sp, so_list);
589			head->so_incqlen--;
590			sp->so_qstate &= ~SQ_INCOMP;
591			sp->so_head = NULL;
592			ACCEPT_UNLOCK();
593			soabort(sp);
594			ACCEPT_LOCK();
595		}
596		TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
597		so->so_qstate |= SQ_INCOMP;
598		head->so_incqlen++;
599	}
600	ACCEPT_UNLOCK();
601	if (connstatus) {
602		sorwakeup(head);
603		wakeup_one(&head->so_timeo);
604	}
605	return (so);
606}
607
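/*
 * sobind() binds a socket to a local address by dispatching to the
 * protocol's pru_bind method with the socket's VNET context set.
 */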
608int
609sobind(struct socket *so, struct sockaddr *nam, struct thread *td)
610{
611	int error;
612
613	CURVNET_SET(so->so_vnet);
614	error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td);
615	CURVNET_RESTORE();
	return (error);
617}
618
619/*
620 * solisten() transitions a socket from a non-listening state to a listening
621 * state, but can also be used to update the listen queue depth on an
622 * existing listen socket.  The protocol will call back into the sockets
623 * layer using solisten_proto_check() and solisten_proto() to check and set
 * socket-layer listen state.  Callbacks are used so that the protocol can
625 * acquire both protocol and socket layer locks in whatever order is required
626 * by the protocol.
627 *
 * Protocol implementors are advised to hold the socket lock across the
 * socket-layer test and set to avoid races at the socket layer; a sketch of
 * the expected calling pattern follows solisten() below.
630 */
631int
632solisten(struct socket *so, int backlog, struct thread *td)
633{
634	int error;
635
636	CURVNET_SET(so->so_vnet);
637	error = (*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td);
638	CURVNET_RESTORE();
	return (error);
640}
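
/*
 * A protocol's pru_listen method typically brackets the two callbacks below
 * with its own locking, roughly as in the following sketch (abbreviated, not
 * any particular protocol's actual code):
 *
 *	SOCK_LOCK(so);
 *	error = solisten_proto_check(so);
 *	if (error == 0) {
 *		... protocol-specific checks, implicit bind, etc. ...
 *		solisten_proto(so, backlog);
 *	}
 *	SOCK_UNLOCK(so);
 *	return (error);
 */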
641
642int
643solisten_proto_check(struct socket *so)
644{
645
646	SOCK_LOCK_ASSERT(so);
647
648	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
649	    SS_ISDISCONNECTING))
650		return (EINVAL);
651	return (0);
652}
653
654void
655solisten_proto(struct socket *so, int backlog)
656{
657
658	SOCK_LOCK_ASSERT(so);
659
660	if (backlog < 0 || backlog > somaxconn)
661		backlog = somaxconn;
662	so->so_qlimit = backlog;
663	so->so_options |= SO_ACCEPTCONN;
664}
665
666/*
667 * Evaluate the reference count and named references on a socket; if no
668 * references remain, free it.  This should be called whenever a reference is
669 * released, such as in sorele(), but also when named reference flags are
670 * cleared in socket or protocol code.
671 *
672 * sofree() will free the socket if:
673 *
674 * - There are no outstanding file descriptor references or related consumers
675 *   (so_count == 0).
676 *
677 * - The socket has been closed by user space, if ever open (SS_NOFDREF).
678 *
679 * - The protocol does not have an outstanding strong reference on the socket
680 *   (SS_PROTOREF).
681 *
 * - The socket is not in a completed connection queue, where a process may
 *   already have been notified that it is present.  If it were removed, the
 *   user process could block in accept() despite select() saying the socket
 *   was ready.
685 */
686void
687sofree(struct socket *so)
688{
689	struct protosw *pr = so->so_proto;
690	struct socket *head;
691
692	ACCEPT_LOCK_ASSERT();
693	SOCK_LOCK_ASSERT(so);
694
695	if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 ||
696	    (so->so_state & SS_PROTOREF) || (so->so_qstate & SQ_COMP)) {
697		SOCK_UNLOCK(so);
698		ACCEPT_UNLOCK();
699		return;
700	}
701
702	head = so->so_head;
703	if (head != NULL) {
704		KASSERT((so->so_qstate & SQ_COMP) != 0 ||
705		    (so->so_qstate & SQ_INCOMP) != 0,
706		    ("sofree: so_head != NULL, but neither SQ_COMP nor "
707		    "SQ_INCOMP"));
708		KASSERT((so->so_qstate & SQ_COMP) == 0 ||
709		    (so->so_qstate & SQ_INCOMP) == 0,
710		    ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
711		TAILQ_REMOVE(&head->so_incomp, so, so_list);
712		head->so_incqlen--;
713		so->so_qstate &= ~SQ_INCOMP;
714		so->so_head = NULL;
715	}
716	KASSERT((so->so_qstate & SQ_COMP) == 0 &&
717	    (so->so_qstate & SQ_INCOMP) == 0,
718	    ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
719	    so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
720	if (so->so_options & SO_ACCEPTCONN) {
721		KASSERT((TAILQ_EMPTY(&so->so_comp)), ("sofree: so_comp populated"));
722		KASSERT((TAILQ_EMPTY(&so->so_incomp)), ("sofree: so_incomp populated"));
723	}
724	SOCK_UNLOCK(so);
725	ACCEPT_UNLOCK();
726
727	VNET_SO_ASSERT(so);
728	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
729		(*pr->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
730	if (pr->pr_usrreqs->pru_detach != NULL)
731		(*pr->pr_usrreqs->pru_detach)(so);
732
733	/*
734	 * From this point on, we assume that no other references to this
735	 * socket exist anywhere else in the stack.  Therefore, no locks need
736	 * to be acquired or held.
737	 *
738	 * We used to do a lot of socket buffer and socket locking here, as
	 * well as invoke sorflush() and perform wakeups.  The direct calls to
	 * dom_dispose() and sbrelease_internal() are an inlining of what was
741	 * necessary from sorflush().
742	 *
	 * Notice that the socket buffer and kqueue state are torn down right
	 * after pru_detach has been called.  This means that protocols should
	 * not assume they can perform socket wakeups, etc., in their detach
	 * code.
746	 */
747	sbdestroy(&so->so_snd, so);
748	sbdestroy(&so->so_rcv, so);
749	seldrain(&so->so_snd.sb_sel);
750	seldrain(&so->so_rcv.sb_sel);
751	knlist_destroy(&so->so_rcv.sb_sel.si_note);
752	knlist_destroy(&so->so_snd.sb_sel.si_note);
753	sodealloc(so);
754}
755
756/*
757 * Close a socket on last file table reference removal.  Initiate disconnect
758 * if connected.  Free socket when disconnect complete.
759 *
760 * This function will sorele() the socket.  Note that soclose() may be called
761 * prior to the ref count reaching zero.  The actual socket structure will
762 * not be freed until the ref count reaches zero.
763 */
764int
765soclose(struct socket *so)
766{
767	int error = 0;
768
769	KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));
770
771	CURVNET_SET(so->so_vnet);
772	funsetown(&so->so_sigio);
773	if (so->so_state & SS_ISCONNECTED) {
774		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
775			error = sodisconnect(so);
776			if (error) {
777				if (error == ENOTCONN)
778					error = 0;
779				goto drop;
780			}
781		}
782		if (so->so_options & SO_LINGER) {
783			if ((so->so_state & SS_ISDISCONNECTING) &&
784			    (so->so_state & SS_NBIO))
785				goto drop;
786			while (so->so_state & SS_ISCONNECTED) {
787				error = tsleep(&so->so_timeo,
788				    PSOCK | PCATCH, "soclos", so->so_linger * hz);
789				if (error)
790					break;
791			}
792		}
793	}
794
795drop:
796	if (so->so_proto->pr_usrreqs->pru_close != NULL)
797		(*so->so_proto->pr_usrreqs->pru_close)(so);
798	ACCEPT_LOCK();
799	if (so->so_options & SO_ACCEPTCONN) {
800		struct socket *sp;
801		/*
802		 * Prevent new additions to the accept queues due
803		 * to ACCEPT_LOCK races while we are draining them.
804		 */
805		so->so_options &= ~SO_ACCEPTCONN;
806		while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
807			TAILQ_REMOVE(&so->so_incomp, sp, so_list);
808			so->so_incqlen--;
809			sp->so_qstate &= ~SQ_INCOMP;
810			sp->so_head = NULL;
811			ACCEPT_UNLOCK();
812			soabort(sp);
813			ACCEPT_LOCK();
814		}
815		while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
816			TAILQ_REMOVE(&so->so_comp, sp, so_list);
817			so->so_qlen--;
818			sp->so_qstate &= ~SQ_COMP;
819			sp->so_head = NULL;
820			ACCEPT_UNLOCK();
821			soabort(sp);
822			ACCEPT_LOCK();
823		}
824		KASSERT((TAILQ_EMPTY(&so->so_comp)),
825		    ("%s: so_comp populated", __func__));
826		KASSERT((TAILQ_EMPTY(&so->so_incomp)),
827		    ("%s: so_incomp populated", __func__));
828	}
829	SOCK_LOCK(so);
830	KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
831	so->so_state |= SS_NOFDREF;
832	sorele(so);			/* NB: Returns with ACCEPT_UNLOCK(). */
833	CURVNET_RESTORE();
834	return (error);
835}
836
837/*
838 * soabort() is used to abruptly tear down a connection, such as when a
839 * resource limit is reached (listen queue depth exceeded), or if a listen
840 * socket is closed while there are sockets waiting to be accepted.
841 *
842 * This interface is tricky, because it is called on an unreferenced socket,
843 * and must be called only by a thread that has actually removed the socket
844 * from the listen queue it was on, or races with other threads are risked.
845 *
846 * This interface will call into the protocol code, so must not be called
847 * with any socket locks held.  Protocols do call it while holding their own
848 * recursible protocol mutexes, but this is something that should be subject
849 * to review in the future.
850 */
851void
852soabort(struct socket *so)
853{
854
855	/*
856	 * In as much as is possible, assert that no references to this
857	 * socket are held.  This is not quite the same as asserting that the
858	 * current thread is responsible for arranging for no references, but
859	 * is as close as we can get for now.
860	 */
861	KASSERT(so->so_count == 0, ("soabort: so_count"));
862	KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF"));
863	KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF"));
	KASSERT((so->so_qstate & SQ_COMP) == 0, ("soabort: SQ_COMP"));
	KASSERT((so->so_qstate & SQ_INCOMP) == 0, ("soabort: SQ_INCOMP"));
866	VNET_SO_ASSERT(so);
867
868	if (so->so_proto->pr_usrreqs->pru_abort != NULL)
869		(*so->so_proto->pr_usrreqs->pru_abort)(so);
870	ACCEPT_LOCK();
871	SOCK_LOCK(so);
872	sofree(so);
873}
874
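/*
 * soaccept() is called once a socket has been pulled off a listen socket's
 * completed connection queue: it clears SS_NOFDREF, since the socket now has
 * a file descriptor reference, and asks the protocol for the peer's address
 * via pru_accept.
 */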
875int
876soaccept(struct socket *so, struct sockaddr **nam)
877{
878	int error;
879
880	SOCK_LOCK(so);
881	KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
882	so->so_state &= ~SS_NOFDREF;
883	SOCK_UNLOCK(so);
884
885	CURVNET_SET(so->so_vnet);
886	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
887	CURVNET_RESTORE();
888	return (error);
889}
890
891int
892soconnect(struct socket *so, struct sockaddr *nam, struct thread *td)
893{
894	int error;
895
896	if (so->so_options & SO_ACCEPTCONN)
897		return (EOPNOTSUPP);
898
899	CURVNET_SET(so->so_vnet);
900	/*
901	 * If protocol is connection-based, can only connect once.
902	 * Otherwise, if connected, try to disconnect first.  This allows
903	 * user to disconnect by connecting to, e.g., a null address.
904	 */
905	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
906	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
907	    (error = sodisconnect(so)))) {
908		error = EISCONN;
909	} else {
910		/*
911		 * Prevent accumulated error from previous connection from
912		 * biting us.
913		 */
914		so->so_error = 0;
915		error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td);
916	}
917	CURVNET_RESTORE();
918
919	return (error);
920}
921
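/*
 * soconnect2() connects a pair of sockets to each other (e.g., for
 * socketpair(2)) by dispatching to pru_connect2 in so1's VNET context.
 */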
922int
923soconnect2(struct socket *so1, struct socket *so2)
924{
925	int error;
926
927	CURVNET_SET(so1->so_vnet);
928	error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
929	CURVNET_RESTORE();
930	return (error);
931}
932
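/*
 * sodisconnect() initiates a protocol-level disconnect of a connected socket
 * via pru_disconnect.  Unlike most entry points in this file, it expects the
 * caller to have already set the VNET context (see the VNET note at the top
 * of this file).
 */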
933int
934sodisconnect(struct socket *so)
935{
936	int error;
937
938	if ((so->so_state & SS_ISCONNECTED) == 0)
939		return (ENOTCONN);
940	if (so->so_state & SS_ISDISCONNECTING)
941		return (EALREADY);
942	VNET_SO_ASSERT(so);
943	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
944	return (error);
945}
946
947#ifdef ZERO_COPY_SOCKETS
948struct so_zerocopy_stats{
949	int size_ok;
950	int align_ok;
951	int found_ifp;
952};
953struct so_zerocopy_stats so_zerocp_stats = {0,0,0};
954
955/*
956 * sosend_copyin() is only used if zero copy sockets are enabled.  Otherwise
957 * sosend_dgram() and sosend_generic() use m_uiotombuf().
958 *
959 * sosend_copyin() accepts a uio and prepares an mbuf chain holding part or
960 * all of the data referenced by the uio.  If desired, it uses zero-copy.
961 * *space will be updated to reflect data copied in.
962 *
963 * NB: If atomic I/O is requested, the caller must already have checked that
964 * space can hold resid bytes.
965 *
966 * NB: In the event of an error, the caller may need to free the partial
967 * chain pointed to by *mpp.  The contents of both *uio and *space may be
968 * modified even in the case of an error.
969 */
970static int
971sosend_copyin(struct uio *uio, struct mbuf **retmp, int atomic, long *space,
972    int flags)
973{
974	struct mbuf *m, **mp, *top;
975	long len;
976	ssize_t resid;
977	int error;
978	int cow_send;
979
980	*retmp = top = NULL;
981	mp = &top;
982	len = 0;
983	resid = uio->uio_resid;
984	error = 0;
985	do {
986		cow_send = 0;
987		if (resid >= MINCLSIZE) {
988			if (top == NULL) {
989				m = m_gethdr(M_WAITOK, MT_DATA);
990				m->m_pkthdr.len = 0;
991				m->m_pkthdr.rcvif = NULL;
992			} else
993				m = m_get(M_WAITOK, MT_DATA);
994			if (so_zero_copy_send &&
995			    resid >= PAGE_SIZE &&
996			    *space >= PAGE_SIZE &&
997			    uio->uio_iov->iov_len >= PAGE_SIZE) {
998				so_zerocp_stats.size_ok++;
999				so_zerocp_stats.align_ok++;
1000				cow_send = socow_setup(m, uio);
1001				len = cow_send;
1002			}
1003			if (!cow_send) {
1004				m_clget(m, M_WAITOK);
1005				len = min(min(MCLBYTES, resid), *space);
1006			}
1007		} else {
1008			if (top == NULL) {
1009				m = m_gethdr(M_WAIT, MT_DATA);
1010				m->m_pkthdr.len = 0;
1011				m->m_pkthdr.rcvif = NULL;
1012
1013				len = min(min(MHLEN, resid), *space);
1014				/*
1015				 * For datagram protocols, leave room
1016				 * for protocol headers in first mbuf.
1017				 */
1018				if (atomic && m && len < MHLEN)
1019					MH_ALIGN(m, len);
1020			} else {
1021				m = m_get(M_WAIT, MT_DATA);
1022				len = min(min(MLEN, resid), *space);
1023			}
1024		}
1025		if (m == NULL) {
1026			error = ENOBUFS;
1027			goto out;
1028		}
1029
1030		*space -= len;
1031		if (cow_send)
1032			error = 0;
1033		else
1034			error = uiomove(mtod(m, void *), (int)len, uio);
1035		resid = uio->uio_resid;
1036		m->m_len = len;
1037		*mp = m;
1038		top->m_pkthdr.len += len;
1039		if (error)
1040			goto out;
1041		mp = &m->m_next;
1042		if (resid <= 0) {
1043			if (flags & MSG_EOR)
1044				top->m_flags |= M_EOR;
1045			break;
1046		}
1047	} while (*space > 0 && atomic);
1048out:
1049	*retmp = top;
1050	return (error);
1051}
1052#endif /* ZERO_COPY_SOCKETS */
1053
1054#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
1055
1056int
1057sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio,
1058    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
1059{
1060	long space;
1061	ssize_t resid;
1062	int clen = 0, error, dontroute;
1063#ifdef ZERO_COPY_SOCKETS
1064	int atomic = sosendallatonce(so) || top;
1065#endif
1066
	KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM"));
	KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
	    ("sosend_dgram: !PR_ATOMIC"));
1070
1071	if (uio != NULL)
1072		resid = uio->uio_resid;
1073	else
1074		resid = top->m_pkthdr.len;
1075	/*
1076	 * In theory resid should be unsigned.  However, space must be
1077	 * signed, as it might be less than 0 if we over-committed, and we
1078	 * must use a signed comparison of space and resid.  On the other
1079	 * hand, a negative resid causes us to loop sending 0-length
1080	 * segments to the protocol.
1081	 */
1082	if (resid < 0) {
1083		error = EINVAL;
1084		goto out;
1085	}
1086
1087	dontroute =
1088	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0;
1089	if (td != NULL)
1090		td->td_ru.ru_msgsnd++;
1091	if (control != NULL)
1092		clen = control->m_len;
1093
1094	SOCKBUF_LOCK(&so->so_snd);
1095	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
1096		SOCKBUF_UNLOCK(&so->so_snd);
1097		error = EPIPE;
1098		goto out;
1099	}
1100	if (so->so_error) {
1101		error = so->so_error;
1102		so->so_error = 0;
1103		SOCKBUF_UNLOCK(&so->so_snd);
1104		goto out;
1105	}
1106	if ((so->so_state & SS_ISCONNECTED) == 0) {
1107		/*
		 * `sendto' and `sendmsg' are allowed on a connection-based
1109		 * socket if it supports implied connect.  Return ENOTCONN if
1110		 * not connected and no address is supplied.
1111		 */
1112		if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
1113		    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
1114			if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1115			    !(resid == 0 && clen != 0)) {
1116				SOCKBUF_UNLOCK(&so->so_snd);
1117				error = ENOTCONN;
1118				goto out;
1119			}
1120		} else if (addr == NULL) {
1121			if (so->so_proto->pr_flags & PR_CONNREQUIRED)
1122				error = ENOTCONN;
1123			else
1124				error = EDESTADDRREQ;
1125			SOCKBUF_UNLOCK(&so->so_snd);
1126			goto out;
1127		}
1128	}
1129
1130	/*
1131	 * Do we need MSG_OOB support in SOCK_DGRAM?  Signs here may be a
1132	 * problem and need fixing.
1133	 */
1134	space = sbspace(&so->so_snd);
1135	if (flags & MSG_OOB)
1136		space += 1024;
1137	space -= clen;
1138	SOCKBUF_UNLOCK(&so->so_snd);
1139	if (resid > space) {
1140		error = EMSGSIZE;
1141		goto out;
1142	}
1143	if (uio == NULL) {
1144		resid = 0;
1145		if (flags & MSG_EOR)
1146			top->m_flags |= M_EOR;
1147	} else {
1148#ifdef ZERO_COPY_SOCKETS
1149		error = sosend_copyin(uio, &top, atomic, &space, flags);
1150		if (error)
1151			goto out;
1152#else
1153		/*
1154		 * Copy the data from userland into a mbuf chain.
1155		 * If no data is to be copied in, a single empty mbuf
1156		 * is returned.
1157		 */
1158		top = m_uiotombuf(uio, M_WAITOK, space, max_hdr,
1159		    (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0)));
1160		if (top == NULL) {
1161			error = EFAULT;	/* only possible error */
1162			goto out;
1163		}
1164		space -= resid - uio->uio_resid;
1165#endif
1166		resid = uio->uio_resid;
1167	}
1168	KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
1169	/*
1170	 * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock
1171	 * than with.
1172	 */
1173	if (dontroute) {
1174		SOCK_LOCK(so);
1175		so->so_options |= SO_DONTROUTE;
1176		SOCK_UNLOCK(so);
1177	}
1178	/*
1179	 * XXX all the SBS_CANTSENDMORE checks previously done could be out
	 * of date.  We could have received a reset packet in an interrupt or
1181	 * maybe we slept while doing page faults in uiomove() etc.  We could
1182	 * probably recheck again inside the locking protection here, but
1183	 * there are probably other places that this also happens.  We must
1184	 * rethink this.
1185	 */
1186	VNET_SO_ASSERT(so);
1187	error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1188	    (flags & MSG_OOB) ? PRUS_OOB :
1189	/*
	 * If the user set MSG_EOF, the protocol understands this flag, and
	 * there is nothing left to send, then use PRU_SEND_EOF instead of
	 * PRU_SEND.
1192	 */
1193	    ((flags & MSG_EOF) &&
1194	     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1195	     (resid <= 0)) ?
1196		PRUS_EOF :
1197		/* If there is more to send set PRUS_MORETOCOME */
1198		(resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
1199		top, addr, control, td);
1200	if (dontroute) {
1201		SOCK_LOCK(so);
1202		so->so_options &= ~SO_DONTROUTE;
1203		SOCK_UNLOCK(so);
1204	}
1205	clen = 0;
1206	control = NULL;
1207	top = NULL;
1208out:
1209	if (top != NULL)
1210		m_freem(top);
1211	if (control != NULL)
1212		m_freem(control);
1213	return (error);
1214}
1215
1216/*
1217 * Send on a socket.  If send must go all at once and message is larger than
1218 * send buffering, then hard error.  Lock against other senders.  If must go
1219 * all at once and not enough room now, then inform user that this would
1220 * block and do nothing.  Otherwise, if nonblocking, send as much as
1221 * possible.  The data to be sent is described by "uio" if nonzero, otherwise
1222 * by the mbuf chain "top" (which must be null if uio is not).  Data provided
1223 * in mbuf chain must be small enough to send all at once.
1224 *
1225 * Returns nonzero on error, timeout or signal; callers must check for short
1226 * counts if EINTR/ERESTART are returned.  Data and control buffers are freed
1227 * on return.
1228 */
1229int
1230sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio,
1231    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
1232{
1233	long space;
1234	ssize_t resid;
1235	int clen = 0, error, dontroute;
1236	int atomic = sosendallatonce(so) || top;
1237
1238	if (uio != NULL)
1239		resid = uio->uio_resid;
1240	else
1241		resid = top->m_pkthdr.len;
1242	/*
1243	 * In theory resid should be unsigned.  However, space must be
1244	 * signed, as it might be less than 0 if we over-committed, and we
1245	 * must use a signed comparison of space and resid.  On the other
1246	 * hand, a negative resid causes us to loop sending 0-length
1247	 * segments to the protocol.
1248	 *
1249	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
1250	 * type sockets since that's an error.
1251	 */
1252	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
1253		error = EINVAL;
1254		goto out;
1255	}
1256
1257	dontroute =
1258	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
1259	    (so->so_proto->pr_flags & PR_ATOMIC);
1260	if (td != NULL)
1261		td->td_ru.ru_msgsnd++;
1262	if (control != NULL)
1263		clen = control->m_len;
1264
1265	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1266	if (error)
1267		goto out;
1268
1269restart:
1270	do {
1271		SOCKBUF_LOCK(&so->so_snd);
1272		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
1273			SOCKBUF_UNLOCK(&so->so_snd);
1274			error = EPIPE;
1275			goto release;
1276		}
1277		if (so->so_error) {
1278			error = so->so_error;
1279			so->so_error = 0;
1280			SOCKBUF_UNLOCK(&so->so_snd);
1281			goto release;
1282		}
1283		if ((so->so_state & SS_ISCONNECTED) == 0) {
1284			/*
			 * `sendto' and `sendmsg' are allowed on a connection-
1286			 * based socket if it supports implied connect.
1287			 * Return ENOTCONN if not connected and no address is
1288			 * supplied.
1289			 */
1290			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
1291			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
1292				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1293				    !(resid == 0 && clen != 0)) {
1294					SOCKBUF_UNLOCK(&so->so_snd);
1295					error = ENOTCONN;
1296					goto release;
1297				}
1298			} else if (addr == NULL) {
1299				SOCKBUF_UNLOCK(&so->so_snd);
1300				if (so->so_proto->pr_flags & PR_CONNREQUIRED)
1301					error = ENOTCONN;
1302				else
1303					error = EDESTADDRREQ;
1304				goto release;
1305			}
1306		}
1307		space = sbspace(&so->so_snd);
1308		if (flags & MSG_OOB)
1309			space += 1024;
1310		if ((atomic && resid > so->so_snd.sb_hiwat) ||
1311		    clen > so->so_snd.sb_hiwat) {
1312			SOCKBUF_UNLOCK(&so->so_snd);
1313			error = EMSGSIZE;
1314			goto release;
1315		}
1316		if (space < resid + clen &&
1317		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
1318			if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) {
1319				SOCKBUF_UNLOCK(&so->so_snd);
1320				error = EWOULDBLOCK;
1321				goto release;
1322			}
1323			error = sbwait(&so->so_snd);
1324			SOCKBUF_UNLOCK(&so->so_snd);
1325			if (error)
1326				goto release;
1327			goto restart;
1328		}
1329		SOCKBUF_UNLOCK(&so->so_snd);
1330		space -= clen;
1331		do {
1332			if (uio == NULL) {
1333				resid = 0;
1334				if (flags & MSG_EOR)
1335					top->m_flags |= M_EOR;
1336			} else {
1337#ifdef ZERO_COPY_SOCKETS
1338				error = sosend_copyin(uio, &top, atomic,
1339				    &space, flags);
1340				if (error != 0)
1341					goto release;
1342#else
1343				/*
1344				 * Copy the data from userland into a mbuf
1345				 * chain.  If no data is to be copied in,
1346				 * a single empty mbuf is returned.
1347				 */
1348				top = m_uiotombuf(uio, M_WAITOK, space,
1349				    (atomic ? max_hdr : 0),
1350				    (atomic ? M_PKTHDR : 0) |
1351				    ((flags & MSG_EOR) ? M_EOR : 0));
1352				if (top == NULL) {
1353					error = EFAULT; /* only possible error */
1354					goto release;
1355				}
1356				space -= resid - uio->uio_resid;
1357#endif
1358				resid = uio->uio_resid;
1359			}
1360			if (dontroute) {
1361				SOCK_LOCK(so);
1362				so->so_options |= SO_DONTROUTE;
1363				SOCK_UNLOCK(so);
1364			}
1365			/*
1366			 * XXX all the SBS_CANTSENDMORE checks previously
			 * done could be out of date.  We could have received
1368			 * a reset packet in an interrupt or maybe we slept
1369			 * while doing page faults in uiomove() etc.  We
1370			 * could probably recheck again inside the locking
1371			 * protection here, but there are probably other
1372			 * places that this also happens.  We must rethink
1373			 * this.
1374			 */
1375			VNET_SO_ASSERT(so);
1376			error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1377			    (flags & MSG_OOB) ? PRUS_OOB :
1378			/*
			 * If the user set MSG_EOF, the protocol understands
			 * this flag, and there is nothing left to send, then
			 * use PRU_SEND_EOF instead of PRU_SEND.
1382			 */
1383			    ((flags & MSG_EOF) &&
1384			     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1385			     (resid <= 0)) ?
1386				PRUS_EOF :
1387			/* If there is more to send set PRUS_MORETOCOME. */
1388			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
1389			    top, addr, control, td);
1390			if (dontroute) {
1391				SOCK_LOCK(so);
1392				so->so_options &= ~SO_DONTROUTE;
1393				SOCK_UNLOCK(so);
1394			}
1395			clen = 0;
1396			control = NULL;
1397			top = NULL;
1398			if (error)
1399				goto release;
1400		} while (resid && space > 0);
1401	} while (resid);
1402
1403release:
1404	sbunlock(&so->so_snd);
1405out:
1406	if (top != NULL)
1407		m_freem(top);
1408	if (control != NULL)
1409		m_freem(control);
1410	return (error);
1411}
1412
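/*
 * sosend() is the public entry point for transmitting data on a socket.  It
 * dispatches to the protocol's pru_sosend method (typically sosend_generic()
 * or sosend_dgram()) with the socket's VNET context set.  A minimal sketch
 * of an in-kernel caller, assuming so, iov, td, and error are already set up:
 *
 *	struct uio auio;
 *
 *	auio.uio_iov = &iov;
 *	auio.uio_iovcnt = 1;
 *	auio.uio_offset = 0;
 *	auio.uio_resid = iov.iov_len;
 *	auio.uio_segflg = UIO_SYSSPACE;
 *	auio.uio_rw = UIO_WRITE;
 *	auio.uio_td = td;
 *	error = sosend(so, NULL, &auio, NULL, NULL, 0, td);
 */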
1413int
1414sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
1415    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
1416{
1417	int error;
1418
1419	CURVNET_SET(so->so_vnet);
1420	error = so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top,
1421	    control, flags, td);
1422	CURVNET_RESTORE();
1423	return (error);
1424}
1425
1426/*
1427 * The part of soreceive() that implements reading non-inline out-of-band
1428 * data from a socket.  For more complete comments, see soreceive(), from
1429 * which this code originated.
1430 *
1431 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
1432 * unable to return an mbuf chain to the caller.
1433 */
1434static int
1435soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
1436{
1437	struct protosw *pr = so->so_proto;
1438	struct mbuf *m;
1439	int error;
1440
1441	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
1442	VNET_SO_ASSERT(so);
1443
1444	m = m_get(M_WAIT, MT_DATA);
1445	error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
1446	if (error)
1447		goto bad;
1448	do {
1449#ifdef ZERO_COPY_SOCKETS
1450		if (so_zero_copy_receive) {
1451			int disposable;
1452
1453			if ((m->m_flags & M_EXT)
1454			 && (m->m_ext.ext_type == EXT_DISPOSABLE))
1455				disposable = 1;
1456			else
1457				disposable = 0;
1458
1459			error = uiomoveco(mtod(m, void *),
1460					  min(uio->uio_resid, m->m_len),
1461					  uio, disposable);
1462		} else
1463#endif /* ZERO_COPY_SOCKETS */
1464		error = uiomove(mtod(m, void *),
1465		    (int) min(uio->uio_resid, m->m_len), uio);
1466		m = m_free(m);
1467	} while (uio->uio_resid && error == 0 && m);
1468bad:
1469	if (m != NULL)
1470		m_freem(m);
1471	return (error);
1472}
1473
1474/*
1475 * Following replacement or removal of the first mbuf on the first mbuf chain
1476 * of a socket buffer, push necessary state changes back into the socket
1477 * buffer so that other consumers see the values consistently.  'nextrecord'
1478 * is the callers locally stored value of the original value of
1479 * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
1480 * NOTE: 'nextrecord' may be NULL.
1481 */
1482static __inline void
1483sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
1484{
1485
1486	SOCKBUF_LOCK_ASSERT(sb);
1487	/*
1488	 * First, update for the new value of nextrecord.  If necessary, make
1489	 * it the first record.
1490	 */
1491	if (sb->sb_mb != NULL)
1492		sb->sb_mb->m_nextpkt = nextrecord;
1493	else
1494		sb->sb_mb = nextrecord;
1495
	/*
	 * Now update any dependent socket buffer fields to reflect the new
	 * state.  This is an expanded inline of SB_EMPTY_FIXUP(), with the
	 * addition of a second clause that takes care of the case where
	 * sb_mb has been updated, but remains the last record.
	 */
	if (sb->sb_mb == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (sb->sb_mb->m_nextpkt == NULL)
		sb->sb_lastrecord = sb->sb_mb;
1507}
1508
1509
1510/*
1511 * Implement receive operations on a socket.  We depend on the way that
1512 * records are added to the sockbuf by sbappend.  In particular, each record
1513 * (mbufs linked through m_next) must begin with an address if the protocol
1514 * so specifies, followed by an optional mbuf or mbufs containing ancillary
1515 * data, and then zero or more mbufs of data.  In order to allow parallelism
1516 * between network receive and copying to user space, as well as avoid
1517 * sleeping with a mutex held, we release the socket buffer mutex during the
1518 * user space copy.  Although the sockbuf is locked, new data may still be
1519 * appended, and thus we must maintain consistency of the sockbuf during that
1520 * time.
1521 *
1522 * The caller may receive the data as a single mbuf chain by supplying an
1523 * mbuf **mp0 for use in returning the chain.  The uio is then used only for
1524 * the count in uio_resid.
1525 */
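
/*
 * For example, with a protocol that supplies addresses (PR_ADDR), a single
 * record carrying control data is laid out in the receive buffer roughly as
 * follows (an illustrative sketch of the layout described above):
 *
 *	sb_mb --> MT_SONAME --> MT_CONTROL --> MT_DATA --> ...      (m_next)
 *	             |
 *	         m_nextpkt
 *	             |
 *	             v
 *	         (first mbuf of the next record)
 */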
1526int
1527soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio,
1528    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1529{
1530	struct mbuf *m, **mp;
1531	int flags, error, offset;
1532	ssize_t len;
1533	struct protosw *pr = so->so_proto;
1534	struct mbuf *nextrecord;
1535	int moff, type = 0;
1536	ssize_t orig_resid = uio->uio_resid;
1537
1538	mp = mp0;
1539	if (psa != NULL)
1540		*psa = NULL;
1541	if (controlp != NULL)
1542		*controlp = NULL;
1543	if (flagsp != NULL)
1544		flags = *flagsp &~ MSG_EOR;
1545	else
1546		flags = 0;
1547	if (flags & MSG_OOB)
1548		return (soreceive_rcvoob(so, uio, flags));
1549	if (mp != NULL)
1550		*mp = NULL;
1551	if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING)
1552	    && uio->uio_resid) {
1553		VNET_SO_ASSERT(so);
1554		(*pr->pr_usrreqs->pru_rcvd)(so, 0);
1555	}
1556
1557	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
1558	if (error)
1559		return (error);
1560
1561restart:
1562	SOCKBUF_LOCK(&so->so_rcv);
1563	m = so->so_rcv.sb_mb;
1564	/*
1565	 * If we have less data than requested, block awaiting more (subject
1566	 * to any timeout) if:
	 *   1. the current count is less than the low water mark, and
	 *   2. MSG_DONTWAIT is not set
1569	 */
1570	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
1571	    so->so_rcv.sb_cc < uio->uio_resid) &&
1572	    so->so_rcv.sb_cc < so->so_rcv.sb_lowat &&
1573	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
1574		KASSERT(m != NULL || !so->so_rcv.sb_cc,
1575		    ("receive: m == %p so->so_rcv.sb_cc == %u",
1576		    m, so->so_rcv.sb_cc));
1577		if (so->so_error) {
1578			if (m != NULL)
1579				goto dontblock;
1580			error = so->so_error;
1581			if ((flags & MSG_PEEK) == 0)
1582				so->so_error = 0;
1583			SOCKBUF_UNLOCK(&so->so_rcv);
1584			goto release;
1585		}
1586		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1587		if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
1588			if (m == NULL) {
1589				SOCKBUF_UNLOCK(&so->so_rcv);
1590				goto release;
1591			} else
1592				goto dontblock;
1593		}
1594		for (; m != NULL; m = m->m_next)
1595			if (m->m_type == MT_OOBDATA  || (m->m_flags & M_EOR)) {
1596				m = so->so_rcv.sb_mb;
1597				goto dontblock;
1598			}
1599		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
1600		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
1601			SOCKBUF_UNLOCK(&so->so_rcv);
1602			error = ENOTCONN;
1603			goto release;
1604		}
1605		if (uio->uio_resid == 0) {
1606			SOCKBUF_UNLOCK(&so->so_rcv);
1607			goto release;
1608		}
1609		if ((so->so_state & SS_NBIO) ||
1610		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
1611			SOCKBUF_UNLOCK(&so->so_rcv);
1612			error = EWOULDBLOCK;
1613			goto release;
1614		}
1615		SBLASTRECORDCHK(&so->so_rcv);
1616		SBLASTMBUFCHK(&so->so_rcv);
1617		error = sbwait(&so->so_rcv);
1618		SOCKBUF_UNLOCK(&so->so_rcv);
1619		if (error)
1620			goto release;
1621		goto restart;
1622	}
1623dontblock:
1624	/*
1625	 * From this point onward, we maintain 'nextrecord' as a cache of the
1626	 * pointer to the next record in the socket buffer.  We must keep the
1627	 * various socket buffer pointers and local stack versions of the
1628	 * pointers in sync, pushing out modifications before dropping the
1629	 * socket buffer mutex, and re-reading them when picking it up.
1630	 *
1631	 * Otherwise, we will race with the network stack appending new data
1632	 * or records onto the socket buffer by using inconsistent/stale
1633	 * versions of the field, possibly resulting in socket buffer
1634	 * corruption.
1635	 *
1636	 * By holding the high-level sblock(), we prevent simultaneous
1637	 * readers from pulling off the front of the socket buffer.
1638	 */
1639	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1640	if (uio->uio_td)
1641		uio->uio_td->td_ru.ru_msgrcv++;
1642	KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
1643	SBLASTRECORDCHK(&so->so_rcv);
1644	SBLASTMBUFCHK(&so->so_rcv);
1645	nextrecord = m->m_nextpkt;
1646	if (pr->pr_flags & PR_ADDR) {
1647		KASSERT(m->m_type == MT_SONAME,
1648		    ("m->m_type == %d", m->m_type));
1649		orig_resid = 0;
1650		if (psa != NULL)
1651			*psa = sodupsockaddr(mtod(m, struct sockaddr *),
1652			    M_NOWAIT);
1653		if (flags & MSG_PEEK) {
1654			m = m->m_next;
1655		} else {
1656			sbfree(&so->so_rcv, m);
1657			so->so_rcv.sb_mb = m_free(m);
1658			m = so->so_rcv.sb_mb;
1659			sockbuf_pushsync(&so->so_rcv, nextrecord);
1660		}
1661	}
1662
1663	/*
1664	 * Process one or more MT_CONTROL mbufs present before any data mbufs
1665	 * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
1666	 * just copy the data; if !MSG_PEEK, we call into the protocol to
1667	 * perform externalization (or freeing if controlp == NULL).
1668	 */
1669	if (m != NULL && m->m_type == MT_CONTROL) {
1670		struct mbuf *cm = NULL, *cmn;
1671		struct mbuf **cme = &cm;
1672
1673		do {
1674			if (flags & MSG_PEEK) {
1675				if (controlp != NULL) {
1676					*controlp = m_copy(m, 0, m->m_len);
1677					controlp = &(*controlp)->m_next;
1678				}
1679				m = m->m_next;
1680			} else {
1681				sbfree(&so->so_rcv, m);
1682				so->so_rcv.sb_mb = m->m_next;
1683				m->m_next = NULL;
1684				*cme = m;
1685				cme = &(*cme)->m_next;
1686				m = so->so_rcv.sb_mb;
1687			}
1688		} while (m != NULL && m->m_type == MT_CONTROL);
1689		if ((flags & MSG_PEEK) == 0)
1690			sockbuf_pushsync(&so->so_rcv, nextrecord);
1691		while (cm != NULL) {
1692			cmn = cm->m_next;
1693			cm->m_next = NULL;
1694			if (pr->pr_domain->dom_externalize != NULL) {
1695				SOCKBUF_UNLOCK(&so->so_rcv);
1696				VNET_SO_ASSERT(so);
1697				error = (*pr->pr_domain->dom_externalize)
1698				    (cm, controlp);
1699				SOCKBUF_LOCK(&so->so_rcv);
1700			} else if (controlp != NULL)
1701				*controlp = cm;
1702			else
1703				m_freem(cm);
1704			if (controlp != NULL) {
1705				orig_resid = 0;
1706				while (*controlp != NULL)
1707					controlp = &(*controlp)->m_next;
1708			}
1709			cm = cmn;
1710		}
1711		if (m != NULL)
1712			nextrecord = so->so_rcv.sb_mb->m_nextpkt;
1713		else
1714			nextrecord = so->so_rcv.sb_mb;
1715		orig_resid = 0;
1716	}
1717	if (m != NULL) {
1718		if ((flags & MSG_PEEK) == 0) {
1719			KASSERT(m->m_nextpkt == nextrecord,
1720			    ("soreceive: post-control, nextrecord !sync"));
1721			if (nextrecord == NULL) {
1722				KASSERT(so->so_rcv.sb_mb == m,
1723				    ("soreceive: post-control, sb_mb!=m"));
1724				KASSERT(so->so_rcv.sb_lastrecord == m,
1725				    ("soreceive: post-control, lastrecord!=m"));
1726			}
1727		}
1728		type = m->m_type;
1729		if (type == MT_OOBDATA)
1730			flags |= MSG_OOB;
1731	} else {
1732		if ((flags & MSG_PEEK) == 0) {
1733			KASSERT(so->so_rcv.sb_mb == nextrecord,
1734			    ("soreceive: sb_mb != nextrecord"));
1735			if (so->so_rcv.sb_mb == NULL) {
1736				KASSERT(so->so_rcv.sb_lastrecord == NULL,
1737	    ("soreceive: sb_lastrecord != NULL"));
1738			}
1739		}
1740	}
1741	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1742	SBLASTRECORDCHK(&so->so_rcv);
1743	SBLASTMBUFCHK(&so->so_rcv);
1744
1745	/*
1746	 * Now continue to read any data mbufs off of the head of the socket
1747	 * buffer until the read request is satisfied.  Note that 'type' is
1748	 * used to store the type of any mbufs read so far, such that
1749	 * soreceive() can stop reading if the type changes, which causes
1750	 * soreceive() to return either regular data or inline out-of-band
1751	 * data, but not both, in a single socket receive operation.
1752	 */
1753	moff = 0;
1754	offset = 0;
1755	while (m != NULL && uio->uio_resid > 0 && error == 0) {
1756		/*
1757		 * If the type of mbuf has changed since the last mbuf
1758		 * examined ('type'), end the receive operation.
1759		 */
1760		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1761		if (m->m_type == MT_OOBDATA || m->m_type == MT_CONTROL) {
1762			if (type != m->m_type)
1763				break;
1764		} else if (type == MT_OOBDATA)
1765			break;
1766		else
1767		    KASSERT(m->m_type == MT_DATA,
1768			("m->m_type == %d", m->m_type));
1769		so->so_rcv.sb_state &= ~SBS_RCVATMARK;
1770		len = uio->uio_resid;
1771		if (so->so_oobmark && len > so->so_oobmark - offset)
1772			len = so->so_oobmark - offset;
1773		if (len > m->m_len - moff)
1774			len = m->m_len - moff;
1775		/*
1776		 * If mp is set, just pass back the mbufs.  Otherwise copy
1777		 * them out via the uio, then free.  The sockbuf must be
1778		 * consistent here (pointing to the current mbuf and the next
1779		 * record) when we drop the socket buffer lock; we must note
1780		 * any additions to the sockbuf when we reacquire it.
1781		 */
1782		if (mp == NULL) {
1783			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1784			SBLASTRECORDCHK(&so->so_rcv);
1785			SBLASTMBUFCHK(&so->so_rcv);
1786			SOCKBUF_UNLOCK(&so->so_rcv);
1787#ifdef ZERO_COPY_SOCKETS
1788			if (so_zero_copy_receive) {
1789				int disposable;
1790
1791				if ((m->m_flags & M_EXT)
1792				 && (m->m_ext.ext_type == EXT_DISPOSABLE))
1793					disposable = 1;
1794				else
1795					disposable = 0;
1796
1797				error = uiomoveco(mtod(m, char *) + moff,
1798						  (int)len, uio,
1799						  disposable);
1800			} else
1801#endif /* ZERO_COPY_SOCKETS */
1802			error = uiomove(mtod(m, char *) + moff, (int)len, uio);
1803			SOCKBUF_LOCK(&so->so_rcv);
1804			if (error) {
1805				/*
1806				 * The MT_SONAME mbuf has already been removed
1807				 * from the record, so it is necessary to
1808				 * remove the data mbufs, if any, to preserve
1809				 * the invariant in the case of PR_ADDR that
1810				 * requires MT_SONAME mbufs at the head of
1811				 * each record.
1812				 */
1813				if (m && pr->pr_flags & PR_ATOMIC &&
1814				    ((flags & MSG_PEEK) == 0))
1815					(void)sbdroprecord_locked(&so->so_rcv);
1816				SOCKBUF_UNLOCK(&so->so_rcv);
1817				goto release;
1818			}
1819		} else
1820			uio->uio_resid -= len;
1821		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1822		if (len == m->m_len - moff) {
1823			if (m->m_flags & M_EOR)
1824				flags |= MSG_EOR;
1825			if (flags & MSG_PEEK) {
1826				m = m->m_next;
1827				moff = 0;
1828			} else {
1829				nextrecord = m->m_nextpkt;
1830				sbfree(&so->so_rcv, m);
1831				if (mp != NULL) {
1832					*mp = m;
1833					mp = &m->m_next;
1834					so->so_rcv.sb_mb = m = m->m_next;
1835					*mp = NULL;
1836				} else {
1837					so->so_rcv.sb_mb = m_free(m);
1838					m = so->so_rcv.sb_mb;
1839				}
1840				sockbuf_pushsync(&so->so_rcv, nextrecord);
1841				SBLASTRECORDCHK(&so->so_rcv);
1842				SBLASTMBUFCHK(&so->so_rcv);
1843			}
1844		} else {
1845			if (flags & MSG_PEEK)
1846				moff += len;
1847			else {
1848				if (mp != NULL) {
1849					int copy_flag;
1850
1851					if (flags & MSG_DONTWAIT)
1852						copy_flag = M_DONTWAIT;
1853					else
1854						copy_flag = M_WAIT;
1855					if (copy_flag == M_WAIT)
1856						SOCKBUF_UNLOCK(&so->so_rcv);
1857					*mp = m_copym(m, 0, len, copy_flag);
1858					if (copy_flag == M_WAIT)
1859						SOCKBUF_LOCK(&so->so_rcv);
1860 					if (*mp == NULL) {
1861 						/*
1862 						 * m_copym() couldn't
1863						 * allocate an mbuf.  Adjust
1864						 * uio_resid back (it was
1865						 * adjusted down by len
1866						 * bytes, which we didn't end
1867						 * up "copying" over).
1868 						 */
1869 						uio->uio_resid += len;
1870 						break;
1871 					}
1872				}
1873				m->m_data += len;
1874				m->m_len -= len;
1875				so->so_rcv.sb_cc -= len;
1876			}
1877		}
1878		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1879		if (so->so_oobmark) {
1880			if ((flags & MSG_PEEK) == 0) {
1881				so->so_oobmark -= len;
1882				if (so->so_oobmark == 0) {
1883					so->so_rcv.sb_state |= SBS_RCVATMARK;
1884					break;
1885				}
1886			} else {
1887				offset += len;
1888				if (offset == so->so_oobmark)
1889					break;
1890			}
1891		}
1892		if (flags & MSG_EOR)
1893			break;
1894		/*
1895		 * If the MSG_WAITALL flag is set (for a non-atomic socket),
1896		 * we must not quit until "uio->uio_resid == 0" or the read
1897		 * terminates with an error.  If a signal/timeout occurs,
1898		 * return with a short count but without error.  Keep the
1899		 * sockbuf locked against other readers.
1900		 */
1901		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
1902		    !sosendallatonce(so) && nextrecord == NULL) {
1903			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1904			if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE)
1905				break;
1906			/*
1907			 * Notify the protocol that some data has been
1908			 * drained before blocking.
1909			 */
1910			if (pr->pr_flags & PR_WANTRCVD) {
1911				SOCKBUF_UNLOCK(&so->so_rcv);
1912				VNET_SO_ASSERT(so);
1913				(*pr->pr_usrreqs->pru_rcvd)(so, flags);
1914				SOCKBUF_LOCK(&so->so_rcv);
1915			}
1916			SBLASTRECORDCHK(&so->so_rcv);
1917			SBLASTMBUFCHK(&so->so_rcv);
1918			/*
1919			 * We could have received some data while we were
1920			 * notifying the protocol.  Skip blocking in this case.
1921			 */
1922			if (so->so_rcv.sb_mb == NULL) {
1923				error = sbwait(&so->so_rcv);
1924				if (error) {
1925					SOCKBUF_UNLOCK(&so->so_rcv);
1926					goto release;
1927				}
1928			}
1929			m = so->so_rcv.sb_mb;
1930			if (m != NULL)
1931				nextrecord = m->m_nextpkt;
1932		}
1933	}
1934
1935	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1936	if (m != NULL && pr->pr_flags & PR_ATOMIC) {
1937		flags |= MSG_TRUNC;
1938		if ((flags & MSG_PEEK) == 0)
1939			(void) sbdroprecord_locked(&so->so_rcv);
1940	}
1941	if ((flags & MSG_PEEK) == 0) {
1942		if (m == NULL) {
1943			/*
1944			 * First part is an inline SB_EMPTY_FIXUP().  Second
1945			 * part makes sure sb_lastrecord is up-to-date if
1946			 * there is still data in the socket buffer.
1947			 */
1948			so->so_rcv.sb_mb = nextrecord;
1949			if (so->so_rcv.sb_mb == NULL) {
1950				so->so_rcv.sb_mbtail = NULL;
1951				so->so_rcv.sb_lastrecord = NULL;
1952			} else if (nextrecord->m_nextpkt == NULL)
1953				so->so_rcv.sb_lastrecord = nextrecord;
1954		}
1955		SBLASTRECORDCHK(&so->so_rcv);
1956		SBLASTMBUFCHK(&so->so_rcv);
1957		/*
1958		 * If soreceive() is being done from the socket callback,
1959		 * then we don't need to generate an ACK to the peer to update
1960		 * the window, since an ACK will be generated on return to TCP.
1961		 */
1962		if (!(flags & MSG_SOCALLBCK) &&
1963		    (pr->pr_flags & PR_WANTRCVD)) {
1964			SOCKBUF_UNLOCK(&so->so_rcv);
1965			VNET_SO_ASSERT(so);
1966			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
1967			SOCKBUF_LOCK(&so->so_rcv);
1968		}
1969	}
1970	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1971	if (orig_resid == uio->uio_resid && orig_resid &&
1972	    (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
1973		SOCKBUF_UNLOCK(&so->so_rcv);
1974		goto restart;
1975	}
1976	SOCKBUF_UNLOCK(&so->so_rcv);
1977
1978	if (flagsp != NULL)
1979		*flagsp |= flags;
1980release:
1981	sbunlock(&so->so_rcv);
1982	return (error);
1983}
1984
1985/*
1986 * Optimized version of soreceive() for stream (TCP) sockets.
1987 * XXXAO: (MSG_WAITALL | MSG_PEEK) isn't properly handled.
1988 */
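/*
 * Protocols opt into this routine by pointing pru_soreceive in their
 * pr_usrreqs at soreceive_stream() instead of soreceive_generic(); the
 * soreceive() wrapper below dispatches through that pointer.  (Whether a
 * given stream protocol, e.g. TCP, actually does so may depend on a
 * tunable and on the kernel version.)
 */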
1989int
1990soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio,
1991    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1992{
1993	int len = 0, error = 0, flags, oresid;
1994	struct sockbuf *sb;
1995	struct mbuf *m, *n = NULL;
1996
1997	/* We only do stream sockets. */
1998	if (so->so_type != SOCK_STREAM)
1999		return (EINVAL);
2000	if (psa != NULL)
2001		*psa = NULL;
2002	if (controlp != NULL)
2003		return (EINVAL);
2004	if (flagsp != NULL)
2005		flags = *flagsp &~ MSG_EOR;
2006	else
2007		flags = 0;
2008	if (flags & MSG_OOB)
2009		return (soreceive_rcvoob(so, uio, flags));
2010	if (mp0 != NULL)
2011		*mp0 = NULL;
2012
2013	sb = &so->so_rcv;
2014
2015	/* Prevent other readers from entering the socket. */
2016	error = sblock(sb, SBLOCKWAIT(flags));
2017	if (error)
2018		goto out;
2019	SOCKBUF_LOCK(sb);
2020
2021	/* Easy one, no space to copyout anything. */
2022	if (uio->uio_resid == 0) {
2023		error = EINVAL;
2024		goto out;
2025	}
2026	oresid = uio->uio_resid;
2027
2028	/* We will never ever get anything unless we are or were connected. */
2029	if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
2030		error = ENOTCONN;
2031		goto out;
2032	}
2033
2034restart:
2035	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2036
2037	/* Abort if socket has reported problems. */
2038	if (so->so_error) {
2039		if (sb->sb_cc > 0)
2040			goto deliver;
2041		if (oresid > uio->uio_resid)
2042			goto out;
2043		error = so->so_error;
2044		if (!(flags & MSG_PEEK))
2045			so->so_error = 0;
2046		goto out;
2047	}
2048
2049	/* Door is closed.  Deliver what is left, if any. */
2050	if (sb->sb_state & SBS_CANTRCVMORE) {
2051		if (sb->sb_cc > 0)
2052			goto deliver;
2053		else
2054			goto out;
2055	}
2056
2057	/* Socket buffer is empty and we shall not block. */
2058	if (sb->sb_cc == 0 &&
2059	    ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
2060		error = EAGAIN;
2061		goto out;
2062	}
2063
2064	/* Socket buffer got some data that we shall deliver now. */
2065	if (sb->sb_cc > 0 && !(flags & MSG_WAITALL) &&
2066	    ((so->so_state & SS_NBIO) ||
2067	     (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
2068	     sb->sb_cc >= sb->sb_lowat ||
2069	     sb->sb_cc >= uio->uio_resid ||
2070	     sb->sb_cc >= sb->sb_hiwat)) {
2071		goto deliver;
2072	}
2073
2074	/* On MSG_WAITALL we must wait until all data or error arrives. */
2075	if ((flags & MSG_WAITALL) &&
2076	    (sb->sb_cc >= uio->uio_resid || sb->sb_cc >= sb->sb_hiwat))
2077		goto deliver;
2078
2079	/*
2080	 * Wait and block until (more) data comes in.
2081	 * NB: Drops the sockbuf lock during wait.
2082	 */
2083	error = sbwait(sb);
2084	if (error)
2085		goto out;
2086	goto restart;
2087
2088deliver:
2089	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2090	KASSERT(sb->sb_cc > 0, ("%s: sockbuf empty", __func__));
2091	KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));
2092
2093	/* Statistics. */
2094	if (uio->uio_td)
2095		uio->uio_td->td_ru.ru_msgrcv++;
2096
2097	/* Fill uio until full or current end of socket buffer is reached. */
2098	len = min(uio->uio_resid, sb->sb_cc);
2099	if (mp0 != NULL) {
2100		/* Dequeue as many mbufs as possible. */
2101		if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
2102			if (*mp0 == NULL)
2103				*mp0 = sb->sb_mb;
2104			else
2105				m_cat(*mp0, sb->sb_mb);
2106			for (m = sb->sb_mb;
2107			     m != NULL && m->m_len <= len;
2108			     m = m->m_next) {
2109				len -= m->m_len;
2110				uio->uio_resid -= m->m_len;
2111				sbfree(sb, m);
2112				n = m;
2113			}
2114			n->m_next = NULL;
2115			sb->sb_mb = m;
2116			sb->sb_lastrecord = sb->sb_mb;
2117			if (sb->sb_mb == NULL)
2118				SB_EMPTY_FIXUP(sb);
2119		}
2120		/* Copy the remainder. */
2121		if (len > 0) {
2122			KASSERT(sb->sb_mb != NULL,
2123			    ("%s: len > 0 && sb->sb_mb empty", __func__));
2124
2125			m = m_copym(sb->sb_mb, 0, len, M_DONTWAIT);
2126			if (m == NULL)
2127				len = 0;	/* Don't flush data from sockbuf. */
2128			else
2129				uio->uio_resid -= len;
2130			if (*mp0 != NULL)
2131				m_cat(*mp0, m);
2132			else
2133				*mp0 = m;
2134			if (*mp0 == NULL) {
2135				error = ENOBUFS;
2136				goto out;
2137			}
2138		}
2139	} else {
2140		/* NB: Must unlock socket buffer as uiomove may sleep. */
2141		SOCKBUF_UNLOCK(sb);
2142		error = m_mbuftouio(uio, sb->sb_mb, len);
2143		SOCKBUF_LOCK(sb);
2144		if (error)
2145			goto out;
2146	}
2147	SBLASTRECORDCHK(sb);
2148	SBLASTMBUFCHK(sb);
2149
2150	/*
2151	 * Remove the delivered data from the socket buffer unless we
2152	 * were only peeking.
2153	 */
2154	if (!(flags & MSG_PEEK)) {
2155		if (len > 0)
2156			sbdrop_locked(sb, len);
2157
2158		/* Notify protocol that we drained some data. */
2159		if ((so->so_proto->pr_flags & PR_WANTRCVD) &&
2160		    (((flags & MSG_WAITALL) && uio->uio_resid > 0) ||
2161		     !(flags & MSG_SOCALLBCK))) {
2162			SOCKBUF_UNLOCK(sb);
2163			VNET_SO_ASSERT(so);
2164			(*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags);
2165			SOCKBUF_LOCK(sb);
2166		}
2167	}
2168
2169	/*
2170	 * For MSG_WAITALL we may have to loop again and wait for
2171	 * more data to come in.
2172	 */
2173	if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
2174		goto restart;
2175out:
2176	SOCKBUF_LOCK_ASSERT(sb);
2177	SBLASTRECORDCHK(sb);
2178	SBLASTMBUFCHK(sb);
2179	SOCKBUF_UNLOCK(sb);
2180	sbunlock(sb);
2181	return (error);
2182}
2183
2184/*
2185 * Optimized version of soreceive() for simple datagram cases from userspace.
2186 * Unlike in the stream case, we're able to drop a datagram if copyout()
2187 * fails, and because we handle datagrams atomically, we don't need to use a
2188 * sleep lock to prevent I/O interlacing.
2189 */
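/*
 * As with soreceive_stream(), datagram protocols select this routine by
 * installing it as pru_soreceive in their pr_usrreqs; anything it cannot
 * handle (mbuf chains via mp0, MSG_PEEK, MSG_OOB) is punted back to
 * soreceive_generic().
 */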
2190int
2191soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio,
2192    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2193{
2194	struct mbuf *m, *m2;
2195	int flags, error;
2196	ssize_t len;
2197	struct protosw *pr = so->so_proto;
2198	struct mbuf *nextrecord;
2199
2200	if (psa != NULL)
2201		*psa = NULL;
2202	if (controlp != NULL)
2203		*controlp = NULL;
2204	if (flagsp != NULL)
2205		flags = *flagsp &~ MSG_EOR;
2206	else
2207		flags = 0;
2208
2209	/*
2210	 * For any complicated cases, fall back to the full
2211	 * soreceive_generic().
2212	 */
2213	if (mp0 != NULL || (flags & MSG_PEEK) || (flags & MSG_OOB))
2214		return (soreceive_generic(so, psa, uio, mp0, controlp,
2215		    flagsp));
2216
2217	/*
2218	 * Enforce restrictions on use.
2219	 */
2220	KASSERT((pr->pr_flags & PR_WANTRCVD) == 0,
2221	    ("soreceive_dgram: wantrcvd"));
2222	KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic"));
2223	KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0,
2224	    ("soreceive_dgram: SBS_RCVATMARK"));
2225	KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0,
2226	    ("soreceive_dgram: PR_CONNREQUIRED"));
2227
2228	/*
2229	 * Loop blocking while waiting for a datagram.
2230	 */
2231	SOCKBUF_LOCK(&so->so_rcv);
2232	while ((m = so->so_rcv.sb_mb) == NULL) {
2233		KASSERT(so->so_rcv.sb_cc == 0,
2234		    ("soreceive_dgram: sb_mb NULL but sb_cc %u",
2235		    so->so_rcv.sb_cc));
2236		if (so->so_error) {
2237			error = so->so_error;
2238			so->so_error = 0;
2239			SOCKBUF_UNLOCK(&so->so_rcv);
2240			return (error);
2241		}
2242		if (so->so_rcv.sb_state & SBS_CANTRCVMORE ||
2243		    uio->uio_resid == 0) {
2244			SOCKBUF_UNLOCK(&so->so_rcv);
2245			return (0);
2246		}
2247		if ((so->so_state & SS_NBIO) ||
2248		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
2249			SOCKBUF_UNLOCK(&so->so_rcv);
2250			return (EWOULDBLOCK);
2251		}
2252		SBLASTRECORDCHK(&so->so_rcv);
2253		SBLASTMBUFCHK(&so->so_rcv);
2254		error = sbwait(&so->so_rcv);
2255		if (error) {
2256			SOCKBUF_UNLOCK(&so->so_rcv);
2257			return (error);
2258		}
2259	}
2260	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2261
2262	if (uio->uio_td)
2263		uio->uio_td->td_ru.ru_msgrcv++;
2264	SBLASTRECORDCHK(&so->so_rcv);
2265	SBLASTMBUFCHK(&so->so_rcv);
2266	nextrecord = m->m_nextpkt;
2267	if (nextrecord == NULL) {
2268		KASSERT(so->so_rcv.sb_lastrecord == m,
2269		    ("soreceive_dgram: lastrecord != m"));
2270	}
2271
2272	KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord,
2273	    ("soreceive_dgram: m_nextpkt != nextrecord"));
2274
2275	/*
2276	 * Pull 'm' and its chain off the front of the packet queue.
2277	 */
2278	so->so_rcv.sb_mb = NULL;
2279	sockbuf_pushsync(&so->so_rcv, nextrecord);
2280
2281	/*
2282	 * Walk 'm's chain and free that many bytes from the socket buffer.
2283	 */
2284	for (m2 = m; m2 != NULL; m2 = m2->m_next)
2285		sbfree(&so->so_rcv, m2);
2286
2287	/*
2288	 * Do a few last checks before we let go of the lock.
2289	 */
2290	SBLASTRECORDCHK(&so->so_rcv);
2291	SBLASTMBUFCHK(&so->so_rcv);
2292	SOCKBUF_UNLOCK(&so->so_rcv);
2293
2294	if (pr->pr_flags & PR_ADDR) {
2295		KASSERT(m->m_type == MT_SONAME,
2296		    ("m->m_type == %d", m->m_type));
2297		if (psa != NULL)
2298			*psa = sodupsockaddr(mtod(m, struct sockaddr *),
2299			    M_NOWAIT);
2300		m = m_free(m);
2301	}
2302	if (m == NULL) {
2303		/* XXXRW: Can this happen? */
2304		return (0);
2305	}
2306
2307	/*
2308	 * Packet to copyout() is now in 'm' and it is disconnected from the
2309	 * queue.
2310	 *
2311	 * Process one or more MT_CONTROL mbufs present before any data mbufs
2312	 * in the first mbuf chain on the socket buffer.  We call into the
2313	 * protocol to perform externalization (or freeing if controlp ==
2314	 * NULL).
2315	 */
2316	if (m->m_type == MT_CONTROL) {
2317		struct mbuf *cm = NULL, *cmn;
2318		struct mbuf **cme = &cm;
2319
2320		do {
2321			m2 = m->m_next;
2322			m->m_next = NULL;
2323			*cme = m;
2324			cme = &(*cme)->m_next;
2325			m = m2;
2326		} while (m != NULL && m->m_type == MT_CONTROL);
2327		while (cm != NULL) {
2328			cmn = cm->m_next;
2329			cm->m_next = NULL;
2330			if (pr->pr_domain->dom_externalize != NULL) {
2331				error = (*pr->pr_domain->dom_externalize)
2332				    (cm, controlp);
2333			} else if (controlp != NULL)
2334				*controlp = cm;
2335			else
2336				m_freem(cm);
2337			if (controlp != NULL) {
2338				while (*controlp != NULL)
2339					controlp = &(*controlp)->m_next;
2340			}
2341			cm = cmn;
2342		}
2343	}
2344	KASSERT(m->m_type == MT_DATA, ("soreceive_dgram: !data"));
2345
2346	while (m != NULL && uio->uio_resid > 0) {
2347		len = uio->uio_resid;
2348		if (len > m->m_len)
2349			len = m->m_len;
2350		error = uiomove(mtod(m, char *), (int)len, uio);
2351		if (error) {
2352			m_freem(m);
2353			return (error);
2354		}
2355		if (len == m->m_len)
2356			m = m_free(m);
2357		else {
2358			m->m_data += len;
2359			m->m_len -= len;
2360		}
2361	}
2362	if (m != NULL)
2363		flags |= MSG_TRUNC;
2364	m_freem(m);
2365	if (flagsp != NULL)
2366		*flagsp |= flags;
2367	return (0);
2368}
2369
2370int
2371soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
2372    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2373{
2374	int error;
2375
2376	CURVNET_SET(so->so_vnet);
2377	error = (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio, mp0,
2378	    controlp, flagsp));
2379	CURVNET_RESTORE();
2380	return (error);
2381}
2382
2383int
2384soshutdown(struct socket *so, int how)
2385{
2386	struct protosw *pr = so->so_proto;
2387	int error;
2388
2389	if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
2390		return (EINVAL);
2391
2392	CURVNET_SET(so->so_vnet);
2393	if (pr->pr_usrreqs->pru_flush != NULL) {
2394	        (*pr->pr_usrreqs->pru_flush)(so, how);
2395	}
2396	if (how != SHUT_WR)
2397		sorflush(so);
2398	if (how != SHUT_RD) {
2399		error = (*pr->pr_usrreqs->pru_shutdown)(so);
2400		wakeup(&so->so_timeo);
2401		CURVNET_RESTORE();
2402		return (error);
2403	}
2404	wakeup(&so->so_timeo);
2405	CURVNET_RESTORE();
2406	return (0);
2407}
2408
2409void
2410sorflush(struct socket *so)
2411{
2412	struct sockbuf *sb = &so->so_rcv;
2413	struct protosw *pr = so->so_proto;
2414	struct sockbuf asb;
2415
2416	VNET_SO_ASSERT(so);
2417
2418	/*
2419	 * In order to avoid calling dom_dispose with the socket buffer mutex
2420	 * held, and in order to generally avoid holding the lock for a long
2421	 * time, we make a copy of the socket buffer and clear the original
2422	 * (except locks, state).  The new socket buffer copy won't have
2423	 * initialized locks so we can only call routines that won't use or
2424	 * assert those locks.
2425	 *
2426	 * Dislodge threads currently blocked in receive and wait to acquire
2427	 * a lock against other simultaneous readers before clearing the
2428	 * socket buffer.  Don't let our acquire be interrupted by a signal
2429	 * despite any existing socket disposition on interruptible waiting.
2430	 */
2431	socantrcvmore(so);
2432	(void) sblock(sb, SBL_WAIT | SBL_NOINTR);
2433
2434	/*
2435	 * Invalidate/clear most of the sockbuf structure, but leave selinfo
2436	 * and mutex data unchanged.
2437	 */
2438	SOCKBUF_LOCK(sb);
2439	bzero(&asb, offsetof(struct sockbuf, sb_startzero));
2440	bcopy(&sb->sb_startzero, &asb.sb_startzero,
2441	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
2442	bzero(&sb->sb_startzero,
2443	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
2444	SOCKBUF_UNLOCK(sb);
2445	sbunlock(sb);
2446
2447	/*
2448	 * Dispose of special rights and flush the socket buffer.  Don't call
2449	 * any unsafe routines (that rely on locks being initialized) on asb.
2450	 */
2451	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
2452		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
2453	sbrelease_internal(&asb, so);
2454}
2455
2456/*
2457 * Perhaps this routine, and sooptcopyout(), below, ought to come in an
2458 * additional variant to handle the case where the option value needs to be
2459 * some kind of integer, but not a specific size.  In addition to their use
2460 * here, these functions are also called by the protocol-level pr_ctloutput()
2461 * routines.
2462 */
2463int
2464sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
2465{
2466	size_t	valsize;
2467
2468	/*
2469	 * If the user gives us more than we wanted, we ignore it, but if we
2470	 * don't get the minimum length the caller wants, we return EINVAL.
2471	 * On success, sopt->sopt_valsize is set to however much we actually
2472	 * retrieved.
2473	 */
2474	if ((valsize = sopt->sopt_valsize) < minlen)
2475		return EINVAL;
2476	if (valsize > len)
2477		sopt->sopt_valsize = valsize = len;
2478
2479	if (sopt->sopt_td != NULL)
2480		return (copyin(sopt->sopt_val, buf, valsize));
2481
2482	bcopy(sopt->sopt_val, buf, valsize);
2483	return (0);
2484}
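
/*
 * A minimal usage sketch: a socket or protocol option handler typically
 * copies a fixed-size integer argument in with sooptcopyin() and rejects
 * short buffers, e.g.:
 *
 *	int optval;
 *
 *	error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);
 *	if (error)
 *		return (error);
 *
 * This is exactly the pattern used throughout sosetopt() below.
 */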
2485
2486/*
2487 * Kernel version of setsockopt(2).
2488 *
2489 * XXX: optlen is size_t, not socklen_t
2490 */
2491int
2492so_setsockopt(struct socket *so, int level, int optname, void *optval,
2493    size_t optlen)
2494{
2495	struct sockopt sopt;
2496
2497	sopt.sopt_level = level;
2498	sopt.sopt_name = optname;
2499	sopt.sopt_dir = SOPT_SET;
2500	sopt.sopt_val = optval;
2501	sopt.sopt_valsize = optlen;
2502	sopt.sopt_td = NULL;
2503	return (sosetopt(so, &sopt));
2504}
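
/*
 * Example (sketch): a kernel consumer that has created a socket with
 * socreate() might enable an option through this wrapper rather than
 * building a struct sockopt by hand:
 *
 *	int on = 1;
 *
 *	error = so_setsockopt(so, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on));
 *
 * sopt_td is left NULL above, so sosetopt() treats the value as a
 * kernel-space buffer (see sooptcopyin()).
 */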
2505
2506int
2507sosetopt(struct socket *so, struct sockopt *sopt)
2508{
2509	int	error, optval;
2510	struct	linger l;
2511	struct	timeval tv;
2512	u_long  val;
2513	uint32_t val32;
2514#ifdef MAC
2515	struct mac extmac;
2516#endif
2517
2518	CURVNET_SET(so->so_vnet);
2519	error = 0;
2520	if (sopt->sopt_level != SOL_SOCKET) {
2521		if (so->so_proto->pr_ctloutput != NULL) {
2522			error = (*so->so_proto->pr_ctloutput)(so, sopt);
2523			CURVNET_RESTORE();
2524			return (error);
2525		}
2526		error = ENOPROTOOPT;
2527	} else {
2528		switch (sopt->sopt_name) {
2529#ifdef INET
2530		case SO_ACCEPTFILTER:
2531			error = do_setopt_accept_filter(so, sopt);
2532			if (error)
2533				goto bad;
2534			break;
2535#endif
2536		case SO_LINGER:
2537			error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
2538			if (error)
2539				goto bad;
2540
2541			SOCK_LOCK(so);
2542			so->so_linger = l.l_linger;
2543			if (l.l_onoff)
2544				so->so_options |= SO_LINGER;
2545			else
2546				so->so_options &= ~SO_LINGER;
2547			SOCK_UNLOCK(so);
2548			break;
2549
2550		case SO_DEBUG:
2551		case SO_KEEPALIVE:
2552		case SO_DONTROUTE:
2553		case SO_USELOOPBACK:
2554		case SO_BROADCAST:
2555		case SO_REUSEADDR:
2556		case SO_REUSEPORT:
2557		case SO_OOBINLINE:
2558		case SO_TIMESTAMP:
2559		case SO_BINTIME:
2560		case SO_NOSIGPIPE:
2561		case SO_NO_DDP:
2562		case SO_NO_OFFLOAD:
2563			error = sooptcopyin(sopt, &optval, sizeof optval,
2564					    sizeof optval);
2565			if (error)
2566				goto bad;
2567			SOCK_LOCK(so);
2568			if (optval)
2569				so->so_options |= sopt->sopt_name;
2570			else
2571				so->so_options &= ~sopt->sopt_name;
2572			SOCK_UNLOCK(so);
2573			break;
2574
2575		case SO_SETFIB:
2576			error = sooptcopyin(sopt, &optval, sizeof optval,
2577					    sizeof optval);
2578			if (error)
2579				goto bad;
2580
2581			if (optval < 0 || optval >= rt_numfibs) {
2582				error = EINVAL;
2583				goto bad;
2584			}
2585			if (((so->so_proto->pr_domain->dom_family == PF_INET) ||
2586			   (so->so_proto->pr_domain->dom_family == PF_INET6) ||
2587			   (so->so_proto->pr_domain->dom_family == PF_ROUTE)))
2588				so->so_fibnum = optval;
2589			else
2590				so->so_fibnum = 0;
2591			break;
2592
2593		case SO_USER_COOKIE:
2594			error = sooptcopyin(sopt, &val32, sizeof val32,
2595					    sizeof val32);
2596			if (error)
2597				goto bad;
2598			so->so_user_cookie = val32;
2599			break;
2600
2601		case SO_SNDBUF:
2602		case SO_RCVBUF:
2603		case SO_SNDLOWAT:
2604		case SO_RCVLOWAT:
2605			error = sooptcopyin(sopt, &optval, sizeof optval,
2606					    sizeof optval);
2607			if (error)
2608				goto bad;
2609
2610			/*
2611			 * Values < 1 make no sense for any of these options,
2612			 * so disallow them.
2613			 */
2614			if (optval < 1) {
2615				error = EINVAL;
2616				goto bad;
2617			}
2618
2619			switch (sopt->sopt_name) {
2620			case SO_SNDBUF:
2621			case SO_RCVBUF:
2622				if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
2623				    &so->so_snd : &so->so_rcv, (u_long)optval,
2624				    so, curthread) == 0) {
2625					error = ENOBUFS;
2626					goto bad;
2627				}
2628				(sopt->sopt_name == SO_SNDBUF ? &so->so_snd :
2629				    &so->so_rcv)->sb_flags &= ~SB_AUTOSIZE;
2630				break;
2631
2632			/*
2633			 * Make sure the low-water is never greater than the
2634			 * high-water.
2635			 */
2636			case SO_SNDLOWAT:
2637				SOCKBUF_LOCK(&so->so_snd);
2638				so->so_snd.sb_lowat =
2639				    (optval > so->so_snd.sb_hiwat) ?
2640				    so->so_snd.sb_hiwat : optval;
2641				SOCKBUF_UNLOCK(&so->so_snd);
2642				break;
2643			case SO_RCVLOWAT:
2644				SOCKBUF_LOCK(&so->so_rcv);
2645				so->so_rcv.sb_lowat =
2646				    (optval > so->so_rcv.sb_hiwat) ?
2647				    so->so_rcv.sb_hiwat : optval;
2648				SOCKBUF_UNLOCK(&so->so_rcv);
2649				break;
2650			}
2651			break;
2652
2653		case SO_SNDTIMEO:
2654		case SO_RCVTIMEO:
2655#ifdef COMPAT_FREEBSD32
2656			if (SV_CURPROC_FLAG(SV_ILP32)) {
2657				struct timeval32 tv32;
2658
2659				error = sooptcopyin(sopt, &tv32, sizeof tv32,
2660				    sizeof tv32);
2661				CP(tv32, tv, tv_sec);
2662				CP(tv32, tv, tv_usec);
2663			} else
2664#endif
2665				error = sooptcopyin(sopt, &tv, sizeof tv,
2666				    sizeof tv);
2667			if (error)
2668				goto bad;
2669			if (tv.tv_sec < 0 || tv.tv_usec < 0 ||
2670			    tv.tv_usec >= 1000000) {
2671				error = EDOM;
2672				goto bad;
2673			}
2674			val = tvtohz(&tv);
2675
2676			switch (sopt->sopt_name) {
2677			case SO_SNDTIMEO:
2678				so->so_snd.sb_timeo = val;
2679				break;
2680			case SO_RCVTIMEO:
2681				so->so_rcv.sb_timeo = val;
2682				break;
2683			}
2684			break;
2685
2686		case SO_LABEL:
2687#ifdef MAC
2688			error = sooptcopyin(sopt, &extmac, sizeof extmac,
2689			    sizeof extmac);
2690			if (error)
2691				goto bad;
2692			error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
2693			    so, &extmac);
2694#else
2695			error = EOPNOTSUPP;
2696#endif
2697			break;
2698
2699		default:
2700			error = ENOPROTOOPT;
2701			break;
2702		}
2703		if (error == 0 && so->so_proto->pr_ctloutput != NULL)
2704			(void)(*so->so_proto->pr_ctloutput)(so, sopt);
2705	}
2706bad:
2707	CURVNET_RESTORE();
2708	return (error);
2709}
2710
2711/*
2712 * Helper routine for getsockopt.
2713 */
2714int
2715sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
2716{
2717	int	error;
2718	size_t	valsize;
2719
2720	error = 0;
2721
2722	/*
2723	 * Documented get behavior is that we always return a value, possibly
2724	 * truncated to fit in the user's buffer.  Traditional behavior is
2725	 * that we always tell the user precisely how much we copied, rather
2726	 * than something useful like the total amount we had available for
2727	 * her.  Note that this interface is not idempotent; the entire
2728	 * answer must be generated ahead of time.
2729	 */
2730	valsize = min(len, sopt->sopt_valsize);
2731	sopt->sopt_valsize = valsize;
2732	if (sopt->sopt_val != NULL) {
2733		if (sopt->sopt_td != NULL)
2734			error = copyout(buf, sopt->sopt_val, valsize);
2735		else
2736			bcopy(buf, sopt->sopt_val, valsize);
2737	}
2738	return (error);
2739}
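
/*
 * Usage sketch: option handlers return fixed-size values with
 * sooptcopyout(), which silently truncates to the caller's buffer size,
 * e.g.:
 *
 *	optval = so->so_rcv.sb_lowat;
 *	error = sooptcopyout(sopt, &optval, sizeof optval);
 *
 * This mirrors the "integer:" path in sogetopt() below.
 */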
2740
2741int
2742sogetopt(struct socket *so, struct sockopt *sopt)
2743{
2744	int	error, optval;
2745	struct	linger l;
2746	struct	timeval tv;
2747#ifdef MAC
2748	struct mac extmac;
2749#endif
2750
2751	CURVNET_SET(so->so_vnet);
2752	error = 0;
2753	if (sopt->sopt_level != SOL_SOCKET) {
2754		if (so->so_proto->pr_ctloutput != NULL)
2755			error = (*so->so_proto->pr_ctloutput)(so, sopt);
2756		else
2757			error = ENOPROTOOPT;
2758		CURVNET_RESTORE();
2759		return (error);
2760	} else {
2761		switch (sopt->sopt_name) {
2762#ifdef INET
2763		case SO_ACCEPTFILTER:
2764			error = do_getopt_accept_filter(so, sopt);
2765			break;
2766#endif
2767		case SO_LINGER:
2768			SOCK_LOCK(so);
2769			l.l_onoff = so->so_options & SO_LINGER;
2770			l.l_linger = so->so_linger;
2771			SOCK_UNLOCK(so);
2772			error = sooptcopyout(sopt, &l, sizeof l);
2773			break;
2774
2775		case SO_USELOOPBACK:
2776		case SO_DONTROUTE:
2777		case SO_DEBUG:
2778		case SO_KEEPALIVE:
2779		case SO_REUSEADDR:
2780		case SO_REUSEPORT:
2781		case SO_BROADCAST:
2782		case SO_OOBINLINE:
2783		case SO_ACCEPTCONN:
2784		case SO_TIMESTAMP:
2785		case SO_BINTIME:
2786		case SO_NOSIGPIPE:
2787			optval = so->so_options & sopt->sopt_name;
2788integer:
2789			error = sooptcopyout(sopt, &optval, sizeof optval);
2790			break;
2791
2792		case SO_TYPE:
2793			optval = so->so_type;
2794			goto integer;
2795
2796		case SO_PROTOCOL:
2797			optval = so->so_proto->pr_protocol;
2798			goto integer;
2799
2800		case SO_ERROR:
2801			SOCK_LOCK(so);
2802			optval = so->so_error;
2803			so->so_error = 0;
2804			SOCK_UNLOCK(so);
2805			goto integer;
2806
2807		case SO_SNDBUF:
2808			optval = so->so_snd.sb_hiwat;
2809			goto integer;
2810
2811		case SO_RCVBUF:
2812			optval = so->so_rcv.sb_hiwat;
2813			goto integer;
2814
2815		case SO_SNDLOWAT:
2816			optval = so->so_snd.sb_lowat;
2817			goto integer;
2818
2819		case SO_RCVLOWAT:
2820			optval = so->so_rcv.sb_lowat;
2821			goto integer;
2822
2823		case SO_SNDTIMEO:
2824		case SO_RCVTIMEO:
2825			optval = (sopt->sopt_name == SO_SNDTIMEO ?
2826				  so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
2827
2828			tv.tv_sec = optval / hz;
2829			tv.tv_usec = (optval % hz) * tick;
2830#ifdef COMPAT_FREEBSD32
2831			if (SV_CURPROC_FLAG(SV_ILP32)) {
2832				struct timeval32 tv32;
2833
2834				CP(tv, tv32, tv_sec);
2835				CP(tv, tv32, tv_usec);
2836				error = sooptcopyout(sopt, &tv32, sizeof tv32);
2837			} else
2838#endif
2839				error = sooptcopyout(sopt, &tv, sizeof tv);
2840			break;
2841
2842		case SO_LABEL:
2843#ifdef MAC
2844			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2845			    sizeof(extmac));
2846			if (error)
2847				goto bad;
2848			error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
2849			    so, &extmac);
2850			if (error)
2851				goto bad;
2852			error = sooptcopyout(sopt, &extmac, sizeof extmac);
2853#else
2854			error = EOPNOTSUPP;
2855#endif
2856			break;
2857
2858		case SO_PEERLABEL:
2859#ifdef MAC
2860			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2861			    sizeof(extmac));
2862			if (error)
2863				goto bad;
2864			error = mac_getsockopt_peerlabel(
2865			    sopt->sopt_td->td_ucred, so, &extmac);
2866			if (error)
2867				goto bad;
2868			error = sooptcopyout(sopt, &extmac, sizeof extmac);
2869#else
2870			error = EOPNOTSUPP;
2871#endif
2872			break;
2873
2874		case SO_LISTENQLIMIT:
2875			optval = so->so_qlimit;
2876			goto integer;
2877
2878		case SO_LISTENQLEN:
2879			optval = so->so_qlen;
2880			goto integer;
2881
2882		case SO_LISTENINCQLEN:
2883			optval = so->so_incqlen;
2884			goto integer;
2885
2886		default:
2887			error = ENOPROTOOPT;
2888			break;
2889		}
2890	}
2891#ifdef MAC
2892bad:
2893#endif
2894	CURVNET_RESTORE();
2895	return (error);
2896}
2897
2898/* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */
2899int
2900soopt_getm(struct sockopt *sopt, struct mbuf **mp)
2901{
2902	struct mbuf *m, *m_prev;
2903	int sopt_size = sopt->sopt_valsize;
2904
2905	MGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT, MT_DATA);
2906	if (m == NULL)
2907		return ENOBUFS;
2908	if (sopt_size > MLEN) {
2909		MCLGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT);
2910		if ((m->m_flags & M_EXT) == 0) {
2911			m_free(m);
2912			return ENOBUFS;
2913		}
2914		m->m_len = min(MCLBYTES, sopt_size);
2915	} else {
2916		m->m_len = min(MLEN, sopt_size);
2917	}
2918	sopt_size -= m->m_len;
2919	*mp = m;
2920	m_prev = m;
2921
2922	while (sopt_size) {
2923		MGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT, MT_DATA);
2924		if (m == NULL) {
2925			m_freem(*mp);
2926			return ENOBUFS;
2927		}
2928		if (sopt_size > MLEN) {
2929			MCLGET(m, sopt->sopt_td != NULL ? M_WAIT :
2930			    M_DONTWAIT);
2931			if ((m->m_flags & M_EXT) == 0) {
2932				m_freem(m);
2933				m_freem(*mp);
2934				return ENOBUFS;
2935			}
2936			m->m_len = min(MCLBYTES, sopt_size);
2937		} else {
2938			m->m_len = min(MLEN, sopt_size);
2939		}
2940		sopt_size -= m->m_len;
2941		m_prev->m_next = m;
2942		m_prev = m;
2943	}
2944	return (0);
2945}
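
/*
 * soopt_getm() only sizes and allocates the chain; a typical caller pairs
 * it with soopt_mcopyin() to populate the mbufs from the sockopt value,
 * e.g.:
 *
 *	error = soopt_getm(sopt, &m);
 *	if (error == 0)
 *		error = soopt_mcopyin(sopt, m);
 *
 * Note that soopt_mcopyin() consumes sopt_val/sopt_valsize as it copies.
 */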
2946
2947/* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
2948int
2949soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
2950{
2951	struct mbuf *m0 = m;
2952
2953	if (sopt->sopt_val == NULL)
2954		return (0);
2955	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2956		if (sopt->sopt_td != NULL) {
2957			int error;
2958
2959			error = copyin(sopt->sopt_val, mtod(m, char *),
2960				       m->m_len);
2961			if (error != 0) {
2962				m_freem(m0);
2963				return(error);
2964			}
2965		} else
2966			bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
2967		sopt->sopt_valsize -= m->m_len;
2968		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2969		m = m->m_next;
2970	}
2971	if (m != NULL) /* should have been allocated enough at ip6_sooptmcopyin() */
2972		panic("ip6_sooptmcopyin");
2973	return (0);
2974}
2975
2976/* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
2977int
2978soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
2979{
2980	struct mbuf *m0 = m;
2981	size_t valsize = 0;
2982
2983	if (sopt->sopt_val == NULL)
2984		return (0);
2985	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2986		if (sopt->sopt_td != NULL) {
2987			int error;
2988
2989			error = copyout(mtod(m, char *), sopt->sopt_val,
2990				       m->m_len);
2991			if (error != 0) {
2992				m_freem(m0);
2993				return(error);
2994			}
2995		} else
2996			bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
2997	       sopt->sopt_valsize -= m->m_len;
2998	       sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2999	       valsize += m->m_len;
3000	       m = m->m_next;
3001	}
3002	if (m != NULL) {
3003		/* a large enough soopt buffer should be provided by user-land */
3004		m_freem(m0);
3005		return(EINVAL);
3006	}
3007	sopt->sopt_valsize = valsize;
3008	return (0);
3009}
3010
3011/*
3012 * sohasoutofband(): protocol notifies socket layer of the arrival of new
3013 * out-of-band data, which will then notify socket consumers.
3014 */
3015void
3016sohasoutofband(struct socket *so)
3017{
3018
3019	if (so->so_sigio != NULL)
3020		pgsigio(&so->so_sigio, SIGURG, 0);
3021	selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
3022}
3023
3024int
3025sopoll(struct socket *so, int events, struct ucred *active_cred,
3026    struct thread *td)
3027{
3028
3029	/*
3030	 * We do not need to set or assert curvnet as long as everyone uses
3031	 * sopoll_generic().
3032	 */
3033	return (so->so_proto->pr_usrreqs->pru_sopoll(so, events, active_cred,
3034	    td));
3035}
3036
3037int
3038sopoll_generic(struct socket *so, int events, struct ucred *active_cred,
3039    struct thread *td)
3040{
3041	int revents = 0;
3042
3043	SOCKBUF_LOCK(&so->so_snd);
3044	SOCKBUF_LOCK(&so->so_rcv);
3045	if (events & (POLLIN | POLLRDNORM))
3046		if (soreadabledata(so))
3047			revents |= events & (POLLIN | POLLRDNORM);
3048
3049	if (events & (POLLOUT | POLLWRNORM))
3050		if (sowriteable(so))
3051			revents |= events & (POLLOUT | POLLWRNORM);
3052
3053	if (events & (POLLPRI | POLLRDBAND))
3054		if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK))
3055			revents |= events & (POLLPRI | POLLRDBAND);
3056
3057	if ((events & POLLINIGNEOF) == 0) {
3058		if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
3059			revents |= events & (POLLIN | POLLRDNORM);
3060			if (so->so_snd.sb_state & SBS_CANTSENDMORE)
3061				revents |= POLLHUP;
3062		}
3063	}
3064
3065	if (revents == 0) {
3066		if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
3067			selrecord(td, &so->so_rcv.sb_sel);
3068			so->so_rcv.sb_flags |= SB_SEL;
3069		}
3070
3071		if (events & (POLLOUT | POLLWRNORM)) {
3072			selrecord(td, &so->so_snd.sb_sel);
3073			so->so_snd.sb_flags |= SB_SEL;
3074		}
3075	}
3076
3077	SOCKBUF_UNLOCK(&so->so_rcv);
3078	SOCKBUF_UNLOCK(&so->so_snd);
3079	return (revents);
3080}
3081
3082int
3083soo_kqfilter(struct file *fp, struct knote *kn)
3084{
3085	struct socket *so = kn->kn_fp->f_data;
3086	struct sockbuf *sb;
3087
3088	switch (kn->kn_filter) {
3089	case EVFILT_READ:
3090		if (so->so_options & SO_ACCEPTCONN)
3091			kn->kn_fop = &solisten_filtops;
3092		else
3093			kn->kn_fop = &soread_filtops;
3094		sb = &so->so_rcv;
3095		break;
3096	case EVFILT_WRITE:
3097		kn->kn_fop = &sowrite_filtops;
3098		sb = &so->so_snd;
3099		break;
3100	default:
3101		return (EINVAL);
3102	}
3103
3104	SOCKBUF_LOCK(sb);
3105	knlist_add(&sb->sb_sel.si_note, kn, 1);
3106	sb->sb_flags |= SB_KNOTE;
3107	SOCKBUF_UNLOCK(sb);
3108	return (0);
3109}
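
/*
 * For reference, the userspace side of the above (a sketch only; the fd and
 * kqueue variable names are hypothetical): a process registers interest in
 * readability with kevent(2), which reaches soo_kqfilter() and, for a
 * non-listening socket, filt_soread() below:
 *
 *	struct kevent kev;
 *
 *	EV_SET(&kev, sockfd, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 */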
3110
3111/*
3112 * Some routines that return EOPNOTSUPP for entry points that are not
3113 * supported by a protocol.  Fill in as needed.
3114 */
3115int
3116pru_accept_notsupp(struct socket *so, struct sockaddr **nam)
3117{
3118
3119	return EOPNOTSUPP;
3120}
3121
3122int
3123pru_attach_notsupp(struct socket *so, int proto, struct thread *td)
3124{
3125
3126	return EOPNOTSUPP;
3127}
3128
3129int
3130pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
3131{
3132
3133	return EOPNOTSUPP;
3134}
3135
3136int
3137pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
3138{
3139
3140	return EOPNOTSUPP;
3141}
3142
3143int
3144pru_connect2_notsupp(struct socket *so1, struct socket *so2)
3145{
3146
3147	return EOPNOTSUPP;
3148}
3149
3150int
3151pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data,
3152    struct ifnet *ifp, struct thread *td)
3153{
3154
3155	return EOPNOTSUPP;
3156}
3157
3158int
3159pru_disconnect_notsupp(struct socket *so)
3160{
3161
3162	return EOPNOTSUPP;
3163}
3164
3165int
3166pru_listen_notsupp(struct socket *so, int backlog, struct thread *td)
3167{
3168
3169	return EOPNOTSUPP;
3170}
3171
3172int
3173pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam)
3174{
3175
3176	return EOPNOTSUPP;
3177}
3178
3179int
3180pru_rcvd_notsupp(struct socket *so, int flags)
3181{
3182
3183	return EOPNOTSUPP;
3184}
3185
3186int
3187pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags)
3188{
3189
3190	return EOPNOTSUPP;
3191}
3192
3193int
3194pru_send_notsupp(struct socket *so, int flags, struct mbuf *m,
3195    struct sockaddr *addr, struct mbuf *control, struct thread *td)
3196{
3197
3198	return EOPNOTSUPP;
3199}
3200
3201/*
3202 * This isn't really a ``null'' operation, but it's the default one and
3203 * doesn't do anything destructive.
3204 */
3205int
3206pru_sense_null(struct socket *so, struct stat *sb)
3207{
3208
3209	sb->st_blksize = so->so_snd.sb_hiwat;
3210	return 0;
3211}
3212
3213int
3214pru_shutdown_notsupp(struct socket *so)
3215{
3216
3217	return EOPNOTSUPP;
3218}
3219
3220int
3221pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam)
3222{
3223
3224	return EOPNOTSUPP;
3225}
3226
3227int
3228pru_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio,
3229    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
3230{
3231
3232	return EOPNOTSUPP;
3233}
3234
3235int
3236pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr,
3237    struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
3238{
3239
3240	return EOPNOTSUPP;
3241}
3242
3243int
3244pru_sopoll_notsupp(struct socket *so, int events, struct ucred *cred,
3245    struct thread *td)
3246{
3247
3248	return EOPNOTSUPP;
3249}
3250
3251static void
3252filt_sordetach(struct knote *kn)
3253{
3254	struct socket *so = kn->kn_fp->f_data;
3255
3256	SOCKBUF_LOCK(&so->so_rcv);
3257	knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1);
3258	if (knlist_empty(&so->so_rcv.sb_sel.si_note))
3259		so->so_rcv.sb_flags &= ~SB_KNOTE;
3260	SOCKBUF_UNLOCK(&so->so_rcv);
3261}
3262
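/*
 * EVFILT_READ filter for data sockets: kn_data reports the bytes available
 * for reading (excluding in-band control data), EV_EOF is set once the
 * receive side is shut down, and readiness honours NOTE_LOWAT or the
 * socket's receive low-water mark.
 */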
3263/*ARGSUSED*/
3264static int
3265filt_soread(struct knote *kn, long hint)
3266{
3267	struct socket *so;
3268
3269	so = kn->kn_fp->f_data;
3270	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
3271
3272	kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
3273	if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
3274		kn->kn_flags |= EV_EOF;
3275		kn->kn_fflags = so->so_error;
3276		return (1);
3277	} else if (so->so_error)	/* temporary udp error */
3278		return (1);
3279	else if (kn->kn_sfflags & NOTE_LOWAT)
3280		return (kn->kn_data >= kn->kn_sdata);
3281	else
3282		return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat);
3283}
3284
3285static void
3286filt_sowdetach(struct knote *kn)
3287{
3288	struct socket *so = kn->kn_fp->f_data;
3289
3290	SOCKBUF_LOCK(&so->so_snd);
3291	knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1);
3292	if (knlist_empty(&so->so_snd.sb_sel.si_note))
3293		so->so_snd.sb_flags &= ~SB_KNOTE;
3294	SOCKBUF_UNLOCK(&so->so_snd);
3295}
3296
3297/*ARGSUSED*/
3298static int
3299filt_sowrite(struct knote *kn, long hint)
3300{
3301	struct socket *so;
3302
3303	so = kn->kn_fp->f_data;
3304	SOCKBUF_LOCK_ASSERT(&so->so_snd);
3305	kn->kn_data = sbspace(&so->so_snd);
3306	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
3307		kn->kn_flags |= EV_EOF;
3308		kn->kn_fflags = so->so_error;
3309		return (1);
3310	} else if (so->so_error)	/* temporary udp error */
3311		return (1);
3312	else if (((so->so_state & SS_ISCONNECTED) == 0) &&
3313	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
3314		return (0);
3315	else if (kn->kn_sfflags & NOTE_LOWAT)
3316		return (kn->kn_data >= kn->kn_sdata);
3317	else
3318		return (kn->kn_data >= so->so_snd.sb_lowat);
3319}
3320
3321/*ARGSUSED*/
3322static int
3323filt_solisten(struct knote *kn, long hint)
3324{
3325	struct socket *so = kn->kn_fp->f_data;
3326
3327	kn->kn_data = so->so_qlen;
3328	return (! TAILQ_EMPTY(&so->so_comp));
3329}
3330
3331int
3332socheckuid(struct socket *so, uid_t uid)
3333{
3334
3335	if (so == NULL)
3336		return (EPERM);
3337	if (so->so_cred->cr_uid != uid)
3338		return (EPERM);
3339	return (0);
3340}
3341
3342/*
3343 * These functions are used by protocols to notify the socket layer (and its
3344 * consumers) of state changes in the sockets driven by protocol-side events.
3345 */
3346
3347/*
3348 * Procedures to manipulate state flags of socket and do appropriate wakeups.
3349 *
3350 * Normal sequence from the active (originating) side is that
3351 * soisconnecting() is called during processing of connect() call, resulting
3352 * in an eventual call to soisconnected() if/when the connection is
3353 * established.  When the connection is torn down soisdisconnecting() is
3354 * called during processing of disconnect() call, and soisdisconnected() is
3355 * called when the connection to the peer is totally severed.  The semantics
3356 * of these routines are such that connectionless protocols can call
3357 * soisconnected() and soisdisconnected() only, bypassing the in-progress
3358 * calls when setting up a ``connection'' takes no time.
3359 *
3360 * From the passive side, a socket is created with two queues of sockets:
3361 * so_incomp for connections in progress and so_comp for connections already
3362 * made and awaiting user acceptance.  As a protocol is preparing incoming
3363 * connections, it creates a socket structure queued on so_incomp by calling
3364 * sonewconn().  When the connection is established, soisconnected() is
3365 * called, and transfers the socket structure to so_comp, making it available
3366 * to accept().
3367 *
3368 * If a socket is closed with sockets on either so_incomp or so_comp, these
3369 * sockets are dropped.
3370 *
3371 * If higher-level protocols are implemented in the kernel, the wakeups done
3372 * here will sometimes cause software-interrupt process scheduling.
3373 */
3374void
3375soisconnecting(struct socket *so)
3376{
3377
3378	SOCK_LOCK(so);
3379	so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
3380	so->so_state |= SS_ISCONNECTING;
3381	SOCK_UNLOCK(so);
3382}
3383
3384void
3385soisconnected(struct socket *so)
3386{
3387	struct socket *head;
3388	int ret;
3389
3390restart:
3391	ACCEPT_LOCK();
3392	SOCK_LOCK(so);
3393	so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
3394	so->so_state |= SS_ISCONNECTED;
3395	head = so->so_head;
3396	if (head != NULL && (so->so_qstate & SQ_INCOMP)) {
3397		if ((so->so_options & SO_ACCEPTFILTER) == 0) {
3398			SOCK_UNLOCK(so);
3399			TAILQ_REMOVE(&head->so_incomp, so, so_list);
3400			head->so_incqlen--;
3401			so->so_qstate &= ~SQ_INCOMP;
3402			TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
3403			head->so_qlen++;
3404			so->so_qstate |= SQ_COMP;
3405			ACCEPT_UNLOCK();
3406			sorwakeup(head);
3407			wakeup_one(&head->so_timeo);
3408		} else {
3409			ACCEPT_UNLOCK();
3410			soupcall_set(so, SO_RCV,
3411			    head->so_accf->so_accept_filter->accf_callback,
3412			    head->so_accf->so_accept_filter_arg);
3413			so->so_options &= ~SO_ACCEPTFILTER;
3414			ret = head->so_accf->so_accept_filter->accf_callback(so,
3415			    head->so_accf->so_accept_filter_arg, M_DONTWAIT);
3416			if (ret == SU_ISCONNECTED)
3417				soupcall_clear(so, SO_RCV);
3418			SOCK_UNLOCK(so);
3419			if (ret == SU_ISCONNECTED)
3420				goto restart;
3421		}
3422		return;
3423	}
3424	SOCK_UNLOCK(so);
3425	ACCEPT_UNLOCK();
3426	wakeup(&so->so_timeo);
3427	sorwakeup(so);
3428	sowwakeup(so);
3429}
3430
3431void
3432soisdisconnecting(struct socket *so)
3433{
3434
3435	/*
3436	 * Note: This code assumes that SOCK_LOCK(so) and
3437	 * SOCKBUF_LOCK(&so->so_rcv) are the same.
3438	 */
3439	SOCKBUF_LOCK(&so->so_rcv);
3440	so->so_state &= ~SS_ISCONNECTING;
3441	so->so_state |= SS_ISDISCONNECTING;
3442	so->so_rcv.sb_state |= SBS_CANTRCVMORE;
3443	sorwakeup_locked(so);
3444	SOCKBUF_LOCK(&so->so_snd);
3445	so->so_snd.sb_state |= SBS_CANTSENDMORE;
3446	sowwakeup_locked(so);
3447	wakeup(&so->so_timeo);
3448}
3449
3450void
3451soisdisconnected(struct socket *so)
3452{
3453
3454	/*
3455	 * Note: This code assumes that SOCK_LOCK(so) and
3456	 * SOCKBUF_LOCK(&so->so_rcv) are the same.
3457	 */
3458	SOCKBUF_LOCK(&so->so_rcv);
3459	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
3460	so->so_state |= SS_ISDISCONNECTED;
3461	so->so_rcv.sb_state |= SBS_CANTRCVMORE;
3462	sorwakeup_locked(so);
3463	SOCKBUF_LOCK(&so->so_snd);
3464	so->so_snd.sb_state |= SBS_CANTSENDMORE;
3465	sbdrop_locked(&so->so_snd, so->so_snd.sb_cc);
3466	sowwakeup_locked(so);
3467	wakeup(&so->so_timeo);
3468}
3469
3470/*
3471 * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
3472 */
3473struct sockaddr *
3474sodupsockaddr(const struct sockaddr *sa, int mflags)
3475{
3476	struct sockaddr *sa2;
3477
3478	sa2 = malloc(sa->sa_len, M_SONAME, mflags);
3479	if (sa2)
3480		bcopy(sa, sa2, sa->sa_len);
3481	return sa2;
3482}
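
/*
 * Callers are responsible for releasing the copy with free(sa, M_SONAME)
 * when done with it; with M_NOWAIT the allocation may fail and NULL is
 * returned, as seen in the soreceive*() PR_ADDR paths above.
 */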
3483
3484/*
3485 * Register per-socket buffer upcalls.
3486 */
3487void
3488soupcall_set(struct socket *so, int which,
3489    int (*func)(struct socket *, void *, int), void *arg)
3490{
3491	struct sockbuf *sb;
3492
3493	switch (which) {
3494	case SO_RCV:
3495		sb = &so->so_rcv;
3496		break;
3497	case SO_SND:
3498		sb = &so->so_snd;
3499		break;
3500	default:
3501		panic("soupcall_set: bad which");
3502	}
3503	SOCKBUF_LOCK_ASSERT(sb);
3504#if 0
3505	/* XXX: accf_http actually wants to do this on purpose. */
3506	KASSERT(sb->sb_upcall == NULL, ("soupcall_set: overwriting upcall"));
3507#endif
3508	sb->sb_upcall = func;
3509	sb->sb_upcallarg = arg;
3510	sb->sb_flags |= SB_UPCALL;
3511}
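
/*
 * An upcall is a function of the form (sketch; the name is hypothetical):
 *
 *	int
 *	my_upcall(struct socket *so, void *arg, int waitflag)
 *	{
 *		...
 *		return (SU_OK);
 *	}
 *
 * It is typically invoked from sowakeup() with the socket buffer lock held
 * when the buffer changes state; returning SU_ISCONNECTED asks the caller
 * to complete the accept-filter style connection hand-off (see the
 * SU_ISCONNECTED handling in soisconnected() above).
 */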
3512
3513void
3514soupcall_clear(struct socket *so, int which)
3515{
3516	struct sockbuf *sb;
3517
3518	switch (which) {
3519	case SO_RCV:
3520		sb = &so->so_rcv;
3521		break;
3522	case SO_SND:
3523		sb = &so->so_snd;
3524		break;
3525	default:
3526		panic("soupcall_clear: bad which");
3527	}
3528	SOCKBUF_LOCK_ASSERT(sb);
3529	KASSERT(sb->sb_upcall != NULL, ("soupcall_clear: no upcall to clear"));
3530	sb->sb_upcall = NULL;
3531	sb->sb_upcallarg = NULL;
3532	sb->sb_flags &= ~SB_UPCALL;
3533}
3534
3535/*
3536 * Create an external-format (``xsocket'') structure using the information in
3537 * the kernel-format socket structure pointed to by so.  This is done to
3538 * reduce the spew of irrelevant information over this interface, to isolate
3539 * user code from changes in the kernel structure, and potentially to provide
3540 * information-hiding if we decide that some of this information should be
3541 * hidden from users.
3542 */
3543void
3544sotoxsocket(struct socket *so, struct xsocket *xso)
3545{
3546
3547	xso->xso_len = sizeof *xso;
3548	xso->xso_so = so;
3549	xso->so_type = so->so_type;
3550	xso->so_options = so->so_options;
3551	xso->so_linger = so->so_linger;
3552	xso->so_state = so->so_state;
3553	xso->so_pcb = so->so_pcb;
3554	xso->xso_protocol = so->so_proto->pr_protocol;
3555	xso->xso_family = so->so_proto->pr_domain->dom_family;
3556	xso->so_qlen = so->so_qlen;
3557	xso->so_incqlen = so->so_incqlen;
3558	xso->so_qlimit = so->so_qlimit;
3559	xso->so_timeo = so->so_timeo;
3560	xso->so_error = so->so_error;
3561	xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0;
3562	xso->so_oobmark = so->so_oobmark;
3563	sbtoxsockbuf(&so->so_snd, &xso->so_snd);
3564	sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
3565	xso->so_uid = so->so_cred->cr_uid;
3566}
3567
3568
3569/*
3570 * Socket accessor functions to provide external consumers with
3571 * a safe interface to socket state.
3572 */
3574
3575void
3576so_listeners_apply_all(struct socket *so, void (*func)(struct socket *, void *), void *arg)
3577{
3578
3579	TAILQ_FOREACH(so, &so->so_comp, so_list)
3580		func(so, arg);
3581}
3582
3583struct sockbuf *
3584so_sockbuf_rcv(struct socket *so)
3585{
3586
3587	return (&so->so_rcv);
3588}
3589
3590struct sockbuf *
3591so_sockbuf_snd(struct socket *so)
3592{
3593
3594	return (&so->so_snd);
3595}
3596
3597int
3598so_state_get(const struct socket *so)
3599{
3600
3601	return (so->so_state);
3602}
3603
3604void
3605so_state_set(struct socket *so, int val)
3606{
3607
3608	so->so_state = val;
3609}
3610
3611int
3612so_options_get(const struct socket *so)
3613{
3614
3615	return (so->so_options);
3616}
3617
3618void
3619so_options_set(struct socket *so, int val)
3620{
3621
3622	so->so_options = val;
3623}
3624
3625int
3626so_error_get(const struct socket *so)
3627{
3628
3629	return (so->so_error);
3630}
3631
3632void
3633so_error_set(struct socket *so, int val)
3634{
3635
3636	so->so_error = val;
3637}
3638
3639int
3640so_linger_get(const struct socket *so)
3641{
3642
3643	return (so->so_linger);
3644}
3645
3646void
3647so_linger_set(struct socket *so, int val)
3648{
3649
3650	so->so_linger = val;
3651}
3652
3653struct protosw *
3654so_protosw_get(const struct socket *so)
3655{
3656
3657	return (so->so_proto);
3658}
3659
3660void
3661so_protosw_set(struct socket *so, struct protosw *val)
3662{
3663
3664	so->so_proto = val;
3665}
3666
3667void
3668so_sorwakeup(struct socket *so)
3669{
3670
3671	sorwakeup(so);
3672}
3673
3674void
3675so_sowwakeup(struct socket *so)
3676{
3677
3678	sowwakeup(so);
3679}
3680
3681void
3682so_sorwakeup_locked(struct socket *so)
3683{
3684
3685	sorwakeup_locked(so);
3686}
3687
3688void
3689so_sowwakeup_locked(struct socket *so)
3690{
3691
3692	sowwakeup_locked(so);
3693}
3694
3695void
3696so_lock(struct socket *so)
3697{
3698	SOCK_LOCK(so);
3699}
3700
3701void
3702so_unlock(struct socket *so)
3703{
3704	SOCK_UNLOCK(so);
3705}
3706