uipc_socket.c revision 243994
1/*-
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993
3 *	The Regents of the University of California.
4 * Copyright (c) 2004 The FreeBSD Foundation
5 * Copyright (c) 2004-2008 Robert N. M. Watson
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 * 4. Neither the name of the University nor the names of its contributors
17 *    may be used to endorse or promote products derived from this software
18 *    without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 *
32 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
33 */
34
35/*
36 * Comments on the socket life cycle:
37 *
38 * soalloc() sets up socket layer state for a socket, called only by
39 * socreate() and sonewconn().  Socket layer private.
40 *
41 * sodealloc() tears down socket layer state for a socket, called only by
42 * sofree() and sonewconn().  Socket layer private.
43 *
44 * pru_attach() associates protocol layer state with an allocated socket;
45 * called only once, may fail, aborting socket allocation.  This is called
46 * from socreate() and sonewconn().  Socket layer private.
47 *
48 * pru_detach() disassociates protocol layer state from an attached socket,
49 * and will be called exactly once for sockets in which pru_attach() has
50 * been successfully called.  If pru_attach() returned an error,
51 * pru_detach() will not be called.  Socket layer private.
52 *
53 * pru_abort() and pru_close() notify the protocol layer that the last
54 * consumer of a socket is starting to tear down the socket, and that the
55 * protocol should terminate the connection.  Historically, pru_abort() also
56 * detached protocol state from the socket state, but this is no longer the
57 * case.
58 *
59 * socreate() creates a socket and attaches protocol state.  This is a public
60 * interface that may be used by socket layer consumers to create new
61 * sockets.
62 *
63 * sonewconn() creates a socket and attaches protocol state.  This is a
64 * public interface that may be used by protocols to create new sockets when
65 * a new connection is received and will be available for accept() on a
66 * listen socket.
67 *
68 * soclose() destroys a socket after possibly waiting for it to disconnect.
69 * This is a public interface that socket consumers should use to close and
70 * release a socket when done with it.
71 *
72 * soabort() destroys a socket without waiting for it to disconnect (used
73 * only for incoming connections that are already partially or fully
74 * connected).  This is used internally by the socket layer when clearing
75 * listen socket queues (due to overflow or close on the listen socket), but
76 * is also a public interface protocols may use to abort connections in
77 * their incomplete listen queues should they no longer be required.  Sockets
78 * placed in completed connection listen queues should not be aborted for
79 * reasons described in the comment above the soclose() implementation.  This
80 * is not a general purpose close routine, and except in the specific
81 * circumstances described here, should not be used.
82 *
83 * sofree() will free a socket and its protocol state if all references on
84 * the socket have been released, and is called to attempt to free a
85 * socket whenever a reference is removed.  This is a socket layer private
86 * interface.
87 *
88 * NOTE: In addition to socreate() and soclose(), which provide a single
89 * socket reference to the consumer to be managed as required, there are two
90 * calls to explicitly manage socket references, soref(), and sorele().
91 * Currently, these are generally required only when transitioning a socket
92 * from a listen queue to a file descriptor, in order to prevent garbage
93 * collection of the socket at an untimely moment.  For a number of reasons,
94 * these interfaces are not preferred, and should be avoided.
95 *
96 * NOTE: With regard to VNETs the general rule is that callers do not set
97 * curvnet. Exceptions to this rule include soabort(), sodisconnect(),
98 * sofree() (and with that sorele(), sotryfree()), as well as sonewconn()
99 * and sorflush(), which are usually called from a pre-set VNET context.
100 * sopoll() currently does not need a VNET context to be set.
101 */
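/*
 * Example (sketch): a typical in-kernel consumer creates a socket with
 * socreate() and releases it with soclose().  The address family, type and
 * protocol below are illustrative only; the credential and thread arguments
 * are assumed to come from the caller's context via curthread.
 *
 *	struct socket *so;
 *	int error;
 *
 *	error = socreate(AF_INET, &so, SOCK_STREAM, IPPROTO_TCP,
 *	    curthread->td_ucred, curthread);
 *	if (error != 0)
 *		return (error);
 *	... sobind(), soconnect(), sosend(), soreceive() as needed ...
 *	soclose(so);
 */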
102
103#include <sys/cdefs.h>
104__FBSDID("$FreeBSD: head/sys/kern/uipc_socket.c 243994 2012-12-07 22:15:51Z pjd $");
105
106#include "opt_inet.h"
107#include "opt_inet6.h"
108#include "opt_zero.h"
109#include "opt_compat.h"
110
111#include <sys/param.h>
112#include <sys/systm.h>
113#include <sys/fcntl.h>
114#include <sys/limits.h>
115#include <sys/lock.h>
116#include <sys/mac.h>
117#include <sys/malloc.h>
118#include <sys/mbuf.h>
119#include <sys/mutex.h>
120#include <sys/domain.h>
121#include <sys/file.h>			/* for struct knote */
122#include <sys/kernel.h>
123#include <sys/event.h>
124#include <sys/eventhandler.h>
125#include <sys/poll.h>
126#include <sys/proc.h>
127#include <sys/protosw.h>
128#include <sys/socket.h>
129#include <sys/socketvar.h>
130#include <sys/resourcevar.h>
131#include <net/route.h>
132#include <sys/signalvar.h>
133#include <sys/stat.h>
134#include <sys/sx.h>
135#include <sys/sysctl.h>
136#include <sys/uio.h>
137#include <sys/jail.h>
138#include <sys/syslog.h>
139
140#include <net/vnet.h>
141
142#include <security/mac/mac_framework.h>
143
144#include <vm/uma.h>
145
146#ifdef COMPAT_FREEBSD32
147#include <sys/mount.h>
148#include <sys/sysent.h>
149#include <compat/freebsd32/freebsd32.h>
150#endif
151
152static int	soreceive_rcvoob(struct socket *so, struct uio *uio,
153		    int flags);
154
155static void	filt_sordetach(struct knote *kn);
156static int	filt_soread(struct knote *kn, long hint);
157static void	filt_sowdetach(struct knote *kn);
158static int	filt_sowrite(struct knote *kn, long hint);
159static int	filt_solisten(struct knote *kn, long hint);
160
161static struct filterops solisten_filtops = {
162	.f_isfd = 1,
163	.f_detach = filt_sordetach,
164	.f_event = filt_solisten,
165};
166static struct filterops soread_filtops = {
167	.f_isfd = 1,
168	.f_detach = filt_sordetach,
169	.f_event = filt_soread,
170};
171static struct filterops sowrite_filtops = {
172	.f_isfd = 1,
173	.f_detach = filt_sowdetach,
174	.f_event = filt_sowrite,
175};
176
177so_gen_t	so_gencnt;	/* generation count for sockets */
178
179MALLOC_DEFINE(M_SONAME, "soname", "socket name");
180MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
181
182#define	VNET_SO_ASSERT(so)						\
183	VNET_ASSERT(curvnet != NULL,					\
184	    ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so)));
185
186/*
187 * Limit on the number of connections in the listen queue waiting
188 * for accept(2).
189 * NB: The original sysctl somaxconn is still available but hidden
190 * to prevent confusion about the actual purpose of this number.
191 */
192static int somaxconn = SOMAXCONN;
193
194static int
195sysctl_somaxconn(SYSCTL_HANDLER_ARGS)
196{
197	int error;
198	int val;
199
200	val = somaxconn;
201	error = sysctl_handle_int(oidp, &val, 0, req);
202	if (error || !req->newptr)
203		return (error);
204
205	if (val < 1 || val > USHRT_MAX)
206		return (EINVAL);
207
208	somaxconn = val;
209	return (0);
210}
211SYSCTL_PROC(_kern_ipc, OID_AUTO, soacceptqueue, CTLTYPE_UINT | CTLFLAG_RW,
212    0, sizeof(int), sysctl_somaxconn, "I",
213    "Maximum listen socket pending connection accept queue size");
214SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
215    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_SKIP,
216    0, sizeof(int), sysctl_somaxconn, "I",
217    "Maximum listen socket pending connection accept queue size (compat)");
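/*
 * Usage note (sketch): kern.ipc.soacceptqueue is the name intended for
 * administrators, e.g. "sysctl kern.ipc.soacceptqueue=256".  The legacy
 * kern.ipc.somaxconn OID registered above still accepts the same values but
 * is hidden from sysctl listings by CTLFLAG_SKIP.
 */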
218
219static int numopensockets;
220SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
221    &numopensockets, 0, "Number of open sockets");
222
223#if defined(SOCKET_SEND_COW) || defined(SOCKET_RECV_PFLIP)
224SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0,
225    "Zero copy controls");
226#ifdef SOCKET_RECV_PFLIP
227int so_zero_copy_receive = 1;
228SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW,
229    &so_zero_copy_receive, 0, "Enable zero copy receive");
230#endif
231#ifdef SOCKET_SEND_COW
232int so_zero_copy_send = 1;
233SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW,
234    &so_zero_copy_send, 0, "Enable zero copy send");
235#endif /* SOCKET_SEND_COW */
236#endif /* SOCKET_SEND_COW || SOCKET_RECV_PFLIP */
237
238/*
239 * accept_mtx locks down per-socket fields relating to accept queues.  See
240 * socketvar.h for an annotation of the protected fields of struct socket.
241 */
242struct mtx accept_mtx;
243MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);
244
245/*
246 * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
247 * so_gencnt field.
248 */
249static struct mtx so_global_mtx;
250MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF);
251
252/*
253 * General IPC sysctl name space, used by sockets and a variety of other IPC
254 * types.
255 */
256SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");
257
258/*
259 * Initialize the socket subsystem and set up the socket
260 * memory allocator.
261 */
262static uma_zone_t socket_zone;
263int	maxsockets;
264
265static void
266socket_zone_change(void *tag)
267{
268
269	maxsockets = uma_zone_set_max(socket_zone, maxsockets);
270}
271
272static void
273socket_init(void *tag)
274{
275
276	socket_zone = uma_zcreate("socket", sizeof(struct socket), NULL, NULL,
277	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
278	maxsockets = uma_zone_set_max(socket_zone, maxsockets);
279	EVENTHANDLER_REGISTER(maxsockets_change, socket_zone_change, NULL,
280	    EVENTHANDLER_PRI_FIRST);
281}
282SYSINIT(socket, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_init, NULL);
283
284/*
285 * Initialise maxsockets.  This SYSINIT must be run after
286 * tunable_mbinit().
287 */
288static void
289init_maxsockets(void *ignored)
290{
291
292	TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
293	maxsockets = imax(maxsockets, maxfiles);
294}
295SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);
296
297/*
298 * Sysctl to get and set the maximum global sockets limit.  Notify protocols
299 * of the change so that they can update their dependent limits as required.
300 */
301static int
302sysctl_maxsockets(SYSCTL_HANDLER_ARGS)
303{
304	int error, newmaxsockets;
305
306	newmaxsockets = maxsockets;
307	error = sysctl_handle_int(oidp, &newmaxsockets, 0, req);
308	if (error == 0 && req->newptr) {
309		if (newmaxsockets > maxsockets &&
310		    newmaxsockets <= maxfiles) {
311			maxsockets = newmaxsockets;
312			EVENTHANDLER_INVOKE(maxsockets_change);
313		} else
314			error = EINVAL;
315	}
316	return (error);
317}
318SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT|CTLFLAG_RW,
319    &maxsockets, 0, sysctl_maxsockets, "IU",
320    "Maximum number of sockets available");
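/*
 * Usage note (sketch): maxsockets may be seeded at boot via the
 * kern.ipc.maxsockets loader tunable fetched above, or raised at runtime
 * with "sysctl kern.ipc.maxsockets=<n>"; the handler only accepts increases
 * up to maxfiles and notifies protocols through the maxsockets_change
 * eventhandler.
 */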
321
322/*
323 * Socket operation routines.  These routines are called by the routines in
324 * sys_socket.c or from a system process, and implement the semantics of
325 * socket operations by switching out to the protocol specific routines.
326 */
327
328/*
329 * Get a socket structure from our zone, and initialize it.  Note that it
330 * would probably be better to allocate socket and PCB at the same time, but
331 * I'm not convinced that all the protocols can be easily modified to do
332 * this.
333 *
334 * soalloc() returns a socket with a ref count of 0.
335 */
336static struct socket *
337soalloc(struct vnet *vnet)
338{
339	struct socket *so;
340
341	so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO);
342	if (so == NULL)
343		return (NULL);
344#ifdef MAC
345	if (mac_socket_init(so, M_NOWAIT) != 0) {
346		uma_zfree(socket_zone, so);
347		return (NULL);
348	}
349#endif
350	SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
351	SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
352	sx_init(&so->so_snd.sb_sx, "so_snd_sx");
353	sx_init(&so->so_rcv.sb_sx, "so_rcv_sx");
354	TAILQ_INIT(&so->so_aiojobq);
355	mtx_lock(&so_global_mtx);
356	so->so_gencnt = ++so_gencnt;
357	++numopensockets;
358#ifdef VIMAGE
359	VNET_ASSERT(vnet != NULL, ("%s:%d vnet is NULL, so=%p",
360	    __func__, __LINE__, so));
361	vnet->vnet_sockcnt++;
362	so->so_vnet = vnet;
363#endif
364	mtx_unlock(&so_global_mtx);
365	return (so);
366}
367
368/*
369 * Free the storage associated with a socket at the socket layer, tear down
370 * locks, labels, etc.  All protocol state is assumed already to have been
371 * torn down (and possibly never set up) by the caller.
372 */
373static void
374sodealloc(struct socket *so)
375{
376
377	KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
378	KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL"));
379
380	mtx_lock(&so_global_mtx);
381	so->so_gencnt = ++so_gencnt;
382	--numopensockets;	/* Could be below, but faster here. */
383#ifdef VIMAGE
384	VNET_ASSERT(so->so_vnet != NULL, ("%s:%d so_vnet is NULL, so=%p",
385	    __func__, __LINE__, so));
386	so->so_vnet->vnet_sockcnt--;
387#endif
388	mtx_unlock(&so_global_mtx);
389	if (so->so_rcv.sb_hiwat)
390		(void)chgsbsize(so->so_cred->cr_uidinfo,
391		    &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
392	if (so->so_snd.sb_hiwat)
393		(void)chgsbsize(so->so_cred->cr_uidinfo,
394		    &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
395#ifdef INET
396	/* remove accept filter if one is present. */
397	if (so->so_accf != NULL)
398		do_setopt_accept_filter(so, NULL);
399#endif
400#ifdef MAC
401	mac_socket_destroy(so);
402#endif
403	crfree(so->so_cred);
404	sx_destroy(&so->so_snd.sb_sx);
405	sx_destroy(&so->so_rcv.sb_sx);
406	SOCKBUF_LOCK_DESTROY(&so->so_snd);
407	SOCKBUF_LOCK_DESTROY(&so->so_rcv);
408	uma_zfree(socket_zone, so);
409}
410
411/*
412 * socreate returns a socket with a ref count of 1.  The socket should be
413 * closed with soclose().
414 */
415int
416socreate(int dom, struct socket **aso, int type, int proto,
417    struct ucred *cred, struct thread *td)
418{
419	struct protosw *prp;
420	struct socket *so;
421	int error;
422
423	if (proto)
424		prp = pffindproto(dom, proto, type);
425	else
426		prp = pffindtype(dom, type);
427
428	if (prp == NULL) {
429		/* No support for domain. */
430		if (pffinddomain(dom) == NULL)
431			return (EAFNOSUPPORT);
432		/* No support for socket type. */
433		if (proto == 0 && type != 0)
434			return (EPROTOTYPE);
435		return (EPROTONOSUPPORT);
436	}
437	if (prp->pr_usrreqs->pru_attach == NULL ||
438	    prp->pr_usrreqs->pru_attach == pru_attach_notsupp)
439		return (EPROTONOSUPPORT);
440
441	if (prison_check_af(cred, prp->pr_domain->dom_family) != 0)
442		return (EPROTONOSUPPORT);
443
444	if (prp->pr_type != type)
445		return (EPROTOTYPE);
446	so = soalloc(CRED_TO_VNET(cred));
447	if (so == NULL)
448		return (ENOBUFS);
449
450	TAILQ_INIT(&so->so_incomp);
451	TAILQ_INIT(&so->so_comp);
452	so->so_type = type;
453	so->so_cred = crhold(cred);
454	if ((prp->pr_domain->dom_family == PF_INET) ||
455	    (prp->pr_domain->dom_family == PF_INET6) ||
456	    (prp->pr_domain->dom_family == PF_ROUTE))
457		so->so_fibnum = td->td_proc->p_fibnum;
458	else
459		so->so_fibnum = 0;
460	so->so_proto = prp;
461#ifdef MAC
462	mac_socket_create(cred, so);
463#endif
464	knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv));
465	knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd));
466	so->so_count = 1;
467	/*
468	 * Auto-sizing of socket buffers is managed by the protocols and
469	 * the appropriate flags must be set in the pru_attach function.
470	 */
471	CURVNET_SET(so->so_vnet);
472	error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
473	CURVNET_RESTORE();
474	if (error) {
475		KASSERT(so->so_count == 1, ("socreate: so_count %d",
476		    so->so_count));
477		so->so_count = 0;
478		sodealloc(so);
479		return (error);
480	}
481	*aso = so;
482	return (0);
483}
484
485#ifdef REGRESSION
486static int regression_sonewconn_earlytest = 1;
487SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW,
488    &regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test");
489#endif
490
491/*
492 * When an attempt at a new connection is noted on a socket which accepts
493 * connections, sonewconn is called.  If the connection is possible (subject
494 * to space constraints, etc.) then we allocate a new structure, properly
495 * linked into the data structure of the original socket, and return this.
496 * Connstatus may be 0, or SS_ISCONFIRMING, or SS_ISCONNECTED.
497 *
498 * Note: the ref count on the socket is 0 on return.
499 */
500struct socket *
501sonewconn(struct socket *head, int connstatus)
502{
503	struct socket *so;
504	int over;
505
506	ACCEPT_LOCK();
507	over = (head->so_qlen > 3 * head->so_qlimit / 2);
508	ACCEPT_UNLOCK();
509#ifdef REGRESSION
510	if (regression_sonewconn_earlytest && over) {
511#else
512	if (over) {
513#endif
514		log(LOG_DEBUG, "%s: pcb %p: Listen queue overflow: "
515		    "%i already in queue awaiting acceptance\n",
516		    __func__, head->so_pcb, head->so_qlen);
517		return (NULL);
518	}
519	VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p",
520	    __func__, __LINE__, head));
521	so = soalloc(head->so_vnet);
522	if (so == NULL) {
523		log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: "
524		    "limit reached or out of memory\n",
525		    __func__, head->so_pcb);
526		return (NULL);
527	}
528	if ((head->so_options & SO_ACCEPTFILTER) != 0)
529		connstatus = 0;
530	so->so_head = head;
531	so->so_type = head->so_type;
532	so->so_options = head->so_options &~ SO_ACCEPTCONN;
533	so->so_linger = head->so_linger;
534	so->so_state = head->so_state | SS_NOFDREF;
535	so->so_fibnum = head->so_fibnum;
536	so->so_proto = head->so_proto;
537	so->so_cred = crhold(head->so_cred);
538#ifdef MAC
539	mac_socket_newconn(head, so);
540#endif
541	knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv));
542	knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd));
543	VNET_SO_ASSERT(head);
544	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
545		sodealloc(so);
546		log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n",
547		    __func__, head->so_pcb);
548		return (NULL);
549	}
550	if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
551		sodealloc(so);
552		log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n",
553		    __func__, head->so_pcb);
554		return (NULL);
555	}
556	so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
557	so->so_snd.sb_lowat = head->so_snd.sb_lowat;
558	so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
559	so->so_snd.sb_timeo = head->so_snd.sb_timeo;
560	so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
561	so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
562	so->so_state |= connstatus;
563	ACCEPT_LOCK();
564	/*
565	 * The accept socket may be tearing down but we just
566	 * won a race on the ACCEPT_LOCK.
567	 */
568	if (!(head->so_options & SO_ACCEPTCONN)) {
569		SOCK_LOCK(so);
570		so->so_head = NULL;
571		sofree(so);		/* NB: returns ACCEPT_UNLOCK'ed. */
572		return (NULL);
573	}
574	if (connstatus) {
575		TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
576		so->so_qstate |= SQ_COMP;
577		head->so_qlen++;
578	} else {
579		/*
580		 * Keep removing sockets from the head until there's room for
581		 * us to insert on the tail.  In pre-locking revisions, this
582		 * was a simple if(), but as we could be racing with other
583		 * threads and soabort() requires dropping locks, we must
584		 * loop waiting for the condition to be true.
585		 */
586		while (head->so_incqlen > head->so_qlimit) {
587			struct socket *sp;
588			sp = TAILQ_FIRST(&head->so_incomp);
589			TAILQ_REMOVE(&head->so_incomp, sp, so_list);
590			head->so_incqlen--;
591			sp->so_qstate &= ~SQ_INCOMP;
592			sp->so_head = NULL;
593			ACCEPT_UNLOCK();
594			soabort(sp);
595			ACCEPT_LOCK();
596		}
597		TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
598		so->so_qstate |= SQ_INCOMP;
599		head->so_incqlen++;
600	}
601	ACCEPT_UNLOCK();
602	if (connstatus) {
603		sorwakeup(head);
604		wakeup_one(&head->so_timeo);
605	}
606	return (so);
607}
608
609int
610sobind(struct socket *so, struct sockaddr *nam, struct thread *td)
611{
612	int error;
613
614	CURVNET_SET(so->so_vnet);
615	error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td);
616	CURVNET_RESTORE();
617	return error;
618}
619
620/*
621 * solisten() transitions a socket from a non-listening state to a listening
622 * state, but can also be used to update the listen queue depth on an
623 * existing listen socket.  The protocol will call back into the sockets
624 * layer using solisten_proto_check() and solisten_proto() to check and set
625 * socket-layer listen state.  Call backs are used so that the protocol can
626 * acquire both protocol and socket layer locks in whatever order is required
627 * by the protocol.
628 *
629 * Protocol implementors are advised to hold the socket lock across the
630 * socket-layer test and set to avoid races at the socket layer.
631 */
632int
633solisten(struct socket *so, int backlog, struct thread *td)
634{
635	int error;
636
637	CURVNET_SET(so->so_vnet);
638	error = (*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td);
639	CURVNET_RESTORE();
640	return error;
641}
642
643int
644solisten_proto_check(struct socket *so)
645{
646
647	SOCK_LOCK_ASSERT(so);
648
649	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
650	    SS_ISDISCONNECTING))
651		return (EINVAL);
652	return (0);
653}
654
655void
656solisten_proto(struct socket *so, int backlog)
657{
658
659	SOCK_LOCK_ASSERT(so);
660
661	if (backlog < 0 || backlog > somaxconn)
662		backlog = somaxconn;
663	so->so_qlimit = backlog;
664	so->so_options |= SO_ACCEPTCONN;
665}
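/*
 * Example (sketch): the callback pattern described above solisten(), shown
 * as a hypothetical protocol's pru_listen implementation.  The "xxx" names
 * are placeholders; real protocols typically also acquire their own pcb
 * locks around this sequence.
 *
 *	static int
 *	xxx_listen(struct socket *so, int backlog, struct thread *td)
 *	{
 *		int error;
 *
 *		SOCK_LOCK(so);
 *		error = solisten_proto_check(so);
 *		if (error == 0)
 *			solisten_proto(so, backlog);
 *		SOCK_UNLOCK(so);
 *		return (error);
 *	}
 */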
666
667/*
668 * Evaluate the reference count and named references on a socket; if no
669 * references remain, free it.  This should be called whenever a reference is
670 * released, such as in sorele(), but also when named reference flags are
671 * cleared in socket or protocol code.
672 *
673 * sofree() will free the socket if:
674 *
675 * - There are no outstanding file descriptor references or related consumers
676 *   (so_count == 0).
677 *
678 * - The socket has been closed by user space, if ever open (SS_NOFDREF).
679 *
680 * - The protocol does not have an outstanding strong reference on the socket
681 *   (SS_PROTOREF).
682 *
683 * - The socket is not in a completed connection queue, where a process may
684 *   already have been notified that it is present.  If it were removed, the
685 *   user process could block in accept() despite select() saying it was ready.
686 */
687void
688sofree(struct socket *so)
689{
690	struct protosw *pr = so->so_proto;
691	struct socket *head;
692
693	ACCEPT_LOCK_ASSERT();
694	SOCK_LOCK_ASSERT(so);
695
696	if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 ||
697	    (so->so_state & SS_PROTOREF) || (so->so_qstate & SQ_COMP)) {
698		SOCK_UNLOCK(so);
699		ACCEPT_UNLOCK();
700		return;
701	}
702
703	head = so->so_head;
704	if (head != NULL) {
705		KASSERT((so->so_qstate & SQ_COMP) != 0 ||
706		    (so->so_qstate & SQ_INCOMP) != 0,
707		    ("sofree: so_head != NULL, but neither SQ_COMP nor "
708		    "SQ_INCOMP"));
709		KASSERT((so->so_qstate & SQ_COMP) == 0 ||
710		    (so->so_qstate & SQ_INCOMP) == 0,
711		    ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
712		TAILQ_REMOVE(&head->so_incomp, so, so_list);
713		head->so_incqlen--;
714		so->so_qstate &= ~SQ_INCOMP;
715		so->so_head = NULL;
716	}
717	KASSERT((so->so_qstate & SQ_COMP) == 0 &&
718	    (so->so_qstate & SQ_INCOMP) == 0,
719	    ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
720	    so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
721	if (so->so_options & SO_ACCEPTCONN) {
722		KASSERT((TAILQ_EMPTY(&so->so_comp)),
723		    ("sofree: so_comp populated"));
724		KASSERT((TAILQ_EMPTY(&so->so_incomp)),
725		    ("sofree: so_incomp populated"));
726	}
727	SOCK_UNLOCK(so);
728	ACCEPT_UNLOCK();
729
730	VNET_SO_ASSERT(so);
731	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
732		(*pr->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
733	if (pr->pr_usrreqs->pru_detach != NULL)
734		(*pr->pr_usrreqs->pru_detach)(so);
735
736	/*
737	 * From this point on, we assume that no other references to this
738	 * socket exist anywhere else in the stack.  Therefore, no locks need
739	 * to be acquired or held.
740	 *
741	 * We used to do a lot of socket buffer and socket locking here, as
742	 * well as invoke sorflush() and perform wakeups.  The direct calls to
743	 * dom_dispose() and sbrelease_internal() are an inlining of what was
744	 * necessary from sorflush().
745	 *
746	 * Notice that the socket buffer and kqueue state are also torn down as
747	 * part of this teardown.  This means that protocols should not
748	 * assume they can perform socket wakeups, etc., in their detach code.
749	 */
750	sbdestroy(&so->so_snd, so);
751	sbdestroy(&so->so_rcv, so);
752	seldrain(&so->so_snd.sb_sel);
753	seldrain(&so->so_rcv.sb_sel);
754	knlist_destroy(&so->so_rcv.sb_sel.si_note);
755	knlist_destroy(&so->so_snd.sb_sel.si_note);
756	sodealloc(so);
757}
758
759/*
760 * Close a socket on last file table reference removal.  Initiate disconnect
761 * if connected.  Free socket when disconnect complete.
762 *
763 * This function will sorele() the socket.  Note that soclose() may be called
764 * prior to the ref count reaching zero.  The actual socket structure will
765 * not be freed until the ref count reaches zero.
766 */
767int
768soclose(struct socket *so)
769{
770	int error = 0;
771
772	KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));
773
774	CURVNET_SET(so->so_vnet);
775	funsetown(&so->so_sigio);
776	if (so->so_state & SS_ISCONNECTED) {
777		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
778			error = sodisconnect(so);
779			if (error) {
780				if (error == ENOTCONN)
781					error = 0;
782				goto drop;
783			}
784		}
785		if (so->so_options & SO_LINGER) {
786			if ((so->so_state & SS_ISDISCONNECTING) &&
787			    (so->so_state & SS_NBIO))
788				goto drop;
789			while (so->so_state & SS_ISCONNECTED) {
790				error = tsleep(&so->so_timeo,
791				    PSOCK | PCATCH, "soclos",
792				    so->so_linger * hz);
793				if (error)
794					break;
795			}
796		}
797	}
798
799drop:
800	if (so->so_proto->pr_usrreqs->pru_close != NULL)
801		(*so->so_proto->pr_usrreqs->pru_close)(so);
802	ACCEPT_LOCK();
803	if (so->so_options & SO_ACCEPTCONN) {
804		struct socket *sp;
805		/*
806		 * Prevent new additions to the accept queues due
807		 * to ACCEPT_LOCK races while we are draining them.
808		 */
809		so->so_options &= ~SO_ACCEPTCONN;
810		while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
811			TAILQ_REMOVE(&so->so_incomp, sp, so_list);
812			so->so_incqlen--;
813			sp->so_qstate &= ~SQ_INCOMP;
814			sp->so_head = NULL;
815			ACCEPT_UNLOCK();
816			soabort(sp);
817			ACCEPT_LOCK();
818		}
819		while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
820			TAILQ_REMOVE(&so->so_comp, sp, so_list);
821			so->so_qlen--;
822			sp->so_qstate &= ~SQ_COMP;
823			sp->so_head = NULL;
824			ACCEPT_UNLOCK();
825			soabort(sp);
826			ACCEPT_LOCK();
827		}
828		KASSERT((TAILQ_EMPTY(&so->so_comp)),
829		    ("%s: so_comp populated", __func__));
830		KASSERT((TAILQ_EMPTY(&so->so_incomp)),
831		    ("%s: so_incomp populated", __func__));
832	}
833	SOCK_LOCK(so);
834	KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
835	so->so_state |= SS_NOFDREF;
836	sorele(so);			/* NB: Returns with ACCEPT_UNLOCK(). */
837	CURVNET_RESTORE();
838	return (error);
839}
840
841/*
842 * soabort() is used to abruptly tear down a connection, such as when a
843 * resource limit is reached (listen queue depth exceeded), or if a listen
844 * socket is closed while there are sockets waiting to be accepted.
845 *
846 * This interface is tricky, because it is called on an unreferenced socket,
847 * and must be called only by a thread that has actually removed the socket
848 * from the listen queue it was on, or races with other threads are risked.
849 *
850 * This interface will call into the protocol code, so must not be called
851 * with any socket locks held.  Protocols do call it while holding their own
852 * recursible protocol mutexes, but this is something that should be subject
853 * to review in the future.
854 */
855void
856soabort(struct socket *so)
857{
858
859	/*
860	 * In as much as is possible, assert that no references to this
861	 * socket are held.  This is not quite the same as asserting that the
862	 * current thread is responsible for arranging for no references, but
863	 * is as close as we can get for now.
864	 */
865	KASSERT(so->so_count == 0, ("soabort: so_count"));
866	KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF"));
867	KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF"));
868	KASSERT((so->so_qstate & SQ_COMP) == 0, ("soabort: SQ_COMP"));
869	KASSERT((so->so_qstate & SQ_INCOMP) == 0, ("soabort: SQ_INCOMP"));
870	VNET_SO_ASSERT(so);
871
872	if (so->so_proto->pr_usrreqs->pru_abort != NULL)
873		(*so->so_proto->pr_usrreqs->pru_abort)(so);
874	ACCEPT_LOCK();
875	SOCK_LOCK(so);
876	sofree(so);
877}
878
879int
880soaccept(struct socket *so, struct sockaddr **nam)
881{
882	int error;
883
884	SOCK_LOCK(so);
885	KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
886	so->so_state &= ~SS_NOFDREF;
887	SOCK_UNLOCK(so);
888
889	CURVNET_SET(so->so_vnet);
890	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
891	CURVNET_RESTORE();
892	return (error);
893}
894
895int
896soconnect(struct socket *so, struct sockaddr *nam, struct thread *td)
897{
898	int error;
899
900	if (so->so_options & SO_ACCEPTCONN)
901		return (EOPNOTSUPP);
902
903	CURVNET_SET(so->so_vnet);
904	/*
905	 * If protocol is connection-based, can only connect once.
906	 * Otherwise, if connected, try to disconnect first.  This allows
907	 * user to disconnect by connecting to, e.g., a null address.
908	 */
909	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
910	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
911	    (error = sodisconnect(so)))) {
912		error = EISCONN;
913	} else {
914		/*
915		 * Prevent accumulated error from previous connection from
916		 * biting us.
917		 */
918		so->so_error = 0;
919		error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td);
920	}
921	CURVNET_RESTORE();
922
923	return (error);
924}
925
926int
927soconnect2(struct socket *so1, struct socket *so2)
928{
929	int error;
930
931	CURVNET_SET(so1->so_vnet);
932	error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
933	CURVNET_RESTORE();
934	return (error);
935}
936
937int
938sodisconnect(struct socket *so)
939{
940	int error;
941
942	if ((so->so_state & SS_ISCONNECTED) == 0)
943		return (ENOTCONN);
944	if (so->so_state & SS_ISDISCONNECTING)
945		return (EALREADY);
946	VNET_SO_ASSERT(so);
947	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
948	return (error);
949}
950
951#ifdef SOCKET_SEND_COW
952struct so_zerocopy_stats{
953	int size_ok;
954	int align_ok;
955	int found_ifp;
956};
957struct so_zerocopy_stats so_zerocp_stats = {0,0,0};
958
959/*
960 * sosend_copyin() is only used if zero copy sockets are enabled.  Otherwise
961 * sosend_dgram() and sosend_generic() use m_uiotombuf().
962 *
963 * sosend_copyin() accepts a uio and prepares an mbuf chain holding part or
964 * all of the data referenced by the uio.  If desired, it uses zero-copy.
965 * *space will be updated to reflect data copied in.
966 *
967 * NB: If atomic I/O is requested, the caller must already have checked that
968 * space can hold resid bytes.
969 *
970 * NB: In the event of an error, the caller may need to free the partial
971 * chain pointed to by *mpp.  The contents of both *uio and *space may be
972 * modified even in the case of an error.
973 */
974static int
975sosend_copyin(struct uio *uio, struct mbuf **retmp, int atomic, long *space,
976    int flags)
977{
978	struct mbuf *m, **mp, *top;
979	long len;
980	ssize_t resid;
981	int error;
982	int cow_send;
983
984	*retmp = top = NULL;
985	mp = &top;
986	len = 0;
987	resid = uio->uio_resid;
988	error = 0;
989	do {
990		cow_send = 0;
991		if (resid >= MINCLSIZE) {
992			if (top == NULL) {
993				m = m_gethdr(M_WAITOK, MT_DATA);
994				m->m_pkthdr.len = 0;
995				m->m_pkthdr.rcvif = NULL;
996			} else
997				m = m_get(M_WAITOK, MT_DATA);
998			if (so_zero_copy_send &&
999			    resid >= PAGE_SIZE &&
1000			    *space >= PAGE_SIZE &&
1001			    uio->uio_iov->iov_len >= PAGE_SIZE) {
1002				so_zerocp_stats.size_ok++;
1003				so_zerocp_stats.align_ok++;
1004				cow_send = socow_setup(m, uio);
1005				len = cow_send;
1006			}
1007			if (!cow_send) {
1008				m_clget(m, M_WAITOK);
1009				len = min(min(MCLBYTES, resid), *space);
1010			}
1011		} else {
1012			if (top == NULL) {
1013				m = m_gethdr(M_WAITOK, MT_DATA);
1014				m->m_pkthdr.len = 0;
1015				m->m_pkthdr.rcvif = NULL;
1016
1017				len = min(min(MHLEN, resid), *space);
1018				/*
1019				 * For datagram protocols, leave room
1020				 * for protocol headers in first mbuf.
1021				 */
1022				if (atomic && m && len < MHLEN)
1023					MH_ALIGN(m, len);
1024			} else {
1025				m = m_get(M_WAITOK, MT_DATA);
1026				len = min(min(MLEN, resid), *space);
1027			}
1028		}
1029		if (m == NULL) {
1030			error = ENOBUFS;
1031			goto out;
1032		}
1033
1034		*space -= len;
1035		if (cow_send)
1036			error = 0;
1037		else
1038			error = uiomove(mtod(m, void *), (int)len, uio);
1039		resid = uio->uio_resid;
1040		m->m_len = len;
1041		*mp = m;
1042		top->m_pkthdr.len += len;
1043		if (error)
1044			goto out;
1045		mp = &m->m_next;
1046		if (resid <= 0) {
1047			if (flags & MSG_EOR)
1048				top->m_flags |= M_EOR;
1049			break;
1050		}
1051	} while (*space > 0 && atomic);
1052out:
1053	*retmp = top;
1054	return (error);
1055}
1056#endif /* SOCKET_SEND_COW */
1057
1058#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
1059
1060int
1061sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio,
1062    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
1063{
1064	long space;
1065	ssize_t resid;
1066	int clen = 0, error, dontroute;
1067#ifdef SOCKET_SEND_COW
1068	int atomic = sosendallatonce(so) || top;
1069#endif
1070
1071	KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM"));
1072	KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
1073	    ("sosend_dgram: !PR_ATOMIC"));
1074
1075	if (uio != NULL)
1076		resid = uio->uio_resid;
1077	else
1078		resid = top->m_pkthdr.len;
1079	/*
1080	 * In theory resid should be unsigned.  However, space must be
1081	 * signed, as it might be less than 0 if we over-committed, and we
1082	 * must use a signed comparison of space and resid.  On the other
1083	 * hand, a negative resid causes us to loop sending 0-length
1084	 * segments to the protocol.
1085	 */
1086	if (resid < 0) {
1087		error = EINVAL;
1088		goto out;
1089	}
1090
1091	dontroute =
1092	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0;
1093	if (td != NULL)
1094		td->td_ru.ru_msgsnd++;
1095	if (control != NULL)
1096		clen = control->m_len;
1097
1098	SOCKBUF_LOCK(&so->so_snd);
1099	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
1100		SOCKBUF_UNLOCK(&so->so_snd);
1101		error = EPIPE;
1102		goto out;
1103	}
1104	if (so->so_error) {
1105		error = so->so_error;
1106		so->so_error = 0;
1107		SOCKBUF_UNLOCK(&so->so_snd);
1108		goto out;
1109	}
1110	if ((so->so_state & SS_ISCONNECTED) == 0) {
1111		/*
1112		 * `sendto' and `sendmsg' are allowed on a connection-based
1113		 * socket if it supports implied connect.  Return ENOTCONN if
1114		 * not connected and no address is supplied.
1115		 */
1116		if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
1117		    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
1118			if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1119			    !(resid == 0 && clen != 0)) {
1120				SOCKBUF_UNLOCK(&so->so_snd);
1121				error = ENOTCONN;
1122				goto out;
1123			}
1124		} else if (addr == NULL) {
1125			if (so->so_proto->pr_flags & PR_CONNREQUIRED)
1126				error = ENOTCONN;
1127			else
1128				error = EDESTADDRREQ;
1129			SOCKBUF_UNLOCK(&so->so_snd);
1130			goto out;
1131		}
1132	}
1133
1134	/*
1135	 * Do we need MSG_OOB support in SOCK_DGRAM?  Signs here may be a
1136	 * problem and need fixing.
1137	 */
1138	space = sbspace(&so->so_snd);
1139	if (flags & MSG_OOB)
1140		space += 1024;
1141	space -= clen;
1142	SOCKBUF_UNLOCK(&so->so_snd);
1143	if (resid > space) {
1144		error = EMSGSIZE;
1145		goto out;
1146	}
1147	if (uio == NULL) {
1148		resid = 0;
1149		if (flags & MSG_EOR)
1150			top->m_flags |= M_EOR;
1151	} else {
1152#ifdef SOCKET_SEND_COW
1153		error = sosend_copyin(uio, &top, atomic, &space, flags);
1154		if (error)
1155			goto out;
1156#else
1157		/*
1158		 * Copy the data from userland into a mbuf chain.
1159		 * If no data is to be copied in, a single empty mbuf
1160		 * is returned.
1161		 */
1162		top = m_uiotombuf(uio, M_WAITOK, space, max_hdr,
1163		    (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0)));
1164		if (top == NULL) {
1165			error = EFAULT;	/* only possible error */
1166			goto out;
1167		}
1168		space -= resid - uio->uio_resid;
1169#endif /* SOCKET_SEND_COW */
1170		resid = uio->uio_resid;
1171	}
1172	KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
1173	/*
1174	 * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock
1175	 * than with.
1176	 */
1177	if (dontroute) {
1178		SOCK_LOCK(so);
1179		so->so_options |= SO_DONTROUTE;
1180		SOCK_UNLOCK(so);
1181	}
1182	/*
1183	 * XXX all the SBS_CANTSENDMORE checks previously done could be out
1184	 * of date.  We could have received a reset packet in an interrupt or
1185	 * maybe we slept while doing page faults in uiomove() etc.  We could
1186	 * probably recheck again inside the locking protection here, but
1187	 * there are probably other places that this also happens.  We must
1188	 * rethink this.
1189	 */
1190	VNET_SO_ASSERT(so);
1191	error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1192	    (flags & MSG_OOB) ? PRUS_OOB :
1193	/*
1194	 * If the user set MSG_EOF, the protocol understands this flag, and there
1195	 * is nothing left to send, then use PRU_SEND_EOF instead of PRU_SEND.
1196	 */
1197	    ((flags & MSG_EOF) &&
1198	     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1199	     (resid <= 0)) ?
1200		PRUS_EOF :
1201		/* If there is more to send set PRUS_MORETOCOME */
1202		(resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
1203		top, addr, control, td);
1204	if (dontroute) {
1205		SOCK_LOCK(so);
1206		so->so_options &= ~SO_DONTROUTE;
1207		SOCK_UNLOCK(so);
1208	}
1209	clen = 0;
1210	control = NULL;
1211	top = NULL;
1212out:
1213	if (top != NULL)
1214		m_freem(top);
1215	if (control != NULL)
1216		m_freem(control);
1217	return (error);
1218}
1219
1220/*
1221 * Send on a socket.  If send must go all at once and message is larger than
1222 * send buffering, then hard error.  Lock against other senders.  If must go
1223 * all at once and not enough room now, then inform user that this would
1224 * block and do nothing.  Otherwise, if nonblocking, send as much as
1225 * possible.  The data to be sent is described by "uio" if nonzero, otherwise
1226 * by the mbuf chain "top" (which must be null if uio is not).  Data provided
1227 * in mbuf chain must be small enough to send all at once.
1228 *
1229 * Returns nonzero on error, timeout or signal; callers must check for short
1230 * counts if EINTR/ERESTART are returned.  Data and control buffers are freed
1231 * on return.
1232 */
1233int
1234sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio,
1235    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
1236{
1237	long space;
1238	ssize_t resid;
1239	int clen = 0, error, dontroute;
1240	int atomic = sosendallatonce(so) || top;
1241
1242	if (uio != NULL)
1243		resid = uio->uio_resid;
1244	else
1245		resid = top->m_pkthdr.len;
1246	/*
1247	 * In theory resid should be unsigned.  However, space must be
1248	 * signed, as it might be less than 0 if we over-committed, and we
1249	 * must use a signed comparison of space and resid.  On the other
1250	 * hand, a negative resid causes us to loop sending 0-length
1251	 * segments to the protocol.
1252	 *
1253	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
1254	 * type sockets since that's an error.
1255	 */
1256	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
1257		error = EINVAL;
1258		goto out;
1259	}
1260
1261	dontroute =
1262	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
1263	    (so->so_proto->pr_flags & PR_ATOMIC);
1264	if (td != NULL)
1265		td->td_ru.ru_msgsnd++;
1266	if (control != NULL)
1267		clen = control->m_len;
1268
1269	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1270	if (error)
1271		goto out;
1272
1273restart:
1274	do {
1275		SOCKBUF_LOCK(&so->so_snd);
1276		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
1277			SOCKBUF_UNLOCK(&so->so_snd);
1278			error = EPIPE;
1279			goto release;
1280		}
1281		if (so->so_error) {
1282			error = so->so_error;
1283			so->so_error = 0;
1284			SOCKBUF_UNLOCK(&so->so_snd);
1285			goto release;
1286		}
1287		if ((so->so_state & SS_ISCONNECTED) == 0) {
1288			/*
1289			 * `sendto' and `sendmsg' are allowed on a connection-
1290			 * based socket if it supports implied connect.
1291			 * Return ENOTCONN if not connected and no address is
1292			 * supplied.
1293			 */
1294			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
1295			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
1296				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1297				    !(resid == 0 && clen != 0)) {
1298					SOCKBUF_UNLOCK(&so->so_snd);
1299					error = ENOTCONN;
1300					goto release;
1301				}
1302			} else if (addr == NULL) {
1303				SOCKBUF_UNLOCK(&so->so_snd);
1304				if (so->so_proto->pr_flags & PR_CONNREQUIRED)
1305					error = ENOTCONN;
1306				else
1307					error = EDESTADDRREQ;
1308				goto release;
1309			}
1310		}
1311		space = sbspace(&so->so_snd);
1312		if (flags & MSG_OOB)
1313			space += 1024;
1314		if ((atomic && resid > so->so_snd.sb_hiwat) ||
1315		    clen > so->so_snd.sb_hiwat) {
1316			SOCKBUF_UNLOCK(&so->so_snd);
1317			error = EMSGSIZE;
1318			goto release;
1319		}
1320		if (space < resid + clen &&
1321		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
1322			if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) {
1323				SOCKBUF_UNLOCK(&so->so_snd);
1324				error = EWOULDBLOCK;
1325				goto release;
1326			}
1327			error = sbwait(&so->so_snd);
1328			SOCKBUF_UNLOCK(&so->so_snd);
1329			if (error)
1330				goto release;
1331			goto restart;
1332		}
1333		SOCKBUF_UNLOCK(&so->so_snd);
1334		space -= clen;
1335		do {
1336			if (uio == NULL) {
1337				resid = 0;
1338				if (flags & MSG_EOR)
1339					top->m_flags |= M_EOR;
1340			} else {
1341#ifdef SOCKET_SEND_COW
1342				error = sosend_copyin(uio, &top, atomic,
1343				    &space, flags);
1344				if (error != 0)
1345					goto release;
1346#else
1347				/*
1348				 * Copy the data from userland into a mbuf
1349				 * chain.  If no data is to be copied in,
1350				 * a single empty mbuf is returned.
1351				 */
1352				top = m_uiotombuf(uio, M_WAITOK, space,
1353				    (atomic ? max_hdr : 0),
1354				    (atomic ? M_PKTHDR : 0) |
1355				    ((flags & MSG_EOR) ? M_EOR : 0));
1356				if (top == NULL) {
1357					error = EFAULT; /* only possible error */
1358					goto release;
1359				}
1360				space -= resid - uio->uio_resid;
1361#endif /* SOCKET_SEND_COW */
1362				resid = uio->uio_resid;
1363			}
1364			if (dontroute) {
1365				SOCK_LOCK(so);
1366				so->so_options |= SO_DONTROUTE;
1367				SOCK_UNLOCK(so);
1368			}
1369			/*
1370			 * XXX all the SBS_CANTSENDMORE checks previously
1371			 * done could be out of date.  We could have received
1372			 * a reset packet in an interrupt or maybe we slept
1373			 * while doing page faults in uiomove() etc.  We
1374			 * could probably recheck again inside the locking
1375			 * protection here, but there are probably other
1376			 * places that this also happens.  We must rethink
1377			 * this.
1378			 */
1379			VNET_SO_ASSERT(so);
1380			error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1381			    (flags & MSG_OOB) ? PRUS_OOB :
1382			/*
1383			 * If the user set MSG_EOF, the protocol understands
1384			 * this flag, and there is nothing left to send, then
1385			 * use PRU_SEND_EOF instead of PRU_SEND.
1386			 */
1387			    ((flags & MSG_EOF) &&
1388			     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1389			     (resid <= 0)) ?
1390				PRUS_EOF :
1391			/* If there is more to send set PRUS_MORETOCOME. */
1392			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
1393			    top, addr, control, td);
1394			if (dontroute) {
1395				SOCK_LOCK(so);
1396				so->so_options &= ~SO_DONTROUTE;
1397				SOCK_UNLOCK(so);
1398			}
1399			clen = 0;
1400			control = NULL;
1401			top = NULL;
1402			if (error)
1403				goto release;
1404		} while (resid && space > 0);
1405	} while (resid);
1406
1407release:
1408	sbunlock(&so->so_snd);
1409out:
1410	if (top != NULL)
1411		m_freem(top);
1412	if (control != NULL)
1413		m_freem(control);
1414	return (error);
1415}
1416
1417int
1418sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
1419    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
1420{
1421	int error;
1422
1423	CURVNET_SET(so->so_vnet);
1424	error = so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top,
1425	    control, flags, td);
1426	CURVNET_RESTORE();
1427	return (error);
1428}
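/*
 * Example (sketch): an in-kernel caller can drive sosend() with a
 * UIO_SYSSPACE uio describing a kernel buffer.  The "buf", "len" and "so"
 * names below are placeholders for the caller's data and a socket created
 * and connected as in the earlier example.
 *
 *	struct uio auio;
 *	struct iovec aiov;
 *	int error;
 *
 *	aiov.iov_base = buf;
 *	aiov.iov_len = len;
 *	auio.uio_iov = &aiov;
 *	auio.uio_iovcnt = 1;
 *	auio.uio_offset = 0;
 *	auio.uio_resid = len;
 *	auio.uio_segflg = UIO_SYSSPACE;
 *	auio.uio_rw = UIO_WRITE;
 *	auio.uio_td = curthread;
 *	error = sosend(so, NULL, &auio, NULL, NULL, 0, curthread);
 */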
1429
1430/*
1431 * The part of soreceive() that implements reading non-inline out-of-band
1432 * data from a socket.  For more complete comments, see soreceive(), from
1433 * which this code originated.
1434 *
1435 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
1436 * unable to return an mbuf chain to the caller.
1437 */
1438static int
1439soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
1440{
1441	struct protosw *pr = so->so_proto;
1442	struct mbuf *m;
1443	int error;
1444
1445	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
1446	VNET_SO_ASSERT(so);
1447
1448	m = m_get(M_WAITOK, MT_DATA);
1449	error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
1450	if (error)
1451		goto bad;
1452	do {
1453#ifdef SOCKET_RECV_PFLIP
1454		if (so_zero_copy_receive) {
1455			int disposable;
1456
1457			if ((m->m_flags & M_EXT)
1458			 && (m->m_ext.ext_type == EXT_DISPOSABLE))
1459				disposable = 1;
1460			else
1461				disposable = 0;
1462
1463			error = uiomoveco(mtod(m, void *),
1464			    min(uio->uio_resid, m->m_len), uio, disposable);
1465		} else
1466#endif /* SOCKET_RECV_PFLIP */
1467		error = uiomove(mtod(m, void *),
1468		    (int) min(uio->uio_resid, m->m_len), uio);
1469		m = m_free(m);
1470	} while (uio->uio_resid && error == 0 && m);
1471bad:
1472	if (m != NULL)
1473		m_freem(m);
1474	return (error);
1475}
1476
1477/*
1478 * Following replacement or removal of the first mbuf on the first mbuf chain
1479 * of a socket buffer, push necessary state changes back into the socket
1480 * buffer so that other consumers see the values consistently.  'nextrecord'
1481 * is the callers locally stored value of the original value of
1482 * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
1483 * NOTE: 'nextrecord' may be NULL.
1484 */
1485static __inline void
1486sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
1487{
1488
1489	SOCKBUF_LOCK_ASSERT(sb);
1490	/*
1491	 * First, update for the new value of nextrecord.  If necessary, make
1492	 * it the first record.
1493	 */
1494	if (sb->sb_mb != NULL)
1495		sb->sb_mb->m_nextpkt = nextrecord;
1496	else
1497		sb->sb_mb = nextrecord;
1498
1499	/*
1500	 * Now update any dependent socket buffer fields to reflect the new
1501	 * state.  This is an expanded inline of SB_EMPTY_FIXUP(), with the
1502	 * addition of a second clause that takes care of the case where
1503	 * sb_mb has been updated, but remains the last record.
1504	 */
1505	if (sb->sb_mb == NULL) {
1506		sb->sb_mbtail = NULL;
1507		sb->sb_lastrecord = NULL;
1508	} else if (sb->sb_mb->m_nextpkt == NULL)
1509		sb->sb_lastrecord = sb->sb_mb;
1510}
1511
1512/*
1513 * Implement receive operations on a socket.  We depend on the way that
1514 * records are added to the sockbuf by sbappend.  In particular, each record
1515 * (mbufs linked through m_next) must begin with an address if the protocol
1516 * so specifies, followed by an optional mbuf or mbufs containing ancillary
1517 * data, and then zero or more mbufs of data.  In order to allow parallelism
1518 * between network receive and copying to user space, as well as avoid
1519 * sleeping with a mutex held, we release the socket buffer mutex during the
1520 * user space copy.  Although the sockbuf is locked, new data may still be
1521 * appended, and thus we must maintain consistency of the sockbuf during that
1522 * time.
1523 *
1524 * The caller may receive the data as a single mbuf chain by supplying an
1525 * mbuf **mp0 for use in returning the chain.  The uio is then used only for
1526 * the count in uio_resid.
1527 */
1528int
1529soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio,
1530    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1531{
1532	struct mbuf *m, **mp;
1533	int flags, error, offset;
1534	ssize_t len;
1535	struct protosw *pr = so->so_proto;
1536	struct mbuf *nextrecord;
1537	int moff, type = 0;
1538	ssize_t orig_resid = uio->uio_resid;
1539
1540	mp = mp0;
1541	if (psa != NULL)
1542		*psa = NULL;
1543	if (controlp != NULL)
1544		*controlp = NULL;
1545	if (flagsp != NULL)
1546		flags = *flagsp &~ MSG_EOR;
1547	else
1548		flags = 0;
1549	if (flags & MSG_OOB)
1550		return (soreceive_rcvoob(so, uio, flags));
1551	if (mp != NULL)
1552		*mp = NULL;
1553	if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING)
1554	    && uio->uio_resid) {
1555		VNET_SO_ASSERT(so);
1556		(*pr->pr_usrreqs->pru_rcvd)(so, 0);
1557	}
1558
1559	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
1560	if (error)
1561		return (error);
1562
1563restart:
1564	SOCKBUF_LOCK(&so->so_rcv);
1565	m = so->so_rcv.sb_mb;
1566	/*
1567	 * If we have less data than requested, block awaiting more (subject
1568	 * to any timeout) if:
1569	 *   1. the current count is less than the low water mark, and
1570	 *   2. MSG_DONTWAIT is not set
1571	 */
1572	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
1573	    so->so_rcv.sb_cc < uio->uio_resid) &&
1574	    so->so_rcv.sb_cc < so->so_rcv.sb_lowat &&
1575	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
1576		KASSERT(m != NULL || !so->so_rcv.sb_cc,
1577		    ("receive: m == %p so->so_rcv.sb_cc == %u",
1578		    m, so->so_rcv.sb_cc));
1579		if (so->so_error) {
1580			if (m != NULL)
1581				goto dontblock;
1582			error = so->so_error;
1583			if ((flags & MSG_PEEK) == 0)
1584				so->so_error = 0;
1585			SOCKBUF_UNLOCK(&so->so_rcv);
1586			goto release;
1587		}
1588		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1589		if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
1590			if (m == NULL) {
1591				SOCKBUF_UNLOCK(&so->so_rcv);
1592				goto release;
1593			} else
1594				goto dontblock;
1595		}
1596		for (; m != NULL; m = m->m_next)
1597			if (m->m_type == MT_OOBDATA  || (m->m_flags & M_EOR)) {
1598				m = so->so_rcv.sb_mb;
1599				goto dontblock;
1600			}
1601		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
1602		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
1603			SOCKBUF_UNLOCK(&so->so_rcv);
1604			error = ENOTCONN;
1605			goto release;
1606		}
1607		if (uio->uio_resid == 0) {
1608			SOCKBUF_UNLOCK(&so->so_rcv);
1609			goto release;
1610		}
1611		if ((so->so_state & SS_NBIO) ||
1612		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
1613			SOCKBUF_UNLOCK(&so->so_rcv);
1614			error = EWOULDBLOCK;
1615			goto release;
1616		}
1617		SBLASTRECORDCHK(&so->so_rcv);
1618		SBLASTMBUFCHK(&so->so_rcv);
1619		error = sbwait(&so->so_rcv);
1620		SOCKBUF_UNLOCK(&so->so_rcv);
1621		if (error)
1622			goto release;
1623		goto restart;
1624	}
1625dontblock:
1626	/*
1627	 * From this point onward, we maintain 'nextrecord' as a cache of the
1628	 * pointer to the next record in the socket buffer.  We must keep the
1629	 * various socket buffer pointers and local stack versions of the
1630	 * pointers in sync, pushing out modifications before dropping the
1631	 * socket buffer mutex, and re-reading them when picking it up.
1632	 *
1633	 * Otherwise, we will race with the network stack appending new data
1634	 * or records onto the socket buffer by using inconsistent/stale
1635	 * versions of the field, possibly resulting in socket buffer
1636	 * corruption.
1637	 *
1638	 * By holding the high-level sblock(), we prevent simultaneous
1639	 * readers from pulling off the front of the socket buffer.
1640	 */
1641	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1642	if (uio->uio_td)
1643		uio->uio_td->td_ru.ru_msgrcv++;
1644	KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
1645	SBLASTRECORDCHK(&so->so_rcv);
1646	SBLASTMBUFCHK(&so->so_rcv);
1647	nextrecord = m->m_nextpkt;
1648	if (pr->pr_flags & PR_ADDR) {
1649		KASSERT(m->m_type == MT_SONAME,
1650		    ("m->m_type == %d", m->m_type));
1651		orig_resid = 0;
1652		if (psa != NULL)
1653			*psa = sodupsockaddr(mtod(m, struct sockaddr *),
1654			    M_NOWAIT);
1655		if (flags & MSG_PEEK) {
1656			m = m->m_next;
1657		} else {
1658			sbfree(&so->so_rcv, m);
1659			so->so_rcv.sb_mb = m_free(m);
1660			m = so->so_rcv.sb_mb;
1661			sockbuf_pushsync(&so->so_rcv, nextrecord);
1662		}
1663	}
1664
1665	/*
1666	 * Process one or more MT_CONTROL mbufs present before any data mbufs
1667	 * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
1668	 * just copy the data; if !MSG_PEEK, we call into the protocol to
1669	 * perform externalization (or freeing if controlp == NULL).
1670	 */
1671	if (m != NULL && m->m_type == MT_CONTROL) {
1672		struct mbuf *cm = NULL, *cmn;
1673		struct mbuf **cme = &cm;
1674
1675		do {
1676			if (flags & MSG_PEEK) {
1677				if (controlp != NULL) {
1678					*controlp = m_copy(m, 0, m->m_len);
1679					controlp = &(*controlp)->m_next;
1680				}
1681				m = m->m_next;
1682			} else {
1683				sbfree(&so->so_rcv, m);
1684				so->so_rcv.sb_mb = m->m_next;
1685				m->m_next = NULL;
1686				*cme = m;
1687				cme = &(*cme)->m_next;
1688				m = so->so_rcv.sb_mb;
1689			}
1690		} while (m != NULL && m->m_type == MT_CONTROL);
1691		if ((flags & MSG_PEEK) == 0)
1692			sockbuf_pushsync(&so->so_rcv, nextrecord);
1693		while (cm != NULL) {
1694			cmn = cm->m_next;
1695			cm->m_next = NULL;
1696			if (pr->pr_domain->dom_externalize != NULL) {
1697				SOCKBUF_UNLOCK(&so->so_rcv);
1698				VNET_SO_ASSERT(so);
1699				error = (*pr->pr_domain->dom_externalize)
1700				    (cm, controlp);
1701				SOCKBUF_LOCK(&so->so_rcv);
1702			} else if (controlp != NULL)
1703				*controlp = cm;
1704			else
1705				m_freem(cm);
1706			if (controlp != NULL) {
1707				orig_resid = 0;
1708				while (*controlp != NULL)
1709					controlp = &(*controlp)->m_next;
1710			}
1711			cm = cmn;
1712		}
1713		if (m != NULL)
1714			nextrecord = so->so_rcv.sb_mb->m_nextpkt;
1715		else
1716			nextrecord = so->so_rcv.sb_mb;
1717		orig_resid = 0;
1718	}
1719	if (m != NULL) {
1720		if ((flags & MSG_PEEK) == 0) {
1721			KASSERT(m->m_nextpkt == nextrecord,
1722			    ("soreceive: post-control, nextrecord !sync"));
1723			if (nextrecord == NULL) {
1724				KASSERT(so->so_rcv.sb_mb == m,
1725				    ("soreceive: post-control, sb_mb!=m"));
1726				KASSERT(so->so_rcv.sb_lastrecord == m,
1727				    ("soreceive: post-control, lastrecord!=m"));
1728			}
1729		}
1730		type = m->m_type;
1731		if (type == MT_OOBDATA)
1732			flags |= MSG_OOB;
1733	} else {
1734		if ((flags & MSG_PEEK) == 0) {
1735			KASSERT(so->so_rcv.sb_mb == nextrecord,
1736			    ("soreceive: sb_mb != nextrecord"));
1737			if (so->so_rcv.sb_mb == NULL) {
1738				KASSERT(so->so_rcv.sb_lastrecord == NULL,
1739				    ("soreceive: sb_lastrecord != NULL"));
1740			}
1741		}
1742	}
1743	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1744	SBLASTRECORDCHK(&so->so_rcv);
1745	SBLASTMBUFCHK(&so->so_rcv);
1746
1747	/*
1748	 * Now continue to read any data mbufs off of the head of the socket
1749	 * buffer until the read request is satisfied.  Note that 'type' is
1750	 * used to record the type of the mbufs read so far, so that
1751	 * soreceive() can stop reading if the type changes; this ensures
1752	 * that a single receive operation returns either regular data or
1753	 * inline out-of-band data, but not both.
1754	 */
1755	moff = 0;
1756	offset = 0;
1757	while (m != NULL && uio->uio_resid > 0 && error == 0) {
1758		/*
1759		 * If the type of mbuf has changed since the last mbuf
1760		 * examined ('type'), end the receive operation.
1761		 */
1762		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1763		if (m->m_type == MT_OOBDATA || m->m_type == MT_CONTROL) {
1764			if (type != m->m_type)
1765				break;
1766		} else if (type == MT_OOBDATA)
1767			break;
1768		else
1769		    KASSERT(m->m_type == MT_DATA,
1770			("m->m_type == %d", m->m_type));
1771		so->so_rcv.sb_state &= ~SBS_RCVATMARK;
1772		len = uio->uio_resid;
1773		if (so->so_oobmark && len > so->so_oobmark - offset)
1774			len = so->so_oobmark - offset;
1775		if (len > m->m_len - moff)
1776			len = m->m_len - moff;
1777		/*
1778		 * If mp is set, just pass back the mbufs.  Otherwise copy
1779		 * them out via the uio, then free.  The socket buffer must be
1780		 * consistent here (sb_mb points to the current mbuf and
1781		 * m_nextpkt to the next record) when we drop the socket
1782		 * buffer lock; we must note any additions when we reacquire it.
1783		 */
1784		if (mp == NULL) {
1785			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1786			SBLASTRECORDCHK(&so->so_rcv);
1787			SBLASTMBUFCHK(&so->so_rcv);
1788			SOCKBUF_UNLOCK(&so->so_rcv);
1789#ifdef SOCKET_RECV_PFLIP
1790			if (so_zero_copy_receive) {
1791				int disposable;
1792
1793				if ((m->m_flags & M_EXT)
1794				 && (m->m_ext.ext_type == EXT_DISPOSABLE))
1795					disposable = 1;
1796				else
1797					disposable = 0;
1798
1799				error = uiomoveco(mtod(m, char *) + moff,
1800				    (int)len, uio, disposable);
1801			} else
1802#endif /* SOCKET_RECV_PFLIP */
1803			error = uiomove(mtod(m, char *) + moff, (int)len, uio);
1804			SOCKBUF_LOCK(&so->so_rcv);
1805			if (error) {
1806				/*
1807				 * The MT_SONAME mbuf has already been removed
1808				 * from the record, so it is necessary to
1809				 * remove the data mbufs, if any, to preserve
1810				 * the invariant in the case of PR_ADDR that
1811				 * requires MT_SONAME mbufs at the head of
1812				 * each record.
1813				 */
1814				if (m && pr->pr_flags & PR_ATOMIC &&
1815				    ((flags & MSG_PEEK) == 0))
1816					(void)sbdroprecord_locked(&so->so_rcv);
1817				SOCKBUF_UNLOCK(&so->so_rcv);
1818				goto release;
1819			}
1820		} else
1821			uio->uio_resid -= len;
1822		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1823		if (len == m->m_len - moff) {
1824			if (m->m_flags & M_EOR)
1825				flags |= MSG_EOR;
1826			if (flags & MSG_PEEK) {
1827				m = m->m_next;
1828				moff = 0;
1829			} else {
1830				nextrecord = m->m_nextpkt;
1831				sbfree(&so->so_rcv, m);
1832				if (mp != NULL) {
1833					*mp = m;
1834					mp = &m->m_next;
1835					so->so_rcv.sb_mb = m = m->m_next;
1836					*mp = NULL;
1837				} else {
1838					so->so_rcv.sb_mb = m_free(m);
1839					m = so->so_rcv.sb_mb;
1840				}
1841				sockbuf_pushsync(&so->so_rcv, nextrecord);
1842				SBLASTRECORDCHK(&so->so_rcv);
1843				SBLASTMBUFCHK(&so->so_rcv);
1844			}
1845		} else {
1846			if (flags & MSG_PEEK)
1847				moff += len;
1848			else {
1849				if (mp != NULL) {
1850					int copy_flag;
1851
1852					if (flags & MSG_DONTWAIT)
1853						copy_flag = M_NOWAIT;
1854					else
1855						copy_flag = M_WAIT;
1856					if (copy_flag == M_WAITOK)
1857						SOCKBUF_UNLOCK(&so->so_rcv);
1858					*mp = m_copym(m, 0, len, copy_flag);
1859					if (copy_flag == M_WAITOK)
1860						SOCKBUF_LOCK(&so->so_rcv);
1861					if (*mp == NULL) {
1862						/*
1863						 * m_copym() couldn't
1864						 * allocate an mbuf.  Adjust
1865						 * uio_resid back (it was
1866						 * adjusted down by len
1867						 * bytes, which we didn't end
1868						 * up "copying" over).
1869						 */
1870						uio->uio_resid += len;
1871						break;
1872					}
1873				}
1874				m->m_data += len;
1875				m->m_len -= len;
1876				so->so_rcv.sb_cc -= len;
1877			}
1878		}
1879		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1880		if (so->so_oobmark) {
1881			if ((flags & MSG_PEEK) == 0) {
1882				so->so_oobmark -= len;
1883				if (so->so_oobmark == 0) {
1884					so->so_rcv.sb_state |= SBS_RCVATMARK;
1885					break;
1886				}
1887			} else {
1888				offset += len;
1889				if (offset == so->so_oobmark)
1890					break;
1891			}
1892		}
1893		if (flags & MSG_EOR)
1894			break;
1895		/*
1896		 * If the MSG_WAITALL flag is set (for a non-atomic socket), we
1897		 * must not quit until "uio->uio_resid == 0" or an error
1898		 * termination.  If a signal/timeout occurs, return with a
1899		 * short count but without error.  Keep sockbuf locked
1900		 * against other readers.
1901		 */
1902		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
1903		    !sosendallatonce(so) && nextrecord == NULL) {
1904			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1905			if (so->so_error ||
1906			    so->so_rcv.sb_state & SBS_CANTRCVMORE)
1907				break;
1908			/*
1909			 * Notify the protocol that some data has been
1910			 * drained before blocking.
1911			 */
1912			if (pr->pr_flags & PR_WANTRCVD) {
1913				SOCKBUF_UNLOCK(&so->so_rcv);
1914				VNET_SO_ASSERT(so);
1915				(*pr->pr_usrreqs->pru_rcvd)(so, flags);
1916				SOCKBUF_LOCK(&so->so_rcv);
1917			}
1918			SBLASTRECORDCHK(&so->so_rcv);
1919			SBLASTMBUFCHK(&so->so_rcv);
1920			/*
1921			 * We could have received some data while notifying
1922			 * the protocol.  Skip blocking in this case.
1923			 */
1924			if (so->so_rcv.sb_mb == NULL) {
1925				error = sbwait(&so->so_rcv);
1926				if (error) {
1927					SOCKBUF_UNLOCK(&so->so_rcv);
1928					goto release;
1929				}
1930			}
1931			m = so->so_rcv.sb_mb;
1932			if (m != NULL)
1933				nextrecord = m->m_nextpkt;
1934		}
1935	}
1936
1937	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1938	if (m != NULL && pr->pr_flags & PR_ATOMIC) {
1939		flags |= MSG_TRUNC;
1940		if ((flags & MSG_PEEK) == 0)
1941			(void) sbdroprecord_locked(&so->so_rcv);
1942	}
1943	if ((flags & MSG_PEEK) == 0) {
1944		if (m == NULL) {
1945			/*
1946			 * First part is an inline SB_EMPTY_FIXUP().  Second
1947			 * part makes sure sb_lastrecord is up-to-date if
1948			 * there is still data in the socket buffer.
1949			 */
1950			so->so_rcv.sb_mb = nextrecord;
1951			if (so->so_rcv.sb_mb == NULL) {
1952				so->so_rcv.sb_mbtail = NULL;
1953				so->so_rcv.sb_lastrecord = NULL;
1954			} else if (nextrecord->m_nextpkt == NULL)
1955				so->so_rcv.sb_lastrecord = nextrecord;
1956		}
1957		SBLASTRECORDCHK(&so->so_rcv);
1958		SBLASTMBUFCHK(&so->so_rcv);
1959		/*
1960		 * If soreceive() is being done from the socket callback,
1961		 * then don't need to generate ACK to peer to update window,
1962		 * since ACK will be generated on return to TCP.
1963		 */
1964		if (!(flags & MSG_SOCALLBCK) &&
1965		    (pr->pr_flags & PR_WANTRCVD)) {
1966			SOCKBUF_UNLOCK(&so->so_rcv);
1967			VNET_SO_ASSERT(so);
1968			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
1969			SOCKBUF_LOCK(&so->so_rcv);
1970		}
1971	}
1972	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1973	if (orig_resid == uio->uio_resid && orig_resid &&
1974	    (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
1975		SOCKBUF_UNLOCK(&so->so_rcv);
1976		goto restart;
1977	}
1978	SOCKBUF_UNLOCK(&so->so_rcv);
1979
1980	if (flagsp != NULL)
1981		*flagsp |= flags;
1982release:
1983	sbunlock(&so->so_rcv);
1984	return (error);
1985}
1986
1987/*
1988 * Optimized version of soreceive() for stream (TCP) sockets.
1989 * XXXAO: (MSG_WAITALL | MSG_PEEK) isn't properly handled.
1990 */
1991int
1992soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio,
1993    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1994{
1995	int len = 0, error = 0, flags, oresid;
1996	struct sockbuf *sb;
1997	struct mbuf *m, *n = NULL;
1998
1999	/* We only do stream sockets. */
2000	if (so->so_type != SOCK_STREAM)
2001		return (EINVAL);
2002	if (psa != NULL)
2003		*psa = NULL;
2004	if (controlp != NULL)
2005		return (EINVAL);
2006	if (flagsp != NULL)
2007		flags = *flagsp &~ MSG_EOR;
2008	else
2009		flags = 0;
2010	if (flags & MSG_OOB)
2011		return (soreceive_rcvoob(so, uio, flags));
2012	if (mp0 != NULL)
2013		*mp0 = NULL;
2014
2015	sb = &so->so_rcv;
2016
2017	/* Prevent other readers from entering the socket. */
2018	error = sblock(sb, SBLOCKWAIT(flags));
2019	if (error)
2020		goto out;
2021	SOCKBUF_LOCK(sb);
2022
2023	/* Easy one, no space to copyout anything. */
2024	if (uio->uio_resid == 0) {
2025		error = EINVAL;
2026		goto out;
2027	}
2028	oresid = uio->uio_resid;
2029
2030	/* We will never ever get anything unless we are or were connected. */
2031	if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
2032		error = ENOTCONN;
2033		goto out;
2034	}
2035
2036restart:
2037	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2038
2039	/* Abort if socket has reported problems. */
2040	if (so->so_error) {
2041		if (sb->sb_cc > 0)
2042			goto deliver;
2043		if (oresid > uio->uio_resid)
2044			goto out;
2045		error = so->so_error;
2046		if (!(flags & MSG_PEEK))
2047			so->so_error = 0;
2048		goto out;
2049	}
2050
2051	/* Door is closed.  Deliver what is left, if any. */
2052	if (sb->sb_state & SBS_CANTRCVMORE) {
2053		if (sb->sb_cc > 0)
2054			goto deliver;
2055		else
2056			goto out;
2057	}
2058
2059	/* Socket buffer is empty and we shall not block. */
2060	if (sb->sb_cc == 0 &&
2061	    ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
2062		error = EAGAIN;
2063		goto out;
2064	}
2065
2066	/* Socket buffer got some data that we shall deliver now. */
2067	if (sb->sb_cc > 0 && !(flags & MSG_WAITALL) &&
2068	    ((so->so_state & SS_NBIO) ||
2069	     (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
2070	     sb->sb_cc >= sb->sb_lowat ||
2071	     sb->sb_cc >= uio->uio_resid ||
2072	     sb->sb_cc >= sb->sb_hiwat)) {
2073		goto deliver;
2074	}
2075
2076	/* On MSG_WAITALL we must wait until all data or error arrives. */
2077	if ((flags & MSG_WAITALL) &&
2078	    (sb->sb_cc >= uio->uio_resid || sb->sb_cc >= sb->sb_hiwat))
2079		goto deliver;
2080
2081	/*
2082	 * Wait and block until (more) data comes in.
2083	 * NB: Drops the sockbuf lock during wait.
2084	 */
2085	error = sbwait(sb);
2086	if (error)
2087		goto out;
2088	goto restart;
2089
2090deliver:
2091	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2092	KASSERT(sb->sb_cc > 0, ("%s: sockbuf empty", __func__));
2093	KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));
2094
2095	/* Statistics. */
2096	if (uio->uio_td)
2097		uio->uio_td->td_ru.ru_msgrcv++;
2098
2099	/* Fill uio until full or current end of socket buffer is reached. */
2100	len = min(uio->uio_resid, sb->sb_cc);
2101	if (mp0 != NULL) {
2102		/* Dequeue as many mbufs as possible. */
2103		if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
2104			if (*mp0 == NULL)
2105				*mp0 = sb->sb_mb;
2106			else
2107				m_cat(*mp0, sb->sb_mb);
2108			for (m = sb->sb_mb;
2109			     m != NULL && m->m_len <= len;
2110			     m = m->m_next) {
2111				len -= m->m_len;
2112				uio->uio_resid -= m->m_len;
2113				sbfree(sb, m);
2114				n = m;
2115			}
2116			n->m_next = NULL;
2117			sb->sb_mb = m;
2118			sb->sb_lastrecord = sb->sb_mb;
2119			if (sb->sb_mb == NULL)
2120				SB_EMPTY_FIXUP(sb);
2121		}
2122		/* Copy the remainder. */
2123		if (len > 0) {
2124			KASSERT(sb->sb_mb != NULL,
2125			    ("%s: len > 0 && sb->sb_mb empty", __func__));
2126
2127			m = m_copym(sb->sb_mb, 0, len, M_NOWAIT);
2128			if (m == NULL)
2129				len = 0;	/* Don't flush data from sockbuf. */
2130			else
2131				uio->uio_resid -= len;
2132			if (*mp0 != NULL)
2133				m_cat(*mp0, m);
2134			else
2135				*mp0 = m;
2136			if (*mp0 == NULL) {
2137				error = ENOBUFS;
2138				goto out;
2139			}
2140		}
2141	} else {
2142		/* NB: Must unlock socket buffer as uiomove may sleep. */
2143		SOCKBUF_UNLOCK(sb);
2144		error = m_mbuftouio(uio, sb->sb_mb, len);
2145		SOCKBUF_LOCK(sb);
2146		if (error)
2147			goto out;
2148	}
2149	SBLASTRECORDCHK(sb);
2150	SBLASTMBUFCHK(sb);
2151
2152	/*
2153	 * Remove the delivered data from the socket buffer unless we
2154	 * were only peeking.
2155	 */
2156	if (!(flags & MSG_PEEK)) {
2157		if (len > 0)
2158			sbdrop_locked(sb, len);
2159
2160		/* Notify protocol that we drained some data. */
2161		if ((so->so_proto->pr_flags & PR_WANTRCVD) &&
2162		    (((flags & MSG_WAITALL) && uio->uio_resid > 0) ||
2163		     !(flags & MSG_SOCALLBCK))) {
2164			SOCKBUF_UNLOCK(sb);
2165			VNET_SO_ASSERT(so);
2166			(*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags);
2167			SOCKBUF_LOCK(sb);
2168		}
2169	}
2170
2171	/*
2172	 * For MSG_WAITALL we may have to loop again and wait for
2173	 * more data to come in.
2174	 */
2175	if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
2176		goto restart;
2177out:
2178	SOCKBUF_LOCK_ASSERT(sb);
2179	SBLASTRECORDCHK(sb);
2180	SBLASTMBUFCHK(sb);
2181	SOCKBUF_UNLOCK(sb);
2182	sbunlock(sb);
2183	return (error);
2184}
2185
2186/*
2187 * Optimized version of soreceive() for simple datagram cases from userspace.
2188 * Unlike in the stream case, we're able to drop a datagram if copyout()
2189 * fails, and because we handle datagrams atomically, we don't need to use a
2190 * sleep lock to prevent I/O interlacing.
2191 */
2192int
2193soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio,
2194    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2195{
2196	struct mbuf *m, *m2;
2197	int flags, error;
2198	ssize_t len;
2199	struct protosw *pr = so->so_proto;
2200	struct mbuf *nextrecord;
2201
2202	if (psa != NULL)
2203		*psa = NULL;
2204	if (controlp != NULL)
2205		*controlp = NULL;
2206	if (flagsp != NULL)
2207		flags = *flagsp &~ MSG_EOR;
2208	else
2209		flags = 0;
2210
2211	/*
2212	 * For any complicated cases, fall back to the full
2213	 * soreceive_generic().
2214	 */
2215	if (mp0 != NULL || (flags & MSG_PEEK) || (flags & MSG_OOB))
2216		return (soreceive_generic(so, psa, uio, mp0, controlp,
2217		    flagsp));
2218
2219	/*
2220	 * Enforce restrictions on use.
2221	 */
2222	KASSERT((pr->pr_flags & PR_WANTRCVD) == 0,
2223	    ("soreceive_dgram: wantrcvd"));
2224	KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic"));
2225	KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0,
2226	    ("soreceive_dgram: SBS_RCVATMARK"));
2227	KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0,
2228	    ("soreceive_dgram: PR_CONNREQUIRED"));
2229
2230	/*
2231	 * Loop blocking while waiting for a datagram.
2232	 */
2233	SOCKBUF_LOCK(&so->so_rcv);
2234	while ((m = so->so_rcv.sb_mb) == NULL) {
2235		KASSERT(so->so_rcv.sb_cc == 0,
2236		    ("soreceive_dgram: sb_mb NULL but sb_cc %u",
2237		    so->so_rcv.sb_cc));
2238		if (so->so_error) {
2239			error = so->so_error;
2240			so->so_error = 0;
2241			SOCKBUF_UNLOCK(&so->so_rcv);
2242			return (error);
2243		}
2244		if (so->so_rcv.sb_state & SBS_CANTRCVMORE ||
2245		    uio->uio_resid == 0) {
2246			SOCKBUF_UNLOCK(&so->so_rcv);
2247			return (0);
2248		}
2249		if ((so->so_state & SS_NBIO) ||
2250		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
2251			SOCKBUF_UNLOCK(&so->so_rcv);
2252			return (EWOULDBLOCK);
2253		}
2254		SBLASTRECORDCHK(&so->so_rcv);
2255		SBLASTMBUFCHK(&so->so_rcv);
2256		error = sbwait(&so->so_rcv);
2257		if (error) {
2258			SOCKBUF_UNLOCK(&so->so_rcv);
2259			return (error);
2260		}
2261	}
2262	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2263
2264	if (uio->uio_td)
2265		uio->uio_td->td_ru.ru_msgrcv++;
2266	SBLASTRECORDCHK(&so->so_rcv);
2267	SBLASTMBUFCHK(&so->so_rcv);
2268	nextrecord = m->m_nextpkt;
2269	if (nextrecord == NULL) {
2270		KASSERT(so->so_rcv.sb_lastrecord == m,
2271		    ("soreceive_dgram: lastrecord != m"));
2272	}
2273
2274	KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord,
2275	    ("soreceive_dgram: m_nextpkt != nextrecord"));
2276
2277	/*
2278	 * Pull 'm' and its chain off the front of the packet queue.
2279	 */
2280	so->so_rcv.sb_mb = NULL;
2281	sockbuf_pushsync(&so->so_rcv, nextrecord);
2282
2283	/*
2284	 * Walk 'm's chain, removing its bytes from the socket buffer accounting.
2285	 */
2286	for (m2 = m; m2 != NULL; m2 = m2->m_next)
2287		sbfree(&so->so_rcv, m2);
2288
2289	/*
2290	 * Do a few last checks before we let go of the lock.
2291	 */
2292	SBLASTRECORDCHK(&so->so_rcv);
2293	SBLASTMBUFCHK(&so->so_rcv);
2294	SOCKBUF_UNLOCK(&so->so_rcv);
2295
2296	if (pr->pr_flags & PR_ADDR) {
2297		KASSERT(m->m_type == MT_SONAME,
2298		    ("m->m_type == %d", m->m_type));
2299		if (psa != NULL)
2300			*psa = sodupsockaddr(mtod(m, struct sockaddr *),
2301			    M_NOWAIT);
2302		m = m_free(m);
2303	}
2304	if (m == NULL) {
2305		/* XXXRW: Can this happen? */
2306		return (0);
2307	}
2308
2309	/*
2310	 * Packet to copyout() is now in 'm' and it is disconnected from the
2311	 * queue.
2312	 *
2313	 * Process one or more MT_CONTROL mbufs present before any data mbufs
2314	 * in the first mbuf chain on the socket buffer.  We call into the
2315	 * protocol to perform externalization (or freeing if controlp ==
2316	 * NULL).
2317	 */
2318	if (m->m_type == MT_CONTROL) {
2319		struct mbuf *cm = NULL, *cmn;
2320		struct mbuf **cme = &cm;
2321
2322		do {
2323			m2 = m->m_next;
2324			m->m_next = NULL;
2325			*cme = m;
2326			cme = &(*cme)->m_next;
2327			m = m2;
2328		} while (m != NULL && m->m_type == MT_CONTROL);
2329		while (cm != NULL) {
2330			cmn = cm->m_next;
2331			cm->m_next = NULL;
2332			if (pr->pr_domain->dom_externalize != NULL) {
2333				error = (*pr->pr_domain->dom_externalize)
2334				    (cm, controlp);
2335			} else if (controlp != NULL)
2336				*controlp = cm;
2337			else
2338				m_freem(cm);
2339			if (controlp != NULL) {
2340				while (*controlp != NULL)
2341					controlp = &(*controlp)->m_next;
2342			}
2343			cm = cmn;
2344		}
2345	}
2346	KASSERT(m->m_type == MT_DATA, ("soreceive_dgram: !data"));
2347
2348	while (m != NULL && uio->uio_resid > 0) {
2349		len = uio->uio_resid;
2350		if (len > m->m_len)
2351			len = m->m_len;
2352		error = uiomove(mtod(m, char *), (int)len, uio);
2353		if (error) {
2354			m_freem(m);
2355			return (error);
2356		}
2357		if (len == m->m_len)
2358			m = m_free(m);
2359		else {
2360			m->m_data += len;
2361			m->m_len -= len;
2362		}
2363	}
2364	if (m != NULL)
2365		flags |= MSG_TRUNC;
2366	m_freem(m);
2367	if (flagsp != NULL)
2368		*flagsp |= flags;
2369	return (0);
2370}
2371
2372int
2373soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
2374    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2375{
2376	int error;
2377
2378	CURVNET_SET(so->so_vnet);
2379	error = (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio, mp0,
2380	    controlp, flagsp));
2381	CURVNET_RESTORE();
2382	return (error);
2383}
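
/*
 * Example (illustrative sketch, not part of this file): a kernel consumer
 * can receive into a kernel buffer by building a UIO_SYSSPACE uio and
 * calling soreceive().  'so', 'buf' and 'buflen' are assumed to be provided
 * by the caller; error handling is abbreviated.
 *
 *	struct uio auio;
 *	struct iovec aiov;
 *	int error, flags;
 *
 *	aiov.iov_base = buf;
 *	aiov.iov_len = buflen;
 *	auio.uio_iov = &aiov;
 *	auio.uio_iovcnt = 1;
 *	auio.uio_offset = 0;
 *	auio.uio_resid = buflen;
 *	auio.uio_segflg = UIO_SYSSPACE;
 *	auio.uio_rw = UIO_READ;
 *	auio.uio_td = curthread;
 *	flags = MSG_DONTWAIT;
 *	error = soreceive(so, NULL, &auio, NULL, NULL, &flags);
 *
 * On success, 'buflen - auio.uio_resid' bytes were copied into 'buf'.
 */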
2384
2385int
2386soshutdown(struct socket *so, int how)
2387{
2388	struct protosw *pr = so->so_proto;
2389	int error;
2390
2391	if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
2392		return (EINVAL);
2393
2394	CURVNET_SET(so->so_vnet);
2395	if (pr->pr_usrreqs->pru_flush != NULL)
2396		(*pr->pr_usrreqs->pru_flush)(so, how);
2397	if (how != SHUT_WR)
2398		sorflush(so);
2399	if (how != SHUT_RD) {
2400		error = (*pr->pr_usrreqs->pru_shutdown)(so);
2401		CURVNET_RESTORE();
2402		return (error);
2403	}
2404	CURVNET_RESTORE();
2405	return (0);
2406}
2407
2408void
2409sorflush(struct socket *so)
2410{
2411	struct sockbuf *sb = &so->so_rcv;
2412	struct protosw *pr = so->so_proto;
2413	struct sockbuf asb;
2414
2415	VNET_SO_ASSERT(so);
2416
2417	/*
2418	 * In order to avoid calling dom_dispose with the socket buffer mutex
2419	 * held, and in order to generally avoid holding the lock for a long
2420	 * time, we make a copy of the socket buffer and clear the original
2421	 * (except locks, state).  The new socket buffer copy won't have
2422	 * initialized locks so we can only call routines that won't use or
2423	 * assert those locks.
2424	 *
2425	 * Dislodge threads currently blocked in receive and wait to acquire
2426	 * a lock against other simultaneous readers before clearing the
2427	 * socket buffer.  Don't let our acquire be interrupted by a signal
2428	 * despite any existing socket disposition on interruptible waiting.
2429	 */
2430	socantrcvmore(so);
2431	(void) sblock(sb, SBL_WAIT | SBL_NOINTR);
2432
2433	/*
2434	 * Invalidate/clear most of the sockbuf structure, but leave selinfo
2435	 * and mutex data unchanged.
2436	 */
2437	SOCKBUF_LOCK(sb);
2438	bzero(&asb, offsetof(struct sockbuf, sb_startzero));
2439	bcopy(&sb->sb_startzero, &asb.sb_startzero,
2440	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
2441	bzero(&sb->sb_startzero,
2442	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
2443	SOCKBUF_UNLOCK(sb);
2444	sbunlock(sb);
2445
2446	/*
2447	 * Dispose of special rights and flush the socket buffer.  Don't call
2448	 * any unsafe routines (that rely on locks being initialized) on asb.
2449	 */
2450	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
2451		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
2452	sbrelease_internal(&asb, so);
2453}
2454
2455/*
2456 * Perhaps this routine, and sooptcopyout(), below, ought to come in an
2457 * additional variant to handle the case where the option value needs to be
2458 * some kind of integer, but not a specific size.  In addition to their use
2459 * here, these functions are also called by the protocol-level pr_ctloutput()
2460 * routines.
2461 */
2462int
2463sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
2464{
2465	size_t	valsize;
2466
2467	/*
2468	 * If the user gives us more than we wanted, we ignore it, but if we
2469	 * don't get the minimum length the caller wants, we return EINVAL.
2470	 * On success, sopt->sopt_valsize is set to however much we actually
2471	 * retrieved.
2472	 */
2473	if ((valsize = sopt->sopt_valsize) < minlen)
2474		return EINVAL;
2475	if (valsize > len)
2476		sopt->sopt_valsize = valsize = len;
2477
2478	if (sopt->sopt_td != NULL)
2479		return (copyin(sopt->sopt_val, buf, valsize));
2480
2481	bcopy(sopt->sopt_val, buf, valsize);
2482	return (0);
2483}
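
/*
 * Example (illustrative sketch): a protocol-level pr_ctloutput() handler
 * for a hypothetical integer option, using sooptcopyin() to fetch the
 * value supplied by the caller.  The option name 'EXAMPLE_OPTION' and the
 * pcb field are invented for illustration only.
 *
 *	int optval, error;
 *
 *	switch (sopt->sopt_name) {
 *	case EXAMPLE_OPTION:
 *		error = sooptcopyin(sopt, &optval, sizeof(optval),
 *		    sizeof(optval));
 *		if (error)
 *			break;
 *		pcb->pcb_example = optval;
 *		break;
 *	}
 *	return (error);
 */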
2484
2485/*
2486 * Kernel version of setsockopt(2).
2487 *
2488 * XXX: optlen is size_t, not socklen_t
2489 */
2490int
2491so_setsockopt(struct socket *so, int level, int optname, void *optval,
2492    size_t optlen)
2493{
2494	struct sockopt sopt;
2495
2496	sopt.sopt_level = level;
2497	sopt.sopt_name = optname;
2498	sopt.sopt_dir = SOPT_SET;
2499	sopt.sopt_val = optval;
2500	sopt.sopt_valsize = optlen;
2501	sopt.sopt_td = NULL;
2502	return (sosetopt(so, &sopt));
2503}
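
/*
 * Example (illustrative sketch): an in-kernel consumer enlarging the
 * receive buffer of a socket it owns; 'so' is assumed to be a valid,
 * open socket.
 *
 *	int optval = 65536;
 *	int error;
 *
 *	error = so_setsockopt(so, SOL_SOCKET, SO_RCVBUF, &optval,
 *	    sizeof(optval));
 */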
2504
2505int
2506sosetopt(struct socket *so, struct sockopt *sopt)
2507{
2508	int	error, optval;
2509	struct	linger l;
2510	struct	timeval tv;
2511	u_long  val;
2512	uint32_t val32;
2513#ifdef MAC
2514	struct mac extmac;
2515#endif
2516
2517	CURVNET_SET(so->so_vnet);
2518	error = 0;
2519	if (sopt->sopt_level != SOL_SOCKET) {
2520		if (so->so_proto->pr_ctloutput != NULL) {
2521			error = (*so->so_proto->pr_ctloutput)(so, sopt);
2522			CURVNET_RESTORE();
2523			return (error);
2524		}
2525		error = ENOPROTOOPT;
2526	} else {
2527		switch (sopt->sopt_name) {
2528#ifdef INET
2529		case SO_ACCEPTFILTER:
2530			error = do_setopt_accept_filter(so, sopt);
2531			if (error)
2532				goto bad;
2533			break;
2534#endif
2535		case SO_LINGER:
2536			error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
2537			if (error)
2538				goto bad;
2539
2540			SOCK_LOCK(so);
2541			so->so_linger = l.l_linger;
2542			if (l.l_onoff)
2543				so->so_options |= SO_LINGER;
2544			else
2545				so->so_options &= ~SO_LINGER;
2546			SOCK_UNLOCK(so);
2547			break;
2548
2549		case SO_DEBUG:
2550		case SO_KEEPALIVE:
2551		case SO_DONTROUTE:
2552		case SO_USELOOPBACK:
2553		case SO_BROADCAST:
2554		case SO_REUSEADDR:
2555		case SO_REUSEPORT:
2556		case SO_OOBINLINE:
2557		case SO_TIMESTAMP:
2558		case SO_BINTIME:
2559		case SO_NOSIGPIPE:
2560		case SO_NO_DDP:
2561		case SO_NO_OFFLOAD:
2562			error = sooptcopyin(sopt, &optval, sizeof optval,
2563			    sizeof optval);
2564			if (error)
2565				goto bad;
2566			SOCK_LOCK(so);
2567			if (optval)
2568				so->so_options |= sopt->sopt_name;
2569			else
2570				so->so_options &= ~sopt->sopt_name;
2571			SOCK_UNLOCK(so);
2572			break;
2573
2574		case SO_SETFIB:
2575			error = sooptcopyin(sopt, &optval, sizeof optval,
2576			    sizeof optval);
2577			if (error)
2578				goto bad;
2579
2580			if (optval < 0 || optval >= rt_numfibs) {
2581				error = EINVAL;
2582				goto bad;
2583			}
2584			if (((so->so_proto->pr_domain->dom_family == PF_INET) ||
2585			   (so->so_proto->pr_domain->dom_family == PF_INET6) ||
2586			   (so->so_proto->pr_domain->dom_family == PF_ROUTE)))
2587				so->so_fibnum = optval;
2588			else
2589				so->so_fibnum = 0;
2590			break;
2591
2592		case SO_USER_COOKIE:
2593			error = sooptcopyin(sopt, &val32, sizeof val32,
2594			    sizeof val32);
2595			if (error)
2596				goto bad;
2597			so->so_user_cookie = val32;
2598			break;
2599
2600		case SO_SNDBUF:
2601		case SO_RCVBUF:
2602		case SO_SNDLOWAT:
2603		case SO_RCVLOWAT:
2604			error = sooptcopyin(sopt, &optval, sizeof optval,
2605			    sizeof optval);
2606			if (error)
2607				goto bad;
2608
2609			/*
2610			 * Values < 1 make no sense for any of these options,
2611			 * so disallow them.
2612			 */
2613			if (optval < 1) {
2614				error = EINVAL;
2615				goto bad;
2616			}
2617
2618			switch (sopt->sopt_name) {
2619			case SO_SNDBUF:
2620			case SO_RCVBUF:
2621				if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
2622				    &so->so_snd : &so->so_rcv, (u_long)optval,
2623				    so, curthread) == 0) {
2624					error = ENOBUFS;
2625					goto bad;
2626				}
2627				(sopt->sopt_name == SO_SNDBUF ? &so->so_snd :
2628				    &so->so_rcv)->sb_flags &= ~SB_AUTOSIZE;
2629				break;
2630
2631			/*
2632			 * Make sure the low-water is never greater than the
2633			 * high-water.
2634			 */
2635			case SO_SNDLOWAT:
2636				SOCKBUF_LOCK(&so->so_snd);
2637				so->so_snd.sb_lowat =
2638				    (optval > so->so_snd.sb_hiwat) ?
2639				    so->so_snd.sb_hiwat : optval;
2640				SOCKBUF_UNLOCK(&so->so_snd);
2641				break;
2642			case SO_RCVLOWAT:
2643				SOCKBUF_LOCK(&so->so_rcv);
2644				so->so_rcv.sb_lowat =
2645				    (optval > so->so_rcv.sb_hiwat) ?
2646				    so->so_rcv.sb_hiwat : optval;
2647				SOCKBUF_UNLOCK(&so->so_rcv);
2648				break;
2649			}
2650			break;
2651
2652		case SO_SNDTIMEO:
2653		case SO_RCVTIMEO:
2654#ifdef COMPAT_FREEBSD32
2655			if (SV_CURPROC_FLAG(SV_ILP32)) {
2656				struct timeval32 tv32;
2657
2658				error = sooptcopyin(sopt, &tv32, sizeof tv32,
2659				    sizeof tv32);
2660				CP(tv32, tv, tv_sec);
2661				CP(tv32, tv, tv_usec);
2662			} else
2663#endif
2664				error = sooptcopyin(sopt, &tv, sizeof tv,
2665				    sizeof tv);
2666			if (error)
2667				goto bad;
2668
2669			/* assert(hz > 0); */
2670			if (tv.tv_sec < 0 || tv.tv_sec > INT_MAX / hz ||
2671			    tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
2672				error = EDOM;
2673				goto bad;
2674			}
2675			/* assert(tick > 0); */
2676			/* assert(ULONG_MAX - INT_MAX >= 1000000); */
2677			val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick;
2678			if (val > INT_MAX) {
2679				error = EDOM;
2680				goto bad;
2681			}
2682			if (val == 0 && tv.tv_usec != 0)
2683				val = 1;
2684
2685			switch (sopt->sopt_name) {
2686			case SO_SNDTIMEO:
2687				so->so_snd.sb_timeo = val;
2688				break;
2689			case SO_RCVTIMEO:
2690				so->so_rcv.sb_timeo = val;
2691				break;
2692			}
2693			break;
2694
2695		case SO_LABEL:
2696#ifdef MAC
2697			error = sooptcopyin(sopt, &extmac, sizeof extmac,
2698			    sizeof extmac);
2699			if (error)
2700				goto bad;
2701			error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
2702			    so, &extmac);
2703#else
2704			error = EOPNOTSUPP;
2705#endif
2706			break;
2707
2708		default:
2709			error = ENOPROTOOPT;
2710			break;
2711		}
2712		if (error == 0 && so->so_proto->pr_ctloutput != NULL)
2713			(void)(*so->so_proto->pr_ctloutput)(so, sopt);
2714	}
2715bad:
2716	CURVNET_RESTORE();
2717	return (error);
2718}
2719
2720/*
2721 * Helper routine for getsockopt.
2722 */
2723int
2724sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
2725{
2726	int	error;
2727	size_t	valsize;
2728
2729	error = 0;
2730
2731	/*
2732	 * Documented get behavior is that we always return a value, possibly
2733	 * truncated to fit in the user's buffer.  Traditional behavior is
2734	 * that we always tell the user precisely how much we copied, rather
2735	 * than something useful like the total amount we had available for
2736	 * her.  Note that this interface is not idempotent; the entire
2737	 * answer must generated ahead of time.
2738	 * answer must be generated ahead of time.
2739	valsize = min(len, sopt->sopt_valsize);
2740	sopt->sopt_valsize = valsize;
2741	if (sopt->sopt_val != NULL) {
2742		if (sopt->sopt_td != NULL)
2743			error = copyout(buf, sopt->sopt_val, valsize);
2744		else
2745			bcopy(buf, sopt->sopt_val, valsize);
2746	}
2747	return (error);
2748}
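
/*
 * Example (illustrative sketch): the SOPT_GET side of the hypothetical
 * integer option above, returning the value with sooptcopyout().  The
 * option name and pcb field are invented for illustration only.
 *
 *	int optval, error;
 *
 *	switch (sopt->sopt_name) {
 *	case EXAMPLE_OPTION:
 *		optval = pcb->pcb_example;
 *		error = sooptcopyout(sopt, &optval, sizeof(optval));
 *		break;
 *	}
 *	return (error);
 */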
2749
2750int
2751sogetopt(struct socket *so, struct sockopt *sopt)
2752{
2753	int	error, optval;
2754	struct	linger l;
2755	struct	timeval tv;
2756#ifdef MAC
2757	struct mac extmac;
2758#endif
2759
2760	CURVNET_SET(so->so_vnet);
2761	error = 0;
2762	if (sopt->sopt_level != SOL_SOCKET) {
2763		if (so->so_proto->pr_ctloutput != NULL)
2764			error = (*so->so_proto->pr_ctloutput)(so, sopt);
2765		else
2766			error = ENOPROTOOPT;
2767		CURVNET_RESTORE();
2768		return (error);
2769	} else {
2770		switch (sopt->sopt_name) {
2771#ifdef INET
2772		case SO_ACCEPTFILTER:
2773			error = do_getopt_accept_filter(so, sopt);
2774			break;
2775#endif
2776		case SO_LINGER:
2777			SOCK_LOCK(so);
2778			l.l_onoff = so->so_options & SO_LINGER;
2779			l.l_linger = so->so_linger;
2780			SOCK_UNLOCK(so);
2781			error = sooptcopyout(sopt, &l, sizeof l);
2782			break;
2783
2784		case SO_USELOOPBACK:
2785		case SO_DONTROUTE:
2786		case SO_DEBUG:
2787		case SO_KEEPALIVE:
2788		case SO_REUSEADDR:
2789		case SO_REUSEPORT:
2790		case SO_BROADCAST:
2791		case SO_OOBINLINE:
2792		case SO_ACCEPTCONN:
2793		case SO_TIMESTAMP:
2794		case SO_BINTIME:
2795		case SO_NOSIGPIPE:
2796			optval = so->so_options & sopt->sopt_name;
2797integer:
2798			error = sooptcopyout(sopt, &optval, sizeof optval);
2799			break;
2800
2801		case SO_TYPE:
2802			optval = so->so_type;
2803			goto integer;
2804
2805		case SO_PROTOCOL:
2806			optval = so->so_proto->pr_protocol;
2807			goto integer;
2808
2809		case SO_ERROR:
2810			SOCK_LOCK(so);
2811			optval = so->so_error;
2812			so->so_error = 0;
2813			SOCK_UNLOCK(so);
2814			goto integer;
2815
2816		case SO_SNDBUF:
2817			optval = so->so_snd.sb_hiwat;
2818			goto integer;
2819
2820		case SO_RCVBUF:
2821			optval = so->so_rcv.sb_hiwat;
2822			goto integer;
2823
2824		case SO_SNDLOWAT:
2825			optval = so->so_snd.sb_lowat;
2826			goto integer;
2827
2828		case SO_RCVLOWAT:
2829			optval = so->so_rcv.sb_lowat;
2830			goto integer;
2831
2832		case SO_SNDTIMEO:
2833		case SO_RCVTIMEO:
2834			optval = (sopt->sopt_name == SO_SNDTIMEO ?
2835				  so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
2836
2837			tv.tv_sec = optval / hz;
2838			tv.tv_usec = (optval % hz) * tick;
2839#ifdef COMPAT_FREEBSD32
2840			if (SV_CURPROC_FLAG(SV_ILP32)) {
2841				struct timeval32 tv32;
2842
2843				CP(tv, tv32, tv_sec);
2844				CP(tv, tv32, tv_usec);
2845				error = sooptcopyout(sopt, &tv32, sizeof tv32);
2846			} else
2847#endif
2848				error = sooptcopyout(sopt, &tv, sizeof tv);
2849			break;
2850
2851		case SO_LABEL:
2852#ifdef MAC
2853			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2854			    sizeof(extmac));
2855			if (error)
2856				goto bad;
2857			error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
2858			    so, &extmac);
2859			if (error)
2860				goto bad;
2861			error = sooptcopyout(sopt, &extmac, sizeof extmac);
2862#else
2863			error = EOPNOTSUPP;
2864#endif
2865			break;
2866
2867		case SO_PEERLABEL:
2868#ifdef MAC
2869			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2870			    sizeof(extmac));
2871			if (error)
2872				goto bad;
2873			error = mac_getsockopt_peerlabel(
2874			    sopt->sopt_td->td_ucred, so, &extmac);
2875			if (error)
2876				goto bad;
2877			error = sooptcopyout(sopt, &extmac, sizeof extmac);
2878#else
2879			error = EOPNOTSUPP;
2880#endif
2881			break;
2882
2883		case SO_LISTENQLIMIT:
2884			optval = so->so_qlimit;
2885			goto integer;
2886
2887		case SO_LISTENQLEN:
2888			optval = so->so_qlen;
2889			goto integer;
2890
2891		case SO_LISTENINCQLEN:
2892			optval = so->so_incqlen;
2893			goto integer;
2894
2895		default:
2896			error = ENOPROTOOPT;
2897			break;
2898		}
2899	}
2900#ifdef MAC
2901bad:
2902#endif
2903	CURVNET_RESTORE();
2904	return (error);
2905}
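
/*
 * Example (illustrative sketch): querying an option from within the kernel
 * by filling in a struct sockopt directly, the SOPT_GET counterpart of
 * so_setsockopt() above.  'so' is assumed to be a valid socket.
 *
 *	struct sockopt sopt;
 *	int optval, error;
 *
 *	sopt.sopt_dir = SOPT_GET;
 *	sopt.sopt_level = SOL_SOCKET;
 *	sopt.sopt_name = SO_ERROR;
 *	sopt.sopt_val = &optval;
 *	sopt.sopt_valsize = sizeof(optval);
 *	sopt.sopt_td = NULL;
 *	error = sogetopt(so, &sopt);
 */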
2906
2907int
2908soopt_getm(struct sockopt *sopt, struct mbuf **mp)
2909{
2910	struct mbuf *m, *m_prev;
2911	int sopt_size = sopt->sopt_valsize;
2912
2913	MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA);
2914	if (m == NULL)
2915		return ENOBUFS;
2916	if (sopt_size > MLEN) {
2917		MCLGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT);
2918		if ((m->m_flags & M_EXT) == 0) {
2919			m_free(m);
2920			return ENOBUFS;
2921		}
2922		m->m_len = min(MCLBYTES, sopt_size);
2923	} else {
2924		m->m_len = min(MLEN, sopt_size);
2925	}
2926	sopt_size -= m->m_len;
2927	*mp = m;
2928	m_prev = m;
2929
2930	while (sopt_size) {
2931		MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA);
2932		if (m == NULL) {
2933			m_freem(*mp);
2934			return ENOBUFS;
2935		}
2936		if (sopt_size > MLEN) {
2937			MCLGET(m, sopt->sopt_td != NULL ? M_WAITOK :
2938			    M_NOWAIT);
2939			if ((m->m_flags & M_EXT) == 0) {
2940				m_freem(m);
2941				m_freem(*mp);
2942				return ENOBUFS;
2943			}
2944			m->m_len = min(MCLBYTES, sopt_size);
2945		} else {
2946			m->m_len = min(MLEN, sopt_size);
2947		}
2948		sopt_size -= m->m_len;
2949		m_prev->m_next = m;
2950		m_prev = m;
2951	}
2952	return (0);
2953}
2954
2955int
2956soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
2957{
2958	struct mbuf *m0 = m;
2959
2960	if (sopt->sopt_val == NULL)
2961		return (0);
2962	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2963		if (sopt->sopt_td != NULL) {
2964			int error;
2965
2966			error = copyin(sopt->sopt_val, mtod(m, char *),
2967			    m->m_len);
2968			if (error != 0) {
2969				m_freem(m0);
2970				return(error);
2971			}
2972		} else
2973			bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
2974		sopt->sopt_valsize -= m->m_len;
2975		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2976		m = m->m_next;
2977	}
2978	if (m != NULL) /* chain should have been allocated large enough */
2979		panic("ip6_sooptmcopyin");
2980	return (0);
2981}
2982
2983int
2984soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
2985{
2986	struct mbuf *m0 = m;
2987	size_t valsize = 0;
2988
2989	if (sopt->sopt_val == NULL)
2990		return (0);
2991	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2992		if (sopt->sopt_td != NULL) {
2993			int error;
2994
2995			error = copyout(mtod(m, char *), sopt->sopt_val,
2996			    m->m_len);
2997			if (error != 0) {
2998				m_freem(m0);
2999				return(error);
3000			}
3001		} else
3002			bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
3003		sopt->sopt_valsize -= m->m_len;
3004		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
3005		valsize += m->m_len;
3006		m = m->m_next;
3007	}
3008	if (m != NULL) {
3009		/* caller should have supplied a large enough buffer */
3010		m_freem(m0);
3011		return(EINVAL);
3012	}
3013	sopt->sopt_valsize = valsize;
3014	return (0);
3015}
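
/*
 * Example (illustrative sketch): soopt_getm() and soopt_mcopyin() are
 * normally used together by option code that wants the option value in an
 * mbuf chain rather than a flat buffer; soopt_getm() sizes the chain from
 * sopt->sopt_valsize and soopt_mcopyin() fills it from sopt->sopt_val,
 * freeing the chain itself on copyin failure.
 *
 *	struct mbuf *m = NULL;
 *	int error;
 *
 *	error = soopt_getm(sopt, &m);
 *	if (error == 0)
 *		error = soopt_mcopyin(sopt, m);
 */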
3016
3017/*
3018 * sohasoutofband(): protocol notifies socket layer of the arrival of new
3019 * out-of-band data, which will then notify socket consumers.
3020 */
3021void
3022sohasoutofband(struct socket *so)
3023{
3024
3025	if (so->so_sigio != NULL)
3026		pgsigio(&so->so_sigio, SIGURG, 0);
3027	selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
3028}
3029
3030int
3031sopoll(struct socket *so, int events, struct ucred *active_cred,
3032    struct thread *td)
3033{
3034
3035	/*
3036	 * We do not need to set or assert curvnet as long as everyone uses
3037	 * sopoll_generic().
3038	 */
3039	return (so->so_proto->pr_usrreqs->pru_sopoll(so, events, active_cred,
3040	    td));
3041}
3042
3043int
3044sopoll_generic(struct socket *so, int events, struct ucred *active_cred,
3045    struct thread *td)
3046{
3047	int revents = 0;
3048
3049	SOCKBUF_LOCK(&so->so_snd);
3050	SOCKBUF_LOCK(&so->so_rcv);
3051	if (events & (POLLIN | POLLRDNORM))
3052		if (soreadabledata(so))
3053			revents |= events & (POLLIN | POLLRDNORM);
3054
3055	if (events & (POLLOUT | POLLWRNORM))
3056		if (sowriteable(so))
3057			revents |= events & (POLLOUT | POLLWRNORM);
3058
3059	if (events & (POLLPRI | POLLRDBAND))
3060		if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK))
3061			revents |= events & (POLLPRI | POLLRDBAND);
3062
3063	if ((events & POLLINIGNEOF) == 0) {
3064		if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
3065			revents |= events & (POLLIN | POLLRDNORM);
3066			if (so->so_snd.sb_state & SBS_CANTSENDMORE)
3067				revents |= POLLHUP;
3068		}
3069	}
3070
3071	if (revents == 0) {
3072		if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
3073			selrecord(td, &so->so_rcv.sb_sel);
3074			so->so_rcv.sb_flags |= SB_SEL;
3075		}
3076
3077		if (events & (POLLOUT | POLLWRNORM)) {
3078			selrecord(td, &so->so_snd.sb_sel);
3079			so->so_snd.sb_flags |= SB_SEL;
3080		}
3081	}
3082
3083	SOCKBUF_UNLOCK(&so->so_rcv);
3084	SOCKBUF_UNLOCK(&so->so_snd);
3085	return (revents);
3086}
3087
3088int
3089soo_kqfilter(struct file *fp, struct knote *kn)
3090{
3091	struct socket *so = kn->kn_fp->f_data;
3092	struct sockbuf *sb;
3093
3094	switch (kn->kn_filter) {
3095	case EVFILT_READ:
3096		if (so->so_options & SO_ACCEPTCONN)
3097			kn->kn_fop = &solisten_filtops;
3098		else
3099			kn->kn_fop = &soread_filtops;
3100		sb = &so->so_rcv;
3101		break;
3102	case EVFILT_WRITE:
3103		kn->kn_fop = &sowrite_filtops;
3104		sb = &so->so_snd;
3105		break;
3106	default:
3107		return (EINVAL);
3108	}
3109
3110	SOCKBUF_LOCK(sb);
3111	knlist_add(&sb->sb_sel.si_note, kn, 1);
3112	sb->sb_flags |= SB_KNOTE;
3113	SOCKBUF_UNLOCK(sb);
3114	return (0);
3115}
3116
3117/*
3118 * Some routines that return EOPNOTSUPP for entry points that are not
3119 * supported by a protocol.  Fill in as needed.
3120 */
3121int
3122pru_accept_notsupp(struct socket *so, struct sockaddr **nam)
3123{
3124
3125	return EOPNOTSUPP;
3126}
3127
3128int
3129pru_attach_notsupp(struct socket *so, int proto, struct thread *td)
3130{
3131
3132	return EOPNOTSUPP;
3133}
3134
3135int
3136pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
3137{
3138
3139	return EOPNOTSUPP;
3140}
3141
3142int
3143pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
3144{
3145
3146	return EOPNOTSUPP;
3147}
3148
3149int
3150pru_connect2_notsupp(struct socket *so1, struct socket *so2)
3151{
3152
3153	return EOPNOTSUPP;
3154}
3155
3156int
3157pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data,
3158    struct ifnet *ifp, struct thread *td)
3159{
3160
3161	return EOPNOTSUPP;
3162}
3163
3164int
3165pru_disconnect_notsupp(struct socket *so)
3166{
3167
3168	return EOPNOTSUPP;
3169}
3170
3171int
3172pru_listen_notsupp(struct socket *so, int backlog, struct thread *td)
3173{
3174
3175	return EOPNOTSUPP;
3176}
3177
3178int
3179pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam)
3180{
3181
3182	return EOPNOTSUPP;
3183}
3184
3185int
3186pru_rcvd_notsupp(struct socket *so, int flags)
3187{
3188
3189	return EOPNOTSUPP;
3190}
3191
3192int
3193pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags)
3194{
3195
3196	return EOPNOTSUPP;
3197}
3198
3199int
3200pru_send_notsupp(struct socket *so, int flags, struct mbuf *m,
3201    struct sockaddr *addr, struct mbuf *control, struct thread *td)
3202{
3203
3204	return EOPNOTSUPP;
3205}
3206
3207/*
3208 * This isn't really a ``null'' operation, but it's the default one and
3209 * doesn't do anything destructive.
3210 */
3211int
3212pru_sense_null(struct socket *so, struct stat *sb)
3213{
3214
3215	sb->st_blksize = so->so_snd.sb_hiwat;
3216	return 0;
3217}
3218
3219int
3220pru_shutdown_notsupp(struct socket *so)
3221{
3222
3223	return EOPNOTSUPP;
3224}
3225
3226int
3227pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam)
3228{
3229
3230	return EOPNOTSUPP;
3231}
3232
3233int
3234pru_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio,
3235    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
3236{
3237
3238	return EOPNOTSUPP;
3239}
3240
3241int
3242pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr,
3243    struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
3244{
3245
3246	return EOPNOTSUPP;
3247}
3248
3249int
3250pru_sopoll_notsupp(struct socket *so, int events, struct ucred *cred,
3251    struct thread *td)
3252{
3253
3254	return EOPNOTSUPP;
3255}
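
/*
 * Example (illustrative sketch): a minimal protocol can point the entries
 * it does not implement at the stubs above when initializing its
 * pr_usrreqs.  The structure and handler names here are invented.
 *
 *	static struct pr_usrreqs example_usrreqs = {
 *		.pru_attach =		example_attach,
 *		.pru_detach =		example_detach,
 *		.pru_send =		example_send,
 *		.pru_accept =		pru_accept_notsupp,
 *		.pru_connect =		pru_connect_notsupp,
 *		.pru_listen =		pru_listen_notsupp,
 *		.pru_rcvoob =		pru_rcvoob_notsupp,
 *		.pru_sense =		pru_sense_null,
 *	};
 */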
3256
3257static void
3258filt_sordetach(struct knote *kn)
3259{
3260	struct socket *so = kn->kn_fp->f_data;
3261
3262	SOCKBUF_LOCK(&so->so_rcv);
3263	knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1);
3264	if (knlist_empty(&so->so_rcv.sb_sel.si_note))
3265		so->so_rcv.sb_flags &= ~SB_KNOTE;
3266	SOCKBUF_UNLOCK(&so->so_rcv);
3267}
3268
3269/*ARGSUSED*/
3270static int
3271filt_soread(struct knote *kn, long hint)
3272{
3273	struct socket *so;
3274
3275	so = kn->kn_fp->f_data;
3276	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
3277
3278	kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
3279	if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
3280		kn->kn_flags |= EV_EOF;
3281		kn->kn_fflags = so->so_error;
3282		return (1);
3283	} else if (so->so_error)	/* temporary udp error */
3284		return (1);
3285	else if (kn->kn_sfflags & NOTE_LOWAT)
3286		return (kn->kn_data >= kn->kn_sdata);
3287	else
3288		return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat);
3289}
3290
3291static void
3292filt_sowdetach(struct knote *kn)
3293{
3294	struct socket *so = kn->kn_fp->f_data;
3295
3296	SOCKBUF_LOCK(&so->so_snd);
3297	knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1);
3298	if (knlist_empty(&so->so_snd.sb_sel.si_note))
3299		so->so_snd.sb_flags &= ~SB_KNOTE;
3300	SOCKBUF_UNLOCK(&so->so_snd);
3301}
3302
3303/*ARGSUSED*/
3304static int
3305filt_sowrite(struct knote *kn, long hint)
3306{
3307	struct socket *so;
3308
3309	so = kn->kn_fp->f_data;
3310	SOCKBUF_LOCK_ASSERT(&so->so_snd);
3311	kn->kn_data = sbspace(&so->so_snd);
3312	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
3313		kn->kn_flags |= EV_EOF;
3314		kn->kn_fflags = so->so_error;
3315		return (1);
3316	} else if (so->so_error)	/* temporary udp error */
3317		return (1);
3318	else if (((so->so_state & SS_ISCONNECTED) == 0) &&
3319	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
3320		return (0);
3321	else if (kn->kn_sfflags & NOTE_LOWAT)
3322		return (kn->kn_data >= kn->kn_sdata);
3323	else
3324		return (kn->kn_data >= so->so_snd.sb_lowat);
3325}
3326
3327/*ARGSUSED*/
3328static int
3329filt_solisten(struct knote *kn, long hint)
3330{
3331	struct socket *so = kn->kn_fp->f_data;
3332
3333	kn->kn_data = so->so_qlen;
3334	return (!TAILQ_EMPTY(&so->so_comp));
3335}
3336
3337int
3338socheckuid(struct socket *so, uid_t uid)
3339{
3340
3341	if (so == NULL)
3342		return (EPERM);
3343	if (so->so_cred->cr_uid != uid)
3344		return (EPERM);
3345	return (0);
3346}
3347
3348/*
3349 * These functions are used by protocols to notify the socket layer (and its
3350 * consumers) of state changes in the sockets driven by protocol-side events.
3351 */
3352
3353/*
3354 * Procedures to manipulate state flags of socket and do appropriate wakeups.
3355 *
3356 * Normal sequence from the active (originating) side is that
3357 * soisconnecting() is called during processing of a connect() call,
3358 * resulting in an eventual call to soisconnected() if/when the connection
3359 * is established.  When the connection is torn down, soisdisconnecting() is
3360 * called during processing of a disconnect() call, and soisdisconnected() is
3361 * called when the connection to the peer is totally severed.  The semantics
3362 * of these routines are such that connectionless protocols can call
3363 * soisconnected() and soisdisconnected() only, bypassing the in-progress
3364 * calls when setting up a ``connection'' takes no time.
3365 *
3366 * From the passive side, a socket is created with two queues of sockets:
3367 * so_incomp for connections in progress and so_comp for connections already
3368 * made and awaiting user acceptance.  As a protocol is preparing incoming
3369 * connections, it creates a socket structure queued on so_incomp by calling
3370 * sonewconn().  When the connection is established, soisconnected() is
3371 * called, and transfers the socket structure to so_comp, making it available
3372 * to accept().
3373 *
3374 * If a socket is closed with sockets on either so_incomp or so_comp, these
3375 * sockets are dropped.
3376 *
3377 * If higher-level protocols are implemented in the kernel, the wakeups done
3378 * here will sometimes cause software-interrupt process scheduling.
3379 */
3380void
3381soisconnecting(struct socket *so)
3382{
3383
3384	SOCK_LOCK(so);
3385	so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
3386	so->so_state |= SS_ISCONNECTING;
3387	SOCK_UNLOCK(so);
3388}
3389
3390void
3391soisconnected(struct socket *so)
3392{
3393	struct socket *head;
3394	int ret;
3395
3396restart:
3397	ACCEPT_LOCK();
3398	SOCK_LOCK(so);
3399	so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
3400	so->so_state |= SS_ISCONNECTED;
3401	head = so->so_head;
3402	if (head != NULL && (so->so_qstate & SQ_INCOMP)) {
3403		if ((so->so_options & SO_ACCEPTFILTER) == 0) {
3404			SOCK_UNLOCK(so);
3405			TAILQ_REMOVE(&head->so_incomp, so, so_list);
3406			head->so_incqlen--;
3407			so->so_qstate &= ~SQ_INCOMP;
3408			TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
3409			head->so_qlen++;
3410			so->so_qstate |= SQ_COMP;
3411			ACCEPT_UNLOCK();
3412			sorwakeup(head);
3413			wakeup_one(&head->so_timeo);
3414		} else {
3415			ACCEPT_UNLOCK();
3416			soupcall_set(so, SO_RCV,
3417			    head->so_accf->so_accept_filter->accf_callback,
3418			    head->so_accf->so_accept_filter_arg);
3419			so->so_options &= ~SO_ACCEPTFILTER;
3420			ret = head->so_accf->so_accept_filter->accf_callback(so,
3421			    head->so_accf->so_accept_filter_arg, M_NOWAIT);
3422			if (ret == SU_ISCONNECTED)
3423				soupcall_clear(so, SO_RCV);
3424			SOCK_UNLOCK(so);
3425			if (ret == SU_ISCONNECTED)
3426				goto restart;
3427		}
3428		return;
3429	}
3430	SOCK_UNLOCK(so);
3431	ACCEPT_UNLOCK();
3432	wakeup(&so->so_timeo);
3433	sorwakeup(so);
3434	sowwakeup(so);
3435}
3436
3437void
3438soisdisconnecting(struct socket *so)
3439{
3440
3441	/*
3442	 * Note: This code assumes that SOCK_LOCK(so) and
3443	 * SOCKBUF_LOCK(&so->so_rcv) are the same.
3444	 */
3445	SOCKBUF_LOCK(&so->so_rcv);
3446	so->so_state &= ~SS_ISCONNECTING;
3447	so->so_state |= SS_ISDISCONNECTING;
3448	so->so_rcv.sb_state |= SBS_CANTRCVMORE;
3449	sorwakeup_locked(so);
3450	SOCKBUF_LOCK(&so->so_snd);
3451	so->so_snd.sb_state |= SBS_CANTSENDMORE;
3452	sowwakeup_locked(so);
3453	wakeup(&so->so_timeo);
3454}
3455
3456void
3457soisdisconnected(struct socket *so)
3458{
3459
3460	/*
3461	 * Note: This code assumes that SOCK_LOCK(so) and
3462	 * SOCKBUF_LOCK(&so->so_rcv) are the same.
3463	 */
3464	SOCKBUF_LOCK(&so->so_rcv);
3465	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
3466	so->so_state |= SS_ISDISCONNECTED;
3467	so->so_rcv.sb_state |= SBS_CANTRCVMORE;
3468	sorwakeup_locked(so);
3469	SOCKBUF_LOCK(&so->so_snd);
3470	so->so_snd.sb_state |= SBS_CANTSENDMORE;
3471	sbdrop_locked(&so->so_snd, so->so_snd.sb_cc);
3472	sowwakeup_locked(so);
3473	wakeup(&so->so_timeo);
3474}
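
/*
 * Example (illustrative sketch): the protocol-side call sequence described
 * above for a connection-oriented protocol; a connectionless protocol may
 * call only soisconnected() and soisdisconnected().
 *
 *	soisconnecting(so);	connection request sent to the peer
 *	...
 *	soisconnected(so);	handshake complete; accept()/connect() may return
 *	...
 *	soisdisconnecting(so);	teardown initiated
 *	...
 *	soisdisconnected(so);	connection fully severed
 */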
3475
3476/*
3477 * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
3478 */
3479struct sockaddr *
3480sodupsockaddr(const struct sockaddr *sa, int mflags)
3481{
3482	struct sockaddr *sa2;
3483
3484	sa2 = malloc(sa->sa_len, M_SONAME, mflags);
3485	if (sa2)
3486		bcopy(sa, sa2, sa->sa_len);
3487	return sa2;
3488}
3489
3490/*
3491 * Register per-socket buffer upcalls.
3492 */
3493void
3494soupcall_set(struct socket *so, int which,
3495    int (*func)(struct socket *, void *, int), void *arg)
3496{
3497	struct sockbuf *sb;
3498
3499	switch (which) {
3500	case SO_RCV:
3501		sb = &so->so_rcv;
3502		break;
3503	case SO_SND:
3504		sb = &so->so_snd;
3505		break;
3506	default:
3507		panic("soupcall_set: bad which");
3508	}
3509	SOCKBUF_LOCK_ASSERT(sb);
3510#if 0
3511	/* XXX: accf_http actually wants to do this on purpose. */
3512	KASSERT(sb->sb_upcall == NULL, ("soupcall_set: overwriting upcall"));
3513#endif
3514	sb->sb_upcall = func;
3515	sb->sb_upcallarg = arg;
3516	sb->sb_flags |= SB_UPCALL;
3517}
3518
3519void
3520soupcall_clear(struct socket *so, int which)
3521{
3522	struct sockbuf *sb;
3523
3524	switch (which) {
3525	case SO_RCV:
3526		sb = &so->so_rcv;
3527		break;
3528	case SO_SND:
3529		sb = &so->so_snd;
3530		break;
3531	default:
3532		panic("soupcall_clear: bad which");
3533	}
3534	SOCKBUF_LOCK_ASSERT(sb);
3535	KASSERT(sb->sb_upcall != NULL, ("soupcall_clear: no upcall to clear"));
3536	sb->sb_upcall = NULL;
3537	sb->sb_upcallarg = NULL;
3538	sb->sb_flags &= ~SB_UPCALL;
3539}
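
/*
 * Example (illustrative sketch): registering and later clearing a receive
 * upcall.  The socket buffer lock must be held around both calls; the
 * upcall function and its argument are invented here, and the upcall
 * returns SU_OK to keep the registration in place.
 *
 *	SOCKBUF_LOCK(&so->so_rcv);
 *	soupcall_set(so, SO_RCV, example_rcv_upcall, example_arg);
 *	SOCKBUF_UNLOCK(&so->so_rcv);
 *	...
 *	SOCKBUF_LOCK(&so->so_rcv);
 *	soupcall_clear(so, SO_RCV);
 *	SOCKBUF_UNLOCK(&so->so_rcv);
 */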
3540
3541/*
3542 * Create an external-format (``xsocket'') structure using the information in
3543 * the kernel-format socket structure pointed to by so.  This is done to
3544 * reduce the spew of irrelevant information over this interface, to isolate
3545 * user code from changes in the kernel structure, and potentially to provide
3546 * information-hiding if we decide that some of this information should be
3547 * hidden from users.
3548 */
3549void
3550sotoxsocket(struct socket *so, struct xsocket *xso)
3551{
3552
3553	xso->xso_len = sizeof *xso;
3554	xso->xso_so = so;
3555	xso->so_type = so->so_type;
3556	xso->so_options = so->so_options;
3557	xso->so_linger = so->so_linger;
3558	xso->so_state = so->so_state;
3559	xso->so_pcb = so->so_pcb;
3560	xso->xso_protocol = so->so_proto->pr_protocol;
3561	xso->xso_family = so->so_proto->pr_domain->dom_family;
3562	xso->so_qlen = so->so_qlen;
3563	xso->so_incqlen = so->so_incqlen;
3564	xso->so_qlimit = so->so_qlimit;
3565	xso->so_timeo = so->so_timeo;
3566	xso->so_error = so->so_error;
3567	xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0;
3568	xso->so_oobmark = so->so_oobmark;
3569	sbtoxsockbuf(&so->so_snd, &xso->so_snd);
3570	sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
3571	xso->so_uid = so->so_cred->cr_uid;
3572}
3573
3574
3575/*
3576 * Socket accessor functions to provide external consumers with
3577 * a safe interface to socket state.
3578 */
3580
3581void
3582so_listeners_apply_all(struct socket *so, void (*func)(struct socket *, void *),
3583    void *arg)
3584{
3585
3586	TAILQ_FOREACH(so, &so->so_comp, so_list)
3587		func(so, arg);
3588}
3589
3590struct sockbuf *
3591so_sockbuf_rcv(struct socket *so)
3592{
3593
3594	return (&so->so_rcv);
3595}
3596
3597struct sockbuf *
3598so_sockbuf_snd(struct socket *so)
3599{
3600
3601	return (&so->so_snd);
3602}
3603
3604int
3605so_state_get(const struct socket *so)
3606{
3607
3608	return (so->so_state);
3609}
3610
3611void
3612so_state_set(struct socket *so, int val)
3613{
3614
3615	so->so_state = val;
3616}
3617
3618int
3619so_options_get(const struct socket *so)
3620{
3621
3622	return (so->so_options);
3623}
3624
3625void
3626so_options_set(struct socket *so, int val)
3627{
3628
3629	so->so_options = val;
3630}
3631
3632int
3633so_error_get(const struct socket *so)
3634{
3635
3636	return (so->so_error);
3637}
3638
3639void
3640so_error_set(struct socket *so, int val)
3641{
3642
3643	so->so_error = val;
3644}
3645
3646int
3647so_linger_get(const struct socket *so)
3648{
3649
3650	return (so->so_linger);
3651}
3652
3653void
3654so_linger_set(struct socket *so, int val)
3655{
3656
3657	so->so_linger = val;
3658}
3659
3660struct protosw *
3661so_protosw_get(const struct socket *so)
3662{
3663
3664	return (so->so_proto);
3665}
3666
3667void
3668so_protosw_set(struct socket *so, struct protosw *val)
3669{
3670
3671	so->so_proto = val;
3672}
3673
3674void
3675so_sorwakeup(struct socket *so)
3676{
3677
3678	sorwakeup(so);
3679}
3680
3681void
3682so_sowwakeup(struct socket *so)
3683{
3684
3685	sowwakeup(so);
3686}
3687
3688void
3689so_sorwakeup_locked(struct socket *so)
3690{
3691
3692	sorwakeup_locked(so);
3693}
3694
3695void
3696so_sowwakeup_locked(struct socket *so)
3697{
3698
3699	sowwakeup_locked(so);
3700}
3701
3702void
3703so_lock(struct socket *so)
3704{
3705
3706	SOCK_LOCK(so);
3707}
3708
3709void
3710so_unlock(struct socket *so)
3711{
3712
3713	SOCK_UNLOCK(so);
3714}
3715