uipc_socket.c revision 252843
1/*-
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993
3 *	The Regents of the University of California.
4 * Copyright (c) 2004 The FreeBSD Foundation
5 * Copyright (c) 2004-2008 Robert N. M. Watson
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 * 4. Neither the name of the University nor the names of its contributors
17 *    may be used to endorse or promote products derived from this software
18 *    without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 *
32 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
33 */
34
35/*
36 * Comments on the socket life cycle:
37 *
38 * soalloc() sets up socket layer state for a socket, called only by
39 * socreate() and sonewconn().  Socket layer private.
40 *
41 * sodealloc() tears down socket layer state for a socket, called by sofree()
42 * and by socreate()/sonewconn() error paths.  Socket layer private.
43 *
44 * pru_attach() associates protocol layer state with an allocated socket;
45 * called only once, may fail, aborting socket allocation.  This is called
46 * from socreate() and sonewconn().  Socket layer private.
47 *
48 * pru_detach() disassociates protocol layer state from an attached socket,
49 * and will be called exactly once for sockets in which pru_attach() has
50 * been successfully called.  If pru_attach() returned an error,
51 * pru_detach() will not be called.  Socket layer private.
52 *
53 * pru_abort() and pru_close() notify the protocol layer that the last
54 * consumer of a socket is starting to tear down the socket, and that the
55 * protocol should terminate the connection.  Historically, pru_abort() also
56 * detached protocol state from the socket state, but this is no longer the
57 * case.
58 *
59 * socreate() creates a socket and attaches protocol state.  This is a public
60 * interface that may be used by socket layer consumers to create new
61 * sockets.
62 *
63 * sonewconn() creates a socket and attaches protocol state.  This is a
64 * public interface that may be used by protocols to create new sockets when
65 * a new connection is received and will be available for accept() on a
66 * listen socket.
67 *
68 * soclose() destroys a socket after possibly waiting for it to disconnect.
69 * This is a public interface that socket consumers should use to close and
70 * release a socket when done with it.
71 *
72 * soabort() destroys a socket without waiting for it to disconnect (used
73 * only for incoming connections that are already partially or fully
74 * connected).  This is used internally by the socket layer when clearing
75 * listen socket queues (due to overflow or close on the listen socket), but
76 * is also a public interface protocols may use to abort connections in
77 * their incomplete listen queues should they no longer be required.  Sockets
78 * placed in completed connection listen queues should not be aborted for
79 * reasons described in the comment above the soclose() implementation.  This
80 * is not a general purpose close routine, and except in the specific
81 * circumstances described here, should not be used.
82 *
83 * sofree() will free a socket and its protocol state if all references on
84 * the socket have been released, and is the public interface to attempt to
85 * free a socket when a reference is removed.  This is a socket layer private
86 * interface.
87 *
88 * NOTE: In addition to socreate() and soclose(), which provide a single
89 * socket reference to the consumer to be managed as required, there are two
90 * calls to explicitly manage socket references: soref() and sorele().
91 * Currently, these are generally required only when transitioning a socket
92 * from a listen queue to a file descriptor, in order to prevent garbage
93 * collection of the socket at an untimely moment.  For a number of reasons,
94 * these interfaces are not preferred, and should be avoided.
95 *
96 * NOTE: With regard to VNETs the general rule is that callers do not set
97 * curvnet. Exceptions to this rule include soabort(), sodisconnect(),
98 * sofree() (and with that sorele(), sotryfree()), as well as sonewconn()
99 * and sorflush(), which are usually called from a pre-set VNET context.
100 * sopoll() currently does not need a VNET context to be set.
101 */
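/*
 * As a rough illustration of the public interfaces described above (a
 * hypothetical in-kernel consumer, not part of this file), the usual
 * create/use/tear-down sequence is:
 *
 *	struct socket *so;
 *	int error;
 *
 *	error = socreate(AF_INET, &so, SOCK_STREAM, IPPROTO_TCP,
 *	    td->td_ucred, td);
 *	if (error != 0)
 *		return (error);
 *	error = sobind(so, (struct sockaddr *)&sin, td);
 *	...
 *	soclose(so);
 *
 * where 'td' is the current thread, 'sin' is a caller-filled sockaddr_in, and
 * soclose() releases the single reference returned by socreate().
 */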
102
103#include <sys/cdefs.h>
104__FBSDID("$FreeBSD: stable/9/sys/kern/uipc_socket.c 252843 2013-07-05 21:33:32Z andre $");
105
106#include "opt_inet.h"
107#include "opt_inet6.h"
108#include "opt_zero.h"
109#include "opt_compat.h"
110
111#include <sys/param.h>
112#include <sys/systm.h>
113#include <sys/fcntl.h>
114#include <sys/limits.h>
115#include <sys/lock.h>
116#include <sys/mac.h>
117#include <sys/malloc.h>
118#include <sys/mbuf.h>
119#include <sys/mutex.h>
120#include <sys/domain.h>
121#include <sys/file.h>			/* for struct knote */
122#include <sys/kernel.h>
123#include <sys/event.h>
124#include <sys/eventhandler.h>
125#include <sys/poll.h>
126#include <sys/proc.h>
127#include <sys/protosw.h>
128#include <sys/socket.h>
129#include <sys/socketvar.h>
130#include <sys/resourcevar.h>
131#include <net/route.h>
132#include <sys/signalvar.h>
133#include <sys/stat.h>
134#include <sys/sx.h>
135#include <sys/sysctl.h>
136#include <sys/uio.h>
137#include <sys/jail.h>
138#include <sys/syslog.h>
139
140#include <net/vnet.h>
141
142#include <security/mac/mac_framework.h>
143
144#include <vm/uma.h>
145
146#ifdef COMPAT_FREEBSD32
147#include <sys/mount.h>
148#include <sys/sysent.h>
149#include <compat/freebsd32/freebsd32.h>
150#endif
151
152static int	soreceive_rcvoob(struct socket *so, struct uio *uio,
153		    int flags);
154
155static void	filt_sordetach(struct knote *kn);
156static int	filt_soread(struct knote *kn, long hint);
157static void	filt_sowdetach(struct knote *kn);
158static int	filt_sowrite(struct knote *kn, long hint);
159static int	filt_solisten(struct knote *kn, long hint);
160
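/*
 * kqueue filter operations for sockets: solisten_filtops implements
 * EVFILT_READ on listening sockets (a completed connection is ready for
 * accept()), while soread_filtops and sowrite_filtops implement EVFILT_READ
 * and EVFILT_WRITE on ordinary sockets.
 */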
161static struct filterops solisten_filtops = {
162	.f_isfd = 1,
163	.f_detach = filt_sordetach,
164	.f_event = filt_solisten,
165};
166static struct filterops soread_filtops = {
167	.f_isfd = 1,
168	.f_detach = filt_sordetach,
169	.f_event = filt_soread,
170};
171static struct filterops sowrite_filtops = {
172	.f_isfd = 1,
173	.f_detach = filt_sowdetach,
174	.f_event = filt_sowrite,
175};
176
177uma_zone_t socket_zone;
178so_gen_t	so_gencnt;	/* generation count for sockets */
179
180int	maxsockets;
181
182MALLOC_DEFINE(M_SONAME, "soname", "socket name");
183MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
184
185#define	VNET_SO_ASSERT(so)						\
186	VNET_ASSERT(curvnet != NULL,					\
187	    ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so)));
188
189static int somaxconn = SOMAXCONN;
190static int sysctl_somaxconn(SYSCTL_HANDLER_ARGS);
191/* XXX: we don't have SYSCTL_USHORT */
192SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLTYPE_UINT | CTLFLAG_RW,
193    0, sizeof(int), sysctl_somaxconn, "I", "Maximum pending socket connection "
194    "queue size");
195static int numopensockets;
196SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
197    &numopensockets, 0, "Number of open sockets");
198#ifdef ZERO_COPY_SOCKETS
199/* These aren't static because they're used in other files. */
200int so_zero_copy_send = 1;
201int so_zero_copy_receive = 1;
202SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0,
203    "Zero copy controls");
204SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW,
205    &so_zero_copy_receive, 0, "Enable zero copy receive");
206SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW,
207    &so_zero_copy_send, 0, "Enable zero copy send");
208#endif /* ZERO_COPY_SOCKETS */
209
210/*
211 * accept_mtx locks down per-socket fields relating to accept queues.  See
212 * socketvar.h for an annotation of the protected fields of struct socket.
213 */
214struct mtx accept_mtx;
215MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);
216
217/*
218 * so_global_mtx protects the global so_gencnt and numopensockets, as well
219 * as the per-socket so_gencnt field.
220 */
221static struct mtx so_global_mtx;
222MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_global", MTX_DEF);
223
224/*
225 * General IPC sysctl name space, used by sockets and a variety of other IPC
226 * types.
227 */
228SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");
229
230/*
231 * Sysctl to get and set the maximum global sockets limit.  Notify protocols
232 * of the change so that they can update their dependent limits as required.
233 */
234static int
235sysctl_maxsockets(SYSCTL_HANDLER_ARGS)
236{
237	int error, newmaxsockets;
238
239	newmaxsockets = maxsockets;
240	error = sysctl_handle_int(oidp, &newmaxsockets, 0, req);
241	if (error == 0 && req->newptr) {
242		if (newmaxsockets > maxsockets) {
243			maxsockets = newmaxsockets;
244			if (maxsockets > ((maxfiles / 4) * 3)) {
245				maxfiles = (maxsockets * 5) / 4;
246				maxfilesperproc = (maxfiles * 9) / 10;
247			}
248			EVENTHANDLER_INVOKE(maxsockets_change);
249		} else
250			error = EINVAL;
251	}
252	return (error);
253}
254
255SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT|CTLFLAG_RW,
256    &maxsockets, 0, sysctl_maxsockets, "IU",
257    "Maximum number of sockets available");
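/*
 * Note that sysctl_maxsockets() only allows the limit to grow: it may be
 * raised at runtime (e.g. "sysctl kern.ipc.maxsockets=<n>"), but an attempt
 * to lower it is rejected with EINVAL.
 */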
258
259/*
260 * Initialise maxsockets.  This SYSINIT must be run after
261 * tunable_mbinit().
262 */
263static void
264init_maxsockets(void *ignored)
265{
266
267	TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
268	maxsockets = imax(maxsockets, imax(maxfiles, nmbclusters));
269}
270SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);
271
272/*
273 * Socket operation routines.  These routines are called by the routines in
274 * sys_socket.c or from a system process, and implement the semantics of
275 * socket operations by switching out to the protocol specific routines.
276 */
277
278/*
279 * Get a socket structure from our zone, and initialize it.  Note that it
280 * would probably be better to allocate socket and PCB at the same time, but
281 * I'm not convinced that all the protocols can be easily modified to do
282 * this.
283 *
284 * soalloc() returns a socket with a ref count of 0.
285 */
286static struct socket *
287soalloc(struct vnet *vnet)
288{
289	struct socket *so;
290
291	so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO);
292	if (so == NULL)
293		return (NULL);
294#ifdef MAC
295	if (mac_socket_init(so, M_NOWAIT) != 0) {
296		uma_zfree(socket_zone, so);
297		return (NULL);
298	}
299#endif
300	SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
301	SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
302	sx_init(&so->so_snd.sb_sx, "so_snd_sx");
303	sx_init(&so->so_rcv.sb_sx, "so_rcv_sx");
304	TAILQ_INIT(&so->so_aiojobq);
305	mtx_lock(&so_global_mtx);
306	so->so_gencnt = ++so_gencnt;
307	++numopensockets;
308#ifdef VIMAGE
309	VNET_ASSERT(vnet != NULL, ("%s:%d vnet is NULL, so=%p",
310	    __func__, __LINE__, so));
311	vnet->vnet_sockcnt++;
312	so->so_vnet = vnet;
313#endif
314	mtx_unlock(&so_global_mtx);
315	return (so);
316}
317
318/*
319 * Free the storage associated with a socket at the socket layer, tear down
320 * locks, labels, etc.  All protocol state is assumed already to have been
321 * torn down (and possibly never set up) by the caller.
322 */
323static void
324sodealloc(struct socket *so)
325{
326
327	KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
328	KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL"));
329
330	mtx_lock(&so_global_mtx);
331	so->so_gencnt = ++so_gencnt;
332	--numopensockets;	/* Could be below, but faster here. */
333#ifdef VIMAGE
334	VNET_ASSERT(so->so_vnet != NULL, ("%s:%d so_vnet is NULL, so=%p",
335	    __func__, __LINE__, so));
336	so->so_vnet->vnet_sockcnt--;
337#endif
338	mtx_unlock(&so_global_mtx);
339	if (so->so_rcv.sb_hiwat)
340		(void)chgsbsize(so->so_cred->cr_uidinfo,
341		    &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
342	if (so->so_snd.sb_hiwat)
343		(void)chgsbsize(so->so_cred->cr_uidinfo,
344		    &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
345#ifdef INET
346	/* Remove accept filter if one is present. */
347	if (so->so_accf != NULL)
348		do_setopt_accept_filter(so, NULL);
349#endif
350#ifdef MAC
351	mac_socket_destroy(so);
352#endif
353	crfree(so->so_cred);
354	sx_destroy(&so->so_snd.sb_sx);
355	sx_destroy(&so->so_rcv.sb_sx);
356	SOCKBUF_LOCK_DESTROY(&so->so_snd);
357	SOCKBUF_LOCK_DESTROY(&so->so_rcv);
358	uma_zfree(socket_zone, so);
359}
360
361/*
362 * socreate returns a socket with a ref count of 1.  The socket should be
363 * closed with soclose().
364 */
365int
366socreate(int dom, struct socket **aso, int type, int proto,
367    struct ucred *cred, struct thread *td)
368{
369	struct protosw *prp;
370	struct socket *so;
371	int error;
372
373	if (proto)
374		prp = pffindproto(dom, proto, type);
375	else
376		prp = pffindtype(dom, type);
377
378	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL ||
379	    prp->pr_usrreqs->pru_attach == pru_attach_notsupp)
380		return (EPROTONOSUPPORT);
381
382	if (prison_check_af(cred, prp->pr_domain->dom_family) != 0)
383		return (EPROTONOSUPPORT);
384
385	if (prp->pr_type != type)
386		return (EPROTOTYPE);
387	so = soalloc(CRED_TO_VNET(cred));
388	if (so == NULL)
389		return (ENOBUFS);
390
391	TAILQ_INIT(&so->so_incomp);
392	TAILQ_INIT(&so->so_comp);
393	so->so_type = type;
394	so->so_cred = crhold(cred);
395	if ((prp->pr_domain->dom_family == PF_INET) ||
396	    (prp->pr_domain->dom_family == PF_INET6) ||
397	    (prp->pr_domain->dom_family == PF_ROUTE))
398		so->so_fibnum = td->td_proc->p_fibnum;
399	else
400		so->so_fibnum = 0;
401	so->so_proto = prp;
402#ifdef MAC
403	mac_socket_create(cred, so);
404#endif
405	knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv));
406	knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd));
407	so->so_count = 1;
408	/*
409	 * Auto-sizing of socket buffers is managed by the protocols and
410	 * the appropriate flags must be set in the pru_attach function.
411	 */
412	CURVNET_SET(so->so_vnet);
413	error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
414	CURVNET_RESTORE();
415	if (error) {
416		KASSERT(so->so_count == 1, ("socreate: so_count %d",
417		    so->so_count));
418		so->so_count = 0;
419		sodealloc(so);
420		return (error);
421	}
422	*aso = so;
423	return (0);
424}
425
426#ifdef REGRESSION
427static int regression_sonewconn_earlytest = 1;
428SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW,
429    &regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test");
430#endif
431
432/*
433 * When an attempt at a new connection is noted on a socket which accepts
434 * connections, sonewconn is called.  If the connection is possible (subject
435 * to space constraints, etc.) then we allocate a new structure, properly
436 * linked into the data structure of the original socket, and return this.
437 * Connstatus may be 0, SS_ISCONFIRMING, or SS_ISCONNECTED.
438 *
439 * Note: the ref count on the socket is 0 on return.
440 */
441struct socket *
442sonewconn(struct socket *head, int connstatus)
443{
444	struct socket *so;
445	int over;
446
447	ACCEPT_LOCK();
448	over = (head->so_qlen > 3 * head->so_qlimit / 2);
449	ACCEPT_UNLOCK();
450#ifdef REGRESSION
451	if (regression_sonewconn_earlytest && over) {
452#else
453	if (over) {
454#endif
455		log(LOG_DEBUG, "%s: pcb %p: Listen queue overflow: "
456		    "%i already in queue awaiting acceptance\n",
457		    __func__, head->so_pcb, head->so_qlen);
458		return (NULL);
459	}
460	VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p",
461	    __func__, __LINE__, head));
462	so = soalloc(head->so_vnet);
463	if (so == NULL) {
464		log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: "
465		    "limit reached or out of memory\n",
466		    __func__, head->so_pcb);
467		return (NULL);
468	}
469	if ((head->so_options & SO_ACCEPTFILTER) != 0)
470		connstatus = 0;
471	so->so_head = head;
472	so->so_type = head->so_type;
473	so->so_options = head->so_options &~ SO_ACCEPTCONN;
474	so->so_linger = head->so_linger;
475	so->so_state = head->so_state | SS_NOFDREF;
476	so->so_fibnum = head->so_fibnum;
477	so->so_proto = head->so_proto;
478	so->so_cred = crhold(head->so_cred);
479#ifdef MAC
480	mac_socket_newconn(head, so);
481#endif
482	knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv));
483	knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd));
484	VNET_SO_ASSERT(head);
485	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
486		sodealloc(so);
487		log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n",
488		    __func__, head->so_pcb);
489		return (NULL);
490	}
491	if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
492		sodealloc(so);
493		log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n",
494		    __func__, head->so_pcb);
495		return (NULL);
496	}
497	so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
498	so->so_snd.sb_lowat = head->so_snd.sb_lowat;
499	so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
500	so->so_snd.sb_timeo = head->so_snd.sb_timeo;
501	so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
502	so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
503	so->so_state |= connstatus;
504	ACCEPT_LOCK();
505	/*
506	 * The accept socket may be tearing down but we just
507	 * won a race on the ACCEPT_LOCK.
508	 */
509	if (!(head->so_options & SO_ACCEPTCONN)) {
510		SOCK_LOCK(so);
511		so->so_head = NULL;
512		sofree(so);		/* NB: returns ACCEPT_UNLOCK'ed. */
513		return (NULL);
514	}
515	if (connstatus) {
516		TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
517		so->so_qstate |= SQ_COMP;
518		head->so_qlen++;
519	} else {
520		/*
521		 * Keep removing sockets from the head until there's room for
522		 * us to insert on the tail.  In pre-locking revisions, this
523		 * was a simple if(), but as we could be racing with other
524		 * threads and soabort() requires dropping locks, we must
525		 * loop waiting for the condition to be true.
526		 */
527		while (head->so_incqlen > head->so_qlimit) {
528			struct socket *sp;
529			sp = TAILQ_FIRST(&head->so_incomp);
530			TAILQ_REMOVE(&head->so_incomp, sp, so_list);
531			head->so_incqlen--;
532			sp->so_qstate &= ~SQ_INCOMP;
533			sp->so_head = NULL;
534			ACCEPT_UNLOCK();
535			soabort(sp);
536			ACCEPT_LOCK();
537		}
538		TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
539		so->so_qstate |= SQ_INCOMP;
540		head->so_incqlen++;
541	}
542	ACCEPT_UNLOCK();
543	if (connstatus) {
544		sorwakeup(head);
545		wakeup_one(&head->so_timeo);
546	}
547	return (so);
548}
549
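/*
 * sobind() binds a socket to a local address by dispatching to the
 * protocol's pru_bind method with the socket's VNET in effect.
 */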
550int
551sobind(struct socket *so, struct sockaddr *nam, struct thread *td)
552{
553	int error;
554
555	CURVNET_SET(so->so_vnet);
556	error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td);
557	CURVNET_RESTORE();
558	return error;
559}
560
561/*
562 * solisten() transitions a socket from a non-listening state to a listening
563 * state, but can also be used to update the listen queue depth on an
564 * existing listen socket.  The protocol will call back into the sockets
565 * layer using solisten_proto_check() and solisten_proto() to check and set
566 * socket-layer listen state.  Callbacks are used so that the protocol can
567 * acquire both protocol and socket layer locks in whatever order is required
568 * by the protocol.
569 *
570 * Protocol implementors are advised to hold the socket lock across the
571 * socket-layer test and set to avoid races at the socket layer.
572 */
573int
574solisten(struct socket *so, int backlog, struct thread *td)
575{
576	int error;
577
578	CURVNET_SET(so->so_vnet);
579	error = (*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td);
580	CURVNET_RESTORE();
581	return error;
582}
583
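/*
 * solisten_proto_check() is invoked by the protocol, with the socket lock
 * held, to verify that the socket may enter a listening state: a socket
 * that is connected, connecting or disconnecting may not listen.
 */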
584int
585solisten_proto_check(struct socket *so)
586{
587
588	SOCK_LOCK_ASSERT(so);
589
590	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
591	    SS_ISDISCONNECTING))
592		return (EINVAL);
593	return (0);
594}
595
596void
597solisten_proto(struct socket *so, int backlog)
598{
599
600	SOCK_LOCK_ASSERT(so);
601
602	if (backlog < 0 || backlog > somaxconn)
603		backlog = somaxconn;
604	so->so_qlimit = backlog;
605	so->so_options |= SO_ACCEPTCONN;
606}
607
608/*
609 * Evaluate the reference count and named references on a socket; if no
610 * references remain, free it.  This should be called whenever a reference is
611 * released, such as in sorele(), but also when named reference flags are
612 * cleared in socket or protocol code.
613 *
614 * sofree() will free the socket if:
615 *
616 * - There are no outstanding file descriptor references or related consumers
617 *   (so_count == 0).
618 *
619 * - The socket has been closed by user space, if ever open (SS_NOFDREF).
620 *
621 * - The protocol does not have an outstanding strong reference on the socket
622 *   (SS_PROTOREF).
623 *
624 * - The socket is not on a completed connection queue, where a process may
625 *   already have been notified that it is present; were it removed, that
626 *   process could block in accept() despite select() saying it was ready.
627 */
628void
629sofree(struct socket *so)
630{
631	struct protosw *pr = so->so_proto;
632	struct socket *head;
633
634	ACCEPT_LOCK_ASSERT();
635	SOCK_LOCK_ASSERT(so);
636
637	if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 ||
638	    (so->so_state & SS_PROTOREF) || (so->so_qstate & SQ_COMP)) {
639		SOCK_UNLOCK(so);
640		ACCEPT_UNLOCK();
641		return;
642	}
643
644	head = so->so_head;
645	if (head != NULL) {
646		KASSERT((so->so_qstate & SQ_COMP) != 0 ||
647		    (so->so_qstate & SQ_INCOMP) != 0,
648		    ("sofree: so_head != NULL, but neither SQ_COMP nor "
649		    "SQ_INCOMP"));
650		KASSERT((so->so_qstate & SQ_COMP) == 0 ||
651		    (so->so_qstate & SQ_INCOMP) == 0,
652		    ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
653		TAILQ_REMOVE(&head->so_incomp, so, so_list);
654		head->so_incqlen--;
655		so->so_qstate &= ~SQ_INCOMP;
656		so->so_head = NULL;
657	}
658	KASSERT((so->so_qstate & SQ_COMP) == 0 &&
659	    (so->so_qstate & SQ_INCOMP) == 0,
660	    ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
661	    so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
662	if (so->so_options & SO_ACCEPTCONN) {
663		KASSERT((TAILQ_EMPTY(&so->so_comp)), ("sofree: so_comp populated"));
664		KASSERT((TAILQ_EMPTY(&so->so_incomp)), ("sofree: so_incomp populated"));
665	}
666	SOCK_UNLOCK(so);
667	ACCEPT_UNLOCK();
668
669	VNET_SO_ASSERT(so);
670	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
671		(*pr->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
672	if (pr->pr_usrreqs->pru_detach != NULL)
673		(*pr->pr_usrreqs->pru_detach)(so);
674
675	/*
676	 * From this point on, we assume that no other references to this
677	 * socket exist anywhere else in the stack.  Therefore, no locks need
678	 * to be acquired or held.
679	 *
680	 * We used to do a lot of socket buffer and socket locking here, as
681 * well as invoke sorflush() and perform wakeups.  The direct calls to
682	 * dom_dispose() and sbrelease_internal() are an inlining of what was
683	 * necessary from sorflush().
684	 *
685	 * Notice that the socket buffer and kqueue state are torn down
686 * before calling pru_detach.  This means that protocols should not
687 * assume they can perform socket wakeups, etc., in their detach code.
688	 */
689	sbdestroy(&so->so_snd, so);
690	sbdestroy(&so->so_rcv, so);
691	seldrain(&so->so_snd.sb_sel);
692	seldrain(&so->so_rcv.sb_sel);
693	knlist_destroy(&so->so_rcv.sb_sel.si_note);
694	knlist_destroy(&so->so_snd.sb_sel.si_note);
695	sodealloc(so);
696}
697
698/*
699 * Close a socket on last file table reference removal.  Initiate disconnect
700 * if connected.  Free socket when disconnect complete.
701 *
702 * This function will sorele() the socket.  Note that soclose() may be called
703 * prior to the ref count reaching zero.  The actual socket structure will
704 * not be freed until the ref count reaches zero.
705 */
706int
707soclose(struct socket *so)
708{
709	int error = 0;
710
711	KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));
712
713	CURVNET_SET(so->so_vnet);
714	funsetown(&so->so_sigio);
715	if (so->so_state & SS_ISCONNECTED) {
716		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
717			error = sodisconnect(so);
718			if (error) {
719				if (error == ENOTCONN)
720					error = 0;
721				goto drop;
722			}
723		}
724		if (so->so_options & SO_LINGER) {
725			if ((so->so_state & SS_ISDISCONNECTING) &&
726			    (so->so_state & SS_NBIO))
727				goto drop;
728			while (so->so_state & SS_ISCONNECTED) {
729				error = tsleep(&so->so_timeo,
730				    PSOCK | PCATCH, "soclos", so->so_linger * hz);
731				if (error)
732					break;
733			}
734		}
735	}
736
737drop:
738	if (so->so_proto->pr_usrreqs->pru_close != NULL)
739		(*so->so_proto->pr_usrreqs->pru_close)(so);
740	ACCEPT_LOCK();
741	if (so->so_options & SO_ACCEPTCONN) {
742		struct socket *sp;
743		/*
744		 * Prevent new additions to the accept queues due
745		 * to ACCEPT_LOCK races while we are draining them.
746		 */
747		so->so_options &= ~SO_ACCEPTCONN;
748		while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
749			TAILQ_REMOVE(&so->so_incomp, sp, so_list);
750			so->so_incqlen--;
751			sp->so_qstate &= ~SQ_INCOMP;
752			sp->so_head = NULL;
753			ACCEPT_UNLOCK();
754			soabort(sp);
755			ACCEPT_LOCK();
756		}
757		while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
758			TAILQ_REMOVE(&so->so_comp, sp, so_list);
759			so->so_qlen--;
760			sp->so_qstate &= ~SQ_COMP;
761			sp->so_head = NULL;
762			ACCEPT_UNLOCK();
763			soabort(sp);
764			ACCEPT_LOCK();
765		}
766		KASSERT((TAILQ_EMPTY(&so->so_comp)),
767		    ("%s: so_comp populated", __func__));
768		KASSERT((TAILQ_EMPTY(&so->so_incomp)),
769		    ("%s: so_incomp populated", __func__));
770	}
771	SOCK_LOCK(so);
772	KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
773	so->so_state |= SS_NOFDREF;
774	sorele(so);			/* NB: Returns with ACCEPT_UNLOCK(). */
775	CURVNET_RESTORE();
776	return (error);
777}
778
779/*
780 * soabort() is used to abruptly tear down a connection, such as when a
781 * resource limit is reached (listen queue depth exceeded), or if a listen
782 * socket is closed while there are sockets waiting to be accepted.
783 *
784 * This interface is tricky, because it is called on an unreferenced socket,
785 * and must be called only by a thread that has actually removed the socket
786 * from the listen queue it was on, or races with other threads are risked.
787 *
788 * This interface will call into the protocol code, so must not be called
789 * with any socket locks held.  Protocols do call it while holding their own
790 * recursible protocol mutexes, but this is something that should be subject
791 * to review in the future.
792 */
793void
794soabort(struct socket *so)
795{
796
797	/*
798	 * In as much as is possible, assert that no references to this
799	 * socket are held.  This is not quite the same as asserting that the
800	 * current thread is responsible for arranging for no references, but
801	 * is as close as we can get for now.
802	 */
803	KASSERT(so->so_count == 0, ("soabort: so_count"));
804	KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF"));
805	KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF"));
806	KASSERT((so->so_qstate & SQ_COMP) == 0, ("soabort: SQ_COMP"));
807	KASSERT((so->so_qstate & SQ_INCOMP) == 0, ("soabort: SQ_INCOMP"));
808	VNET_SO_ASSERT(so);
809
810	if (so->so_proto->pr_usrreqs->pru_abort != NULL)
811		(*so->so_proto->pr_usrreqs->pru_abort)(so);
812	ACCEPT_LOCK();
813	SOCK_LOCK(so);
814	sofree(so);
815}
816
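/*
 * soaccept() is called when a connection is handed off to a file
 * descriptor: it clears SS_NOFDREF and retrieves the peer address via the
 * protocol's pru_accept method.
 */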
817int
818soaccept(struct socket *so, struct sockaddr **nam)
819{
820	int error;
821
822	SOCK_LOCK(so);
823	KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
824	so->so_state &= ~SS_NOFDREF;
825	SOCK_UNLOCK(so);
826
827	CURVNET_SET(so->so_vnet);
828	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
829	CURVNET_RESTORE();
830	return (error);
831}
832
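/*
 * soconnect() initiates a connection to the supplied address via the
 * protocol's pru_connect method; listening sockets cannot connect, and a
 * connection-oriented socket may connect only once.
 */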
833int
834soconnect(struct socket *so, struct sockaddr *nam, struct thread *td)
835{
836	int error;
837
838	if (so->so_options & SO_ACCEPTCONN)
839		return (EOPNOTSUPP);
840
841	CURVNET_SET(so->so_vnet);
842	/*
843	 * If protocol is connection-based, can only connect once.
844	 * Otherwise, if connected, try to disconnect first.  This allows
845	 * user to disconnect by connecting to, e.g., a null address.
846	 */
847	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
848	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
849	    (error = sodisconnect(so)))) {
850		error = EISCONN;
851	} else {
852		/*
853		 * Prevent accumulated error from previous connection from
854		 * biting us.
855		 */
856		so->so_error = 0;
857		error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td);
858	}
859	CURVNET_RESTORE();
860
861	return (error);
862}
863
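/*
 * soconnect2() connects a pair of sockets directly to each other (as for
 * socketpair(2)) via the protocol's pru_connect2 method.
 */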
864int
865soconnect2(struct socket *so1, struct socket *so2)
866{
867	int error;
868
869	CURVNET_SET(so1->so_vnet);
870	error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
871	CURVNET_RESTORE();
872	return (error);
873}
874
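/*
 * sodisconnect() requests a protocol-level disconnect of a connected
 * socket; as noted at the top of this file, it expects to be called with
 * curvnet already set.
 */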
875int
876sodisconnect(struct socket *so)
877{
878	int error;
879
880	if ((so->so_state & SS_ISCONNECTED) == 0)
881		return (ENOTCONN);
882	if (so->so_state & SS_ISDISCONNECTING)
883		return (EALREADY);
884	VNET_SO_ASSERT(so);
885	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
886	return (error);
887}
888
889#ifdef ZERO_COPY_SOCKETS
890struct so_zerocopy_stats{
891	int size_ok;
892	int align_ok;
893	int found_ifp;
894};
895struct so_zerocopy_stats so_zerocp_stats = {0,0,0};
896
897/*
898 * sosend_copyin() is only used if zero copy sockets are enabled.  Otherwise
899 * sosend_dgram() and sosend_generic() use m_uiotombuf().
900 *
901 * sosend_copyin() accepts a uio and prepares an mbuf chain holding part or
902 * all of the data referenced by the uio.  If desired, it uses zero-copy.
903 * *space will be updated to reflect data copied in.
904 *
905 * NB: If atomic I/O is requested, the caller must already have checked that
906 * space can hold resid bytes.
907 *
908 * NB: In the event of an error, the caller may need to free the partial
909 * chain pointed to by *retmp.  The contents of both *uio and *space may be
910 * modified even in the case of an error.
911 */
912static int
913sosend_copyin(struct uio *uio, struct mbuf **retmp, int atomic, long *space,
914    int flags)
915{
916	struct mbuf *m, **mp, *top;
917	long len;
918	ssize_t resid;
919	int error;
920	int cow_send;
921
922	*retmp = top = NULL;
923	mp = &top;
924	len = 0;
925	resid = uio->uio_resid;
926	error = 0;
927	do {
928		cow_send = 0;
929		if (resid >= MINCLSIZE) {
930			if (top == NULL) {
931				m = m_gethdr(M_WAITOK, MT_DATA);
932				m->m_pkthdr.len = 0;
933				m->m_pkthdr.rcvif = NULL;
934			} else
935				m = m_get(M_WAITOK, MT_DATA);
936			if (so_zero_copy_send &&
937			    resid >= PAGE_SIZE &&
938			    *space >= PAGE_SIZE &&
939			    uio->uio_iov->iov_len >= PAGE_SIZE) {
940				so_zerocp_stats.size_ok++;
941				so_zerocp_stats.align_ok++;
942				cow_send = socow_setup(m, uio);
943				len = cow_send;
944			}
945			if (!cow_send) {
946				m_clget(m, M_WAITOK);
947				len = min(min(MCLBYTES, resid), *space);
948			}
949		} else {
950			if (top == NULL) {
951				m = m_gethdr(M_WAIT, MT_DATA);
952				m->m_pkthdr.len = 0;
953				m->m_pkthdr.rcvif = NULL;
954
955				len = min(min(MHLEN, resid), *space);
956				/*
957				 * For datagram protocols, leave room
958				 * for protocol headers in first mbuf.
959				 */
960				if (atomic && m && len < MHLEN)
961					MH_ALIGN(m, len);
962			} else {
963				m = m_get(M_WAIT, MT_DATA);
964				len = min(min(MLEN, resid), *space);
965			}
966		}
967		if (m == NULL) {
968			error = ENOBUFS;
969			goto out;
970		}
971
972		*space -= len;
973		if (cow_send)
974			error = 0;
975		else
976			error = uiomove(mtod(m, void *), (int)len, uio);
977		resid = uio->uio_resid;
978		m->m_len = len;
979		*mp = m;
980		top->m_pkthdr.len += len;
981		if (error)
982			goto out;
983		mp = &m->m_next;
984		if (resid <= 0) {
985			if (flags & MSG_EOR)
986				top->m_flags |= M_EOR;
987			break;
988		}
989	} while (*space > 0 && atomic);
990out:
991	*retmp = top;
992	return (error);
993}
994#endif /* ZERO_COPY_SOCKETS */
995
996#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
997
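/*
 * sosend_dgram() is a streamlined send path for atomic datagram protocols
 * (PR_ATOMIC, e.g. UDP): a datagram either fits in the send buffer now or
 * the send fails with EMSGSIZE, so the sblock()/retry machinery of
 * sosend_generic() is not needed.
 */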
998int
999sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio,
1000    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
1001{
1002	long space;
1003	ssize_t resid;
1004	int clen = 0, error, dontroute;
1005#ifdef ZERO_COPY_SOCKETS
1006	int atomic = sosendallatonce(so) || top;
1007#endif
1008
1009	KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM"));
1010	KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
1011	    ("sosend_dgram: !PR_ATOMIC"));
1012
1013	if (uio != NULL)
1014		resid = uio->uio_resid;
1015	else
1016		resid = top->m_pkthdr.len;
1017	/*
1018	 * In theory resid should be unsigned.  However, space must be
1019	 * signed, as it might be less than 0 if we over-committed, and we
1020	 * must use a signed comparison of space and resid.  On the other
1021	 * hand, a negative resid causes us to loop sending 0-length
1022	 * segments to the protocol.
1023	 */
1024	if (resid < 0) {
1025		error = EINVAL;
1026		goto out;
1027	}
1028
1029	dontroute =
1030	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0;
1031	if (td != NULL)
1032		td->td_ru.ru_msgsnd++;
1033	if (control != NULL)
1034		clen = control->m_len;
1035
1036	SOCKBUF_LOCK(&so->so_snd);
1037	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
1038		SOCKBUF_UNLOCK(&so->so_snd);
1039		error = EPIPE;
1040		goto out;
1041	}
1042	if (so->so_error) {
1043		error = so->so_error;
1044		so->so_error = 0;
1045		SOCKBUF_UNLOCK(&so->so_snd);
1046		goto out;
1047	}
1048	if ((so->so_state & SS_ISCONNECTED) == 0) {
1049		/*
1050		 * `sendto' and `sendmsg' are allowed on a connection-based
1051		 * socket if it supports implied connect.  Return ENOTCONN if
1052		 * not connected and no address is supplied.
1053		 */
1054		if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
1055		    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
1056			if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1057			    !(resid == 0 && clen != 0)) {
1058				SOCKBUF_UNLOCK(&so->so_snd);
1059				error = ENOTCONN;
1060				goto out;
1061			}
1062		} else if (addr == NULL) {
1063			if (so->so_proto->pr_flags & PR_CONNREQUIRED)
1064				error = ENOTCONN;
1065			else
1066				error = EDESTADDRREQ;
1067			SOCKBUF_UNLOCK(&so->so_snd);
1068			goto out;
1069		}
1070	}
1071
1072	/*
1073	 * Do we need MSG_OOB support in SOCK_DGRAM?  Signs here may be a
1074	 * problem and need fixing.
1075	 */
1076	space = sbspace(&so->so_snd);
1077	if (flags & MSG_OOB)
1078		space += 1024;
1079	space -= clen;
1080	SOCKBUF_UNLOCK(&so->so_snd);
1081	if (resid > space) {
1082		error = EMSGSIZE;
1083		goto out;
1084	}
1085	if (uio == NULL) {
1086		resid = 0;
1087		if (flags & MSG_EOR)
1088			top->m_flags |= M_EOR;
1089	} else {
1090#ifdef ZERO_COPY_SOCKETS
1091		error = sosend_copyin(uio, &top, atomic, &space, flags);
1092		if (error)
1093			goto out;
1094#else
1095		/*
1096		 * Copy the data from userland into a mbuf chain.
1097		 * If no data is to be copied in, a single empty mbuf
1098		 * is returned.
1099		 */
1100		top = m_uiotombuf(uio, M_WAITOK, space, max_hdr,
1101		    (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0)));
1102		if (top == NULL) {
1103			error = EFAULT;	/* only possible error */
1104			goto out;
1105		}
1106		space -= resid - uio->uio_resid;
1107#endif
1108		resid = uio->uio_resid;
1109	}
1110	KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
1111	/*
1112	 * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock
1113	 * than with.
1114	 */
1115	if (dontroute) {
1116		SOCK_LOCK(so);
1117		so->so_options |= SO_DONTROUTE;
1118		SOCK_UNLOCK(so);
1119	}
1120	/*
1121	 * XXX all the SBS_CANTSENDMORE checks previously done could be out
1122	 * of date.  We could have received a reset packet in an interrupt or
1123	 * maybe we slept while doing page faults in uiomove() etc.  We could
1124	 * probably recheck again inside the locking protection here, but
1125	 * there are probably other places that this also happens.  We must
1126	 * rethink this.
1127	 */
1128	VNET_SO_ASSERT(so);
1129	error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1130	    (flags & MSG_OOB) ? PRUS_OOB :
1131	/*
1132	 * If the user set MSG_EOF, the protocol understands this flag, and
1133	 * there is nothing left to send, then use PRU_SEND_EOF instead of PRU_SEND.
1134	 */
1135	    ((flags & MSG_EOF) &&
1136	     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1137	     (resid <= 0)) ?
1138		PRUS_EOF :
1139		/* If there is more to send set PRUS_MORETOCOME */
1140		(resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
1141		top, addr, control, td);
1142	if (dontroute) {
1143		SOCK_LOCK(so);
1144		so->so_options &= ~SO_DONTROUTE;
1145		SOCK_UNLOCK(so);
1146	}
1147	clen = 0;
1148	control = NULL;
1149	top = NULL;
1150out:
1151	if (top != NULL)
1152		m_freem(top);
1153	if (control != NULL)
1154		m_freem(control);
1155	return (error);
1156}
1157
1158/*
1159 * Send on a socket.  If send must go all at once and message is larger than
1160 * send buffering, then hard error.  Lock against other senders.  If must go
1161 * all at once and not enough room now, then inform user that this would
1162 * block and do nothing.  Otherwise, if nonblocking, send as much as
1163 * possible.  The data to be sent is described by "uio" if nonzero, otherwise
1164 * by the mbuf chain "top" (which must be null if uio is not).  Data provided
1165 * in mbuf chain must be small enough to send all at once.
1166 *
1167 * Returns nonzero on error, timeout or signal; callers must check for short
1168 * counts if EINTR/ERESTART are returned.  Data and control buffers are freed
1169 * on return.
1170 */
1171int
1172sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio,
1173    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
1174{
1175	long space;
1176	ssize_t resid;
1177	int clen = 0, error, dontroute;
1178	int atomic = sosendallatonce(so) || top;
1179
1180	if (uio != NULL)
1181		resid = uio->uio_resid;
1182	else
1183		resid = top->m_pkthdr.len;
1184	/*
1185	 * In theory resid should be unsigned.  However, space must be
1186	 * signed, as it might be less than 0 if we over-committed, and we
1187	 * must use a signed comparison of space and resid.  On the other
1188	 * hand, a negative resid causes us to loop sending 0-length
1189	 * segments to the protocol.
1190	 *
1191	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
1192	 * type sockets since that's an error.
1193	 */
1194	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
1195		error = EINVAL;
1196		goto out;
1197	}
1198
1199	dontroute =
1200	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
1201	    (so->so_proto->pr_flags & PR_ATOMIC);
1202	if (td != NULL)
1203		td->td_ru.ru_msgsnd++;
1204	if (control != NULL)
1205		clen = control->m_len;
1206
1207	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1208	if (error)
1209		goto out;
1210
1211restart:
1212	do {
1213		SOCKBUF_LOCK(&so->so_snd);
1214		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
1215			SOCKBUF_UNLOCK(&so->so_snd);
1216			error = EPIPE;
1217			goto release;
1218		}
1219		if (so->so_error) {
1220			error = so->so_error;
1221			so->so_error = 0;
1222			SOCKBUF_UNLOCK(&so->so_snd);
1223			goto release;
1224		}
1225		if ((so->so_state & SS_ISCONNECTED) == 0) {
1226			/*
1227			 * `sendto' and `sendmsg' are allowed on a connection-
1228			 * based socket if it supports implied connect.
1229			 * Return ENOTCONN if not connected and no address is
1230			 * supplied.
1231			 */
1232			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
1233			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
1234				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1235				    !(resid == 0 && clen != 0)) {
1236					SOCKBUF_UNLOCK(&so->so_snd);
1237					error = ENOTCONN;
1238					goto release;
1239				}
1240			} else if (addr == NULL) {
1241				SOCKBUF_UNLOCK(&so->so_snd);
1242				if (so->so_proto->pr_flags & PR_CONNREQUIRED)
1243					error = ENOTCONN;
1244				else
1245					error = EDESTADDRREQ;
1246				goto release;
1247			}
1248		}
1249		space = sbspace(&so->so_snd);
1250		if (flags & MSG_OOB)
1251			space += 1024;
1252		if ((atomic && resid > so->so_snd.sb_hiwat) ||
1253		    clen > so->so_snd.sb_hiwat) {
1254			SOCKBUF_UNLOCK(&so->so_snd);
1255			error = EMSGSIZE;
1256			goto release;
1257		}
1258		if (space < resid + clen &&
1259		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
1260			if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) {
1261				SOCKBUF_UNLOCK(&so->so_snd);
1262				error = EWOULDBLOCK;
1263				goto release;
1264			}
1265			error = sbwait(&so->so_snd);
1266			SOCKBUF_UNLOCK(&so->so_snd);
1267			if (error)
1268				goto release;
1269			goto restart;
1270		}
1271		SOCKBUF_UNLOCK(&so->so_snd);
1272		space -= clen;
1273		do {
1274			if (uio == NULL) {
1275				resid = 0;
1276				if (flags & MSG_EOR)
1277					top->m_flags |= M_EOR;
1278			} else {
1279#ifdef ZERO_COPY_SOCKETS
1280				error = sosend_copyin(uio, &top, atomic,
1281				    &space, flags);
1282				if (error != 0)
1283					goto release;
1284#else
1285				/*
1286				 * Copy the data from userland into a mbuf
1287				 * chain.  If no data is to be copied in,
1288				 * a single empty mbuf is returned.
1289				 */
1290				top = m_uiotombuf(uio, M_WAITOK, space,
1291				    (atomic ? max_hdr : 0),
1292				    (atomic ? M_PKTHDR : 0) |
1293				    ((flags & MSG_EOR) ? M_EOR : 0));
1294				if (top == NULL) {
1295					error = EFAULT; /* only possible error */
1296					goto release;
1297				}
1298				space -= resid - uio->uio_resid;
1299#endif
1300				resid = uio->uio_resid;
1301			}
1302			if (dontroute) {
1303				SOCK_LOCK(so);
1304				so->so_options |= SO_DONTROUTE;
1305				SOCK_UNLOCK(so);
1306			}
1307			/*
1308			 * XXX all the SBS_CANTSENDMORE checks previously
1309			 * done could be out of date.  We could have received
1310			 * a reset packet in an interrupt or maybe we slept
1311			 * while doing page faults in uiomove() etc.  We
1312			 * could probably recheck again inside the locking
1313			 * protection here, but there are probably other
1314			 * places that this also happens.  We must rethink
1315			 * this.
1316			 */
1317			VNET_SO_ASSERT(so);
1318			error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1319			    (flags & MSG_OOB) ? PRUS_OOB :
1320			/*
1321			 * If the user set MSG_EOF, the protocol understands
1322			 * this flag, and there is nothing left to send, then use
1323			 * PRU_SEND_EOF instead of PRU_SEND.
1324			 */
1325			    ((flags & MSG_EOF) &&
1326			     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1327			     (resid <= 0)) ?
1328				PRUS_EOF :
1329			/* If there is more to send set PRUS_MORETOCOME. */
1330			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
1331			    top, addr, control, td);
1332			if (dontroute) {
1333				SOCK_LOCK(so);
1334				so->so_options &= ~SO_DONTROUTE;
1335				SOCK_UNLOCK(so);
1336			}
1337			clen = 0;
1338			control = NULL;
1339			top = NULL;
1340			if (error)
1341				goto release;
1342		} while (resid && space > 0);
1343	} while (resid);
1344
1345release:
1346	sbunlock(&so->so_snd);
1347out:
1348	if (top != NULL)
1349		m_freem(top);
1350	if (control != NULL)
1351		m_freem(control);
1352	return (error);
1353}
1354
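/*
 * sosend() is the socket-layer entry point for all send operations; it
 * dispatches to the protocol's pru_sosend method (typically
 * sosend_generic() or sosend_dgram()) with the socket's VNET in effect.
 */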
1355int
1356sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
1357    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
1358{
1359	int error;
1360
1361	CURVNET_SET(so->so_vnet);
1362	error = so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top,
1363	    control, flags, td);
1364	CURVNET_RESTORE();
1365	return (error);
1366}
1367
1368/*
1369 * The part of soreceive() that implements reading non-inline out-of-band
1370 * data from a socket.  For more complete comments, see soreceive(), from
1371 * which this code originated.
1372 *
1373 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
1374 * unable to return an mbuf chain to the caller.
1375 */
1376static int
1377soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
1378{
1379	struct protosw *pr = so->so_proto;
1380	struct mbuf *m;
1381	int error;
1382
1383	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
1384	VNET_SO_ASSERT(so);
1385
1386	m = m_get(M_WAIT, MT_DATA);
1387	error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
1388	if (error)
1389		goto bad;
1390	do {
1391#ifdef ZERO_COPY_SOCKETS
1392		if (so_zero_copy_receive) {
1393			int disposable;
1394
1395			if ((m->m_flags & M_EXT)
1396			 && (m->m_ext.ext_type == EXT_DISPOSABLE))
1397				disposable = 1;
1398			else
1399				disposable = 0;
1400
1401			error = uiomoveco(mtod(m, void *),
1402					  min(uio->uio_resid, m->m_len),
1403					  uio, disposable);
1404		} else
1405#endif /* ZERO_COPY_SOCKETS */
1406		error = uiomove(mtod(m, void *),
1407		    (int) min(uio->uio_resid, m->m_len), uio);
1408		m = m_free(m);
1409	} while (uio->uio_resid && error == 0 && m);
1410bad:
1411	if (m != NULL)
1412		m_freem(m);
1413	return (error);
1414}
1415
1416/*
1417 * Following replacement or removal of the first mbuf on the first mbuf chain
1418 * of a socket buffer, push necessary state changes back into the socket
1419 * buffer so that other consumers see the values consistently.  'nextrecord'
1420 * is the callers locally stored value of the original value of
1421 * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
1422 * NOTE: 'nextrecord' may be NULL.
1423 */
1424static __inline void
1425sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
1426{
1427
1428	SOCKBUF_LOCK_ASSERT(sb);
1429	/*
1430	 * First, update for the new value of nextrecord.  If necessary, make
1431	 * it the first record.
1432	 */
1433	if (sb->sb_mb != NULL)
1434		sb->sb_mb->m_nextpkt = nextrecord;
1435	else
1436		sb->sb_mb = nextrecord;
1437
1438	/*
1439	 * Now update any dependent socket buffer fields to reflect the new
1440	 * state.  This is an expanded inline of SB_EMPTY_FIXUP(), with the
1441	 * addition of a second clause that takes care of the case where
1442	 * sb_mb has been updated, but remains the last record.
1443	 */
1444	if (sb->sb_mb == NULL) {
1445		sb->sb_mbtail = NULL;
1446		sb->sb_lastrecord = NULL;
1447	} else if (sb->sb_mb->m_nextpkt == NULL)
1448		sb->sb_lastrecord = sb->sb_mb;
1449}
1450
1451
1452/*
1453 * Implement receive operations on a socket.  We depend on the way that
1454 * records are added to the sockbuf by sbappend.  In particular, each record
1455 * (mbufs linked through m_next) must begin with an address if the protocol
1456 * so specifies, followed by an optional mbuf or mbufs containing ancillary
1457 * data, and then zero or more mbufs of data.  In order to allow parallelism
1458 * between network receive and copying to user space, as well as avoid
1459 * sleeping with a mutex held, we release the socket buffer mutex during the
1460 * user space copy.  Although the sockbuf is locked, new data may still be
1461 * appended, and thus we must maintain consistency of the sockbuf during that
1462 * time.
1463 *
1464 * The caller may receive the data as a single mbuf chain by supplying an
1465 * mbuf **mp0 for use in returning the chain.  The uio is then used only for
1466 * the count in uio_resid.
1467 */
1468int
1469soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio,
1470    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1471{
1472	struct mbuf *m, **mp;
1473	int flags, error, offset;
1474	ssize_t len;
1475	struct protosw *pr = so->so_proto;
1476	struct mbuf *nextrecord;
1477	int moff, type = 0;
1478	ssize_t orig_resid = uio->uio_resid;
1479
1480	mp = mp0;
1481	if (psa != NULL)
1482		*psa = NULL;
1483	if (controlp != NULL)
1484		*controlp = NULL;
1485	if (flagsp != NULL)
1486		flags = *flagsp &~ MSG_EOR;
1487	else
1488		flags = 0;
1489	if (flags & MSG_OOB)
1490		return (soreceive_rcvoob(so, uio, flags));
1491	if (mp != NULL)
1492		*mp = NULL;
1493	if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING)
1494	    && uio->uio_resid) {
1495		VNET_SO_ASSERT(so);
1496		(*pr->pr_usrreqs->pru_rcvd)(so, 0);
1497	}
1498
1499	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
1500	if (error)
1501		return (error);
1502
1503restart:
1504	SOCKBUF_LOCK(&so->so_rcv);
1505	m = so->so_rcv.sb_mb;
1506	/*
1507	 * If we have less data than requested, block awaiting more (subject
1508	 * to any timeout) if:
1509	 *   1. the current count is less than the low water mark, or
1510	 *   2. MSG_DONTWAIT is not set
1511	 */
1512	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
1513	    so->so_rcv.sb_cc < uio->uio_resid) &&
1514	    so->so_rcv.sb_cc < so->so_rcv.sb_lowat &&
1515	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
1516		KASSERT(m != NULL || !so->so_rcv.sb_cc,
1517		    ("receive: m == %p so->so_rcv.sb_cc == %u",
1518		    m, so->so_rcv.sb_cc));
1519		if (so->so_error) {
1520			if (m != NULL)
1521				goto dontblock;
1522			error = so->so_error;
1523			if ((flags & MSG_PEEK) == 0)
1524				so->so_error = 0;
1525			SOCKBUF_UNLOCK(&so->so_rcv);
1526			goto release;
1527		}
1528		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1529		if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
1530			if (m == NULL) {
1531				SOCKBUF_UNLOCK(&so->so_rcv);
1532				goto release;
1533			} else
1534				goto dontblock;
1535		}
1536		for (; m != NULL; m = m->m_next)
1537			if (m->m_type == MT_OOBDATA  || (m->m_flags & M_EOR)) {
1538				m = so->so_rcv.sb_mb;
1539				goto dontblock;
1540			}
1541		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
1542		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
1543			SOCKBUF_UNLOCK(&so->so_rcv);
1544			error = ENOTCONN;
1545			goto release;
1546		}
1547		if (uio->uio_resid == 0) {
1548			SOCKBUF_UNLOCK(&so->so_rcv);
1549			goto release;
1550		}
1551		if ((so->so_state & SS_NBIO) ||
1552		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
1553			SOCKBUF_UNLOCK(&so->so_rcv);
1554			error = EWOULDBLOCK;
1555			goto release;
1556		}
1557		SBLASTRECORDCHK(&so->so_rcv);
1558		SBLASTMBUFCHK(&so->so_rcv);
1559		error = sbwait(&so->so_rcv);
1560		SOCKBUF_UNLOCK(&so->so_rcv);
1561		if (error)
1562			goto release;
1563		goto restart;
1564	}
1565dontblock:
1566	/*
1567	 * From this point onward, we maintain 'nextrecord' as a cache of the
1568	 * pointer to the next record in the socket buffer.  We must keep the
1569	 * various socket buffer pointers and local stack versions of the
1570	 * pointers in sync, pushing out modifications before dropping the
1571	 * socket buffer mutex, and re-reading them when picking it up.
1572	 *
1573	 * Otherwise, we will race with the network stack appending new data
1574	 * or records onto the socket buffer by using inconsistent/stale
1575	 * versions of the field, possibly resulting in socket buffer
1576	 * corruption.
1577	 *
1578	 * By holding the high-level sblock(), we prevent simultaneous
1579	 * readers from pulling off the front of the socket buffer.
1580	 */
1581	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1582	if (uio->uio_td)
1583		uio->uio_td->td_ru.ru_msgrcv++;
1584	KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
1585	SBLASTRECORDCHK(&so->so_rcv);
1586	SBLASTMBUFCHK(&so->so_rcv);
1587	nextrecord = m->m_nextpkt;
1588	if (pr->pr_flags & PR_ADDR) {
1589		KASSERT(m->m_type == MT_SONAME,
1590		    ("m->m_type == %d", m->m_type));
1591		orig_resid = 0;
1592		if (psa != NULL)
1593			*psa = sodupsockaddr(mtod(m, struct sockaddr *),
1594			    M_NOWAIT);
1595		if (flags & MSG_PEEK) {
1596			m = m->m_next;
1597		} else {
1598			sbfree(&so->so_rcv, m);
1599			so->so_rcv.sb_mb = m_free(m);
1600			m = so->so_rcv.sb_mb;
1601			sockbuf_pushsync(&so->so_rcv, nextrecord);
1602		}
1603	}
1604
1605	/*
1606	 * Process one or more MT_CONTROL mbufs present before any data mbufs
1607	 * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
1608	 * just copy the data; if !MSG_PEEK, we call into the protocol to
1609	 * perform externalization (or freeing if controlp == NULL).
1610	 */
1611	if (m != NULL && m->m_type == MT_CONTROL) {
1612		struct mbuf *cm = NULL, *cmn;
1613		struct mbuf **cme = &cm;
1614
1615		do {
1616			if (flags & MSG_PEEK) {
1617				if (controlp != NULL) {
1618					*controlp = m_copy(m, 0, m->m_len);
1619					controlp = &(*controlp)->m_next;
1620				}
1621				m = m->m_next;
1622			} else {
1623				sbfree(&so->so_rcv, m);
1624				so->so_rcv.sb_mb = m->m_next;
1625				m->m_next = NULL;
1626				*cme = m;
1627				cme = &(*cme)->m_next;
1628				m = so->so_rcv.sb_mb;
1629			}
1630		} while (m != NULL && m->m_type == MT_CONTROL);
1631		if ((flags & MSG_PEEK) == 0)
1632			sockbuf_pushsync(&so->so_rcv, nextrecord);
1633		while (cm != NULL) {
1634			cmn = cm->m_next;
1635			cm->m_next = NULL;
1636			if (pr->pr_domain->dom_externalize != NULL) {
1637				SOCKBUF_UNLOCK(&so->so_rcv);
1638				VNET_SO_ASSERT(so);
1639				error = (*pr->pr_domain->dom_externalize)
1640				    (cm, controlp);
1641				SOCKBUF_LOCK(&so->so_rcv);
1642			} else if (controlp != NULL)
1643				*controlp = cm;
1644			else
1645				m_freem(cm);
1646			if (controlp != NULL) {
1647				orig_resid = 0;
1648				while (*controlp != NULL)
1649					controlp = &(*controlp)->m_next;
1650			}
1651			cm = cmn;
1652		}
1653		if (m != NULL)
1654			nextrecord = so->so_rcv.sb_mb->m_nextpkt;
1655		else
1656			nextrecord = so->so_rcv.sb_mb;
1657		orig_resid = 0;
1658	}
1659	if (m != NULL) {
1660		if ((flags & MSG_PEEK) == 0) {
1661			KASSERT(m->m_nextpkt == nextrecord,
1662			    ("soreceive: post-control, nextrecord !sync"));
1663			if (nextrecord == NULL) {
1664				KASSERT(so->so_rcv.sb_mb == m,
1665				    ("soreceive: post-control, sb_mb!=m"));
1666				KASSERT(so->so_rcv.sb_lastrecord == m,
1667				    ("soreceive: post-control, lastrecord!=m"));
1668			}
1669		}
1670		type = m->m_type;
1671		if (type == MT_OOBDATA)
1672			flags |= MSG_OOB;
1673	} else {
1674		if ((flags & MSG_PEEK) == 0) {
1675			KASSERT(so->so_rcv.sb_mb == nextrecord,
1676			    ("soreceive: sb_mb != nextrecord"));
1677			if (so->so_rcv.sb_mb == NULL) {
1678				KASSERT(so->so_rcv.sb_lastrecord == NULL,
1679				    ("soreceive: sb_lastrecord != NULL"));
1680			}
1681		}
1682	}
1683	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1684	SBLASTRECORDCHK(&so->so_rcv);
1685	SBLASTMBUFCHK(&so->so_rcv);
1686
1687	/*
1688	 * Now continue to read any data mbufs off of the head of the socket
1689	 * buffer until the read request is satisfied.  Note that 'type' is
1690	 * used to store the type of any mbuf reads that have happened so far
1691	 * such that soreceive() can stop reading if the type changes, which
1692	 * causes soreceive() to return only one of regular data and inline
1693	 * out-of-band data in a single socket receive operation.
1694	 */
1695	moff = 0;
1696	offset = 0;
1697	while (m != NULL && uio->uio_resid > 0 && error == 0) {
1698		/*
1699		 * If the type of mbuf has changed since the last mbuf
1700		 * examined ('type'), end the receive operation.
1701	 	 */
1702		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1703		if (m->m_type == MT_OOBDATA || m->m_type == MT_CONTROL) {
1704			if (type != m->m_type)
1705				break;
1706		} else if (type == MT_OOBDATA)
1707			break;
1708		else
1709		    KASSERT(m->m_type == MT_DATA,
1710			("m->m_type == %d", m->m_type));
1711		so->so_rcv.sb_state &= ~SBS_RCVATMARK;
1712		len = uio->uio_resid;
1713		if (so->so_oobmark && len > so->so_oobmark - offset)
1714			len = so->so_oobmark - offset;
1715		if (len > m->m_len - moff)
1716			len = m->m_len - moff;
1717		/*
1718		 * If mp is set, just pass back the mbufs.  Otherwise copy
1719		 * them out via the uio, then free.  The sockbuf must be
1720		 * consistent here (sb_mb points to the current mbuf, m_nextpkt
1721		 * to the next record) when we drop the sockbuf lock; we must
1722		 * note any additions to it when we reacquire the lock.
1723		 */
1724		if (mp == NULL) {
1725			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1726			SBLASTRECORDCHK(&so->so_rcv);
1727			SBLASTMBUFCHK(&so->so_rcv);
1728			SOCKBUF_UNLOCK(&so->so_rcv);
1729#ifdef ZERO_COPY_SOCKETS
1730			if (so_zero_copy_receive) {
1731				int disposable;
1732
1733				if ((m->m_flags & M_EXT)
1734				 && (m->m_ext.ext_type == EXT_DISPOSABLE))
1735					disposable = 1;
1736				else
1737					disposable = 0;
1738
1739				error = uiomoveco(mtod(m, char *) + moff,
1740						  (int)len, uio,
1741						  disposable);
1742			} else
1743#endif /* ZERO_COPY_SOCKETS */
1744			error = uiomove(mtod(m, char *) + moff, (int)len, uio);
1745			SOCKBUF_LOCK(&so->so_rcv);
1746			if (error) {
1747				/*
1748				 * The MT_SONAME mbuf has already been removed
1749				 * from the record, so it is necessary to
1750				 * remove the data mbufs, if any, to preserve
1751				 * the invariant in the case of PR_ADDR that
1752				 * requires MT_SONAME mbufs at the head of
1753				 * each record.
1754				 */
1755				if (m && pr->pr_flags & PR_ATOMIC &&
1756				    ((flags & MSG_PEEK) == 0))
1757					(void)sbdroprecord_locked(&so->so_rcv);
1758				SOCKBUF_UNLOCK(&so->so_rcv);
1759				goto release;
1760			}
1761		} else
1762			uio->uio_resid -= len;
1763		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1764		if (len == m->m_len - moff) {
1765			if (m->m_flags & M_EOR)
1766				flags |= MSG_EOR;
1767			if (flags & MSG_PEEK) {
1768				m = m->m_next;
1769				moff = 0;
1770			} else {
1771				nextrecord = m->m_nextpkt;
1772				sbfree(&so->so_rcv, m);
1773				if (mp != NULL) {
1774					*mp = m;
1775					mp = &m->m_next;
1776					so->so_rcv.sb_mb = m = m->m_next;
1777					*mp = NULL;
1778				} else {
1779					so->so_rcv.sb_mb = m_free(m);
1780					m = so->so_rcv.sb_mb;
1781				}
1782				sockbuf_pushsync(&so->so_rcv, nextrecord);
1783				SBLASTRECORDCHK(&so->so_rcv);
1784				SBLASTMBUFCHK(&so->so_rcv);
1785			}
1786		} else {
1787			if (flags & MSG_PEEK)
1788				moff += len;
1789			else {
1790				if (mp != NULL) {
1791					int copy_flag;
1792
1793					if (flags & MSG_DONTWAIT)
1794						copy_flag = M_DONTWAIT;
1795					else
1796						copy_flag = M_WAIT;
1797					if (copy_flag == M_WAIT)
1798						SOCKBUF_UNLOCK(&so->so_rcv);
1799					*mp = m_copym(m, 0, len, copy_flag);
1800					if (copy_flag == M_WAIT)
1801						SOCKBUF_LOCK(&so->so_rcv);
1802					if (*mp == NULL) {
1803						/*
1804						 * m_copym() couldn't
1805						 * allocate an mbuf.  Adjust
1806						 * uio_resid back (it was
1807						 * adjusted down by len
1808						 * bytes, which we didn't end
1809						 * up "copying" over).
1810						 */
1811						uio->uio_resid += len;
1812						break;
1813					}
1814				}
1815				m->m_data += len;
1816				m->m_len -= len;
1817				so->so_rcv.sb_cc -= len;
1818			}
1819		}
1820		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1821		if (so->so_oobmark) {
1822			if ((flags & MSG_PEEK) == 0) {
1823				so->so_oobmark -= len;
1824				if (so->so_oobmark == 0) {
1825					so->so_rcv.sb_state |= SBS_RCVATMARK;
1826					break;
1827				}
1828			} else {
1829				offset += len;
1830				if (offset == so->so_oobmark)
1831					break;
1832			}
1833		}
1834		if (flags & MSG_EOR)
1835			break;
1836		/*
1837		 * If the MSG_WAITALL flag is set (for non-atomic socket), we
1838		 * must not quit until "uio->uio_resid == 0" or an error
1839		 * termination.  If a signal/timeout occurs, return with a
1840		 * short count but without error.  Keep sockbuf locked
1841		 * against other readers.
1842		 */
1843		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
1844		    !sosendallatonce(so) && nextrecord == NULL) {
1845			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1846			if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE)
1847				break;
1848			/*
1849			 * Notify the protocol that some data has been
1850			 * drained before blocking.
1851			 */
1852			if (pr->pr_flags & PR_WANTRCVD) {
1853				SOCKBUF_UNLOCK(&so->so_rcv);
1854				VNET_SO_ASSERT(so);
1855				(*pr->pr_usrreqs->pru_rcvd)(so, flags);
1856				SOCKBUF_LOCK(&so->so_rcv);
1857			}
1858			SBLASTRECORDCHK(&so->so_rcv);
1859			SBLASTMBUFCHK(&so->so_rcv);
1860			/*
1861			 * We could have received some data while we were
1862			 * notifying the protocol; skip blocking in that case.
1863			 */
1864			if (so->so_rcv.sb_mb == NULL) {
1865				error = sbwait(&so->so_rcv);
1866				if (error) {
1867					SOCKBUF_UNLOCK(&so->so_rcv);
1868					goto release;
1869				}
1870			}
1871			m = so->so_rcv.sb_mb;
1872			if (m != NULL)
1873				nextrecord = m->m_nextpkt;
1874		}
1875	}
1876
1877	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1878	if (m != NULL && pr->pr_flags & PR_ATOMIC) {
1879		flags |= MSG_TRUNC;
1880		if ((flags & MSG_PEEK) == 0)
1881			(void) sbdroprecord_locked(&so->so_rcv);
1882	}
1883	if ((flags & MSG_PEEK) == 0) {
1884		if (m == NULL) {
1885			/*
1886			 * First part is an inline SB_EMPTY_FIXUP().  Second
1887			 * part makes sure sb_lastrecord is up-to-date if
1888			 * there is still data in the socket buffer.
1889			 */
1890			so->so_rcv.sb_mb = nextrecord;
1891			if (so->so_rcv.sb_mb == NULL) {
1892				so->so_rcv.sb_mbtail = NULL;
1893				so->so_rcv.sb_lastrecord = NULL;
1894			} else if (nextrecord->m_nextpkt == NULL)
1895				so->so_rcv.sb_lastrecord = nextrecord;
1896		}
1897		SBLASTRECORDCHK(&so->so_rcv);
1898		SBLASTMBUFCHK(&so->so_rcv);
1899		/*
1900		 * If soreceive() is being done from the socket callback, then
1901		 * we don't need to generate an ACK to the peer to update the
1902		 * window, since an ACK will be generated on return to TCP.
1903		 */
1904		if (!(flags & MSG_SOCALLBCK) &&
1905		    (pr->pr_flags & PR_WANTRCVD)) {
1906			SOCKBUF_UNLOCK(&so->so_rcv);
1907			VNET_SO_ASSERT(so);
1908			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
1909			SOCKBUF_LOCK(&so->so_rcv);
1910		}
1911	}
1912	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1913	if (orig_resid == uio->uio_resid && orig_resid &&
1914	    (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
1915		SOCKBUF_UNLOCK(&so->so_rcv);
1916		goto restart;
1917	}
1918	SOCKBUF_UNLOCK(&so->so_rcv);
1919
1920	if (flagsp != NULL)
1921		*flagsp |= flags;
1922release:
1923	sbunlock(&so->so_rcv);
1924	return (error);
1925}
1926
1927/*
1928 * Optimized version of soreceive() for stream (TCP) sockets.
1929 * XXXAO: (MSG_WAITALL | MSG_PEEK) isn't properly handled.
1930 */
1931int
1932soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio,
1933    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1934{
1935	int len = 0, error = 0, flags, oresid;
1936	struct sockbuf *sb;
1937	struct mbuf *m, *n = NULL;
1938
1939	/* We only do stream sockets. */
1940	if (so->so_type != SOCK_STREAM)
1941		return (EINVAL);
1942	if (psa != NULL)
1943		*psa = NULL;
1944	if (controlp != NULL)
1945		return (EINVAL);
1946	if (flagsp != NULL)
1947		flags = *flagsp &~ MSG_EOR;
1948	else
1949		flags = 0;
1950	if (flags & MSG_OOB)
1951		return (soreceive_rcvoob(so, uio, flags));
1952	if (mp0 != NULL)
1953		*mp0 = NULL;
1954
1955	sb = &so->so_rcv;
1956
1957	/* Prevent other readers from entering the socket. */
1958	error = sblock(sb, SBLOCKWAIT(flags));
1959	if (error)
1960		goto out;
1961	SOCKBUF_LOCK(sb);
1962
1963	/* Easy one, no space to copyout anything. */
1964	if (uio->uio_resid == 0) {
1965		error = EINVAL;
1966		goto out;
1967	}
1968	oresid = uio->uio_resid;
1969
1970	/* We will never ever get anything unless we are or were connected. */
1971	if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
1972		error = ENOTCONN;
1973		goto out;
1974	}
1975
1976restart:
1977	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1978
1979	/* Abort if socket has reported problems. */
1980	if (so->so_error) {
1981		if (sb->sb_cc > 0)
1982			goto deliver;
1983		if (oresid > uio->uio_resid)
1984			goto out;
1985		error = so->so_error;
1986		if (!(flags & MSG_PEEK))
1987			so->so_error = 0;
1988		goto out;
1989	}
1990
1991	/* Door is closed.  Deliver what is left, if any. */
1992	if (sb->sb_state & SBS_CANTRCVMORE) {
1993		if (sb->sb_cc > 0)
1994			goto deliver;
1995		else
1996			goto out;
1997	}
1998
1999	/* Socket buffer is empty and we shall not block. */
2000	if (sb->sb_cc == 0 &&
2001	    ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
2002		error = EAGAIN;
2003		goto out;
2004	}
2005
2006	/* Socket buffer got some data that we shall deliver now. */
2007	if (sb->sb_cc > 0 && !(flags & MSG_WAITALL) &&
2008	    ((so->so_state & SS_NBIO) ||
2009	     (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
2010	     sb->sb_cc >= sb->sb_lowat ||
2011	     sb->sb_cc >= uio->uio_resid ||
2012	     sb->sb_cc >= sb->sb_hiwat)) {
2013		goto deliver;
2014	}
2015
2016	/* On MSG_WAITALL we must wait until all data or error arrives. */
2017	if ((flags & MSG_WAITALL) &&
2018	    (sb->sb_cc >= uio->uio_resid || sb->sb_cc >= sb->sb_hiwat))
2019		goto deliver;
2020
2021	/*
2022	 * Wait and block until (more) data comes in.
2023	 * NB: Drops the sockbuf lock during wait.
2024	 */
2025	error = sbwait(sb);
2026	if (error)
2027		goto out;
2028	goto restart;
2029
2030deliver:
2031	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2032	KASSERT(sb->sb_cc > 0, ("%s: sockbuf empty", __func__));
2033	KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));
2034
2035	/* Statistics. */
2036	if (uio->uio_td)
2037		uio->uio_td->td_ru.ru_msgrcv++;
2038
2039	/* Fill uio until full or current end of socket buffer is reached. */
2040	len = min(uio->uio_resid, sb->sb_cc);
2041	if (mp0 != NULL) {
2042		/* Dequeue as many mbufs as possible. */
2043		if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
2044			if (*mp0 == NULL)
2045				*mp0 = sb->sb_mb;
2046			else
2047				m_cat(*mp0, sb->sb_mb);
2048			for (m = sb->sb_mb;
2049			     m != NULL && m->m_len <= len;
2050			     m = m->m_next) {
2051				len -= m->m_len;
2052				uio->uio_resid -= m->m_len;
2053				sbfree(sb, m);
2054				n = m;
2055			}
2056			n->m_next = NULL;
2057			sb->sb_mb = m;
2058			sb->sb_lastrecord = sb->sb_mb;
2059			if (sb->sb_mb == NULL)
2060				SB_EMPTY_FIXUP(sb);
2061		}
2062		/* Copy the remainder. */
2063		if (len > 0) {
2064			KASSERT(sb->sb_mb != NULL,
2065			    ("%s: len > 0 && sb->sb_mb empty", __func__));
2066
2067			m = m_copym(sb->sb_mb, 0, len, M_DONTWAIT);
2068			if (m == NULL)
2069				len = 0;	/* Don't flush data from sockbuf. */
2070			else
2071				uio->uio_resid -= len;
2072			if (*mp0 != NULL)
2073				m_cat(*mp0, m);
2074			else
2075				*mp0 = m;
2076			if (*mp0 == NULL) {
2077				error = ENOBUFS;
2078				goto out;
2079			}
2080		}
2081	} else {
2082		/* NB: Must unlock socket buffer as uiomove may sleep. */
2083		SOCKBUF_UNLOCK(sb);
2084		error = m_mbuftouio(uio, sb->sb_mb, len);
2085		SOCKBUF_LOCK(sb);
2086		if (error)
2087			goto out;
2088	}
2089	SBLASTRECORDCHK(sb);
2090	SBLASTMBUFCHK(sb);
2091
2092	/*
2093	 * Remove the delivered data from the socket buffer unless we
2094	 * were only peeking.
2095	 */
2096	if (!(flags & MSG_PEEK)) {
2097		if (len > 0)
2098			sbdrop_locked(sb, len);
2099
2100		/* Notify protocol that we drained some data. */
2101		if ((so->so_proto->pr_flags & PR_WANTRCVD) &&
2102		    (((flags & MSG_WAITALL) && uio->uio_resid > 0) ||
2103		     !(flags & MSG_SOCALLBCK))) {
2104			SOCKBUF_UNLOCK(sb);
2105			VNET_SO_ASSERT(so);
2106			(*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags);
2107			SOCKBUF_LOCK(sb);
2108		}
2109	}
2110
2111	/*
2112	 * For MSG_WAITALL we may have to loop again and wait for
2113	 * more data to come in.
2114	 */
2115	if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
2116		goto restart;
2117out:
2118	SOCKBUF_LOCK_ASSERT(sb);
2119	SBLASTRECORDCHK(sb);
2120	SBLASTMBUFCHK(sb);
2121	SOCKBUF_UNLOCK(sb);
2122	sbunlock(sb);
2123	return (error);
2124}
2125
2126/*
2127 * Optimized version of soreceive() for simple datagram cases from userspace.
2128 * Unlike in the stream case, we're able to drop a datagram if copyout()
2129 * fails, and because we handle datagrams atomically, we don't need to use a
2130 * sleep lock to prevent I/O interlacing.
2131 */
2132int
2133soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio,
2134    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2135{
2136	struct mbuf *m, *m2;
2137	int flags, error;
2138	ssize_t len;
2139	struct protosw *pr = so->so_proto;
2140	struct mbuf *nextrecord;
2141
2142	if (psa != NULL)
2143		*psa = NULL;
2144	if (controlp != NULL)
2145		*controlp = NULL;
2146	if (flagsp != NULL)
2147		flags = *flagsp &~ MSG_EOR;
2148	else
2149		flags = 0;
2150
2151	/*
2152	 * For any complicated cases, fall back to the full
2153	 * soreceive_generic().
2154	 */
2155	if (mp0 != NULL || (flags & MSG_PEEK) || (flags & MSG_OOB))
2156		return (soreceive_generic(so, psa, uio, mp0, controlp,
2157		    flagsp));
2158
2159	/*
2160	 * Enforce restrictions on use.
2161	 */
2162	KASSERT((pr->pr_flags & PR_WANTRCVD) == 0,
2163	    ("soreceive_dgram: wantrcvd"));
2164	KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic"));
2165	KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0,
2166	    ("soreceive_dgram: SBS_RCVATMARK"));
2167	KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0,
2168	    ("soreceive_dgram: PR_CONNREQUIRED"));
2169
2170	/*
2171	 * Loop blocking while waiting for a datagram.
2172	 */
2173	SOCKBUF_LOCK(&so->so_rcv);
2174	while ((m = so->so_rcv.sb_mb) == NULL) {
2175		KASSERT(so->so_rcv.sb_cc == 0,
2176		    ("soreceive_dgram: sb_mb NULL but sb_cc %u",
2177		    so->so_rcv.sb_cc));
2178		if (so->so_error) {
2179			error = so->so_error;
2180			so->so_error = 0;
2181			SOCKBUF_UNLOCK(&so->so_rcv);
2182			return (error);
2183		}
2184		if (so->so_rcv.sb_state & SBS_CANTRCVMORE ||
2185		    uio->uio_resid == 0) {
2186			SOCKBUF_UNLOCK(&so->so_rcv);
2187			return (0);
2188		}
2189		if ((so->so_state & SS_NBIO) ||
2190		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
2191			SOCKBUF_UNLOCK(&so->so_rcv);
2192			return (EWOULDBLOCK);
2193		}
2194		SBLASTRECORDCHK(&so->so_rcv);
2195		SBLASTMBUFCHK(&so->so_rcv);
2196		error = sbwait(&so->so_rcv);
2197		if (error) {
2198			SOCKBUF_UNLOCK(&so->so_rcv);
2199			return (error);
2200		}
2201	}
2202	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2203
2204	if (uio->uio_td)
2205		uio->uio_td->td_ru.ru_msgrcv++;
2206	SBLASTRECORDCHK(&so->so_rcv);
2207	SBLASTMBUFCHK(&so->so_rcv);
2208	nextrecord = m->m_nextpkt;
2209	if (nextrecord == NULL) {
2210		KASSERT(so->so_rcv.sb_lastrecord == m,
2211		    ("soreceive_dgram: lastrecord != m"));
2212	}
2213
2214	KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord,
2215	    ("soreceive_dgram: m_nextpkt != nextrecord"));
2216
2217	/*
2218	 * Pull 'm' and its chain off the front of the packet queue.
2219	 */
2220	so->so_rcv.sb_mb = NULL;
2221	sockbuf_pushsync(&so->so_rcv, nextrecord);
2222
2223	/*
2224	 * Walk 'm's chain and free that many bytes from the socket buffer.
2225	 */
2226	for (m2 = m; m2 != NULL; m2 = m2->m_next)
2227		sbfree(&so->so_rcv, m2);
2228
2229	/*
2230	 * Do a few last checks before we let go of the lock.
2231	 */
2232	SBLASTRECORDCHK(&so->so_rcv);
2233	SBLASTMBUFCHK(&so->so_rcv);
2234	SOCKBUF_UNLOCK(&so->so_rcv);
2235
2236	if (pr->pr_flags & PR_ADDR) {
2237		KASSERT(m->m_type == MT_SONAME,
2238		    ("m->m_type == %d", m->m_type));
2239		if (psa != NULL)
2240			*psa = sodupsockaddr(mtod(m, struct sockaddr *),
2241			    M_NOWAIT);
2242		m = m_free(m);
2243	}
2244	if (m == NULL) {
2245		/* XXXRW: Can this happen? */
2246		return (0);
2247	}
2248
2249	/*
2250	 * Packet to copyout() is now in 'm' and it is disconnected from the
2251	 * queue.
2252	 *
2253	 * Process one or more MT_CONTROL mbufs present before any data mbufs
2254	 * in the first mbuf chain on the socket buffer.  We call into the
2255	 * protocol to perform externalization (or freeing if controlp ==
2256	 * NULL).
2257	 */
2258	if (m->m_type == MT_CONTROL) {
2259		struct mbuf *cm = NULL, *cmn;
2260		struct mbuf **cme = &cm;
2261
2262		do {
2263			m2 = m->m_next;
2264			m->m_next = NULL;
2265			*cme = m;
2266			cme = &(*cme)->m_next;
2267			m = m2;
2268		} while (m != NULL && m->m_type == MT_CONTROL);
2269		while (cm != NULL) {
2270			cmn = cm->m_next;
2271			cm->m_next = NULL;
2272			if (pr->pr_domain->dom_externalize != NULL) {
2273				error = (*pr->pr_domain->dom_externalize)
2274				    (cm, controlp);
2275			} else if (controlp != NULL)
2276				*controlp = cm;
2277			else
2278				m_freem(cm);
2279			if (controlp != NULL) {
2280				while (*controlp != NULL)
2281					controlp = &(*controlp)->m_next;
2282			}
2283			cm = cmn;
2284		}
2285	}
2286	KASSERT(m->m_type == MT_DATA, ("soreceive_dgram: !data"));
2287
2288	while (m != NULL && uio->uio_resid > 0) {
2289		len = uio->uio_resid;
2290		if (len > m->m_len)
2291			len = m->m_len;
2292		error = uiomove(mtod(m, char *), (int)len, uio);
2293		if (error) {
2294			m_freem(m);
2295			return (error);
2296		}
2297		if (len == m->m_len)
2298			m = m_free(m);
2299		else {
2300			m->m_data += len;
2301			m->m_len -= len;
2302		}
2303	}
2304	if (m != NULL)
2305		flags |= MSG_TRUNC;
2306	m_freem(m);
2307	if (flagsp != NULL)
2308		*flagsp |= flags;
2309	return (0);
2310}
2311
2312int
2313soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
2314    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2315{
2316	int error;
2317
2318	CURVNET_SET(so->so_vnet);
2319	error = (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio, mp0,
2320	    controlp, flagsp));
2321	CURVNET_RESTORE();
2322	return (error);
2323}
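
/*
 * Illustrative sketch (editorial addition, not part of the original
 * revision): how a kernel consumer might call soreceive() to pull data
 * from a socket into a kernel buffer.  The helper name example_soread()
 * and its callers are hypothetical.
 */
#if 0
static int
example_soread(struct socket *so, void *buf, size_t buflen, size_t *donep)
{
	struct uio auio;
	struct iovec aiov;
	int error, flags;

	aiov.iov_base = buf;
	aiov.iov_len = buflen;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = 0;
	auio.uio_resid = buflen;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_rw = UIO_READ;
	auio.uio_td = curthread;
	flags = MSG_DONTWAIT;		/* don't sleep waiting for data */

	error = soreceive(so, NULL, &auio, NULL, NULL, &flags);
	if (error == 0 && donep != NULL)
		*donep = buflen - auio.uio_resid;	/* bytes received */
	return (error);
}
#endif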
2324
2325int
2326soshutdown(struct socket *so, int how)
2327{
2328	struct protosw *pr = so->so_proto;
2329	int error;
2330
2331	if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
2332		return (EINVAL);
2333
2334	CURVNET_SET(so->so_vnet);
2335	if (pr->pr_usrreqs->pru_flush != NULL) {
2336		(*pr->pr_usrreqs->pru_flush)(so, how);
2337	}
2338	if (how != SHUT_WR)
2339		sorflush(so);
2340	if (how != SHUT_RD) {
2341		error = (*pr->pr_usrreqs->pru_shutdown)(so);
2342		CURVNET_RESTORE();
2343		return (error);
2344	}
2345	CURVNET_RESTORE();
2346	return (0);
2347}
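
/*
 * Illustrative sketch (editorial addition): a kernel consumer that has
 * finished sending can half-close a connection with soshutdown() so the
 * peer sees EOF while the local side can still read any remaining data.
 */
#if 0
static int
example_half_close(struct socket *so)
{

	/* Stop further writes; reads on this side continue to work. */
	return (soshutdown(so, SHUT_WR));
}
#endif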
2348
2349void
2350sorflush(struct socket *so)
2351{
2352	struct sockbuf *sb = &so->so_rcv;
2353	struct protosw *pr = so->so_proto;
2354	struct sockbuf asb;
2355
2356	VNET_SO_ASSERT(so);
2357
2358	/*
2359	 * In order to avoid calling dom_dispose with the socket buffer mutex
2360	 * held, and in order to generally avoid holding the lock for a long
2361	 * time, we make a copy of the socket buffer and clear the original
2362	 * (except locks, state).  The new socket buffer copy won't have
2363	 * initialized locks so we can only call routines that won't use or
2364	 * assert those locks.
2365	 *
2366	 * Dislodge threads currently blocked in receive and wait to acquire
2367	 * a lock against other simultaneous readers before clearing the
2368	 * socket buffer.  Don't let our acquire be interrupted by a signal
2369	 * despite any existing socket disposition on interruptible waiting.
2370	 */
2371	socantrcvmore(so);
2372	(void) sblock(sb, SBL_WAIT | SBL_NOINTR);
2373
2374	/*
2375	 * Invalidate/clear most of the sockbuf structure, but leave selinfo
2376	 * and mutex data unchanged.
2377	 */
2378	SOCKBUF_LOCK(sb);
2379	bzero(&asb, offsetof(struct sockbuf, sb_startzero));
2380	bcopy(&sb->sb_startzero, &asb.sb_startzero,
2381	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
2382	bzero(&sb->sb_startzero,
2383	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
2384	SOCKBUF_UNLOCK(sb);
2385	sbunlock(sb);
2386
2387	/*
2388	 * Dispose of special rights and flush the socket buffer.  Don't call
2389	 * any unsafe routines (that rely on locks being initialized) on asb.
2390	 */
2391	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
2392		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
2393	sbrelease_internal(&asb, so);
2394}
2395
2396/*
2397 * Perhaps this routine, and sooptcopyout(), below, ought to come in an
2398 * additional variant to handle the case where the option value needs to be
2399 * some kind of integer, but not a specific size.  In addition to their use
2400 * here, these functions are also called by the protocol-level pr_ctloutput()
2401 * routines.
2402 */
2403int
2404sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
2405{
2406	size_t	valsize;
2407
2408	/*
2409	 * If the user gives us more than we wanted, we ignore it, but if we
2410	 * don't get the minimum length the caller wants, we return EINVAL.
2411	 * On success, sopt->sopt_valsize is set to however much we actually
2412	 * retrieved.
2413	 */
2414	if ((valsize = sopt->sopt_valsize) < minlen)
2415		return EINVAL;
2416	if (valsize > len)
2417		sopt->sopt_valsize = valsize = len;
2418
2419	if (sopt->sopt_td != NULL)
2420		return (copyin(sopt->sopt_val, buf, valsize));
2421
2422	bcopy(sopt->sopt_val, buf, valsize);
2423	return (0);
2424}
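
/*
 * Illustrative sketch (editorial addition): a protocol-level
 * pr_ctloutput() handler typically uses sooptcopyin() to fetch a
 * fixed-size option value before validating it.  The examplepcb type
 * and its exp_value field below are hypothetical.
 */
#if 0
static int
example_ctloutput_set(struct examplepcb *pcb, struct sockopt *sopt)
{
	int error, optval;

	error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
	if (error != 0)
		return (error);
	if (optval < 0)
		return (EINVAL);
	pcb->exp_value = optval;	/* hypothetical per-connection field */
	return (0);
}
#endif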
2425
2426/*
2427 * Kernel version of setsockopt(2).
2428 *
2429 * XXX: optlen is size_t, not socklen_t
2430 */
2431int
2432so_setsockopt(struct socket *so, int level, int optname, void *optval,
2433    size_t optlen)
2434{
2435	struct sockopt sopt;
2436
2437	sopt.sopt_level = level;
2438	sopt.sopt_name = optname;
2439	sopt.sopt_dir = SOPT_SET;
2440	sopt.sopt_val = optval;
2441	sopt.sopt_valsize = optlen;
2442	sopt.sopt_td = NULL;
2443	return (sosetopt(so, &sopt));
2444}
2445
2446int
2447sosetopt(struct socket *so, struct sockopt *sopt)
2448{
2449	int	error, optval;
2450	struct	linger l;
2451	struct	timeval tv;
2452	u_long  val;
2453	uint32_t val32;
2454#ifdef MAC
2455	struct mac extmac;
2456#endif
2457
2458	CURVNET_SET(so->so_vnet);
2459	error = 0;
2460	if (sopt->sopt_level != SOL_SOCKET) {
2461		if (so->so_proto->pr_ctloutput != NULL) {
2462			error = (*so->so_proto->pr_ctloutput)(so, sopt);
2463			CURVNET_RESTORE();
2464			return (error);
2465		}
2466		error = ENOPROTOOPT;
2467	} else {
2468		switch (sopt->sopt_name) {
2469#ifdef INET
2470		case SO_ACCEPTFILTER:
2471			error = do_setopt_accept_filter(so, sopt);
2472			if (error)
2473				goto bad;
2474			break;
2475#endif
2476		case SO_LINGER:
2477			error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
2478			if (error)
2479				goto bad;
2480
2481			SOCK_LOCK(so);
2482			so->so_linger = l.l_linger;
2483			if (l.l_onoff)
2484				so->so_options |= SO_LINGER;
2485			else
2486				so->so_options &= ~SO_LINGER;
2487			SOCK_UNLOCK(so);
2488			break;
2489
2490		case SO_DEBUG:
2491		case SO_KEEPALIVE:
2492		case SO_DONTROUTE:
2493		case SO_USELOOPBACK:
2494		case SO_BROADCAST:
2495		case SO_REUSEADDR:
2496		case SO_REUSEPORT:
2497		case SO_OOBINLINE:
2498		case SO_TIMESTAMP:
2499		case SO_BINTIME:
2500		case SO_NOSIGPIPE:
2501		case SO_NO_DDP:
2502		case SO_NO_OFFLOAD:
2503			error = sooptcopyin(sopt, &optval, sizeof optval,
2504					    sizeof optval);
2505			if (error)
2506				goto bad;
2507			SOCK_LOCK(so);
2508			if (optval)
2509				so->so_options |= sopt->sopt_name;
2510			else
2511				so->so_options &= ~sopt->sopt_name;
2512			SOCK_UNLOCK(so);
2513			break;
2514
2515		case SO_SETFIB:
2516			error = sooptcopyin(sopt, &optval, sizeof optval,
2517					    sizeof optval);
2518			if (error)
2519				goto bad;
2520
2521			if (optval < 0 || optval >= rt_numfibs) {
2522				error = EINVAL;
2523				goto bad;
2524			}
2525			if (((so->so_proto->pr_domain->dom_family == PF_INET) ||
2526			   (so->so_proto->pr_domain->dom_family == PF_INET6) ||
2527			   (so->so_proto->pr_domain->dom_family == PF_ROUTE)))
2528				so->so_fibnum = optval;
2529			else
2530				so->so_fibnum = 0;
2531			break;
2532
2533		case SO_USER_COOKIE:
2534			error = sooptcopyin(sopt, &val32, sizeof val32,
2535					    sizeof val32);
2536			if (error)
2537				goto bad;
2538			so->so_user_cookie = val32;
2539			break;
2540
2541		case SO_SNDBUF:
2542		case SO_RCVBUF:
2543		case SO_SNDLOWAT:
2544		case SO_RCVLOWAT:
2545			error = sooptcopyin(sopt, &optval, sizeof optval,
2546					    sizeof optval);
2547			if (error)
2548				goto bad;
2549
2550			/*
2551			 * Values < 1 make no sense for any of these options,
2552			 * so disallow them.
2553			 */
2554			if (optval < 1) {
2555				error = EINVAL;
2556				goto bad;
2557			}
2558
2559			switch (sopt->sopt_name) {
2560			case SO_SNDBUF:
2561			case SO_RCVBUF:
2562				if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
2563				    &so->so_snd : &so->so_rcv, (u_long)optval,
2564				    so, curthread) == 0) {
2565					error = ENOBUFS;
2566					goto bad;
2567				}
2568				(sopt->sopt_name == SO_SNDBUF ? &so->so_snd :
2569				    &so->so_rcv)->sb_flags &= ~SB_AUTOSIZE;
2570				break;
2571
2572			/*
2573			 * Make sure the low-water is never greater than the
2574			 * high-water.
2575			 */
2576			case SO_SNDLOWAT:
2577				SOCKBUF_LOCK(&so->so_snd);
2578				so->so_snd.sb_lowat =
2579				    (optval > so->so_snd.sb_hiwat) ?
2580				    so->so_snd.sb_hiwat : optval;
2581				SOCKBUF_UNLOCK(&so->so_snd);
2582				break;
2583			case SO_RCVLOWAT:
2584				SOCKBUF_LOCK(&so->so_rcv);
2585				so->so_rcv.sb_lowat =
2586				    (optval > so->so_rcv.sb_hiwat) ?
2587				    so->so_rcv.sb_hiwat : optval;
2588				SOCKBUF_UNLOCK(&so->so_rcv);
2589				break;
2590			}
2591			break;
2592
2593		case SO_SNDTIMEO:
2594		case SO_RCVTIMEO:
2595#ifdef COMPAT_FREEBSD32
2596			if (SV_CURPROC_FLAG(SV_ILP32)) {
2597				struct timeval32 tv32;
2598
2599				error = sooptcopyin(sopt, &tv32, sizeof tv32,
2600				    sizeof tv32);
2601				CP(tv32, tv, tv_sec);
2602				CP(tv32, tv, tv_usec);
2603			} else
2604#endif
2605				error = sooptcopyin(sopt, &tv, sizeof tv,
2606				    sizeof tv);
2607			if (error)
2608				goto bad;
2609
2610			/* assert(hz > 0); */
2611			if (tv.tv_sec < 0 || tv.tv_sec > INT_MAX / hz ||
2612			    tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
2613				error = EDOM;
2614				goto bad;
2615			}
2616			/* assert(tick > 0); */
2617			/* assert(ULONG_MAX - INT_MAX >= 1000000); */
2618			val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick;
2619			if (val > INT_MAX) {
2620				error = EDOM;
2621				goto bad;
2622			}
2623			if (val == 0 && tv.tv_usec != 0)
2624				val = 1;
2625
2626			switch (sopt->sopt_name) {
2627			case SO_SNDTIMEO:
2628				so->so_snd.sb_timeo = val;
2629				break;
2630			case SO_RCVTIMEO:
2631				so->so_rcv.sb_timeo = val;
2632				break;
2633			}
2634			break;
2635
2636		case SO_LABEL:
2637#ifdef MAC
2638			error = sooptcopyin(sopt, &extmac, sizeof extmac,
2639			    sizeof extmac);
2640			if (error)
2641				goto bad;
2642			error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
2643			    so, &extmac);
2644#else
2645			error = EOPNOTSUPP;
2646#endif
2647			break;
2648
2649		default:
2650			error = ENOPROTOOPT;
2651			break;
2652		}
2653		if (error == 0 && so->so_proto->pr_ctloutput != NULL)
2654			(void)(*so->so_proto->pr_ctloutput)(so, sopt);
2655	}
2656bad:
2657	CURVNET_RESTORE();
2658	return (error);
2659}
2660
2661/*
2662 * Helper routine for getsockopt.
2663 */
2664int
2665sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
2666{
2667	int	error;
2668	size_t	valsize;
2669
2670	error = 0;
2671
2672	/*
2673	 * Documented get behavior is that we always return a value, possibly
2674	 * truncated to fit in the user's buffer.  Traditional behavior is
2675	 * that we always tell the user precisely how much we copied, rather
2676	 * than something useful like the total amount we had available for
2677	 * her.  Note that this interface is not idempotent; the entire
2678	 * answer must be generated ahead of time.
2679	 */
2680	valsize = min(len, sopt->sopt_valsize);
2681	sopt->sopt_valsize = valsize;
2682	if (sopt->sopt_val != NULL) {
2683		if (sopt->sopt_td != NULL)
2684			error = copyout(buf, sopt->sopt_val, valsize);
2685		else
2686			bcopy(buf, sopt->sopt_val, valsize);
2687	}
2688	return (error);
2689}
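
/*
 * Illustrative sketch (editorial addition): the SOPT_GET side of a
 * protocol's pr_ctloutput() handler mirrors the copyin case, using
 * sooptcopyout() to return a fixed-size value.  The examplepcb type and
 * its exp_value field are hypothetical.
 */
#if 0
static int
example_ctloutput_get(struct examplepcb *pcb, struct sockopt *sopt)
{
	int optval;

	optval = pcb->exp_value;	/* hypothetical per-connection field */
	return (sooptcopyout(sopt, &optval, sizeof(optval)));
}
#endif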
2690
2691int
2692sogetopt(struct socket *so, struct sockopt *sopt)
2693{
2694	int	error, optval;
2695	struct	linger l;
2696	struct	timeval tv;
2697#ifdef MAC
2698	struct mac extmac;
2699#endif
2700
2701	CURVNET_SET(so->so_vnet);
2702	error = 0;
2703	if (sopt->sopt_level != SOL_SOCKET) {
2704		if (so->so_proto->pr_ctloutput != NULL)
2705			error = (*so->so_proto->pr_ctloutput)(so, sopt);
2706		else
2707			error = ENOPROTOOPT;
2708		CURVNET_RESTORE();
2709		return (error);
2710	} else {
2711		switch (sopt->sopt_name) {
2712#ifdef INET
2713		case SO_ACCEPTFILTER:
2714			error = do_getopt_accept_filter(so, sopt);
2715			break;
2716#endif
2717		case SO_LINGER:
2718			SOCK_LOCK(so);
2719			l.l_onoff = so->so_options & SO_LINGER;
2720			l.l_linger = so->so_linger;
2721			SOCK_UNLOCK(so);
2722			error = sooptcopyout(sopt, &l, sizeof l);
2723			break;
2724
2725		case SO_USELOOPBACK:
2726		case SO_DONTROUTE:
2727		case SO_DEBUG:
2728		case SO_KEEPALIVE:
2729		case SO_REUSEADDR:
2730		case SO_REUSEPORT:
2731		case SO_BROADCAST:
2732		case SO_OOBINLINE:
2733		case SO_ACCEPTCONN:
2734		case SO_TIMESTAMP:
2735		case SO_BINTIME:
2736		case SO_NOSIGPIPE:
2737			optval = so->so_options & sopt->sopt_name;
2738integer:
2739			error = sooptcopyout(sopt, &optval, sizeof optval);
2740			break;
2741
2742		case SO_TYPE:
2743			optval = so->so_type;
2744			goto integer;
2745
2746		case SO_PROTOCOL:
2747			optval = so->so_proto->pr_protocol;
2748			goto integer;
2749
2750		case SO_ERROR:
2751			SOCK_LOCK(so);
2752			optval = so->so_error;
2753			so->so_error = 0;
2754			SOCK_UNLOCK(so);
2755			goto integer;
2756
2757		case SO_SNDBUF:
2758			optval = so->so_snd.sb_hiwat;
2759			goto integer;
2760
2761		case SO_RCVBUF:
2762			optval = so->so_rcv.sb_hiwat;
2763			goto integer;
2764
2765		case SO_SNDLOWAT:
2766			optval = so->so_snd.sb_lowat;
2767			goto integer;
2768
2769		case SO_RCVLOWAT:
2770			optval = so->so_rcv.sb_lowat;
2771			goto integer;
2772
2773		case SO_SNDTIMEO:
2774		case SO_RCVTIMEO:
2775			optval = (sopt->sopt_name == SO_SNDTIMEO ?
2776				  so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
2777
2778			tv.tv_sec = optval / hz;
2779			tv.tv_usec = (optval % hz) * tick;
2780#ifdef COMPAT_FREEBSD32
2781			if (SV_CURPROC_FLAG(SV_ILP32)) {
2782				struct timeval32 tv32;
2783
2784				CP(tv, tv32, tv_sec);
2785				CP(tv, tv32, tv_usec);
2786				error = sooptcopyout(sopt, &tv32, sizeof tv32);
2787			} else
2788#endif
2789				error = sooptcopyout(sopt, &tv, sizeof tv);
2790			break;
2791
2792		case SO_LABEL:
2793#ifdef MAC
2794			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2795			    sizeof(extmac));
2796			if (error)
2797				goto bad;
2798			error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
2799			    so, &extmac);
2800			if (error)
2801				goto bad;
2802			error = sooptcopyout(sopt, &extmac, sizeof extmac);
2803#else
2804			error = EOPNOTSUPP;
2805#endif
2806			break;
2807
2808		case SO_PEERLABEL:
2809#ifdef MAC
2810			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2811			    sizeof(extmac));
2812			if (error)
2813				goto bad;
2814			error = mac_getsockopt_peerlabel(
2815			    sopt->sopt_td->td_ucred, so, &extmac);
2816			if (error)
2817				goto bad;
2818			error = sooptcopyout(sopt, &extmac, sizeof extmac);
2819#else
2820			error = EOPNOTSUPP;
2821#endif
2822			break;
2823
2824		case SO_LISTENQLIMIT:
2825			optval = so->so_qlimit;
2826			goto integer;
2827
2828		case SO_LISTENQLEN:
2829			optval = so->so_qlen;
2830			goto integer;
2831
2832		case SO_LISTENINCQLEN:
2833			optval = so->so_incqlen;
2834			goto integer;
2835
2836		default:
2837			error = ENOPROTOOPT;
2838			break;
2839		}
2840	}
2841#ifdef MAC
2842bad:
2843#endif
2844	CURVNET_RESTORE();
2845	return (error);
2846}
2847
2848/* XXX: prepare mbuf for (__FreeBSD__ < 3) routines. */
2849int
2850soopt_getm(struct sockopt *sopt, struct mbuf **mp)
2851{
2852	struct mbuf *m, *m_prev;
2853	int sopt_size = sopt->sopt_valsize;
2854
2855	MGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT, MT_DATA);
2856	if (m == NULL)
2857		return ENOBUFS;
2858	if (sopt_size > MLEN) {
2859		MCLGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT);
2860		if ((m->m_flags & M_EXT) == 0) {
2861			m_free(m);
2862			return ENOBUFS;
2863		}
2864		m->m_len = min(MCLBYTES, sopt_size);
2865	} else {
2866		m->m_len = min(MLEN, sopt_size);
2867	}
2868	sopt_size -= m->m_len;
2869	*mp = m;
2870	m_prev = m;
2871
2872	while (sopt_size) {
2873		MGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT, MT_DATA);
2874		if (m == NULL) {
2875			m_freem(*mp);
2876			return ENOBUFS;
2877		}
2878		if (sopt_size > MLEN) {
2879			MCLGET(m, sopt->sopt_td != NULL ? M_WAIT :
2880			    M_DONTWAIT);
2881			if ((m->m_flags & M_EXT) == 0) {
2882				m_freem(m);
2883				m_freem(*mp);
2884				return ENOBUFS;
2885			}
2886			m->m_len = min(MCLBYTES, sopt_size);
2887		} else {
2888			m->m_len = min(MLEN, sopt_size);
2889		}
2890		sopt_size -= m->m_len;
2891		m_prev->m_next = m;
2892		m_prev = m;
2893	}
2894	return (0);
2895}
2896
2897/* XXX: copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
2898int
2899soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
2900{
2901	struct mbuf *m0 = m;
2902
2903	if (sopt->sopt_val == NULL)
2904		return (0);
2905	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2906		if (sopt->sopt_td != NULL) {
2907			int error;
2908
2909			error = copyin(sopt->sopt_val, mtod(m, char *),
2910				       m->m_len);
2911			if (error != 0) {
2912				m_freem(m0);
2913				return (error);
2914			}
2915		} else
2916			bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
2917		sopt->sopt_valsize -= m->m_len;
2918		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2919		m = m->m_next;
2920	}
2921	if (m != NULL) /* should be allocated large enough at ip6_sooptmcopyin() */
2922		panic("ip6_sooptmcopyin");
2923	return (0);
2924}
2925
2926/* XXX: copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
2927int
2928soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
2929{
2930	struct mbuf *m0 = m;
2931	size_t valsize = 0;
2932
2933	if (sopt->sopt_val == NULL)
2934		return (0);
2935	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2936		if (sopt->sopt_td != NULL) {
2937			int error;
2938
2939			error = copyout(mtod(m, char *), sopt->sopt_val,
2940				       m->m_len);
2941			if (error != 0) {
2942				m_freem(m0);
2943				return (error);
2944			}
2945		} else
2946			bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
2947		sopt->sopt_valsize -= m->m_len;
2948		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2949		valsize += m->m_len;
2950		m = m->m_next;
2951	}
2952	if (m != NULL) {
2953		/* enough sockopt buffer should have been supplied by userland */
2954		m_freem(m0);
2955		return (EINVAL);
2956	}
2957	sopt->sopt_valsize = valsize;
2958	return (0);
2959}
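
/*
 * Illustrative sketch (editorial addition): the three legacy helpers
 * above are used together.  soopt_getm() allocates an mbuf chain sized
 * to the option, soopt_mcopyin() fills it from the sockopt, and the
 * protocol then consumes the chain; for the SOPT_GET direction a
 * protocol instead builds a chain and hands it to soopt_mcopyout(),
 * which copies it back out (freeing the chain only on error).  The
 * routine example_consume_opt() is hypothetical.
 */
#if 0
static int
example_legacy_setopt(struct socket *so, struct sockopt *sopt)
{
	struct mbuf *m = NULL;
	int error;

	error = soopt_getm(sopt, &m);		/* size chain to the option */
	if (error != 0)
		return (error);
	error = soopt_mcopyin(sopt, m);		/* frees the chain on error */
	if (error != 0)
		return (error);
	return (example_consume_opt(so, m));	/* hypothetical; takes chain */
}
#endif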
2960
2961/*
2962 * sohasoutofband(): protocol notifies socket layer of the arrival of new
2963 * out-of-band data, which will then notify socket consumers.
2964 */
2965void
2966sohasoutofband(struct socket *so)
2967{
2968
2969	if (so->so_sigio != NULL)
2970		pgsigio(&so->so_sigio, SIGURG, 0);
2971	selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
2972}
2973
2974int
2975sopoll(struct socket *so, int events, struct ucred *active_cred,
2976    struct thread *td)
2977{
2978
2979	/*
2980	 * We do not need to set or assert curvnet as long as everyone uses
2981	 * sopoll_generic().
2982	 */
2983	return (so->so_proto->pr_usrreqs->pru_sopoll(so, events, active_cred,
2984	    td));
2985}
2986
2987int
2988sopoll_generic(struct socket *so, int events, struct ucred *active_cred,
2989    struct thread *td)
2990{
2991	int revents = 0;
2992
2993	SOCKBUF_LOCK(&so->so_snd);
2994	SOCKBUF_LOCK(&so->so_rcv);
2995	if (events & (POLLIN | POLLRDNORM))
2996		if (soreadabledata(so))
2997			revents |= events & (POLLIN | POLLRDNORM);
2998
2999	if (events & (POLLOUT | POLLWRNORM))
3000		if (sowriteable(so))
3001			revents |= events & (POLLOUT | POLLWRNORM);
3002
3003	if (events & (POLLPRI | POLLRDBAND))
3004		if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK))
3005			revents |= events & (POLLPRI | POLLRDBAND);
3006
3007	if ((events & POLLINIGNEOF) == 0) {
3008		if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
3009			revents |= events & (POLLIN | POLLRDNORM);
3010			if (so->so_snd.sb_state & SBS_CANTSENDMORE)
3011				revents |= POLLHUP;
3012		}
3013	}
3014
3015	if (revents == 0) {
3016		if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
3017			selrecord(td, &so->so_rcv.sb_sel);
3018			so->so_rcv.sb_flags |= SB_SEL;
3019		}
3020
3021		if (events & (POLLOUT | POLLWRNORM)) {
3022			selrecord(td, &so->so_snd.sb_sel);
3023			so->so_snd.sb_flags |= SB_SEL;
3024		}
3025	}
3026
3027	SOCKBUF_UNLOCK(&so->so_rcv);
3028	SOCKBUF_UNLOCK(&so->so_snd);
3029	return (revents);
3030}
3031
3032int
3033soo_kqfilter(struct file *fp, struct knote *kn)
3034{
3035	struct socket *so = kn->kn_fp->f_data;
3036	struct sockbuf *sb;
3037
3038	switch (kn->kn_filter) {
3039	case EVFILT_READ:
3040		if (so->so_options & SO_ACCEPTCONN)
3041			kn->kn_fop = &solisten_filtops;
3042		else
3043			kn->kn_fop = &soread_filtops;
3044		sb = &so->so_rcv;
3045		break;
3046	case EVFILT_WRITE:
3047		kn->kn_fop = &sowrite_filtops;
3048		sb = &so->so_snd;
3049		break;
3050	default:
3051		return (EINVAL);
3052	}
3053
3054	SOCKBUF_LOCK(sb);
3055	knlist_add(&sb->sb_sel.si_note, kn, 1);
3056	sb->sb_flags |= SB_KNOTE;
3057	SOCKBUF_UNLOCK(sb);
3058	return (0);
3059}
3060
3061/*
3062 * Some routines that return EOPNOTSUPP for entry points that are not
3063 * supported by a protocol.  Fill in as needed.
3064 */
3065int
3066pru_accept_notsupp(struct socket *so, struct sockaddr **nam)
3067{
3068
3069	return EOPNOTSUPP;
3070}
3071
3072int
3073pru_attach_notsupp(struct socket *so, int proto, struct thread *td)
3074{
3075
3076	return EOPNOTSUPP;
3077}
3078
3079int
3080pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
3081{
3082
3083	return EOPNOTSUPP;
3084}
3085
3086int
3087pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
3088{
3089
3090	return EOPNOTSUPP;
3091}
3092
3093int
3094pru_connect2_notsupp(struct socket *so1, struct socket *so2)
3095{
3096
3097	return EOPNOTSUPP;
3098}
3099
3100int
3101pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data,
3102    struct ifnet *ifp, struct thread *td)
3103{
3104
3105	return EOPNOTSUPP;
3106}
3107
3108int
3109pru_disconnect_notsupp(struct socket *so)
3110{
3111
3112	return EOPNOTSUPP;
3113}
3114
3115int
3116pru_listen_notsupp(struct socket *so, int backlog, struct thread *td)
3117{
3118
3119	return EOPNOTSUPP;
3120}
3121
3122int
3123pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam)
3124{
3125
3126	return EOPNOTSUPP;
3127}
3128
3129int
3130pru_rcvd_notsupp(struct socket *so, int flags)
3131{
3132
3133	return EOPNOTSUPP;
3134}
3135
3136int
3137pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags)
3138{
3139
3140	return EOPNOTSUPP;
3141}
3142
3143int
3144pru_send_notsupp(struct socket *so, int flags, struct mbuf *m,
3145    struct sockaddr *addr, struct mbuf *control, struct thread *td)
3146{
3147
3148	return EOPNOTSUPP;
3149}
3150
3151/*
3152 * This isn't really a ``null'' operation, but it's the default one and
3153 * doesn't do anything destructive.
3154 */
3155int
3156pru_sense_null(struct socket *so, struct stat *sb)
3157{
3158
3159	sb->st_blksize = so->so_snd.sb_hiwat;
3160	return 0;
3161}
3162
3163int
3164pru_shutdown_notsupp(struct socket *so)
3165{
3166
3167	return EOPNOTSUPP;
3168}
3169
3170int
3171pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam)
3172{
3173
3174	return EOPNOTSUPP;
3175}
3176
3177int
3178pru_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio,
3179    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
3180{
3181
3182	return EOPNOTSUPP;
3183}
3184
3185int
3186pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr,
3187    struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
3188{
3189
3190	return EOPNOTSUPP;
3191}
3192
3193int
3194pru_sopoll_notsupp(struct socket *so, int events, struct ucred *cred,
3195    struct thread *td)
3196{
3197
3198	return EOPNOTSUPP;
3199}
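
/*
 * Illustrative sketch (editorial addition): a minimal protocol can plug
 * the stubs above into its pr_usrreqs so that unsupported operations
 * fail cleanly with EOPNOTSUPP.  The example_usrreqs name and the
 * attach/detach handlers are hypothetical.
 */
#if 0
static struct pr_usrreqs example_usrreqs = {
	.pru_attach =		example_attach,		/* hypothetical */
	.pru_detach =		example_detach,		/* hypothetical */
	.pru_accept =		pru_accept_notsupp,
	.pru_bind =		pru_bind_notsupp,
	.pru_connect =		pru_connect_notsupp,
	.pru_listen =		pru_listen_notsupp,
	.pru_rcvoob =		pru_rcvoob_notsupp,
	.pru_sense =		pru_sense_null,
	.pru_shutdown =		pru_shutdown_notsupp,
	.pru_sockaddr =		pru_sockaddr_notsupp,
};
#endif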
3200
3201static void
3202filt_sordetach(struct knote *kn)
3203{
3204	struct socket *so = kn->kn_fp->f_data;
3205
3206	SOCKBUF_LOCK(&so->so_rcv);
3207	knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1);
3208	if (knlist_empty(&so->so_rcv.sb_sel.si_note))
3209		so->so_rcv.sb_flags &= ~SB_KNOTE;
3210	SOCKBUF_UNLOCK(&so->so_rcv);
3211}
3212
3213/*ARGSUSED*/
3214static int
3215filt_soread(struct knote *kn, long hint)
3216{
3217	struct socket *so;
3218
3219	so = kn->kn_fp->f_data;
3220	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
3221
3222	kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
3223	if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
3224		kn->kn_flags |= EV_EOF;
3225		kn->kn_fflags = so->so_error;
3226		return (1);
3227	} else if (so->so_error)	/* temporary udp error */
3228		return (1);
3229	else if (kn->kn_sfflags & NOTE_LOWAT)
3230		return (kn->kn_data >= kn->kn_sdata);
3231	else
3232		return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat);
3233}
3234
3235static void
3236filt_sowdetach(struct knote *kn)
3237{
3238	struct socket *so = kn->kn_fp->f_data;
3239
3240	SOCKBUF_LOCK(&so->so_snd);
3241	knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1);
3242	if (knlist_empty(&so->so_snd.sb_sel.si_note))
3243		so->so_snd.sb_flags &= ~SB_KNOTE;
3244	SOCKBUF_UNLOCK(&so->so_snd);
3245}
3246
3247/*ARGSUSED*/
3248static int
3249filt_sowrite(struct knote *kn, long hint)
3250{
3251	struct socket *so;
3252
3253	so = kn->kn_fp->f_data;
3254	SOCKBUF_LOCK_ASSERT(&so->so_snd);
3255	kn->kn_data = sbspace(&so->so_snd);
3256	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
3257		kn->kn_flags |= EV_EOF;
3258		kn->kn_fflags = so->so_error;
3259		return (1);
3260	} else if (so->so_error)	/* temporary udp error */
3261		return (1);
3262	else if (((so->so_state & SS_ISCONNECTED) == 0) &&
3263	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
3264		return (0);
3265	else if (kn->kn_sfflags & NOTE_LOWAT)
3266		return (kn->kn_data >= kn->kn_sdata);
3267	else
3268		return (kn->kn_data >= so->so_snd.sb_lowat);
3269}
3270
3271/*ARGSUSED*/
3272static int
3273filt_solisten(struct knote *kn, long hint)
3274{
3275	struct socket *so = kn->kn_fp->f_data;
3276
3277	kn->kn_data = so->so_qlen;
3278	return (! TAILQ_EMPTY(&so->so_comp));
3279}
3280
3281int
3282socheckuid(struct socket *so, uid_t uid)
3283{
3284
3285	if (so == NULL)
3286		return (EPERM);
3287	if (so->so_cred->cr_uid != uid)
3288		return (EPERM);
3289	return (0);
3290}
3291
3292static int
3293sysctl_somaxconn(SYSCTL_HANDLER_ARGS)
3294{
3295	int error;
3296	int val;
3297
3298	val = somaxconn;
3299	error = sysctl_handle_int(oidp, &val, 0, req);
3300	if (error || !req->newptr)
3301		return (error);
3302
3303	if (val < 1 || val > USHRT_MAX)
3304		return (EINVAL);
3305
3306	somaxconn = val;
3307	return (0);
3308}
3309
3310/*
3311 * These functions are used by protocols to notify the socket layer (and its
3312 * consumers) of state changes in the sockets driven by protocol-side events.
3313 */
3314
3315/*
3316 * Procedures to manipulate state flags of socket and do appropriate wakeups.
3317 *
3318 * Normal sequence from the active (originating) side is that
3319 * soisconnecting() is called during processing of connect() call, resulting
3320 * in an eventual call to soisconnected() if/when the connection is
3321 * established.  When the connection is torn down soisdisconnecting() is
3322 * called during processing of disconnect() call, and soisdisconnected() is
3323 * called when the connection to the peer is totally severed.  The semantics
3324 * of these routines are such that connectionless protocols can call
3325 * soisconnected() and soisdisconnected() only, bypassing the in-progress
3326 * calls when setting up a ``connection'' takes no time.
3327 *
3328 * From the passive side, a socket is created with two queues of sockets:
3329 * so_incomp for connections in progress and so_comp for connections already
3330 * made and awaiting user acceptance.  As a protocol is preparing incoming
3331 * connections, it creates a socket structure queued on so_incomp by calling
3332 * sonewconn().  When the connection is established, soisconnected() is
3333 * called, and transfers the socket structure to so_comp, making it available
3334 * to accept().
3335 *
3336 * If a socket is closed with sockets on either so_incomp or so_comp, these
3337 * sockets are dropped.
3338 *
3339 * If higher-level protocols are implemented in the kernel, the wakeups done
3340 * here will sometimes cause software-interrupt process scheduling.
3341 */
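
/*
 * Illustrative sketch (editorial addition): a fragment of a hypothetical
 * protocol input routine showing the passive-side sequence described
 * above.  The listening socket 'head' and the handshake details are
 * elided.
 */
#if 0
	so = sonewconn(head, 0);	/* queued on head's so_incomp */
	if (so == NULL)
		return;			/* listen queue full; drop request */
	/* ... protocol handshake completes ... */
	soisconnected(so);		/* moved to so_comp for accept() */
#endif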
3342void
3343soisconnecting(struct socket *so)
3344{
3345
3346	SOCK_LOCK(so);
3347	so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
3348	so->so_state |= SS_ISCONNECTING;
3349	SOCK_UNLOCK(so);
3350}
3351
3352void
3353soisconnected(struct socket *so)
3354{
3355	struct socket *head;
3356	int ret;
3357
3358restart:
3359	ACCEPT_LOCK();
3360	SOCK_LOCK(so);
3361	so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
3362	so->so_state |= SS_ISCONNECTED;
3363	head = so->so_head;
3364	if (head != NULL && (so->so_qstate & SQ_INCOMP)) {
3365		if ((so->so_options & SO_ACCEPTFILTER) == 0) {
3366			SOCK_UNLOCK(so);
3367			TAILQ_REMOVE(&head->so_incomp, so, so_list);
3368			head->so_incqlen--;
3369			so->so_qstate &= ~SQ_INCOMP;
3370			TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
3371			head->so_qlen++;
3372			so->so_qstate |= SQ_COMP;
3373			ACCEPT_UNLOCK();
3374			sorwakeup(head);
3375			wakeup_one(&head->so_timeo);
3376		} else {
3377			ACCEPT_UNLOCK();
3378			soupcall_set(so, SO_RCV,
3379			    head->so_accf->so_accept_filter->accf_callback,
3380			    head->so_accf->so_accept_filter_arg);
3381			so->so_options &= ~SO_ACCEPTFILTER;
3382			ret = head->so_accf->so_accept_filter->accf_callback(so,
3383			    head->so_accf->so_accept_filter_arg, M_DONTWAIT);
3384			if (ret == SU_ISCONNECTED)
3385				soupcall_clear(so, SO_RCV);
3386			SOCK_UNLOCK(so);
3387			if (ret == SU_ISCONNECTED)
3388				goto restart;
3389		}
3390		return;
3391	}
3392	SOCK_UNLOCK(so);
3393	ACCEPT_UNLOCK();
3394	wakeup(&so->so_timeo);
3395	sorwakeup(so);
3396	sowwakeup(so);
3397}
3398
3399void
3400soisdisconnecting(struct socket *so)
3401{
3402
3403	/*
3404	 * Note: This code assumes that SOCK_LOCK(so) and
3405	 * SOCKBUF_LOCK(&so->so_rcv) are the same.
3406	 */
3407	SOCKBUF_LOCK(&so->so_rcv);
3408	so->so_state &= ~SS_ISCONNECTING;
3409	so->so_state |= SS_ISDISCONNECTING;
3410	so->so_rcv.sb_state |= SBS_CANTRCVMORE;
3411	sorwakeup_locked(so);
3412	SOCKBUF_LOCK(&so->so_snd);
3413	so->so_snd.sb_state |= SBS_CANTSENDMORE;
3414	sowwakeup_locked(so);
3415	wakeup(&so->so_timeo);
3416}
3417
3418void
3419soisdisconnected(struct socket *so)
3420{
3421
3422	/*
3423	 * Note: This code assumes that SOCK_LOCK(so) and
3424	 * SOCKBUF_LOCK(&so->so_rcv) are the same.
3425	 */
3426	SOCKBUF_LOCK(&so->so_rcv);
3427	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
3428	so->so_state |= SS_ISDISCONNECTED;
3429	so->so_rcv.sb_state |= SBS_CANTRCVMORE;
3430	sorwakeup_locked(so);
3431	SOCKBUF_LOCK(&so->so_snd);
3432	so->so_snd.sb_state |= SBS_CANTSENDMORE;
3433	sbdrop_locked(&so->so_snd, so->so_snd.sb_cc);
3434	sowwakeup_locked(so);
3435	wakeup(&so->so_timeo);
3436}
3437
3438/*
3439 * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
3440 */
3441struct sockaddr *
3442sodupsockaddr(const struct sockaddr *sa, int mflags)
3443{
3444	struct sockaddr *sa2;
3445
3446	sa2 = malloc(sa->sa_len, M_SONAME, mflags);
3447	if (sa2)
3448		bcopy(sa, sa2, sa->sa_len);
3449	return sa2;
3450}
3451
3452/*
3453 * Register per-socket buffer upcalls.
3454 */
3455void
3456soupcall_set(struct socket *so, int which,
3457    int (*func)(struct socket *, void *, int), void *arg)
3458{
3459	struct sockbuf *sb;
3460
3461	switch (which) {
3462	case SO_RCV:
3463		sb = &so->so_rcv;
3464		break;
3465	case SO_SND:
3466		sb = &so->so_snd;
3467		break;
3468	default:
3469		panic("soupcall_set: bad which");
3470	}
3471	SOCKBUF_LOCK_ASSERT(sb);
3472#if 0
3473	/* XXX: accf_http actually wants to do this on purpose. */
3474	KASSERT(sb->sb_upcall == NULL, ("soupcall_set: overwriting upcall"));
3475#endif
3476	sb->sb_upcall = func;
3477	sb->sb_upcallarg = arg;
3478	sb->sb_flags |= SB_UPCALL;
3479}
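
/*
 * Illustrative sketch (editorial addition): registering a receive
 * upcall so a kernel consumer is notified when data arrives.  The
 * upcall runs with the sockbuf lock held, so it should only do cheap
 * work; example_rcv_upcall() and its wait channel are hypothetical.
 */
#if 0
static int
example_rcv_upcall(struct socket *so, void *arg, int waitflag)
{

	wakeup(arg);		/* kick a thread sleeping on this channel */
	return (SU_OK);
}

static void
example_register_upcall(struct socket *so, void *chan)
{

	SOCKBUF_LOCK(&so->so_rcv);
	soupcall_set(so, SO_RCV, example_rcv_upcall, chan);
	SOCKBUF_UNLOCK(&so->so_rcv);
}
#endif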
3480
3481void
3482soupcall_clear(struct socket *so, int which)
3483{
3484	struct sockbuf *sb;
3485
3486	switch (which) {
3487	case SO_RCV:
3488		sb = &so->so_rcv;
3489		break;
3490	case SO_SND:
3491		sb = &so->so_snd;
3492		break;
3493	default:
3494		panic("soupcall_clear: bad which");
3495	}
3496	SOCKBUF_LOCK_ASSERT(sb);
3497	KASSERT(sb->sb_upcall != NULL, ("soupcall_clear: no upcall to clear"));
3498	sb->sb_upcall = NULL;
3499	sb->sb_upcallarg = NULL;
3500	sb->sb_flags &= ~SB_UPCALL;
3501}
3502
3503/*
3504 * Create an external-format (``xsocket'') structure using the information in
3505 * the kernel-format socket structure pointed to by so.  This is done to
3506 * reduce the spew of irrelevant information over this interface, to isolate
3507 * user code from changes in the kernel structure, and potentially to provide
3508 * information-hiding if we decide that some of this information should be
3509 * hidden from users.
3510 */
3511void
3512sotoxsocket(struct socket *so, struct xsocket *xso)
3513{
3514
3515	xso->xso_len = sizeof *xso;
3516	xso->xso_so = so;
3517	xso->so_type = so->so_type;
3518	xso->so_options = so->so_options;
3519	xso->so_linger = so->so_linger;
3520	xso->so_state = so->so_state;
3521	xso->so_pcb = so->so_pcb;
3522	xso->xso_protocol = so->so_proto->pr_protocol;
3523	xso->xso_family = so->so_proto->pr_domain->dom_family;
3524	xso->so_qlen = so->so_qlen;
3525	xso->so_incqlen = so->so_incqlen;
3526	xso->so_qlimit = so->so_qlimit;
3527	xso->so_timeo = so->so_timeo;
3528	xso->so_error = so->so_error;
3529	xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0;
3530	xso->so_oobmark = so->so_oobmark;
3531	sbtoxsockbuf(&so->so_snd, &xso->so_snd);
3532	sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
3533	xso->so_uid = so->so_cred->cr_uid;
3534}
3535
3536
3537/*
3538 * Socket accessor functions to provide external consumers with
3539 * a safe interface to socket state
3540 *
3541 */
3542
3543void
3544so_listeners_apply_all(struct socket *so, void (*func)(struct socket *, void *), void *arg)
3545{
3546
3547	TAILQ_FOREACH(so, &so->so_comp, so_list)
3548		func(so, arg);
3549}
3550
3551struct sockbuf *
3552so_sockbuf_rcv(struct socket *so)
3553{
3554
3555	return (&so->so_rcv);
3556}
3557
3558struct sockbuf *
3559so_sockbuf_snd(struct socket *so)
3560{
3561
3562	return (&so->so_snd);
3563}
3564
3565int
3566so_state_get(const struct socket *so)
3567{
3568
3569	return (so->so_state);
3570}
3571
3572void
3573so_state_set(struct socket *so, int val)
3574{
3575
3576	so->so_state = val;
3577}
3578
3579int
3580so_options_get(const struct socket *so)
3581{
3582
3583	return (so->so_options);
3584}
3585
3586void
3587so_options_set(struct socket *so, int val)
3588{
3589
3590	so->so_options = val;
3591}
3592
3593int
3594so_error_get(const struct socket *so)
3595{
3596
3597	return (so->so_error);
3598}
3599
3600void
3601so_error_set(struct socket *so, int val)
3602{
3603
3604	so->so_error = val;
3605}
3606
3607int
3608so_linger_get(const struct socket *so)
3609{
3610
3611	return (so->so_linger);
3612}
3613
3614void
3615so_linger_set(struct socket *so, int val)
3616{
3617
3618	so->so_linger = val;
3619}
3620
3621struct protosw *
3622so_protosw_get(const struct socket *so)
3623{
3624
3625	return (so->so_proto);
3626}
3627
3628void
3629so_protosw_set(struct socket *so, struct protosw *val)
3630{
3631
3632	so->so_proto = val;
3633}
3634
3635void
3636so_sorwakeup(struct socket *so)
3637{
3638
3639	sorwakeup(so);
3640}
3641
3642void
3643so_sowwakeup(struct socket *so)
3644{
3645
3646	sowwakeup(so);
3647}
3648
3649void
3650so_sorwakeup_locked(struct socket *so)
3651{
3652
3653	sorwakeup_locked(so);
3654}
3655
3656void
3657so_sowwakeup_locked(struct socket *so)
3658{
3659
3660	sowwakeup_locked(so);
3661}
3662
3663void
3664so_lock(struct socket *so)
3665{
3666	SOCK_LOCK(so);
3667}
3668
3669void
3670so_unlock(struct socket *so)
3671{
3672	SOCK_UNLOCK(so);
3673}
3674