uipc_socket.c revision 167902
1/*-
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993
3 *	The Regents of the University of California.
4 * Copyright (c) 2004 The FreeBSD Foundation
5 * Copyright (c) 2004-2006 Robert N. M. Watson
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 * 4. Neither the name of the University nor the names of its contributors
17 *    may be used to endorse or promote products derived from this software
18 *    without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 *
32 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
33 */
34
35/*
36 * Comments on the socket life cycle:
37 *
38 * soalloc() sets up socket layer state for a socket, called only by
39 * socreate() and sonewconn().  Socket layer private.
40 *
41 * sodealloc() tears down socket layer state for a socket, called only by
42 * sofree(), socreate(), and sonewconn().  Socket layer private.
43 *
44 * pru_attach() associates protocol layer state with an allocated socket;
45 * called only once, may fail, aborting socket allocation.  This is called
46 * from socreate() and sonewconn().  Socket layer private.
47 *
48 * pru_detach() disassociates protocol layer state from an attached socket,
49 * and will be called exactly once for sockets in which pru_attach() has
50 * been successfully called.  If pru_attach() returned an error,
51 * pru_detach() will not be called.  Socket layer private.
52 *
53 * pru_abort() and pru_close() notify the protocol layer that the last
54 * consumer of a socket is starting to tear down the socket, and that the
55 * protocol should terminate the connection.  Historically, pru_abort() also
56 * detached protocol state from the socket state, but this is no longer the
57 * case.
58 *
59 * socreate() creates a socket and attaches protocol state.  This is a public
60 * interface that may be used by socket layer consumers to create new
61 * sockets.
62 *
63 * sonewconn() creates a socket and attaches protocol state.  This is a
64 * public interface that may be used by protocols to create new sockets when
65 * a new connection is received; the new socket will then be available for
66 * accept() on a listen socket.
67 *
68 * soclose() destroys a socket after possibly waiting for it to disconnect.
69 * This is a public interface that socket consumers should use to close and
70 * release a socket when done with it.
71 *
72 * soabort() destroys a socket without waiting for it to disconnect (used
73 * only for incoming connections that are already partially or fully
74 * connected).  This is used internally by the socket layer when clearing
75 * listen socket queues (due to overflow or close on the listen socket), but
76 * is also a public interface protocols may use to abort connections in
77 * their incomplete listen queues should they no longer be required.  Sockets
78 * placed in completed connection listen queues should not be aborted for
79 * reasons described in the comment above the soclose() implementation.  This
80 * is not a general purpose close routine, and except in the specific
81 * circumstances described here, should not be used.
82 *
83 * sofree() will free a socket and its protocol state if all references on
84 * the socket have been released, and is the interface used to attempt to
85 * free a socket when a reference is removed.  This is a socket layer private
86 * interface.
87 *
88 * NOTE: In addition to socreate() and soclose(), which provide a single
89 * socket reference to the consumer to be managed as required, there are two
90 * calls to explicitly manage socket references, soref(), and sorele().
91 * Currently, these are generally required only when transitioning a socket
92 * from a listen queue to a file descriptor, in order to prevent garbage
93 * collection of the socket at an untimely moment.  For a number of reasons,
94 * these interfaces are not preferred, and should be avoided.
95 */
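
/*
 * As an illustration of the consumer-facing portion of the life cycle
 * described above, a minimal in-kernel consumer might look roughly like the
 * sketch below.  The function name, the sockaddr setup, and the use of UDP
 * are hypothetical, the usual <netinet/in.h> definitions are assumed, and
 * error handling is abbreviated:
 *
 *	static int
 *	example_open_udp_socket(struct thread *td, struct socket **sop)
 *	{
 *		struct sockaddr_in sin;
 *		struct socket *so;
 *		int error;
 *
 *		error = socreate(AF_INET, &so, SOCK_DGRAM, IPPROTO_UDP,
 *		    td->td_ucred, td);
 *		if (error != 0)
 *			return (error);
 *		bzero(&sin, sizeof(sin));
 *		sin.sin_family = AF_INET;
 *		sin.sin_len = sizeof(sin);
 *		error = sobind(so, (struct sockaddr *)&sin, td);
 *		if (error != 0) {
 *			soclose(so);
 *			return (error);
 *		}
 *		*sop = so;
 *		return (0);
 *	}
 *
 * The single reference returned by socreate() is released by soclose().
 */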
96
97#include <sys/cdefs.h>
98__FBSDID("$FreeBSD: head/sys/kern/uipc_socket.c 167902 2007-03-26 17:05:09Z rwatson $");
99
100#include "opt_inet.h"
101#include "opt_mac.h"
102#include "opt_zero.h"
103#include "opt_compat.h"
104
105#include <sys/param.h>
106#include <sys/systm.h>
107#include <sys/fcntl.h>
108#include <sys/limits.h>
109#include <sys/lock.h>
110#include <sys/mac.h>
111#include <sys/malloc.h>
112#include <sys/mbuf.h>
113#include <sys/mutex.h>
114#include <sys/domain.h>
115#include <sys/file.h>			/* for struct knote */
116#include <sys/kernel.h>
117#include <sys/event.h>
118#include <sys/eventhandler.h>
119#include <sys/poll.h>
120#include <sys/proc.h>
121#include <sys/protosw.h>
122#include <sys/socket.h>
123#include <sys/socketvar.h>
124#include <sys/resourcevar.h>
125#include <sys/signalvar.h>
126#include <sys/stat.h>
127#include <sys/sysctl.h>
128#include <sys/uio.h>
129#include <sys/jail.h>
130
131#include <security/mac/mac_framework.h>
132
133#include <vm/uma.h>
134
135#ifdef COMPAT_IA32
136#include <sys/mount.h>
137#include <compat/freebsd32/freebsd32.h>
138
139extern struct sysentvec ia32_freebsd_sysvec;
140#endif
141
142static int	soreceive_rcvoob(struct socket *so, struct uio *uio,
143		    int flags);
144
145static void	filt_sordetach(struct knote *kn);
146static int	filt_soread(struct knote *kn, long hint);
147static void	filt_sowdetach(struct knote *kn);
148static int	filt_sowrite(struct knote *kn, long hint);
149static int	filt_solisten(struct knote *kn, long hint);
150
151static struct filterops solisten_filtops =
152	{ 1, NULL, filt_sordetach, filt_solisten };
153static struct filterops soread_filtops =
154	{ 1, NULL, filt_sordetach, filt_soread };
155static struct filterops sowrite_filtops =
156	{ 1, NULL, filt_sowdetach, filt_sowrite };
157
158uma_zone_t socket_zone;
159so_gen_t	so_gencnt;	/* generation count for sockets */
160
161int	maxsockets;
162
163MALLOC_DEFINE(M_SONAME, "soname", "socket name");
164MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
165
166static int somaxconn = SOMAXCONN;
167static int sysctl_somaxconn(SYSCTL_HANDLER_ARGS);
168/* XXX: we don't have SYSCTL_USHORT */
169SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLTYPE_UINT | CTLFLAG_RW,
170    0, sizeof(int), sysctl_somaxconn, "I", "Maximum pending socket connection "
171    "queue size");
172static int numopensockets;
173SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
174    &numopensockets, 0, "Number of open sockets");
175#ifdef ZERO_COPY_SOCKETS
176/* These aren't static because they're used in other files. */
177int so_zero_copy_send = 1;
178int so_zero_copy_receive = 1;
179SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0,
180    "Zero copy controls");
181SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW,
182    &so_zero_copy_receive, 0, "Enable zero copy receive");
183SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW,
184    &so_zero_copy_send, 0, "Enable zero copy send");
185#endif /* ZERO_COPY_SOCKETS */
186
187/*
188 * accept_mtx locks down per-socket fields relating to accept queues.  See
189 * socketvar.h for an annotation of the protected fields of struct socket.
190 */
191struct mtx accept_mtx;
192MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);
193
194/*
195 * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
196 * so_gencnt field.
197 */
198static struct mtx so_global_mtx;
199MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_global", MTX_DEF);
200
201/*
202 * General IPC sysctl name space, used by sockets and a variety of other IPC
203 * types.
204 */
205SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");
206
207/*
208 * Sysctl to get and set the maximum global sockets limit.  Notify protocols
209 * of the change so that they can update their dependent limits as required.
210 */
211static int
212sysctl_maxsockets(SYSCTL_HANDLER_ARGS)
213{
214	int error, newmaxsockets;
215
216	newmaxsockets = maxsockets;
217	error = sysctl_handle_int(oidp, &newmaxsockets, sizeof(int), req);
218	if (error == 0 && req->newptr) {
219		if (newmaxsockets > maxsockets) {
220			maxsockets = newmaxsockets;
221			if (maxsockets > ((maxfiles / 4) * 3)) {
222				maxfiles = (maxsockets * 5) / 4;
223				maxfilesperproc = (maxfiles * 9) / 10;
224			}
225			EVENTHANDLER_INVOKE(maxsockets_change);
226		} else
227			error = EINVAL;
228	}
229	return (error);
230}
231
232SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT|CTLFLAG_RW,
233    &maxsockets, 0, sysctl_maxsockets, "IU",
234    "Maximum number of sockets available");
235
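/*
 * Protocols that derive their own limits from maxsockets can track changes
 * made through the sysctl above by subscribing to the maxsockets_change
 * event.  A minimal sketch, assuming a hypothetical example_zone and
 * example_zone_change() handler:
 *
 *	static void
 *	example_zone_change(void *tag)
 *	{
 *
 *		uma_zone_set_max(example_zone, maxsockets);
 *	}
 *
 * registered from the protocol's initialization routine with:
 *
 *	EVENTHANDLER_REGISTER(maxsockets_change, example_zone_change, NULL,
 *	    EVENTHANDLER_PRI_ANY);
 */
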
236/*
237 * Initialise maxsockets.
238 */
239static void
init_maxsockets(void *ignored)
240{
241	TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
242	maxsockets = imax(maxsockets, imax(maxfiles, nmbclusters));
243}
244SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);
245
246/*
247 * Socket operation routines.  These routines are called by the routines in
248 * sys_socket.c or from a system process, and implement the semantics of
249 * socket operations by switching out to the protocol specific routines.
250 */
251
252/*
253 * Get a socket structure from our zone, and initialize it.  Note that it
254 * would probably be better to allocate socket and PCB at the same time, but
255 * I'm not convinced that all the protocols can be easily modified to do
256 * this.
257 *
258 * soalloc() returns a socket with a ref count of 0.
259 */
260static struct socket *
261soalloc(void)
262{
263	struct socket *so;
264
265	so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO);
266	if (so == NULL)
267		return (NULL);
268#ifdef MAC
269	if (mac_init_socket(so, M_NOWAIT) != 0) {
270		uma_zfree(socket_zone, so);
271		return (NULL);
272	}
273#endif
274	SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
275	SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
276	TAILQ_INIT(&so->so_aiojobq);
277	mtx_lock(&so_global_mtx);
278	so->so_gencnt = ++so_gencnt;
279	++numopensockets;
280	mtx_unlock(&so_global_mtx);
281	return (so);
282}
283
284/*
285 * Free the storage associated with a socket at the socket layer, tear down
286 * locks, labels, etc.  All protocol state is assumed already to have been
287 * torn down (and possibly never set up) by the caller.
288 */
289static void
290sodealloc(struct socket *so)
291{
292
293	KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
294	KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL"));
295
296	mtx_lock(&so_global_mtx);
297	so->so_gencnt = ++so_gencnt;
298	--numopensockets;	/* Could be below, but faster here. */
299	mtx_unlock(&so_global_mtx);
300	if (so->so_rcv.sb_hiwat)
301		(void)chgsbsize(so->so_cred->cr_uidinfo,
302		    &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
303	if (so->so_snd.sb_hiwat)
304		(void)chgsbsize(so->so_cred->cr_uidinfo,
305		    &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
306#ifdef INET
307	/* Remove accept filter if one is present. */
308	if (so->so_accf != NULL)
309		do_setopt_accept_filter(so, NULL);
310#endif
311#ifdef MAC
312	mac_destroy_socket(so);
313#endif
314	crfree(so->so_cred);
315	SOCKBUF_LOCK_DESTROY(&so->so_snd);
316	SOCKBUF_LOCK_DESTROY(&so->so_rcv);
317	uma_zfree(socket_zone, so);
318}
319
320/*
321 * socreate returns a socket with a ref count of 1.  The socket should be
322 * closed with soclose().
323 */
324int
325socreate(dom, aso, type, proto, cred, td)
326	int dom;
327	struct socket **aso;
328	int type;
329	int proto;
330	struct ucred *cred;
331	struct thread *td;
332{
333	struct protosw *prp;
334	struct socket *so;
335	int error;
336
337	if (proto)
338		prp = pffindproto(dom, proto, type);
339	else
340		prp = pffindtype(dom, type);
341
342	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL ||
343	    prp->pr_usrreqs->pru_attach == pru_attach_notsupp)
344		return (EPROTONOSUPPORT);
345
346	if (jailed(cred) && jail_socket_unixiproute_only &&
347	    prp->pr_domain->dom_family != PF_LOCAL &&
348	    prp->pr_domain->dom_family != PF_INET &&
349	    prp->pr_domain->dom_family != PF_ROUTE) {
350		return (EPROTONOSUPPORT);
351	}
352
353	if (prp->pr_type != type)
354		return (EPROTOTYPE);
355	so = soalloc();
356	if (so == NULL)
357		return (ENOBUFS);
358
359	TAILQ_INIT(&so->so_incomp);
360	TAILQ_INIT(&so->so_comp);
361	so->so_type = type;
362	so->so_cred = crhold(cred);
363	so->so_proto = prp;
364#ifdef MAC
365	mac_create_socket(cred, so);
366#endif
367	knlist_init(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv),
368	    NULL, NULL, NULL);
369	knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd),
370	    NULL, NULL, NULL);
371	so->so_count = 1;
372	/*
373	 * Auto-sizing of socket buffers is managed by the protocols and
374	 * the appropriate flags must be set in the pru_attach function.
375	 */
376	error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
377	if (error) {
378		KASSERT(so->so_count == 1, ("socreate: so_count %d",
379		    so->so_count));
380		so->so_count = 0;
381		sodealloc(so);
382		return (error);
383	}
384	*aso = so;
385	return (0);
386}
387
388#ifdef REGRESSION
389static int regression_sonewconn_earlytest = 1;
390SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW,
391    &regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test");
392#endif
393
394/*
395 * When an attempt at a new connection is noted on a socket which accepts
396 * connections, sonewconn is called.  If the connection is possible (subject
397 * to space constraints, etc.) then we allocate a new structure, properly
398 * linked into the data structure of the original socket, and return this.
399 * Connstatus may be 0, SS_ISCONFIRMING, or SS_ISCONNECTED.
400 *
401 * Note: the ref count on the socket is 0 on return.
402 */
403struct socket *
404sonewconn(head, connstatus)
405	register struct socket *head;
406	int connstatus;
407{
408	register struct socket *so;
409	int over;
410
411	ACCEPT_LOCK();
412	over = (head->so_qlen > 3 * head->so_qlimit / 2);
413	ACCEPT_UNLOCK();
414#ifdef REGRESSION
415	if (regression_sonewconn_earlytest && over)
416#else
417	if (over)
418#endif
419		return (NULL);
420	so = soalloc();
421	if (so == NULL)
422		return (NULL);
423	if ((head->so_options & SO_ACCEPTFILTER) != 0)
424		connstatus = 0;
425	so->so_head = head;
426	so->so_type = head->so_type;
427	so->so_options = head->so_options &~ SO_ACCEPTCONN;
428	so->so_linger = head->so_linger;
429	so->so_state = head->so_state | SS_NOFDREF;
430	so->so_proto = head->so_proto;
431	so->so_cred = crhold(head->so_cred);
432#ifdef MAC
433	SOCK_LOCK(head);
434	mac_create_socket_from_socket(head, so);
435	SOCK_UNLOCK(head);
436#endif
437	knlist_init(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv),
438	    NULL, NULL, NULL);
439	knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd),
440	    NULL, NULL, NULL);
441	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat) ||
442	    (*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
443		sodealloc(so);
444		return (NULL);
445	}
446	so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
447	so->so_snd.sb_lowat = head->so_snd.sb_lowat;
448	so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
449	so->so_snd.sb_timeo = head->so_snd.sb_timeo;
450	so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
451	so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
452	so->so_state |= connstatus;
453	ACCEPT_LOCK();
454	if (connstatus) {
455		TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
456		so->so_qstate |= SQ_COMP;
457		head->so_qlen++;
458	} else {
459		/*
460		 * Keep removing sockets from the head until there's room for
461		 * us to insert on the tail.  In pre-locking revisions, this
462		 * was a simple if(), but as we could be racing with other
463		 * threads and soabort() requires dropping locks, we must
464		 * loop waiting for the condition to be true.
465		 */
466		while (head->so_incqlen > head->so_qlimit) {
467			struct socket *sp;
468			sp = TAILQ_FIRST(&head->so_incomp);
469			TAILQ_REMOVE(&head->so_incomp, sp, so_list);
470			head->so_incqlen--;
471			sp->so_qstate &= ~SQ_INCOMP;
472			sp->so_head = NULL;
473			ACCEPT_UNLOCK();
474			soabort(sp);
475			ACCEPT_LOCK();
476		}
477		TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
478		so->so_qstate |= SQ_INCOMP;
479		head->so_incqlen++;
480	}
481	ACCEPT_UNLOCK();
482	if (connstatus) {
483		sorwakeup(head);
484		wakeup_one(&head->so_timeo);
485	}
486	return (so);
487}
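
/*
 * As an illustration of the protocol-side usage described above, a
 * protocol's input path typically creates the child socket for a newly
 * completed connection along these lines (a hedged sketch; 'head' is the
 * listening socket and the surrounding protocol state handling is omitted):
 *
 *	struct socket *so;
 *
 *	so = sonewconn(head, SS_ISCONNECTED);
 *	if (so == NULL)
 *		goto drop;
 *
 * A NULL return means the listen queue was full or allocation failed.
 * Passing a connstatus of 0 instead places the new socket on the head's
 * incomplete queue until the protocol later calls soisconnected() on it.
 */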
488
489int
490sobind(so, nam, td)
491	struct socket *so;
492	struct sockaddr *nam;
493	struct thread *td;
494{
495
496	return ((*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td));
497}
498
499/*
500 * solisten() transitions a socket from a non-listening state to a listening
501 * state, but can also be used to update the listen queue depth on an
502 * existing listen socket.  The protocol will call back into the sockets
503 * layer using solisten_proto_check() and solisten_proto() to check and set
504 * socket-layer listen state.  Call backs are used so that the protocol can
505 * acquire both protocol and socket layer locks in whatever order is required
506 * by the protocol.
507 *
508 * Protocol implementors are advised to hold the socket lock across the
509 * socket-layer test and set to avoid races at the socket layer.
510 */
511int
512solisten(so, backlog, td)
513	struct socket *so;
514	int backlog;
515	struct thread *td;
516{
517
518	return ((*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td));
519}
520
521int
522solisten_proto_check(so)
523	struct socket *so;
524{
525
526	SOCK_LOCK_ASSERT(so);
527
528	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
529	    SS_ISDISCONNECTING))
530		return (EINVAL);
531	return (0);
532}
533
534void
535solisten_proto(so, backlog)
536	struct socket *so;
537	int backlog;
538{
539
540	SOCK_LOCK_ASSERT(so);
541
542	if (backlog < 0 || backlog > somaxconn)
543		backlog = somaxconn;
544	so->so_qlimit = backlog;
545	so->so_options |= SO_ACCEPTCONN;
546}
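
/*
 * A protocol's pru_listen implementation is expected to bracket its own
 * checks and binding work with the two callbacks above, roughly as in this
 * sketch (protocol locking omitted; TCP, for example, also holds the inpcb
 * lock across this sequence):
 *
 *	SOCK_LOCK(so);
 *	error = solisten_proto_check(so);
 *	if (error == 0)
 *		solisten_proto(so, backlog);
 *	SOCK_UNLOCK(so);
 *	return (error);
 */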
547
548/*
549 * Attempt to free a socket.  This should really be sotryfree().
550 *
551 * sofree() will succeed if:
552 *
553 * - There are no outstanding file descriptor references or related consumers
554 *   (so_count == 0).
555 *
556 * - The socket has been closed by user space, if ever open (SS_NOFDREF).
557 *
558 * - The protocol does not have an outstanding strong reference on the socket
559 *   (SS_PROTOREF).
560 *
561 * - The socket is not in a completed connection queue, where a process has
562 *   been notified that it is present.  If it were removed, the user process
563 *   could block in accept() despite select() saying the socket was ready.
564 *
565 * Otherwise, it will quietly abort so that a future call to sofree(), when
566 * conditions are right, can succeed.
567 */
568void
569sofree(so)
570	struct socket *so;
571{
572	struct protosw *pr = so->so_proto;
573	struct socket *head;
574
575	ACCEPT_LOCK_ASSERT();
576	SOCK_LOCK_ASSERT(so);
577
578	if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 ||
579	    (so->so_state & SS_PROTOREF) || (so->so_qstate & SQ_COMP)) {
580		SOCK_UNLOCK(so);
581		ACCEPT_UNLOCK();
582		return;
583	}
584
585	head = so->so_head;
586	if (head != NULL) {
587		KASSERT((so->so_qstate & SQ_COMP) != 0 ||
588		    (so->so_qstate & SQ_INCOMP) != 0,
589		    ("sofree: so_head != NULL, but neither SQ_COMP nor "
590		    "SQ_INCOMP"));
591		KASSERT((so->so_qstate & SQ_COMP) == 0 ||
592		    (so->so_qstate & SQ_INCOMP) == 0,
593		    ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
594		TAILQ_REMOVE(&head->so_incomp, so, so_list);
595		head->so_incqlen--;
596		so->so_qstate &= ~SQ_INCOMP;
597		so->so_head = NULL;
598	}
599	KASSERT((so->so_qstate & SQ_COMP) == 0 &&
600	    (so->so_qstate & SQ_INCOMP) == 0,
601	    ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
602	    so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
603	if (so->so_options & SO_ACCEPTCONN) {
604		KASSERT((TAILQ_EMPTY(&so->so_comp)), ("sofree: so_comp populated"));
605		KASSERT((TAILQ_EMPTY(&so->so_incomp)), ("sofree: so_incomp populated"));
606	}
607	SOCK_UNLOCK(so);
608	ACCEPT_UNLOCK();
609
610	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
611		(*pr->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
612	if (pr->pr_usrreqs->pru_detach != NULL)
613		(*pr->pr_usrreqs->pru_detach)(so);
614
615	/*
616	 * From this point on, we assume that no other references to this
617	 * socket exist anywhere else in the stack.  Therefore, no locks need
618	 * to be acquired or held.
619	 *
620	 * We used to do a lot of socket buffer and socket locking here, as
621	 * well as invoke sorflush() and perform wakeups.  The direct calls to
622	 * dom_dispose() and sbrelease_internal() are an inlining of what was
623	 * necessary from sorflush().
624	 *
625	 * Notice that the socket buffer and kqueue state are torn down
626	 * before calling pru_detach.  This means that protocols should not
627	 * assume they can perform socket wakeups, etc., in their detach
628	 * code.
629	 */
630	KASSERT((so->so_snd.sb_flags & SB_LOCK) == 0, ("sofree: snd sblock"));
631	KASSERT((so->so_rcv.sb_flags & SB_LOCK) == 0, ("sofree: rcv sblock"));
632	sbdestroy(&so->so_snd, so);
633	sbdestroy(&so->so_rcv, so);
634	knlist_destroy(&so->so_rcv.sb_sel.si_note);
635	knlist_destroy(&so->so_snd.sb_sel.si_note);
636	sodealloc(so);
637}
638
639/*
640 * Close a socket on last file table reference removal.  Initiate disconnect
641 * if connected.  Free socket when disconnect complete.
642 *
643 * This function will sorele() the socket.  Note that soclose() may be called
644 * prior to the ref count reaching zero.  The actual socket structure will
645 * not be freed until the ref count reaches zero.
646 */
647int
648soclose(so)
649	struct socket *so;
650{
651	int error = 0;
652
653	KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));
654
655	funsetown(&so->so_sigio);
656	if (so->so_state & SS_ISCONNECTED) {
657		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
658			error = sodisconnect(so);
659			if (error)
660				goto drop;
661		}
662		if (so->so_options & SO_LINGER) {
663			if ((so->so_state & SS_ISDISCONNECTING) &&
664			    (so->so_state & SS_NBIO))
665				goto drop;
666			while (so->so_state & SS_ISCONNECTED) {
667				error = tsleep(&so->so_timeo,
668				    PSOCK | PCATCH, "soclos", so->so_linger * hz);
669				if (error)
670					break;
671			}
672		}
673	}
674
675drop:
676	if (so->so_proto->pr_usrreqs->pru_close != NULL)
677		(*so->so_proto->pr_usrreqs->pru_close)(so);
678	if (so->so_options & SO_ACCEPTCONN) {
679		struct socket *sp;
680		ACCEPT_LOCK();
681		while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
682			TAILQ_REMOVE(&so->so_incomp, sp, so_list);
683			so->so_incqlen--;
684			sp->so_qstate &= ~SQ_INCOMP;
685			sp->so_head = NULL;
686			ACCEPT_UNLOCK();
687			soabort(sp);
688			ACCEPT_LOCK();
689		}
690		while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
691			TAILQ_REMOVE(&so->so_comp, sp, so_list);
692			so->so_qlen--;
693			sp->so_qstate &= ~SQ_COMP;
694			sp->so_head = NULL;
695			ACCEPT_UNLOCK();
696			soabort(sp);
697			ACCEPT_LOCK();
698		}
699		ACCEPT_UNLOCK();
700	}
701	ACCEPT_LOCK();
702	SOCK_LOCK(so);
703	KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
704	so->so_state |= SS_NOFDREF;
705	sorele(so);
706	return (error);
707}
708
709/*
710 * soabort() is used to abruptly tear down a connection, such as when a
711 * resource limit is reached (listen queue depth exceeded), or if a listen
712 * socket is closed while there are sockets waiting to be accepted.
713 *
714 * This interface is tricky, because it is called on an unreferenced socket,
715 * and must be called only by a thread that has actually removed the socket
716 * from the listen queue it was on, or races with other threads are risked.
717 *
718 * This interface will call into the protocol code, so must not be called
719 * with any socket locks held.  Protocols do call it while holding their own
720 * recursible protocol mutexes, but this is something that should be subject
721 * to review in the future.
722 */
723void
724soabort(so)
725	struct socket *so;
726{
727
728	/*
729	 * In as much as is possible, assert that no references to this
730	 * socket are held.  This is not quite the same as asserting that the
731	 * current thread is responsible for arranging for no references, but
732	 * is as close as we can get for now.
733	 */
734	KASSERT(so->so_count == 0, ("soabort: so_count"));
735	KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF"));
736	KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF"));
737	KASSERT((so->so_qstate & SQ_COMP) == 0, ("soabort: SQ_COMP"));
738	KASSERT((so->so_qstate & SQ_INCOMP) == 0, ("soabort: SQ_INCOMP"));
739
740	if (so->so_proto->pr_usrreqs->pru_abort != NULL)
741		(*so->so_proto->pr_usrreqs->pru_abort)(so);
742	ACCEPT_LOCK();
743	SOCK_LOCK(so);
744	sofree(so);
745}
746
747int
748soaccept(so, nam)
749	struct socket *so;
750	struct sockaddr **nam;
751{
752	int error;
753
754	SOCK_LOCK(so);
755	KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
756	so->so_state &= ~SS_NOFDREF;
757	SOCK_UNLOCK(so);
758	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
759	return (error);
760}
761
762int
763soconnect(so, nam, td)
764	struct socket *so;
765	struct sockaddr *nam;
766	struct thread *td;
767{
768	int error;
769
770	if (so->so_options & SO_ACCEPTCONN)
771		return (EOPNOTSUPP);
772	/*
773	 * If protocol is connection-based, can only connect once.
774	 * Otherwise, if connected, try to disconnect first.  This allows
775	 * user to disconnect by connecting to, e.g., a null address.
776	 */
777	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
778	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
779	    (error = sodisconnect(so)))) {
780		error = EISCONN;
781	} else {
782		/*
783		 * Prevent accumulated error from previous connection from
784		 * biting us.
785		 */
786		so->so_error = 0;
787		error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td);
788	}
789
790	return (error);
791}
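
/*
 * Note that soconnect() only initiates the connection; it does not wait for
 * it to complete.  A blocking consumer typically follows it with a sleep on
 * so_timeo, along the lines of this sketch (modelled on the connect()
 * system call path; error handling abbreviated):
 *
 *	error = soconnect(so, nam, td);
 *	if (error)
 *		goto bad;
 *	SOCK_LOCK(so);
 *	while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
 *		error = msleep(&so->so_timeo, SOCK_MTX(so), PSOCK | PCATCH,
 *		    "connec", 0);
 *		if (error)
 *			break;
 *	}
 *	if (error == 0) {
 *		error = so->so_error;
 *		so->so_error = 0;
 *	}
 *	SOCK_UNLOCK(so);
 */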
792
793int
794soconnect2(so1, so2)
795	struct socket *so1;
796	struct socket *so2;
797{
798
799	return ((*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2));
800}
801
802int
803sodisconnect(so)
804	struct socket *so;
805{
806	int error;
807
808	if ((so->so_state & SS_ISCONNECTED) == 0)
809		return (ENOTCONN);
810	if (so->so_state & SS_ISDISCONNECTING)
811		return (EALREADY);
812	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
813	return (error);
814}
815
816#ifdef ZERO_COPY_SOCKETS
817struct so_zerocopy_stats{
818	int size_ok;
819	int align_ok;
820	int found_ifp;
821};
822struct so_zerocopy_stats so_zerocp_stats = {0,0,0};
823#include <netinet/in.h>
824#include <net/route.h>
825#include <netinet/in_pcb.h>
826#include <vm/vm.h>
827#include <vm/vm_page.h>
828#include <vm/vm_object.h>
829
830/*
831 * sosend_copyin() is only used if zero copy sockets are enabled.  Otherwise
832 * sosend_dgram() and sosend_generic() use m_uiotombuf().
833 *
834 * sosend_copyin() accepts a uio and prepares an mbuf chain holding part or
835 * all of the data referenced by the uio.  If desired, it uses zero-copy.
836 * *space will be updated to reflect data copied in.
837 *
838 * NB: If atomic I/O is requested, the caller must already have checked that
839 * space can hold resid bytes.
840 *
841 * NB: In the event of an error, the caller may need to free the partial
842 * chain pointed to by *mpp.  The contents of both *uio and *space may be
843 * modified even in the case of an error.
844 */
845static int
846sosend_copyin(struct uio *uio, struct mbuf **retmp, int atomic, long *space,
847    int flags)
848{
849	struct mbuf *m, **mp, *top;
850	long len, resid;
851	int error;
852#ifdef ZERO_COPY_SOCKETS
853	int cow_send;
854#endif
855
856	*retmp = top = NULL;
857	mp = &top;
858	len = 0;
859	resid = uio->uio_resid;
860	error = 0;
861	do {
862#ifdef ZERO_COPY_SOCKETS
863		cow_send = 0;
864#endif /* ZERO_COPY_SOCKETS */
865		if (resid >= MINCLSIZE) {
866#ifdef ZERO_COPY_SOCKETS
867			if (top == NULL) {
868				m = m_gethdr(M_WAITOK, MT_DATA);
869				m->m_pkthdr.len = 0;
870				m->m_pkthdr.rcvif = NULL;
871			} else
872				m = m_get(M_WAITOK, MT_DATA);
873			if (so_zero_copy_send &&
874			    resid>=PAGE_SIZE &&
875			    *space>=PAGE_SIZE &&
876			    uio->uio_iov->iov_len>=PAGE_SIZE) {
877				so_zerocp_stats.size_ok++;
878				so_zerocp_stats.align_ok++;
879				cow_send = socow_setup(m, uio);
880				len = cow_send;
881			}
882			if (!cow_send) {
883				m_clget(m, M_WAITOK);
884				len = min(min(MCLBYTES, resid), *space);
885			}
886#else /* ZERO_COPY_SOCKETS */
887			if (top == NULL) {
888				m = m_getcl(M_TRYWAIT, MT_DATA, M_PKTHDR);
889				m->m_pkthdr.len = 0;
890				m->m_pkthdr.rcvif = NULL;
891			} else
892				m = m_getcl(M_TRYWAIT, MT_DATA, 0);
893			len = min(min(MCLBYTES, resid), *space);
894#endif /* ZERO_COPY_SOCKETS */
895		} else {
896			if (top == NULL) {
897				m = m_gethdr(M_TRYWAIT, MT_DATA);
898				m->m_pkthdr.len = 0;
899				m->m_pkthdr.rcvif = NULL;
900
901				len = min(min(MHLEN, resid), *space);
902				/*
903				 * For datagram protocols, leave room
904				 * for protocol headers in first mbuf.
905				 */
906				if (atomic && m && len < MHLEN)
907					MH_ALIGN(m, len);
908			} else {
909				m = m_get(M_TRYWAIT, MT_DATA);
910				len = min(min(MLEN, resid), *space);
911			}
912		}
913		if (m == NULL) {
914			error = ENOBUFS;
915			goto out;
916		}
917
918		*space -= len;
919#ifdef ZERO_COPY_SOCKETS
920		if (cow_send)
921			error = 0;
922		else
923#endif /* ZERO_COPY_SOCKETS */
924		error = uiomove(mtod(m, void *), (int)len, uio);
925		resid = uio->uio_resid;
926		m->m_len = len;
927		*mp = m;
928		top->m_pkthdr.len += len;
929		if (error)
930			goto out;
931		mp = &m->m_next;
932		if (resid <= 0) {
933			if (flags & MSG_EOR)
934				top->m_flags |= M_EOR;
935			break;
936		}
937	} while (*space > 0 && atomic);
938out:
939	*retmp = top;
940	return (error);
941}
942#endif /*ZERO_COPY_SOCKETS*/
943
944#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
945
946int
947sosend_dgram(so, addr, uio, top, control, flags, td)
948	struct socket *so;
949	struct sockaddr *addr;
950	struct uio *uio;
951	struct mbuf *top;
952	struct mbuf *control;
953	int flags;
954	struct thread *td;
955{
956	long space, resid;
957	int clen = 0, error, dontroute;
958#ifdef ZERO_COPY_SOCKETS
959	int atomic = sosendallatonce(so) || top;
960#endif
961
962	KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM"));
963	KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
964	    ("sosend_dgram: !PR_ATOMIC"));
965
966	if (uio != NULL)
967		resid = uio->uio_resid;
968	else
969		resid = top->m_pkthdr.len;
970	/*
971	 * In theory resid should be unsigned.  However, space must be
972	 * signed, as it might be less than 0 if we over-committed, and we
973	 * must use a signed comparison of space and resid.  On the other
974	 * hand, a negative resid causes us to loop sending 0-length
975	 * segments to the protocol.
976	 *
977	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
978	 * type sockets since that's an error.
979	 */
980	if (resid < 0) {
981		error = EINVAL;
982		goto out;
983	}
984
985	dontroute =
986	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0;
987	if (td != NULL)
988		td->td_proc->p_stats->p_ru.ru_msgsnd++;
989	if (control != NULL)
990		clen = control->m_len;
991
992	SOCKBUF_LOCK(&so->so_snd);
993	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
994		SOCKBUF_UNLOCK(&so->so_snd);
995		error = EPIPE;
996		goto out;
997	}
998	if (so->so_error) {
999		error = so->so_error;
1000		so->so_error = 0;
1001		SOCKBUF_UNLOCK(&so->so_snd);
1002		goto out;
1003	}
1004	if ((so->so_state & SS_ISCONNECTED) == 0) {
1005		/*
1006		 * `sendto' and `sendmsg' are allowed on a connection-based
1007		 * socket if it supports implied connect.  Return ENOTCONN if
1008		 * not connected and no address is supplied.
1009		 */
1010		if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
1011		    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
1012			if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1013			    !(resid == 0 && clen != 0)) {
1014				SOCKBUF_UNLOCK(&so->so_snd);
1015				error = ENOTCONN;
1016				goto out;
1017			}
1018		} else if (addr == NULL) {
1019			if (so->so_proto->pr_flags & PR_CONNREQUIRED)
1020				error = ENOTCONN;
1021			else
1022				error = EDESTADDRREQ;
1023			SOCKBUF_UNLOCK(&so->so_snd);
1024			goto out;
1025		}
1026	}
1027
1028	/*
1029	 * Do we need MSG_OOB support in SOCK_DGRAM?  Signs here may be a
1030	 * problem and need fixing.
1031	 */
1032	space = sbspace(&so->so_snd);
1033	if (flags & MSG_OOB)
1034		space += 1024;
1035	space -= clen;
1036	SOCKBUF_UNLOCK(&so->so_snd);
1037	if (resid > space) {
1038		error = EMSGSIZE;
1039		goto out;
1040	}
1041	if (uio == NULL) {
1042		resid = 0;
1043		if (flags & MSG_EOR)
1044			top->m_flags |= M_EOR;
1045	} else {
1046#ifdef ZERO_COPY_SOCKETS
1047		error = sosend_copyin(uio, &top, atomic, &space, flags);
1048		if (error)
1049			goto out;
1050#else
1051		/*
1052		 * Copy the data from userland into a mbuf chain.
1053		 * If no data is to be copied in, a single empty mbuf
1054		 * is returned.
1055		 */
1056		top = m_uiotombuf(uio, M_WAITOK, space, max_hdr,
1057		    (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0)));
1058		if (top == NULL) {
1059			error = EFAULT;	/* only possible error */
1060			goto out;
1061		}
1062		space -= resid - uio->uio_resid;
1063#endif
1064		resid = uio->uio_resid;
1065	}
1066	KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
1067	/*
1068	 * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock
1069	 * than with.
1070	 */
1071	if (dontroute) {
1072		SOCK_LOCK(so);
1073		so->so_options |= SO_DONTROUTE;
1074		SOCK_UNLOCK(so);
1075	}
1076	/*
1077	 * XXX all the SBS_CANTSENDMORE checks previously done could be out
1078	 * of date.  We could have received a reset packet in an interrupt or
1079	 * maybe we slept while doing page faults in uiomove() etc.  We could
1080	 * probably recheck again inside the locking protection here, but
1081	 * there are probably other places that this also happens.  We must
1082	 * rethink this.
1083	 */
1084	error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1085	    (flags & MSG_OOB) ? PRUS_OOB :
1086	/*
1087	 * If the user set MSG_EOF, the protocol understands this flag and
1088	 * nothing left to send then use PRU_SEND_EOF instead of PRU_SEND.
1089	 */
1090	    ((flags & MSG_EOF) &&
1091	     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1092	     (resid <= 0)) ?
1093		PRUS_EOF :
1094		/* If there is more to send set PRUS_MORETOCOME */
1095		(resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
1096		top, addr, control, td);
1097	if (dontroute) {
1098		SOCK_LOCK(so);
1099		so->so_options &= ~SO_DONTROUTE;
1100		SOCK_UNLOCK(so);
1101	}
1102	clen = 0;
1103	control = NULL;
1104	top = NULL;
1105out:
1106	if (top != NULL)
1107		m_freem(top);
1108	if (control != NULL)
1109		m_freem(control);
1110	return (error);
1111}
1112
1113/*
1114 * Send on a socket.  If send must go all at once and message is larger than
1115 * send buffering, then hard error.  Lock against other senders.  If must go
1116 * all at once and not enough room now, then inform user that this would
1117 * block and do nothing.  Otherwise, if nonblocking, send as much as
1118 * possible.  The data to be sent is described by "uio" if nonzero, otherwise
1119 * by the mbuf chain "top" (which must be null if uio is not).  Data provided
1120 * in mbuf chain must be small enough to send all at once.
1121 *
1122 * Returns nonzero on error, timeout or signal; callers must check for short
1123 * counts if EINTR/ERESTART are returned.  Data and control buffers are freed
1124 * on return.
1125 */
1126#define	snderr(errno)	{ error = (errno); goto release; }
1127int
1128sosend_generic(so, addr, uio, top, control, flags, td)
1129	struct socket *so;
1130	struct sockaddr *addr;
1131	struct uio *uio;
1132	struct mbuf *top;
1133	struct mbuf *control;
1134	int flags;
1135	struct thread *td;
1136{
1137	long space, resid;
1138	int clen = 0, error, dontroute;
1139	int atomic = sosendallatonce(so) || top;
1140
1141	if (uio != NULL)
1142		resid = uio->uio_resid;
1143	else
1144		resid = top->m_pkthdr.len;
1145	/*
1146	 * In theory resid should be unsigned.  However, space must be
1147	 * signed, as it might be less than 0 if we over-committed, and we
1148	 * must use a signed comparison of space and resid.  On the other
1149	 * hand, a negative resid causes us to loop sending 0-length
1150	 * segments to the protocol.
1151	 *
1152	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
1153	 * type sockets since that's an error.
1154	 */
1155	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
1156		error = EINVAL;
1157		goto out;
1158	}
1159
1160	dontroute =
1161	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
1162	    (so->so_proto->pr_flags & PR_ATOMIC);
1163	if (td != NULL)
1164		td->td_proc->p_stats->p_ru.ru_msgsnd++;
1165	if (control != NULL)
1166		clen = control->m_len;
1167
1168	SOCKBUF_LOCK(&so->so_snd);
1169restart:
1170	SOCKBUF_LOCK_ASSERT(&so->so_snd);
1171	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1172	if (error)
1173		goto out_locked;
1174	do {
1175		SOCKBUF_LOCK_ASSERT(&so->so_snd);
1176		if (so->so_snd.sb_state & SBS_CANTSENDMORE)
1177			snderr(EPIPE);
1178		if (so->so_error) {
1179			error = so->so_error;
1180			so->so_error = 0;
1181			goto release;
1182		}
1183		if ((so->so_state & SS_ISCONNECTED) == 0) {
1184			/*
1185			 * `sendto' and `sendmsg' are allowed on a connection-
1186			 * based socket if it supports implied connect.
1187			 * Return ENOTCONN if not connected and no address is
1188			 * supplied.
1189			 */
1190			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
1191			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
1192				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1193				    !(resid == 0 && clen != 0))
1194					snderr(ENOTCONN);
1195			} else if (addr == NULL)
1196			    snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
1197				   ENOTCONN : EDESTADDRREQ);
1198		}
1199		space = sbspace(&so->so_snd);
1200		if (flags & MSG_OOB)
1201			space += 1024;
1202		if ((atomic && resid > so->so_snd.sb_hiwat) ||
1203		    clen > so->so_snd.sb_hiwat)
1204			snderr(EMSGSIZE);
1205		if (space < resid + clen &&
1206		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
1207			if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO))
1208				snderr(EWOULDBLOCK);
1209			sbunlock(&so->so_snd);
1210			error = sbwait(&so->so_snd);
1211			if (error)
1212				goto out_locked;
1213			goto restart;
1214		}
1215		SOCKBUF_UNLOCK(&so->so_snd);
1216		space -= clen;
1217		do {
1218			if (uio == NULL) {
1219				resid = 0;
1220				if (flags & MSG_EOR)
1221					top->m_flags |= M_EOR;
1222			} else {
1223#ifdef ZERO_COPY_SOCKETS
1224				error = sosend_copyin(uio, &top, atomic,
1225				    &space, flags);
1226				if (error != 0) {
1227					SOCKBUF_LOCK(&so->so_snd);
1228					goto release;
1229				}
1230#else
1231				/*
1232				 * Copy the data from userland into a mbuf
1233				 * chain.  If no data is to be copied in,
1234				 * a single empty mbuf is returned.
1235				 */
1236				top = m_uiotombuf(uio, M_WAITOK, space,
1237				    (atomic ? max_hdr : 0),
1238				    (atomic ? M_PKTHDR : 0) |
1239				    ((flags & MSG_EOR) ? M_EOR : 0));
1240				if (top == NULL) {
1241					SOCKBUF_LOCK(&so->so_snd);
1242					error = EFAULT; /* only possible error */
1243					goto release;
1244				}
1245				space -= resid - uio->uio_resid;
1246#endif
1247				resid = uio->uio_resid;
1248			}
1249			if (dontroute) {
1250				SOCK_LOCK(so);
1251				so->so_options |= SO_DONTROUTE;
1252				SOCK_UNLOCK(so);
1253			}
1254			/*
1255			 * XXX all the SBS_CANTSENDMORE checks previously
1256			 * done could be out of date.  We could have received
1257			 * a reset packet in an interrupt or maybe we slept
1258			 * while doing page faults in uiomove() etc.  We
1259			 * could probably recheck again inside the locking
1260			 * protection here, but there are probably other
1261			 * places that this also happens.  We must rethink
1262			 * this.
1263			 */
1264			error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1265			    (flags & MSG_OOB) ? PRUS_OOB :
1266			/*
1267			 * If the user set MSG_EOF, the protocol understands
1268			 * this flag and nothing left to send then use
1269			 * PRU_SEND_EOF instead of PRU_SEND.
1270			 */
1271			    ((flags & MSG_EOF) &&
1272			     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1273			     (resid <= 0)) ?
1274				PRUS_EOF :
1275			/* If there is more to send set PRUS_MORETOCOME. */
1276			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
1277			    top, addr, control, td);
1278			if (dontroute) {
1279				SOCK_LOCK(so);
1280				so->so_options &= ~SO_DONTROUTE;
1281				SOCK_UNLOCK(so);
1282			}
1283			clen = 0;
1284			control = NULL;
1285			top = NULL;
1286			if (error) {
1287				SOCKBUF_LOCK(&so->so_snd);
1288				goto release;
1289			}
1290		} while (resid && space > 0);
1291		SOCKBUF_LOCK(&so->so_snd);
1292	} while (resid);
1293
1294release:
1295	SOCKBUF_LOCK_ASSERT(&so->so_snd);
1296	sbunlock(&so->so_snd);
1297out_locked:
1298	SOCKBUF_LOCK_ASSERT(&so->so_snd);
1299	SOCKBUF_UNLOCK(&so->so_snd);
1300out:
1301	if (top != NULL)
1302		m_freem(top);
1303	if (control != NULL)
1304		m_freem(control);
1305	return (error);
1306}
1307#undef snderr
1308
1309int
1310sosend(so, addr, uio, top, control, flags, td)
1311	struct socket *so;
1312	struct sockaddr *addr;
1313	struct uio *uio;
1314	struct mbuf *top;
1315	struct mbuf *control;
1316	int flags;
1317	struct thread *td;
1318{
1319
1320	/* XXXRW: Temporary debugging. */
1321	KASSERT(so->so_proto->pr_usrreqs->pru_sosend != sosend,
1322	    ("sosend: protocol calls sosend"));
1323
1324	return (so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top,
1325	    control, flags, td));
1326}
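
/*
 * In-kernel consumers hand data to sosend() either as a uio describing one
 * or more buffers or as an mbuf chain in 'top'.  A minimal sketch using a
 * system-space uio; 'buf', 'len', 'so', and 'td' are assumed to be set up
 * by the caller:
 *
 *	struct uio auio;
 *	struct iovec aiov;
 *	int error;
 *
 *	aiov.iov_base = buf;
 *	aiov.iov_len = len;
 *	auio.uio_iov = &aiov;
 *	auio.uio_iovcnt = 1;
 *	auio.uio_offset = 0;
 *	auio.uio_resid = len;
 *	auio.uio_segflg = UIO_SYSSPACE;
 *	auio.uio_rw = UIO_WRITE;
 *	auio.uio_td = td;
 *	error = sosend(so, NULL, &auio, NULL, NULL, 0, td);
 *
 * Callers that already have the data in an mbuf chain pass it as 'top' with
 * a NULL uio, subject to the constraints described above sosend_generic().
 */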
1327
1328/*
1329 * The part of soreceive() that implements reading non-inline out-of-band
1330 * data from a socket.  For more complete comments, see soreceive(), from
1331 * which this code originated.
1332 *
1333 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
1334 * unable to return an mbuf chain to the caller.
1335 */
1336static int
1337soreceive_rcvoob(so, uio, flags)
1338	struct socket *so;
1339	struct uio *uio;
1340	int flags;
1341{
1342	struct protosw *pr = so->so_proto;
1343	struct mbuf *m;
1344	int error;
1345
1346	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
1347
1348	m = m_get(M_TRYWAIT, MT_DATA);
1349	if (m == NULL)
1350		return (ENOBUFS);
1351	error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
1352	if (error)
1353		goto bad;
1354	do {
1355#ifdef ZERO_COPY_SOCKETS
1356		if (so_zero_copy_receive) {
1357			int disposable;
1358
1359			if ((m->m_flags & M_EXT)
1360			 && (m->m_ext.ext_type == EXT_DISPOSABLE))
1361				disposable = 1;
1362			else
1363				disposable = 0;
1364
1365			error = uiomoveco(mtod(m, void *),
1366					  min(uio->uio_resid, m->m_len),
1367					  uio, disposable);
1368		} else
1369#endif /* ZERO_COPY_SOCKETS */
1370		error = uiomove(mtod(m, void *),
1371		    (int) min(uio->uio_resid, m->m_len), uio);
1372		m = m_free(m);
1373	} while (uio->uio_resid && error == 0 && m);
1374bad:
1375	if (m != NULL)
1376		m_freem(m);
1377	return (error);
1378}
1379
1380/*
1381 * Following replacement or removal of the first mbuf on the first mbuf chain
1382 * of a socket buffer, push necessary state changes back into the socket
1383 * buffer so that other consumers see the values consistently.  'nextrecord'
1384 * is the callers locally stored value of the original value of
1385 * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
1386 * NOTE: 'nextrecord' may be NULL.
1387 */
1388static __inline void
1389sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
1390{
1391
1392	SOCKBUF_LOCK_ASSERT(sb);
1393	/*
1394	 * First, update for the new value of nextrecord.  If necessary, make
1395	 * it the first record.
1396	 */
1397	if (sb->sb_mb != NULL)
1398		sb->sb_mb->m_nextpkt = nextrecord;
1399	else
1400		sb->sb_mb = nextrecord;
1401
1402	/*
1403	 * Now update any dependent socket buffer fields to reflect the new
1404	 * state.  This is an expanded inline of SB_EMPTY_FIXUP(), with the
1405	 * addition of a second clause that takes care of the case where
1406	 * sb_mb has been updated, but remains the last record.
1407	 */
1408	if (sb->sb_mb == NULL) {
1409		sb->sb_mbtail = NULL;
1410		sb->sb_lastrecord = NULL;
1411	} else if (sb->sb_mb->m_nextpkt == NULL)
1412		sb->sb_lastrecord = sb->sb_mb;
1413}
1414
1415
1416/*
1417 * Implement receive operations on a socket.  We depend on the way that
1418 * records are added to the sockbuf by sbappend.  In particular, each record
1419 * (mbufs linked through m_next) must begin with an address if the protocol
1420 * so specifies, followed by an optional mbuf or mbufs containing ancillary
1421 * data, and then zero or more mbufs of data.  In order to allow parallelism
1422 * between network receive and copying to user space, as well as avoid
1423 * sleeping with a mutex held, we release the socket buffer mutex during the
1424 * user space copy.  Although the sockbuf is locked, new data may still be
1425 * appended, and thus we must maintain consistency of the sockbuf during that
1426 * time.
1427 *
1428 * The caller may receive the data as a single mbuf chain by supplying an
1429 * mbuf **mp0 for use in returning the chain.  The uio is then used only for
1430 * the count in uio_resid.
1431 */
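/*
 * For reference, a single record in the receive buffer laid out as
 * described above: an optional address, then optional control mbufs, then
 * data.  Mbufs within a record are linked through m_next; records are
 * linked through m_nextpkt of the first mbuf in each record:
 *
 *	sb_mb -> MT_SONAME -> MT_CONTROL -> MT_DATA -> MT_DATA
 *		     |
 *		 m_nextpkt
 *		     |
 *		     v
 *		 next record ...
 */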
1432int
1433soreceive_generic(so, psa, uio, mp0, controlp, flagsp)
1434	struct socket *so;
1435	struct sockaddr **psa;
1436	struct uio *uio;
1437	struct mbuf **mp0;
1438	struct mbuf **controlp;
1439	int *flagsp;
1440{
1441	struct mbuf *m, **mp;
1442	int flags, len, error, offset;
1443	struct protosw *pr = so->so_proto;
1444	struct mbuf *nextrecord;
1445	int moff, type = 0;
1446	int orig_resid = uio->uio_resid;
1447
1448	mp = mp0;
1449	if (psa != NULL)
1450		*psa = NULL;
1451	if (controlp != NULL)
1452		*controlp = NULL;
1453	if (flagsp != NULL)
1454		flags = *flagsp &~ MSG_EOR;
1455	else
1456		flags = 0;
1457	if (flags & MSG_OOB)
1458		return (soreceive_rcvoob(so, uio, flags));
1459	if (mp != NULL)
1460		*mp = NULL;
1461	if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING)
1462	    && uio->uio_resid)
1463		(*pr->pr_usrreqs->pru_rcvd)(so, 0);
1464
1465	SOCKBUF_LOCK(&so->so_rcv);
1466restart:
1467	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1468	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
1469	if (error)
1470		goto out;
1471
1472	m = so->so_rcv.sb_mb;
1473	/*
1474	 * If we have less data than requested, block awaiting more (subject
1475	 * to any timeout) if:
1476	 *   1. the current count is less than the low water mark, or
1477	 *   2. MSG_WAITALL is set, and it is possible to do the entire
1478	 *	receive operation at once if we block (resid <= hiwat).
1479	 *   3. MSG_DONTWAIT is not set
1480	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
1481	 * we have to do the receive in sections, and thus risk returning a
1482	 * short count if a timeout or signal occurs after we start.
1483	 */
1484	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
1485	    so->so_rcv.sb_cc < uio->uio_resid) &&
1486	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
1487	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
1488	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
1489		KASSERT(m != NULL || !so->so_rcv.sb_cc,
1490		    ("receive: m == %p so->so_rcv.sb_cc == %u",
1491		    m, so->so_rcv.sb_cc));
1492		if (so->so_error) {
1493			if (m != NULL)
1494				goto dontblock;
1495			error = so->so_error;
1496			if ((flags & MSG_PEEK) == 0)
1497				so->so_error = 0;
1498			goto release;
1499		}
1500		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1501		if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
1502			if (m)
1503				goto dontblock;
1504			else
1505				goto release;
1506		}
1507		for (; m != NULL; m = m->m_next)
1508			if (m->m_type == MT_OOBDATA  || (m->m_flags & M_EOR)) {
1509				m = so->so_rcv.sb_mb;
1510				goto dontblock;
1511			}
1512		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
1513		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
1514			error = ENOTCONN;
1515			goto release;
1516		}
1517		if (uio->uio_resid == 0)
1518			goto release;
1519		if ((so->so_state & SS_NBIO) ||
1520		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
1521			error = EWOULDBLOCK;
1522			goto release;
1523		}
1524		SBLASTRECORDCHK(&so->so_rcv);
1525		SBLASTMBUFCHK(&so->so_rcv);
1526		sbunlock(&so->so_rcv);
1527		error = sbwait(&so->so_rcv);
1528		if (error)
1529			goto out;
1530		goto restart;
1531	}
1532dontblock:
1533	/*
1534	 * From this point onward, we maintain 'nextrecord' as a cache of the
1535	 * pointer to the next record in the socket buffer.  We must keep the
1536	 * various socket buffer pointers and local stack versions of the
1537	 * pointers in sync, pushing out modifications before dropping the
1538	 * socket buffer mutex, and re-reading them when picking it up.
1539	 *
1540	 * Otherwise, we will race with the network stack appending new data
1541	 * or records onto the socket buffer by using inconsistent/stale
1542	 * versions of the field, possibly resulting in socket buffer
1543	 * corruption.
1544	 *
1545	 * By holding the high-level sblock(), we prevent simultaneous
1546	 * readers from pulling off the front of the socket buffer.
1547	 */
1548	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1549	if (uio->uio_td)
1550		uio->uio_td->td_proc->p_stats->p_ru.ru_msgrcv++;
1551	KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
1552	SBLASTRECORDCHK(&so->so_rcv);
1553	SBLASTMBUFCHK(&so->so_rcv);
1554	nextrecord = m->m_nextpkt;
1555	if (pr->pr_flags & PR_ADDR) {
1556		KASSERT(m->m_type == MT_SONAME,
1557		    ("m->m_type == %d", m->m_type));
1558		orig_resid = 0;
1559		if (psa != NULL)
1560			*psa = sodupsockaddr(mtod(m, struct sockaddr *),
1561			    M_NOWAIT);
1562		if (flags & MSG_PEEK) {
1563			m = m->m_next;
1564		} else {
1565			sbfree(&so->so_rcv, m);
1566			so->so_rcv.sb_mb = m_free(m);
1567			m = so->so_rcv.sb_mb;
1568			sockbuf_pushsync(&so->so_rcv, nextrecord);
1569		}
1570	}
1571
1572	/*
1573	 * Process one or more MT_CONTROL mbufs present before any data mbufs
1574	 * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
1575	 * just copy the data; if !MSG_PEEK, we call into the protocol to
1576	 * perform externalization (or freeing if controlp == NULL).
1577	 */
1578	if (m != NULL && m->m_type == MT_CONTROL) {
1579		struct mbuf *cm = NULL, *cmn;
1580		struct mbuf **cme = &cm;
1581
1582		do {
1583			if (flags & MSG_PEEK) {
1584				if (controlp != NULL) {
1585					*controlp = m_copy(m, 0, m->m_len);
1586					controlp = &(*controlp)->m_next;
1587				}
1588				m = m->m_next;
1589			} else {
1590				sbfree(&so->so_rcv, m);
1591				so->so_rcv.sb_mb = m->m_next;
1592				m->m_next = NULL;
1593				*cme = m;
1594				cme = &(*cme)->m_next;
1595				m = so->so_rcv.sb_mb;
1596			}
1597		} while (m != NULL && m->m_type == MT_CONTROL);
1598		if ((flags & MSG_PEEK) == 0)
1599			sockbuf_pushsync(&so->so_rcv, nextrecord);
1600		while (cm != NULL) {
1601			cmn = cm->m_next;
1602			cm->m_next = NULL;
1603			if (pr->pr_domain->dom_externalize != NULL) {
1604				SOCKBUF_UNLOCK(&so->so_rcv);
1605				error = (*pr->pr_domain->dom_externalize)
1606				    (cm, controlp);
1607				SOCKBUF_LOCK(&so->so_rcv);
1608			} else if (controlp != NULL)
1609				*controlp = cm;
1610			else
1611				m_freem(cm);
1612			if (controlp != NULL) {
1613				orig_resid = 0;
1614				while (*controlp != NULL)
1615					controlp = &(*controlp)->m_next;
1616			}
1617			cm = cmn;
1618		}
1619		if (m != NULL)
1620			nextrecord = so->so_rcv.sb_mb->m_nextpkt;
1621		else
1622			nextrecord = so->so_rcv.sb_mb;
1623		orig_resid = 0;
1624	}
1625	if (m != NULL) {
1626		if ((flags & MSG_PEEK) == 0) {
1627			KASSERT(m->m_nextpkt == nextrecord,
1628			    ("soreceive: post-control, nextrecord !sync"));
1629			if (nextrecord == NULL) {
1630				KASSERT(so->so_rcv.sb_mb == m,
1631				    ("soreceive: post-control, sb_mb!=m"));
1632				KASSERT(so->so_rcv.sb_lastrecord == m,
1633				    ("soreceive: post-control, lastrecord!=m"));
1634			}
1635		}
1636		type = m->m_type;
1637		if (type == MT_OOBDATA)
1638			flags |= MSG_OOB;
1639	} else {
1640		if ((flags & MSG_PEEK) == 0) {
1641			KASSERT(so->so_rcv.sb_mb == nextrecord,
1642			    ("soreceive: sb_mb != nextrecord"));
1643			if (so->so_rcv.sb_mb == NULL) {
1644				KASSERT(so->so_rcv.sb_lastrecord == NULL,
1645				    ("soreceive: sb_lastrecord != NULL"));
1646			}
1647		}
1648	}
1649	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1650	SBLASTRECORDCHK(&so->so_rcv);
1651	SBLASTMBUFCHK(&so->so_rcv);
1652
1653	/*
1654	 * Now continue to read any data mbufs off of the head of the socket
1655	 * buffer until the read request is satisfied.  Note that 'type' is
1656	 * used to store the type of any mbuf reads that have happened so far
1657	 * such that soreceive() can stop reading if the type changes, which
1658	 * causes soreceive() to return only one of regular data and inline
1659	 * out-of-band data in a single socket receive operation.
1660	 */
1661	moff = 0;
1662	offset = 0;
1663	while (m != NULL && uio->uio_resid > 0 && error == 0) {
1664		/*
1665		 * If the type of mbuf has changed since the last mbuf
1666		 * examined ('type'), end the receive operation.
1667	 	 */
1668		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1669		if (m->m_type == MT_OOBDATA) {
1670			if (type != MT_OOBDATA)
1671				break;
1672		} else if (type == MT_OOBDATA)
1673			break;
1674		else
1675		    KASSERT(m->m_type == MT_DATA,
1676			("m->m_type == %d", m->m_type));
1677		so->so_rcv.sb_state &= ~SBS_RCVATMARK;
1678		len = uio->uio_resid;
1679		if (so->so_oobmark && len > so->so_oobmark - offset)
1680			len = so->so_oobmark - offset;
1681		if (len > m->m_len - moff)
1682			len = m->m_len - moff;
1683		/*
1684		 * If mp is set, just pass back the mbufs.  Otherwise copy
1685		 * them out via the uio, then free.  Sockbuf must be
1686		 * consistent here (points to current mbuf, it points to next
1687		 * record) when we drop priority; we must note any additions
1688		 * to the sockbuf when we block interrupts again.
1689		 */
1690		if (mp == NULL) {
1691			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1692			SBLASTRECORDCHK(&so->so_rcv);
1693			SBLASTMBUFCHK(&so->so_rcv);
1694			SOCKBUF_UNLOCK(&so->so_rcv);
1695#ifdef ZERO_COPY_SOCKETS
1696			if (so_zero_copy_receive) {
1697				int disposable;
1698
1699				if ((m->m_flags & M_EXT)
1700				 && (m->m_ext.ext_type == EXT_DISPOSABLE))
1701					disposable = 1;
1702				else
1703					disposable = 0;
1704
1705				error = uiomoveco(mtod(m, char *) + moff,
1706						  (int)len, uio,
1707						  disposable);
1708			} else
1709#endif /* ZERO_COPY_SOCKETS */
1710			error = uiomove(mtod(m, char *) + moff, (int)len, uio);
1711			SOCKBUF_LOCK(&so->so_rcv);
1712			if (error) {
1713				/*
1714				 * The MT_SONAME mbuf has already been removed
1715				 * from the record, so it is necessary to
1716				 * remove the data mbufs, if any, to preserve
1717				 * the invariant in the case of PR_ADDR that
1718				 * requires MT_SONAME mbufs at the head of
1719				 * each record.
1720				 */
1721				if (m && pr->pr_flags & PR_ATOMIC &&
1722				    ((flags & MSG_PEEK) == 0))
1723					(void)sbdroprecord_locked(&so->so_rcv);
1724				goto release;
1725			}
1726		} else
1727			uio->uio_resid -= len;
1728		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1729		if (len == m->m_len - moff) {
1730			if (m->m_flags & M_EOR)
1731				flags |= MSG_EOR;
1732			if (flags & MSG_PEEK) {
1733				m = m->m_next;
1734				moff = 0;
1735			} else {
1736				nextrecord = m->m_nextpkt;
1737				sbfree(&so->so_rcv, m);
1738				if (mp != NULL) {
1739					*mp = m;
1740					mp = &m->m_next;
1741					so->so_rcv.sb_mb = m = m->m_next;
1742					*mp = NULL;
1743				} else {
1744					so->so_rcv.sb_mb = m_free(m);
1745					m = so->so_rcv.sb_mb;
1746				}
1747				sockbuf_pushsync(&so->so_rcv, nextrecord);
1748				SBLASTRECORDCHK(&so->so_rcv);
1749				SBLASTMBUFCHK(&so->so_rcv);
1750			}
1751		} else {
1752			if (flags & MSG_PEEK)
1753				moff += len;
1754			else {
1755				if (mp != NULL) {
1756					int copy_flag;
1757
1758					if (flags & MSG_DONTWAIT)
1759						copy_flag = M_DONTWAIT;
1760					else
1761						copy_flag = M_TRYWAIT;
1762					if (copy_flag == M_TRYWAIT)
1763						SOCKBUF_UNLOCK(&so->so_rcv);
1764					*mp = m_copym(m, 0, len, copy_flag);
1765					if (copy_flag == M_TRYWAIT)
1766						SOCKBUF_LOCK(&so->so_rcv);
1767					if (*mp == NULL) {
1768						/*
1769						 * m_copym() couldn't
1770						 * allocate an mbuf.  Adjust
1771						 * uio_resid back (it was
1772						 * adjusted down by len
1773						 * bytes, which we didn't end
1774						 * up "copying" over).
1775						 */
1776						uio->uio_resid += len;
1777						break;
1778					}
1779				}
1780				m->m_data += len;
1781				m->m_len -= len;
1782				so->so_rcv.sb_cc -= len;
1783			}
1784		}
1785		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1786		if (so->so_oobmark) {
1787			if ((flags & MSG_PEEK) == 0) {
1788				so->so_oobmark -= len;
1789				if (so->so_oobmark == 0) {
1790					so->so_rcv.sb_state |= SBS_RCVATMARK;
1791					break;
1792				}
1793			} else {
1794				offset += len;
1795				if (offset == so->so_oobmark)
1796					break;
1797			}
1798		}
1799		if (flags & MSG_EOR)
1800			break;
1801		/*
1802		 * If the MSG_WAITALL flag is set (for a non-atomic socket), we
1803		 * must not quit until "uio->uio_resid == 0" or an error
1804		 * termination.  If a signal/timeout occurs, return with a
1805		 * short count but without error.  Keep sockbuf locked
1806		 * against other readers.
1807		 */
1808		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
1809		    !sosendallatonce(so) && nextrecord == NULL) {
1810			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1811			if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE)
1812				break;
1813			/*
1814			 * Notify the protocol that some data has been
1815			 * drained before blocking.
1816			 */
1817			if (pr->pr_flags & PR_WANTRCVD) {
1818				SOCKBUF_UNLOCK(&so->so_rcv);
1819				(*pr->pr_usrreqs->pru_rcvd)(so, flags);
1820				SOCKBUF_LOCK(&so->so_rcv);
1821			}
1822			SBLASTRECORDCHK(&so->so_rcv);
1823			SBLASTMBUFCHK(&so->so_rcv);
1824			error = sbwait(&so->so_rcv);
1825			if (error)
1826				goto release;
1827			m = so->so_rcv.sb_mb;
1828			if (m != NULL)
1829				nextrecord = m->m_nextpkt;
1830		}
1831	}
1832
1833	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1834	if (m != NULL && pr->pr_flags & PR_ATOMIC) {
1835		flags |= MSG_TRUNC;
1836		if ((flags & MSG_PEEK) == 0)
1837			(void) sbdroprecord_locked(&so->so_rcv);
1838	}
1839	if ((flags & MSG_PEEK) == 0) {
1840		if (m == NULL) {
1841			/*
1842			 * First part is an inline SB_EMPTY_FIXUP().  Second
1843			 * part makes sure sb_lastrecord is up-to-date if
1844			 * there is still data in the socket buffer.
1845			 */
1846			so->so_rcv.sb_mb = nextrecord;
1847			if (so->so_rcv.sb_mb == NULL) {
1848				so->so_rcv.sb_mbtail = NULL;
1849				so->so_rcv.sb_lastrecord = NULL;
1850			} else if (nextrecord->m_nextpkt == NULL)
1851				so->so_rcv.sb_lastrecord = nextrecord;
1852		}
1853		SBLASTRECORDCHK(&so->so_rcv);
1854		SBLASTMBUFCHK(&so->so_rcv);
1855		/*
1856		 * If soreceive() is being done from the socket callback,
1857		 * then we don't need to generate an ACK to the peer to update
1858		 * the window, since an ACK will be generated on return to TCP.
1859		 */
1860		if (!(flags & MSG_SOCALLBCK) &&
1861		    (pr->pr_flags & PR_WANTRCVD)) {
1862			SOCKBUF_UNLOCK(&so->so_rcv);
1863			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
1864			SOCKBUF_LOCK(&so->so_rcv);
1865		}
1866	}
1867	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1868	if (orig_resid == uio->uio_resid && orig_resid &&
1869	    (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
1870		sbunlock(&so->so_rcv);
1871		goto restart;
1872	}
1873
1874	if (flagsp != NULL)
1875		*flagsp |= flags;
1876release:
1877	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1878	sbunlock(&so->so_rcv);
1879out:
1880	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1881	SOCKBUF_UNLOCK(&so->so_rcv);
1882	return (error);
1883}
1884
1885int
1886soreceive(so, psa, uio, mp0, controlp, flagsp)
1887	struct socket *so;
1888	struct sockaddr **psa;
1889	struct uio *uio;
1890	struct mbuf **mp0;
1891	struct mbuf **controlp;
1892	int *flagsp;
1893{
1894
1895	/* XXXRW: Temporary debugging. */
1896	KASSERT(so->so_proto->pr_usrreqs->pru_soreceive != soreceive,
1897	    ("soreceive: protocol calls soreceive"));
1898
1899	return (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio, mp0,
1900	    controlp, flagsp));
1901}
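
#if 0
/*
 * Illustrative only (not compiled): a minimal sketch of a kernel consumer
 * reading data from a socket through soreceive().  The buffer handling and
 * the use of MSG_DONTWAIT are assumptions for the example, not interfaces
 * defined in this file.
 */
static int
soreceive_example(struct socket *so, void *buf, int buflen, struct thread *td)
{
	struct uio auio;
	struct iovec aiov;
	int flags, error;

	aiov.iov_base = buf;
	aiov.iov_len = buflen;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = 0;
	auio.uio_resid = buflen;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_rw = UIO_READ;
	auio.uio_td = td;
	flags = MSG_DONTWAIT;
	error = soreceive(so, NULL, &auio, NULL, NULL, &flags);
	/* Bytes received: buflen - auio.uio_resid. */
	return (error);
}
#endif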
1902
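/*
 * soshutdown() disables further receives (SHUT_RD), further sends (SHUT_WR),
 * or both (SHUT_RDWR) on a socket.  Disabling receives is handled locally by
 * flushing the receive buffer with sorflush(); disabling sends is delegated
 * to the protocol via pru_shutdown().
 */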
1903int
1904soshutdown(so, how)
1905	struct socket *so;
1906	int how;
1907{
1908	struct protosw *pr = so->so_proto;
1909
1910	if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
1911		return (EINVAL);
1912
1913	if (how != SHUT_WR)
1914		sorflush(so);
1915	if (how != SHUT_RD)
1916		return ((*pr->pr_usrreqs->pru_shutdown)(so));
1917	return (0);
1918}
1919
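/*
 * sorflush() flushes a socket's receive buffer: it marks the socket as
 * unable to receive further data, detaches any buffered data, lets the
 * protocol dispose of externalized rights via dom_dispose() if it carries
 * them (PR_RIGHTS), and then releases the buffered mbufs.
 */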
1920void
1921sorflush(so)
1922	struct socket *so;
1923{
1924	struct sockbuf *sb = &so->so_rcv;
1925	struct protosw *pr = so->so_proto;
1926	struct sockbuf asb;
1927
1928	/*
1929	 * XXXRW: This is quite ugly.  Previously, this code made a copy of
1930	 * the socket buffer, then zero'd the original to clear the buffer
1931	 * fields.  However, with mutexes in the socket buffer, this causes
1932	 * problems.  We only clear the zeroable bits of the original;
1933	 * however, we have to initialize and destroy the mutex in the copy
1934	 * so that dom_dispose() and sbrelease() can lock it as needed.
1935	 */
1936	SOCKBUF_LOCK(sb);
1937	sb->sb_flags |= SB_NOINTR;
1938	(void) sblock(sb, M_WAITOK);
1939	/*
1940	 * socantrcvmore_locked() drops the socket buffer mutex so that it
1941	 * can safely perform wakeups.  Re-acquire the mutex before
1942	 * continuing.
1943	 */
1944	socantrcvmore_locked(so);
1945	SOCKBUF_LOCK(sb);
1946	sbunlock(sb);
1947	/*
1948	 * Invalidate/clear most of the sockbuf structure, but leave selinfo
1949	 * and mutex data unchanged.
1950	 */
1951	bzero(&asb, offsetof(struct sockbuf, sb_startzero));
1952	bcopy(&sb->sb_startzero, &asb.sb_startzero,
1953	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
1954	bzero(&sb->sb_startzero,
1955	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
1956	SOCKBUF_UNLOCK(sb);
1957
1958	SOCKBUF_LOCK_INIT(&asb, "so_rcv");
1959	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
1960		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
1961	sbrelease(&asb, so);
1962	SOCKBUF_LOCK_DESTROY(&asb);
1963}
1964
1965/*
1966 * Perhaps this routine, and sooptcopyout(), below, ought to come in an
1967 * additional variant to handle the case where the option value needs to be
1968 * some kind of integer, but not a specific size.  In addition to their use
1969 * here, these functions are also called by the protocol-level pr_ctloutput()
1970 * routines.
1971 */
1972int
1973sooptcopyin(sopt, buf, len, minlen)
1974	struct	sockopt *sopt;
1975	void	*buf;
1976	size_t	len;
1977	size_t	minlen;
1978{
1979	size_t	valsize;
1980
1981	/*
1982	 * If the user gives us more than we wanted, we ignore it, but if we
1983	 * don't get the minimum length the caller wants, we return EINVAL.
1984	 * On success, sopt->sopt_valsize is set to however much we actually
1985	 * retrieved.
1986	 */
1987	if ((valsize = sopt->sopt_valsize) < minlen)
1988		return EINVAL;
1989	if (valsize > len)
1990		sopt->sopt_valsize = valsize = len;
1991
1992	if (sopt->sopt_td != NULL)
1993		return (copyin(sopt->sopt_val, buf, valsize));
1994
1995	bcopy(sopt->sopt_val, buf, valsize);
1996	return (0);
1997}
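
#if 0
/*
 * Illustrative only (not compiled): a minimal sketch of how a protocol-level
 * pr_ctloutput() routine might consume a SOPT_SET request with sooptcopyin().
 * The routine name, the option, and the processing step are assumptions for
 * the example.
 */
static int
example_ctloutput_set(struct socket *so, struct sockopt *sopt)
{
	int optval, error;

	error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
	if (error)
		return (error);
	/* ... validate optval and apply it to protocol state ... */
	return (0);
}
#endif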
1998
1999/*
2000 * Kernel version of setsockopt(2).
2001 *
2002 * XXX: optlen is size_t, not socklen_t
2003 */
2004int
2005so_setsockopt(struct socket *so, int level, int optname, void *optval,
2006    size_t optlen)
2007{
2008	struct sockopt sopt;
2009
2010	sopt.sopt_level = level;
2011	sopt.sopt_name = optname;
2012	sopt.sopt_dir = SOPT_SET;
2013	sopt.sopt_val = optval;
2014	sopt.sopt_valsize = optlen;
2015	sopt.sopt_td = NULL;
2016	return (sosetopt(so, &sopt));
2017}
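
#if 0
/*
 * Illustrative only (not compiled): enabling a boolean socket option from
 * kernel code with the helper above; the particular option is an assumption
 * for the example.
 */
static int
so_setsockopt_example(struct socket *so)
{
	int one = 1;

	return (so_setsockopt(so, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)));
}
#endif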
2018
2019int
2020sosetopt(so, sopt)
2021	struct socket *so;
2022	struct sockopt *sopt;
2023{
2024	int	error, optval;
2025	struct	linger l;
2026	struct	timeval tv;
2027	u_long  val;
2028#ifdef MAC
2029	struct mac extmac;
2030#endif
2031
2032	error = 0;
2033	if (sopt->sopt_level != SOL_SOCKET) {
2034		if (so->so_proto && so->so_proto->pr_ctloutput)
2035			return ((*so->so_proto->pr_ctloutput)
2036				  (so, sopt));
2037		error = ENOPROTOOPT;
2038	} else {
2039		switch (sopt->sopt_name) {
2040#ifdef INET
2041		case SO_ACCEPTFILTER:
2042			error = do_setopt_accept_filter(so, sopt);
2043			if (error)
2044				goto bad;
2045			break;
2046#endif
2047		case SO_LINGER:
2048			error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
2049			if (error)
2050				goto bad;
2051
2052			SOCK_LOCK(so);
2053			so->so_linger = l.l_linger;
2054			if (l.l_onoff)
2055				so->so_options |= SO_LINGER;
2056			else
2057				so->so_options &= ~SO_LINGER;
2058			SOCK_UNLOCK(so);
2059			break;
2060
2061		case SO_DEBUG:
2062		case SO_KEEPALIVE:
2063		case SO_DONTROUTE:
2064		case SO_USELOOPBACK:
2065		case SO_BROADCAST:
2066		case SO_REUSEADDR:
2067		case SO_REUSEPORT:
2068		case SO_OOBINLINE:
2069		case SO_TIMESTAMP:
2070		case SO_BINTIME:
2071		case SO_NOSIGPIPE:
2072			error = sooptcopyin(sopt, &optval, sizeof optval,
2073					    sizeof optval);
2074			if (error)
2075				goto bad;
2076			SOCK_LOCK(so);
2077			if (optval)
2078				so->so_options |= sopt->sopt_name;
2079			else
2080				so->so_options &= ~sopt->sopt_name;
2081			SOCK_UNLOCK(so);
2082			break;
2083
2084		case SO_SNDBUF:
2085		case SO_RCVBUF:
2086		case SO_SNDLOWAT:
2087		case SO_RCVLOWAT:
2088			error = sooptcopyin(sopt, &optval, sizeof optval,
2089					    sizeof optval);
2090			if (error)
2091				goto bad;
2092
2093			/*
2094			 * Values < 1 make no sense for any of these options,
2095			 * so disallow them.
2096			 */
2097			if (optval < 1) {
2098				error = EINVAL;
2099				goto bad;
2100			}
2101
2102			switch (sopt->sopt_name) {
2103			case SO_SNDBUF:
2104			case SO_RCVBUF:
2105				if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
2106				    &so->so_snd : &so->so_rcv, (u_long)optval,
2107				    so, curthread) == 0) {
2108					error = ENOBUFS;
2109					goto bad;
2110				}
2111				(sopt->sopt_name == SO_SNDBUF ? &so->so_snd :
2112				    &so->so_rcv)->sb_flags &= ~SB_AUTOSIZE;
2113				break;
2114
2115			/*
2116			 * Make sure the low-water is never greater than the
2117			 * high-water.
2118			 */
2119			case SO_SNDLOWAT:
2120				SOCKBUF_LOCK(&so->so_snd);
2121				so->so_snd.sb_lowat =
2122				    (optval > so->so_snd.sb_hiwat) ?
2123				    so->so_snd.sb_hiwat : optval;
2124				SOCKBUF_UNLOCK(&so->so_snd);
2125				break;
2126			case SO_RCVLOWAT:
2127				SOCKBUF_LOCK(&so->so_rcv);
2128				so->so_rcv.sb_lowat =
2129				    (optval > so->so_rcv.sb_hiwat) ?
2130				    so->so_rcv.sb_hiwat : optval;
2131				SOCKBUF_UNLOCK(&so->so_rcv);
2132				break;
2133			}
2134			break;
2135
2136		case SO_SNDTIMEO:
2137		case SO_RCVTIMEO:
2138#ifdef COMPAT_IA32
2139			if (curthread->td_proc->p_sysent == &ia32_freebsd_sysvec) {
2140				struct timeval32 tv32;
2141
2142				error = sooptcopyin(sopt, &tv32, sizeof tv32,
2143				    sizeof tv32);
2144				CP(tv32, tv, tv_sec);
2145				CP(tv32, tv, tv_usec);
2146			} else
2147#endif
2148				error = sooptcopyin(sopt, &tv, sizeof tv,
2149				    sizeof tv);
2150			if (error)
2151				goto bad;
2152
2153			/* assert(hz > 0); */
2154			if (tv.tv_sec < 0 || tv.tv_sec > INT_MAX / hz ||
2155			    tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
2156				error = EDOM;
2157				goto bad;
2158			}
2159			/* assert(tick > 0); */
2160			/* assert(ULONG_MAX - INT_MAX >= 1000000); */
2161			val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick;
2162			if (val > INT_MAX) {
2163				error = EDOM;
2164				goto bad;
2165			}
2166			if (val == 0 && tv.tv_usec != 0)
2167				val = 1;
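			/*
			 * Worked example of the conversion above: with
			 * hz = 1000 (tick = 1000 microseconds), a timeout of
			 * 2.5 seconds (tv_sec = 2, tv_usec = 500000) becomes
			 * 2 * 1000 + 500000 / 1000 = 2500 ticks.
			 */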
2168
2169			switch (sopt->sopt_name) {
2170			case SO_SNDTIMEO:
2171				so->so_snd.sb_timeo = val;
2172				break;
2173			case SO_RCVTIMEO:
2174				so->so_rcv.sb_timeo = val;
2175				break;
2176			}
2177			break;
2178
2179		case SO_LABEL:
2180#ifdef MAC
2181			error = sooptcopyin(sopt, &extmac, sizeof extmac,
2182			    sizeof extmac);
2183			if (error)
2184				goto bad;
2185			error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
2186			    so, &extmac);
2187#else
2188			error = EOPNOTSUPP;
2189#endif
2190			break;
2191
2192		default:
2193			error = ENOPROTOOPT;
2194			break;
2195		}
2196		if (error == 0 && so->so_proto != NULL &&
2197		    so->so_proto->pr_ctloutput != NULL) {
2198			(void) ((*so->so_proto->pr_ctloutput)
2199				  (so, sopt));
2200		}
2201	}
2202bad:
2203	return (error);
2204}
2205
2206/*
2207 * Helper routine for getsockopt.
2208 */
2209int
2210sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
2211{
2212	int	error;
2213	size_t	valsize;
2214
2215	error = 0;
2216
2217	/*
2218	 * Documented get behavior is that we always return a value, possibly
2219	 * truncated to fit in the user's buffer.  Traditional behavior is
2220	 * that we always tell the user precisely how much we copied, rather
2221	 * than something useful like the total amount we had available for
2222	 * her.  Note that this interface is not idempotent; the entire
2223	 * answer must be generated ahead of time.
2224	 */
2225	valsize = min(len, sopt->sopt_valsize);
2226	sopt->sopt_valsize = valsize;
2227	if (sopt->sopt_val != NULL) {
2228		if (sopt->sopt_td != NULL)
2229			error = copyout(buf, sopt->sopt_val, valsize);
2230		else
2231			bcopy(buf, sopt->sopt_val, valsize);
2232	}
2233	return (error);
2234}
2235
2236int
2237sogetopt(so, sopt)
2238	struct socket *so;
2239	struct sockopt *sopt;
2240{
2241	int	error, optval;
2242	struct	linger l;
2243	struct	timeval tv;
2244#ifdef MAC
2245	struct mac extmac;
2246#endif
2247
2248	error = 0;
2249	if (sopt->sopt_level != SOL_SOCKET) {
2250		if (so->so_proto && so->so_proto->pr_ctloutput) {
2251			return ((*so->so_proto->pr_ctloutput)
2252				  (so, sopt));
2253		} else
2254			return (ENOPROTOOPT);
2255	} else {
2256		switch (sopt->sopt_name) {
2257#ifdef INET
2258		case SO_ACCEPTFILTER:
2259			error = do_getopt_accept_filter(so, sopt);
2260			break;
2261#endif
2262		case SO_LINGER:
2263			SOCK_LOCK(so);
2264			l.l_onoff = so->so_options & SO_LINGER;
2265			l.l_linger = so->so_linger;
2266			SOCK_UNLOCK(so);
2267			error = sooptcopyout(sopt, &l, sizeof l);
2268			break;
2269
2270		case SO_USELOOPBACK:
2271		case SO_DONTROUTE:
2272		case SO_DEBUG:
2273		case SO_KEEPALIVE:
2274		case SO_REUSEADDR:
2275		case SO_REUSEPORT:
2276		case SO_BROADCAST:
2277		case SO_OOBINLINE:
2278		case SO_ACCEPTCONN:
2279		case SO_TIMESTAMP:
2280		case SO_BINTIME:
2281		case SO_NOSIGPIPE:
2282			optval = so->so_options & sopt->sopt_name;
2283integer:
2284			error = sooptcopyout(sopt, &optval, sizeof optval);
2285			break;
2286
2287		case SO_TYPE:
2288			optval = so->so_type;
2289			goto integer;
2290
2291		case SO_ERROR:
2292			SOCK_LOCK(so);
2293			optval = so->so_error;
2294			so->so_error = 0;
2295			SOCK_UNLOCK(so);
2296			goto integer;
2297
2298		case SO_SNDBUF:
2299			optval = so->so_snd.sb_hiwat;
2300			goto integer;
2301
2302		case SO_RCVBUF:
2303			optval = so->so_rcv.sb_hiwat;
2304			goto integer;
2305
2306		case SO_SNDLOWAT:
2307			optval = so->so_snd.sb_lowat;
2308			goto integer;
2309
2310		case SO_RCVLOWAT:
2311			optval = so->so_rcv.sb_lowat;
2312			goto integer;
2313
2314		case SO_SNDTIMEO:
2315		case SO_RCVTIMEO:
2316			optval = (sopt->sopt_name == SO_SNDTIMEO ?
2317				  so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
2318
2319			tv.tv_sec = optval / hz;
2320			tv.tv_usec = (optval % hz) * tick;
2321#ifdef COMPAT_IA32
2322			if (curthread->td_proc->p_sysent == &ia32_freebsd_sysvec) {
2323				struct timeval32 tv32;
2324
2325				CP(tv, tv32, tv_sec);
2326				CP(tv, tv32, tv_usec);
2327				error = sooptcopyout(sopt, &tv32, sizeof tv32);
2328			} else
2329#endif
2330				error = sooptcopyout(sopt, &tv, sizeof tv);
2331			break;
2332
2333		case SO_LABEL:
2334#ifdef MAC
2335			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2336			    sizeof(extmac));
2337			if (error)
2338				return (error);
2339			error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
2340			    so, &extmac);
2341			if (error)
2342				return (error);
2343			error = sooptcopyout(sopt, &extmac, sizeof extmac);
2344#else
2345			error = EOPNOTSUPP;
2346#endif
2347			break;
2348
2349		case SO_PEERLABEL:
2350#ifdef MAC
2351			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2352			    sizeof(extmac));
2353			if (error)
2354				return (error);
2355			error = mac_getsockopt_peerlabel(
2356			    sopt->sopt_td->td_ucred, so, &extmac);
2357			if (error)
2358				return (error);
2359			error = sooptcopyout(sopt, &extmac, sizeof extmac);
2360#else
2361			error = EOPNOTSUPP;
2362#endif
2363			break;
2364
2365		case SO_LISTENQLIMIT:
2366			optval = so->so_qlimit;
2367			goto integer;
2368
2369		case SO_LISTENQLEN:
2370			optval = so->so_qlen;
2371			goto integer;
2372
2373		case SO_LISTENINCQLEN:
2374			optval = so->so_incqlen;
2375			goto integer;
2376
2377		default:
2378			error = ENOPROTOOPT;
2379			break;
2380		}
2381		return (error);
2382	}
2383}
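
#if 0
/*
 * Illustrative only (not compiled): querying a socket option from kernel
 * code by filling in a struct sockopt by hand, mirroring so_setsockopt()
 * above for the get direction.  The chosen option is an assumption for the
 * example.
 */
static int
so_getsockopt_example(struct socket *so, int *valp)
{
	struct sockopt sopt;

	sopt.sopt_dir = SOPT_GET;
	sopt.sopt_level = SOL_SOCKET;
	sopt.sopt_name = SO_RCVBUF;
	sopt.sopt_val = valp;
	sopt.sopt_valsize = sizeof(*valp);
	sopt.sopt_td = NULL;		/* kernel address space */
	return (sogetopt(so, &sopt));
}
#endif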
2384
2385/* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */
2386int
2387soopt_getm(struct sockopt *sopt, struct mbuf **mp)
2388{
2389	struct mbuf *m, *m_prev;
2390	int sopt_size = sopt->sopt_valsize;
2391
2392	MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
2393	if (m == NULL)
2394		return ENOBUFS;
2395	if (sopt_size > MLEN) {
2396		MCLGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT);
2397		if ((m->m_flags & M_EXT) == 0) {
2398			m_free(m);
2399			return ENOBUFS;
2400		}
2401		m->m_len = min(MCLBYTES, sopt_size);
2402	} else {
2403		m->m_len = min(MLEN, sopt_size);
2404	}
2405	sopt_size -= m->m_len;
2406	*mp = m;
2407	m_prev = m;
2408
2409	while (sopt_size) {
2410		MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
2411		if (m == NULL) {
2412			m_freem(*mp);
2413			return ENOBUFS;
2414		}
2415		if (sopt_size > MLEN) {
2416			MCLGET(m, sopt->sopt_td != NULL ? M_TRYWAIT :
2417			    M_DONTWAIT);
2418			if ((m->m_flags & M_EXT) == 0) {
2419				m_freem(m);
2420				m_freem(*mp);
2421				return ENOBUFS;
2422			}
2423			m->m_len = min(MCLBYTES, sopt_size);
2424		} else {
2425			m->m_len = min(MLEN, sopt_size);
2426		}
2427		sopt_size -= m->m_len;
2428		m_prev->m_next = m;
2429		m_prev = m;
2430	}
2431	return (0);
2432}
2433
2434/* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
2435int
2436soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
2437{
2438	struct mbuf *m0 = m;
2439
2440	if (sopt->sopt_val == NULL)
2441		return (0);
2442	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2443		if (sopt->sopt_td != NULL) {
2444			int error;
2445
2446			error = copyin(sopt->sopt_val, mtod(m, char *),
2447				       m->m_len);
2448			if (error != 0) {
2449				m_freem(m0);
2450				return (error);
2451			}
2452		} else
2453			bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
2454		sopt->sopt_valsize -= m->m_len;
2455		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2456		m = m->m_next;
2457	}
2458	if (m != NULL) /* should have been allocated enough at ip6_sooptmcopyin() */
2459		panic("ip6_sooptmcopyin");
2460	return (0);
2461}
2462
2463/* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
2464int
2465soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
2466{
2467	struct mbuf *m0 = m;
2468	size_t valsize = 0;
2469
2470	if (sopt->sopt_val == NULL)
2471		return (0);
2472	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2473		if (sopt->sopt_td != NULL) {
2474			int error;
2475
2476			error = copyout(mtod(m, char *), sopt->sopt_val,
2477				       m->m_len);
2478			if (error != 0) {
2479				m_freem(m0);
2480				return (error);
2481			}
2482		} else
2483			bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
2484		sopt->sopt_valsize -= m->m_len;
2485		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2486		valsize += m->m_len;
2487		m = m->m_next;
2488	}
2489	if (m != NULL) {
2490		/* the user-supplied option buffer should have been large enough */
2491		m_freem(m0);
2492		return (EINVAL);
2493	}
2494	sopt->sopt_valsize = valsize;
2495	return (0);
2496}
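
#if 0
/*
 * Illustrative only (not compiled): the typical set-direction sequence for
 * the mbuf-based helpers above: size an mbuf chain to the option data with
 * soopt_getm(), fill it from the caller with soopt_mcopyin(), and let the
 * protocol examine the chain.  soopt_mcopyout() is the get-direction
 * counterpart.  The processing step is an assumption for the example.
 */
static int
soopt_mbuf_set_example(struct sockopt *sopt)
{
	struct mbuf *m;
	int error;

	error = soopt_getm(sopt, &m);	/* chain sized to sopt->sopt_valsize */
	if (error != 0)
		return (error);
	error = soopt_mcopyin(sopt, m);	/* frees the chain itself on error */
	if (error != 0)
		return (error);
	/* ... protocol-specific processing of the chain ... */
	m_freem(m);
	return (0);
}
#endif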
2497
2498/*
2499 * sohasoutofband(): protocol notifies socket layer of the arrival of new
2500 * out-of-band data, which will then notify socket consumers.
2501 */
2502void
2503sohasoutofband(so)
2504	struct socket *so;
2505{
2506	if (so->so_sigio != NULL)
2507		pgsigio(&so->so_sigio, SIGURG, 0);
2508	selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
2509}
2510
2511int
2512sopoll(struct socket *so, int events, struct ucred *active_cred,
2513    struct thread *td)
2514{
2515
2516	/* XXXRW: Temporary debugging. */
2517	KASSERT(so->so_proto->pr_usrreqs->pru_sopoll != sopoll,
2518	    ("sopoll: protocol calls sopoll"));
2519
2520	return (so->so_proto->pr_usrreqs->pru_sopoll(so, events, active_cred,
2521	    td));
2522}
2523
2524int
2525sopoll_generic(struct socket *so, int events, struct ucred *active_cred,
2526    struct thread *td)
2527{
2528	int revents = 0;
2529
2530	SOCKBUF_LOCK(&so->so_snd);
2531	SOCKBUF_LOCK(&so->so_rcv);
2532	if (events & (POLLIN | POLLRDNORM))
2533		if (soreadable(so))
2534			revents |= events & (POLLIN | POLLRDNORM);
2535
2536	if (events & POLLINIGNEOF)
2537		if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat ||
2538		    !TAILQ_EMPTY(&so->so_comp) || so->so_error)
2539			revents |= POLLINIGNEOF;
2540
2541	if (events & (POLLOUT | POLLWRNORM))
2542		if (sowriteable(so))
2543			revents |= events & (POLLOUT | POLLWRNORM);
2544
2545	if (events & (POLLPRI | POLLRDBAND))
2546		if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK))
2547			revents |= events & (POLLPRI | POLLRDBAND);
2548
2549	if (revents == 0) {
2550		if (events &
2551		    (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM |
2552		     POLLRDBAND)) {
2553			selrecord(td, &so->so_rcv.sb_sel);
2554			so->so_rcv.sb_flags |= SB_SEL;
2555		}
2556
2557		if (events & (POLLOUT | POLLWRNORM)) {
2558			selrecord(td, &so->so_snd.sb_sel);
2559			so->so_snd.sb_flags |= SB_SEL;
2560		}
2561	}
2562
2563	SOCKBUF_UNLOCK(&so->so_rcv);
2564	SOCKBUF_UNLOCK(&so->so_snd);
2565	return (revents);
2566}
2567
2568int
2569soo_kqfilter(struct file *fp, struct knote *kn)
2570{
2571	struct socket *so = kn->kn_fp->f_data;
2572	struct sockbuf *sb;
2573
2574	switch (kn->kn_filter) {
2575	case EVFILT_READ:
2576		if (so->so_options & SO_ACCEPTCONN)
2577			kn->kn_fop = &solisten_filtops;
2578		else
2579			kn->kn_fop = &soread_filtops;
2580		sb = &so->so_rcv;
2581		break;
2582	case EVFILT_WRITE:
2583		kn->kn_fop = &sowrite_filtops;
2584		sb = &so->so_snd;
2585		break;
2586	default:
2587		return (EINVAL);
2588	}
2589
2590	SOCKBUF_LOCK(sb);
2591	knlist_add(&sb->sb_sel.si_note, kn, 1);
2592	sb->sb_flags |= SB_KNOTE;
2593	SOCKBUF_UNLOCK(sb);
2594	return (0);
2595}
2596
2597/*
2598 * Some routines that return EOPNOTSUPP for entry points that are not
2599 * supported by a protocol.  Fill in as needed.
2600 */
2601int
2602pru_accept_notsupp(struct socket *so, struct sockaddr **nam)
2603{
2604	return EOPNOTSUPP;
2605}
2606
2607int
2608pru_attach_notsupp(struct socket *so, int proto, struct thread *td)
2609{
2610	return EOPNOTSUPP;
2611}
2612
2613int
2614pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
2615{
2616	return EOPNOTSUPP;
2617}
2618
2619int
2620pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
2621{
2622	return EOPNOTSUPP;
2623}
2624
2625int
2626pru_connect2_notsupp(struct socket *so1, struct socket *so2)
2627{
2628	return EOPNOTSUPP;
2629}
2630
2631int
2632pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data,
2633	struct ifnet *ifp, struct thread *td)
2634{
2635	return EOPNOTSUPP;
2636}
2637
2638int
2639pru_disconnect_notsupp(struct socket *so)
2640{
2641	return EOPNOTSUPP;
2642}
2643
2644int
2645pru_listen_notsupp(struct socket *so, int backlog, struct thread *td)
2646{
2647	return EOPNOTSUPP;
2648}
2649
2650int
2651pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam)
2652{
2653	return EOPNOTSUPP;
2654}
2655
2656int
2657pru_rcvd_notsupp(struct socket *so, int flags)
2658{
2659	return EOPNOTSUPP;
2660}
2661
2662int
2663pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags)
2664{
2665	return EOPNOTSUPP;
2666}
2667
2668int
2669pru_send_notsupp(struct socket *so, int flags, struct mbuf *m,
2670	struct sockaddr *addr, struct mbuf *control, struct thread *td)
2671{
2672	return EOPNOTSUPP;
2673}
2674
2675/*
2676 * This isn't really a ``null'' operation, but it's the default one and
2677 * doesn't do anything destructive.
2678 */
2679int
2680pru_sense_null(struct socket *so, struct stat *sb)
2681{
2682	sb->st_blksize = so->so_snd.sb_hiwat;
2683	return 0;
2684}
2685
2686int
2687pru_shutdown_notsupp(struct socket *so)
2688{
2689	return EOPNOTSUPP;
2690}
2691
2692int
2693pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam)
2694{
2695	return EOPNOTSUPP;
2696}
2697
2698int
2699pru_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio,
2700	struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
2701{
2702	return EOPNOTSUPP;
2703}
2704
2705int
2706pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr,
2707	struct uio *uio, struct mbuf **mp0, struct mbuf **controlp,
2708	int *flagsp)
2709{
2710	return EOPNOTSUPP;
2711}
2712
2713int
2714pru_sopoll_notsupp(struct socket *so, int events, struct ucred *cred,
2715	struct thread *td)
2716{
2717	return EOPNOTSUPP;
2718}
2719
2720static void
2721filt_sordetach(struct knote *kn)
2722{
2723	struct socket *so = kn->kn_fp->f_data;
2724
2725	SOCKBUF_LOCK(&so->so_rcv);
2726	knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1);
2727	if (knlist_empty(&so->so_rcv.sb_sel.si_note))
2728		so->so_rcv.sb_flags &= ~SB_KNOTE;
2729	SOCKBUF_UNLOCK(&so->so_rcv);
2730}
2731
2732/*ARGSUSED*/
2733static int
2734filt_soread(struct knote *kn, long hint)
2735{
2736	struct socket *so;
2737
2738	so = kn->kn_fp->f_data;
2739	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2740
2741	kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
2742	if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
2743		kn->kn_flags |= EV_EOF;
2744		kn->kn_fflags = so->so_error;
2745		return (1);
2746	} else if (so->so_error)	/* temporary udp error */
2747		return (1);
2748	else if (kn->kn_sfflags & NOTE_LOWAT)
2749		return (kn->kn_data >= kn->kn_sdata);
2750	else
2751		return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat);
2752}
2753
2754static void
2755filt_sowdetach(struct knote *kn)
2756{
2757	struct socket *so = kn->kn_fp->f_data;
2758
2759	SOCKBUF_LOCK(&so->so_snd);
2760	knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1);
2761	if (knlist_empty(&so->so_snd.sb_sel.si_note))
2762		so->so_snd.sb_flags &= ~SB_KNOTE;
2763	SOCKBUF_UNLOCK(&so->so_snd);
2764}
2765
2766/*ARGSUSED*/
2767static int
2768filt_sowrite(struct knote *kn, long hint)
2769{
2770	struct socket *so;
2771
2772	so = kn->kn_fp->f_data;
2773	SOCKBUF_LOCK_ASSERT(&so->so_snd);
2774	kn->kn_data = sbspace(&so->so_snd);
2775	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
2776		kn->kn_flags |= EV_EOF;
2777		kn->kn_fflags = so->so_error;
2778		return (1);
2779	} else if (so->so_error)	/* temporary udp error */
2780		return (1);
2781	else if (((so->so_state & SS_ISCONNECTED) == 0) &&
2782	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
2783		return (0);
2784	else if (kn->kn_sfflags & NOTE_LOWAT)
2785		return (kn->kn_data >= kn->kn_sdata);
2786	else
2787		return (kn->kn_data >= so->so_snd.sb_lowat);
2788}
2789
2790/*ARGSUSED*/
2791static int
2792filt_solisten(struct knote *kn, long hint)
2793{
2794	struct socket *so = kn->kn_fp->f_data;
2795
2796	kn->kn_data = so->so_qlen;
2797	return (! TAILQ_EMPTY(&so->so_comp));
2798}
2799
2800int
2801socheckuid(struct socket *so, uid_t uid)
2802{
2803
2804	if (so == NULL)
2805		return (EPERM);
2806	if (so->so_cred->cr_uid != uid)
2807		return (EPERM);
2808	return (0);
2809}
2810
2811static int
2812sysctl_somaxconn(SYSCTL_HANDLER_ARGS)
2813{
2814	int error;
2815	int val;
2816
2817	val = somaxconn;
2818	error = sysctl_handle_int(oidp, &val, sizeof(int), req);
2819	if (error || !req->newptr)
2820		return (error);
2821
2822	if (val < 1 || val > USHRT_MAX)
2823		return (EINVAL);
2824
2825	somaxconn = val;
2826	return (0);
2827}
2828
2829/*
2830 * These functions are used by protocols to notify the socket layer (and its
2831 * consumers) of state changes in the sockets driven by protocol-side events.
2832 */
2833
2834/*
2835 * Procedures to manipulate state flags of socket and do appropriate wakeups.
2836 *
2837 * Normal sequence from the active (originating) side is that
2838 * soisconnecting() is called during processing of connect() call, resulting
2839 * in an eventual call to soisconnected() if/when the connection is
2840 * established.  When the connection is torn down soisdisconnecting() is
2841 * called during processing of disconnect() call, and soisdisconnected() is
2842 * called when the connection to the peer is totally severed.  The semantics
2843 * of these routines are such that connectionless protocols can call
2844 * soisconnected() and soisdisconnected() only, bypassing the in-progress
2845 * calls when setting up a ``connection'' takes no time.
2846 *
2847 * From the passive side, a socket is created with two queues of sockets:
2848 * so_incomp for connections in progress and so_comp for connections already
2849 * made and awaiting user acceptance.  As a protocol is preparing incoming
2850 * connections, it creates a socket structure queued on so_incomp by calling
2851 * sonewconn().  When the connection is established, soisconnected() is
2852 * called, and transfers the socket structure to so_comp, making it available
2853 * to accept().
2854 *
2855 * If a socket is closed with sockets on either so_incomp or so_comp, these
2856 * sockets are dropped.
2857 *
2858 * If higher-level protocols are implemented in the kernel, the wakeups done
2859 * here will sometimes cause software-interrupt process scheduling.
2860 */
2861void
2862soisconnecting(so)
2863	register struct socket *so;
2864{
2865
2866	SOCK_LOCK(so);
2867	so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
2868	so->so_state |= SS_ISCONNECTING;
2869	SOCK_UNLOCK(so);
2870}
2871
2872void
2873soisconnected(so)
2874	struct socket *so;
2875{
2876	struct socket *head;
2877
2878	ACCEPT_LOCK();
2879	SOCK_LOCK(so);
2880	so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
2881	so->so_state |= SS_ISCONNECTED;
2882	head = so->so_head;
2883	if (head != NULL && (so->so_qstate & SQ_INCOMP)) {
2884		if ((so->so_options & SO_ACCEPTFILTER) == 0) {
2885			SOCK_UNLOCK(so);
2886			TAILQ_REMOVE(&head->so_incomp, so, so_list);
2887			head->so_incqlen--;
2888			so->so_qstate &= ~SQ_INCOMP;
2889			TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
2890			head->so_qlen++;
2891			so->so_qstate |= SQ_COMP;
2892			ACCEPT_UNLOCK();
2893			sorwakeup(head);
2894			wakeup_one(&head->so_timeo);
2895		} else {
2896			ACCEPT_UNLOCK();
2897			so->so_upcall =
2898			    head->so_accf->so_accept_filter->accf_callback;
2899			so->so_upcallarg = head->so_accf->so_accept_filter_arg;
2900			so->so_rcv.sb_flags |= SB_UPCALL;
2901			so->so_options &= ~SO_ACCEPTFILTER;
2902			SOCK_UNLOCK(so);
2903			so->so_upcall(so, so->so_upcallarg, M_DONTWAIT);
2904		}
2905		return;
2906	}
2907	SOCK_UNLOCK(so);
2908	ACCEPT_UNLOCK();
2909	wakeup(&so->so_timeo);
2910	sorwakeup(so);
2911	sowwakeup(so);
2912}
2913
2914void
2915soisdisconnecting(so)
2916	register struct socket *so;
2917{
2918
2919	/*
2920	 * Note: This code assumes that SOCK_LOCK(so) and
2921	 * SOCKBUF_LOCK(&so->so_rcv) are the same.
2922	 */
2923	SOCKBUF_LOCK(&so->so_rcv);
2924	so->so_state &= ~SS_ISCONNECTING;
2925	so->so_state |= SS_ISDISCONNECTING;
2926	so->so_rcv.sb_state |= SBS_CANTRCVMORE;
2927	sorwakeup_locked(so);
2928	SOCKBUF_LOCK(&so->so_snd);
2929	so->so_snd.sb_state |= SBS_CANTSENDMORE;
2930	sowwakeup_locked(so);
2931	wakeup(&so->so_timeo);
2932}
2933
2934void
2935soisdisconnected(so)
2936	register struct socket *so;
2937{
2938
2939	/*
2940	 * Note: This code assumes that SOCK_LOCK(so) and
2941	 * SOCKBUF_LOCK(&so->so_rcv) are the same.
2942	 */
2943	SOCKBUF_LOCK(&so->so_rcv);
2944	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
2945	so->so_state |= SS_ISDISCONNECTED;
2946	so->so_rcv.sb_state |= SBS_CANTRCVMORE;
2947	sorwakeup_locked(so);
2948	SOCKBUF_LOCK(&so->so_snd);
2949	so->so_snd.sb_state |= SBS_CANTSENDMORE;
2950	sbdrop_locked(&so->so_snd, so->so_snd.sb_cc);
2951	sowwakeup_locked(so);
2952	wakeup(&so->so_timeo);
2953}
2954
2955/*
2956 * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
2957 */
2958struct sockaddr *
2959sodupsockaddr(const struct sockaddr *sa, int mflags)
2960{
2961	struct sockaddr *sa2;
2962
2963	sa2 = malloc(sa->sa_len, M_SONAME, mflags);
2964	if (sa2)
2965		bcopy(sa, sa2, sa->sa_len);
2966	return sa2;
2967}
2968
2969/*
2970 * Create an external-format (``xsocket'') structure using the information in
2971 * the kernel-format socket structure pointed to by so.  This is done to
2972 * reduce the spew of irrelevant information over this interface, to isolate
2973 * user code from changes in the kernel structure, and potentially to provide
2974 * information-hiding if we decide that some of this information should be
2975 * hidden from users.
2976 */
2977void
2978sotoxsocket(struct socket *so, struct xsocket *xso)
2979{
2980	xso->xso_len = sizeof *xso;
2981	xso->xso_so = so;
2982	xso->so_type = so->so_type;
2983	xso->so_options = so->so_options;
2984	xso->so_linger = so->so_linger;
2985	xso->so_state = so->so_state;
2986	xso->so_pcb = so->so_pcb;
2987	xso->xso_protocol = so->so_proto->pr_protocol;
2988	xso->xso_family = so->so_proto->pr_domain->dom_family;
2989	xso->so_qlen = so->so_qlen;
2990	xso->so_incqlen = so->so_incqlen;
2991	xso->so_qlimit = so->so_qlimit;
2992	xso->so_timeo = so->so_timeo;
2993	xso->so_error = so->so_error;
2994	xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0;
2995	xso->so_oobmark = so->so_oobmark;
2996	sbtoxsockbuf(&so->so_snd, &xso->so_snd);
2997	sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
2998	xso->so_uid = so->so_cred->cr_uid;
2999}
3000