uipc_socket.c revision 205014
1/*-
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993
3 *	The Regents of the University of California.
4 * Copyright (c) 2004 The FreeBSD Foundation
5 * Copyright (c) 2004-2008 Robert N. M. Watson
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 * 4. Neither the name of the University nor the names of its contributors
17 *    may be used to endorse or promote products derived from this software
18 *    without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 *
32 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
33 */
34
35/*
36 * Comments on the socket life cycle:
37 *
38 * soalloc() sets up socket layer state for a socket, called only by
39 * socreate() and sonewconn().  Socket layer private.
40 *
41 * sodealloc() tears down socket layer state for a socket, called only by
42 * sofree() and sonewconn().  Socket layer private.
43 *
44 * pru_attach() associates protocol layer state with an allocated socket;
45 * called only once, may fail, aborting socket allocation.  This is called
46 * from socreate() and sonewconn().  Socket layer private.
47 *
48 * pru_detach() disassociates protocol layer state from an attached socket,
49 * and will be called exactly once for sockets in which pru_attach() has
50 * been successfully called.  If pru_attach() returned an error,
51 * pru_detach() will not be called.  Socket layer private.
52 *
53 * pru_abort() and pru_close() notify the protocol layer that the last
54 * consumer of a socket is starting to tear down the socket, and that the
55 * protocol should terminate the connection.  Historically, pru_abort() also
56 * detached protocol state from the socket state, but this is no longer the
57 * case.
58 *
59 * socreate() creates a socket and attaches protocol state.  This is a public
60 * interface that may be used by socket layer consumers to create new
61 * sockets.
62 *
63 * sonewconn() creates a socket and attaches protocol state.  This is a
64 * public interface that may be used by protocols to create new sockets
65 * when a new connection is received; the new socket will then be available
66 * for accept() on the listen socket.
67 *
68 * soclose() destroys a socket after possibly waiting for it to disconnect.
69 * This is a public interface that socket consumers should use to close and
70 * release a socket when done with it.
71 *
72 * soabort() destroys a socket without waiting for it to disconnect (used
73 * only for incoming connections that are already partially or fully
74 * connected).  This is used internally by the socket layer when clearing
75 * listen socket queues (due to overflow or close on the listen socket), but
76 * is also a public interface protocols may use to abort connections in
77 * their incomplete listen queues should they no longer be required.  Sockets
78 * placed in completed connection listen queues should not be aborted for
79 * reasons described in the comment above the soclose() implementation.  This
80 * is not a general purpose close routine, and except in the specific
81 * circumstances described here, should not be used.
82 *
83 * sofree() will free a socket and its protocol state if all references on
84 * the socket have been released, and is the public interface to attempt to
85 * free a socket when a reference is removed.  This is a socket layer private
86 * interface.
87 *
88 * NOTE: In addition to socreate() and soclose(), which provide a single
89 * socket reference to the consumer to be managed as required, there are two
90 * calls to explicitly manage socket references: soref() and sorele().
91 * Currently, these are generally required only when transitioning a socket
92 * from a listen queue to a file descriptor, in order to prevent garbage
93 * collection of the socket at an untimely moment.  For a number of reasons,
94 * these interfaces are not preferred, and should be avoided.
95 */
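/*
 * Example (illustrative sketch, not part of the original file): a kernel
 * consumer driving the public life cycle interfaces described above.  The
 * protocol parameters and error handling are arbitrary; 'td' is assumed to
 * be the calling thread.
 *
 *	struct socket *so;
 *	int error;
 *
 *	error = socreate(PF_INET, &so, SOCK_STREAM, IPPROTO_TCP,
 *	    td->td_ucred, td);
 *	if (error != 0)
 *		return (error);
 *	... use the socket: sobind(), soconnect(), sosend(), soreceive() ...
 *	error = soclose(so);
 */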
96
97#include <sys/cdefs.h>
98__FBSDID("$FreeBSD: head/sys/kern/uipc_socket.c 205014 2010-03-11 14:49:06Z nwhitehorn $");
99
100#include "opt_inet.h"
101#include "opt_inet6.h"
102#include "opt_zero.h"
103#include "opt_compat.h"
104
105#include <sys/param.h>
106#include <sys/systm.h>
107#include <sys/fcntl.h>
108#include <sys/limits.h>
109#include <sys/lock.h>
110#include <sys/mac.h>
111#include <sys/malloc.h>
112#include <sys/mbuf.h>
113#include <sys/mutex.h>
114#include <sys/domain.h>
115#include <sys/file.h>			/* for struct knote */
116#include <sys/kernel.h>
117#include <sys/event.h>
118#include <sys/eventhandler.h>
119#include <sys/poll.h>
120#include <sys/proc.h>
121#include <sys/protosw.h>
122#include <sys/socket.h>
123#include <sys/socketvar.h>
124#include <sys/resourcevar.h>
125#include <net/route.h>
126#include <sys/signalvar.h>
127#include <sys/stat.h>
128#include <sys/sx.h>
129#include <sys/sysctl.h>
130#include <sys/uio.h>
131#include <sys/jail.h>
132
133#include <net/vnet.h>
134
135#include <security/mac/mac_framework.h>
136
137#include <vm/uma.h>
138
139#ifdef COMPAT_FREEBSD32
140#include <sys/mount.h>
141#include <sys/sysent.h>
142#include <compat/freebsd32/freebsd32.h>
143#endif
144
145static int	soreceive_rcvoob(struct socket *so, struct uio *uio,
146		    int flags);
147
148static void	filt_sordetach(struct knote *kn);
149static int	filt_soread(struct knote *kn, long hint);
150static void	filt_sowdetach(struct knote *kn);
151static int	filt_sowrite(struct knote *kn, long hint);
152static int	filt_solisten(struct knote *kn, long hint);
153
154static struct filterops solisten_filtops = {
155	.f_isfd = 1,
156	.f_detach = filt_sordetach,
157	.f_event = filt_solisten,
158};
159static struct filterops soread_filtops = {
160	.f_isfd = 1,
161	.f_detach = filt_sordetach,
162	.f_event = filt_soread,
163};
164static struct filterops sowrite_filtops = {
165	.f_isfd = 1,
166	.f_detach = filt_sowdetach,
167	.f_event = filt_sowrite,
168};
169
170uma_zone_t socket_zone;
171so_gen_t	so_gencnt;	/* generation count for sockets */
172
173int	maxsockets;
174
175MALLOC_DEFINE(M_SONAME, "soname", "socket name");
176MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
177
178static int somaxconn = SOMAXCONN;
179static int sysctl_somaxconn(SYSCTL_HANDLER_ARGS);
180/* XXX: we don't have SYSCTL_USHORT */
181SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLTYPE_UINT | CTLFLAG_RW,
182    0, sizeof(int), sysctl_somaxconn, "I", "Maximum pending socket connection "
183    "queue size");
184static int numopensockets;
185SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
186    &numopensockets, 0, "Number of open sockets");
187#ifdef ZERO_COPY_SOCKETS
188/* These aren't static because they're used in other files. */
189int so_zero_copy_send = 1;
190int so_zero_copy_receive = 1;
191SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0,
192    "Zero copy controls");
193SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW,
194    &so_zero_copy_receive, 0, "Enable zero copy receive");
195SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW,
196    &so_zero_copy_send, 0, "Enable zero copy send");
197#endif /* ZERO_COPY_SOCKETS */
198
199/*
200 * accept_mtx locks down per-socket fields relating to accept queues.  See
201 * socketvar.h for an annotation of the protected fields of struct socket.
202 */
203struct mtx accept_mtx;
204MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);
205
206/*
207 * so_global_mtx protects the global so_gencnt and numopensockets counters,
208 * as well as the per-socket so_gencnt field.
209 */
210static struct mtx so_global_mtx;
211MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_global", MTX_DEF);
212
213/*
214 * General IPC sysctl name space, used by sockets and a variety of other IPC
215 * types.
216 */
217SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");
218
219/*
220 * Sysctl to get and set the maximum global sockets limit.  Notify protocols
221 * of the change so that they can update their dependent limits as required.
222 */
223static int
224sysctl_maxsockets(SYSCTL_HANDLER_ARGS)
225{
226	int error, newmaxsockets;
227
228	newmaxsockets = maxsockets;
229	error = sysctl_handle_int(oidp, &newmaxsockets, 0, req);
230	if (error == 0 && req->newptr) {
231		if (newmaxsockets > maxsockets) {
232			maxsockets = newmaxsockets;
233			if (maxsockets > ((maxfiles / 4) * 3)) {
234				maxfiles = (maxsockets * 5) / 4;
235				maxfilesperproc = (maxfiles * 9) / 10;
236			}
237			EVENTHANDLER_INVOKE(maxsockets_change);
238		} else
239			error = EINVAL;
240	}
241	return (error);
242}
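/*
 * Worked example of the scaling above (illustrative numbers): with maxfiles
 * at 8000, raising maxsockets to 10000 exceeds the 3/4 threshold
 * (10000 > 6000), so maxfiles is bumped to 12500 and maxfilesperproc to
 * 11250.
 */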
243
244SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT|CTLFLAG_RW,
245    &maxsockets, 0, sysctl_maxsockets, "IU",
246    "Maximum number of sockets avaliable");
247
248/*
249 * Initialise maxsockets.  This SYSINIT must be run after
250 * tunable_mbinit().
251 */
252static void
253init_maxsockets(void *ignored)
254{
255
256	TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
257	maxsockets = imax(maxsockets, imax(maxfiles, nmbclusters));
258}
259SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);
260
261/*
262 * Socket operation routines.  These routines are called by the routines in
263 * sys_socket.c or from a system process, and implement the semantics of
264 * socket operations by switching out to the protocol specific routines.
265 */
266
267/*
268 * Get a socket structure from our zone, and initialize it.  Note that it
269 * would probably be better to allocate socket and PCB at the same time, but
270 * I'm not convinced that all the protocols can be easily modified to do
271 * this.
272 *
273 * soalloc() returns a socket with a ref count of 0.
274 */
275static struct socket *
276soalloc(struct vnet *vnet)
277{
278	struct socket *so;
279
280	so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO);
281	if (so == NULL)
282		return (NULL);
283#ifdef MAC
284	if (mac_socket_init(so, M_NOWAIT) != 0) {
285		uma_zfree(socket_zone, so);
286		return (NULL);
287	}
288#endif
289	SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
290	SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
291	sx_init(&so->so_snd.sb_sx, "so_snd_sx");
292	sx_init(&so->so_rcv.sb_sx, "so_rcv_sx");
293	TAILQ_INIT(&so->so_aiojobq);
294	mtx_lock(&so_global_mtx);
295	so->so_gencnt = ++so_gencnt;
296	++numopensockets;
297#ifdef VIMAGE
298	vnet->vnet_sockcnt++;
299	so->so_vnet = vnet;
300#endif
301	mtx_unlock(&so_global_mtx);
302	return (so);
303}
304
305/*
306 * Free the storage associated with a socket at the socket layer, tear down
307 * locks, labels, etc.  All protocol state is assumed already to have been
308 * torn down (and possibly never set up) by the caller.
309 */
310static void
311sodealloc(struct socket *so)
312{
313
314	KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
315	KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL"));
316
317	mtx_lock(&so_global_mtx);
318	so->so_gencnt = ++so_gencnt;
319	--numopensockets;	/* Could be below, but faster here. */
320#ifdef VIMAGE
321	so->so_vnet->vnet_sockcnt--;
322#endif
323	mtx_unlock(&so_global_mtx);
324	if (so->so_rcv.sb_hiwat)
325		(void)chgsbsize(so->so_cred->cr_uidinfo,
326		    &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
327	if (so->so_snd.sb_hiwat)
328		(void)chgsbsize(so->so_cred->cr_uidinfo,
329		    &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
330#ifdef INET
331	/* Remove accept filter if one is present. */
332	if (so->so_accf != NULL)
333		do_setopt_accept_filter(so, NULL);
334#endif
335#ifdef MAC
336	mac_socket_destroy(so);
337#endif
338	crfree(so->so_cred);
339	sx_destroy(&so->so_snd.sb_sx);
340	sx_destroy(&so->so_rcv.sb_sx);
341	SOCKBUF_LOCK_DESTROY(&so->so_snd);
342	SOCKBUF_LOCK_DESTROY(&so->so_rcv);
343	uma_zfree(socket_zone, so);
344}
345
346/*
347 * socreate returns a socket with a ref count of 1.  The socket should be
348 * closed with soclose().
349 */
350int
351socreate(int dom, struct socket **aso, int type, int proto,
352    struct ucred *cred, struct thread *td)
353{
354	struct protosw *prp;
355	struct socket *so;
356	int error;
357
358	if (proto)
359		prp = pffindproto(dom, proto, type);
360	else
361		prp = pffindtype(dom, type);
362
363	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL ||
364	    prp->pr_usrreqs->pru_attach == pru_attach_notsupp)
365		return (EPROTONOSUPPORT);
366
367	if (prison_check_af(cred, prp->pr_domain->dom_family) != 0)
368		return (EPROTONOSUPPORT);
369
370	if (prp->pr_type != type)
371		return (EPROTOTYPE);
372	so = soalloc(CRED_TO_VNET(cred));
373	if (so == NULL)
374		return (ENOBUFS);
375
376	TAILQ_INIT(&so->so_incomp);
377	TAILQ_INIT(&so->so_comp);
378	so->so_type = type;
379	so->so_cred = crhold(cred);
380	if ((prp->pr_domain->dom_family == PF_INET) ||
381	    (prp->pr_domain->dom_family == PF_ROUTE))
382		so->so_fibnum = td->td_proc->p_fibnum;
383	else
384		so->so_fibnum = 0;
385	so->so_proto = prp;
386#ifdef MAC
387	mac_socket_create(cred, so);
388#endif
389	knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv));
390	knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd));
391	so->so_count = 1;
392	/*
393	 * Auto-sizing of socket buffers is managed by the protocols and
394	 * the appropriate flags must be set in the pru_attach function.
395	 */
396	CURVNET_SET(so->so_vnet);
397	error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
398	CURVNET_RESTORE();
399	if (error) {
400		KASSERT(so->so_count == 1, ("socreate: so_count %d",
401		    so->so_count));
402		so->so_count = 0;
403		sodealloc(so);
404		return (error);
405	}
406	*aso = so;
407	return (0);
408}
409
410#ifdef REGRESSION
411static int regression_sonewconn_earlytest = 1;
412SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW,
413    &regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test");
414#endif
415
416/*
417 * When an attempt at a new connection is noted on a socket which accepts
418 * connections, sonewconn is called.  If the connection is possible (subject
419 * to space constraints, etc.) then we allocate a new structure, properly
420 * linked into the data structure of the original socket, and return this.
421 * Connstatus may be 0, SS_ISCONFIRMING, or SS_ISCONNECTED.
422 *
423 * Note: the ref count on the socket is 0 on return.
424 */
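/*
 * Example (sketch of the usual protocol-side pattern; details vary by
 * protocol): on receipt of a connection request for listen socket 'head',
 * a protocol typically does something like
 *
 *	so = sonewconn(head, 0);
 *	if (so == NULL)
 *		goto drop;
 *	... attach and initialize protocol state for 'so' ...
 *	soisconnected(so);
 *
 * where soisconnected() later moves the socket to the completed queue and
 * wakes up accept().
 */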
425struct socket *
426sonewconn(struct socket *head, int connstatus)
427{
428	struct socket *so;
429	int over;
430
431	ACCEPT_LOCK();
432	over = (head->so_qlen > 3 * head->so_qlimit / 2);
433	ACCEPT_UNLOCK();
434#ifdef REGRESSION
435	if (regression_sonewconn_earlytest && over)
436#else
437	if (over)
438#endif
439		return (NULL);
440	VNET_ASSERT(head->so_vnet);
441	so = soalloc(head->so_vnet);
442	if (so == NULL)
443		return (NULL);
444	if ((head->so_options & SO_ACCEPTFILTER) != 0)
445		connstatus = 0;
446	so->so_head = head;
447	so->so_type = head->so_type;
448	so->so_options = head->so_options &~ SO_ACCEPTCONN;
449	so->so_linger = head->so_linger;
450	so->so_state = head->so_state | SS_NOFDREF;
451	so->so_fibnum = head->so_fibnum;
452	so->so_proto = head->so_proto;
453	so->so_cred = crhold(head->so_cred);
454#ifdef MAC
455	mac_socket_newconn(head, so);
456#endif
457	knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv));
458	knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd));
459	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat) ||
460	    (*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
461		sodealloc(so);
462		return (NULL);
463	}
464	so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
465	so->so_snd.sb_lowat = head->so_snd.sb_lowat;
466	so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
467	so->so_snd.sb_timeo = head->so_snd.sb_timeo;
468	so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
469	so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
470	so->so_state |= connstatus;
471	ACCEPT_LOCK();
472	if (connstatus) {
473		TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
474		so->so_qstate |= SQ_COMP;
475		head->so_qlen++;
476	} else {
477		/*
478		 * Keep removing sockets from the head until there's room for
479		 * us to insert on the tail.  In pre-locking revisions, this
480		 * was a simple if(), but as we could be racing with other
481		 * threads and soabort() requires dropping locks, we must
482		 * loop waiting for the condition to be true.
483		 */
484		while (head->so_incqlen > head->so_qlimit) {
485			struct socket *sp;
486			sp = TAILQ_FIRST(&head->so_incomp);
487			TAILQ_REMOVE(&head->so_incomp, sp, so_list);
488			head->so_incqlen--;
489			sp->so_qstate &= ~SQ_INCOMP;
490			sp->so_head = NULL;
491			ACCEPT_UNLOCK();
492			soabort(sp);
493			ACCEPT_LOCK();
494		}
495		TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
496		so->so_qstate |= SQ_INCOMP;
497		head->so_incqlen++;
498	}
499	ACCEPT_UNLOCK();
500	if (connstatus) {
501		sorwakeup(head);
502		wakeup_one(&head->so_timeo);
503	}
504	return (so);
505}
506
507int
508sobind(struct socket *so, struct sockaddr *nam, struct thread *td)
509{
510	int error;
511
512	CURVNET_SET(so->so_vnet);
513	error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td);
514	CURVNET_RESTORE();
515	return error;
516}
517
518/*
519 * solisten() transitions a socket from a non-listening state to a listening
520 * state, but can also be used to update the listen queue depth on an
521 * existing listen socket.  The protocol will call back into the sockets
522 * layer using solisten_proto_check() and solisten_proto() to check and set
523 * socket-layer listen state.  Call backs are used so that the protocol can
524 * acquire both protocol and socket layer locks in whatever order is required
525 * by the protocol.
526 *
527 * Protocol implementors are advised to hold the socket lock across the
528 * socket-layer test and set to avoid races at the socket layer.
529 */
530int
531solisten(struct socket *so, int backlog, struct thread *td)
532{
533
534	return ((*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td));
535}
536
537int
538solisten_proto_check(struct socket *so)
539{
540
541	SOCK_LOCK_ASSERT(so);
542
543	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
544	    SS_ISDISCONNECTING))
545		return (EINVAL);
546	return (0);
547}
548
549void
550solisten_proto(struct socket *so, int backlog)
551{
552
553	SOCK_LOCK_ASSERT(so);
554
555	if (backlog < 0 || backlog > somaxconn)
556		backlog = somaxconn;
557	so->so_qlimit = backlog;
558	so->so_options |= SO_ACCEPTCONN;
559}
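/*
 * Example (sketch, hypothetical protocol "foo"; the pcb type and lock
 * macros are illustrative): a pru_listen implementation using the
 * call-backs above, taking the protocol lock before the socket lock.
 *
 *	static int
 *	foo_listen(struct socket *so, int backlog, struct thread *td)
 *	{
 *		struct foopcb *pcb = so->so_pcb;
 *		int error;
 *
 *		FOO_LOCK(pcb);
 *		SOCK_LOCK(so);
 *		error = solisten_proto_check(so);
 *		if (error == 0)
 *			solisten_proto(so, backlog);
 *		SOCK_UNLOCK(so);
 *		FOO_UNLOCK(pcb);
 *		return (error);
 *	}
 */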
560
561/*
562 * Attempt to free a socket.  This should really be sotryfree().
563 *
564 * sofree() will succeed if:
565 *
566 * - There are no outstanding file descriptor references or related consumers
567 *   (so_count == 0).
568 *
569 * - The socket has been closed by user space, if ever open (SS_NOFDREF).
570 *
571 * - The protocol does not have an outstanding strong reference on the socket
572 *   (SS_PROTOREF).
573 *
574 * - The socket is not in a completed connection queue, where a process may
575 *   already have been notified of its presence.  Removing it there could make
576 *   the user process block in accept() despite select() saying it was ready.
577 *
578 * Otherwise, it will quietly abort so that a future call to sofree(), when
579 * conditions are right, can succeed.
580 */
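/*
 * Example (sketch): the canonical reference-drop pattern, as used by
 * soclose() below; sorele() either calls sofree() or releases both locks
 * itself, so the caller does not unlock afterwards.
 *
 *	ACCEPT_LOCK();
 *	SOCK_LOCK(so);
 *	sorele(so);
 */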
581void
582sofree(struct socket *so)
583{
584	struct protosw *pr = so->so_proto;
585	struct socket *head;
586
587	ACCEPT_LOCK_ASSERT();
588	SOCK_LOCK_ASSERT(so);
589
590	if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 ||
591	    (so->so_state & SS_PROTOREF) || (so->so_qstate & SQ_COMP)) {
592		SOCK_UNLOCK(so);
593		ACCEPT_UNLOCK();
594		return;
595	}
596
597	head = so->so_head;
598	if (head != NULL) {
599		KASSERT((so->so_qstate & SQ_COMP) != 0 ||
600		    (so->so_qstate & SQ_INCOMP) != 0,
601		    ("sofree: so_head != NULL, but neither SQ_COMP nor "
602		    "SQ_INCOMP"));
603		KASSERT((so->so_qstate & SQ_COMP) == 0 ||
604		    (so->so_qstate & SQ_INCOMP) == 0,
605		    ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
606		TAILQ_REMOVE(&head->so_incomp, so, so_list);
607		head->so_incqlen--;
608		so->so_qstate &= ~SQ_INCOMP;
609		so->so_head = NULL;
610	}
611	KASSERT((so->so_qstate & SQ_COMP) == 0 &&
612	    (so->so_qstate & SQ_INCOMP) == 0,
613	    ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
614	    so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
615	if (so->so_options & SO_ACCEPTCONN) {
616		KASSERT((TAILQ_EMPTY(&so->so_comp)), ("sofree: so_comp populated"));
617		KASSERT((TAILQ_EMPTY(&so->so_incomp)), ("sofree: so_incomp populated"));
618	}
619	SOCK_UNLOCK(so);
620	ACCEPT_UNLOCK();
621
622	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
623		(*pr->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
624	if (pr->pr_usrreqs->pru_detach != NULL)
625		(*pr->pr_usrreqs->pru_detach)(so);
626
627	/*
628	 * From this point on, we assume that no other references to this
629	 * socket exist anywhere else in the stack.  Therefore, no locks need
630	 * to be acquired or held.
631	 *
632	 * We used to do a lot of socket buffer and socket locking here, as
633	 * well as invoke sorflush() and perform wakeups.  The direct calls to
634	 * dom_dispose() and sbrelease_internal() are an inlining of what was
635	 * necessary from sorflush().
636	 *
637	 * Notice that the socket buffer and kqueue state are torn down
638	 * before calling pru_detach().  This means that protocols should not
639	 * assume they can perform socket wakeups, etc., in their detach code.
640	 */
641	sbdestroy(&so->so_snd, so);
642	sbdestroy(&so->so_rcv, so);
643	knlist_destroy(&so->so_rcv.sb_sel.si_note);
644	knlist_destroy(&so->so_snd.sb_sel.si_note);
645	sodealloc(so);
646}
647
648/*
649 * Close a socket on last file table reference removal.  Initiate disconnect
650 * if connected.  Free socket when disconnect complete.
651 *
652 * This function will sorele() the socket.  Note that soclose() may be called
653 * prior to the ref count reaching zero.  The actual socket structure will
654 * not be freed until the ref count reaches zero.
655 */
656int
657soclose(struct socket *so)
658{
659	int error = 0;
660
661	KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));
662
663	CURVNET_SET(so->so_vnet);
664	funsetown(&so->so_sigio);
665	if (so->so_state & SS_ISCONNECTED) {
666		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
667			error = sodisconnect(so);
668			if (error)
669				goto drop;
670		}
671		if (so->so_options & SO_LINGER) {
672			if ((so->so_state & SS_ISDISCONNECTING) &&
673			    (so->so_state & SS_NBIO))
674				goto drop;
675			while (so->so_state & SS_ISCONNECTED) {
676				error = tsleep(&so->so_timeo,
677				    PSOCK | PCATCH, "soclos", so->so_linger * hz);
678				if (error)
679					break;
680			}
681		}
682	}
683
684drop:
685	if (so->so_proto->pr_usrreqs->pru_close != NULL)
686		(*so->so_proto->pr_usrreqs->pru_close)(so);
687	if (so->so_options & SO_ACCEPTCONN) {
688		struct socket *sp;
689		ACCEPT_LOCK();
690		while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
691			TAILQ_REMOVE(&so->so_incomp, sp, so_list);
692			so->so_incqlen--;
693			sp->so_qstate &= ~SQ_INCOMP;
694			sp->so_head = NULL;
695			ACCEPT_UNLOCK();
696			soabort(sp);
697			ACCEPT_LOCK();
698		}
699		while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
700			TAILQ_REMOVE(&so->so_comp, sp, so_list);
701			so->so_qlen--;
702			sp->so_qstate &= ~SQ_COMP;
703			sp->so_head = NULL;
704			ACCEPT_UNLOCK();
705			soabort(sp);
706			ACCEPT_LOCK();
707		}
708		ACCEPT_UNLOCK();
709	}
710	ACCEPT_LOCK();
711	SOCK_LOCK(so);
712	KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
713	so->so_state |= SS_NOFDREF;
714	sorele(so);
715	CURVNET_RESTORE();
716	return (error);
717}
718
719/*
720 * soabort() is used to abruptly tear down a connection, such as when a
721 * resource limit is reached (listen queue depth exceeded), or if a listen
722 * socket is closed while there are sockets waiting to be accepted.
723 *
724 * This interface is tricky, because it is called on an unreferenced socket,
725 * and must be called only by a thread that has actually removed the socket
726 * from the listen queue it was on, or races with other threads are risked.
727 *
728 * This interface will call into the protocol code, so must not be called
729 * with any socket locks held.  Protocols do call it while holding their own
730 * recursible protocol mutexes, but this is something that should be subject
731 * to review in the future.
732 */
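/*
 * Example (the pattern used by sonewconn() and soclose() in this file): a
 * socket is unlinked from the incomplete queue under ACCEPT_LOCK, and only
 * then aborted with no locks held.
 *
 *	ACCEPT_LOCK();
 *	sp = TAILQ_FIRST(&head->so_incomp);
 *	TAILQ_REMOVE(&head->so_incomp, sp, so_list);
 *	head->so_incqlen--;
 *	sp->so_qstate &= ~SQ_INCOMP;
 *	sp->so_head = NULL;
 *	ACCEPT_UNLOCK();
 *	soabort(sp);
 */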
733void
734soabort(struct socket *so)
735{
736
737	/*
738	 * In as much as is possible, assert that no references to this
739	 * socket are held.  This is not quite the same as asserting that the
740	 * current thread is responsible for arranging for no references, but
741	 * is as close as we can get for now.
742	 */
743	KASSERT(so->so_count == 0, ("soabort: so_count"));
744	KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF"));
745	KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF"));
746	KASSERT((so->so_qstate & SQ_COMP) == 0, ("soabort: SQ_COMP"));
747	KASSERT((so->so_qstate & SQ_INCOMP) == 0, ("soabort: SQ_INCOMP"));
748
749	if (so->so_proto->pr_usrreqs->pru_abort != NULL)
750		(*so->so_proto->pr_usrreqs->pru_abort)(so);
751	ACCEPT_LOCK();
752	SOCK_LOCK(so);
753	sofree(so);
754}
755
756int
757soaccept(struct socket *so, struct sockaddr **nam)
758{
759	int error;
760
761	SOCK_LOCK(so);
762	KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
763	so->so_state &= ~SS_NOFDREF;
764	SOCK_UNLOCK(so);
765	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
766	return (error);
767}
768
769int
770soconnect(struct socket *so, struct sockaddr *nam, struct thread *td)
771{
772	int error;
773
774	if (so->so_options & SO_ACCEPTCONN)
775		return (EOPNOTSUPP);
776
777	CURVNET_SET(so->so_vnet);
778	/*
779	 * If protocol is connection-based, can only connect once.
780	 * Otherwise, if connected, try to disconnect first.  This allows
781	 * user to disconnect by connecting to, e.g., a null address.
782	 */
783	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
784	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
785	    (error = sodisconnect(so)))) {
786		error = EISCONN;
787	} else {
788		/*
789		 * Prevent accumulated error from previous connection from
790		 * biting us.
791		 */
792		so->so_error = 0;
793		error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td);
794	}
795	CURVNET_RESTORE();
796
797	return (error);
798}
799
800int
801soconnect2(struct socket *so1, struct socket *so2)
802{
803
804	return ((*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2));
805}
806
807int
808sodisconnect(struct socket *so)
809{
810	int error;
811
812	if ((so->so_state & SS_ISCONNECTED) == 0)
813		return (ENOTCONN);
814	if (so->so_state & SS_ISDISCONNECTING)
815		return (EALREADY);
816	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
817	return (error);
818}
819
820#ifdef ZERO_COPY_SOCKETS
821struct so_zerocopy_stats{
822	int size_ok;
823	int align_ok;
824	int found_ifp;
825};
826struct so_zerocopy_stats so_zerocp_stats = {0,0,0};
827#include <netinet/in.h>
828#include <net/route.h>
829#include <netinet/in_pcb.h>
830#include <vm/vm.h>
831#include <vm/vm_page.h>
832#include <vm/vm_object.h>
833
834/*
835 * sosend_copyin() is only used if zero copy sockets are enabled.  Otherwise
836 * sosend_dgram() and sosend_generic() use m_uiotombuf().
837 *
838 * sosend_copyin() accepts a uio and prepares an mbuf chain holding part or
839 * all of the data referenced by the uio.  If desired, it uses zero-copy.
840 * *space will be updated to reflect data copied in.
841 *
842 * NB: If atomic I/O is requested, the caller must already have checked that
843 * space can hold resid bytes.
844 *
845 * NB: In the event of an error, the caller may need to free the partial
846 * chain pointed to by *mpp.  The contents of both *uio and *space may be
847 * modified even in the case of an error.
848 */
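/*
 * For comparison (illustrative): with zero copy compiled out, the
 * equivalent of a sosend_copyin() pass is a single call such as the one
 * sosend_dgram() makes below:
 *
 *	top = m_uiotombuf(uio, M_WAITOK, space, max_hdr,
 *	    (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0)));
 */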
849static int
850sosend_copyin(struct uio *uio, struct mbuf **retmp, int atomic, long *space,
851    int flags)
852{
853	struct mbuf *m, **mp, *top;
854	long len, resid;
855	int error;
856#ifdef ZERO_COPY_SOCKETS
857	int cow_send;
858#endif
859
860	*retmp = top = NULL;
861	mp = &top;
862	len = 0;
863	resid = uio->uio_resid;
864	error = 0;
865	do {
866#ifdef ZERO_COPY_SOCKETS
867		cow_send = 0;
868#endif /* ZERO_COPY_SOCKETS */
869		if (resid >= MINCLSIZE) {
870#ifdef ZERO_COPY_SOCKETS
871			if (top == NULL) {
872				m = m_gethdr(M_WAITOK, MT_DATA);
873				m->m_pkthdr.len = 0;
874				m->m_pkthdr.rcvif = NULL;
875			} else
876				m = m_get(M_WAITOK, MT_DATA);
877			if (so_zero_copy_send &&
878			    resid>=PAGE_SIZE &&
879			    *space>=PAGE_SIZE &&
880			    uio->uio_iov->iov_len>=PAGE_SIZE) {
881				so_zerocp_stats.size_ok++;
882				so_zerocp_stats.align_ok++;
883				cow_send = socow_setup(m, uio);
884				len = cow_send;
885			}
886			if (!cow_send) {
887				m_clget(m, M_WAITOK);
888				len = min(min(MCLBYTES, resid), *space);
889			}
890#else /* ZERO_COPY_SOCKETS */
891			if (top == NULL) {
892				m = m_getcl(M_WAIT, MT_DATA, M_PKTHDR);
893				m->m_pkthdr.len = 0;
894				m->m_pkthdr.rcvif = NULL;
895			} else
896				m = m_getcl(M_WAIT, MT_DATA, 0);
897			len = min(min(MCLBYTES, resid), *space);
898#endif /* ZERO_COPY_SOCKETS */
899		} else {
900			if (top == NULL) {
901				m = m_gethdr(M_WAIT, MT_DATA);
902				m->m_pkthdr.len = 0;
903				m->m_pkthdr.rcvif = NULL;
904
905				len = min(min(MHLEN, resid), *space);
906				/*
907				 * For datagram protocols, leave room
908				 * for protocol headers in first mbuf.
909				 */
910				if (atomic && m && len < MHLEN)
911					MH_ALIGN(m, len);
912			} else {
913				m = m_get(M_WAIT, MT_DATA);
914				len = min(min(MLEN, resid), *space);
915			}
916		}
917		if (m == NULL) {
918			error = ENOBUFS;
919			goto out;
920		}
921
922		*space -= len;
923#ifdef ZERO_COPY_SOCKETS
924		if (cow_send)
925			error = 0;
926		else
927#endif /* ZERO_COPY_SOCKETS */
928		error = uiomove(mtod(m, void *), (int)len, uio);
929		resid = uio->uio_resid;
930		m->m_len = len;
931		*mp = m;
932		top->m_pkthdr.len += len;
933		if (error)
934			goto out;
935		mp = &m->m_next;
936		if (resid <= 0) {
937			if (flags & MSG_EOR)
938				top->m_flags |= M_EOR;
939			break;
940		}
941	} while (*space > 0 && atomic);
942out:
943	*retmp = top;
944	return (error);
945}
946#endif /*ZERO_COPY_SOCKETS*/
947
948#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
949
950int
951sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio,
952    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
953{
954	long space, resid;
955	int clen = 0, error, dontroute;
956#ifdef ZERO_COPY_SOCKETS
957	int atomic = sosendallatonce(so) || top;
958#endif
959
960	KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM"));
961	KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
962	    ("sosend_dgram: !PR_ATOMIC"));
963
964	if (uio != NULL)
965		resid = uio->uio_resid;
966	else
967		resid = top->m_pkthdr.len;
968	/*
969	 * In theory resid should be unsigned.  However, space must be
970	 * signed, as it might be less than 0 if we over-committed, and we
971	 * must use a signed comparison of space and resid.  On the other
972	 * hand, a negative resid causes us to loop sending 0-length
973	 * segments to the protocol.
974	 */
975	if (resid < 0) {
976		error = EINVAL;
977		goto out;
978	}
979
980	dontroute =
981	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0;
982	if (td != NULL)
983		td->td_ru.ru_msgsnd++;
984	if (control != NULL)
985		clen = control->m_len;
986
987	SOCKBUF_LOCK(&so->so_snd);
988	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
989		SOCKBUF_UNLOCK(&so->so_snd);
990		error = EPIPE;
991		goto out;
992	}
993	if (so->so_error) {
994		error = so->so_error;
995		so->so_error = 0;
996		SOCKBUF_UNLOCK(&so->so_snd);
997		goto out;
998	}
999	if ((so->so_state & SS_ISCONNECTED) == 0) {
1000		/*
1001		 * `sendto' and `sendmsg' are allowed on a connection-based
1002		 * socket if it supports implied connect.  Return ENOTCONN if
1003		 * not connected and no address is supplied.
1004		 */
1005		if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
1006		    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
1007			if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1008			    !(resid == 0 && clen != 0)) {
1009				SOCKBUF_UNLOCK(&so->so_snd);
1010				error = ENOTCONN;
1011				goto out;
1012			}
1013		} else if (addr == NULL) {
1014			if (so->so_proto->pr_flags & PR_CONNREQUIRED)
1015				error = ENOTCONN;
1016			else
1017				error = EDESTADDRREQ;
1018			SOCKBUF_UNLOCK(&so->so_snd);
1019			goto out;
1020		}
1021	}
1022
1023	/*
1024	 * Do we need MSG_OOB support in SOCK_DGRAM?  Signedness here may be a
1025	 * problem and need fixing.
1026	 */
1027	space = sbspace(&so->so_snd);
1028	if (flags & MSG_OOB)
1029		space += 1024;
1030	space -= clen;
1031	SOCKBUF_UNLOCK(&so->so_snd);
1032	if (resid > space) {
1033		error = EMSGSIZE;
1034		goto out;
1035	}
1036	if (uio == NULL) {
1037		resid = 0;
1038		if (flags & MSG_EOR)
1039			top->m_flags |= M_EOR;
1040	} else {
1041#ifdef ZERO_COPY_SOCKETS
1042		error = sosend_copyin(uio, &top, atomic, &space, flags);
1043		if (error)
1044			goto out;
1045#else
1046		/*
1047		 * Copy the data from userland into a mbuf chain.
1048		 * If no data is to be copied in, a single empty mbuf
1049		 * is returned.
1050		 */
1051		top = m_uiotombuf(uio, M_WAITOK, space, max_hdr,
1052		    (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0)));
1053		if (top == NULL) {
1054			error = EFAULT;	/* only possible error */
1055			goto out;
1056		}
1057		space -= resid - uio->uio_resid;
1058#endif
1059		resid = uio->uio_resid;
1060	}
1061	KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
1062	/*
1063	 * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock
1064	 * than with.
1065	 */
1066	if (dontroute) {
1067		SOCK_LOCK(so);
1068		so->so_options |= SO_DONTROUTE;
1069		SOCK_UNLOCK(so);
1070	}
1071	/*
1072	 * XXX all the SBS_CANTSENDMORE checks previously done could be out
1073	 * of date.  We could have received a reset packet in an interrupt or
1074	 * maybe we slept while doing page faults in uiomove() etc.  We could
1075	 * probably recheck again inside the locking protection here, but
1076	 * there are probably other places that this also happens.  We must
1077	 * rethink this.
1078	 */
1079	error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1080	    (flags & MSG_OOB) ? PRUS_OOB :
1081	/*
1082 * If the user set MSG_EOF, the protocol understands this flag, and there
1083 * is nothing left to send, then use PRU_SEND_EOF instead of PRU_SEND.
1084	 */
1085	    ((flags & MSG_EOF) &&
1086	     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1087	     (resid <= 0)) ?
1088		PRUS_EOF :
1089		/* If there is more to send set PRUS_MORETOCOME */
1090		(resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
1091		top, addr, control, td);
1092	if (dontroute) {
1093		SOCK_LOCK(so);
1094		so->so_options &= ~SO_DONTROUTE;
1095		SOCK_UNLOCK(so);
1096	}
1097	clen = 0;
1098	control = NULL;
1099	top = NULL;
1100out:
1101	if (top != NULL)
1102		m_freem(top);
1103	if (control != NULL)
1104		m_freem(control);
1105	return (error);
1106}
1107
1108/*
1109 * Send on a socket.  If send must go all at once and message is larger than
1110 * send buffering, then hard error.  Lock against other senders.  If must go
1111 * all at once and not enough room now, then inform user that this would
1112 * block and do nothing.  Otherwise, if nonblocking, send as much as
1113 * possible.  The data to be sent is described by "uio" if nonzero, otherwise
1114 * by the mbuf chain "top" (which must be null if uio is not).  Data provided
1115 * in mbuf chain must be small enough to send all at once.
1116 *
1117 * Returns nonzero on error, timeout or signal; callers must check for short
1118 * counts if EINTR/ERESTART are returned.  Data and control buffers are freed
1119 * on return.
1120 */
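/*
 * Example (sketch): a kernel caller sending a buffer 'buf' of length 'len'
 * with sosend(); the uio setup shown is the usual kernel idiom and the
 * variable names are illustrative.
 *
 *	struct uio auio;
 *	struct iovec aiov;
 *
 *	aiov.iov_base = buf;
 *	aiov.iov_len = len;
 *	auio.uio_iov = &aiov;
 *	auio.uio_iovcnt = 1;
 *	auio.uio_offset = 0;
 *	auio.uio_resid = len;
 *	auio.uio_segflg = UIO_SYSSPACE;
 *	auio.uio_rw = UIO_WRITE;
 *	auio.uio_td = td;
 *	error = sosend(so, NULL, &auio, NULL, NULL, 0, td);
 */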
1121int
1122sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio,
1123    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
1124{
1125	long space, resid;
1126	int clen = 0, error, dontroute;
1127	int atomic = sosendallatonce(so) || top;
1128
1129	if (uio != NULL)
1130		resid = uio->uio_resid;
1131	else
1132		resid = top->m_pkthdr.len;
1133	/*
1134	 * In theory resid should be unsigned.  However, space must be
1135	 * signed, as it might be less than 0 if we over-committed, and we
1136	 * must use a signed comparison of space and resid.  On the other
1137	 * hand, a negative resid causes us to loop sending 0-length
1138	 * segments to the protocol.
1139	 *
1140	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
1141	 * type sockets since that's an error.
1142	 */
1143	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
1144		error = EINVAL;
1145		goto out;
1146	}
1147
1148	dontroute =
1149	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
1150	    (so->so_proto->pr_flags & PR_ATOMIC);
1151	if (td != NULL)
1152		td->td_ru.ru_msgsnd++;
1153	if (control != NULL)
1154		clen = control->m_len;
1155
1156	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1157	if (error)
1158		goto out;
1159
1160restart:
1161	do {
1162		SOCKBUF_LOCK(&so->so_snd);
1163		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
1164			SOCKBUF_UNLOCK(&so->so_snd);
1165			error = EPIPE;
1166			goto release;
1167		}
1168		if (so->so_error) {
1169			error = so->so_error;
1170			so->so_error = 0;
1171			SOCKBUF_UNLOCK(&so->so_snd);
1172			goto release;
1173		}
1174		if ((so->so_state & SS_ISCONNECTED) == 0) {
1175			/*
1176			 * `sendto' and `sendmsg' are allowed on a connection-
1177			 * based socket if it supports implied connect.
1178			 * Return ENOTCONN if not connected and no address is
1179			 * supplied.
1180			 */
1181			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
1182			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
1183				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1184				    !(resid == 0 && clen != 0)) {
1185					SOCKBUF_UNLOCK(&so->so_snd);
1186					error = ENOTCONN;
1187					goto release;
1188				}
1189			} else if (addr == NULL) {
1190				SOCKBUF_UNLOCK(&so->so_snd);
1191				if (so->so_proto->pr_flags & PR_CONNREQUIRED)
1192					error = ENOTCONN;
1193				else
1194					error = EDESTADDRREQ;
1195				goto release;
1196			}
1197		}
1198		space = sbspace(&so->so_snd);
1199		if (flags & MSG_OOB)
1200			space += 1024;
1201		if ((atomic && resid > so->so_snd.sb_hiwat) ||
1202		    clen > so->so_snd.sb_hiwat) {
1203			SOCKBUF_UNLOCK(&so->so_snd);
1204			error = EMSGSIZE;
1205			goto release;
1206		}
1207		if (space < resid + clen &&
1208		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
1209			if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) {
1210				SOCKBUF_UNLOCK(&so->so_snd);
1211				error = EWOULDBLOCK;
1212				goto release;
1213			}
1214			error = sbwait(&so->so_snd);
1215			SOCKBUF_UNLOCK(&so->so_snd);
1216			if (error)
1217				goto release;
1218			goto restart;
1219		}
1220		SOCKBUF_UNLOCK(&so->so_snd);
1221		space -= clen;
1222		do {
1223			if (uio == NULL) {
1224				resid = 0;
1225				if (flags & MSG_EOR)
1226					top->m_flags |= M_EOR;
1227			} else {
1228#ifdef ZERO_COPY_SOCKETS
1229				error = sosend_copyin(uio, &top, atomic,
1230				    &space, flags);
1231				if (error != 0)
1232					goto release;
1233#else
1234				/*
1235				 * Copy the data from userland into a mbuf
1236				 * chain.  If no data is to be copied in,
1237				 * a single empty mbuf is returned.
1238				 */
1239				top = m_uiotombuf(uio, M_WAITOK, space,
1240				    (atomic ? max_hdr : 0),
1241				    (atomic ? M_PKTHDR : 0) |
1242				    ((flags & MSG_EOR) ? M_EOR : 0));
1243				if (top == NULL) {
1244					error = EFAULT; /* only possible error */
1245					goto release;
1246				}
1247				space -= resid - uio->uio_resid;
1248#endif
1249				resid = uio->uio_resid;
1250			}
1251			if (dontroute) {
1252				SOCK_LOCK(so);
1253				so->so_options |= SO_DONTROUTE;
1254				SOCK_UNLOCK(so);
1255			}
1256			/*
1257			 * XXX all the SBS_CANTSENDMORE checks previously
1258			 * done could be out of date.  We could have received
1259			 * a reset packet in an interrupt or maybe we slept
1260			 * while doing page faults in uiomove() etc.  We
1261			 * could probably recheck again inside the locking
1262			 * protection here, but there are probably other
1263			 * places that this also happens.  We must rethink
1264			 * this.
1265			 */
1266			error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1267			    (flags & MSG_OOB) ? PRUS_OOB :
1268			/*
1269			 * If the user set MSG_EOF, the protocol understands
1270			 * this flag, and there is nothing left to send, then use
1271			 * PRU_SEND_EOF instead of PRU_SEND.
1272			 */
1273			    ((flags & MSG_EOF) &&
1274			     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1275			     (resid <= 0)) ?
1276				PRUS_EOF :
1277			/* If there is more to send set PRUS_MORETOCOME. */
1278			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
1279			    top, addr, control, td);
1280			if (dontroute) {
1281				SOCK_LOCK(so);
1282				so->so_options &= ~SO_DONTROUTE;
1283				SOCK_UNLOCK(so);
1284			}
1285			clen = 0;
1286			control = NULL;
1287			top = NULL;
1288			if (error)
1289				goto release;
1290		} while (resid && space > 0);
1291	} while (resid);
1292
1293release:
1294	sbunlock(&so->so_snd);
1295out:
1296	if (top != NULL)
1297		m_freem(top);
1298	if (control != NULL)
1299		m_freem(control);
1300	return (error);
1301}
1302
1303int
1304sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
1305    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
1306{
1307	int error;
1308
1309	CURVNET_SET(so->so_vnet);
1310	error = so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top,
1311	    control, flags, td);
1312	CURVNET_RESTORE();
1313	return (error);
1314}
1315
1316/*
1317 * The part of soreceive() that implements reading non-inline out-of-band
1318 * data from a socket.  For more complete comments, see soreceive(), from
1319 * which this code originated.
1320 *
1321 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
1322 * unable to return an mbuf chain to the caller.
1323 */
1324static int
1325soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
1326{
1327	struct protosw *pr = so->so_proto;
1328	struct mbuf *m;
1329	int error;
1330
1331	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
1332
1333	m = m_get(M_WAIT, MT_DATA);
1334	error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
1335	if (error)
1336		goto bad;
1337	do {
1338#ifdef ZERO_COPY_SOCKETS
1339		if (so_zero_copy_receive) {
1340			int disposable;
1341
1342			if ((m->m_flags & M_EXT)
1343			 && (m->m_ext.ext_type == EXT_DISPOSABLE))
1344				disposable = 1;
1345			else
1346				disposable = 0;
1347
1348			error = uiomoveco(mtod(m, void *),
1349					  min(uio->uio_resid, m->m_len),
1350					  uio, disposable);
1351		} else
1352#endif /* ZERO_COPY_SOCKETS */
1353		error = uiomove(mtod(m, void *),
1354		    (int) min(uio->uio_resid, m->m_len), uio);
1355		m = m_free(m);
1356	} while (uio->uio_resid && error == 0 && m);
1357bad:
1358	if (m != NULL)
1359		m_freem(m);
1360	return (error);
1361}
1362
1363/*
1364 * Following replacement or removal of the first mbuf on the first mbuf chain
1365 * of a socket buffer, push necessary state changes back into the socket
1366 * buffer so that other consumers see the values consistently.  'nextrecord'
1367 * is the callers locally stored value of the original value of
1368 * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
1369 * NOTE: 'nextrecord' may be NULL.
1370 */
1371static __inline void
1372sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
1373{
1374
1375	SOCKBUF_LOCK_ASSERT(sb);
1376	/*
1377	 * First, update for the new value of nextrecord.  If necessary, make
1378	 * it the first record.
1379	 */
1380	if (sb->sb_mb != NULL)
1381		sb->sb_mb->m_nextpkt = nextrecord;
1382	else
1383		sb->sb_mb = nextrecord;
1384
1385	/*
1386	 * Now update any dependent socket buffer fields to reflect the new
1387	 * state.  This is an expanded inline of SB_EMPTY_FIXUP(), with the
1388	 * addition of a second clause that takes care of the case where
1389	 * sb_mb has been updated, but remains the last record.
1390	 */
1391	if (sb->sb_mb == NULL) {
1392		sb->sb_mbtail = NULL;
1393		sb->sb_lastrecord = NULL;
1394	} else if (sb->sb_mb->m_nextpkt == NULL)
1395		sb->sb_lastrecord = sb->sb_mb;
1396}
1397
1398
1399/*
1400 * Implement receive operations on a socket.  We depend on the way that
1401 * records are added to the sockbuf by sbappend.  In particular, each record
1402 * (mbufs linked through m_next) must begin with an address if the protocol
1403 * so specifies, followed by an optional mbuf or mbufs containing ancillary
1404 * data, and then zero or more mbufs of data.  In order to allow parallelism
1405 * between network receive and copying to user space, as well as avoid
1406 * sleeping with a mutex held, we release the socket buffer mutex during the
1407 * user space copy.  Although the sockbuf is locked, new data may still be
1408 * appended, and thus we must maintain consistency of the sockbuf during that
1409 * time.
1410 *
1411 * The caller may receive the data as a single mbuf chain by supplying an
1412 * mbuf **mp0 for use in returning the chain.  The uio is then used only for
1413 * the count in uio_resid.
1414 */
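/*
 * Example (sketch): receiving up to 'len' bytes as an mbuf chain rather
 * than copying through the uio; as noted above, the uio is then used only
 * for the count in uio_resid.  Variable names are illustrative.
 *
 *	struct uio auio;
 *	struct mbuf *m = NULL;
 *	int flags = 0;
 *
 *	bzero(&auio, sizeof(auio));
 *	auio.uio_resid = len;
 *	auio.uio_td = td;
 *	error = soreceive(so, NULL, &auio, &m, NULL, &flags);
 */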
1415int
1416soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio,
1417    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1418{
1419	struct mbuf *m, **mp;
1420	int flags, len, error, offset;
1421	struct protosw *pr = so->so_proto;
1422	struct mbuf *nextrecord;
1423	int moff, type = 0;
1424	int orig_resid = uio->uio_resid;
1425
1426	mp = mp0;
1427	if (psa != NULL)
1428		*psa = NULL;
1429	if (controlp != NULL)
1430		*controlp = NULL;
1431	if (flagsp != NULL)
1432		flags = *flagsp &~ MSG_EOR;
1433	else
1434		flags = 0;
1435	if (flags & MSG_OOB)
1436		return (soreceive_rcvoob(so, uio, flags));
1437	if (mp != NULL)
1438		*mp = NULL;
1439	if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING)
1440	    && uio->uio_resid)
1441		(*pr->pr_usrreqs->pru_rcvd)(so, 0);
1442
1443	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
1444	if (error)
1445		return (error);
1446
1447restart:
1448	SOCKBUF_LOCK(&so->so_rcv);
1449	m = so->so_rcv.sb_mb;
1450	/*
1451	 * If we have less data than requested, block awaiting more (subject
1452	 * to any timeout) if:
1453	 *   1. the current count is less than the low water mark, or
1454	 *   2. MSG_WAITALL is set, and it is possible to do the entire
1455	 *	receive operation at once if we block (resid <= hiwat), and
1456	 *   3. MSG_DONTWAIT is not set.
1457	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
1458	 * we have to do the receive in sections, and thus risk returning a
1459	 * short count if a timeout or signal occurs after we start.
1460	 */
1461	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
1462	    so->so_rcv.sb_cc < uio->uio_resid) &&
1463	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
1464	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
1465	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
1466		KASSERT(m != NULL || !so->so_rcv.sb_cc,
1467		    ("receive: m == %p so->so_rcv.sb_cc == %u",
1468		    m, so->so_rcv.sb_cc));
1469		if (so->so_error) {
1470			if (m != NULL)
1471				goto dontblock;
1472			error = so->so_error;
1473			if ((flags & MSG_PEEK) == 0)
1474				so->so_error = 0;
1475			SOCKBUF_UNLOCK(&so->so_rcv);
1476			goto release;
1477		}
1478		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1479		if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
1480			if (m == NULL) {
1481				SOCKBUF_UNLOCK(&so->so_rcv);
1482				goto release;
1483			} else
1484				goto dontblock;
1485		}
1486		for (; m != NULL; m = m->m_next)
1487			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
1488				m = so->so_rcv.sb_mb;
1489				goto dontblock;
1490			}
1491		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
1492		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
1493			SOCKBUF_UNLOCK(&so->so_rcv);
1494			error = ENOTCONN;
1495			goto release;
1496		}
1497		if (uio->uio_resid == 0) {
1498			SOCKBUF_UNLOCK(&so->so_rcv);
1499			goto release;
1500		}
1501		if ((so->so_state & SS_NBIO) ||
1502		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
1503			SOCKBUF_UNLOCK(&so->so_rcv);
1504			error = EWOULDBLOCK;
1505			goto release;
1506		}
1507		SBLASTRECORDCHK(&so->so_rcv);
1508		SBLASTMBUFCHK(&so->so_rcv);
1509		error = sbwait(&so->so_rcv);
1510		SOCKBUF_UNLOCK(&so->so_rcv);
1511		if (error)
1512			goto release;
1513		goto restart;
1514	}
1515dontblock:
1516	/*
1517	 * From this point onward, we maintain 'nextrecord' as a cache of the
1518	 * pointer to the next record in the socket buffer.  We must keep the
1519	 * various socket buffer pointers and local stack versions of the
1520	 * pointers in sync, pushing out modifications before dropping the
1521	 * socket buffer mutex, and re-reading them when picking it up.
1522	 *
1523	 * Otherwise, we will race with the network stack appending new data
1524	 * or records onto the socket buffer by using inconsistent/stale
1525	 * versions of the field, possibly resulting in socket buffer
1526	 * corruption.
1527	 *
1528	 * By holding the high-level sblock(), we prevent simultaneous
1529	 * readers from pulling off the front of the socket buffer.
1530	 */
1531	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1532	if (uio->uio_td)
1533		uio->uio_td->td_ru.ru_msgrcv++;
1534	KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
1535	SBLASTRECORDCHK(&so->so_rcv);
1536	SBLASTMBUFCHK(&so->so_rcv);
1537	nextrecord = m->m_nextpkt;
1538	if (pr->pr_flags & PR_ADDR) {
1539		KASSERT(m->m_type == MT_SONAME,
1540		    ("m->m_type == %d", m->m_type));
1541		orig_resid = 0;
1542		if (psa != NULL)
1543			*psa = sodupsockaddr(mtod(m, struct sockaddr *),
1544			    M_NOWAIT);
1545		if (flags & MSG_PEEK) {
1546			m = m->m_next;
1547		} else {
1548			sbfree(&so->so_rcv, m);
1549			so->so_rcv.sb_mb = m_free(m);
1550			m = so->so_rcv.sb_mb;
1551			sockbuf_pushsync(&so->so_rcv, nextrecord);
1552		}
1553	}
1554
1555	/*
1556	 * Process one or more MT_CONTROL mbufs present before any data mbufs
1557	 * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
1558	 * just copy the data; if !MSG_PEEK, we call into the protocol to
1559	 * perform externalization (or freeing if controlp == NULL).
1560	 */
1561	if (m != NULL && m->m_type == MT_CONTROL) {
1562		struct mbuf *cm = NULL, *cmn;
1563		struct mbuf **cme = &cm;
1564
1565		do {
1566			if (flags & MSG_PEEK) {
1567				if (controlp != NULL) {
1568					*controlp = m_copy(m, 0, m->m_len);
1569					controlp = &(*controlp)->m_next;
1570				}
1571				m = m->m_next;
1572			} else {
1573				sbfree(&so->so_rcv, m);
1574				so->so_rcv.sb_mb = m->m_next;
1575				m->m_next = NULL;
1576				*cme = m;
1577				cme = &(*cme)->m_next;
1578				m = so->so_rcv.sb_mb;
1579			}
1580		} while (m != NULL && m->m_type == MT_CONTROL);
1581		if ((flags & MSG_PEEK) == 0)
1582			sockbuf_pushsync(&so->so_rcv, nextrecord);
1583		while (cm != NULL) {
1584			cmn = cm->m_next;
1585			cm->m_next = NULL;
1586			if (pr->pr_domain->dom_externalize != NULL) {
1587				SOCKBUF_UNLOCK(&so->so_rcv);
1588				error = (*pr->pr_domain->dom_externalize)
1589				    (cm, controlp);
1590				SOCKBUF_LOCK(&so->so_rcv);
1591			} else if (controlp != NULL)
1592				*controlp = cm;
1593			else
1594				m_freem(cm);
1595			if (controlp != NULL) {
1596				orig_resid = 0;
1597				while (*controlp != NULL)
1598					controlp = &(*controlp)->m_next;
1599			}
1600			cm = cmn;
1601		}
1602		if (m != NULL)
1603			nextrecord = so->so_rcv.sb_mb->m_nextpkt;
1604		else
1605			nextrecord = so->so_rcv.sb_mb;
1606		orig_resid = 0;
1607	}
1608	if (m != NULL) {
1609		if ((flags & MSG_PEEK) == 0) {
1610			KASSERT(m->m_nextpkt == nextrecord,
1611			    ("soreceive: post-control, nextrecord !sync"));
1612			if (nextrecord == NULL) {
1613				KASSERT(so->so_rcv.sb_mb == m,
1614				    ("soreceive: post-control, sb_mb!=m"));
1615				KASSERT(so->so_rcv.sb_lastrecord == m,
1616				    ("soreceive: post-control, lastrecord!=m"));
1617			}
1618		}
1619		type = m->m_type;
1620		if (type == MT_OOBDATA)
1621			flags |= MSG_OOB;
1622	} else {
1623		if ((flags & MSG_PEEK) == 0) {
1624			KASSERT(so->so_rcv.sb_mb == nextrecord,
1625			    ("soreceive: sb_mb != nextrecord"));
1626			if (so->so_rcv.sb_mb == NULL) {
1627				KASSERT(so->so_rcv.sb_lastrecord == NULL,
1628				    ("soreceive: sb_lastercord != NULL"));
1629			}
1630		}
1631	}
1632	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1633	SBLASTRECORDCHK(&so->so_rcv);
1634	SBLASTMBUFCHK(&so->so_rcv);
1635
1636	/*
1637	 * Now continue to read any data mbufs off of the head of the socket
1638	 * buffer until the read request is satisfied.  Note that 'type' is
1639	 * used to store the type of any mbuf reads that have happened so far
1640	 * such that soreceive() can stop reading if the type changes, which
1641	 * causes soreceive() to return only one of regular data and inline
1642	 * out-of-band data in a single socket receive operation.
1643	 */
1644	moff = 0;
1645	offset = 0;
1646	while (m != NULL && uio->uio_resid > 0 && error == 0) {
1647		/*
1648		 * If the type of mbuf has changed since the last mbuf
1649		 * examined ('type'), end the receive operation.
1650		 */
1651		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1652		if (m->m_type == MT_OOBDATA) {
1653			if (type != MT_OOBDATA)
1654				break;
1655		} else if (type == MT_OOBDATA)
1656			break;
1657		else
1658		    KASSERT(m->m_type == MT_DATA,
1659			("m->m_type == %d", m->m_type));
1660		so->so_rcv.sb_state &= ~SBS_RCVATMARK;
1661		len = uio->uio_resid;
1662		if (so->so_oobmark && len > so->so_oobmark - offset)
1663			len = so->so_oobmark - offset;
1664		if (len > m->m_len - moff)
1665			len = m->m_len - moff;
1666		/*
1667		 * If mp is set, just pass back the mbufs.  Otherwise copy
1668		 * them out via the uio, then free.  The sockbuf must be
1669		 * consistent here (sb_mb points to the current mbuf and
1670		 * nextrecord to the next record) when we drop the lock; we
1671		 * must note any additions to the sockbuf when we reacquire it.
1672		 */
1673		if (mp == NULL) {
1674			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1675			SBLASTRECORDCHK(&so->so_rcv);
1676			SBLASTMBUFCHK(&so->so_rcv);
1677			SOCKBUF_UNLOCK(&so->so_rcv);
1678#ifdef ZERO_COPY_SOCKETS
1679			if (so_zero_copy_receive) {
1680				int disposable;
1681
1682				if ((m->m_flags & M_EXT)
1683				 && (m->m_ext.ext_type == EXT_DISPOSABLE))
1684					disposable = 1;
1685				else
1686					disposable = 0;
1687
1688				error = uiomoveco(mtod(m, char *) + moff,
1689						  (int)len, uio,
1690						  disposable);
1691			} else
1692#endif /* ZERO_COPY_SOCKETS */
1693			error = uiomove(mtod(m, char *) + moff, (int)len, uio);
1694			SOCKBUF_LOCK(&so->so_rcv);
1695			if (error) {
1696				/*
1697				 * The MT_SONAME mbuf has already been removed
1698				 * from the record, so it is necessary to
1699				 * remove the data mbufs, if any, to preserve
1700				 * the invariant in the case of PR_ADDR that
1701				 * requires MT_SONAME mbufs at the head of
1702				 * each record.
1703				 */
1704				if (m && pr->pr_flags & PR_ATOMIC &&
1705				    ((flags & MSG_PEEK) == 0))
1706					(void)sbdroprecord_locked(&so->so_rcv);
1707				SOCKBUF_UNLOCK(&so->so_rcv);
1708				goto release;
1709			}
1710		} else
1711			uio->uio_resid -= len;
1712		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1713		if (len == m->m_len - moff) {
1714			if (m->m_flags & M_EOR)
1715				flags |= MSG_EOR;
1716			if (flags & MSG_PEEK) {
1717				m = m->m_next;
1718				moff = 0;
1719			} else {
1720				nextrecord = m->m_nextpkt;
1721				sbfree(&so->so_rcv, m);
1722				if (mp != NULL) {
1723					*mp = m;
1724					mp = &m->m_next;
1725					so->so_rcv.sb_mb = m = m->m_next;
1726					*mp = NULL;
1727				} else {
1728					so->so_rcv.sb_mb = m_free(m);
1729					m = so->so_rcv.sb_mb;
1730				}
1731				sockbuf_pushsync(&so->so_rcv, nextrecord);
1732				SBLASTRECORDCHK(&so->so_rcv);
1733				SBLASTMBUFCHK(&so->so_rcv);
1734			}
1735		} else {
1736			if (flags & MSG_PEEK)
1737				moff += len;
1738			else {
1739				if (mp != NULL) {
1740					int copy_flag;
1741
1742					if (flags & MSG_DONTWAIT)
1743						copy_flag = M_DONTWAIT;
1744					else
1745						copy_flag = M_WAIT;
1746					if (copy_flag == M_WAIT)
1747						SOCKBUF_UNLOCK(&so->so_rcv);
1748					*mp = m_copym(m, 0, len, copy_flag);
1749					if (copy_flag == M_WAIT)
1750						SOCKBUF_LOCK(&so->so_rcv);
1751 					if (*mp == NULL) {
1752 						/*
1753 						 * m_copym() couldn't
1754						 * allocate an mbuf.  Adjust
1755						 * uio_resid back (it was
1756						 * adjusted down by len
1757						 * bytes, which we didn't end
1758						 * up "copying" over).
1759 						 */
1760 						uio->uio_resid += len;
1761 						break;
1762 					}
1763				}
1764				m->m_data += len;
1765				m->m_len -= len;
1766				so->so_rcv.sb_cc -= len;
1767			}
1768		}
1769		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1770		if (so->so_oobmark) {
1771			if ((flags & MSG_PEEK) == 0) {
1772				so->so_oobmark -= len;
1773				if (so->so_oobmark == 0) {
1774					so->so_rcv.sb_state |= SBS_RCVATMARK;
1775					break;
1776				}
1777			} else {
1778				offset += len;
1779				if (offset == so->so_oobmark)
1780					break;
1781			}
1782		}
1783		if (flags & MSG_EOR)
1784			break;
		/*
		 * If the MSG_WAITALL flag is set (for a non-atomic socket),
		 * we must not quit until "uio->uio_resid == 0" or an error
		 * terminates the receive.  If a signal/timeout occurs,
		 * return with a short count but without error.  Keep the
		 * sockbuf locked against other readers.
		 */
1792		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
1793		    !sosendallatonce(so) && nextrecord == NULL) {
1794			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1795			if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE)
1796				break;
1797			/*
1798			 * Notify the protocol that some data has been
1799			 * drained before blocking.
1800			 */
1801			if (pr->pr_flags & PR_WANTRCVD) {
1802				SOCKBUF_UNLOCK(&so->so_rcv);
1803				(*pr->pr_usrreqs->pru_rcvd)(so, flags);
1804				SOCKBUF_LOCK(&so->so_rcv);
1805			}
1806			SBLASTRECORDCHK(&so->so_rcv);
1807			SBLASTMBUFCHK(&so->so_rcv);
1808			error = sbwait(&so->so_rcv);
1809			if (error) {
1810				SOCKBUF_UNLOCK(&so->so_rcv);
1811				goto release;
1812			}
1813			m = so->so_rcv.sb_mb;
1814			if (m != NULL)
1815				nextrecord = m->m_nextpkt;
1816		}
1817	}
1818
1819	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1820	if (m != NULL && pr->pr_flags & PR_ATOMIC) {
1821		flags |= MSG_TRUNC;
1822		if ((flags & MSG_PEEK) == 0)
1823			(void) sbdroprecord_locked(&so->so_rcv);
1824	}
1825	if ((flags & MSG_PEEK) == 0) {
1826		if (m == NULL) {
1827			/*
1828			 * First part is an inline SB_EMPTY_FIXUP().  Second
1829			 * part makes sure sb_lastrecord is up-to-date if
1830			 * there is still data in the socket buffer.
1831			 */
1832			so->so_rcv.sb_mb = nextrecord;
1833			if (so->so_rcv.sb_mb == NULL) {
1834				so->so_rcv.sb_mbtail = NULL;
1835				so->so_rcv.sb_lastrecord = NULL;
1836			} else if (nextrecord->m_nextpkt == NULL)
1837				so->so_rcv.sb_lastrecord = nextrecord;
1838		}
1839		SBLASTRECORDCHK(&so->so_rcv);
1840		SBLASTMBUFCHK(&so->so_rcv);
		/*
		 * If soreceive() is being done from the socket callback,
		 * then we don't need to generate an ACK to the peer to
		 * update the window, since the ACK will be generated on
		 * return to TCP.
		 */
1846		if (!(flags & MSG_SOCALLBCK) &&
1847		    (pr->pr_flags & PR_WANTRCVD)) {
1848			SOCKBUF_UNLOCK(&so->so_rcv);
1849			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
1850			SOCKBUF_LOCK(&so->so_rcv);
1851		}
1852	}
1853	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1854	if (orig_resid == uio->uio_resid && orig_resid &&
1855	    (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
1856		SOCKBUF_UNLOCK(&so->so_rcv);
1857		goto restart;
1858	}
1859	SOCKBUF_UNLOCK(&so->so_rcv);
1860
1861	if (flagsp != NULL)
1862		*flagsp |= flags;
1863release:
1864	sbunlock(&so->so_rcv);
1865	return (error);
1866}
1867
1868/*
1869 * Optimized version of soreceive() for stream (TCP) sockets.
1870 */
1871#ifdef TCP_SORECEIVE_STREAM
1872int
1873soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio,
1874    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1875{
1876	int len = 0, error = 0, flags, oresid;
1877	struct sockbuf *sb;
1878	struct mbuf *m, *n = NULL;
1879
1880	/* We only do stream sockets. */
1881	if (so->so_type != SOCK_STREAM)
1882		return (EINVAL);
1883	if (psa != NULL)
1884		*psa = NULL;
1885	if (controlp != NULL)
1886		return (EINVAL);
1887	if (flagsp != NULL)
1888		flags = *flagsp &~ MSG_EOR;
1889	else
1890		flags = 0;
1891	if (flags & MSG_OOB)
1892		return (soreceive_rcvoob(so, uio, flags));
1893	if (mp0 != NULL)
1894		*mp0 = NULL;
1895
1896	sb = &so->so_rcv;
1897
1898	/* Prevent other readers from entering the socket. */
1899	error = sblock(sb, SBLOCKWAIT(flags));
1900	if (error)
1901		goto out;
1902	SOCKBUF_LOCK(sb);
1903
1904	/* Easy one, no space to copyout anything. */
1905	if (uio->uio_resid == 0) {
1906		error = EINVAL;
1907		goto out;
1908	}
1909	oresid = uio->uio_resid;
1910
1911	/* We will never ever get anything unless we are connected. */
1912	if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
		/* When disconnecting there may still be some data left. */
1914		if (sb->sb_cc > 0)
1915			goto deliver;
1916		if (!(so->so_state & SS_ISDISCONNECTED))
1917			error = ENOTCONN;
1918		goto out;
1919	}
1920
1921	/* Socket buffer is empty and we shall not block. */
1922	if (sb->sb_cc == 0 &&
	    ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
1924		error = EAGAIN;
1925		goto out;
1926	}
1927
1928restart:
1929	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1930
1931	/* Abort if socket has reported problems. */
1932	if (so->so_error) {
1933		if (sb->sb_cc > 0)
1934			goto deliver;
1935		if (oresid > uio->uio_resid)
1936			goto out;
1937		error = so->so_error;
1938		if (!(flags & MSG_PEEK))
1939			so->so_error = 0;
1940		goto out;
1941	}
1942
1943	/* Door is closed.  Deliver what is left, if any. */
1944	if (sb->sb_state & SBS_CANTRCVMORE) {
1945		if (sb->sb_cc > 0)
1946			goto deliver;
1947		else
1948			goto out;
1949	}
1950
	/* The socket buffer has data that we shall deliver now. */
	if (sb->sb_cc > 0 && !(flags & MSG_WAITALL) &&
	    ((so->so_state & SS_NBIO) ||
	     (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
	     sb->sb_cc >= sb->sb_lowat ||
	     sb->sb_cc >= uio->uio_resid ||
	     sb->sb_cc >= sb->sb_hiwat)) {
1958		goto deliver;
1959	}
1960
1961	/* On MSG_WAITALL we must wait until all data or error arrives. */
1962	if ((flags & MSG_WAITALL) &&
1963	    (sb->sb_cc >= uio->uio_resid || sb->sb_cc >= sb->sb_lowat))
1964		goto deliver;
1965
1966	/*
1967	 * Wait and block until (more) data comes in.
1968	 * NB: Drops the sockbuf lock during wait.
1969	 */
1970	error = sbwait(sb);
1971	if (error)
1972		goto out;
1973	goto restart;
1974
1975deliver:
1976	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1977	KASSERT(sb->sb_cc > 0, ("%s: sockbuf empty", __func__));
1978	KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));
1979
1980	/* Statistics. */
1981	if (uio->uio_td)
1982		uio->uio_td->td_ru.ru_msgrcv++;
1983
1984	/* Fill uio until full or current end of socket buffer is reached. */
1985	len = min(uio->uio_resid, sb->sb_cc);
1986	if (mp0 != NULL) {
1987		/* Dequeue as many mbufs as possible. */
1988		if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
1989			for (*mp0 = m = sb->sb_mb;
1990			     m != NULL && m->m_len <= len;
1991			     m = m->m_next) {
1992				len -= m->m_len;
1993				uio->uio_resid -= m->m_len;
1994				sbfree(sb, m);
1995				n = m;
1996			}
1997			sb->sb_mb = m;
1998			if (sb->sb_mb == NULL)
1999				SB_EMPTY_FIXUP(sb);
2000			n->m_next = NULL;
2001		}
2002		/* Copy the remainder. */
2003		if (len > 0) {
2004			KASSERT(sb->sb_mb != NULL,
2005			    ("%s: len > 0 && sb->sb_mb empty", __func__));
2006
2007			m = m_copym(sb->sb_mb, 0, len, M_DONTWAIT);
2008			if (m == NULL)
2009				len = 0;	/* Don't flush data from sockbuf. */
2010			else
2011				uio->uio_resid -= m->m_len;
2012			if (*mp0 != NULL)
2013				n->m_next = m;
2014			else
2015				*mp0 = m;
2016			if (*mp0 == NULL) {
2017				error = ENOBUFS;
2018				goto out;
2019			}
2020		}
2021	} else {
2022		/* NB: Must unlock socket buffer as uiomove may sleep. */
2023		SOCKBUF_UNLOCK(sb);
2024		error = m_mbuftouio(uio, sb->sb_mb, len);
2025		SOCKBUF_LOCK(sb);
2026		if (error)
2027			goto out;
2028	}
2029	SBLASTRECORDCHK(sb);
2030	SBLASTMBUFCHK(sb);
2031
2032	/*
2033	 * Remove the delivered data from the socket buffer unless we
2034	 * were only peeking.
2035	 */
2036	if (!(flags & MSG_PEEK)) {
2037		if (len > 0)
2038			sbdrop_locked(sb, len);
2039
2040		/* Notify protocol that we drained some data. */
2041		if ((so->so_proto->pr_flags & PR_WANTRCVD) &&
2042		    (((flags & MSG_WAITALL) && uio->uio_resid > 0) ||
2043		     !(flags & MSG_SOCALLBCK))) {
2044			SOCKBUF_UNLOCK(sb);
2045			(*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags);
2046			SOCKBUF_LOCK(sb);
2047		}
2048	}
2049
2050	/*
2051	 * For MSG_WAITALL we may have to loop again and wait for
2052	 * more data to come in.
2053	 */
2054	if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
2055		goto restart;
2056out:
2057	SOCKBUF_LOCK_ASSERT(sb);
2058	SBLASTRECORDCHK(sb);
2059	SBLASTMBUFCHK(sb);
2060	SOCKBUF_UNLOCK(sb);
2061	sbunlock(sb);
2062	return (error);
2063}
2064#endif /* TCP_SORECEIVE_STREAM */
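
/*
 * Example (illustrative sketch, not compiled): a stream protocol can
 * select the optimized path above by pointing the pru_soreceive entry of
 * its pr_usrreqs at soreceive_stream(), as TCP does when the kernel is
 * built with TCP_SORECEIVE_STREAM.
 */
#if 0
	.pru_soreceive =	soreceive_stream,
#endif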
2065
2066/*
2067 * Optimized version of soreceive() for simple datagram cases from userspace.
2068 * Unlike in the stream case, we're able to drop a datagram if copyout()
2069 * fails, and because we handle datagrams atomically, we don't need to use a
2070 * sleep lock to prevent I/O interlacing.
2071 */
2072int
2073soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio,
2074    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2075{
2076	struct mbuf *m, *m2;
2077	int flags, len, error;
2078	struct protosw *pr = so->so_proto;
2079	struct mbuf *nextrecord;
2080
2081	if (psa != NULL)
2082		*psa = NULL;
2083	if (controlp != NULL)
2084		*controlp = NULL;
2085	if (flagsp != NULL)
2086		flags = *flagsp &~ MSG_EOR;
2087	else
2088		flags = 0;
2089
2090	/*
2091	 * For any complicated cases, fall back to the full
2092	 * soreceive_generic().
2093	 */
2094	if (mp0 != NULL || (flags & MSG_PEEK) || (flags & MSG_OOB))
2095		return (soreceive_generic(so, psa, uio, mp0, controlp,
2096		    flagsp));
2097
2098	/*
2099	 * Enforce restrictions on use.
2100	 */
2101	KASSERT((pr->pr_flags & PR_WANTRCVD) == 0,
2102	    ("soreceive_dgram: wantrcvd"));
2103	KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic"));
2104	KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0,
2105	    ("soreceive_dgram: SBS_RCVATMARK"));
2106	KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0,
	    ("soreceive_dgram: PR_CONNREQUIRED"));
2108
2109	/*
2110	 * Loop blocking while waiting for a datagram.
2111	 */
2112	SOCKBUF_LOCK(&so->so_rcv);
2113	while ((m = so->so_rcv.sb_mb) == NULL) {
2114		KASSERT(so->so_rcv.sb_cc == 0,
2115		    ("soreceive_dgram: sb_mb NULL but sb_cc %u",
2116		    so->so_rcv.sb_cc));
2117		if (so->so_error) {
2118			error = so->so_error;
2119			so->so_error = 0;
2120			SOCKBUF_UNLOCK(&so->so_rcv);
2121			return (error);
2122		}
2123		if (so->so_rcv.sb_state & SBS_CANTRCVMORE ||
2124		    uio->uio_resid == 0) {
2125			SOCKBUF_UNLOCK(&so->so_rcv);
2126			return (0);
2127		}
2128		if ((so->so_state & SS_NBIO) ||
2129		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
2130			SOCKBUF_UNLOCK(&so->so_rcv);
2131			return (EWOULDBLOCK);
2132		}
2133		SBLASTRECORDCHK(&so->so_rcv);
2134		SBLASTMBUFCHK(&so->so_rcv);
2135		error = sbwait(&so->so_rcv);
2136		if (error) {
2137			SOCKBUF_UNLOCK(&so->so_rcv);
2138			return (error);
2139		}
2140	}
2141	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2142
2143	if (uio->uio_td)
2144		uio->uio_td->td_ru.ru_msgrcv++;
2145	SBLASTRECORDCHK(&so->so_rcv);
2146	SBLASTMBUFCHK(&so->so_rcv);
2147	nextrecord = m->m_nextpkt;
2148	if (nextrecord == NULL) {
2149		KASSERT(so->so_rcv.sb_lastrecord == m,
2150		    ("soreceive_dgram: lastrecord != m"));
2151	}
2152
2153	KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord,
2154	    ("soreceive_dgram: m_nextpkt != nextrecord"));
2155
2156	/*
2157	 * Pull 'm' and its chain off the front of the packet queue.
2158	 */
2159	so->so_rcv.sb_mb = NULL;
2160	sockbuf_pushsync(&so->so_rcv, nextrecord);
2161
2162	/*
2163	 * Walk 'm's chain and free that many bytes from the socket buffer.
2164	 */
2165	for (m2 = m; m2 != NULL; m2 = m2->m_next)
2166		sbfree(&so->so_rcv, m2);
2167
2168	/*
2169	 * Do a few last checks before we let go of the lock.
2170	 */
2171	SBLASTRECORDCHK(&so->so_rcv);
2172	SBLASTMBUFCHK(&so->so_rcv);
2173	SOCKBUF_UNLOCK(&so->so_rcv);
2174
2175	if (pr->pr_flags & PR_ADDR) {
2176		KASSERT(m->m_type == MT_SONAME,
2177		    ("m->m_type == %d", m->m_type));
2178		if (psa != NULL)
2179			*psa = sodupsockaddr(mtod(m, struct sockaddr *),
2180			    M_NOWAIT);
2181		m = m_free(m);
2182	}
2183	if (m == NULL) {
2184		/* XXXRW: Can this happen? */
2185		return (0);
2186	}
2187
2188	/*
2189	 * Packet to copyout() is now in 'm' and it is disconnected from the
2190	 * queue.
2191	 *
2192	 * Process one or more MT_CONTROL mbufs present before any data mbufs
2193	 * in the first mbuf chain on the socket buffer.  We call into the
2194	 * protocol to perform externalization (or freeing if controlp ==
2195	 * NULL).
2196	 */
2197	if (m->m_type == MT_CONTROL) {
2198		struct mbuf *cm = NULL, *cmn;
2199		struct mbuf **cme = &cm;
2200
2201		do {
2202			m2 = m->m_next;
2203			m->m_next = NULL;
2204			*cme = m;
2205			cme = &(*cme)->m_next;
2206			m = m2;
2207		} while (m != NULL && m->m_type == MT_CONTROL);
2208		while (cm != NULL) {
2209			cmn = cm->m_next;
2210			cm->m_next = NULL;
2211			if (pr->pr_domain->dom_externalize != NULL) {
2212				error = (*pr->pr_domain->dom_externalize)
2213				    (cm, controlp);
2214			} else if (controlp != NULL)
2215				*controlp = cm;
2216			else
2217				m_freem(cm);
2218			if (controlp != NULL) {
2219				while (*controlp != NULL)
2220					controlp = &(*controlp)->m_next;
2221			}
2222			cm = cmn;
2223		}
2224	}
2225	KASSERT(m->m_type == MT_DATA, ("soreceive_dgram: !data"));
2226
2227	while (m != NULL && uio->uio_resid > 0) {
2228		len = uio->uio_resid;
2229		if (len > m->m_len)
2230			len = m->m_len;
2231		error = uiomove(mtod(m, char *), (int)len, uio);
2232		if (error) {
2233			m_freem(m);
2234			return (error);
2235		}
2236		m = m_free(m);
2237	}
2238	if (m != NULL)
2239		flags |= MSG_TRUNC;
2240	m_freem(m);
2241	if (flagsp != NULL)
2242		*flagsp |= flags;
2243	return (0);
2244}
2245
2246int
2247soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
2248    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2249{
2250
2251	return (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio, mp0,
2252	    controlp, flagsp));
2253}
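
/*
 * Example (illustrative sketch, not compiled): an in-kernel consumer
 * reading into a local buffer through soreceive().  The helper name
 * "example_kernel_read" is hypothetical.
 */
#if 0
static int
example_kernel_read(struct socket *so, void *buf, size_t *lenp)
{
	struct uio auio;
	struct iovec aiov;
	int error, flags;

	aiov.iov_base = buf;
	aiov.iov_len = *lenp;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = 0;
	auio.uio_resid = *lenp;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_rw = UIO_READ;
	auio.uio_td = curthread;
	flags = MSG_DONTWAIT;
	error = soreceive(so, NULL, &auio, NULL, NULL, &flags);
	*lenp -= auio.uio_resid;	/* bytes actually received */
	return (error);
}
#endif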
2254
2255int
2256soshutdown(struct socket *so, int how)
2257{
2258	struct protosw *pr = so->so_proto;
2259	int error;
2260
2261	if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
2262		return (EINVAL);
2263	if (pr->pr_usrreqs->pru_flush != NULL) {
		(*pr->pr_usrreqs->pru_flush)(so, how);
2265	}
2266	if (how != SHUT_WR)
2267		sorflush(so);
2268	if (how != SHUT_RD) {
2269		CURVNET_SET(so->so_vnet);
2270		error = (*pr->pr_usrreqs->pru_shutdown)(so);
2271		CURVNET_RESTORE();
2272		return (error);
2273	}
2274	return (0);
2275}
2276
2277void
2278sorflush(struct socket *so)
2279{
2280	struct sockbuf *sb = &so->so_rcv;
2281	struct protosw *pr = so->so_proto;
2282	struct sockbuf asb;
2283
2284	/*
2285	 * In order to avoid calling dom_dispose with the socket buffer mutex
2286	 * held, and in order to generally avoid holding the lock for a long
2287	 * time, we make a copy of the socket buffer and clear the original
2288	 * (except locks, state).  The new socket buffer copy won't have
2289	 * initialized locks so we can only call routines that won't use or
2290	 * assert those locks.
2291	 *
2292	 * Dislodge threads currently blocked in receive and wait to acquire
2293	 * a lock against other simultaneous readers before clearing the
2294	 * socket buffer.  Don't let our acquire be interrupted by a signal
2295	 * despite any existing socket disposition on interruptable waiting.
	 * despite any existing socket disposition on interruptible waiting.
2297	CURVNET_SET(so->so_vnet);
2298	socantrcvmore(so);
2299	(void) sblock(sb, SBL_WAIT | SBL_NOINTR);
2300
2301	/*
2302	 * Invalidate/clear most of the sockbuf structure, but leave selinfo
2303	 * and mutex data unchanged.
2304	 */
2305	SOCKBUF_LOCK(sb);
2306	bzero(&asb, offsetof(struct sockbuf, sb_startzero));
2307	bcopy(&sb->sb_startzero, &asb.sb_startzero,
2308	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
2309	bzero(&sb->sb_startzero,
2310	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
2311	SOCKBUF_UNLOCK(sb);
2312	sbunlock(sb);
2313
2314	/*
2315	 * Dispose of special rights and flush the socket buffer.  Don't call
2316	 * any unsafe routines (that rely on locks being initialized) on asb.
2317	 */
2318	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
2319		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
2320	sbrelease_internal(&asb, so);
2321	CURVNET_RESTORE();
2322}
2323
2324/*
2325 * Perhaps this routine, and sooptcopyout(), below, ought to come in an
2326 * additional variant to handle the case where the option value needs to be
2327 * some kind of integer, but not a specific size.  In addition to their use
2328 * here, these functions are also called by the protocol-level pr_ctloutput()
2329 * routines.
2330 */
2331int
2332sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
2333{
2334	size_t	valsize;
2335
2336	/*
2337	 * If the user gives us more than we wanted, we ignore it, but if we
2338	 * don't get the minimum length the caller wants, we return EINVAL.
2339	 * On success, sopt->sopt_valsize is set to however much we actually
2340	 * retrieved.
2341	 */
2342	if ((valsize = sopt->sopt_valsize) < minlen)
2343		return EINVAL;
2344	if (valsize > len)
2345		sopt->sopt_valsize = valsize = len;
2346
2347	if (sopt->sopt_td != NULL)
2348		return (copyin(sopt->sopt_val, buf, valsize));
2349
2350	bcopy(sopt->sopt_val, buf, valsize);
2351	return (0);
2352}
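
/*
 * Example (illustrative sketch, not compiled): the set side of a
 * hypothetical protocol pr_ctloutput() handler pulling in an integer
 * option value with sooptcopyin().  "myproto_ctloutput",
 * "IPPROTO_MYPROTO" and "MYPROTOOPT_FOO" are invented names.
 */
#if 0
static int
myproto_ctloutput(struct socket *so, struct sockopt *sopt)
{
	int error, optval;

	if (sopt->sopt_level != IPPROTO_MYPROTO)
		return (EINVAL);
	if (sopt->sopt_dir == SOPT_SET && sopt->sopt_name == MYPROTOOPT_FOO) {
		error = sooptcopyin(sopt, &optval, sizeof optval,
		    sizeof optval);
		if (error)
			return (error);
		/* ... apply optval to protocol state ... */
		return (0);
	}
	return (ENOPROTOOPT);
}
#endif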
2353
2354/*
2355 * Kernel version of setsockopt(2).
2356 *
2357 * XXX: optlen is size_t, not socklen_t
2358 */
2359int
2360so_setsockopt(struct socket *so, int level, int optname, void *optval,
2361    size_t optlen)
2362{
2363	struct sockopt sopt;
2364
2365	sopt.sopt_level = level;
2366	sopt.sopt_name = optname;
2367	sopt.sopt_dir = SOPT_SET;
2368	sopt.sopt_val = optval;
2369	sopt.sopt_valsize = optlen;
2370	sopt.sopt_td = NULL;
2371	return (sosetopt(so, &sopt));
2372}
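
/*
 * Example (illustrative sketch, not compiled): an in-kernel caller
 * enabling SO_KEEPALIVE on a socket it owns via so_setsockopt().
 */
#if 0
static int
example_set_keepalive(struct socket *so)
{
	int on = 1;

	return (so_setsockopt(so, SOL_SOCKET, SO_KEEPALIVE, &on,
	    sizeof(on)));
}
#endif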
2373
2374int
2375sosetopt(struct socket *so, struct sockopt *sopt)
2376{
2377	int	error, optval;
2378	struct	linger l;
2379	struct	timeval tv;
2380	u_long  val;
2381#ifdef MAC
2382	struct mac extmac;
2383#endif
2384
2385	error = 0;
2386	if (sopt->sopt_level != SOL_SOCKET) {
2387		if (so->so_proto && so->so_proto->pr_ctloutput)
2388			return ((*so->so_proto->pr_ctloutput)
2389				  (so, sopt));
2390		error = ENOPROTOOPT;
2391	} else {
2392		switch (sopt->sopt_name) {
2393#ifdef INET
2394		case SO_ACCEPTFILTER:
2395			error = do_setopt_accept_filter(so, sopt);
2396			if (error)
2397				goto bad;
2398			break;
2399#endif
2400		case SO_LINGER:
2401			error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
2402			if (error)
2403				goto bad;
2404
2405			SOCK_LOCK(so);
2406			so->so_linger = l.l_linger;
2407			if (l.l_onoff)
2408				so->so_options |= SO_LINGER;
2409			else
2410				so->so_options &= ~SO_LINGER;
2411			SOCK_UNLOCK(so);
2412			break;
2413
2414		case SO_DEBUG:
2415		case SO_KEEPALIVE:
2416		case SO_DONTROUTE:
2417		case SO_USELOOPBACK:
2418		case SO_BROADCAST:
2419		case SO_REUSEADDR:
2420		case SO_REUSEPORT:
2421		case SO_OOBINLINE:
2422		case SO_TIMESTAMP:
2423		case SO_BINTIME:
2424		case SO_NOSIGPIPE:
2425		case SO_NO_DDP:
2426		case SO_NO_OFFLOAD:
2427			error = sooptcopyin(sopt, &optval, sizeof optval,
2428					    sizeof optval);
2429			if (error)
2430				goto bad;
2431			SOCK_LOCK(so);
2432			if (optval)
2433				so->so_options |= sopt->sopt_name;
2434			else
2435				so->so_options &= ~sopt->sopt_name;
2436			SOCK_UNLOCK(so);
2437			break;
2438
2439		case SO_SETFIB:
			error = sooptcopyin(sopt, &optval, sizeof optval,
					    sizeof optval);
			if (error)
				goto bad;
			if (optval < 1 || optval > rt_numfibs) {
2443				error = EINVAL;
2444				goto bad;
2445			}
2446			if ((so->so_proto->pr_domain->dom_family == PF_INET) ||
2447			    (so->so_proto->pr_domain->dom_family == PF_ROUTE)) {
2448				so->so_fibnum = optval;
2449				/* Note: ignore error */
2450				if (so->so_proto && so->so_proto->pr_ctloutput)
2451					(*so->so_proto->pr_ctloutput)(so, sopt);
2452			} else {
2453				so->so_fibnum = 0;
2454			}
2455			break;
2456		case SO_SNDBUF:
2457		case SO_RCVBUF:
2458		case SO_SNDLOWAT:
2459		case SO_RCVLOWAT:
2460			error = sooptcopyin(sopt, &optval, sizeof optval,
2461					    sizeof optval);
2462			if (error)
2463				goto bad;
2464
2465			/*
2466			 * Values < 1 make no sense for any of these options,
2467			 * so disallow them.
2468			 */
2469			if (optval < 1) {
2470				error = EINVAL;
2471				goto bad;
2472			}
2473
2474			switch (sopt->sopt_name) {
2475			case SO_SNDBUF:
2476			case SO_RCVBUF:
2477				if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
2478				    &so->so_snd : &so->so_rcv, (u_long)optval,
2479				    so, curthread) == 0) {
2480					error = ENOBUFS;
2481					goto bad;
2482				}
2483				(sopt->sopt_name == SO_SNDBUF ? &so->so_snd :
2484				    &so->so_rcv)->sb_flags &= ~SB_AUTOSIZE;
2485				break;
2486
2487			/*
2488			 * Make sure the low-water is never greater than the
2489			 * high-water.
2490			 */
2491			case SO_SNDLOWAT:
2492				SOCKBUF_LOCK(&so->so_snd);
2493				so->so_snd.sb_lowat =
2494				    (optval > so->so_snd.sb_hiwat) ?
2495				    so->so_snd.sb_hiwat : optval;
2496				SOCKBUF_UNLOCK(&so->so_snd);
2497				break;
2498			case SO_RCVLOWAT:
2499				SOCKBUF_LOCK(&so->so_rcv);
2500				so->so_rcv.sb_lowat =
2501				    (optval > so->so_rcv.sb_hiwat) ?
2502				    so->so_rcv.sb_hiwat : optval;
2503				SOCKBUF_UNLOCK(&so->so_rcv);
2504				break;
2505			}
2506			break;
2507
2508		case SO_SNDTIMEO:
2509		case SO_RCVTIMEO:
2510#ifdef COMPAT_FREEBSD32
2511			if (SV_CURPROC_FLAG(SV_ILP32)) {
2512				struct timeval32 tv32;
2513
2514				error = sooptcopyin(sopt, &tv32, sizeof tv32,
2515				    sizeof tv32);
2516				CP(tv32, tv, tv_sec);
2517				CP(tv32, tv, tv_usec);
2518			} else
2519#endif
2520				error = sooptcopyin(sopt, &tv, sizeof tv,
2521				    sizeof tv);
2522			if (error)
2523				goto bad;
2524
2525			/* assert(hz > 0); */
2526			if (tv.tv_sec < 0 || tv.tv_sec > INT_MAX / hz ||
2527			    tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
2528				error = EDOM;
2529				goto bad;
2530			}
2531			/* assert(tick > 0); */
2532			/* assert(ULONG_MAX - INT_MAX >= 1000000); */
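			/*
			 * Convert the timeout to ticks; e.g. with hz = 1000
			 * (so tick = 1000us), a 2.5 second timeout becomes
			 * 2 * 1000 + 500000 / 1000 = 2500 ticks.
			 */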
2533			val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick;
2534			if (val > INT_MAX) {
2535				error = EDOM;
2536				goto bad;
2537			}
2538			if (val == 0 && tv.tv_usec != 0)
2539				val = 1;
2540
2541			switch (sopt->sopt_name) {
2542			case SO_SNDTIMEO:
2543				so->so_snd.sb_timeo = val;
2544				break;
2545			case SO_RCVTIMEO:
2546				so->so_rcv.sb_timeo = val;
2547				break;
2548			}
2549			break;
2550
2551		case SO_LABEL:
2552#ifdef MAC
2553			error = sooptcopyin(sopt, &extmac, sizeof extmac,
2554			    sizeof extmac);
2555			if (error)
2556				goto bad;
2557			error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
2558			    so, &extmac);
2559#else
2560			error = EOPNOTSUPP;
2561#endif
2562			break;
2563
2564		default:
2565			error = ENOPROTOOPT;
2566			break;
2567		}
2568		if (error == 0 && so->so_proto != NULL &&
2569		    so->so_proto->pr_ctloutput != NULL) {
2570			(void) ((*so->so_proto->pr_ctloutput)
2571				  (so, sopt));
2572		}
2573	}
2574bad:
2575	return (error);
2576}
2577
2578/*
2579 * Helper routine for getsockopt.
2580 */
2581int
2582sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
2583{
2584	int	error;
2585	size_t	valsize;
2586
2587	error = 0;
2588
2589	/*
2590	 * Documented get behavior is that we always return a value, possibly
2591	 * truncated to fit in the user's buffer.  Traditional behavior is
2592	 * that we always tell the user precisely how much we copied, rather
2593	 * than something useful like the total amount we had available for
2594	 * her.  Note that this interface is not idempotent; the entire
	 * answer must be generated ahead of time.
2596	 */
2597	valsize = min(len, sopt->sopt_valsize);
2598	sopt->sopt_valsize = valsize;
2599	if (sopt->sopt_val != NULL) {
2600		if (sopt->sopt_td != NULL)
2601			error = copyout(buf, sopt->sopt_val, valsize);
2602		else
2603			bcopy(buf, sopt->sopt_val, valsize);
2604	}
2605	return (error);
2606}
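
/*
 * Example (illustrative sketch, not compiled): the get side of a
 * hypothetical protocol option handler returning an integer value with
 * sooptcopyout().  "myproto_getopt_foo" is an invented name.
 */
#if 0
static int
myproto_getopt_foo(struct sockopt *sopt, int current_foo)
{

	return (sooptcopyout(sopt, &current_foo, sizeof current_foo));
}
#endif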
2607
2608int
2609sogetopt(struct socket *so, struct sockopt *sopt)
2610{
2611	int	error, optval;
2612	struct	linger l;
2613	struct	timeval tv;
2614#ifdef MAC
2615	struct mac extmac;
2616#endif
2617
2618	error = 0;
2619	if (sopt->sopt_level != SOL_SOCKET) {
2620		if (so->so_proto && so->so_proto->pr_ctloutput) {
2621			return ((*so->so_proto->pr_ctloutput)
2622				  (so, sopt));
2623		} else
2624			return (ENOPROTOOPT);
2625	} else {
2626		switch (sopt->sopt_name) {
2627#ifdef INET
2628		case SO_ACCEPTFILTER:
2629			error = do_getopt_accept_filter(so, sopt);
2630			break;
2631#endif
2632		case SO_LINGER:
2633			SOCK_LOCK(so);
2634			l.l_onoff = so->so_options & SO_LINGER;
2635			l.l_linger = so->so_linger;
2636			SOCK_UNLOCK(so);
2637			error = sooptcopyout(sopt, &l, sizeof l);
2638			break;
2639
2640		case SO_USELOOPBACK:
2641		case SO_DONTROUTE:
2642		case SO_DEBUG:
2643		case SO_KEEPALIVE:
2644		case SO_REUSEADDR:
2645		case SO_REUSEPORT:
2646		case SO_BROADCAST:
2647		case SO_OOBINLINE:
2648		case SO_ACCEPTCONN:
2649		case SO_TIMESTAMP:
2650		case SO_BINTIME:
2651		case SO_NOSIGPIPE:
2652			optval = so->so_options & sopt->sopt_name;
2653integer:
2654			error = sooptcopyout(sopt, &optval, sizeof optval);
2655			break;
2656
2657		case SO_TYPE:
2658			optval = so->so_type;
2659			goto integer;
2660
2661		case SO_ERROR:
2662			SOCK_LOCK(so);
2663			optval = so->so_error;
2664			so->so_error = 0;
2665			SOCK_UNLOCK(so);
2666			goto integer;
2667
2668		case SO_SNDBUF:
2669			optval = so->so_snd.sb_hiwat;
2670			goto integer;
2671
2672		case SO_RCVBUF:
2673			optval = so->so_rcv.sb_hiwat;
2674			goto integer;
2675
2676		case SO_SNDLOWAT:
2677			optval = so->so_snd.sb_lowat;
2678			goto integer;
2679
2680		case SO_RCVLOWAT:
2681			optval = so->so_rcv.sb_lowat;
2682			goto integer;
2683
2684		case SO_SNDTIMEO:
2685		case SO_RCVTIMEO:
2686			optval = (sopt->sopt_name == SO_SNDTIMEO ?
2687				  so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
2688
2689			tv.tv_sec = optval / hz;
2690			tv.tv_usec = (optval % hz) * tick;
2691#ifdef COMPAT_FREEBSD32
2692			if (SV_CURPROC_FLAG(SV_ILP32)) {
2693				struct timeval32 tv32;
2694
2695				CP(tv, tv32, tv_sec);
2696				CP(tv, tv32, tv_usec);
2697				error = sooptcopyout(sopt, &tv32, sizeof tv32);
2698			} else
2699#endif
2700				error = sooptcopyout(sopt, &tv, sizeof tv);
2701			break;
2702
2703		case SO_LABEL:
2704#ifdef MAC
2705			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2706			    sizeof(extmac));
2707			if (error)
2708				return (error);
2709			error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
2710			    so, &extmac);
2711			if (error)
2712				return (error);
2713			error = sooptcopyout(sopt, &extmac, sizeof extmac);
2714#else
2715			error = EOPNOTSUPP;
2716#endif
2717			break;
2718
2719		case SO_PEERLABEL:
2720#ifdef MAC
2721			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2722			    sizeof(extmac));
2723			if (error)
2724				return (error);
2725			error = mac_getsockopt_peerlabel(
2726			    sopt->sopt_td->td_ucred, so, &extmac);
2727			if (error)
2728				return (error);
2729			error = sooptcopyout(sopt, &extmac, sizeof extmac);
2730#else
2731			error = EOPNOTSUPP;
2732#endif
2733			break;
2734
2735		case SO_LISTENQLIMIT:
2736			optval = so->so_qlimit;
2737			goto integer;
2738
2739		case SO_LISTENQLEN:
2740			optval = so->so_qlen;
2741			goto integer;
2742
2743		case SO_LISTENINCQLEN:
2744			optval = so->so_incqlen;
2745			goto integer;
2746
2747		default:
2748			error = ENOPROTOOPT;
2749			break;
2750		}
2751		return (error);
2752	}
2753}
2754
/* XXX: prepare mbuf for (__FreeBSD__ < 3) routines. */
2756int
2757soopt_getm(struct sockopt *sopt, struct mbuf **mp)
2758{
2759	struct mbuf *m, *m_prev;
2760	int sopt_size = sopt->sopt_valsize;
2761
2762	MGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT, MT_DATA);
2763	if (m == NULL)
2764		return ENOBUFS;
2765	if (sopt_size > MLEN) {
2766		MCLGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT);
2767		if ((m->m_flags & M_EXT) == 0) {
2768			m_free(m);
2769			return ENOBUFS;
2770		}
2771		m->m_len = min(MCLBYTES, sopt_size);
2772	} else {
2773		m->m_len = min(MLEN, sopt_size);
2774	}
2775	sopt_size -= m->m_len;
2776	*mp = m;
2777	m_prev = m;
2778
2779	while (sopt_size) {
2780		MGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT, MT_DATA);
2781		if (m == NULL) {
2782			m_freem(*mp);
2783			return ENOBUFS;
2784		}
2785		if (sopt_size > MLEN) {
2786			MCLGET(m, sopt->sopt_td != NULL ? M_WAIT :
2787			    M_DONTWAIT);
2788			if ((m->m_flags & M_EXT) == 0) {
2789				m_freem(m);
2790				m_freem(*mp);
2791				return ENOBUFS;
2792			}
2793			m->m_len = min(MCLBYTES, sopt_size);
2794		} else {
2795			m->m_len = min(MLEN, sopt_size);
2796		}
2797		sopt_size -= m->m_len;
2798		m_prev->m_next = m;
2799		m_prev = m;
2800	}
2801	return (0);
2802}
2803
/* XXX: copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
2805int
2806soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
2807{
2808	struct mbuf *m0 = m;
2809
2810	if (sopt->sopt_val == NULL)
2811		return (0);
2812	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2813		if (sopt->sopt_td != NULL) {
2814			int error;
2815
2816			error = copyin(sopt->sopt_val, mtod(m, char *),
2817				       m->m_len);
2818			if (error != 0) {
2819				m_freem(m0);
				return (error);
2821			}
2822		} else
2823			bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
2824		sopt->sopt_valsize -= m->m_len;
2825		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2826		m = m->m_next;
2827	}
	/* The chain should have been allocated with enough space. */
	if (m != NULL)
		panic("ip6_sooptmcopyin");
2830	return (0);
2831}
2832
/* XXX: copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
2834int
2835soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
2836{
2837	struct mbuf *m0 = m;
2838	size_t valsize = 0;
2839
2840	if (sopt->sopt_val == NULL)
2841		return (0);
2842	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2843		if (sopt->sopt_td != NULL) {
2844			int error;
2845
2846			error = copyout(mtod(m, char *), sopt->sopt_val,
2847				       m->m_len);
2848			if (error != 0) {
2849				m_freem(m0);
				return (error);
2851			}
2852		} else
2853			bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
		sopt->sopt_valsize -= m->m_len;
		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
		valsize += m->m_len;
		m = m->m_next;
2858	}
2859	if (m != NULL) {
		/* Userland should have supplied a large enough buffer. */
		m_freem(m0);
		return (EINVAL);
2863	}
2864	sopt->sopt_valsize = valsize;
2865	return (0);
2866}
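
/*
 * Example (illustrative sketch, not compiled): staging option data in an
 * mbuf chain with soopt_getm() and soopt_mcopyin() for an old-style
 * option parser.  "example_opt_set" is an invented name.
 */
#if 0
static int
example_opt_set(struct sockopt *sopt)
{
	struct mbuf *m;
	int error;

	error = soopt_getm(sopt, &m);	/* chain sized from sopt_valsize */
	if (error != 0)
		return (error);
	error = soopt_mcopyin(sopt, m);	/* frees the chain on error */
	if (error != 0)
		return (error);
	/* ... parse the option data in 'm' ... */
	m_freem(m);
	return (0);
}
#endif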
2867
2868/*
2869 * sohasoutofband(): protocol notifies socket layer of the arrival of new
2870 * out-of-band data, which will then notify socket consumers.
2871 */
2872void
2873sohasoutofband(struct socket *so)
2874{
2875
2876	if (so->so_sigio != NULL)
2877		pgsigio(&so->so_sigio, SIGURG, 0);
2878	selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
2879}
2880
2881int
2882sopoll(struct socket *so, int events, struct ucred *active_cred,
2883    struct thread *td)
2884{
2885
2886	return (so->so_proto->pr_usrreqs->pru_sopoll(so, events, active_cred,
2887	    td));
2888}
2889
2890int
2891sopoll_generic(struct socket *so, int events, struct ucred *active_cred,
2892    struct thread *td)
2893{
2894	int revents = 0;
2895
2896	SOCKBUF_LOCK(&so->so_snd);
2897	SOCKBUF_LOCK(&so->so_rcv);
2898	if (events & (POLLIN | POLLRDNORM))
2899		if (soreadabledata(so))
2900			revents |= events & (POLLIN | POLLRDNORM);
2901
2902	if (events & (POLLOUT | POLLWRNORM))
2903		if (sowriteable(so))
2904			revents |= events & (POLLOUT | POLLWRNORM);
2905
2906	if (events & (POLLPRI | POLLRDBAND))
2907		if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK))
2908			revents |= events & (POLLPRI | POLLRDBAND);
2909
2910	if ((events & POLLINIGNEOF) == 0) {
2911		if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
2912			revents |= events & (POLLIN | POLLRDNORM);
2913			if (so->so_snd.sb_state & SBS_CANTSENDMORE)
2914				revents |= POLLHUP;
2915		}
2916	}
2917
2918	if (revents == 0) {
2919		if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
2920			selrecord(td, &so->so_rcv.sb_sel);
2921			so->so_rcv.sb_flags |= SB_SEL;
2922		}
2923
2924		if (events & (POLLOUT | POLLWRNORM)) {
2925			selrecord(td, &so->so_snd.sb_sel);
2926			so->so_snd.sb_flags |= SB_SEL;
2927		}
2928	}
2929
2930	SOCKBUF_UNLOCK(&so->so_rcv);
2931	SOCKBUF_UNLOCK(&so->so_snd);
2932	return (revents);
2933}
2934
2935int
2936soo_kqfilter(struct file *fp, struct knote *kn)
2937{
2938	struct socket *so = kn->kn_fp->f_data;
2939	struct sockbuf *sb;
2940
2941	switch (kn->kn_filter) {
2942	case EVFILT_READ:
2943		if (so->so_options & SO_ACCEPTCONN)
2944			kn->kn_fop = &solisten_filtops;
2945		else
2946			kn->kn_fop = &soread_filtops;
2947		sb = &so->so_rcv;
2948		break;
2949	case EVFILT_WRITE:
2950		kn->kn_fop = &sowrite_filtops;
2951		sb = &so->so_snd;
2952		break;
2953	default:
2954		return (EINVAL);
2955	}
2956
2957	SOCKBUF_LOCK(sb);
2958	knlist_add(&sb->sb_sel.si_note, kn, 1);
2959	sb->sb_flags |= SB_KNOTE;
2960	SOCKBUF_UNLOCK(sb);
2961	return (0);
2962}
2963
2964/*
2965 * Some routines that return EOPNOTSUPP for entry points that are not
2966 * supported by a protocol.  Fill in as needed.
2967 */
2968int
2969pru_accept_notsupp(struct socket *so, struct sockaddr **nam)
2970{
2971
2972	return EOPNOTSUPP;
2973}
2974
2975int
2976pru_attach_notsupp(struct socket *so, int proto, struct thread *td)
2977{
2978
2979	return EOPNOTSUPP;
2980}
2981
2982int
2983pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
2984{
2985
2986	return EOPNOTSUPP;
2987}
2988
2989int
2990pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
2991{
2992
2993	return EOPNOTSUPP;
2994}
2995
2996int
2997pru_connect2_notsupp(struct socket *so1, struct socket *so2)
2998{
2999
3000	return EOPNOTSUPP;
3001}
3002
3003int
3004pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data,
3005    struct ifnet *ifp, struct thread *td)
3006{
3007
3008	return EOPNOTSUPP;
3009}
3010
3011int
3012pru_disconnect_notsupp(struct socket *so)
3013{
3014
3015	return EOPNOTSUPP;
3016}
3017
3018int
3019pru_listen_notsupp(struct socket *so, int backlog, struct thread *td)
3020{
3021
3022	return EOPNOTSUPP;
3023}
3024
3025int
3026pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam)
3027{
3028
3029	return EOPNOTSUPP;
3030}
3031
3032int
3033pru_rcvd_notsupp(struct socket *so, int flags)
3034{
3035
3036	return EOPNOTSUPP;
3037}
3038
3039int
3040pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags)
3041{
3042
3043	return EOPNOTSUPP;
3044}
3045
3046int
3047pru_send_notsupp(struct socket *so, int flags, struct mbuf *m,
3048    struct sockaddr *addr, struct mbuf *control, struct thread *td)
3049{
3050
3051	return EOPNOTSUPP;
3052}
3053
3054/*
3055 * This isn't really a ``null'' operation, but it's the default one and
3056 * doesn't do anything destructive.
3057 */
3058int
3059pru_sense_null(struct socket *so, struct stat *sb)
3060{
3061
3062	sb->st_blksize = so->so_snd.sb_hiwat;
3063	return 0;
3064}
3065
3066int
3067pru_shutdown_notsupp(struct socket *so)
3068{
3069
3070	return EOPNOTSUPP;
3071}
3072
3073int
3074pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam)
3075{
3076
3077	return EOPNOTSUPP;
3078}
3079
3080int
3081pru_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio,
3082    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
3083{
3084
3085	return EOPNOTSUPP;
3086}
3087
3088int
3089pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr,
3090    struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
3091{
3092
3093	return EOPNOTSUPP;
3094}
3095
3096int
3097pru_sopoll_notsupp(struct socket *so, int events, struct ucred *cred,
3098    struct thread *td)
3099{
3100
3101	return EOPNOTSUPP;
3102}
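
/*
 * Example (illustrative sketch, not compiled): a minimal protocol wiring
 * the stubs above into its pr_usrreqs.  "myproto_usrreqs" and the
 * myproto_*() functions are invented names.
 */
#if 0
static struct pr_usrreqs myproto_usrreqs = {
	.pru_attach =		myproto_attach,
	.pru_detach =		myproto_detach,
	.pru_send =		myproto_send,
	.pru_accept =		pru_accept_notsupp,
	.pru_connect2 =		pru_connect2_notsupp,
	.pru_rcvoob =		pru_rcvoob_notsupp,
	.pru_sense =		pru_sense_null,
};
#endif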
3103
3104static void
3105filt_sordetach(struct knote *kn)
3106{
3107	struct socket *so = kn->kn_fp->f_data;
3108
3109	SOCKBUF_LOCK(&so->so_rcv);
3110	knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1);
3111	if (knlist_empty(&so->so_rcv.sb_sel.si_note))
3112		so->so_rcv.sb_flags &= ~SB_KNOTE;
3113	SOCKBUF_UNLOCK(&so->so_rcv);
3114}
3115
3116/*ARGSUSED*/
3117static int
3118filt_soread(struct knote *kn, long hint)
3119{
3120	struct socket *so;
3121
3122	so = kn->kn_fp->f_data;
3123	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
3124
3125	kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
3126	if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
3127		kn->kn_flags |= EV_EOF;
3128		kn->kn_fflags = so->so_error;
3129		return (1);
3130	} else if (so->so_error)	/* temporary udp error */
3131		return (1);
3132	else if (kn->kn_sfflags & NOTE_LOWAT)
3133		return (kn->kn_data >= kn->kn_sdata);
3134	else
3135		return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat);
3136}
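
/*
 * Example (illustrative sketch, not compiled): a userland consumer can
 * arm the low-water behavior checked above by registering a read filter
 * with NOTE_LOWAT; "kq" and "sockfd" are assumed descriptors.
 */
#if 0
	struct kevent kev;

	/* Fire only once at least 1024 bytes are ready to read. */
	EV_SET(&kev, sockfd, EVFILT_READ, EV_ADD, NOTE_LOWAT, 1024, NULL);
	error = kevent(kq, &kev, 1, NULL, 0, NULL);
#endif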
3137
3138static void
3139filt_sowdetach(struct knote *kn)
3140{
3141	struct socket *so = kn->kn_fp->f_data;
3142
3143	SOCKBUF_LOCK(&so->so_snd);
3144	knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1);
3145	if (knlist_empty(&so->so_snd.sb_sel.si_note))
3146		so->so_snd.sb_flags &= ~SB_KNOTE;
3147	SOCKBUF_UNLOCK(&so->so_snd);
3148}
3149
3150/*ARGSUSED*/
3151static int
3152filt_sowrite(struct knote *kn, long hint)
3153{
3154	struct socket *so;
3155
3156	so = kn->kn_fp->f_data;
3157	SOCKBUF_LOCK_ASSERT(&so->so_snd);
3158	kn->kn_data = sbspace(&so->so_snd);
3159	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
3160		kn->kn_flags |= EV_EOF;
3161		kn->kn_fflags = so->so_error;
3162		return (1);
3163	} else if (so->so_error)	/* temporary udp error */
3164		return (1);
3165	else if (((so->so_state & SS_ISCONNECTED) == 0) &&
3166	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
3167		return (0);
3168	else if (kn->kn_sfflags & NOTE_LOWAT)
3169		return (kn->kn_data >= kn->kn_sdata);
3170	else
3171		return (kn->kn_data >= so->so_snd.sb_lowat);
3172}
3173
3174/*ARGSUSED*/
3175static int
3176filt_solisten(struct knote *kn, long hint)
3177{
3178	struct socket *so = kn->kn_fp->f_data;
3179
3180	kn->kn_data = so->so_qlen;
3181	return (! TAILQ_EMPTY(&so->so_comp));
3182}
3183
3184int
3185socheckuid(struct socket *so, uid_t uid)
3186{
3187
3188	if (so == NULL)
3189		return (EPERM);
3190	if (so->so_cred->cr_uid != uid)
3191		return (EPERM);
3192	return (0);
3193}
3194
3195static int
3196sysctl_somaxconn(SYSCTL_HANDLER_ARGS)
3197{
3198	int error;
3199	int val;
3200
3201	val = somaxconn;
3202	error = sysctl_handle_int(oidp, &val, 0, req);
	if (error || !req->newptr)
3204		return (error);
3205
3206	if (val < 1 || val > USHRT_MAX)
3207		return (EINVAL);
3208
3209	somaxconn = val;
3210	return (0);
3211}
3212
3213/*
3214 * These functions are used by protocols to notify the socket layer (and its
3215 * consumers) of state changes in the sockets driven by protocol-side events.
3216 */
3217
3218/*
3219 * Procedures to manipulate state flags of socket and do appropriate wakeups.
3220 *
3221 * Normal sequence from the active (originating) side is that
3222 * soisconnecting() is called during processing of connect() call, resulting
3223 * in an eventual call to soisconnected() if/when the connection is
3224 * established.  When the connection is torn down soisdisconnecting() is
3225 * called during processing of disconnect() call, and soisdisconnected() is
3226 * called when the connection to the peer is totally severed.  The semantics
3227 * of these routines are such that connectionless protocols can call
3228 * soisconnected() and soisdisconnected() only, bypassing the in-progress
3229 * calls when setting up a ``connection'' takes no time.
3230 *
3231 * From the passive side, a socket is created with two queues of sockets:
3232 * so_incomp for connections in progress and so_comp for connections already
3233 * made and awaiting user acceptance.  As a protocol is preparing incoming
3234 * connections, it creates a socket structure queued on so_incomp by calling
3235 * sonewconn().  When the connection is established, soisconnected() is
3236 * called, and transfers the socket structure to so_comp, making it available
3237 * to accept().
3238 *
3239 * If a socket is closed with sockets on either so_incomp or so_comp, these
3240 * sockets are dropped.
3241 *
3242 * If higher-level protocols are implemented in the kernel, the wakeups done
3243 * here will sometimes cause software-interrupt process scheduling.
3244 */
3245void
3246soisconnecting(struct socket *so)
3247{
3248
3249	SOCK_LOCK(so);
3250	so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
3251	so->so_state |= SS_ISCONNECTING;
3252	SOCK_UNLOCK(so);
3253}
3254
3255void
3256soisconnected(struct socket *so)
3257{
3258	struct socket *head;
3259	int ret;
3260
3261restart:
3262	ACCEPT_LOCK();
3263	SOCK_LOCK(so);
3264	so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
3265	so->so_state |= SS_ISCONNECTED;
3266	head = so->so_head;
3267	if (head != NULL && (so->so_qstate & SQ_INCOMP)) {
3268		if ((so->so_options & SO_ACCEPTFILTER) == 0) {
3269			SOCK_UNLOCK(so);
3270			TAILQ_REMOVE(&head->so_incomp, so, so_list);
3271			head->so_incqlen--;
3272			so->so_qstate &= ~SQ_INCOMP;
3273			TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
3274			head->so_qlen++;
3275			so->so_qstate |= SQ_COMP;
3276			ACCEPT_UNLOCK();
3277			sorwakeup(head);
3278			wakeup_one(&head->so_timeo);
3279		} else {
3280			ACCEPT_UNLOCK();
3281			soupcall_set(so, SO_RCV,
3282			    head->so_accf->so_accept_filter->accf_callback,
3283			    head->so_accf->so_accept_filter_arg);
3284			so->so_options &= ~SO_ACCEPTFILTER;
3285			ret = head->so_accf->so_accept_filter->accf_callback(so,
3286			    head->so_accf->so_accept_filter_arg, M_DONTWAIT);
3287			if (ret == SU_ISCONNECTED)
3288				soupcall_clear(so, SO_RCV);
3289			SOCK_UNLOCK(so);
3290			if (ret == SU_ISCONNECTED)
3291				goto restart;
3292		}
3293		return;
3294	}
3295	SOCK_UNLOCK(so);
3296	ACCEPT_UNLOCK();
3297	wakeup(&so->so_timeo);
3298	sorwakeup(so);
3299	sowwakeup(so);
3300}
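
/*
 * Example (illustrative sketch, not compiled): the passive-side sequence
 * as a protocol might drive it when a connection request arrives on the
 * listening socket "head".
 */
#if 0
	struct socket *so;

	so = sonewconn(head, 0);	/* queued on head->so_incomp */
	if (so != NULL) {
		/* ... protocol completes its handshake ... */
		soisconnected(so);	/* moved to so_comp for accept() */
	}
#endif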
3301
3302void
3303soisdisconnecting(struct socket *so)
3304{
3305
3306	/*
3307	 * Note: This code assumes that SOCK_LOCK(so) and
3308	 * SOCKBUF_LOCK(&so->so_rcv) are the same.
3309	 */
3310	SOCKBUF_LOCK(&so->so_rcv);
3311	so->so_state &= ~SS_ISCONNECTING;
3312	so->so_state |= SS_ISDISCONNECTING;
3313	so->so_rcv.sb_state |= SBS_CANTRCVMORE;
3314	sorwakeup_locked(so);
3315	SOCKBUF_LOCK(&so->so_snd);
3316	so->so_snd.sb_state |= SBS_CANTSENDMORE;
3317	sowwakeup_locked(so);
3318	wakeup(&so->so_timeo);
3319}
3320
3321void
3322soisdisconnected(struct socket *so)
3323{
3324
3325	/*
3326	 * Note: This code assumes that SOCK_LOCK(so) and
3327	 * SOCKBUF_LOCK(&so->so_rcv) are the same.
3328	 */
3329	SOCKBUF_LOCK(&so->so_rcv);
3330	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
3331	so->so_state |= SS_ISDISCONNECTED;
3332	so->so_rcv.sb_state |= SBS_CANTRCVMORE;
3333	sorwakeup_locked(so);
3334	SOCKBUF_LOCK(&so->so_snd);
3335	so->so_snd.sb_state |= SBS_CANTSENDMORE;
3336	sbdrop_locked(&so->so_snd, so->so_snd.sb_cc);
3337	sowwakeup_locked(so);
3338	wakeup(&so->so_timeo);
3339}
3340
3341/*
3342 * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
3343 */
3344struct sockaddr *
3345sodupsockaddr(const struct sockaddr *sa, int mflags)
3346{
3347	struct sockaddr *sa2;
3348
3349	sa2 = malloc(sa->sa_len, M_SONAME, mflags);
3350	if (sa2)
3351		bcopy(sa, sa2, sa->sa_len);
3352	return sa2;
3353}
3354
3355/*
3356 * Register per-socket buffer upcalls.
3357 */
3358void
3359soupcall_set(struct socket *so, int which,
3360    int (*func)(struct socket *, void *, int), void *arg)
3361{
3362	struct sockbuf *sb;
3363
3364	switch (which) {
3365	case SO_RCV:
3366		sb = &so->so_rcv;
3367		break;
3368	case SO_SND:
3369		sb = &so->so_snd;
3370		break;
3371	default:
3372		panic("soupcall_set: bad which");
3373	}
3374	SOCKBUF_LOCK_ASSERT(sb);
3375#if 0
3376	/* XXX: accf_http actually wants to do this on purpose. */
3377	KASSERT(sb->sb_upcall == NULL, ("soupcall_set: overwriting upcall"));
3378#endif
3379	sb->sb_upcall = func;
3380	sb->sb_upcallarg = arg;
3381	sb->sb_flags |= SB_UPCALL;
3382}
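
/*
 * Example (illustrative sketch, not compiled): registering a receive
 * upcall that wakes a private wait channel "chan".  "my_rcv_upcall" is
 * an invented name; returning SU_OK keeps the upcall registered.
 */
#if 0
static int
my_rcv_upcall(struct socket *so, void *arg, int waitflag)
{

	wakeup(arg);
	return (SU_OK);
}

	SOCKBUF_LOCK(&so->so_rcv);
	soupcall_set(so, SO_RCV, my_rcv_upcall, &chan);
	SOCKBUF_UNLOCK(&so->so_rcv);
#endif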
3383
3384void
3385soupcall_clear(struct socket *so, int which)
3386{
3387	struct sockbuf *sb;
3388
3389	switch (which) {
3390	case SO_RCV:
3391		sb = &so->so_rcv;
3392		break;
3393	case SO_SND:
3394		sb = &so->so_snd;
3395		break;
3396	default:
3397		panic("soupcall_clear: bad which");
3398	}
3399	SOCKBUF_LOCK_ASSERT(sb);
3400	KASSERT(sb->sb_upcall != NULL, ("soupcall_clear: no upcall to clear"));
3401	sb->sb_upcall = NULL;
3402	sb->sb_upcallarg = NULL;
3403	sb->sb_flags &= ~SB_UPCALL;
3404}
3405
3406/*
3407 * Create an external-format (``xsocket'') structure using the information in
3408 * the kernel-format socket structure pointed to by so.  This is done to
3409 * reduce the spew of irrelevant information over this interface, to isolate
3410 * user code from changes in the kernel structure, and potentially to provide
3411 * information-hiding if we decide that some of this information should be
3412 * hidden from users.
3413 */
3414void
3415sotoxsocket(struct socket *so, struct xsocket *xso)
3416{
3417
3418	xso->xso_len = sizeof *xso;
3419	xso->xso_so = so;
3420	xso->so_type = so->so_type;
3421	xso->so_options = so->so_options;
3422	xso->so_linger = so->so_linger;
3423	xso->so_state = so->so_state;
3424	xso->so_pcb = so->so_pcb;
3425	xso->xso_protocol = so->so_proto->pr_protocol;
3426	xso->xso_family = so->so_proto->pr_domain->dom_family;
3427	xso->so_qlen = so->so_qlen;
3428	xso->so_incqlen = so->so_incqlen;
3429	xso->so_qlimit = so->so_qlimit;
3430	xso->so_timeo = so->so_timeo;
3431	xso->so_error = so->so_error;
3432	xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0;
3433	xso->so_oobmark = so->so_oobmark;
3434	sbtoxsockbuf(&so->so_snd, &xso->so_snd);
3435	sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
3436	xso->so_uid = so->so_cred->cr_uid;
3437}

/*
 * Socket accessor functions to provide external consumers with a safe
 * interface to socket state.
 */
3445
3446void
3447so_listeners_apply_all(struct socket *so, void (*func)(struct socket *, void *), void *arg)
3448{
3449
3450	TAILQ_FOREACH(so, &so->so_comp, so_list)
3451		func(so, arg);
3452}
3453
3454struct sockbuf *
3455so_sockbuf_rcv(struct socket *so)
3456{
3457
3458	return (&so->so_rcv);
3459}
3460
3461struct sockbuf *
3462so_sockbuf_snd(struct socket *so)
3463{
3464
3465	return (&so->so_snd);
3466}
3467
3468int
3469so_state_get(const struct socket *so)
3470{
3471
3472	return (so->so_state);
3473}
3474
3475void
3476so_state_set(struct socket *so, int val)
3477{
3478
3479	so->so_state = val;
3480}
3481
3482int
3483so_options_get(const struct socket *so)
3484{
3485
3486	return (so->so_options);
3487}
3488
3489void
3490so_options_set(struct socket *so, int val)
3491{
3492
3493	so->so_options = val;
3494}
3495
3496int
3497so_error_get(const struct socket *so)
3498{
3499
3500	return (so->so_error);
3501}
3502
3503void
3504so_error_set(struct socket *so, int val)
3505{
3506
3507	so->so_error = val;
3508}
3509
3510int
3511so_linger_get(const struct socket *so)
3512{
3513
3514	return (so->so_linger);
3515}
3516
3517void
3518so_linger_set(struct socket *so, int val)
3519{
3520
3521	so->so_linger = val;
3522}
3523
3524struct protosw *
3525so_protosw_get(const struct socket *so)
3526{
3527
3528	return (so->so_proto);
3529}
3530
3531void
3532so_protosw_set(struct socket *so, struct protosw *val)
3533{
3534
3535	so->so_proto = val;
3536}
3537
3538void
3539so_sorwakeup(struct socket *so)
3540{
3541
3542	sorwakeup(so);
3543}
3544
3545void
3546so_sowwakeup(struct socket *so)
3547{
3548
3549	sowwakeup(so);
3550}
3551
3552void
3553so_sorwakeup_locked(struct socket *so)
3554{
3555
3556	sorwakeup_locked(so);
3557}
3558
3559void
3560so_sowwakeup_locked(struct socket *so)
3561{
3562
3563	sowwakeup_locked(so);
3564}
3565
3566void
3567so_lock(struct socket *so)
3568{
3569	SOCK_LOCK(so);
3570}
3571
3572void
3573so_unlock(struct socket *so)
3574{
3575	SOCK_UNLOCK(so);
3576}
3577