uipc_socket.c revision 160280
1/*-
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 * Copyright (c) 2004 The FreeBSD Foundation
5 * Copyright (c) 2004-2006 Robert N. M. Watson
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 * 4. Neither the name of the University nor the names of its contributors
16 *    may be used to endorse or promote products derived from this software
17 *    without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE.
30 *
31 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
32 */
33
34/*
35 * Comments on the socket life cycle:
36 *
37 * soalloc() sets up socket layer state for a socket, called only by
38 * socreate() and sonewconn().  Socket layer private.
39 *
40 * sodealloc() tears down socket layer state for a socket, called from
41 * sofree() and the socreate()/sonewconn() error paths.  Socket layer private.
42 *
43 * pru_attach() associates protocol layer state with an allocated socket;
44 * called only once, may fail, aborting socket allocation.  This is called
45 * from socreate() and sonewconn().  Socket layer private.
46 *
47 * pru_detach() disassociates protocol layer state from an attached socket,
48 * and will be called exactly once on sockets for which pru_attach() has
49 * been successfully called.  If pru_attach() returned an error,
50 * pru_detach() will not be called.  Socket layer private.
51 *
52 * socreate() creates a socket and attaches protocol state.  This is a public
53 * interface that may be used by socket layer consumers to create new
54 * sockets.
55 *
56 * sonewconn() creates a socket and attaches protocol state.  This is a
57 * public interface that may be used by protocols to create new sockets when
58 * a new connection is received and will be available for accept() on a
59 * listen socket.
60 *
61 * soclose() destroys a socket after possibly waiting for it to disconnect.
62 * This is a public interface that socket consumers should use to close and
63 * release a socket when done with it.
64 *
65 * soabort() destroys a socket without waiting for it to disconnect (used
66 * only for incoming connections that are already partially or fully
67 * connected).  This is used internally by the socket layer when clearing
68 * listen socket queues (due to overflow or close on the listen socket), but
69 * is also a public interface protocols may use to abort connections in
70 * their incomplete listen queues should they no longer be required.  Sockets
71 * placed in completed connection listen queues should not be aborted.
72 *
73 * sofree() will free a socket and its protocol state if all references on
74 * the socket have been released, and is called to attempt to free a socket
75 * when a reference is removed (e.g., by sorele()).  This is a socket layer
76 * private interface.
77 *
78 * NOTE: In addition to socreate() and soclose(), which provide a single
79 * socket reference to the consumer to be managed as required, there are two
80 * calls to explicitly manage socket references, soref() and sorele().
81 * Currently, these are generally required only when transitioning a socket
82 * from a listen queue to a file descriptor, in order to prevent garbage
83 * collection of the socket at an untimely moment.  For a number of reasons,
84 * these interfaces are not preferred, and should be avoided.
85 *
86 * XXXRW: The behavior of sockets after soclose() but before the last
87 * sorele() is poorly defined.  We can probably entirely eliminate them with
88 * a little work, since consumers are managing references anyway.
89 */
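/*
 * A minimal usage sketch (illustrative comment only, not compiled): a
 * typical in-kernel consumer obtains its single socket reference from
 * socreate() and releases it with soclose().  The address 'sin', the
 * thread pointer 'td', and the error handling shown are assumptions for
 * the example.
 *
 *	struct socket *so;
 *	int error;
 *
 *	error = socreate(PF_INET, &so, SOCK_STREAM, IPPROTO_TCP,
 *	    td->td_ucred, td);
 *	if (error)
 *		return (error);
 *	error = sobind(so, (struct sockaddr *)&sin, td);
 *	if (error) {
 *		(void)soclose(so);
 *		return (error);
 *	}
 *	...
 *	(void)soclose(so);
 */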
90
91#include <sys/cdefs.h>
92__FBSDID("$FreeBSD: head/sys/kern/uipc_socket.c 160280 2006-07-11 21:56:58Z rwatson $");
93
94#include "opt_inet.h"
95#include "opt_mac.h"
96#include "opt_zero.h"
97#include "opt_compat.h"
98
99#include <sys/param.h>
100#include <sys/systm.h>
101#include <sys/fcntl.h>
102#include <sys/limits.h>
103#include <sys/lock.h>
104#include <sys/mac.h>
105#include <sys/malloc.h>
106#include <sys/mbuf.h>
107#include <sys/mutex.h>
108#include <sys/domain.h>
109#include <sys/file.h>			/* for struct knote */
110#include <sys/kernel.h>
111#include <sys/event.h>
112#include <sys/eventhandler.h>
113#include <sys/poll.h>
114#include <sys/proc.h>
115#include <sys/protosw.h>
116#include <sys/socket.h>
117#include <sys/socketvar.h>
118#include <sys/resourcevar.h>
119#include <sys/signalvar.h>
120#include <sys/sysctl.h>
121#include <sys/uio.h>
122#include <sys/jail.h>
123
124#include <vm/uma.h>
125
126#ifdef COMPAT_IA32
127#include <sys/mount.h>
128#include <compat/freebsd32/freebsd32.h>
129
130extern struct sysentvec ia32_freebsd_sysvec;
131#endif
132
133static int	soreceive_rcvoob(struct socket *so, struct uio *uio,
134		    int flags);
135
136static void	filt_sordetach(struct knote *kn);
137static int	filt_soread(struct knote *kn, long hint);
138static void	filt_sowdetach(struct knote *kn);
139static int	filt_sowrite(struct knote *kn, long hint);
140static int	filt_solisten(struct knote *kn, long hint);
141
142static struct filterops solisten_filtops =
143	{ 1, NULL, filt_sordetach, filt_solisten };
144static struct filterops soread_filtops =
145	{ 1, NULL, filt_sordetach, filt_soread };
146static struct filterops sowrite_filtops =
147	{ 1, NULL, filt_sowdetach, filt_sowrite };
148
149uma_zone_t socket_zone;
150so_gen_t	so_gencnt;	/* generation count for sockets */
151
152int	maxsockets;
153
154MALLOC_DEFINE(M_SONAME, "soname", "socket name");
155MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
156
157static int somaxconn = SOMAXCONN;
158static int somaxconn_sysctl(SYSCTL_HANDLER_ARGS);
159/* XXX: we don't have SYSCTL_USHORT */
160SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLTYPE_UINT | CTLFLAG_RW,
161    0, sizeof(int), somaxconn_sysctl, "I", "Maximum pending socket connection "
162    "queue size");
163static int numopensockets;
164SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
165    &numopensockets, 0, "Number of open sockets");
166#ifdef ZERO_COPY_SOCKETS
167/* These aren't static because they're used in other files. */
168int so_zero_copy_send = 1;
169int so_zero_copy_receive = 1;
170SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0,
171    "Zero copy controls");
172SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW,
173    &so_zero_copy_receive, 0, "Enable zero copy receive");
174SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW,
175    &so_zero_copy_send, 0, "Enable zero copy send");
176#endif /* ZERO_COPY_SOCKETS */
177
178/*
179 * accept_mtx locks down per-socket fields relating to accept queues.  See
180 * socketvar.h for an annotation of the protected fields of struct socket.
181 */
182struct mtx accept_mtx;
183MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);
184
185/*
186 * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
187 * so_gencnt field.
188 */
189static struct mtx so_global_mtx;
190MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF);
191
192SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");
193
194static int
195sysctl_maxsockets(SYSCTL_HANDLER_ARGS)
196{
197	int error, newmaxsockets;
198
199	newmaxsockets = maxsockets;
200	error = sysctl_handle_int(oidp, &newmaxsockets, sizeof(int), req);
201	if (error == 0 && req->newptr) {
202		if (newmaxsockets > maxsockets) {
203			maxsockets = newmaxsockets;
204			if (maxsockets > ((maxfiles / 4) * 3)) {
205				maxfiles = (maxsockets * 5) / 4;
206				maxfilesperproc = (maxfiles * 9) / 10;
207			}
208			EVENTHANDLER_INVOKE(maxsockets_change);
209		} else
210			error = EINVAL;
211	}
212	return (error);
213}
214
215SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT|CTLFLAG_RW,
216    &maxsockets, 0, sysctl_maxsockets, "IU",
217    "Maximum number of sockets available");
218
219/*
220 * Initialize maxsockets.
221 */
222static void
init_maxsockets(void *ignored)
223{
224	TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
225	maxsockets = imax(maxsockets, imax(maxfiles, nmbclusters));
226}
227SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);
228
229/*
230 * Socket operation routines.
231 * These routines are called by the routines in
232 * sys_socket.c or from a system process, and
233 * implement the semantics of socket operations by
234 * switching out to the protocol specific routines.
235 */
236
237/*
238 * Get a socket structure from our zone, and initialize it.
239 * Note that it would probably be better to allocate socket
240 * and PCB at the same time, but I'm not convinced that all
241 * the protocols can be easily modified to do this.
242 *
243 * soalloc() returns a socket with a ref count of 0.
244 */
245static struct socket *
246soalloc(int mflags)
247{
248	struct socket *so;
249
250	so = uma_zalloc(socket_zone, mflags | M_ZERO);
251	if (so == NULL)
252		return (NULL);
253#ifdef MAC
254	if (mac_init_socket(so, mflags) != 0) {
255		uma_zfree(socket_zone, so);
256		return (NULL);
257	}
258#endif
259	SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
260	SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
261	TAILQ_INIT(&so->so_aiojobq);
262	mtx_lock(&so_global_mtx);
263	so->so_gencnt = ++so_gencnt;
264	++numopensockets;
265	mtx_unlock(&so_global_mtx);
266	return (so);
267}
268
269static void
270sodealloc(struct socket *so)
271{
272
273	KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
274	KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL"));
275
276	mtx_lock(&so_global_mtx);
277	so->so_gencnt = ++so_gencnt;
278	mtx_unlock(&so_global_mtx);
279	if (so->so_rcv.sb_hiwat)
280		(void)chgsbsize(so->so_cred->cr_uidinfo,
281		    &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
282	if (so->so_snd.sb_hiwat)
283		(void)chgsbsize(so->so_cred->cr_uidinfo,
284		    &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
285#ifdef INET
286	/* remove accept filter if one is present. */
287	if (so->so_accf != NULL)
288		do_setopt_accept_filter(so, NULL);
289#endif
290#ifdef MAC
291	mac_destroy_socket(so);
292#endif
293	crfree(so->so_cred);
294	SOCKBUF_LOCK_DESTROY(&so->so_snd);
295	SOCKBUF_LOCK_DESTROY(&so->so_rcv);
296	uma_zfree(socket_zone, so);
297	mtx_lock(&so_global_mtx);
298	--numopensockets;
299	mtx_unlock(&so_global_mtx);
300}
301
302/*
303 * socreate returns a socket with a ref count of 1.  The socket should be
304 * closed with soclose().
305 */
306int
307socreate(dom, aso, type, proto, cred, td)
308	int dom;
309	struct socket **aso;
310	int type;
311	int proto;
312	struct ucred *cred;
313	struct thread *td;
314{
315	struct protosw *prp;
316	struct socket *so;
317	int error;
318
319	if (proto)
320		prp = pffindproto(dom, proto, type);
321	else
322		prp = pffindtype(dom, type);
323
324	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL ||
325	    prp->pr_usrreqs->pru_attach == pru_attach_notsupp)
326		return (EPROTONOSUPPORT);
327
328	if (jailed(cred) && jail_socket_unixiproute_only &&
329	    prp->pr_domain->dom_family != PF_LOCAL &&
330	    prp->pr_domain->dom_family != PF_INET &&
331	    prp->pr_domain->dom_family != PF_ROUTE) {
332		return (EPROTONOSUPPORT);
333	}
334
335	if (prp->pr_type != type)
336		return (EPROTOTYPE);
337	so = soalloc(M_WAITOK);
338	if (so == NULL)
339		return (ENOBUFS);
340
341	TAILQ_INIT(&so->so_incomp);
342	TAILQ_INIT(&so->so_comp);
343	so->so_type = type;
344	so->so_cred = crhold(cred);
345	so->so_proto = prp;
346#ifdef MAC
347	mac_create_socket(cred, so);
348#endif
349	knlist_init(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv),
350	    NULL, NULL, NULL);
351	knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd),
352	    NULL, NULL, NULL);
353	so->so_count = 1;
354	error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
355	if (error) {
356		sodealloc(so);
357		return (error);
358	}
359	*aso = so;
360	return (0);
361}
362
363#ifdef REGRESSION
364static int regression_sonewconn_earlytest = 1;
365SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW,
366    &regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test");
367#endif
368
369/*
370 * When an attempt at a new connection is noted on a socket
371 * which accepts connections, sonewconn is called.  If the
372 * connection is possible (subject to space constraints, etc.)
373 * then we allocate a new structure, properly linked into the
374 * data structure of the original socket, and return this.
375 * Connstatus may be 0, or SS_ISCONFIRMING, or SS_ISCONNECTED.
376 *
377 * note: the ref count on the socket is 0 on return
378 */
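/*
 * A sketch (illustrative comment only, not compiled) of how a protocol
 * might use sonewconn() when a new connection arrives on a listening
 * socket 'head'; 'head' and the surrounding protocol state are assumptions
 * for the example.
 *
 *	struct socket *so;
 *
 *	so = sonewconn(head, 0);
 *	if (so == NULL)
 *		(drop the connection attempt)
 *	(set up protocol state for 'so'; once the handshake completes,
 *	 soisconnected(so) moves it to the completed connection queue)
 */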
379struct socket *
380sonewconn(head, connstatus)
381	register struct socket *head;
382	int connstatus;
383{
384	register struct socket *so;
385	int over;
386
387	ACCEPT_LOCK();
388	over = (head->so_qlen > 3 * head->so_qlimit / 2);
389	ACCEPT_UNLOCK();
390#ifdef REGRESSION
391	if (regression_sonewconn_earlytest && over)
392#else
393	if (over)
394#endif
395		return (NULL);
396	so = soalloc(M_NOWAIT);
397	if (so == NULL)
398		return (NULL);
399	if ((head->so_options & SO_ACCEPTFILTER) != 0)
400		connstatus = 0;
401	so->so_head = head;
402	so->so_type = head->so_type;
403	so->so_options = head->so_options &~ SO_ACCEPTCONN;
404	so->so_linger = head->so_linger;
405	so->so_state = head->so_state | SS_NOFDREF;
406	so->so_proto = head->so_proto;
407	so->so_timeo = head->so_timeo;
408	so->so_cred = crhold(head->so_cred);
409#ifdef MAC
410	SOCK_LOCK(head);
411	mac_create_socket_from_socket(head, so);
412	SOCK_UNLOCK(head);
413#endif
414	knlist_init(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv),
415	    NULL, NULL, NULL);
416	knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd),
417	    NULL, NULL, NULL);
418	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat) ||
419	    (*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
420		sodealloc(so);
421		return (NULL);
422	}
423	so->so_state |= connstatus;
424	ACCEPT_LOCK();
425	if (connstatus) {
426		TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
427		so->so_qstate |= SQ_COMP;
428		head->so_qlen++;
429	} else {
430		/*
431		 * Keep removing sockets from the head until there's room for
432		 * us to insert on the tail.  In pre-locking revisions, this
433		 * was a simple if(), but as we could be racing with other
434		 * threads and soabort() requires dropping locks, we must
435		 * loop waiting for the condition to be true.
436		 */
437		while (head->so_incqlen > head->so_qlimit) {
438			struct socket *sp;
439			sp = TAILQ_FIRST(&head->so_incomp);
440			TAILQ_REMOVE(&head->so_incomp, sp, so_list);
441			head->so_incqlen--;
442			sp->so_qstate &= ~SQ_INCOMP;
443			sp->so_head = NULL;
444			ACCEPT_UNLOCK();
445			soabort(sp);
446			ACCEPT_LOCK();
447		}
448		TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
449		so->so_qstate |= SQ_INCOMP;
450		head->so_incqlen++;
451	}
452	ACCEPT_UNLOCK();
453	if (connstatus) {
454		sorwakeup(head);
455		wakeup_one(&head->so_timeo);
456	}
457	return (so);
458}
459
460int
461sobind(so, nam, td)
462	struct socket *so;
463	struct sockaddr *nam;
464	struct thread *td;
465{
466
467	return ((*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td));
468}
469
470/*
471 * solisten() transitions a socket from a non-listening state to a listening
472 * state, but can also be used to update the listen queue depth on an
473 * existing listen socket.  The protocol will call back into the sockets
474 * layer using solisten_proto_check() and solisten_proto() to check and set
475 * socket-layer listen state.  Callbacks are used so that the protocol can
476 * acquire both protocol and socket layer locks in whatever order is required
477 * by the protocol.
478 *
479 * Protocol implementors are advised to hold the socket lock across the
480 * socket-layer test and set to avoid races at the socket layer.
481 */
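/*
 * A sketch (illustrative comment only, not compiled) of the callback
 * pattern as it might appear in a protocol's pru_listen routine; the
 * protocol-specific setup step is an assumption for the example.
 *
 *	SOCK_LOCK(so);
 *	error = solisten_proto_check(so);
 *	if (error == 0) {
 *		(any protocol-specific setup, e.g. binding a local port)
 *		solisten_proto(so, backlog);
 *	}
 *	SOCK_UNLOCK(so);
 *	return (error);
 */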
482int
483solisten(so, backlog, td)
484	struct socket *so;
485	int backlog;
486	struct thread *td;
487{
488
489	return ((*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td));
490}
491
492int
493solisten_proto_check(so)
494	struct socket *so;
495{
496
497	SOCK_LOCK_ASSERT(so);
498
499	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
500	    SS_ISDISCONNECTING))
501		return (EINVAL);
502	return (0);
503}
504
505void
506solisten_proto(so, backlog)
507	struct socket *so;
508	int backlog;
509{
510
511	SOCK_LOCK_ASSERT(so);
512
513	if (backlog < 0 || backlog > somaxconn)
514		backlog = somaxconn;
515	so->so_qlimit = backlog;
516	so->so_options |= SO_ACCEPTCONN;
517}
518
519/*
520 * Attempt to free a socket.  This should really be sotryfree().
521 *
522 * sofree() will succeed if:
523 *
524 * - There are no outstanding file descriptor references or related consumers
525 *   (so_count == 0).
526 *
527 * - The socket has been closed by user space, if ever open (SS_NOFDREF).
528 *
529 * - The protocol does not have an outstanding strong reference on the socket
530 *   (SS_PROTOREF).
531 *
532 * - The socket is not in a completed connection queue, where a process may
533 *   already have been notified that it is present.  If it were removed, the
534 *   user process could block in accept() despite select() saying it was ready.
535 *
536 * Otherwise, it will quietly return so that a future call to sofree(), when
537 * conditions are right, can succeed.
538 */
539void
540sofree(so)
541	struct socket *so;
542{
543	struct socket *head;
544
545	ACCEPT_LOCK_ASSERT();
546	SOCK_LOCK_ASSERT(so);
547
548	if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 ||
549	    (so->so_state & SS_PROTOREF) || (so->so_qstate & SQ_COMP)) {
550		SOCK_UNLOCK(so);
551		ACCEPT_UNLOCK();
552		return;
553	}
554
555	head = so->so_head;
556	if (head != NULL) {
557		KASSERT((so->so_qstate & SQ_COMP) != 0 ||
558		    (so->so_qstate & SQ_INCOMP) != 0,
559		    ("sofree: so_head != NULL, but neither SQ_COMP nor "
560		    "SQ_INCOMP"));
561		KASSERT((so->so_qstate & SQ_COMP) == 0 ||
562		    (so->so_qstate & SQ_INCOMP) == 0,
563		    ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
564		TAILQ_REMOVE(&head->so_incomp, so, so_list);
565		head->so_incqlen--;
566		so->so_qstate &= ~SQ_INCOMP;
567		so->so_head = NULL;
568	}
569	KASSERT((so->so_qstate & SQ_COMP) == 0 &&
570	    (so->so_qstate & SQ_INCOMP) == 0,
571	    ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
572	    so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
573	SOCK_UNLOCK(so);
574	ACCEPT_UNLOCK();
575
576	SOCKBUF_LOCK(&so->so_snd);
577	so->so_snd.sb_flags |= SB_NOINTR;
578	(void)sblock(&so->so_snd, M_WAITOK);
579	/*
580	 * socantsendmore_locked() drops the socket buffer mutex so that it
581	 * can safely perform wakeups.  Re-acquire the mutex before
582	 * continuing.
583	 */
584	socantsendmore_locked(so);
585	SOCKBUF_LOCK(&so->so_snd);
586	sbunlock(&so->so_snd);
587	sbrelease_locked(&so->so_snd, so);
588	SOCKBUF_UNLOCK(&so->so_snd);
589	sorflush(so);
590	knlist_destroy(&so->so_rcv.sb_sel.si_note);
591	knlist_destroy(&so->so_snd.sb_sel.si_note);
592	sodealloc(so);
593}
594
595/*
596 * Close a socket on last file table reference removal.
597 * Initiate disconnect if connected.
598 * Free socket when disconnect complete.
599 *
600 * This function will sorele() the socket.  Note that soclose() may be
601 * called prior to the ref count reaching zero.  The actual socket
602 * structure will not be freed until the ref count reaches zero.
603 */
604int
605soclose(so)
606	struct socket *so;
607{
608	int error = 0;
609
610	KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));
611
612	funsetown(&so->so_sigio);
613	if (so->so_options & SO_ACCEPTCONN) {
614		struct socket *sp;
615		ACCEPT_LOCK();
616		while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
617			TAILQ_REMOVE(&so->so_incomp, sp, so_list);
618			so->so_incqlen--;
619			sp->so_qstate &= ~SQ_INCOMP;
620			sp->so_head = NULL;
621			ACCEPT_UNLOCK();
622			soabort(sp);
623			ACCEPT_LOCK();
624		}
625		while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
626			TAILQ_REMOVE(&so->so_comp, sp, so_list);
627			so->so_qlen--;
628			sp->so_qstate &= ~SQ_COMP;
629			sp->so_head = NULL;
630			ACCEPT_UNLOCK();
631			soabort(sp);
632			ACCEPT_LOCK();
633		}
634		ACCEPT_UNLOCK();
635	}
636	if (so->so_state & SS_ISCONNECTED) {
637		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
638			error = sodisconnect(so);
639			if (error)
640				goto drop;
641		}
642		if (so->so_options & SO_LINGER) {
643			if ((so->so_state & SS_ISDISCONNECTING) &&
644			    (so->so_state & SS_NBIO))
645				goto drop;
646			while (so->so_state & SS_ISCONNECTED) {
647				error = tsleep(&so->so_timeo,
648				    PSOCK | PCATCH, "soclos", so->so_linger * hz);
649				if (error)
650					break;
651			}
652		}
653	}
654
655drop:
656	(*so->so_proto->pr_usrreqs->pru_detach)(so);
657	ACCEPT_LOCK();
658	SOCK_LOCK(so);
659	KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
660	so->so_state |= SS_NOFDREF;
661	sorele(so);
662	return (error);
663}
664
665/*
666 * soabort() allows the socket code or protocol code to detach a socket that
667 * has been in an incomplete or completed listen queue, but has not yet been
668 * accepted.
669 *
670 * This interface is tricky, because it is called on an unreferenced socket,
671 * and must be called only by a thread that has actually removed the socket
672 * from the listen queue it was on, or races with other threads are risked.
673 *
674 * This interface will call into the protocol code, so must not be called
675 * with any socket locks held.  Protocols do call it while holding their own
676 * recursible protocol mutexes, but this is something that should be subject
677 * to review in the future.
678 *
679 * XXXRW: Why do we maintain a distinction between pru_abort() and
680 * pru_detach()?
681 */
682void
683soabort(so)
684	struct socket *so;
685{
686
687	/*
688	 * In as much as is possible, assert that no references to this
689	 * socket are held.  This is not quite the same as asserting that the
690	 * current thread is responsible for arranging for no references, but
691	 * is as close as we can get for now.
692	 */
693	KASSERT(so->so_count == 0, ("soabort: so_count"));
694	KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF"));
695	KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF"));
696	KASSERT((so->so_qstate & SQ_COMP) == 0, ("soabort: SQ_COMP"));
697	KASSERT((so->so_qstate & SQ_INCOMP) == 0, ("soabort: SQ_INCOMP"));
698
699	(*so->so_proto->pr_usrreqs->pru_abort)(so);
700	ACCEPT_LOCK();
701	SOCK_LOCK(so);
702	sofree(so);
703}
704
705int
706soaccept(so, nam)
707	struct socket *so;
708	struct sockaddr **nam;
709{
710	int error;
711
712	SOCK_LOCK(so);
713	KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
714	so->so_state &= ~SS_NOFDREF;
715	SOCK_UNLOCK(so);
716	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
717	return (error);
718}
719
720int
721soconnect(so, nam, td)
722	struct socket *so;
723	struct sockaddr *nam;
724	struct thread *td;
725{
726	int error;
727
728	if (so->so_options & SO_ACCEPTCONN)
729		return (EOPNOTSUPP);
730	/*
731	 * If protocol is connection-based, can only connect once.
732	 * Otherwise, if connected, try to disconnect first.
733	 * This allows user to disconnect by connecting to, e.g.,
734	 * a null address.
735	 */
736	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
737	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
738	    (error = sodisconnect(so)))) {
739		error = EISCONN;
740	} else {
741		/*
742		 * Prevent accumulated error from previous connection
743		 * from biting us.
744		 */
745		so->so_error = 0;
746		error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td);
747	}
748
749	return (error);
750}
751
752int
753soconnect2(so1, so2)
754	struct socket *so1;
755	struct socket *so2;
756{
757
758	return ((*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2));
759}
760
761int
762sodisconnect(so)
763	struct socket *so;
764{
765	int error;
766
767	if ((so->so_state & SS_ISCONNECTED) == 0)
768		return (ENOTCONN);
769	if (so->so_state & SS_ISDISCONNECTING)
770		return (EALREADY);
771	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
772	return (error);
773}
774
775#ifdef ZERO_COPY_SOCKETS
776struct so_zerocopy_stats{
777	int size_ok;
778	int align_ok;
779	int found_ifp;
780};
781struct so_zerocopy_stats so_zerocp_stats = {0,0,0};
782#include <netinet/in.h>
783#include <net/route.h>
784#include <netinet/in_pcb.h>
785#include <vm/vm.h>
786#include <vm/vm_page.h>
787#include <vm/vm_object.h>
788#endif /*ZERO_COPY_SOCKETS*/
789
790/*
791 * sosend_copyin() accepts a uio and prepares an mbuf chain holding part or
792 * all of the data referenced by the uio.  If desired, it uses zero-copy.
793 * *space will be updated to reflect data copied in.
794 *
795 * NB: If atomic I/O is requested, the caller must already have checked that
796 * space can hold resid bytes.
797 *
798 * NB: In the event of an error, the caller may need to free the partial
799 * chain pointed to by *mpp.  The contents of both *uio and *space may be
800 * modified even in the case of an error.
801 */
802static int
803sosend_copyin(struct uio *uio, struct mbuf **retmp, int atomic, long *space,
804    int flags)
805{
806	struct mbuf *m, **mp, *top;
807	long len, resid;
808	int error;
809#ifdef ZERO_COPY_SOCKETS
810	int cow_send;
811#endif
812
813	*retmp = top = NULL;
814	mp = &top;
815	len = 0;
816	resid = uio->uio_resid;
817	error = 0;
818	do {
819#ifdef ZERO_COPY_SOCKETS
820		cow_send = 0;
821#endif /* ZERO_COPY_SOCKETS */
822		if (resid >= MINCLSIZE) {
823#ifdef ZERO_COPY_SOCKETS
824			if (top == NULL) {
825				MGETHDR(m, M_TRYWAIT, MT_DATA);
826				if (m == NULL) {
827					error = ENOBUFS;
828					goto out;
829				}
830				m->m_pkthdr.len = 0;
831				m->m_pkthdr.rcvif = NULL;
832			} else {
833				MGET(m, M_TRYWAIT, MT_DATA);
834				if (m == NULL) {
835					error = ENOBUFS;
836					goto out;
837				}
838			}
839			if (so_zero_copy_send &&
840			    resid>=PAGE_SIZE &&
841			    *space>=PAGE_SIZE &&
842			    uio->uio_iov->iov_len>=PAGE_SIZE) {
843				so_zerocp_stats.size_ok++;
844				so_zerocp_stats.align_ok++;
845				cow_send = socow_setup(m, uio);
846				len = cow_send;
847			}
848			if (!cow_send) {
849				MCLGET(m, M_TRYWAIT);
850				if ((m->m_flags & M_EXT) == 0) {
851					m_free(m);
852					m = NULL;
853				} else {
854					len = min(min(MCLBYTES, resid),
855					    *space);
856				}
857			}
858#else /* ZERO_COPY_SOCKETS */
859			if (top == NULL) {
860				m = m_getcl(M_TRYWAIT, MT_DATA, M_PKTHDR);
861				m->m_pkthdr.len = 0;
862				m->m_pkthdr.rcvif = NULL;
863			} else
864				m = m_getcl(M_TRYWAIT, MT_DATA, 0);
865			len = min(min(MCLBYTES, resid), *space);
866#endif /* ZERO_COPY_SOCKETS */
867		} else {
868			if (top == NULL) {
869				m = m_gethdr(M_TRYWAIT, MT_DATA);
870				m->m_pkthdr.len = 0;
871				m->m_pkthdr.rcvif = NULL;
872
873				len = min(min(MHLEN, resid), *space);
874				/*
875				 * For datagram protocols, leave room
876				 * for protocol headers in first mbuf.
877				 */
878				if (atomic && m && len < MHLEN)
879					MH_ALIGN(m, len);
880			} else {
881				m = m_get(M_TRYWAIT, MT_DATA);
882				len = min(min(MLEN, resid), *space);
883			}
884		}
885		if (m == NULL) {
886			error = ENOBUFS;
887			goto out;
888		}
889
890		*space -= len;
891#ifdef ZERO_COPY_SOCKETS
892		if (cow_send)
893			error = 0;
894		else
895#endif /* ZERO_COPY_SOCKETS */
896		error = uiomove(mtod(m, void *), (int)len, uio);
897		resid = uio->uio_resid;
898		m->m_len = len;
899		*mp = m;
900		top->m_pkthdr.len += len;
901		if (error)
902			goto out;
903		mp = &m->m_next;
904		if (resid <= 0) {
905			if (flags & MSG_EOR)
906				top->m_flags |= M_EOR;
907			break;
908		}
909	} while (*space > 0 && atomic);
910out:
911	*retmp = top;
912	return (error);
913}
914
915#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
916
917int
918sosend_dgram(so, addr, uio, top, control, flags, td)
919	struct socket *so;
920	struct sockaddr *addr;
921	struct uio *uio;
922	struct mbuf *top;
923	struct mbuf *control;
924	int flags;
925	struct thread *td;
926{
927	long space, resid;
928	int clen = 0, error, dontroute;
929	int atomic = sosendallatonce(so) || top;
930
931	KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM"));
932	KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
933	    ("sosend_dgram: !PR_ATOMIC"));
934
935	if (uio != NULL)
936		resid = uio->uio_resid;
937	else
938		resid = top->m_pkthdr.len;
939	/*
940	 * In theory resid should be unsigned.
941	 * However, space must be signed, as it might be less than 0
942	 * if we over-committed, and we must use a signed comparison
943	 * of space and resid.  On the other hand, a negative resid
944	 * causes us to loop sending 0-length segments to the protocol.
945	 *
946	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
947	 * type sockets since that's an error.
948	 */
949	if (resid < 0) {
950		error = EINVAL;
951		goto out;
952	}
953
954	dontroute =
955	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0;
956	if (td != NULL)
957		td->td_proc->p_stats->p_ru.ru_msgsnd++;
958	if (control != NULL)
959		clen = control->m_len;
960
961	SOCKBUF_LOCK(&so->so_snd);
962	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
963		SOCKBUF_UNLOCK(&so->so_snd);
964		error = EPIPE;
965		goto out;
966	}
967	if (so->so_error) {
968		error = so->so_error;
969		so->so_error = 0;
970		SOCKBUF_UNLOCK(&so->so_snd);
971		goto out;
972	}
973	if ((so->so_state & SS_ISCONNECTED) == 0) {
974		/*
975		 * `sendto' and `sendmsg' are allowed on a connection-
976		 * based socket if it supports implied connect.
977		 * Return ENOTCONN if not connected and no address is
978		 * supplied.
979		 */
980		if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
981		    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
982			if ((so->so_state & SS_ISCONFIRMING) == 0 &&
983			    !(resid == 0 && clen != 0)) {
984				SOCKBUF_UNLOCK(&so->so_snd);
985				error = ENOTCONN;
986				goto out;
987			}
988		} else if (addr == NULL) {
989			if (so->so_proto->pr_flags & PR_CONNREQUIRED)
990				error = ENOTCONN;
991			else
992				error = EDESTADDRREQ;
993			SOCKBUF_UNLOCK(&so->so_snd);
994			goto out;
995		}
996	}
997
998	/*
999	 * Do we need MSG_OOB support in SOCK_DGRAM?  Signs here may be a
1000	 * problem and need fixing.
1001	 */
1002	space = sbspace(&so->so_snd);
1003	if (flags & MSG_OOB)
1004		space += 1024;
1005	space -= clen;
1006	if (resid > space) {
1007		error = EMSGSIZE;
1008		goto out;
1009	}
1010	SOCKBUF_UNLOCK(&so->so_snd);
1011	if (uio == NULL) {
1012		resid = 0;
1013		if (flags & MSG_EOR)
1014			top->m_flags |= M_EOR;
1015	} else {
1016		error = sosend_copyin(uio, &top, atomic, &space, flags);
1017		if (error)
1018			goto out;
1019		resid = uio->uio_resid;
1020	}
1021	KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
1022	/*
1023	 * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock
1024	 * than with.
1025	 */
1026	if (dontroute) {
1027		SOCK_LOCK(so);
1028		so->so_options |= SO_DONTROUTE;
1029		SOCK_UNLOCK(so);
1030	}
1031	/*
1032	 * XXX all the SBS_CANTSENDMORE checks previously
1033	 * done could be out of date.  We could have received
1034	 * a reset packet in an interrupt or maybe we slept
1035	 * while doing page faults in uiomove() etc. We could
1036	 * probably recheck again inside the locking protection
1037	 * here, but there are probably other places that this
1038	 * also happens.  We must rethink this.
1039	 */
1040	error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1041	    (flags & MSG_OOB) ? PRUS_OOB :
1042	/*
1043	 * If the user set MSG_EOF, the protocol
1044	 * understands this flag and nothing left to
1045	 * send then use PRU_SEND_EOF instead of PRU_SEND.
1046	 */
1047	    ((flags & MSG_EOF) &&
1048	     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1049	     (resid <= 0)) ?
1050		PRUS_EOF :
1051		/* If there is more to send set PRUS_MORETOCOME */
1052		(resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
1053		top, addr, control, td);
1054	if (dontroute) {
1055		SOCK_LOCK(so);
1056		so->so_options &= ~SO_DONTROUTE;
1057		SOCK_UNLOCK(so);
1058	}
1059	clen = 0;
1060	control = NULL;
1061	top = NULL;
1062out:
1063	if (top != NULL)
1064		m_freem(top);
1065	if (control != NULL)
1066		m_freem(control);
1067	return (error);
1068}
1069
1070/*
1071 * Send on a socket.
1072 * If send must go all at once and message is larger than
1073 * send buffering, then hard error.
1074 * Lock against other senders.
1075 * If must go all at once and not enough room now, then
1076 * inform user that this would block and do nothing.
1077 * Otherwise, if nonblocking, send as much as possible.
1078 * The data to be sent is described by "uio" if nonzero,
1079 * otherwise by the mbuf chain "top" (which must be null
1080 * if uio is not).  Data provided in mbuf chain must be small
1081 * enough to send all at once.
1082 *
1083 * Returns nonzero on error, timeout or signal; callers
1084 * must check for short counts if EINTR/ERESTART are returned.
1085 * Data and control buffers are freed on return.
1086 */
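/*
 * A sketch (illustrative comment only, not compiled) of an in-kernel
 * caller sending a buffer on a connected socket via a uio; 'buf', 'len',
 * 'so', and 'td' are assumptions for the example.
 *
 *	struct uio auio;
 *	struct iovec aiov;
 *	int error;
 *
 *	aiov.iov_base = buf;
 *	aiov.iov_len = len;
 *	auio.uio_iov = &aiov;
 *	auio.uio_iovcnt = 1;
 *	auio.uio_offset = 0;
 *	auio.uio_resid = len;
 *	auio.uio_segflg = UIO_SYSSPACE;
 *	auio.uio_rw = UIO_WRITE;
 *	auio.uio_td = td;
 *	error = sosend(so, NULL, &auio, NULL, NULL, 0, td);
 */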
1087#define	snderr(errno)	{ error = (errno); goto release; }
1088int
1089sosend(so, addr, uio, top, control, flags, td)
1090	struct socket *so;
1091	struct sockaddr *addr;
1092	struct uio *uio;
1093	struct mbuf *top;
1094	struct mbuf *control;
1095	int flags;
1096	struct thread *td;
1097{
1098	long space, resid;
1099	int clen = 0, error, dontroute;
1100	int atomic = sosendallatonce(so) || top;
1101
1102	if (uio != NULL)
1103		resid = uio->uio_resid;
1104	else
1105		resid = top->m_pkthdr.len;
1106	/*
1107	 * In theory resid should be unsigned.
1108	 * However, space must be signed, as it might be less than 0
1109	 * if we over-committed, and we must use a signed comparison
1110	 * of space and resid.  On the other hand, a negative resid
1111	 * causes us to loop sending 0-length segments to the protocol.
1112	 *
1113	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
1114	 * type sockets since that's an error.
1115	 */
1116	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
1117		error = EINVAL;
1118		goto out;
1119	}
1120
1121	dontroute =
1122	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
1123	    (so->so_proto->pr_flags & PR_ATOMIC);
1124	if (td != NULL)
1125		td->td_proc->p_stats->p_ru.ru_msgsnd++;
1126	if (control != NULL)
1127		clen = control->m_len;
1128
1129	SOCKBUF_LOCK(&so->so_snd);
1130restart:
1131	SOCKBUF_LOCK_ASSERT(&so->so_snd);
1132	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1133	if (error)
1134		goto out_locked;
1135	do {
1136		SOCKBUF_LOCK_ASSERT(&so->so_snd);
1137		if (so->so_snd.sb_state & SBS_CANTSENDMORE)
1138			snderr(EPIPE);
1139		if (so->so_error) {
1140			error = so->so_error;
1141			so->so_error = 0;
1142			goto release;
1143		}
1144		if ((so->so_state & SS_ISCONNECTED) == 0) {
1145			/*
1146			 * `sendto' and `sendmsg' are allowed on a connection-
1147			 * based socket if it supports implied connect.
1148			 * Return ENOTCONN if not connected and no address is
1149			 * supplied.
1150			 */
1151			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
1152			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
1153				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1154				    !(resid == 0 && clen != 0))
1155					snderr(ENOTCONN);
1156			} else if (addr == NULL)
1157			    snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
1158				   ENOTCONN : EDESTADDRREQ);
1159		}
1160		space = sbspace(&so->so_snd);
1161		if (flags & MSG_OOB)
1162			space += 1024;
1163		if ((atomic && resid > so->so_snd.sb_hiwat) ||
1164		    clen > so->so_snd.sb_hiwat)
1165			snderr(EMSGSIZE);
1166		if (space < resid + clen &&
1167		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
1168			if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO))
1169				snderr(EWOULDBLOCK);
1170			sbunlock(&so->so_snd);
1171			error = sbwait(&so->so_snd);
1172			if (error)
1173				goto out_locked;
1174			goto restart;
1175		}
1176		SOCKBUF_UNLOCK(&so->so_snd);
1177		space -= clen;
1178		do {
1179			if (uio == NULL) {
1180				resid = 0;
1181				if (flags & MSG_EOR)
1182					top->m_flags |= M_EOR;
1183			} else {
1184				error = sosend_copyin(uio, &top, atomic,
1185				    &space, flags);
1186				if (error != 0) {
1187					SOCKBUF_LOCK(&so->so_snd);
1188					goto release;
1189				}
1190				resid = uio->uio_resid;
1191			}
1192			if (dontroute) {
1193				SOCK_LOCK(so);
1194				so->so_options |= SO_DONTROUTE;
1195				SOCK_UNLOCK(so);
1196			}
1197			/*
1198			 * XXX all the SBS_CANTSENDMORE checks previously
1199			 * done could be out of date.  We could have received
1200			 * a reset packet in an interrupt or maybe we slept
1201			 * while doing page faults in uiomove() etc. We could
1202			 * probably recheck again inside the locking protection
1203			 * here, but there are probably other places that this
1204			 * also happens.  We must rethink this.
1205			 */
1206			error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1207			    (flags & MSG_OOB) ? PRUS_OOB :
1208			/*
1209			 * If the user set MSG_EOF, the protocol
1210			 * understands this flag and nothing left to
1211			 * send then use PRU_SEND_EOF instead of PRU_SEND.
1212			 */
1213			    ((flags & MSG_EOF) &&
1214			     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1215			     (resid <= 0)) ?
1216				PRUS_EOF :
1217			/* If there is more to send set PRUS_MORETOCOME */
1218			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
1219			    top, addr, control, td);
1220			if (dontroute) {
1221				SOCK_LOCK(so);
1222				so->so_options &= ~SO_DONTROUTE;
1223				SOCK_UNLOCK(so);
1224			}
1225			clen = 0;
1226			control = NULL;
1227			top = NULL;
1228			if (error) {
1229				SOCKBUF_LOCK(&so->so_snd);
1230				goto release;
1231			}
1232		} while (resid && space > 0);
1233		SOCKBUF_LOCK(&so->so_snd);
1234	} while (resid);
1235
1236release:
1237	SOCKBUF_LOCK_ASSERT(&so->so_snd);
1238	sbunlock(&so->so_snd);
1239out_locked:
1240	SOCKBUF_LOCK_ASSERT(&so->so_snd);
1241	SOCKBUF_UNLOCK(&so->so_snd);
1242out:
1243	if (top != NULL)
1244		m_freem(top);
1245	if (control != NULL)
1246		m_freem(control);
1247	return (error);
1248}
1249#undef snderr
1250
1251/*
1252 * The part of soreceive() that implements reading non-inline out-of-band
1253 * data from a socket.  For more complete comments, see soreceive(), from
1254 * which this code originated.
1255 *
1256 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
1257 * unable to return an mbuf chain to the caller.
1258 */
1259static int
1260soreceive_rcvoob(so, uio, flags)
1261	struct socket *so;
1262	struct uio *uio;
1263	int flags;
1264{
1265	struct protosw *pr = so->so_proto;
1266	struct mbuf *m;
1267	int error;
1268
1269	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
1270
1271	m = m_get(M_TRYWAIT, MT_DATA);
1272	if (m == NULL)
1273		return (ENOBUFS);
1274	error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
1275	if (error)
1276		goto bad;
1277	do {
1278#ifdef ZERO_COPY_SOCKETS
1279		if (so_zero_copy_receive) {
1280			int disposable;
1281
1282			if ((m->m_flags & M_EXT)
1283			 && (m->m_ext.ext_type == EXT_DISPOSABLE))
1284				disposable = 1;
1285			else
1286				disposable = 0;
1287
1288			error = uiomoveco(mtod(m, void *),
1289					  min(uio->uio_resid, m->m_len),
1290					  uio, disposable);
1291		} else
1292#endif /* ZERO_COPY_SOCKETS */
1293		error = uiomove(mtod(m, void *),
1294		    (int) min(uio->uio_resid, m->m_len), uio);
1295		m = m_free(m);
1296	} while (uio->uio_resid && error == 0 && m);
1297bad:
1298	if (m != NULL)
1299		m_freem(m);
1300	return (error);
1301}
1302
1303/*
1304 * Following replacement or removal of the first mbuf on the first mbuf chain
1305 * of a socket buffer, push necessary state changes back into the socket
1306 * buffer so that other consumers see the values consistently.  'nextrecord'
1307 * is the callers locally stored value of the original value of
1308 * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
1309 * NOTE: 'nextrecord' may be NULL.
1310 */
1311static __inline void
1312sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
1313{
1314
1315	SOCKBUF_LOCK_ASSERT(sb);
1316	/*
1317	 * First, update for the new value of nextrecord.  If necessary, make
1318	 * it the first record.
1319	 */
1320	if (sb->sb_mb != NULL)
1321		sb->sb_mb->m_nextpkt = nextrecord;
1322	else
1323		sb->sb_mb = nextrecord;
1324
1325	/*
1326	 * Now update any dependent socket buffer fields to reflect the new
1327	 * state.  This is an expanded inline of SB_EMPTY_FIXUP(), with the
1328	 * addition of a second clause that takes care of the case where
1329	 * sb_mb has been updated, but remains the last record.
1330	 */
1331	if (sb->sb_mb == NULL) {
1332		sb->sb_mbtail = NULL;
1333		sb->sb_lastrecord = NULL;
1334	} else if (sb->sb_mb->m_nextpkt == NULL)
1335		sb->sb_lastrecord = sb->sb_mb;
1336}
1337
1338
1339/*
1340 * Implement receive operations on a socket.
1341 * We depend on the way that records are added to the sockbuf
1342 * by sbappend*.  In particular, each record (mbufs linked through m_next)
1343 * must begin with an address if the protocol so specifies,
1344 * followed by an optional mbuf or mbufs containing ancillary data,
1345 * and then zero or more mbufs of data.
1346 * In order to avoid holding the socket buffer lock for the entire time here,
1347 * we release it while doing the actual copy to user space.
1348 * Although the sockbuf is locked, new data may still be appended,
1349 * and thus we must maintain consistency of the sockbuf during that time.
1350 *
1351 * The caller may receive the data as a single mbuf chain by supplying
1352 * an mbuf **mp0 for use in returning the chain.  The uio is then used
1353 * only for the count in uio_resid.
1354 */
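/*
 * A sketch (illustrative comment only, not compiled) of an in-kernel
 * caller reading into a buffer; the uio setup mirrors the sosend() example
 * above with uio_rw set to UIO_READ, and 'so', 'auio', 'len', and 'flags'
 * are assumptions for the example.
 *
 *	int flags, error;
 *
 *	flags = MSG_DONTWAIT;
 *	error = soreceive(so, NULL, &auio, NULL, NULL, &flags);
 *	if (error == 0)
 *		received = len - auio.uio_resid;
 */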
1355int
1356soreceive(so, psa, uio, mp0, controlp, flagsp)
1357	struct socket *so;
1358	struct sockaddr **psa;
1359	struct uio *uio;
1360	struct mbuf **mp0;
1361	struct mbuf **controlp;
1362	int *flagsp;
1363{
1364	struct mbuf *m, **mp;
1365	int flags, len, error, offset;
1366	struct protosw *pr = so->so_proto;
1367	struct mbuf *nextrecord;
1368	int moff, type = 0;
1369	int orig_resid = uio->uio_resid;
1370
1371	mp = mp0;
1372	if (psa != NULL)
1373		*psa = NULL;
1374	if (controlp != NULL)
1375		*controlp = NULL;
1376	if (flagsp != NULL)
1377		flags = *flagsp &~ MSG_EOR;
1378	else
1379		flags = 0;
1380	if (flags & MSG_OOB)
1381		return (soreceive_rcvoob(so, uio, flags));
1382	if (mp != NULL)
1383		*mp = NULL;
1384	if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING)
1385	    && uio->uio_resid)
1386		(*pr->pr_usrreqs->pru_rcvd)(so, 0);
1387
1388	SOCKBUF_LOCK(&so->so_rcv);
1389restart:
1390	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1391	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
1392	if (error)
1393		goto out;
1394
1395	m = so->so_rcv.sb_mb;
1396	/*
1397	 * If we have less data than requested, block awaiting more
1398	 * (subject to any timeout) if:
1399	 *   1. the current count is less than the low water mark, or
1400	 *   2. MSG_WAITALL is set, and it is possible to do the entire
1401	 *	receive operation at once if we block (resid <= hiwat).
1402	 *   3. MSG_DONTWAIT is not set
1403	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
1404	 * we have to do the receive in sections, and thus risk returning
1405	 * a short count if a timeout or signal occurs after we start.
1406	 */
1407	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
1408	    so->so_rcv.sb_cc < uio->uio_resid) &&
1409	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
1410	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
1411	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
1412		KASSERT(m != NULL || !so->so_rcv.sb_cc,
1413		    ("receive: m == %p so->so_rcv.sb_cc == %u",
1414		    m, so->so_rcv.sb_cc));
1415		if (so->so_error) {
1416			if (m != NULL)
1417				goto dontblock;
1418			error = so->so_error;
1419			if ((flags & MSG_PEEK) == 0)
1420				so->so_error = 0;
1421			goto release;
1422		}
1423		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1424		if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
1425			if (m)
1426				goto dontblock;
1427			else
1428				goto release;
1429		}
1430		for (; m != NULL; m = m->m_next)
1431			if (m->m_type == MT_OOBDATA  || (m->m_flags & M_EOR)) {
1432				m = so->so_rcv.sb_mb;
1433				goto dontblock;
1434			}
1435		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
1436		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
1437			error = ENOTCONN;
1438			goto release;
1439		}
1440		if (uio->uio_resid == 0)
1441			goto release;
1442		if ((so->so_state & SS_NBIO) ||
1443		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
1444			error = EWOULDBLOCK;
1445			goto release;
1446		}
1447		SBLASTRECORDCHK(&so->so_rcv);
1448		SBLASTMBUFCHK(&so->so_rcv);
1449		sbunlock(&so->so_rcv);
1450		error = sbwait(&so->so_rcv);
1451		if (error)
1452			goto out;
1453		goto restart;
1454	}
1455dontblock:
1456	/*
1457	 * From this point onward, we maintain 'nextrecord' as a cache of the
1458	 * pointer to the next record in the socket buffer.  We must keep the
1459	 * various socket buffer pointers and local stack versions of the
1460	 * pointers in sync, pushing out modifications before dropping the
1461	 * socket buffer mutex, and re-reading them when picking it up.
1462	 *
1463	 * Otherwise, we will race with the network stack appending new data
1464	 * or records onto the socket buffer by using inconsistent/stale
1465	 * versions of the field, possibly resulting in socket buffer
1466	 * corruption.
1467	 *
1468	 * By holding the high-level sblock(), we prevent simultaneous
1469	 * readers from pulling off the front of the socket buffer.
1470	 */
1471	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1472	if (uio->uio_td)
1473		uio->uio_td->td_proc->p_stats->p_ru.ru_msgrcv++;
1474	KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
1475	SBLASTRECORDCHK(&so->so_rcv);
1476	SBLASTMBUFCHK(&so->so_rcv);
1477	nextrecord = m->m_nextpkt;
1478	if (pr->pr_flags & PR_ADDR) {
1479		KASSERT(m->m_type == MT_SONAME,
1480		    ("m->m_type == %d", m->m_type));
1481		orig_resid = 0;
1482		if (psa != NULL)
1483			*psa = sodupsockaddr(mtod(m, struct sockaddr *),
1484			    M_NOWAIT);
1485		if (flags & MSG_PEEK) {
1486			m = m->m_next;
1487		} else {
1488			sbfree(&so->so_rcv, m);
1489			so->so_rcv.sb_mb = m_free(m);
1490			m = so->so_rcv.sb_mb;
1491			sockbuf_pushsync(&so->so_rcv, nextrecord);
1492		}
1493	}
1494
1495	/*
1496	 * Process one or more MT_CONTROL mbufs present before any data mbufs
1497	 * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
1498	 * just copy the data; if !MSG_PEEK, we call into the protocol to
1499	 * perform externalization (or freeing if controlp == NULL).
1500	 */
1501	if (m != NULL && m->m_type == MT_CONTROL) {
1502		struct mbuf *cm = NULL, *cmn;
1503		struct mbuf **cme = &cm;
1504
1505		do {
1506			if (flags & MSG_PEEK) {
1507				if (controlp != NULL) {
1508					*controlp = m_copy(m, 0, m->m_len);
1509					controlp = &(*controlp)->m_next;
1510				}
1511				m = m->m_next;
1512			} else {
1513				sbfree(&so->so_rcv, m);
1514				so->so_rcv.sb_mb = m->m_next;
1515				m->m_next = NULL;
1516				*cme = m;
1517				cme = &(*cme)->m_next;
1518				m = so->so_rcv.sb_mb;
1519			}
1520		} while (m != NULL && m->m_type == MT_CONTROL);
1521		if ((flags & MSG_PEEK) == 0)
1522			sockbuf_pushsync(&so->so_rcv, nextrecord);
1523		while (cm != NULL) {
1524			cmn = cm->m_next;
1525			cm->m_next = NULL;
1526			if (pr->pr_domain->dom_externalize != NULL) {
1527				SOCKBUF_UNLOCK(&so->so_rcv);
1528				error = (*pr->pr_domain->dom_externalize)
1529				    (cm, controlp);
1530				SOCKBUF_LOCK(&so->so_rcv);
1531			} else if (controlp != NULL)
1532				*controlp = cm;
1533			else
1534				m_freem(cm);
1535			if (controlp != NULL) {
1536				orig_resid = 0;
1537				while (*controlp != NULL)
1538					controlp = &(*controlp)->m_next;
1539			}
1540			cm = cmn;
1541		}
1542		if (so->so_rcv.sb_mb)
1543			nextrecord = so->so_rcv.sb_mb->m_nextpkt;
1544		else
1545			nextrecord = NULL;
1546		orig_resid = 0;
1547	}
1548	if (m != NULL) {
1549		if ((flags & MSG_PEEK) == 0) {
1550			KASSERT(m->m_nextpkt == nextrecord,
1551			    ("soreceive: post-control, nextrecord !sync"));
1552			if (nextrecord == NULL) {
1553				KASSERT(so->so_rcv.sb_mb == m,
1554				    ("soreceive: post-control, sb_mb!=m"));
1555				KASSERT(so->so_rcv.sb_lastrecord == m,
1556				    ("soreceive: post-control, lastrecord!=m"));
1557			}
1558		}
1559		type = m->m_type;
1560		if (type == MT_OOBDATA)
1561			flags |= MSG_OOB;
1562	} else {
1563		if ((flags & MSG_PEEK) == 0) {
1564			KASSERT(so->so_rcv.sb_mb == nextrecord,
1565			    ("soreceive: sb_mb != nextrecord"));
1566			if (so->so_rcv.sb_mb == NULL) {
1567				KASSERT(so->so_rcv.sb_lastrecord == NULL,
1568				    ("soreceive: sb_lastrecord != NULL"));
1569			}
1570		}
1571	}
1572	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1573	SBLASTRECORDCHK(&so->so_rcv);
1574	SBLASTMBUFCHK(&so->so_rcv);
1575
1576	/*
1577	 * Now continue to read any data mbufs off of the head of the socket
1578	 * buffer until the read request is satisfied.  Note that 'type' is
1579	 * used to store the type of any mbuf reads that have happened so far
1580	 * such that soreceive() can stop reading if the type changes, which
1581	 * causes soreceive() to return only one of regular data and inline
1582	 * out-of-band data in a single socket receive operation.
1583	 */
1584	moff = 0;
1585	offset = 0;
1586	while (m != NULL && uio->uio_resid > 0 && error == 0) {
1587		/*
1588		 * If the type of mbuf has changed since the last mbuf
1589		 * examined ('type'), end the receive operation.
1590	 	 */
1591		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1592		if (m->m_type == MT_OOBDATA) {
1593			if (type != MT_OOBDATA)
1594				break;
1595		} else if (type == MT_OOBDATA)
1596			break;
1597		else
1598		    KASSERT(m->m_type == MT_DATA,
1599			("m->m_type == %d", m->m_type));
1600		so->so_rcv.sb_state &= ~SBS_RCVATMARK;
1601		len = uio->uio_resid;
1602		if (so->so_oobmark && len > so->so_oobmark - offset)
1603			len = so->so_oobmark - offset;
1604		if (len > m->m_len - moff)
1605			len = m->m_len - moff;
1606		/*
1607		 * If mp is set, just pass back the mbufs.
1608		 * Otherwise copy them out via the uio, then free.
1609		 * Sockbuf must be consistent here (points to current mbuf,
1610		 * it points to next record) when we drop the socket buffer lock;
1611		 * we must note any additions to the sockbuf when we
1612		 * reacquire the lock.
1613		 */
1614		if (mp == NULL) {
1615			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1616			SBLASTRECORDCHK(&so->so_rcv);
1617			SBLASTMBUFCHK(&so->so_rcv);
1618			SOCKBUF_UNLOCK(&so->so_rcv);
1619#ifdef ZERO_COPY_SOCKETS
1620			if (so_zero_copy_receive) {
1621				int disposable;
1622
1623				if ((m->m_flags & M_EXT)
1624				 && (m->m_ext.ext_type == EXT_DISPOSABLE))
1625					disposable = 1;
1626				else
1627					disposable = 0;
1628
1629				error = uiomoveco(mtod(m, char *) + moff,
1630						  (int)len, uio,
1631						  disposable);
1632			} else
1633#endif /* ZERO_COPY_SOCKETS */
1634			error = uiomove(mtod(m, char *) + moff, (int)len, uio);
1635			SOCKBUF_LOCK(&so->so_rcv);
1636			if (error)
1637				goto release;
1638		} else
1639			uio->uio_resid -= len;
1640		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1641		if (len == m->m_len - moff) {
1642			if (m->m_flags & M_EOR)
1643				flags |= MSG_EOR;
1644			if (flags & MSG_PEEK) {
1645				m = m->m_next;
1646				moff = 0;
1647			} else {
1648				nextrecord = m->m_nextpkt;
1649				sbfree(&so->so_rcv, m);
1650				if (mp != NULL) {
1651					*mp = m;
1652					mp = &m->m_next;
1653					so->so_rcv.sb_mb = m = m->m_next;
1654					*mp = NULL;
1655				} else {
1656					so->so_rcv.sb_mb = m_free(m);
1657					m = so->so_rcv.sb_mb;
1658				}
1659				sockbuf_pushsync(&so->so_rcv, nextrecord);
1660				SBLASTRECORDCHK(&so->so_rcv);
1661				SBLASTMBUFCHK(&so->so_rcv);
1662			}
1663		} else {
1664			if (flags & MSG_PEEK)
1665				moff += len;
1666			else {
1667				if (mp != NULL) {
1668					int copy_flag;
1669
1670					if (flags & MSG_DONTWAIT)
1671						copy_flag = M_DONTWAIT;
1672					else
1673						copy_flag = M_TRYWAIT;
1674					if (copy_flag == M_TRYWAIT)
1675						SOCKBUF_UNLOCK(&so->so_rcv);
1676					*mp = m_copym(m, 0, len, copy_flag);
1677					if (copy_flag == M_TRYWAIT)
1678						SOCKBUF_LOCK(&so->so_rcv);
1679 					if (*mp == NULL) {
1680 						/*
1681 						 * m_copym() couldn't allocate an mbuf.
1682						 * Adjust uio_resid back (it was adjusted
1683						 * down by len bytes, which we didn't end
1684						 * up "copying" over).
1685 						 */
1686 						uio->uio_resid += len;
1687 						break;
1688 					}
1689				}
1690				m->m_data += len;
1691				m->m_len -= len;
1692				so->so_rcv.sb_cc -= len;
1693			}
1694		}
1695		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1696		if (so->so_oobmark) {
1697			if ((flags & MSG_PEEK) == 0) {
1698				so->so_oobmark -= len;
1699				if (so->so_oobmark == 0) {
1700					so->so_rcv.sb_state |= SBS_RCVATMARK;
1701					break;
1702				}
1703			} else {
1704				offset += len;
1705				if (offset == so->so_oobmark)
1706					break;
1707			}
1708		}
1709		if (flags & MSG_EOR)
1710			break;
1711		/*
1712		 * If the MSG_WAITALL flag is set (for non-atomic socket),
1713		 * we must not quit until "uio->uio_resid == 0" or an error
1714		 * termination.  If a signal/timeout occurs, return
1715		 * with a short count but without error.
1716		 * Keep sockbuf locked against other readers.
1717		 */
1718		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
1719		    !sosendallatonce(so) && nextrecord == NULL) {
1720			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1721			if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE)
1722				break;
1723			/*
1724			 * Notify the protocol that some data has been
1725			 * drained before blocking.
1726			 */
1727			if (pr->pr_flags & PR_WANTRCVD) {
1728				SOCKBUF_UNLOCK(&so->so_rcv);
1729				(*pr->pr_usrreqs->pru_rcvd)(so, flags);
1730				SOCKBUF_LOCK(&so->so_rcv);
1731			}
1732			SBLASTRECORDCHK(&so->so_rcv);
1733			SBLASTMBUFCHK(&so->so_rcv);
1734			error = sbwait(&so->so_rcv);
1735			if (error)
1736				goto release;
1737			m = so->so_rcv.sb_mb;
1738			if (m != NULL)
1739				nextrecord = m->m_nextpkt;
1740		}
1741	}
1742
1743	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1744	if (m != NULL && pr->pr_flags & PR_ATOMIC) {
1745		flags |= MSG_TRUNC;
1746		if ((flags & MSG_PEEK) == 0)
1747			(void) sbdroprecord_locked(&so->so_rcv);
1748	}
1749	if ((flags & MSG_PEEK) == 0) {
1750		if (m == NULL) {
1751			/*
1752			 * First part is an inline SB_EMPTY_FIXUP().  Second
1753			 * part makes sure sb_lastrecord is up-to-date if
1754			 * there is still data in the socket buffer.
1755			 */
1756			so->so_rcv.sb_mb = nextrecord;
1757			if (so->so_rcv.sb_mb == NULL) {
1758				so->so_rcv.sb_mbtail = NULL;
1759				so->so_rcv.sb_lastrecord = NULL;
1760			} else if (nextrecord->m_nextpkt == NULL)
1761				so->so_rcv.sb_lastrecord = nextrecord;
1762		}
1763		SBLASTRECORDCHK(&so->so_rcv);
1764		SBLASTMBUFCHK(&so->so_rcv);
1765		/*
1766		 * If soreceive() is being done from the socket callback, then
1767		 * don't need to generate ACK to peer to update window, since
1768		 * ACK will be generated on return to TCP.
1769		 */
1770		if (!(flags & MSG_SOCALLBCK) &&
1771		    (pr->pr_flags & PR_WANTRCVD)) {
1772			SOCKBUF_UNLOCK(&so->so_rcv);
1773			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
1774			SOCKBUF_LOCK(&so->so_rcv);
1775		}
1776	}
1777	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1778	if (orig_resid == uio->uio_resid && orig_resid &&
1779	    (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
1780		sbunlock(&so->so_rcv);
1781		goto restart;
1782	}
1783
1784	if (flagsp != NULL)
1785		*flagsp |= flags;
1786release:
1787	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1788	sbunlock(&so->so_rcv);
1789out:
1790	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1791	SOCKBUF_UNLOCK(&so->so_rcv);
1792	return (error);
1793}
1794
1795int
1796soshutdown(so, how)
1797	struct socket *so;
1798	int how;
1799{
1800	struct protosw *pr = so->so_proto;
1801
1802	if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
1803		return (EINVAL);
1804
1805	if (how != SHUT_WR)
1806		sorflush(so);
1807	if (how != SHUT_RD)
1808		return ((*pr->pr_usrreqs->pru_shutdown)(so));
1809	return (0);
1810}
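/*
 * Example (illustrative sketch): a kernel consumer holding a connected
 * socket "so" that wants to stop transmitting but keep receiving would
 * typically shut down only the write side; for SHUT_WR no receive buffer
 * is flushed and the protocol's pru_shutdown() runs (for TCP this sends a
 * FIN).  The variables here are assumed, not taken from this file.
 */
#if 0
	int error;

	error = soshutdown(so, SHUT_WR);
	if (error != 0)
		printf("soshutdown failed: %d\n", error);
#endif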
1811
1812void
1813sorflush(so)
1814	struct socket *so;
1815{
1816	struct sockbuf *sb = &so->so_rcv;
1817	struct protosw *pr = so->so_proto;
1818	struct sockbuf asb;
1819
1820	/*
1821	 * XXXRW: This is quite ugly.  Previously, this code made a copy of
1822	 * the socket buffer, then zeroed the original to clear the buffer
1823	 * fields.  However, with mutexes in the socket buffer, this causes
1824	 * problems.  We only clear the zeroable bits of the original;
1825	 * however, we have to initialize and destroy the mutex in the copy
1826	 * so that dom_dispose() and sbrelease() can lock it as needed.
1827	 */
1828	SOCKBUF_LOCK(sb);
1829	sb->sb_flags |= SB_NOINTR;
1830	(void) sblock(sb, M_WAITOK);
1831	/*
1832	 * socantrcvmore_locked() drops the socket buffer mutex so that it
1833	 * can safely perform wakeups.  Re-acquire the mutex before
1834	 * continuing.
1835	 */
1836	socantrcvmore_locked(so);
1837	SOCKBUF_LOCK(sb);
1838	sbunlock(sb);
1839	/*
1840	 * Invalidate/clear most of the sockbuf structure, but leave
1841	 * selinfo and mutex data unchanged.
1842	 */
1843	bzero(&asb, offsetof(struct sockbuf, sb_startzero));
1844	bcopy(&sb->sb_startzero, &asb.sb_startzero,
1845	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
1846	bzero(&sb->sb_startzero,
1847	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
1848	SOCKBUF_UNLOCK(sb);
1849
1850	SOCKBUF_LOCK_INIT(&asb, "so_rcv");
1851	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
1852		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
1853	sbrelease(&asb, so);
1854	SOCKBUF_LOCK_DESTROY(&asb);
1855}
1856
1857/*
1858 * Perhaps this routine, and sooptcopyout(), below, ought to come in
1859 * an additional variant to handle the case where the option value needs
1860 * to be some kind of integer, but not a specific size.
1861 * In addition to their use here, these functions are also called by the
1862 * protocol-level pr_ctloutput() routines.
1863 */
1864int
1865sooptcopyin(sopt, buf, len, minlen)
1866	struct	sockopt *sopt;
1867	void	*buf;
1868	size_t	len;
1869	size_t	minlen;
1870{
1871	size_t	valsize;
1872
1873	/*
1874	 * If the user gives us more than we wanted, we ignore it,
1875	 * but if we don't get the minimum length the caller
1876	 * wants, we return EINVAL.  On success, sopt->sopt_valsize
1877	 * is set to however much we actually retrieved.
1878	 */
1879	if ((valsize = sopt->sopt_valsize) < minlen)
1880		return EINVAL;
1881	if (valsize > len)
1882		sopt->sopt_valsize = valsize = len;
1883
1884	if (sopt->sopt_td != NULL)
1885		return (copyin(sopt->sopt_val, buf, valsize));
1886
1887	bcopy(sopt->sopt_val, buf, valsize);
1888	return (0);
1889}
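/*
 * Example (illustrative sketch): inside the option switch of a protocol's
 * pr_ctloutput() SOPT_SET handler, a fixed-size integer option is usually
 * pulled in with sooptcopyin().  The pcb field name is hypothetical, not
 * part of this file.
 */
#if 0
	int optval, error;

	error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
	if (error)
		return (error);
	if (optval < 0)
		return (EINVAL);
	pcb->pcb_foo = optval;			/* hypothetical pcb field */
	return (0);
#endif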
1890
1891/*
1892 * Kernel version of setsockopt(2).
1893 * XXX: optlen is size_t, not socklen_t
1894 */
1895int
1896so_setsockopt(struct socket *so, int level, int optname, void *optval,
1897    size_t optlen)
1898{
1899	struct sockopt sopt;
1900
1901	sopt.sopt_level = level;
1902	sopt.sopt_name = optname;
1903	sopt.sopt_dir = SOPT_SET;
1904	sopt.sopt_val = optval;
1905	sopt.sopt_valsize = optlen;
1906	sopt.sopt_td = NULL;
1907	return (sosetopt(so, &sopt));
1908}
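/*
 * Example (illustrative sketch): an in-kernel consumer (an NFS client
 * socket, say) can enable keep-alives without building a struct sockopt
 * by hand.  "so" is assumed to be an already-created socket.
 */
#if 0
	int error, one = 1;

	error = so_setsockopt(so, SOL_SOCKET, SO_KEEPALIVE, &one, sizeof(one));
#endif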
1909
1910int
1911sosetopt(so, sopt)
1912	struct socket *so;
1913	struct sockopt *sopt;
1914{
1915	int	error, optval;
1916	struct	linger l;
1917	struct	timeval tv;
1918	u_long  val;
1919#ifdef MAC
1920	struct mac extmac;
1921#endif
1922
1923	error = 0;
1924	if (sopt->sopt_level != SOL_SOCKET) {
1925		if (so->so_proto && so->so_proto->pr_ctloutput)
1926			return ((*so->so_proto->pr_ctloutput)
1927				  (so, sopt));
1928		error = ENOPROTOOPT;
1929	} else {
1930		switch (sopt->sopt_name) {
1931#ifdef INET
1932		case SO_ACCEPTFILTER:
1933			error = do_setopt_accept_filter(so, sopt);
1934			if (error)
1935				goto bad;
1936			break;
1937#endif
1938		case SO_LINGER:
1939			error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
1940			if (error)
1941				goto bad;
1942
1943			SOCK_LOCK(so);
1944			so->so_linger = l.l_linger;
1945			if (l.l_onoff)
1946				so->so_options |= SO_LINGER;
1947			else
1948				so->so_options &= ~SO_LINGER;
1949			SOCK_UNLOCK(so);
1950			break;
1951
1952		case SO_DEBUG:
1953		case SO_KEEPALIVE:
1954		case SO_DONTROUTE:
1955		case SO_USELOOPBACK:
1956		case SO_BROADCAST:
1957		case SO_REUSEADDR:
1958		case SO_REUSEPORT:
1959		case SO_OOBINLINE:
1960		case SO_TIMESTAMP:
1961		case SO_BINTIME:
1962		case SO_NOSIGPIPE:
1963			error = sooptcopyin(sopt, &optval, sizeof optval,
1964					    sizeof optval);
1965			if (error)
1966				goto bad;
1967			SOCK_LOCK(so);
1968			if (optval)
1969				so->so_options |= sopt->sopt_name;
1970			else
1971				so->so_options &= ~sopt->sopt_name;
1972			SOCK_UNLOCK(so);
1973			break;
1974
1975		case SO_SNDBUF:
1976		case SO_RCVBUF:
1977		case SO_SNDLOWAT:
1978		case SO_RCVLOWAT:
1979			error = sooptcopyin(sopt, &optval, sizeof optval,
1980					    sizeof optval);
1981			if (error)
1982				goto bad;
1983
1984			/*
1985			 * Values < 1 make no sense for any of these
1986			 * options, so disallow them.
1987			 */
1988			if (optval < 1) {
1989				error = EINVAL;
1990				goto bad;
1991			}
1992
1993			switch (sopt->sopt_name) {
1994			case SO_SNDBUF:
1995			case SO_RCVBUF:
1996				if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
1997				    &so->so_snd : &so->so_rcv, (u_long)optval,
1998				    so, curthread) == 0) {
1999					error = ENOBUFS;
2000					goto bad;
2001				}
2002				break;
2003
2004			/*
2005			 * Make sure the low-water mark is never greater than
2006			 * the high-water mark.
2007			 */
2008			case SO_SNDLOWAT:
2009				SOCKBUF_LOCK(&so->so_snd);
2010				so->so_snd.sb_lowat =
2011				    (optval > so->so_snd.sb_hiwat) ?
2012				    so->so_snd.sb_hiwat : optval;
2013				SOCKBUF_UNLOCK(&so->so_snd);
2014				break;
2015			case SO_RCVLOWAT:
2016				SOCKBUF_LOCK(&so->so_rcv);
2017				so->so_rcv.sb_lowat =
2018				    (optval > so->so_rcv.sb_hiwat) ?
2019				    so->so_rcv.sb_hiwat : optval;
2020				SOCKBUF_UNLOCK(&so->so_rcv);
2021				break;
2022			}
2023			break;
2024
2025		case SO_SNDTIMEO:
2026		case SO_RCVTIMEO:
2027#ifdef COMPAT_IA32
2028			if (curthread->td_proc->p_sysent == &ia32_freebsd_sysvec) {
2029				struct timeval32 tv32;
2030
2031				error = sooptcopyin(sopt, &tv32, sizeof tv32,
2032				    sizeof tv32);
2033				CP(tv32, tv, tv_sec);
2034				CP(tv32, tv, tv_usec);
2035			} else
2036#endif
2037				error = sooptcopyin(sopt, &tv, sizeof tv,
2038				    sizeof tv);
2039			if (error)
2040				goto bad;
2041
2042			/* assert(hz > 0); */
2043			if (tv.tv_sec < 0 || tv.tv_sec > INT_MAX / hz ||
2044			    tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
2045				error = EDOM;
2046				goto bad;
2047			}
2048			/* assert(tick > 0); */
2049			/* assert(ULONG_MAX - INT_MAX >= 1000000); */
2050			val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick;
2051			if (val > INT_MAX) {
2052				error = EDOM;
2053				goto bad;
2054			}
2055			if (val == 0 && tv.tv_usec != 0)
2056				val = 1;
2057
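			/*
			 * Worked example: with hz = 1000 (so tick = 1000us),
			 * a 2.5 second timeout converts to
			 * 2 * 1000 + 500000 / 1000 = 2500 ticks; a non-zero
			 * request that would round down to 0 ticks was
			 * bumped to 1 above so it is not mistaken for "no
			 * timeout".  Actual hz and tick values depend on the
			 * kernel configuration.
			 */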
2058			switch (sopt->sopt_name) {
2059			case SO_SNDTIMEO:
2060				so->so_snd.sb_timeo = val;
2061				break;
2062			case SO_RCVTIMEO:
2063				so->so_rcv.sb_timeo = val;
2064				break;
2065			}
2066			break;
2067
2068		case SO_LABEL:
2069#ifdef MAC
2070			error = sooptcopyin(sopt, &extmac, sizeof extmac,
2071			    sizeof extmac);
2072			if (error)
2073				goto bad;
2074			error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
2075			    so, &extmac);
2076#else
2077			error = EOPNOTSUPP;
2078#endif
2079			break;
2080
2081		default:
2082			error = ENOPROTOOPT;
2083			break;
2084		}
2085		if (error == 0 && so->so_proto != NULL &&
2086		    so->so_proto->pr_ctloutput != NULL) {
2087			(void) ((*so->so_proto->pr_ctloutput)
2088				  (so, sopt));
2089		}
2090	}
2091bad:
2092	return (error);
2093}
2094
2095/* Helper routine for getsockopt */
2096int
2097sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
2098{
2099	int	error;
2100	size_t	valsize;
2101
2102	error = 0;
2103
2104	/*
2105	 * Documented get behavior is that we always return a value,
2106	 * possibly truncated to fit in the user's buffer.
2107	 * Traditional behavior is that we always tell the user
2108	 * precisely how much we copied, rather than something useful
2109	 * like the total amount we had available for her.
2110	 * Note that this interface is not idempotent; the entire answer must
2111	 * be generated ahead of time.
2112	 */
2113	valsize = min(len, sopt->sopt_valsize);
2114	sopt->sopt_valsize = valsize;
2115	if (sopt->sopt_val != NULL) {
2116		if (sopt->sopt_td != NULL)
2117			error = copyout(buf, sopt->sopt_val, valsize);
2118		else
2119			bcopy(buf, sopt->sopt_val, valsize);
2120	}
2121	return (error);
2122}
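/*
 * Example (illustrative sketch): the matching SOPT_GET path in a
 * protocol's pr_ctloutput() just hands the completed value to
 * sooptcopyout(); if the caller supplied a short buffer the result is
 * silently truncated, per the comment above.  The pcb field name is
 * hypothetical.
 */
#if 0
	int optval;

	optval = pcb->pcb_foo;			/* hypothetical pcb field */
	return (sooptcopyout(sopt, &optval, sizeof(optval)));
#endif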
2123
2124int
2125sogetopt(so, sopt)
2126	struct socket *so;
2127	struct sockopt *sopt;
2128{
2129	int	error, optval;
2130	struct	linger l;
2131	struct	timeval tv;
2132#ifdef MAC
2133	struct mac extmac;
2134#endif
2135
2136	error = 0;
2137	if (sopt->sopt_level != SOL_SOCKET) {
2138		if (so->so_proto && so->so_proto->pr_ctloutput) {
2139			return ((*so->so_proto->pr_ctloutput)
2140				  (so, sopt));
2141		} else
2142			return (ENOPROTOOPT);
2143	} else {
2144		switch (sopt->sopt_name) {
2145#ifdef INET
2146		case SO_ACCEPTFILTER:
2147			error = do_getopt_accept_filter(so, sopt);
2148			break;
2149#endif
2150		case SO_LINGER:
2151			SOCK_LOCK(so);
2152			l.l_onoff = so->so_options & SO_LINGER;
2153			l.l_linger = so->so_linger;
2154			SOCK_UNLOCK(so);
2155			error = sooptcopyout(sopt, &l, sizeof l);
2156			break;
2157
2158		case SO_USELOOPBACK:
2159		case SO_DONTROUTE:
2160		case SO_DEBUG:
2161		case SO_KEEPALIVE:
2162		case SO_REUSEADDR:
2163		case SO_REUSEPORT:
2164		case SO_BROADCAST:
2165		case SO_OOBINLINE:
2166		case SO_ACCEPTCONN:
2167		case SO_TIMESTAMP:
2168		case SO_BINTIME:
2169		case SO_NOSIGPIPE:
2170			optval = so->so_options & sopt->sopt_name;
2171integer:
2172			error = sooptcopyout(sopt, &optval, sizeof optval);
2173			break;
2174
2175		case SO_TYPE:
2176			optval = so->so_type;
2177			goto integer;
2178
2179		case SO_ERROR:
2180			SOCK_LOCK(so);
2181			optval = so->so_error;
2182			so->so_error = 0;
2183			SOCK_UNLOCK(so);
2184			goto integer;
2185
2186		case SO_SNDBUF:
2187			optval = so->so_snd.sb_hiwat;
2188			goto integer;
2189
2190		case SO_RCVBUF:
2191			optval = so->so_rcv.sb_hiwat;
2192			goto integer;
2193
2194		case SO_SNDLOWAT:
2195			optval = so->so_snd.sb_lowat;
2196			goto integer;
2197
2198		case SO_RCVLOWAT:
2199			optval = so->so_rcv.sb_lowat;
2200			goto integer;
2201
2202		case SO_SNDTIMEO:
2203		case SO_RCVTIMEO:
2204			optval = (sopt->sopt_name == SO_SNDTIMEO ?
2205				  so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
2206
2207			tv.tv_sec = optval / hz;
2208			tv.tv_usec = (optval % hz) * tick;
2209#ifdef COMPAT_IA32
2210			if (curthread->td_proc->p_sysent == &ia32_freebsd_sysvec) {
2211				struct timeval32 tv32;
2212
2213				CP(tv, tv32, tv_sec);
2214				CP(tv, tv32, tv_usec);
2215				error = sooptcopyout(sopt, &tv32, sizeof tv32);
2216			} else
2217#endif
2218				error = sooptcopyout(sopt, &tv, sizeof tv);
2219			break;
2220
2221		case SO_LABEL:
2222#ifdef MAC
2223			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2224			    sizeof(extmac));
2225			if (error)
2226				return (error);
2227			error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
2228			    so, &extmac);
2229			if (error)
2230				return (error);
2231			error = sooptcopyout(sopt, &extmac, sizeof extmac);
2232#else
2233			error = EOPNOTSUPP;
2234#endif
2235			break;
2236
2237		case SO_PEERLABEL:
2238#ifdef MAC
2239			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2240			    sizeof(extmac));
2241			if (error)
2242				return (error);
2243			error = mac_getsockopt_peerlabel(
2244			    sopt->sopt_td->td_ucred, so, &extmac);
2245			if (error)
2246				return (error);
2247			error = sooptcopyout(sopt, &extmac, sizeof extmac);
2248#else
2249			error = EOPNOTSUPP;
2250#endif
2251			break;
2252
2253		case SO_LISTENQLIMIT:
2254			optval = so->so_qlimit;
2255			goto integer;
2256
2257		case SO_LISTENQLEN:
2258			optval = so->so_qlen;
2259			goto integer;
2260
2261		case SO_LISTENINCQLEN:
2262			optval = so->so_incqlen;
2263			goto integer;
2264
2265		default:
2266			error = ENOPROTOOPT;
2267			break;
2268		}
2269		return (error);
2270	}
2271}
2272
2273/* XXX: prepare mbuf for (__FreeBSD__ < 3) routines. */
2274int
2275soopt_getm(struct sockopt *sopt, struct mbuf **mp)
2276{
2277	struct mbuf *m, *m_prev;
2278	int sopt_size = sopt->sopt_valsize;
2279
2280	MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
2281	if (m == NULL)
2282		return ENOBUFS;
2283	if (sopt_size > MLEN) {
2284		MCLGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT);
2285		if ((m->m_flags & M_EXT) == 0) {
2286			m_free(m);
2287			return ENOBUFS;
2288		}
2289		m->m_len = min(MCLBYTES, sopt_size);
2290	} else {
2291		m->m_len = min(MLEN, sopt_size);
2292	}
2293	sopt_size -= m->m_len;
2294	*mp = m;
2295	m_prev = m;
2296
2297	while (sopt_size) {
2298		MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
2299		if (m == NULL) {
2300			m_freem(*mp);
2301			return ENOBUFS;
2302		}
2303		if (sopt_size > MLEN) {
2304			MCLGET(m, sopt->sopt_td != NULL ? M_TRYWAIT :
2305			    M_DONTWAIT);
2306			if ((m->m_flags & M_EXT) == 0) {
2307				m_freem(m);
2308				m_freem(*mp);
2309				return ENOBUFS;
2310			}
2311			m->m_len = min(MCLBYTES, sopt_size);
2312		} else {
2313			m->m_len = min(MLEN, sopt_size);
2314		}
2315		sopt_size -= m->m_len;
2316		m_prev->m_next = m;
2317		m_prev = m;
2318	}
2319	return (0);
2320}
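/*
 * Example (illustrative sketch): soopt_getm() above and the two copy
 * routines below are used together by the older mbuf-based option code
 * (notably the IPv6 ctloutput path).  A rough outline of the two
 * directions, with error handling and the protocol-specific step elided:
 */
#if 0
	struct mbuf *m = NULL;
	int error;

	/* SOPT_SET: pull the caller's option value into an mbuf chain. */
	error = soopt_getm(sopt, &m);		/* chain sized to sopt_valsize */
	if (error == 0)
		error = soopt_mcopyin(sopt, m);	/* user bytes -> mbuf chain */

	/*
	 * SOPT_GET (a separate call, not a continuation of the above): copy
	 * a protocol-built chain back out to the caller's buffer.
	 */
	error = soopt_mcopyout(sopt, m);
#endif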
2321
2322/* XXX: copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
2323int
2324soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
2325{
2326	struct mbuf *m0 = m;
2327
2328	if (sopt->sopt_val == NULL)
2329		return (0);
2330	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2331		if (sopt->sopt_td != NULL) {
2332			int error;
2333
2334			error = copyin(sopt->sopt_val, mtod(m, char *),
2335				       m->m_len);
2336			if (error != 0) {
2337				m_freem(m0);
2338				return (error);
2339			}
2340		} else
2341			bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
2342		sopt->sopt_valsize -= m->m_len;
2343		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2344		m = m->m_next;
2345	}
2346	if (m != NULL) /* should have been allocated large enough at ip6_sooptmcopyin() */
2347		panic("ip6_sooptmcopyin");
2348	return (0);
2349}
2350
2351/* XXX: copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
2352int
2353soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
2354{
2355	struct mbuf *m0 = m;
2356	size_t valsize = 0;
2357
2358	if (sopt->sopt_val == NULL)
2359		return (0);
2360	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2361		if (sopt->sopt_td != NULL) {
2362			int error;
2363
2364			error = copyout(mtod(m, char *), sopt->sopt_val,
2365				       m->m_len);
2366			if (error != 0) {
2367				m_freem(m0);
2368				return (error);
2369			}
2370		} else
2371			bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
2372		sopt->sopt_valsize -= m->m_len;
2373		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2374		valsize += m->m_len;
2375		m = m->m_next;
2376	}
2377	if (m != NULL) {
2378		/* the soopt buffer supplied from user-land should be large enough */
2379		m_freem(m0);
2380		return (EINVAL);
2381	}
2382	sopt->sopt_valsize = valsize;
2383	return (0);
2384}
2385
2386void
2387sohasoutofband(so)
2388	struct socket *so;
2389{
2390	if (so->so_sigio != NULL)
2391		pgsigio(&so->so_sigio, SIGURG, 0);
2392	selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
2393}
2394
2395int
2396sopoll(struct socket *so, int events, struct ucred *active_cred,
2397    struct thread *td)
2398{
2399	int revents = 0;
2400
2401	SOCKBUF_LOCK(&so->so_snd);
2402	SOCKBUF_LOCK(&so->so_rcv);
2403	if (events & (POLLIN | POLLRDNORM))
2404		if (soreadable(so))
2405			revents |= events & (POLLIN | POLLRDNORM);
2406
2407	if (events & POLLINIGNEOF)
2408		if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat ||
2409		    !TAILQ_EMPTY(&so->so_comp) || so->so_error)
2410			revents |= POLLINIGNEOF;
2411
2412	if (events & (POLLOUT | POLLWRNORM))
2413		if (sowriteable(so))
2414			revents |= events & (POLLOUT | POLLWRNORM);
2415
2416	if (events & (POLLPRI | POLLRDBAND))
2417		if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK))
2418			revents |= events & (POLLPRI | POLLRDBAND);
2419
2420	if (revents == 0) {
2421		if (events &
2422		    (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM |
2423		     POLLRDBAND)) {
2424			selrecord(td, &so->so_rcv.sb_sel);
2425			so->so_rcv.sb_flags |= SB_SEL;
2426		}
2427
2428		if (events & (POLLOUT | POLLWRNORM)) {
2429			selrecord(td, &so->so_snd.sb_sel);
2430			so->so_snd.sb_flags |= SB_SEL;
2431		}
2432	}
2433
2434	SOCKBUF_UNLOCK(&so->so_rcv);
2435	SOCKBUF_UNLOCK(&so->so_snd);
2436	return (revents);
2437}
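/*
 * Example (illustrative sketch, user-space code): the checks above are
 * what poll(2) reports for a socket descriptor "s"; POLLPRI corresponds
 * to the so_oobmark / SBS_RCVATMARK test.  Needs <poll.h>.
 */
#if 0
	struct pollfd pfd;

	pfd.fd = s;
	pfd.events = POLLIN | POLLPRI;
	if (poll(&pfd, 1, 5000) > 0 && (pfd.revents & POLLPRI) != 0)
		;	/* urgent data pending or we are at the OOB mark */
#endif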
2438
2439int
2440soo_kqfilter(struct file *fp, struct knote *kn)
2441{
2442	struct socket *so = kn->kn_fp->f_data;
2443	struct sockbuf *sb;
2444
2445	switch (kn->kn_filter) {
2446	case EVFILT_READ:
2447		if (so->so_options & SO_ACCEPTCONN)
2448			kn->kn_fop = &solisten_filtops;
2449		else
2450			kn->kn_fop = &soread_filtops;
2451		sb = &so->so_rcv;
2452		break;
2453	case EVFILT_WRITE:
2454		kn->kn_fop = &sowrite_filtops;
2455		sb = &so->so_snd;
2456		break;
2457	default:
2458		return (EINVAL);
2459	}
2460
2461	SOCKBUF_LOCK(sb);
2462	knlist_add(&sb->sb_sel.si_note, kn, 1);
2463	sb->sb_flags |= SB_KNOTE;
2464	SOCKBUF_UNLOCK(sb);
2465	return (0);
2466}
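/*
 * Example (illustrative sketch, user-space code): registering EVFILT_READ
 * on a listening socket "s" selects solisten_filtops above, and a
 * triggered event reports the completed-connection queue length in
 * kev.data (see filt_solisten() below).  Needs <sys/event.h>.
 */
#if 0
	struct kevent kev;
	int kq = kqueue();

	EV_SET(&kev, s, EVFILT_READ, EV_ADD, 0, 0, NULL);
	(void)kevent(kq, &kev, 1, NULL, 0, NULL);	/* register */
	if (kevent(kq, NULL, 0, &kev, 1, NULL) > 0)
		;	/* kev.data == connections ready to accept */
#endif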
2467
2468static void
2469filt_sordetach(struct knote *kn)
2470{
2471	struct socket *so = kn->kn_fp->f_data;
2472
2473	SOCKBUF_LOCK(&so->so_rcv);
2474	knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1);
2475	if (knlist_empty(&so->so_rcv.sb_sel.si_note))
2476		so->so_rcv.sb_flags &= ~SB_KNOTE;
2477	SOCKBUF_UNLOCK(&so->so_rcv);
2478}
2479
2480/*ARGSUSED*/
2481static int
2482filt_soread(struct knote *kn, long hint)
2483{
2484	struct socket *so;
2485
2486	so = kn->kn_fp->f_data;
2487	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2488
2489	kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
2490	if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
2491		kn->kn_flags |= EV_EOF;
2492		kn->kn_fflags = so->so_error;
2493		return (1);
2494	} else if (so->so_error)	/* temporary udp error */
2495		return (1);
2496	else if (kn->kn_sfflags & NOTE_LOWAT)
2497		return (kn->kn_data >= kn->kn_sdata);
2498	else
2499		return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat);
2500}
2501
2502static void
2503filt_sowdetach(struct knote *kn)
2504{
2505	struct socket *so = kn->kn_fp->f_data;
2506
2507	SOCKBUF_LOCK(&so->so_snd);
2508	knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1);
2509	if (knlist_empty(&so->so_snd.sb_sel.si_note))
2510		so->so_snd.sb_flags &= ~SB_KNOTE;
2511	SOCKBUF_UNLOCK(&so->so_snd);
2512}
2513
2514/*ARGSUSED*/
2515static int
2516filt_sowrite(struct knote *kn, long hint)
2517{
2518	struct socket *so;
2519
2520	so = kn->kn_fp->f_data;
2521	SOCKBUF_LOCK_ASSERT(&so->so_snd);
2522	kn->kn_data = sbspace(&so->so_snd);
2523	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
2524		kn->kn_flags |= EV_EOF;
2525		kn->kn_fflags = so->so_error;
2526		return (1);
2527	} else if (so->so_error)	/* temporary udp error */
2528		return (1);
2529	else if (((so->so_state & SS_ISCONNECTED) == 0) &&
2530	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
2531		return (0);
2532	else if (kn->kn_sfflags & NOTE_LOWAT)
2533		return (kn->kn_data >= kn->kn_sdata);
2534	else
2535		return (kn->kn_data >= so->so_snd.sb_lowat);
2536}
2537
2538/*ARGSUSED*/
2539static int
2540filt_solisten(struct knote *kn, long hint)
2541{
2542	struct socket *so = kn->kn_fp->f_data;
2543
2544	kn->kn_data = so->so_qlen;
2545	return (! TAILQ_EMPTY(&so->so_comp));
2546}
2547
2548int
2549socheckuid(struct socket *so, uid_t uid)
2550{
2551
2552	if (so == NULL)
2553		return (EPERM);
2554	if (so->so_cred->cr_uid != uid)
2555		return (EPERM);
2556	return (0);
2557}
2558
2559static int
2560somaxconn_sysctl(SYSCTL_HANDLER_ARGS)
2561{
2562	int error;
2563	int val;
2564
2565	val = somaxconn;
2566	error = sysctl_handle_int(oidp, &val, sizeof(int), req);
2567	if (error || !req->newptr)
2568		return (error);
2569
2570	if (val < 1 || val > USHRT_MAX)
2571		return (EINVAL);
2572
2573	somaxconn = val;
2574	return (0);
2575}
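/*
 * Example (illustrative sketch): this handler backs the kern.ipc.somaxconn
 * sysctl; a user-space program would update it through sysctlbyname(3)
 * (or sysctl(8) from the command line), and out-of-range values are
 * rejected with EINVAL as above.  Needs <sys/types.h>, <sys/sysctl.h>,
 * and <stdio.h>.
 */
#if 0
	int val = 1024;

	if (sysctlbyname("kern.ipc.somaxconn", NULL, NULL, &val,
	    sizeof(val)) == -1)
		perror("sysctlbyname");
#endif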
2576