1/*
2 * Copyright (c) 1998-2013 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29/*
30 * Copyright (c) 1982, 1986, 1988, 1990, 1993
31 *	The Regents of the University of California.  All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 *    notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 *    notice, this list of conditions and the following disclaimer in the
40 *    documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 *    must display the following acknowledgement:
43 *	This product includes software developed by the University of
44 *	California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 *    may be used to endorse or promote products derived from this software
47 *    without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
62 */
63/*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections.  This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70#include <sys/param.h>
71#include <sys/systm.h>
72#include <sys/filedesc.h>
73#include <sys/proc.h>
74#include <sys/proc_internal.h>
75#include <sys/kauth.h>
76#include <sys/file_internal.h>
77#include <sys/fcntl.h>
78#include <sys/malloc.h>
79#include <sys/mbuf.h>
80#include <sys/domain.h>
81#include <sys/kernel.h>
82#include <sys/event.h>
83#include <sys/poll.h>
84#include <sys/protosw.h>
85#include <sys/socket.h>
86#include <sys/socketvar.h>
87#include <sys/resourcevar.h>
88#include <sys/signalvar.h>
89#include <sys/sysctl.h>
90#include <sys/syslog.h>
91#include <sys/uio.h>
92#include <sys/ev.h>
93#include <sys/kdebug.h>
94#include <sys/un.h>
95#include <sys/user.h>
96#include <sys/priv.h>
97#include <sys/kern_event.h>
98#include <net/route.h>
99#include <net/init.h>
100#include <net/ntstat.h>
101#include <netinet/in.h>
102#include <netinet/in_pcb.h>
103#include <netinet/ip6.h>
104#include <netinet6/ip6_var.h>
105#include <netinet/flow_divert.h>
106#include <kern/zalloc.h>
107#include <kern/locks.h>
108#include <machine/limits.h>
109#include <libkern/OSAtomic.h>
110#include <pexpert/pexpert.h>
111#include <kern/assert.h>
112#include <kern/task.h>
113#include <sys/kpi_mbuf.h>
114#include <sys/mcache.h>
115
116#if CONFIG_MACF
117#include <security/mac.h>
118#include <security/mac_framework.h>
119#endif /* MAC */
120
121#if MULTIPATH
122#include <netinet/mp_pcb.h>
123#endif /* MULTIPATH */
124
125/* TODO: this should be in a header file somewhere */
126extern char *proc_name_address(void *p);
127
128static u_int32_t	so_cache_hw;	/* High water mark for socache */
129static u_int32_t	so_cache_timeouts;	/* number of timeouts */
130static u_int32_t	so_cache_max_freed;	/* max freed per timeout */
131static u_int32_t	cached_sock_count = 0;
132STAILQ_HEAD(, socket)	so_cache_head;
133int	max_cached_sock_count = MAX_CACHED_SOCKETS;
134static u_int32_t	so_cache_time;
135static int		socketinit_done;
136static struct zone	*so_cache_zone;
137
138static lck_grp_t	*so_cache_mtx_grp;
139static lck_attr_t	*so_cache_mtx_attr;
140static lck_grp_attr_t	*so_cache_mtx_grp_attr;
141static lck_mtx_t	*so_cache_mtx;
142
143#include <machine/limits.h>
144
145static void	filt_sordetach(struct knote *kn);
146static int	filt_soread(struct knote *kn, long hint);
147static void	filt_sowdetach(struct knote *kn);
148static int	filt_sowrite(struct knote *kn, long hint);
149static void	filt_sockdetach(struct knote *kn);
150static int	filt_sockev(struct knote *kn, long hint);
151
152static int sooptcopyin_timeval(struct sockopt *, struct timeval *);
153static int sooptcopyout_timeval(struct sockopt *, const struct timeval *);
154
155static struct filterops soread_filtops = {
156	.f_isfd = 1,
157	.f_detach = filt_sordetach,
158	.f_event = filt_soread,
159};
160
161static struct filterops sowrite_filtops = {
162	.f_isfd = 1,
163	.f_detach = filt_sowdetach,
164	.f_event = filt_sowrite,
165};
166
167static struct filterops sock_filtops = {
168	.f_isfd = 1,
169	.f_detach = filt_sockdetach,
170	.f_event = filt_sockev,
171};
172
173#define	EVEN_MORE_LOCKING_DEBUG 0
174int socket_debug = 0;
175static int socket_zone = M_SOCKET;
176so_gen_t	so_gencnt;	/* generation count for sockets */
177
178MALLOC_DEFINE(M_SONAME, "soname", "socket name");
179MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
180
181#define	DBG_LAYER_IN_BEG	NETDBG_CODE(DBG_NETSOCK, 0)
182#define	DBG_LAYER_IN_END	NETDBG_CODE(DBG_NETSOCK, 2)
183#define	DBG_LAYER_OUT_BEG	NETDBG_CODE(DBG_NETSOCK, 1)
184#define	DBG_LAYER_OUT_END	NETDBG_CODE(DBG_NETSOCK, 3)
185#define	DBG_FNC_SOSEND		NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
186#define	DBG_FNC_SORECEIVE	NETDBG_CODE(DBG_NETSOCK, (8 << 8))
187#define	DBG_FNC_SOSHUTDOWN	NETDBG_CODE(DBG_NETSOCK, (9 << 8))
188
189#define	MAX_SOOPTGETM_SIZE	(128 * MCLBYTES)
190
191SYSCTL_DECL(_kern_ipc);
192
193int somaxconn = SOMAXCONN;
194SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
195	CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");
196
197/* Should we get a maximum also ??? */
198static int sosendmaxchain = 65536;
199static int sosendminchain = 16384;
200static int sorecvmincopy  = 16384;
201SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
202	CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
203SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
204	CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");
205
206/*
207 * Set to enable jumbo clusters (if available) for large writes when
208 * the socket is marked with SOF_MULTIPAGES; see below.
209 */
210int sosendjcl = 1;
211SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl,
212	CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");
213
214/*
215 * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
216 * writes on the socket for all protocols on any network interfaces,
217 * depending upon sosendjcl above.  Be extra careful when setting this
218 * to 1, because sending down packets that cross physical pages down to
219 * broken drivers (those that falsely assume that the physical pages
220 * are contiguous) might lead to system panics or silent data corruption.
221 * When set to 0, the system will respect SOF_MULTIPAGES, which is set
222 * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
223 * capable.  Set this to 1 only for testing/debugging purposes.
224 */
225int sosendjcl_ignore_capab = 0;
226SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab,
227	CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, "");
228
229int sodefunctlog = 0;
230SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
231	&sodefunctlog, 0, "");
232
233int sothrottlelog = 0;
234SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
235	&sothrottlelog, 0, "");
236
237int sorestrictrecv = 1;
238SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
239	&sorestrictrecv, 0, "Enable inbound interface restrictions");
240
241/*
242 * Socket operation routines.
243 * These routines are called by the routines in
244 * sys_socket.c or from a system process, and
245 * implement the semantics of socket operations by
246 * switching out to the protocol specific routines.
247 */
248
249/* sys_generic.c */
250extern void postevent(struct socket *, struct sockbuf *, int);
251extern void evsofree(struct socket *);
252extern int tcp_notsent_lowat_check(struct socket *so);
253extern struct inpcbinfo tcbinfo;
254
255/* TODO: these should be in header file */
256extern int get_inpcb_str_size(void);
257extern int get_tcp_str_size(void);
258
259static unsigned int sl_zone_size;		/* size of sockaddr_list */
260static struct zone *sl_zone;			/* zone for sockaddr_list */
261
262static unsigned int se_zone_size;		/* size of sockaddr_entry */
263static struct zone *se_zone;			/* zone for sockaddr_entry */
264
265vm_size_t	so_cache_zone_element_size;
266
267static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **, user_ssize_t *);
268static void cached_sock_alloc(struct socket **, int);
269static void cached_sock_free(struct socket *);
270
271/*
272 * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
273 * setting the DSCP code on the packet based on the service class; see
274 * <rdar://problem/11277343> for details.
275 */
276__private_extern__ u_int32_t sotcdb = SOTCDB_NO_DSCP;
277SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
278	&sotcdb, 0, "");
279
280void
281socketinit(void)
282{
283	if (socketinit_done) {
284		printf("socketinit: already called...\n");
285		return;
286	}
287	socketinit_done = 1;
288
289	PE_parse_boot_argn("socket_debug", &socket_debug,
290	    sizeof (socket_debug));
291
292	/*
293	 * allocate lock group attribute and group for socket cache mutex
294	 */
295	so_cache_mtx_grp_attr = lck_grp_attr_alloc_init();
296	so_cache_mtx_grp = lck_grp_alloc_init("so_cache",
297	    so_cache_mtx_grp_attr);
298
299	/*
300	 * allocate the lock attribute for socket cache mutex
301	 */
302	so_cache_mtx_attr = lck_attr_alloc_init();
303
304	/* cached sockets mutex */
305	so_cache_mtx = lck_mtx_alloc_init(so_cache_mtx_grp, so_cache_mtx_attr);
306	if (so_cache_mtx == NULL) {
307		panic("%s: unable to allocate so_cache_mtx\n", __func__);
308		/* NOTREACHED */
309	}
310	STAILQ_INIT(&so_cache_head);
311
312	so_cache_zone_element_size = (vm_size_t)(sizeof (struct socket) + 4
313	    + get_inpcb_str_size() + 4 + get_tcp_str_size());
314
315	so_cache_zone = zinit(so_cache_zone_element_size,
316	    (120000 * so_cache_zone_element_size), 8192, "socache zone");
317	zone_change(so_cache_zone, Z_CALLERACCT, FALSE);
318	zone_change(so_cache_zone, Z_NOENCRYPT, TRUE);
319
320	sl_zone_size = sizeof (struct sockaddr_list);
321	if ((sl_zone = zinit(sl_zone_size, 1024 * sl_zone_size, 1024,
322	    "sockaddr_list")) == NULL) {
323		panic("%s: unable to allocate sockaddr_list zone\n", __func__);
324		/* NOTREACHED */
325	}
326	zone_change(sl_zone, Z_CALLERACCT, FALSE);
327	zone_change(sl_zone, Z_EXPAND, TRUE);
328
329	se_zone_size = sizeof (struct sockaddr_entry);
330	if ((se_zone = zinit(se_zone_size, 1024 * se_zone_size, 1024,
331	    "sockaddr_entry")) == NULL) {
332		panic("%s: unable to allocate sockaddr_entry zone\n", __func__);
333		/* NOTREACHED */
334	}
335	zone_change(se_zone, Z_CALLERACCT, FALSE);
336	zone_change(se_zone, Z_EXPAND, TRUE);
337
338
339	in_pcbinit();
340	sflt_init();
341	socket_tclass_init();
342#if MULTIPATH
343	mp_pcbinit();
344#endif /* MULTIPATH */
345}
346
347static void
348cached_sock_alloc(struct socket **so, int waitok)
349{
350	caddr_t	temp;
351	uintptr_t offset;
352
353	lck_mtx_lock(so_cache_mtx);
354
355	if (!STAILQ_EMPTY(&so_cache_head)) {
356		VERIFY(cached_sock_count > 0);
357
358		*so = STAILQ_FIRST(&so_cache_head);
359		STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
360		STAILQ_NEXT((*so), so_cache_ent) = NULL;
361
362		cached_sock_count--;
363		lck_mtx_unlock(so_cache_mtx);
364
365		temp = (*so)->so_saved_pcb;
366		bzero((caddr_t)*so, sizeof (struct socket));
367
368		(*so)->so_saved_pcb = temp;
369	} else {
370
371		lck_mtx_unlock(so_cache_mtx);
372
373		if (waitok)
374			*so = (struct socket *)zalloc(so_cache_zone);
375		else
376			*so = (struct socket *)zalloc_noblock(so_cache_zone);
377
378		if (*so == NULL)
379			return;
380
381		bzero((caddr_t)*so, sizeof (struct socket));
382
383		/*
384		 * Define offsets for extra structures into our
385		 * single block of memory. Align extra structures
386		 * on longword boundaries.
387		 */
388
389		offset = (uintptr_t)*so;
390		offset += sizeof (struct socket);
391
392		offset = ALIGN(offset);
393
394		(*so)->so_saved_pcb = (caddr_t)offset;
395		offset += get_inpcb_str_size();
396
397		offset = ALIGN(offset);
398
399		((struct inpcb *)(void *)(*so)->so_saved_pcb)->inp_saved_ppcb =
400		    (caddr_t)offset;
401	}
402
403	(*so)->cached_in_sock_layer = true;
404}
405
406static void
407cached_sock_free(struct socket *so)
408{
409
410	lck_mtx_lock(so_cache_mtx);
411
412	so_cache_time = net_uptime();
413	if (++cached_sock_count > max_cached_sock_count) {
414		--cached_sock_count;
415		lck_mtx_unlock(so_cache_mtx);
416		zfree(so_cache_zone, so);
417	} else {
418		if (so_cache_hw < cached_sock_count)
419			so_cache_hw = cached_sock_count;
420
421		STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent);
422
423		so->cache_timestamp = so_cache_time;
424		lck_mtx_unlock(so_cache_mtx);
425	}
426}
427
428void
429so_update_last_owner_locked(struct socket *so, proc_t self)
430{
431	if (so->last_pid != 0) {
432		/*
433		 * last_pid and last_upid should remain zero for sockets
434		 * created using sock_socket. The check above achieves that
435		 */
436		if (self == PROC_NULL)
437			self = current_proc();
438
439		if (so->last_upid != proc_uniqueid(self) ||
440		    so->last_pid != proc_pid(self)) {
441			so->last_upid = proc_uniqueid(self);
442			so->last_pid = proc_pid(self);
443			proc_getexecutableuuid(self, so->last_uuid,
444			    sizeof (so->last_uuid));
445		}
446	}
447}
448
449void
450so_update_policy(struct socket *so)
451{
452	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6)
453		(void) inp_update_policy(sotoinpcb(so));
454}
455
456boolean_t
457so_cache_timer(void)
458{
459	struct socket	*p;
460	int		n_freed = 0;
461	boolean_t rc = FALSE;
462
463	lck_mtx_lock(so_cache_mtx);
464	so_cache_timeouts++;
465	so_cache_time = net_uptime();
466
467	while (!STAILQ_EMPTY(&so_cache_head)) {
468		VERIFY(cached_sock_count > 0);
469		p = STAILQ_FIRST(&so_cache_head);
470		if ((so_cache_time - p->cache_timestamp) <
471			SO_CACHE_TIME_LIMIT)
472			break;
473
474		STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
475		--cached_sock_count;
476
477		zfree(so_cache_zone, p);
478
479		if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
480			so_cache_max_freed++;
481			break;
482		}
483	}
484
485	/* Schedule again if there is more to cleanup */
486	if (!STAILQ_EMPTY(&so_cache_head))
487		rc = TRUE;
488
489	lck_mtx_unlock(so_cache_mtx);
490	return (rc);
491}
492
493/*
494 * Get a socket structure from our zone, and initialize it.
495 * We don't implement `waitok' yet (see comments in uipc_domain.c).
496 * Note that it would probably be better to allocate socket
497 * and PCB at the same time, but I'm not convinced that all
498 * the protocols can be easily modified to do this.
499 */
500struct socket *
501soalloc(int waitok, int dom, int type)
502{
503	struct socket *so;
504
505	if ((dom == PF_INET) && (type == SOCK_STREAM)) {
506		cached_sock_alloc(&so, waitok);
507	} else {
508		MALLOC_ZONE(so, struct socket *, sizeof (*so), socket_zone,
509		    M_WAITOK);
510		if (so != NULL)
511			bzero(so, sizeof (*so));
512	}
513	if (so != NULL) {
514		so->so_gencnt = ++so_gencnt;
515		so->so_zone = socket_zone;
516#if CONFIG_MACF_SOCKET
517		/* Convert waitok to  M_WAITOK/M_NOWAIT for MAC Framework. */
518		if (mac_socket_label_init(so, !waitok) != 0) {
519			sodealloc(so);
520			return (NULL);
521		}
522#endif /* MAC_SOCKET */
523	}
524
525	return (so);
526}
527
528int
529socreate_internal(int dom, struct socket **aso, int type, int proto,
530    struct proc *p, uint32_t flags, struct proc *ep)
531{
532	struct protosw *prp;
533	struct socket *so;
534	int error = 0;
535
536#if TCPDEBUG
537	extern int tcpconsdebug;
538#endif
539
540	VERIFY(aso != NULL);
541	*aso = NULL;
542
543	if (proto != 0)
544		prp = pffindproto(dom, proto, type);
545	else
546		prp = pffindtype(dom, type);
547
548	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) {
549		if (pffinddomain(dom) == NULL)
550			return (EAFNOSUPPORT);
551		if (proto != 0) {
552			if (pffindprotonotype(dom, proto) != NULL)
553				return (EPROTOTYPE);
554		}
555		return (EPROTONOSUPPORT);
556	}
557	if (prp->pr_type != type)
558		return (EPROTOTYPE);
559	so = soalloc(1, dom, type);
560	if (so == NULL)
561		return (ENOBUFS);
562
563	if (flags & SOCF_ASYNC)
564		so->so_state |= SS_NBIO;
565#if MULTIPATH
566	if (flags & SOCF_MP_SUBFLOW) {
567		/*
568		 * A multipath subflow socket is used internally in the kernel,
569		 * therefore it does not have a file desciptor associated by
570		 * default.
571		 */
572		so->so_state |= SS_NOFDREF;
573		so->so_flags |= SOF_MP_SUBFLOW;
574	}
575#endif /* MULTIPATH */
576
577	TAILQ_INIT(&so->so_incomp);
578	TAILQ_INIT(&so->so_comp);
579	so->so_type = type;
580	so->last_upid = proc_uniqueid(p);
581	so->last_pid = proc_pid(p);
582	proc_getexecutableuuid(p, so->last_uuid, sizeof (so->last_uuid));
583
584	if (ep != PROC_NULL && ep != p) {
585		so->e_upid = proc_uniqueid(ep);
586		so->e_pid = proc_pid(ep);
587		proc_getexecutableuuid(ep, so->e_uuid, sizeof (so->e_uuid));
588		so->so_flags |= SOF_DELEGATED;
589	}
590
591	so->so_cred = kauth_cred_proc_ref(p);
592	if (!suser(kauth_cred_get(), NULL))
593		so->so_state |= SS_PRIV;
594
595	so->so_proto = prp;
596	so->so_rcv.sb_flags |= SB_RECV;
597	so->so_rcv.sb_so = so->so_snd.sb_so = so;
598	so->next_lock_lr = 0;
599	so->next_unlock_lr = 0;
600
601#if CONFIG_MACF_SOCKET
602	mac_socket_label_associate(kauth_cred_get(), so);
603#endif /* MAC_SOCKET */
604
605	/*
606	 * Attachment will create the per pcb lock if necessary and
607	 * increase refcount for creation, make sure it's done before
608	 * socket is inserted in lists.
609	 */
610	so->so_usecount++;
611
612	error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
613	if (error != 0) {
614		/*
615		 * Warning:
616		 * If so_pcb is not zero, the socket will be leaked,
617		 * so protocol attachment handler must be coded carefuly
618		 */
619		so->so_state |= SS_NOFDREF;
620		so->so_usecount--;
621		sofreelastref(so, 1);	/* will deallocate the socket */
622		return (error);
623	}
624
625	atomic_add_32(&prp->pr_domain->dom_refs, 1);
626	TAILQ_INIT(&so->so_evlist);
627
628	/* Attach socket filters for this protocol */
629	sflt_initsock(so);
630#if TCPDEBUG
631	if (tcpconsdebug == 2)
632		so->so_options |= SO_DEBUG;
633#endif
634	so_set_default_traffic_class(so);
635
636	/*
637	 * If this thread or task is marked to create backgrounded sockets,
638	 * mark the socket as background.
639	 */
640	if (proc_get_effective_thread_policy(current_thread(), TASK_POLICY_NEW_SOCKETS_BG)) {
641		socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
642		so->so_background_thread = current_thread();
643	}
644
645	switch (dom) {
646	/*
647	 * Don't mark Unix domain, system or multipath sockets as
648	 * eligible for defunct by default.
649	 */
650	case PF_LOCAL:
651	case PF_SYSTEM:
652	case PF_MULTIPATH:
653		so->so_flags |= SOF_NODEFUNCT;
654		break;
655	default:
656		break;
657	}
658
659	*aso = so;
660
661	return (0);
662}
663
664/*
665 * Returns:	0			Success
666 *		EAFNOSUPPORT
667 *		EPROTOTYPE
668 *		EPROTONOSUPPORT
669 *		ENOBUFS
670 *	<pru_attach>:ENOBUFS[AF_UNIX]
671 *	<pru_attach>:ENOBUFS[TCP]
672 *	<pru_attach>:ENOMEM[TCP]
673 *	<pru_attach>:???		[other protocol families, IPSEC]
674 */
675int
676socreate(int dom, struct socket **aso, int type, int proto)
677{
678	return (socreate_internal(dom, aso, type, proto, current_proc(), 0,
679	    PROC_NULL));
680}
681
682int
683socreate_delegate(int dom, struct socket **aso, int type, int proto, pid_t epid)
684{
685	int error = 0;
686	struct proc *ep = PROC_NULL;
687
688	if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) {
689		error = ESRCH;
690		goto done;
691	}
692
693	error = socreate_internal(dom, aso, type, proto, current_proc(), 0, ep);
694
695	/*
696	 * It might not be wise to hold the proc reference when calling
697	 * socreate_internal since it calls soalloc with M_WAITOK
698	 */
699done:
700	if (ep != PROC_NULL)
701		proc_rele(ep);
702
703	return (error);
704}
705
706/*
707 * Returns:	0			Success
708 *	<pru_bind>:EINVAL		Invalid argument [COMMON_START]
709 *	<pru_bind>:EAFNOSUPPORT		Address family not supported
710 *	<pru_bind>:EADDRNOTAVAIL	Address not available.
711 *	<pru_bind>:EINVAL		Invalid argument
712 *	<pru_bind>:EAFNOSUPPORT		Address family not supported [notdef]
713 *	<pru_bind>:EACCES		Permission denied
714 *	<pru_bind>:EADDRINUSE		Address in use
715 *	<pru_bind>:EAGAIN		Resource unavailable, try again
716 *	<pru_bind>:EPERM		Operation not permitted
717 *	<pru_bind>:???
718 *	<sf_bind>:???
719 *
720 * Notes:	It's not possible to fully enumerate the return codes above,
721 *		since socket filter authors and protocol family authors may
722 *		not choose to limit their error returns to those listed, even
723 *		though this may result in some software operating incorrectly.
724 *
725 *		The error codes which are enumerated above are those known to
726 *		be returned by the tcp_usr_bind function supplied.
727 */
728int
729sobindlock(struct socket *so, struct sockaddr *nam, int dolock)
730{
731	struct proc *p = current_proc();
732	int error = 0;
733
734	if (dolock)
735		socket_lock(so, 1);
736	VERIFY(so->so_usecount > 1);
737
738	so_update_last_owner_locked(so, p);
739	so_update_policy(so);
740
741	/*
742	 * If this is a bind request on a socket that has been marked
743	 * as inactive, reject it now before we go any further.
744	 */
745	if (so->so_flags & SOF_DEFUNCT) {
746		error = EINVAL;
747		SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n",
748		    __func__, proc_pid(p), (uint64_t)VM_KERNEL_ADDRPERM(so),
749		    SOCK_DOM(so), SOCK_TYPE(so), error));
750		goto out;
751	}
752
753	/* Socket filter */
754	error = sflt_bind(so, nam);
755
756	if (error == 0)
757		error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
758out:
759	if (dolock)
760		socket_unlock(so, 1);
761
762	if (error == EJUSTRETURN)
763		error = 0;
764
765	return (error);
766}
767
768void
769sodealloc(struct socket *so)
770{
771	kauth_cred_unref(&so->so_cred);
772
773	/* Remove any filters */
774	sflt_termsock(so);
775
776	/* Delete the state allocated for msg queues on a socket */
777	if (so->so_flags & SOF_ENABLE_MSGS) {
778		FREE(so->so_msg_state, M_TEMP);
779		so->so_msg_state = NULL;
780	}
781	VERIFY(so->so_msg_state == NULL);
782
783	so->so_gencnt = ++so_gencnt;
784
785#if CONFIG_MACF_SOCKET
786	mac_socket_label_destroy(so);
787#endif /* MAC_SOCKET */
788
789	if (so->cached_in_sock_layer) {
790		cached_sock_free(so);
791	} else {
792		FREE_ZONE(so, sizeof (*so), so->so_zone);
793	}
794}
795
796/*
797 * Returns:	0			Success
798 *		EINVAL
799 *		EOPNOTSUPP
800 *	<pru_listen>:EINVAL[AF_UNIX]
801 *	<pru_listen>:EINVAL[TCP]
802 *	<pru_listen>:EADDRNOTAVAIL[TCP]	Address not available.
803 *	<pru_listen>:EINVAL[TCP]	Invalid argument
804 *	<pru_listen>:EAFNOSUPPORT[TCP]	Address family not supported [notdef]
805 *	<pru_listen>:EACCES[TCP]	Permission denied
806 *	<pru_listen>:EADDRINUSE[TCP]	Address in use
807 *	<pru_listen>:EAGAIN[TCP]	Resource unavailable, try again
808 *	<pru_listen>:EPERM[TCP]		Operation not permitted
809 *	<sf_listen>:???
810 *
811 * Notes:	Other <pru_listen> returns depend on the protocol family; all
812 *		<sf_listen> returns depend on what the filter author causes
813 *		their filter to return.
814 */
815int
816solisten(struct socket *so, int backlog)
817{
818	struct proc *p = current_proc();
819	int error = 0;
820
821	socket_lock(so, 1);
822
823	so_update_last_owner_locked(so, p);
824	so_update_policy(so);
825
826	if (so->so_proto == NULL) {
827		error = EINVAL;
828		goto out;
829	}
830	if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
831		error = EOPNOTSUPP;
832		goto out;
833	}
834
835	/*
836	 * If the listen request is made on a socket that is not fully
837	 * disconnected, or on a socket that has been marked as inactive,
838	 * reject the request now.
839	 */
840	if ((so->so_state &
841	    (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) ||
842	    (so->so_flags & SOF_DEFUNCT)) {
843		error = EINVAL;
844		if (so->so_flags & SOF_DEFUNCT) {
845			SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] "
846			    "(%d)\n", __func__, proc_pid(p),
847			    (uint64_t)VM_KERNEL_ADDRPERM(so),
848			    SOCK_DOM(so), SOCK_TYPE(so), error));
849		}
850		goto out;
851	}
852
853	if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) {
854		error = EPERM;
855		goto out;
856	}
857
858	error = sflt_listen(so);
859	if (error == 0)
860		error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
861
862	if (error) {
863		if (error == EJUSTRETURN)
864			error = 0;
865		goto out;
866	}
867
868	if (TAILQ_EMPTY(&so->so_comp))
869		so->so_options |= SO_ACCEPTCONN;
870	/*
871	 * POSIX: The implementation may have an upper limit on the length of
872	 * the listen queue-either global or per accepting socket. If backlog
873	 * exceeds this limit, the length of the listen queue is set to the
874	 * limit.
875	 *
876	 * If listen() is called with a backlog argument value that is less
877	 * than 0, the function behaves as if it had been called with a backlog
878	 * argument value of 0.
879	 *
880	 * A backlog argument of 0 may allow the socket to accept connections,
881	 * in which case the length of the listen queue may be set to an
882	 * implementation-defined minimum value.
883	 */
884	if (backlog <= 0 || backlog > somaxconn)
885		backlog = somaxconn;
886
887	so->so_qlimit = backlog;
888out:
889	socket_unlock(so, 1);
890	return (error);
891}
892
893void
894sofreelastref(struct socket *so, int dealloc)
895{
896	struct socket *head = so->so_head;
897
898	/* Assume socket is locked */
899
900	if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) {
901		selthreadclear(&so->so_snd.sb_sel);
902		selthreadclear(&so->so_rcv.sb_sel);
903		so->so_rcv.sb_flags &= ~(SB_SEL|SB_UPCALL);
904		so->so_snd.sb_flags &= ~(SB_SEL|SB_UPCALL);
905		so->so_event = NULL;
906		return;
907	}
908	if (head != NULL) {
909		socket_lock(head, 1);
910		if (so->so_state & SS_INCOMP) {
911			TAILQ_REMOVE(&head->so_incomp, so, so_list);
912			head->so_incqlen--;
913		} else if (so->so_state & SS_COMP) {
914			/*
915			 * We must not decommission a socket that's
916			 * on the accept(2) queue.  If we do, then
917			 * accept(2) may hang after select(2) indicated
918			 * that the listening socket was ready.
919			 */
920			selthreadclear(&so->so_snd.sb_sel);
921			selthreadclear(&so->so_rcv.sb_sel);
922			so->so_rcv.sb_flags &= ~(SB_SEL|SB_UPCALL);
923			so->so_snd.sb_flags &= ~(SB_SEL|SB_UPCALL);
924			so->so_event = NULL;
925			socket_unlock(head, 1);
926			return;
927		} else {
928			panic("sofree: not queued");
929		}
930		head->so_qlen--;
931		so->so_state &= ~SS_INCOMP;
932		so->so_head = NULL;
933		socket_unlock(head, 1);
934	}
935	sowflush(so);
936	sorflush(so);
937
938#if FLOW_DIVERT
939	if (so->so_flags & SOF_FLOW_DIVERT) {
940		flow_divert_detach(so);
941	}
942#endif	/* FLOW_DIVERT */
943
944	/* 3932268: disable upcall */
945	so->so_rcv.sb_flags &= ~SB_UPCALL;
946	so->so_snd.sb_flags &= ~SB_UPCALL;
947	so->so_event = NULL;
948
949	if (dealloc)
950		sodealloc(so);
951}
952
953void
954soclose_wait_locked(struct socket *so)
955{
956	lck_mtx_t *mutex_held;
957
958	if (so->so_proto->pr_getlock != NULL)
959		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
960	else
961		mutex_held = so->so_proto->pr_domain->dom_mtx;
962	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
963
964	/*
965	 * Double check here and return if there's no outstanding upcall;
966	 * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
967	 */
968	if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT))
969		return;
970	so->so_rcv.sb_flags &= ~SB_UPCALL;
971	so->so_snd.sb_flags &= ~SB_UPCALL;
972	so->so_flags |= SOF_CLOSEWAIT;
973	(void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - 1),
974	    "soclose_wait_locked", NULL);
975	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
976	so->so_flags &= ~SOF_CLOSEWAIT;
977}
978
979/*
980 * Close a socket on last file table reference removal.
981 * Initiate disconnect if connected.
982 * Free socket when disconnect complete.
983 */
984int
985soclose_locked(struct socket *so)
986{
987	int error = 0;
988	lck_mtx_t *mutex_held;
989	struct timespec ts;
990
991	if (so->so_usecount == 0) {
992		panic("soclose: so=%p refcount=0\n", so);
993		/* NOTREACHED */
994	}
995
996	sflt_notify(so, sock_evt_closing, NULL);
997
998	if (so->so_upcallusecount)
999		soclose_wait_locked(so);
1000
1001	if ((so->so_options & SO_ACCEPTCONN)) {
1002		struct socket *sp, *sonext;
1003		int socklock = 0;
1004
1005		/*
1006		 * We do not want new connection to be added
1007		 * to the connection queues
1008		 */
1009		so->so_options &= ~SO_ACCEPTCONN;
1010
1011		for (sp = TAILQ_FIRST(&so->so_incomp);
1012		    sp != NULL; sp = sonext) {
1013			sonext = TAILQ_NEXT(sp, so_list);
1014
1015			/*
1016			 * Radar 5350314
1017			 * skip sockets thrown away by tcpdropdropblreq
1018			 * they will get cleanup by the garbage collection.
1019			 * otherwise, remove the incomp socket from the queue
1020			 * and let soabort trigger the appropriate cleanup.
1021			 */
1022			if (sp->so_flags & SOF_OVERFLOW)
1023				continue;
1024
1025			if (so->so_proto->pr_getlock != NULL) {
1026				/*
1027				 * Lock ordering for consistency with the
1028				 * rest of the stack, we lock the socket
1029				 * first and then grabb the head.
1030				 */
1031				socket_unlock(so, 0);
1032				socket_lock(sp, 1);
1033				socket_lock(so, 0);
1034				socklock = 1;
1035			}
1036
1037			TAILQ_REMOVE(&so->so_incomp, sp, so_list);
1038			so->so_incqlen--;
1039
1040			if (sp->so_state & SS_INCOMP) {
1041				sp->so_state &= ~SS_INCOMP;
1042				sp->so_head = NULL;
1043
1044				(void) soabort(sp);
1045			}
1046
1047			if (socklock)
1048				socket_unlock(sp, 1);
1049		}
1050
1051		while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
1052			/* Dequeue from so_comp since sofree() won't do it */
1053			TAILQ_REMOVE(&so->so_comp, sp, so_list);
1054			so->so_qlen--;
1055
1056			if (so->so_proto->pr_getlock != NULL) {
1057				socket_unlock(so, 0);
1058				socket_lock(sp, 1);
1059			}
1060
1061			if (sp->so_state & SS_COMP) {
1062				sp->so_state &= ~SS_COMP;
1063				sp->so_head = NULL;
1064
1065				(void) soabort(sp);
1066			}
1067
1068			if (so->so_proto->pr_getlock != NULL) {
1069				socket_unlock(sp, 1);
1070				socket_lock(so, 0);
1071			}
1072		}
1073	}
1074	if (so->so_pcb == NULL) {
1075		/* 3915887: mark the socket as ready for dealloc */
1076		so->so_flags |= SOF_PCBCLEARING;
1077		goto discard;
1078	}
1079	if (so->so_state & SS_ISCONNECTED) {
1080		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
1081			error = sodisconnectlocked(so);
1082			if (error)
1083				goto drop;
1084		}
1085		if (so->so_options & SO_LINGER) {
1086			if ((so->so_state & SS_ISDISCONNECTING) &&
1087			    (so->so_state & SS_NBIO))
1088				goto drop;
1089			if (so->so_proto->pr_getlock != NULL)
1090				mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1091			else
1092				mutex_held = so->so_proto->pr_domain->dom_mtx;
1093			while (so->so_state & SS_ISCONNECTED) {
1094				ts.tv_sec = (so->so_linger/100);
1095				ts.tv_nsec = (so->so_linger % 100) *
1096				    NSEC_PER_USEC * 1000 * 10;
1097				error = msleep((caddr_t)&so->so_timeo,
1098				    mutex_held, PSOCK | PCATCH, "soclose", &ts);
1099				if (error) {
1100					/*
1101					 * It's OK when the time fires,
1102					 * don't report an error
1103					 */
1104					if (error == EWOULDBLOCK)
1105						error = 0;
1106					break;
1107				}
1108			}
1109		}
1110	}
1111drop:
1112	if (so->so_usecount == 0) {
1113		panic("soclose: usecount is zero so=%p\n", so);
1114		/* NOTREACHED */
1115	}
1116	if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) {
1117		/*
1118		 * Let NetworkStatistics know this PCB is going away
1119		 * before we detach it.
1120		 */
1121		if (nstat_collect &&
1122		    (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6))
1123			nstat_pcb_detach(so->so_pcb);
1124
1125		int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
1126		if (error == 0)
1127			error = error2;
1128	}
1129	if (so->so_usecount <= 0) {
1130		panic("soclose: usecount is zero so=%p\n", so);
1131		/* NOTREACHED */
1132	}
1133discard:
1134	if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
1135	    (so->so_state & SS_NOFDREF)) {
1136		panic("soclose: NOFDREF");
1137		/* NOTREACHED */
1138	}
1139	so->so_state |= SS_NOFDREF;
1140
1141	if (so->so_flags & SOF_MP_SUBFLOW)
1142		so->so_flags &= ~SOF_MP_SUBFLOW;
1143
1144	if ((so->so_flags & SOF_KNOTE) != 0)
1145		KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);
1146
1147	atomic_add_32(&so->so_proto->pr_domain->dom_refs, -1);
1148	evsofree(so);
1149
1150	so->so_usecount--;
1151	sofree(so);
1152	return (error);
1153}
1154
1155int
1156soclose(struct socket *so)
1157{
1158	int error = 0;
1159	socket_lock(so, 1);
1160
1161	if (so->so_retaincnt == 0) {
1162		error = soclose_locked(so);
1163	} else {
1164		/*
1165		 * if the FD is going away, but socket is
1166		 * retained in kernel remove its reference
1167		 */
1168		so->so_usecount--;
1169		if (so->so_usecount < 2)
1170			panic("soclose: retaincnt non null and so=%p "
1171			    "usecount=%d\n", so, so->so_usecount);
1172	}
1173	socket_unlock(so, 1);
1174	return (error);
1175}
1176
1177/*
1178 * Must be called at splnet...
1179 */
1180/* Should already be locked */
1181int
1182soabort(struct socket *so)
1183{
1184	int error;
1185
1186#ifdef MORE_LOCKING_DEBUG
1187	lck_mtx_t *mutex_held;
1188
1189	if (so->so_proto->pr_getlock != NULL)
1190		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1191	else
1192		mutex_held = so->so_proto->pr_domain->dom_mtx;
1193	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
1194#endif
1195
1196	if ((so->so_flags & SOF_ABORTED) == 0) {
1197		so->so_flags |= SOF_ABORTED;
1198		error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
1199		if (error) {
1200			sofree(so);
1201			return (error);
1202		}
1203	}
1204	return (0);
1205}
1206
1207int
1208soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
1209{
1210	int error;
1211
1212	if (dolock)
1213		socket_lock(so, 1);
1214
1215	so_update_last_owner_locked(so, PROC_NULL);
1216	so_update_policy(so);
1217
1218	if ((so->so_state & SS_NOFDREF) == 0)
1219		panic("soaccept: !NOFDREF");
1220	so->so_state &= ~SS_NOFDREF;
1221	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
1222
1223	if (dolock)
1224		socket_unlock(so, 1);
1225	return (error);
1226}
1227
1228int
1229soaccept(struct socket *so, struct sockaddr **nam)
1230{
1231	return (soacceptlock(so, nam, 1));
1232}
1233
1234int
1235soacceptfilter(struct socket *so)
1236{
1237	struct sockaddr *local = NULL, *remote = NULL;
1238	int error = 0;
1239	struct socket *head = so->so_head;
1240
1241	/*
1242	 * Hold the lock even if this socket has not been made visible
1243	 * to the filter(s).  For sockets with global locks, this protects
1244	 * against the head or peer going away
1245	 */
1246	socket_lock(so, 1);
1247	if (sogetaddr_locked(so, &remote, 1) != 0 ||
1248	    sogetaddr_locked(so, &local, 0) != 0) {
1249		so->so_state &= ~(SS_NOFDREF | SS_COMP);
1250		so->so_head = NULL;
1251		socket_unlock(so, 1);
1252		soclose(so);
1253		/* Out of resources; try it again next time */
1254		error = ECONNABORTED;
1255		goto done;
1256	}
1257
1258	error = sflt_accept(head, so, local, remote);
1259
1260	/*
1261	 * If we get EJUSTRETURN from one of the filters, mark this socket
1262	 * as inactive and return it anyway.  This newly accepted socket
1263	 * will be disconnected later before we hand it off to the caller.
1264	 */
1265	if (error == EJUSTRETURN) {
1266		error = 0;
1267		(void) sosetdefunct(current_proc(), so,
1268		    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
1269	}
1270
1271	if (error != 0) {
1272		/*
1273		 * This may seem like a duplication to the above error
1274		 * handling part when we return ECONNABORTED, except
1275		 * the following is done while holding the lock since
1276		 * the socket has been exposed to the filter(s) earlier.
1277		 */
1278		so->so_state &= ~(SS_NOFDREF | SS_COMP);
1279		so->so_head = NULL;
1280		socket_unlock(so, 1);
1281		soclose(so);
1282		/* Propagate socket filter's error code to the caller */
1283	} else {
1284		socket_unlock(so, 1);
1285	}
1286done:
1287	/* Callee checks for NULL pointer */
1288	sock_freeaddr(remote);
1289	sock_freeaddr(local);
1290	return (error);
1291}
1292
1293/*
1294 * Returns:	0			Success
1295 *		EOPNOTSUPP		Operation not supported on socket
1296 *		EISCONN			Socket is connected
1297 *	<pru_connect>:EADDRNOTAVAIL	Address not available.
1298 *	<pru_connect>:EINVAL		Invalid argument
1299 *	<pru_connect>:EAFNOSUPPORT	Address family not supported [notdef]
1300 *	<pru_connect>:EACCES		Permission denied
1301 *	<pru_connect>:EADDRINUSE	Address in use
1302 *	<pru_connect>:EAGAIN		Resource unavailable, try again
1303 *	<pru_connect>:EPERM		Operation not permitted
1304 *	<sf_connect_out>:???		[anything a filter writer might set]
1305 */
1306int
1307soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
1308{
1309	int error;
1310	struct proc *p = current_proc();
1311
1312	if (dolock)
1313		socket_lock(so, 1);
1314
1315	so_update_last_owner_locked(so, p);
1316	so_update_policy(so);
1317
1318	/*
1319	 * If this is a listening socket or if this is a previously-accepted
1320	 * socket that has been marked as inactive, reject the connect request.
1321	 */
1322	if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1323		error = EOPNOTSUPP;
1324		if (so->so_flags & SOF_DEFUNCT) {
1325			SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] "
1326			    "(%d)\n", __func__, proc_pid(p),
1327			    (uint64_t)VM_KERNEL_ADDRPERM(so),
1328			    SOCK_DOM(so), SOCK_TYPE(so), error));
1329		}
1330		if (dolock)
1331			socket_unlock(so, 1);
1332		return (error);
1333	}
1334
1335	if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
1336		if (dolock)
1337			socket_unlock(so, 1);
1338		return (EPERM);
1339	}
1340
1341	/*
1342	 * If protocol is connection-based, can only connect once.
1343	 * Otherwise, if connected, try to disconnect first.
1344	 * This allows user to disconnect by connecting to, e.g.,
1345	 * a null address.
1346	 */
1347	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
1348	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1349	    (error = sodisconnectlocked(so)))) {
1350		error = EISCONN;
1351	} else {
1352		/*
1353		 * Run connect filter before calling protocol:
1354		 *  - non-blocking connect returns before completion;
1355		 */
1356		error = sflt_connectout(so, nam);
1357		if (error != 0) {
1358			if (error == EJUSTRETURN)
1359				error = 0;
1360		} else {
1361			error = (*so->so_proto->pr_usrreqs->pru_connect)
1362			    (so, nam, p);
1363		}
1364	}
1365	if (dolock)
1366		socket_unlock(so, 1);
1367	return (error);
1368}
1369
1370int
1371soconnect(struct socket *so, struct sockaddr *nam)
1372{
1373	return (soconnectlock(so, nam, 1));
1374}
1375
1376/*
1377 * Returns:	0			Success
1378 *	<pru_connect2>:EINVAL[AF_UNIX]
1379 *	<pru_connect2>:EPROTOTYPE[AF_UNIX]
1380 *	<pru_connect2>:???		[other protocol families]
1381 *
1382 * Notes:	<pru_connect2> is not supported by [TCP].
1383 */
1384int
1385soconnect2(struct socket *so1, struct socket *so2)
1386{
1387	int error;
1388
1389	socket_lock(so1, 1);
1390	if (so2->so_proto->pr_lock)
1391		socket_lock(so2, 1);
1392
1393	error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
1394
1395	socket_unlock(so1, 1);
1396	if (so2->so_proto->pr_lock)
1397		socket_unlock(so2, 1);
1398	return (error);
1399}
1400
1401int
1402soconnectxlocked(struct socket *so, struct sockaddr_list **src_sl,
1403    struct sockaddr_list **dst_sl, struct proc *p, uint32_t ifscope,
1404    associd_t aid, connid_t *pcid, uint32_t flags, void *arg,
1405    uint32_t arglen)
1406{
1407	int error;
1408
1409	/*
1410	 * If this is a listening socket or if this is a previously-accepted
1411	 * socket that has been marked as inactive, reject the connect request.
1412	 */
1413	if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1414		error = EOPNOTSUPP;
1415		if (so->so_flags & SOF_DEFUNCT) {
1416			SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] "
1417			    "(%d)\n", __func__, proc_pid(p),
1418			    (uint64_t)VM_KERNEL_ADDRPERM(so),
1419			    SOCK_DOM(so), SOCK_TYPE(so), error));
1420		}
1421		return (error);
1422	}
1423
1424	if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0)
1425		return (EPERM);
1426
1427	/*
1428	 * If protocol is connection-based, can only connect once
1429	 * unless PR_MULTICONN is set.  Otherwise, if connected,
1430	 * try to disconnect first.  This allows user to disconnect
1431	 * by connecting to, e.g., a null address.
1432	 */
1433	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) &&
1434	    !(so->so_proto->pr_flags & PR_MULTICONN) &&
1435	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1436	    (error = sodisconnectlocked(so)) != 0)) {
1437		error = EISCONN;
1438	} else {
1439		/*
1440		 * Run connect filter before calling protocol:
1441		 *  - non-blocking connect returns before completion;
1442		 */
1443		error = sflt_connectxout(so, dst_sl);
1444		if (error != 0) {
1445			if (error == EJUSTRETURN)
1446				error = 0;
1447		} else {
1448			error = (*so->so_proto->pr_usrreqs->pru_connectx)
1449			    (so, src_sl, dst_sl, p, ifscope, aid, pcid,
1450			    flags, arg, arglen);
1451		}
1452	}
1453
1454	return (error);
1455}
1456
1457int
1458sodisconnectlocked(struct socket *so)
1459{
1460	int error;
1461
1462	if ((so->so_state & SS_ISCONNECTED) == 0) {
1463		error = ENOTCONN;
1464		goto bad;
1465	}
1466	if (so->so_state & SS_ISDISCONNECTING) {
1467		error = EALREADY;
1468		goto bad;
1469	}
1470
1471	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1472	if (error == 0)
1473		sflt_notify(so, sock_evt_disconnected, NULL);
1474
1475bad:
1476	return (error);
1477}
1478
1479/* Locking version */
1480int
1481sodisconnect(struct socket *so)
1482{
1483	int error;
1484
1485	socket_lock(so, 1);
1486	error = sodisconnectlocked(so);
1487	socket_unlock(so, 1);
1488	return (error);
1489}
1490
1491int
1492sodisconnectxlocked(struct socket *so, associd_t aid, connid_t cid)
1493{
1494	int error;
1495
1496	/*
1497	 * Call the protocol disconnectx handler; let it handle all
1498	 * matters related to the connection state of this session.
1499	 */
1500	error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid);
1501	if (error == 0) {
1502		/*
1503		 * The event applies only for the session, not for
1504		 * the disconnection of individual subflows.
1505		 */
1506		if (so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED))
1507			sflt_notify(so, sock_evt_disconnected, NULL);
1508	}
1509	return (error);
1510}
1511
1512int
1513sodisconnectx(struct socket *so, associd_t aid, connid_t cid)
1514{
1515	int error;
1516
1517	socket_lock(so, 1);
1518	error = sodisconnectxlocked(so, aid, cid);
1519	socket_unlock(so, 1);
1520	return (error);
1521}
1522
1523int
1524sopeelofflocked(struct socket *so, associd_t aid, struct socket **psop)
1525{
1526	return ((*so->so_proto->pr_usrreqs->pru_peeloff)(so, aid, psop));
1527}
1528
1529#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
1530
1531/*
1532 * sosendcheck will lock the socket buffer if it isn't locked and
1533 * verify that there is space for the data being inserted.
1534 *
1535 * Returns:	0			Success
1536 *		EPIPE
1537 *	sblock:EWOULDBLOCK
1538 *	sblock:EINTR
1539 *	sbwait:EBADF
1540 *	sbwait:EINTR
1541 *	[so_error]:???
1542 */
1543int
1544sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid,
1545    int32_t clen, int32_t atomic, int flags, int *sblocked,
1546    struct mbuf *control)
1547{
1548	int	error = 0;
1549	int32_t space;
1550	int	assumelock = 0;
1551
1552restart:
1553	if (*sblocked == 0) {
1554		if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
1555		    so->so_send_filt_thread != 0 &&
1556		    so->so_send_filt_thread == current_thread()) {
1557			/*
1558			 * We're being called recursively from a filter,
1559			 * allow this to continue. Radar 4150520.
1560			 * Don't set sblocked because we don't want
1561			 * to perform an unlock later.
1562			 */
1563			assumelock = 1;
1564		} else {
1565			error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1566			if (error) {
1567				if (so->so_flags & SOF_DEFUNCT)
1568					goto defunct;
1569				return (error);
1570			}
1571			*sblocked = 1;
1572		}
1573	}
1574
1575	/*
1576	 * If a send attempt is made on a socket that has been marked
1577	 * as inactive (disconnected), reject the request.
1578	 */
1579	if (so->so_flags & SOF_DEFUNCT) {
1580defunct:
1581		error = EPIPE;
1582		SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n",
1583		    __func__, proc_selfpid(), (uint64_t)VM_KERNEL_ADDRPERM(so),
1584		    SOCK_DOM(so), SOCK_TYPE(so), error));
1585		return (error);
1586	}
1587
1588	if (so->so_state & SS_CANTSENDMORE)
1589		return (EPIPE);
1590
1591	if (so->so_error) {
1592		error = so->so_error;
1593		so->so_error = 0;
1594		return (error);
1595	}
1596
1597	if ((so->so_state & SS_ISCONNECTED) == 0) {
1598		if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
1599			if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1600			    !(resid == 0 && clen != 0))
1601				return (ENOTCONN);
1602		} else if (addr == 0 && !(flags&MSG_HOLD)) {
1603			return ((so->so_proto->pr_flags & PR_CONNREQUIRED) ?
1604			    ENOTCONN : EDESTADDRREQ);
1605		}
1606	}
1607	if (so->so_flags & SOF_ENABLE_MSGS)
1608		space = msgq_sbspace(so, control);
1609	else
1610		space = sbspace(&so->so_snd);
1611
1612	if (flags & MSG_OOB)
1613		space += 1024;
1614	if ((atomic && resid > so->so_snd.sb_hiwat) ||
1615	    clen > so->so_snd.sb_hiwat)
1616		return (EMSGSIZE);
1617
1618	if ((space < resid + clen &&
1619	    (atomic || space < (int32_t)so->so_snd.sb_lowat || space < clen)) ||
1620	    (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
1621		if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
1622		    assumelock) {
1623			return (EWOULDBLOCK);
1624		}
1625		sbunlock(&so->so_snd, TRUE);	/* keep socket locked */
1626		*sblocked = 0;
1627		error = sbwait(&so->so_snd);
1628		if (error) {
1629			if (so->so_flags & SOF_DEFUNCT)
1630				goto defunct;
1631			return (error);
1632		}
1633		goto restart;
1634	}
1635	return (0);
1636}
1637
1638/*
1639 * Send on a socket.
1640 * If send must go all at once and message is larger than
1641 * send buffering, then hard error.
1642 * Lock against other senders.
1643 * If must go all at once and not enough room now, then
1644 * inform user that this would block and do nothing.
1645 * Otherwise, if nonblocking, send as much as possible.
1646 * The data to be sent is described by "uio" if nonzero,
1647 * otherwise by the mbuf chain "top" (which must be null
1648 * if uio is not).  Data provided in mbuf chain must be small
1649 * enough to send all at once.
1650 *
1651 * Returns nonzero on error, timeout or signal; callers
1652 * must check for short counts if EINTR/ERESTART are returned.
1653 * Data and control buffers are freed on return.
1654 * Experiment:
1655 * MSG_HOLD: go thru most of sosend(), but just enqueue the mbuf
1656 * MSG_SEND: go thru as for MSG_HOLD on current fragment, then
1657 *  point at the mbuf chain being constructed and go from there.
1658 *
1659 * Returns:	0			Success
1660 *		EOPNOTSUPP
1661 *		EINVAL
1662 *		ENOBUFS
1663 *	uiomove:EFAULT
1664 *	sosendcheck:EPIPE
1665 *	sosendcheck:EWOULDBLOCK
1666 *	sosendcheck:EINTR
1667 *	sosendcheck:EBADF
1668 *	sosendcheck:EINTR
1669 *	sosendcheck:???			[value from so_error]
1670 *	<pru_send>:ECONNRESET[TCP]
1671 *	<pru_send>:EINVAL[TCP]
1672 *	<pru_send>:ENOBUFS[TCP]
1673 *	<pru_send>:EADDRINUSE[TCP]
1674 *	<pru_send>:EADDRNOTAVAIL[TCP]
1675 *	<pru_send>:EAFNOSUPPORT[TCP]
1676 *	<pru_send>:EACCES[TCP]
1677 *	<pru_send>:EAGAIN[TCP]
1678 *	<pru_send>:EPERM[TCP]
1679 *	<pru_send>:EMSGSIZE[TCP]
1680 *	<pru_send>:EHOSTUNREACH[TCP]
1681 *	<pru_send>:ENETUNREACH[TCP]
1682 *	<pru_send>:ENETDOWN[TCP]
1683 *	<pru_send>:ENOMEM[TCP]
1684 *	<pru_send>:ENOBUFS[TCP]
1685 *	<pru_send>:???[TCP]		[ignorable: mostly IPSEC/firewall/DLIL]
1686 *	<pru_send>:EINVAL[AF_UNIX]
1687 *	<pru_send>:EOPNOTSUPP[AF_UNIX]
1688 *	<pru_send>:EPIPE[AF_UNIX]
1689 *	<pru_send>:ENOTCONN[AF_UNIX]
1690 *	<pru_send>:EISCONN[AF_UNIX]
1691 *	<pru_send>:???[AF_UNIX]		[whatever a filter author chooses]
1692 *	<sf_data_out>:???		[whatever a filter author chooses]
1693 *
1694 * Notes:	Other <pru_send> returns depend on the protocol family; all
1695 *		<sf_data_out> returns depend on what the filter author causes
1696 *		their filter to return.
1697 */
1698int
1699sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
1700    struct mbuf *top, struct mbuf *control, int flags)
1701{
1702	struct mbuf **mp;
1703	struct mbuf *m, *freelist = NULL;
1704	user_ssize_t space, len, resid;
1705	int clen = 0, error, dontroute, mlen, sendflags;
1706	int atomic = sosendallatonce(so) || top;
1707	int sblocked = 0;
1708	struct proc *p = current_proc();
1709	struct mbuf *control_copy = NULL;
1710
1711	if (uio != NULL)
1712		resid = uio_resid(uio);
1713	else
1714		resid = top->m_pkthdr.len;
1715
1716	KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
1717	    so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
1718
1719	socket_lock(so, 1);
1720	so_update_last_owner_locked(so, p);
1721	so_update_policy(so);
1722
1723	if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
1724		error = EOPNOTSUPP;
1725		socket_unlock(so, 1);
1726		goto out;
1727	}
1728
1729	/*
1730	 * In theory resid should be unsigned.
1731	 * However, space must be signed, as it might be less than 0
1732	 * if we over-committed, and we must use a signed comparison
1733	 * of space and resid.  On the other hand, a negative resid
1734	 * causes us to loop sending 0-length segments to the protocol.
1735	 *
1736	 * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
1737	 * But it will be used by sockets doing message delivery.
1738	 *
1739	 * Note: We limit resid to be a positive 32 bits value as we use
1740	 * imin() to set bytes_to_copy -- radr://14558484
1741	 */
1742	if ((int32_t)resid < 0 || (so->so_type == SOCK_STREAM &&
1743	    !(so->so_flags & SOF_ENABLE_MSGS) && (flags & MSG_EOR))) {
1744		error = EINVAL;
1745		socket_unlock(so, 1);
1746		goto out;
1747	}
1748
1749	dontroute = (flags & MSG_DONTROUTE) &&
1750	    (so->so_options & SO_DONTROUTE) == 0 &&
1751	    (so->so_proto->pr_flags & PR_ATOMIC);
1752	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
1753
1754	if (control != NULL)
1755		clen = control->m_len;
1756
1757	do {
1758		error = sosendcheck(so, addr, resid, clen, atomic, flags,
1759		    &sblocked, control);
1760		if (error)
1761			goto release;
1762
1763		mp = &top;
1764		if (so->so_flags & SOF_ENABLE_MSGS)
1765			space = msgq_sbspace(so, control);
1766		else
1767			space = sbspace(&so->so_snd) - clen;
1768		space += ((flags & MSG_OOB) ? 1024 : 0);
1769
1770		do {
1771			if (uio == NULL) {
1772				/*
1773				 * Data is prepackaged in "top".
1774				 */
1775				resid = 0;
1776				if (flags & MSG_EOR)
1777					top->m_flags |= M_EOR;
1778			} else {
1779				int chainlength;
1780				int bytes_to_copy;
1781				boolean_t jumbocl;
1782
1783				bytes_to_copy = imin(resid, space);
1784
1785				if (sosendminchain > 0)
1786					chainlength = 0;
1787				else
1788					chainlength = sosendmaxchain;
1789
1790				/*
1791				 * Attempt to use larger than system page-size
1792				 * clusters for large writes only if there is
1793				 * a jumbo cluster pool and if the socket is
1794				 * marked accordingly.
1795				 */
1796				jumbocl = sosendjcl && njcl > 0 &&
1797				    ((so->so_flags & SOF_MULTIPAGES) ||
1798				    sosendjcl_ignore_capab);
1799
1800				socket_unlock(so, 0);
1801
1802				do {
1803					int num_needed;
1804					int hdrs_needed = (top == NULL) ? 1 : 0;
1805
1806					/*
1807					 * try to maintain a local cache of mbuf
1808					 * clusters needed to complete this
1809					 * write the list is further limited to
1810					 * the number that are currently needed
1811					 * to fill the socket this mechanism
1812					 * allows a large number of mbufs/
1813					 * clusters to be grabbed under a single
1814					 * mbuf lock... if we can't get any
1815					 * clusters, than fall back to trying
1816					 * for mbufs if we fail early (or
1817					 * miscalcluate the number needed) make
1818					 * sure to release any clusters we
1819					 * haven't yet consumed.
1820					 */
1821					if (freelist == NULL &&
1822					    bytes_to_copy > MBIGCLBYTES &&
1823					    jumbocl) {
1824						num_needed =
1825						    bytes_to_copy / M16KCLBYTES;
1826
1827						if ((bytes_to_copy -
1828						    (num_needed * M16KCLBYTES))
1829						    >= MINCLSIZE)
1830							num_needed++;
1831
1832						freelist =
1833						    m_getpackets_internal(
1834						    (unsigned int *)&num_needed,
1835						    hdrs_needed, M_WAIT, 0,
1836						    M16KCLBYTES);
1837						/*
1838						 * Fall back to 4K cluster size
1839						 * if allocation failed
1840						 */
1841					}
1842
1843					if (freelist == NULL &&
1844					    bytes_to_copy > MCLBYTES) {
1845						num_needed =
1846						    bytes_to_copy / MBIGCLBYTES;
1847
1848						if ((bytes_to_copy -
1849						    (num_needed * MBIGCLBYTES)) >=
1850						    MINCLSIZE)
1851							num_needed++;
1852
1853						freelist =
1854						    m_getpackets_internal(
1855						    (unsigned int *)&num_needed,
1856						    hdrs_needed, M_WAIT, 0,
1857						    MBIGCLBYTES);
1858						/*
1859						 * Fall back to cluster size
1860						 * if allocation failed
1861						 */
1862					}
1863
1864					if (freelist == NULL &&
1865					    bytes_to_copy > MINCLSIZE) {
1866						num_needed =
1867						    bytes_to_copy / MCLBYTES;
1868
1869						if ((bytes_to_copy -
1870						    (num_needed * MCLBYTES)) >=
1871						    MINCLSIZE)
1872							num_needed++;
1873
1874						freelist =
1875						    m_getpackets_internal(
1876						    (unsigned int *)&num_needed,
1877						    hdrs_needed, M_WAIT, 0,
1878						    MCLBYTES);
1879						/*
1880						 * Fall back to a single mbuf
1881						 * if allocation failed
1882						 */
1883					}
1884
1885					if (freelist == NULL) {
1886						if (top == NULL)
1887							MGETHDR(freelist,
1888							    M_WAIT, MT_DATA);
1889						else
1890							MGET(freelist,
1891							    M_WAIT, MT_DATA);
1892
1893						if (freelist == NULL) {
1894							error = ENOBUFS;
1895							socket_lock(so, 0);
1896							goto release;
1897						}
1898						/*
1899						 * For datagram protocols,
1900						 * leave room for protocol
1901						 * headers in first mbuf.
1902						 */
1903						if (atomic && top == NULL &&
1904						    bytes_to_copy < MHLEN) {
1905							MH_ALIGN(freelist,
1906							    bytes_to_copy);
1907						}
1908					}
1909					m = freelist;
1910					freelist = m->m_next;
1911					m->m_next = NULL;
1912
1913					if ((m->m_flags & M_EXT))
1914						mlen = m->m_ext.ext_size;
1915					else if ((m->m_flags & M_PKTHDR))
1916						mlen =
1917						    MHLEN - m_leadingspace(m);
1918					else
1919						mlen = MLEN;
1920					len = imin(mlen, bytes_to_copy);
1921
1922					chainlength += len;
1923
1924					space -= len;
1925
1926					error = uiomove(mtod(m, caddr_t),
1927					    len, uio);
1928
1929					resid = uio_resid(uio);
1930
1931					m->m_len = len;
1932					*mp = m;
1933					top->m_pkthdr.len += len;
1934					if (error)
1935						break;
1936					mp = &m->m_next;
1937					if (resid <= 0) {
1938						if (flags & MSG_EOR)
1939							top->m_flags |= M_EOR;
1940						break;
1941					}
1942					bytes_to_copy = min(resid, space);
1943
1944				} while (space > 0 &&
1945				    (chainlength < sosendmaxchain || atomic ||
1946				    resid < MINCLSIZE));
1947
1948				socket_lock(so, 0);
1949
1950				if (error)
1951					goto release;
1952			}
1953
1954			if (flags & (MSG_HOLD|MSG_SEND)) {
1955				/* Enqueue for later, go away if HOLD */
1956				struct mbuf *mb1;
1957				if (so->so_temp && (flags & MSG_FLUSH)) {
1958					m_freem(so->so_temp);
1959					so->so_temp = NULL;
1960				}
1961				if (so->so_temp)
1962					so->so_tail->m_next = top;
1963				else
1964					so->so_temp = top;
1965				mb1 = top;
1966				while (mb1->m_next)
1967					mb1 = mb1->m_next;
1968				so->so_tail = mb1;
1969				if (flags & MSG_HOLD) {
1970					top = NULL;
1971					goto release;
1972				}
1973				top = so->so_temp;
1974			}
1975			if (dontroute)
1976				so->so_options |= SO_DONTROUTE;
1977
1978			/* Compute flags here, for pru_send and NKEs */
1979			sendflags = (flags & MSG_OOB) ? PRUS_OOB :
1980			    /*
			     * If the user set MSG_EOF, the protocol
			     * understands this flag, and there is nothing
			     * left to send, then use PRU_SEND_EOF instead
			     * of PRU_SEND.
1984			     */
1985			    ((flags & MSG_EOF) &&
1986			     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1987			     (resid <= 0)) ? PRUS_EOF :
1988			     /* If there is more to send set PRUS_MORETOCOME */
1989			     (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
1990
1991			/*
1992			 * Socket filter processing
1993			 */
1994			error = sflt_data_out(so, addr, &top,
1995			    &control, (sendflags & MSG_OOB) ?
1996			    sock_data_filt_flag_oob : 0);
1997			if (error) {
1998				if (error == EJUSTRETURN) {
1999					error = 0;
2000					clen = 0;
2001					control = NULL;
2002					top = NULL;
2003				}
2004
2005				goto release;
2006			}
2007			/*
2008			 * End Socket filter processing
2009			 */
2010
2011			if (so->so_flags & SOF_ENABLE_MSGS) {
2012				/*
2013				 * Make a copy of control mbuf,
2014				 * so that msg priority can be
2015				 * passed to subsequent mbufs.
2016				 */
2017				control_copy = m_dup(control, M_NOWAIT);
2018			}
2019			error = (*so->so_proto->pr_usrreqs->pru_send)
2020			    (so, sendflags, top, addr, control, p);
2021
2022			if (flags & MSG_SEND)
2023				so->so_temp = NULL;
2024
2025			if (dontroute)
2026				so->so_options &= ~SO_DONTROUTE;
2027
2028			clen = 0;
2029			control = control_copy;
2030			control_copy = NULL;
2031			top = NULL;
2032			mp = &top;
2033			if (error)
2034				goto release;
2035		} while (resid && space > 0);
2036	} while (resid);
2037
2038release:
2039	if (sblocked)
2040		sbunlock(&so->so_snd, FALSE);	/* will unlock socket */
2041	else
2042		socket_unlock(so, 1);
2043out:
2044	if (top != NULL)
2045		m_freem(top);
2046	if (control != NULL)
2047		m_freem(control);
2048	if (freelist != NULL)
2049		m_freem_list(freelist);
2050	if (control_copy != NULL)
2051		m_freem(control_copy);
2052
2053	KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid, so->so_snd.sb_cc,
2054	    space, error);
2055
2056	return (error);
2057}
2058
2059/*
2060 * Implement receive operations on a socket.
2061 * We depend on the way that records are added to the sockbuf
2062 * by sbappend*.  In particular, each record (mbufs linked through m_next)
2063 * must begin with an address if the protocol so specifies,
2064 * followed by an optional mbuf or mbufs containing ancillary data,
2065 * and then zero or more mbufs of data.
 * In order to avoid blocking network input processing for the entire time
 * here, we drop the socket lock while doing the actual copy to user space.
2068 * Although the sockbuf is locked, new data may still be appended,
2069 * and thus we must maintain consistency of the sockbuf during that time.
2070 *
2071 * The caller may receive the data as a single mbuf chain by supplying
2072 * an mbuf **mp0 for use in returning the chain.  The uio is then used
2073 * only for the count in uio_resid.
2074 *
2075 * Returns:	0			Success
2076 *		ENOBUFS
2077 *		ENOTCONN
2078 *		EWOULDBLOCK
2079 *	uiomove:EFAULT
2080 *	sblock:EWOULDBLOCK
2081 *	sblock:EINTR
2082 *	sbwait:EBADF
2083 *	sbwait:EINTR
2084 *	sodelayed_copy:EFAULT
2085 *	<pru_rcvoob>:EINVAL[TCP]
2086 *	<pru_rcvoob>:EWOULDBLOCK[TCP]
2087 *	<pru_rcvoob>:???
2088 *	<pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
2089 *	<pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
2090 *	<pr_domain->dom_externalize>:???
2091 *
2092 * Notes:	Additional return values from calls through <pru_rcvoob> and
2093 *		<pr_domain->dom_externalize> depend on protocols other than
2094 *		TCP or AF_UNIX, which are documented above.
2095 */
2096int
2097soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
2098    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2099{
2100	struct mbuf *m, **mp, *ml = NULL;
2101	struct mbuf *nextrecord, *free_list;
2102	int flags, error, offset;
2103	user_ssize_t len;
2104	struct protosw *pr = so->so_proto;
	int moff, type = 0;
2106	user_ssize_t orig_resid = uio_resid(uio);
2107	user_ssize_t delayed_copy_len;
2108	int can_delay;
2109	int need_event;
2110	struct proc *p = current_proc();
2111
2112	KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so, uio_resid(uio),
2113	    so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat);
2114
2115	socket_lock(so, 1);
2116	so_update_last_owner_locked(so, p);
2117	so_update_policy(so);
2118
2119#ifdef MORE_LOCKING_DEBUG
2120	if (so->so_usecount == 1) {
		panic("%s: so=%p no other reference on socket\n", __func__, so);
2122		/* NOTREACHED */
2123	}
2124#endif
2125	mp = mp0;
2126	if (psa != NULL)
2127		*psa = NULL;
2128	if (controlp != NULL)
2129		*controlp = NULL;
2130	if (flagsp != NULL)
2131		flags = *flagsp &~ MSG_EOR;
2132	else
2133		flags = 0;
2134
2135	/*
2136	 * If a recv attempt is made on a previously-accepted socket
2137	 * that has been marked as inactive (disconnected), reject
2138	 * the request.
2139	 */
2140	if (so->so_flags & SOF_DEFUNCT) {
2141		struct sockbuf *sb = &so->so_rcv;
2142
2143		error = ENOTCONN;
2144		SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n",
2145		    __func__, proc_pid(p), (uint64_t)VM_KERNEL_ADDRPERM(so),
2146		    SOCK_DOM(so), SOCK_TYPE(so), error));
2147		/*
2148		 * This socket should have been disconnected and flushed
2149		 * prior to being returned from sodefunct(); there should
2150		 * be no data on its receive list, so panic otherwise.
2151		 */
2152		if (so->so_state & SS_DEFUNCT)
2153			sb_empty_assert(sb, __func__);
2154		socket_unlock(so, 1);
2155		return (error);
2156	}
2157
2158	/*
2159	 * When SO_WANTOOBFLAG is set we try to get out-of-band data
	 * regardless of the flags argument. Here is the case where
2161	 * out-of-band data is not inline.
2162	 */
2163	if ((flags & MSG_OOB) ||
2164	    ((so->so_options & SO_WANTOOBFLAG) != 0 &&
2165	    (so->so_options & SO_OOBINLINE) == 0 &&
2166	    (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
2167		m = m_get(M_WAIT, MT_DATA);
2168		if (m == NULL) {
2169			socket_unlock(so, 1);
2170			KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
2171			    ENOBUFS, 0, 0, 0, 0);
2172			return (ENOBUFS);
2173		}
2174		error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
2175		if (error)
2176			goto bad;
2177		socket_unlock(so, 0);
2178		do {
2179			error = uiomove(mtod(m, caddr_t),
2180			    imin(uio_resid(uio), m->m_len), uio);
2181			m = m_free(m);
2182		} while (uio_resid(uio) && error == 0 && m != NULL);
2183		socket_lock(so, 0);
2184bad:
2185		if (m != NULL)
2186			m_freem(m);
2187
2188		if ((so->so_options & SO_WANTOOBFLAG) != 0) {
2189			if (error == EWOULDBLOCK || error == EINVAL) {
2190				/*
2191				 * Let's try to get normal data:
2192				 * EWOULDBLOCK: out-of-band data not
				 * received yet.  EINVAL: out-of-band data
2194				 * already read.
2195				 */
2196				error = 0;
2197				goto nooob;
2198			} else if (error == 0 && flagsp != NULL) {
2199				*flagsp |= MSG_OOB;
2200			}
2201		}
2202		socket_unlock(so, 1);
2203		KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
2204		    0, 0, 0, 0);
2205
2206		return (error);
2207	}
2208nooob:
2209	if (mp != NULL)
2210		*mp = NULL;
2211	if (so->so_state & SS_ISCONFIRMING && uio_resid(uio))
2212		(*pr->pr_usrreqs->pru_rcvd)(so, 0);
2213
2214	free_list = NULL;
2215	delayed_copy_len = 0;
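	/*
	 * free_list accumulates the data mbufs consumed below so that they
	 * can be released with a single m_freem_list() call, and
	 * delayed_copy_len counts the bytes in that list that have not yet
	 * been copied out to the user (see sodelayed_copy()).
	 */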
2216restart:
2217#ifdef MORE_LOCKING_DEBUG
2218	if (so->so_usecount <= 1)
2219		printf("soreceive: sblock so=%p ref=%d on socket\n",
2220		    so, so->so_usecount);
2221#endif
2222	/*
2223	 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
2224	 * and if so just return to the caller.  This could happen when
2225	 * soreceive() is called by a socket upcall function during the
2226	 * time the socket is freed.  The socket buffer would have been
2227	 * locked across the upcall, therefore we cannot put this thread
2228	 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
2229	 * we may livelock), because the lock on the socket buffer will
2230	 * only be released when the upcall routine returns to its caller.
2231	 * Because the socket has been officially closed, there can be
2232	 * no further read on it.
2233	 *
2234	 * A multipath subflow socket would have its SS_NOFDREF set by
2235	 * default, so check for SOF_MP_SUBFLOW socket flag; when the
2236	 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
2237	 */
2238	if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
2239	    (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
2240		socket_unlock(so, 1);
2241		return (0);
2242	}
2243
2244	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
2245	if (error) {
2246		socket_unlock(so, 1);
2247		KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
2248		    0, 0, 0, 0);
2249		return (error);
2250	}
2251
2252	m = so->so_rcv.sb_mb;
2253	/*
2254	 * If we have less data than requested, block awaiting more
2255	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark, or
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat);
	 * and in either case only if MSG_DONTWAIT is not set.
2260	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
2261	 * we have to do the receive in sections, and thus risk returning
2262	 * a short count if a timeout or signal occurs after we start.
2263	 */
2264	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
2265	    so->so_rcv.sb_cc < uio_resid(uio)) &&
2266	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
2267	    ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat)) &&
2268	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
2269		/*
2270		 * Panic if we notice inconsistencies in the socket's
2271		 * receive list; both sb_mb and sb_cc should correctly
2272		 * reflect the contents of the list, otherwise we may
2273		 * end up with false positives during select() or poll()
2274		 * which could put the application in a bad state.
2275		 */
2276		SB_MB_CHECK(&so->so_rcv);
2277
2278		if (so->so_error) {
2279			if (m != NULL)
2280				goto dontblock;
2281			error = so->so_error;
2282			if ((flags & MSG_PEEK) == 0)
2283				so->so_error = 0;
2284			goto release;
2285		}
2286		if (so->so_state & SS_CANTRCVMORE) {
2287			if (m != NULL)
2288				goto dontblock;
2289			else
2290				goto release;
2291		}
2292		for (; m != NULL; m = m->m_next)
2293			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
2294				m = so->so_rcv.sb_mb;
2295				goto dontblock;
2296			}
2297		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
2298		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
2299			error = ENOTCONN;
2300			goto release;
2301		}
2302		if (uio_resid(uio) == 0)
2303			goto release;
2304		if ((so->so_state & SS_NBIO) ||
2305		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
2306			error = EWOULDBLOCK;
2307			goto release;
2308		}
2309		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
2310		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
2311		sbunlock(&so->so_rcv, TRUE);	/* keep socket locked */
2312#if EVEN_MORE_LOCKING_DEBUG
2313		if (socket_debug)
2314			printf("Waiting for socket data\n");
2315#endif
2316
2317		error = sbwait(&so->so_rcv);
2318#if EVEN_MORE_LOCKING_DEBUG
2319		if (socket_debug)
2320			printf("SORECEIVE - sbwait returned %d\n", error);
2321#endif
2322		if (so->so_usecount < 1) {
2323			panic("%s: after 2nd sblock so=%p ref=%d on socket\n",
2324			    __func__, so, so->so_usecount);
2325			/* NOTREACHED */
2326		}
2327		if (error) {
2328			socket_unlock(so, 1);
2329			KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
2330			    0, 0, 0, 0);
2331			return (error);
2332		}
2333		goto restart;
2334	}
2335dontblock:
2336	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
2337	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
2338	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
2339	nextrecord = m->m_nextpkt;
2340	if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
2341		KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
2342#if CONFIG_MACF_SOCKET_SUBSET
2343		/*
2344		 * Call the MAC framework for policy checking if we're in
2345		 * the user process context and the socket isn't connected.
2346		 */
2347		if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
2348			struct mbuf *m0 = m;
2349			/*
2350			 * Dequeue this record (temporarily) from the receive
2351			 * list since we're about to drop the socket's lock
2352			 * where a new record may arrive and be appended to
2353			 * the list.  Upon MAC policy failure, the record
2354			 * will be freed.  Otherwise, we'll add it back to
2355			 * the head of the list.  We cannot rely on SB_LOCK
			 * because the append operation uses the socket's lock.
2357			 */
2358			do {
2359				m->m_nextpkt = NULL;
2360				sbfree(&so->so_rcv, m);
2361				m = m->m_next;
2362			} while (m != NULL);
2363			m = m0;
2364			so->so_rcv.sb_mb = nextrecord;
2365			SB_EMPTY_FIXUP(&so->so_rcv);
2366			SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
2367			SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
2368			socket_unlock(so, 0);
2369			if (mac_socket_check_received(proc_ucred(p), so,
2370			    mtod(m, struct sockaddr *)) != 0) {
2371				/*
2372				 * MAC policy failure; free this record and
2373				 * process the next record (or block until
2374				 * one is available).  We have adjusted sb_cc
2375				 * and sb_mbcnt above so there is no need to
2376				 * call sbfree() again.
2377				 */
2378				do {
2379					m = m_free(m);
2380				} while (m != NULL);
2381				/*
2382				 * Clear SB_LOCK but don't unlock the socket.
2383				 * Process the next record or wait for one.
2384				 */
2385				socket_lock(so, 0);
2386				sbunlock(&so->so_rcv, TRUE); /* stay locked */
2387				goto restart;
2388			}
2389			socket_lock(so, 0);
2390			/*
2391			 * If the socket has been defunct'd, drop it.
2392			 */
2393			if (so->so_flags & SOF_DEFUNCT) {
2394				m_freem(m);
2395				error = ENOTCONN;
2396				goto release;
2397			}
2398			/*
2399			 * Re-adjust the socket receive list and re-enqueue
2400			 * the record in front of any packets which may have
2401			 * been appended while we dropped the lock.
2402			 */
2403			for (m = m0; m->m_next != NULL; m = m->m_next)
2404				sballoc(&so->so_rcv, m);
2405			sballoc(&so->so_rcv, m);
2406			if (so->so_rcv.sb_mb == NULL) {
2407				so->so_rcv.sb_lastrecord = m0;
2408				so->so_rcv.sb_mbtail = m;
2409			}
2410			m = m0;
2411			nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
2412			so->so_rcv.sb_mb = m;
2413			SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
2414			SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
2415		}
2416#endif /* CONFIG_MACF_SOCKET_SUBSET */
2417		orig_resid = 0;
2418		if (psa != NULL) {
2419			*psa = dup_sockaddr(mtod(m, struct sockaddr *),
2420			    mp0 == NULL);
2421			if ((*psa == NULL) && (flags & MSG_NEEDSA)) {
2422				error = EWOULDBLOCK;
2423				goto release;
2424			}
2425		}
2426		if (flags & MSG_PEEK) {
2427			m = m->m_next;
2428		} else {
2429			sbfree(&so->so_rcv, m);
2430			if (m->m_next == NULL && so->so_rcv.sb_cc != 0) {
2431				panic("%s: about to create invalid socketbuf",
2432				    __func__);
2433				/* NOTREACHED */
2434			}
2435			MFREE(m, so->so_rcv.sb_mb);
2436			m = so->so_rcv.sb_mb;
2437			if (m != NULL) {
2438				m->m_nextpkt = nextrecord;
2439			} else {
2440				so->so_rcv.sb_mb = nextrecord;
2441				SB_EMPTY_FIXUP(&so->so_rcv);
2442			}
2443		}
2444	}
2445
2446	/*
2447	 * Process one or more MT_CONTROL mbufs present before any data mbufs
2448	 * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
2449	 * just copy the data; if !MSG_PEEK, we call into the protocol to
2450	 * perform externalization.
2451	 */
2452	if (m != NULL && m->m_type == MT_CONTROL) {
2453		struct mbuf *cm = NULL, *cmn;
2454		struct mbuf **cme = &cm;
2455		struct sockbuf *sb_rcv = &so->so_rcv;
2456		struct mbuf **msgpcm = NULL;
2457
2458		/*
2459		 * Externalizing the control messages would require us to
2460		 * drop the socket's lock below.  Once we re-acquire the
2461		 * lock, the mbuf chain might change.  In order to preserve
2462		 * consistency, we unlink all control messages from the
2463		 * first mbuf chain in one shot and link them separately
2464		 * onto a different chain.
2465		 */
2466		do {
2467			if (flags & MSG_PEEK) {
2468				if (controlp != NULL) {
2469					if (*controlp == NULL) {
2470						msgpcm = controlp;
2471					}
2472					*controlp = m_copy(m, 0, m->m_len);
2473
2474					/*
2475					 * If we failed to allocate an mbuf,
2476					 * release any previously allocated
2477					 * mbufs for control data. Return
2478					 * an error. Keep the mbufs in the
2479					 * socket as this is using
2480					 * MSG_PEEK flag.
2481					 */
2482					if (*controlp == NULL) {
2483						m_freem(*msgpcm);
2484						error = ENOBUFS;
2485						goto release;
2486					}
2487					controlp = &(*controlp)->m_next;
2488				}
2489				m = m->m_next;
2490			} else {
2491				m->m_nextpkt = NULL;
2492				sbfree(sb_rcv, m);
2493				sb_rcv->sb_mb = m->m_next;
2494				m->m_next = NULL;
2495				*cme = m;
2496				cme = &(*cme)->m_next;
2497				m = sb_rcv->sb_mb;
2498			}
2499		} while (m != NULL && m->m_type == MT_CONTROL);
2500
2501		if (!(flags & MSG_PEEK)) {
2502			if (sb_rcv->sb_mb != NULL) {
2503				sb_rcv->sb_mb->m_nextpkt = nextrecord;
2504			} else {
2505				sb_rcv->sb_mb = nextrecord;
2506				SB_EMPTY_FIXUP(sb_rcv);
2507			}
2508			if (nextrecord == NULL)
2509				sb_rcv->sb_lastrecord = m;
2510		}
2511
2512		SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
2513		SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");
2514
2515		while (cm != NULL) {
2516			int cmsg_type;
2517
2518			cmn = cm->m_next;
2519			cm->m_next = NULL;
2520			cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;
2521
2522			/*
2523			 * Call the protocol to externalize SCM_RIGHTS message
2524			 * and return the modified message to the caller upon
2525			 * success.  Otherwise, all other control messages are
2526			 * returned unmodified to the caller.  Note that we
2527			 * only get into this loop if MSG_PEEK is not set.
2528			 */
2529			if (pr->pr_domain->dom_externalize != NULL &&
2530			    cmsg_type == SCM_RIGHTS) {
2531				/*
2532				 * Release socket lock: see 3903171.  This
2533				 * would also allow more records to be appended
2534				 * to the socket buffer.  We still have SB_LOCK
2535				 * set on it, so we can be sure that the head
2536				 * of the mbuf chain won't change.
2537				 */
2538				socket_unlock(so, 0);
2539				error = (*pr->pr_domain->dom_externalize)(cm);
2540				socket_lock(so, 0);
2541			} else {
2542				error = 0;
2543			}
2544
2545			if (controlp != NULL && error == 0) {
2546				*controlp = cm;
2547				controlp = &(*controlp)->m_next;
2548				orig_resid = 0;
2549			} else {
2550				(void) m_free(cm);
2551			}
2552			cm = cmn;
2553		}
2554		/*
2555		 * Update the value of nextrecord in case we received new
2556		 * records when the socket was unlocked above for
2557		 * externalizing SCM_RIGHTS.
2558		 */
2559		if (m != NULL)
2560			nextrecord = sb_rcv->sb_mb->m_nextpkt;
2561		else
2562			nextrecord = sb_rcv->sb_mb;
2563		orig_resid = 0;
2564	}
2565
2566	/*
2567	 * If the socket is a TCP socket with message delivery
2568	 * enabled, then create a control msg to deliver the
2569	 * relative TCP sequence number for this data. Waiting
2570	 * until this point will protect against failures to
2571	 * allocate an mbuf for control msgs.
2572	 */
2573	if (so->so_type == SOCK_STREAM && SOCK_PROTO(so) == IPPROTO_TCP &&
2574	    (so->so_flags & SOF_ENABLE_MSGS) && controlp != NULL) {
2575		struct mbuf *seq_cm;
2576
2577		seq_cm = sbcreatecontrol((caddr_t)&m->m_pkthdr.msg_seq,
2578		    sizeof (uint32_t), SCM_SEQNUM, SOL_SOCKET);
2579		if (seq_cm == NULL) {
2580			/* unable to allocate a control mbuf */
2581			error = ENOBUFS;
2582			goto release;
2583		}
2584		*controlp = seq_cm;
2585		controlp = &seq_cm->m_next;
2586	}
2587
2588	if (m != NULL) {
2589		if (!(flags & MSG_PEEK)) {
2590			/*
2591			 * We get here because m points to an mbuf following
2592			 * any MT_SONAME or MT_CONTROL mbufs which have been
2593			 * processed above.  In any case, m should be pointing
2594			 * to the head of the mbuf chain, and the nextrecord
2595			 * should be either NULL or equal to m->m_nextpkt.
2596			 * See comments above about SB_LOCK.
2597			 */
2598			if (m != so->so_rcv.sb_mb ||
2599			    m->m_nextpkt != nextrecord) {
2600				panic("%s: post-control !sync so=%p m=%p "
2601				    "nextrecord=%p\n", __func__, so, m,
2602				    nextrecord);
2603				/* NOTREACHED */
2604			}
2605			if (nextrecord == NULL)
2606				so->so_rcv.sb_lastrecord = m;
2607		}
2608		type = m->m_type;
2609		if (type == MT_OOBDATA)
2610			flags |= MSG_OOB;
2611	} else {
2612		if (!(flags & MSG_PEEK)) {
2613			SB_EMPTY_FIXUP(&so->so_rcv);
2614		}
2615	}
2616	SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
2617	SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
2618
2619	moff = 0;
2620	offset = 0;
2621
2622	if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy)
2623		can_delay = 1;
2624	else
2625		can_delay = 0;
2626
2627	need_event = 0;
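	/*
	 * Main copy loop: walk the mbufs of the current record, copying
	 * data out via the uio (or handing whole mbufs back through mp),
	 * until the request is satisfied, the record ends, or an error
	 * occurs.
	 */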
2628
2629	while (m != NULL &&
2630	    (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
2631		if (m->m_type == MT_OOBDATA) {
2632			if (type != MT_OOBDATA)
2633				break;
2634		} else if (type == MT_OOBDATA) {
2635			break;
2636		}
2637		/*
2638		 * Make sure to allways set MSG_OOB event when getting
2639		 * out of band data inline.
2640		 */
2641		if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
2642		    (so->so_options & SO_OOBINLINE) != 0 &&
2643		    (so->so_state & SS_RCVATMARK) != 0) {
2644			flags |= MSG_OOB;
2645		}
2646		so->so_state &= ~SS_RCVATMARK;
2647		len = uio_resid(uio) - delayed_copy_len;
2648		if (so->so_oobmark && len > so->so_oobmark - offset)
2649			len = so->so_oobmark - offset;
2650		if (len > m->m_len - moff)
2651			len = m->m_len - moff;
2652		/*
2653		 * If mp is set, just pass back the mbufs.
2654		 * Otherwise copy them out via the uio, then free.
2655		 * Sockbuf must be consistent here (points to current mbuf,
2656		 * it points to next record) when we drop priority;
2657		 * we must note any additions to the sockbuf when we
2658		 * block interrupts again.
2659		 */
2660		if (mp == NULL) {
2661			SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
2662			SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
2663			if (can_delay && len == m->m_len) {
2664				/*
2665				 * only delay the copy if we're consuming the
2666				 * mbuf and we're NOT in MSG_PEEK mode
2667				 * and we have enough data to make it worthwile
2668				 * to drop and retake the lock... can_delay
2669				 * reflects the state of the 2 latter
2670				 * constraints moff should always be zero
2671				 * in these cases
2672				 */
2673				delayed_copy_len += len;
2674			} else {
2675				if (delayed_copy_len) {
2676					error = sodelayed_copy(so, uio,
2677					    &free_list, &delayed_copy_len);
2678
2679					if (error) {
2680						goto release;
2681					}
2682					/*
2683					 * can only get here if MSG_PEEK is not
2684					 * set therefore, m should point at the
2685					 * head of the rcv queue; if it doesn't,
2686					 * it means something drastically
2687					 * changed while we were out from behind
2688					 * the lock in sodelayed_copy. perhaps
2689					 * a RST on the stream. in any event,
2690					 * the stream has been interrupted. it's
2691					 * probably best just to return whatever
2692					 * data we've moved and let the caller
2693					 * sort it out...
2694					 */
2695					if (m != so->so_rcv.sb_mb) {
2696						break;
2697					}
2698				}
2699				socket_unlock(so, 0);
2700				error = uiomove(mtod(m, caddr_t) + moff,
2701				    (int)len, uio);
2702				socket_lock(so, 0);
2703
2704				if (error)
2705					goto release;
2706			}
2707		} else {
2708			uio_setresid(uio, (uio_resid(uio) - len));
2709		}
2710		if (len == m->m_len - moff) {
2711			if (m->m_flags & M_EOR)
2712				flags |= MSG_EOR;
2713			if (flags & MSG_PEEK) {
2714				m = m->m_next;
2715				moff = 0;
2716			} else {
2717				nextrecord = m->m_nextpkt;
2718				sbfree(&so->so_rcv, m);
2719				m->m_nextpkt = NULL;
2720
2721				/*
2722				 * If this packet is an unordered packet
2723				 * (indicated by M_UNORDERED_DATA flag), remove
2724				 * the additional bytes added to the
2725				 * receive socket buffer size.
2726				 */
2727				if ((so->so_flags & SOF_ENABLE_MSGS) &&
2728				    m->m_len &&
2729				    (m->m_flags & M_UNORDERED_DATA) &&
2730				    sbreserve(&so->so_rcv,
2731				    so->so_rcv.sb_hiwat - m->m_len)) {
2732					if (so->so_msg_state->msg_uno_bytes >
2733					    m->m_len) {
2734						so->so_msg_state->
2735						    msg_uno_bytes -= m->m_len;
2736					} else {
2737						so->so_msg_state->
2738						    msg_uno_bytes = 0;
2739					}
2740					m->m_flags &= ~M_UNORDERED_DATA;
2741				}
2742
2743				if (mp != NULL) {
2744					*mp = m;
2745					mp = &m->m_next;
2746					so->so_rcv.sb_mb = m = m->m_next;
2747					*mp = NULL;
2748				} else {
2749					if (free_list == NULL)
2750						free_list = m;
2751					else
2752						ml->m_next = m;
2753					ml = m;
2754					so->so_rcv.sb_mb = m = m->m_next;
2755					ml->m_next = NULL;
2756				}
2757				if (m != NULL) {
2758					m->m_nextpkt = nextrecord;
2759					if (nextrecord == NULL)
2760						so->so_rcv.sb_lastrecord = m;
2761				} else {
2762					so->so_rcv.sb_mb = nextrecord;
2763					SB_EMPTY_FIXUP(&so->so_rcv);
2764				}
2765				SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
2766				SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
2767			}
2768		} else {
2769			if (flags & MSG_PEEK) {
2770				moff += len;
2771			} else {
2772				if (mp != NULL) {
2773					int copy_flag;
2774
2775					if (flags & MSG_DONTWAIT)
2776						copy_flag = M_DONTWAIT;
2777					else
2778						copy_flag = M_WAIT;
2779					*mp = m_copym(m, 0, len, copy_flag);
2780					/*
2781					 * Failed to allocate an mbuf?
2782					 * Adjust uio_resid back, it was
2783					 * adjusted down by len bytes which
2784					 * we didn't copy over.
2785					 */
2786					if (*mp == NULL) {
2787						uio_setresid(uio,
2788						    (uio_resid(uio) + len));
2789						break;
2790					}
2791				}
2792				m->m_data += len;
2793				m->m_len -= len;
2794				so->so_rcv.sb_cc -= len;
2795			}
2796		}
2797		if (so->so_oobmark) {
2798			if ((flags & MSG_PEEK) == 0) {
2799				so->so_oobmark -= len;
2800				if (so->so_oobmark == 0) {
2801					so->so_state |= SS_RCVATMARK;
2802					/*
2803					 * delay posting the actual event until
2804					 * after any delayed copy processing
2805					 * has finished
2806					 */
2807					need_event = 1;
2808					break;
2809				}
2810			} else {
2811				offset += len;
2812				if (offset == so->so_oobmark)
2813					break;
2814			}
2815		}
2816		if (flags & MSG_EOR)
2817			break;
2818		/*
2819		 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
2820		 * (for non-atomic socket), we must not quit until
2821		 * "uio->uio_resid == 0" or an error termination.
2822		 * If a signal/timeout occurs, return with a short
2823		 * count but without error.  Keep sockbuf locked
2824		 * against other readers.
2825		 */
2826		while (flags & (MSG_WAITALL|MSG_WAITSTREAM) && m == NULL &&
2827		    (uio_resid(uio) - delayed_copy_len) > 0 &&
2828		    !sosendallatonce(so) && !nextrecord) {
2829			if (so->so_error || so->so_state & SS_CANTRCVMORE)
2830				goto release;
2831
2832			/*
2833			 * Depending on the protocol (e.g. TCP), the following
2834			 * might cause the socket lock to be dropped and later
2835			 * be reacquired, and more data could have arrived and
2836			 * have been appended to the receive socket buffer by
2837			 * the time it returns.  Therefore, we only sleep in
2838			 * sbwait() below if and only if the socket buffer is
2839			 * empty, in order to avoid a false sleep.
2840			 */
2841			if (pr->pr_flags & PR_WANTRCVD && so->so_pcb &&
2842			    (((struct inpcb *)so->so_pcb)->inp_state !=
2843			    INPCB_STATE_DEAD))
2844				(*pr->pr_usrreqs->pru_rcvd)(so, flags);
2845
2846			SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
2847			SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
2848
2849			if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
2850				error = 0;
2851				goto release;
2852			}
2853			/*
2854			 * have to wait until after we get back from the sbwait
2855			 * to do the copy because we will drop the lock if we
2856			 * have enough data that has been delayed... by dropping
2857			 * the lock we open up a window allowing the netisr
2858			 * thread to process the incoming packets and to change
2859			 * the state of this socket... we're issuing the sbwait
2860			 * because the socket is empty and we're expecting the
2861			 * netisr thread to wake us up when more packets arrive;
2862			 * if we allow that processing to happen and then sbwait
2863			 * we could stall forever with packets sitting in the
2864			 * socket if no further packets arrive from the remote
2865			 * side.
2866			 *
2867			 * we want to copy before we've collected all the data
2868			 * to satisfy this request to allow the copy to overlap
2869			 * the incoming packet processing on an MP system
2870			 */
2871			if (delayed_copy_len > sorecvmincopy &&
2872			    (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
2873				error = sodelayed_copy(so, uio,
2874				    &free_list, &delayed_copy_len);
2875
2876				if (error)
2877					goto release;
2878			}
2879			m = so->so_rcv.sb_mb;
2880			if (m != NULL) {
2881				nextrecord = m->m_nextpkt;
2882			}
2883			SB_MB_CHECK(&so->so_rcv);
2884		}
2885	}
2886#ifdef MORE_LOCKING_DEBUG
2887	if (so->so_usecount <= 1) {
2888		panic("%s: after big while so=%p ref=%d on socket\n",
2889		    __func__, so, so->so_usecount);
2890		/* NOTREACHED */
2891	}
2892#endif
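	/*
	 * For atomic protocols, any data left over in this record is either
	 * left in place and reported via MSG_RCVMORE (SO_DONTTRUNC), or
	 * dropped and reported via MSG_TRUNC.
	 */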
2893
2894	if (m != NULL && pr->pr_flags & PR_ATOMIC) {
2895		if (so->so_options & SO_DONTTRUNC) {
2896			flags |= MSG_RCVMORE;
2897		} else {
2898			flags |= MSG_TRUNC;
2899			if ((flags & MSG_PEEK) == 0)
2900				(void) sbdroprecord(&so->so_rcv);
2901		}
2902	}
2903
2904	/*
2905	 * pru_rcvd below (for TCP) may cause more data to be received
2906	 * if the socket lock is dropped prior to sending the ACK; some
2907	 * legacy OpenTransport applications don't handle this well
 * (if they receive less data than requested while MSG_HAVEMORE
2909	 * is set), and so we set the flag now based on what we know
2910	 * prior to calling pru_rcvd.
2911	 */
2912	if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0)
2913		flags |= MSG_HAVEMORE;
2914
2915	if ((flags & MSG_PEEK) == 0) {
2916		if (m == NULL) {
2917			so->so_rcv.sb_mb = nextrecord;
2918			/*
2919			 * First part is an inline SB_EMPTY_FIXUP().  Second
2920			 * part makes sure sb_lastrecord is up-to-date if
2921			 * there is still data in the socket buffer.
2922			 */
2923			if (so->so_rcv.sb_mb == NULL) {
2924				so->so_rcv.sb_mbtail = NULL;
2925				so->so_rcv.sb_lastrecord = NULL;
2926			} else if (nextrecord->m_nextpkt == NULL) {
2927				so->so_rcv.sb_lastrecord = nextrecord;
2928			}
2929			SB_MB_CHECK(&so->so_rcv);
2930		}
2931		SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
2932		SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
2933		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
2934			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
2935	}
2936
2937	if (delayed_copy_len) {
2938		error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
2939		if (error)
2940			goto release;
2941	}
2942	if (free_list != NULL) {
2943		m_freem_list(free_list);
2944		free_list = NULL;
2945	}
2946	if (need_event)
2947		postevent(so, 0, EV_OOB);
2948
2949	if (orig_resid == uio_resid(uio) && orig_resid &&
2950	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
2951		sbunlock(&so->so_rcv, TRUE);	/* keep socket locked */
2952		goto restart;
2953	}
2954
2955	if (flagsp != NULL)
2956		*flagsp |= flags;
2957release:
2958#ifdef MORE_LOCKING_DEBUG
2959	if (so->so_usecount <= 1) {
2960		panic("%s: release so=%p ref=%d on socket\n", __func__,
2961		    so, so->so_usecount);
2962		/* NOTREACHED */
2963	}
2964#endif
2965	if (delayed_copy_len)
2966		error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
2967
2968	if (free_list != NULL)
2969		m_freem_list(free_list);
2970
2971	sbunlock(&so->so_rcv, FALSE);	/* will unlock socket */
2972
2973	KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
2974	    so->so_rcv.sb_cc, 0, error);
2975
2976	return (error);
2977}
2978
2979/*
2980 * Returns:	0			Success
2981 *	uiomove:EFAULT
2982 */
2983static int
2984sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
2985    user_ssize_t *resid)
2986{
2987	int error = 0;
2988	struct mbuf *m;
2989
2990	m = *free_list;
2991
2992	socket_unlock(so, 0);
2993
2994	while (m != NULL && error == 0) {
2995		error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
2996		m = m->m_next;
2997	}
2998	m_freem_list(*free_list);
2999
3000	*free_list = NULL;
3001	*resid = 0;
3002
3003	socket_lock(so, 0);
3004
3005	return (error);
3006}
3007
3008/*
3009 * Returns:	0			Success
3010 *		EINVAL
3011 *		ENOTCONN
3012 *	<pru_shutdown>:EINVAL
3013 *	<pru_shutdown>:EADDRNOTAVAIL[TCP]
3014 *	<pru_shutdown>:ENOBUFS[TCP]
3015 *	<pru_shutdown>:EMSGSIZE[TCP]
3016 *	<pru_shutdown>:EHOSTUNREACH[TCP]
3017 *	<pru_shutdown>:ENETUNREACH[TCP]
3018 *	<pru_shutdown>:ENETDOWN[TCP]
3019 *	<pru_shutdown>:ENOMEM[TCP]
3020 *	<pru_shutdown>:EACCES[TCP]
3021 *	<pru_shutdown>:EMSGSIZE[TCP]
3022 *	<pru_shutdown>:ENOBUFS[TCP]
3023 *	<pru_shutdown>:???[TCP]		[ignorable: mostly IPSEC/firewall/DLIL]
3024 *	<pru_shutdown>:???		[other protocol families]
3025 */
3026int
3027soshutdown(struct socket *so, int how)
3028{
3029	int error;
3030
3031	switch (how) {
3032	case SHUT_RD:
3033	case SHUT_WR:
3034	case SHUT_RDWR:
3035		socket_lock(so, 1);
3036		if ((so->so_state &
3037		    (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) == 0) {
3038			error = ENOTCONN;
3039		} else {
3040			error = soshutdownlock(so, how);
3041		}
3042		socket_unlock(so, 1);
3043		break;
3044	default:
3045		error = EINVAL;
3046		break;
3047	}
3048
3049	return (error);
3050}
3051
3052int
3053soshutdownlock(struct socket *so, int how)
3054{
3055	struct protosw *pr = so->so_proto;
3056	int error = 0;
3057
3058	sflt_notify(so, sock_evt_shutdown, &how);
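	/*
	 * SHUT_RD and SHUT_RDWR disable further receives and flush the
	 * receive buffer; SHUT_WR and SHUT_RDWR disable further sends via
	 * the protocol's pru_shutdown.
	 */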
3059
3060	if (how != SHUT_WR) {
3061		if ((so->so_state & SS_CANTRCVMORE) != 0) {
3062			/* read already shut down */
3063			error = ENOTCONN;
3064			goto done;
3065		}
3066		sorflush(so);
3067		postevent(so, 0, EV_RCLOSED);
3068	}
3069	if (how != SHUT_RD) {
3070		if ((so->so_state & SS_CANTSENDMORE) != 0) {
3071			/* write already shut down */
3072			error = ENOTCONN;
3073			goto done;
3074		}
3075		error = (*pr->pr_usrreqs->pru_shutdown)(so);
3076		postevent(so, 0, EV_WCLOSED);
3077	}
3078done:
3079	KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, 0, 0, 0, 0, 0);
3080	return (error);
3081}
3082
3083void
3084sowflush(struct socket *so)
3085{
3086	struct sockbuf *sb = &so->so_snd;
3087#ifdef notyet
3088	lck_mtx_t *mutex_held;
3089	/*
3090	 * XXX: This code is currently commented out, because we may get here
3091	 * as part of sofreelastref(), and at that time, pr_getlock() may no
3092	 * longer be able to return us the lock; this will be fixed in future.
3093	 */
3094	if (so->so_proto->pr_getlock != NULL)
3095		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
3096	else
3097		mutex_held = so->so_proto->pr_domain->dom_mtx;
3098
3099	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
3100#endif /* notyet */
3101
3102	/*
3103	 * Obtain lock on the socket buffer (SB_LOCK).  This is required
3104	 * to prevent the socket buffer from being unexpectedly altered
3105	 * while it is used by another thread in socket send/receive.
3106	 *
3107	 * sblock() must not fail here, hence the assertion.
3108	 */
3109	(void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
3110	VERIFY(sb->sb_flags & SB_LOCK);
3111
3112	sb->sb_flags		&= ~(SB_SEL|SB_UPCALL);
3113	sb->sb_flags		|= SB_DROP;
3114	sb->sb_upcall		= NULL;
3115	sb->sb_upcallarg	= NULL;
3116
3117	sbunlock(sb, TRUE);	/* keep socket locked */
3118
3119	selthreadclear(&sb->sb_sel);
3120	sbrelease(sb);
3121}
3122
3123void
3124sorflush(struct socket *so)
3125{
3126	struct sockbuf *sb = &so->so_rcv;
3127	struct protosw *pr = so->so_proto;
3128	struct sockbuf asb;
3129#ifdef notyet
3130	lck_mtx_t *mutex_held;
3131	/*
3132	 * XXX: This code is currently commented out, because we may get here
3133	 * as part of sofreelastref(), and at that time, pr_getlock() may no
3134	 * longer be able to return us the lock; this will be fixed in future.
3135	 */
3136	if (so->so_proto->pr_getlock != NULL)
3137		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
3138	else
3139		mutex_held = so->so_proto->pr_domain->dom_mtx;
3140
3141	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
3142#endif /* notyet */
3143
3144	sflt_notify(so, sock_evt_flush_read, NULL);
3145
3146	socantrcvmore(so);
3147
3148	/*
3149	 * Obtain lock on the socket buffer (SB_LOCK).  This is required
3150	 * to prevent the socket buffer from being unexpectedly altered
3151	 * while it is used by another thread in socket send/receive.
3152	 *
3153	 * sblock() must not fail here, hence the assertion.
3154	 */
3155	(void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
3156	VERIFY(sb->sb_flags & SB_LOCK);
3157
3158	/*
3159	 * Copy only the relevant fields from "sb" to "asb" which we
3160	 * need for sbrelease() to function.  In particular, skip
3161	 * sb_sel as it contains the wait queue linkage, which would
3162	 * wreak havoc if we were to issue selthreadclear() on "asb".
3163	 * Make sure to not carry over SB_LOCK in "asb", as we need
3164	 * to acquire it later as part of sbrelease().
3165	 */
3166	bzero(&asb, sizeof (asb));
3167	asb.sb_cc		= sb->sb_cc;
3168	asb.sb_hiwat		= sb->sb_hiwat;
3169	asb.sb_mbcnt		= sb->sb_mbcnt;
3170	asb.sb_mbmax		= sb->sb_mbmax;
3171	asb.sb_ctl		= sb->sb_ctl;
3172	asb.sb_lowat		= sb->sb_lowat;
3173	asb.sb_mb		= sb->sb_mb;
3174	asb.sb_mbtail		= sb->sb_mbtail;
3175	asb.sb_lastrecord	= sb->sb_lastrecord;
3176	asb.sb_so		= sb->sb_so;
3177	asb.sb_flags		= sb->sb_flags;
3178	asb.sb_flags		&= ~(SB_LOCK|SB_SEL|SB_KNOTE|SB_UPCALL);
3179	asb.sb_flags		|= SB_DROP;
3180
3181	/*
3182	 * Ideally we'd bzero() these and preserve the ones we need;
3183	 * but to do that we'd need to shuffle things around in the
3184	 * sockbuf, and we can't do it now because there are KEXTS
3185	 * that are directly referring to the socket structure.
3186	 *
3187	 * Setting SB_DROP acts as a barrier to prevent further appends.
3188	 * Clearing SB_SEL is done for selthreadclear() below.
3189	 */
3190	sb->sb_cc		= 0;
3191	sb->sb_hiwat		= 0;
3192	sb->sb_mbcnt		= 0;
3193	sb->sb_mbmax		= 0;
3194	sb->sb_ctl		= 0;
3195	sb->sb_lowat		= 0;
3196	sb->sb_mb		= NULL;
3197	sb->sb_mbtail		= NULL;
3198	sb->sb_lastrecord	= NULL;
3199	sb->sb_timeo.tv_sec	= 0;
3200	sb->sb_timeo.tv_usec	= 0;
3201	sb->sb_upcall		= NULL;
3202	sb->sb_upcallarg	= NULL;
3203	sb->sb_flags		&= ~(SB_SEL|SB_UPCALL);
3204	sb->sb_flags		|= SB_DROP;
3205
3206	sbunlock(sb, TRUE);	/* keep socket locked */
3207
3208	/*
3209	 * Note that selthreadclear() is called on the original "sb" and
3210	 * not the local "asb" because of the way wait queue linkage is
3211	 * implemented.  Given that selwakeup() may be triggered, SB_SEL
3212	 * should no longer be set (cleared above.)
3213	 */
3214	selthreadclear(&sb->sb_sel);
3215
3216	if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose)
3217		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
3218
3219	sbrelease(&asb);
3220}
3221
3222/*
3223 * Perhaps this routine, and sooptcopyout(), below, ought to come in
3224 * an additional variant to handle the case where the option value needs
3225 * to be some kind of integer, but not a specific size.
3226 * In addition to their use here, these functions are also called by the
3227 * protocol-level pr_ctloutput() routines.
3228 *
3229 * Returns:	0			Success
3230 *		EINVAL
3231 *	copyin:EFAULT
3232 */
3233int
3234sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
3235{
3236	size_t	valsize;
3237
3238	/*
3239	 * If the user gives us more than we wanted, we ignore it,
3240	 * but if we don't get the minimum length the caller
3241	 * wants, we return EINVAL.  On success, sopt->sopt_valsize
3242	 * is set to however much we actually retrieved.
3243	 */
3244	if ((valsize = sopt->sopt_valsize) < minlen)
3245		return (EINVAL);
3246	if (valsize > len)
3247		sopt->sopt_valsize = valsize = len;
3248
3249	if (sopt->sopt_p != kernproc)
3250		return (copyin(sopt->sopt_val, buf, valsize));
3251
3252	bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
3253	return (0);
3254}
3255
3256/*
3257 * sooptcopyin_timeval
 *   Copy in a timeval value into tv_p, taking into account whether the
 *   calling process is 64-bit or 32-bit.  Moved the sanity checking
3260 *   code here so that we can verify the 64-bit tv_sec value before we lose
3261 *   the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
3262 */
3263static int
3264sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
3265{
3266	int			error;
3267
3268	if (proc_is64bit(sopt->sopt_p)) {
3269		struct user64_timeval	tv64;
3270
3271		if (sopt->sopt_valsize < sizeof (tv64))
3272			return (EINVAL);
3273
3274		sopt->sopt_valsize = sizeof (tv64);
3275		if (sopt->sopt_p != kernproc) {
3276			error = copyin(sopt->sopt_val, &tv64, sizeof (tv64));
3277			if (error != 0)
3278				return (error);
3279		} else {
3280			bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
3281			    sizeof (tv64));
3282		}
3283		if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
3284		    tv64.tv_usec < 0 || tv64.tv_usec >= 1000000)
3285			return (EDOM);
3286
3287		tv_p->tv_sec = tv64.tv_sec;
3288		tv_p->tv_usec = tv64.tv_usec;
3289	} else {
3290		struct user32_timeval	tv32;
3291
3292		if (sopt->sopt_valsize < sizeof (tv32))
3293			return (EINVAL);
3294
3295		sopt->sopt_valsize = sizeof (tv32);
3296		if (sopt->sopt_p != kernproc) {
3297			error = copyin(sopt->sopt_val, &tv32, sizeof (tv32));
3298			if (error != 0) {
3299				return (error);
3300			}
3301		} else {
3302			bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
3303			    sizeof (tv32));
3304		}
3305#ifndef __LP64__
3306		/*
3307		 * K64todo "comparison is always false due to
3308		 * limited range of data type"
3309		 */
3310		if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
3311		    tv32.tv_usec < 0 || tv32.tv_usec >= 1000000)
3312			return (EDOM);
3313#endif
3314		tv_p->tv_sec = tv32.tv_sec;
3315		tv_p->tv_usec = tv32.tv_usec;
3316	}
3317	return (0);
3318}
3319
3320/*
3321 * Returns:	0			Success
3322 *		EINVAL
3323 *		ENOPROTOOPT
3324 *		ENOBUFS
3325 *		EDOM
3326 *	sooptcopyin:EINVAL
3327 *	sooptcopyin:EFAULT
3328 *	sooptcopyin_timeval:EINVAL
3329 *	sooptcopyin_timeval:EFAULT
3330 *	sooptcopyin_timeval:EDOM
3331 *	<pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
3332 *	<pr_ctloutput>:???w
3333 *	sflt_attach_private:???		[whatever a filter author chooses]
3334 *	<sf_setoption>:???		[whatever a filter author chooses]
3335 *
 * Notes:	Other <pr_ctloutput> returns depend on the protocol family; all
 *		<sf_setoption> returns depend on what the filter author causes
3338 *		their filter to return.
3339 */
3340int
3341sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
3342{
3343	int	error, optval;
3344	struct	linger l;
3345	struct	timeval tv;
3346#if CONFIG_MACF_SOCKET
3347	struct mac extmac;
#endif /* CONFIG_MACF_SOCKET */
3349
3350	if (sopt->sopt_dir != SOPT_SET)
3351		sopt->sopt_dir = SOPT_SET;
3352
3353	if (dolock)
3354		socket_lock(so, 1);
3355
3356	if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
3357	    (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
3358	    (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
		/* the socket has been shut down, no more sockopts */
3360		error = EINVAL;
3361		goto out;
3362	}
3363
3364	error = sflt_setsockopt(so, sopt);
3365	if (error != 0) {
3366		if (error == EJUSTRETURN)
3367			error = 0;
3368		goto out;
3369	}
3370
3371	if (sopt->sopt_level != SOL_SOCKET) {
3372		if (so->so_proto != NULL &&
3373		    so->so_proto->pr_ctloutput != NULL) {
3374			error = (*so->so_proto->pr_ctloutput)(so, sopt);
3375			goto out;
3376		}
3377		error = ENOPROTOOPT;
3378	} else {
3379		/*
3380		 * Allow socket-level (SOL_SOCKET) options to be filtered by
3381		 * the protocol layer, if needed.  A zero value returned from
3382		 * the handler means use default socket-level processing as
3383		 * done by the rest of this routine.  Otherwise, any other
3384		 * return value indicates that the option is unsupported.
3385		 */
3386		if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
3387		    pru_socheckopt(so, sopt)) != 0)
3388			goto out;
3389
3390		error = 0;
3391		switch (sopt->sopt_name) {
3392		case SO_LINGER:
3393		case SO_LINGER_SEC:
3394			error = sooptcopyin(sopt, &l, sizeof (l), sizeof (l));
3395			if (error != 0)
3396				goto out;
3397
3398			so->so_linger = (sopt->sopt_name == SO_LINGER) ?
3399			    l.l_linger : l.l_linger * hz;
3400			if (l.l_onoff != 0)
3401				so->so_options |= SO_LINGER;
3402			else
3403				so->so_options &= ~SO_LINGER;
3404			break;
3405
3406		case SO_DEBUG:
3407		case SO_KEEPALIVE:
3408		case SO_DONTROUTE:
3409		case SO_USELOOPBACK:
3410		case SO_BROADCAST:
3411		case SO_REUSEADDR:
3412		case SO_REUSEPORT:
3413		case SO_OOBINLINE:
3414		case SO_TIMESTAMP:
3415		case SO_TIMESTAMP_MONOTONIC:
3416		case SO_DONTTRUNC:
3417		case SO_WANTMORE:
3418		case SO_WANTOOBFLAG:
3419			error = sooptcopyin(sopt, &optval, sizeof (optval),
3420			    sizeof (optval));
3421			if (error != 0)
3422				goto out;
3423			if (optval)
3424				so->so_options |= sopt->sopt_name;
3425			else
3426				so->so_options &= ~sopt->sopt_name;
3427			break;
3428
3429		case SO_SNDBUF:
3430		case SO_RCVBUF:
3431		case SO_SNDLOWAT:
3432		case SO_RCVLOWAT:
3433			error = sooptcopyin(sopt, &optval, sizeof (optval),
3434			    sizeof (optval));
3435			if (error != 0)
3436				goto out;
3437
3438			/*
3439			 * Values < 1 make no sense for any of these
3440			 * options, so disallow them.
3441			 */
3442			if (optval < 1) {
3443				error = EINVAL;
3444				goto out;
3445			}
3446
3447			switch (sopt->sopt_name) {
3448			case SO_SNDBUF:
3449			case SO_RCVBUF: {
3450				struct sockbuf *sb =
3451				    (sopt->sopt_name == SO_SNDBUF) ?
3452				    &so->so_snd : &so->so_rcv;
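				/*
				 * Reserving an explicit size marks the buffer
				 * as user-sized (SB_USRSIZE) and disables
				 * automatic resizing (SB_AUTOSIZE).
				 */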
3453				if (sbreserve(sb, (u_int32_t)optval) == 0) {
3454					error = ENOBUFS;
3455					goto out;
3456				}
3457				sb->sb_flags |= SB_USRSIZE;
3458				sb->sb_flags &= ~SB_AUTOSIZE;
3459				sb->sb_idealsize = (u_int32_t)optval;
3460				break;
3461			}
3462			/*
3463			 * Make sure the low-water is never greater than
3464			 * the high-water.
3465			 */
3466			case SO_SNDLOWAT:
3467				so->so_snd.sb_lowat =
3468				    (optval > so->so_snd.sb_hiwat) ?
3469				    so->so_snd.sb_hiwat : optval;
3470				break;
3471			case SO_RCVLOWAT:
3472				so->so_rcv.sb_lowat =
3473				    (optval > so->so_rcv.sb_hiwat) ?
3474				    so->so_rcv.sb_hiwat : optval;
3475				break;
3476			}
3477			break;
3478
3479		case SO_SNDTIMEO:
3480		case SO_RCVTIMEO:
3481			error = sooptcopyin_timeval(sopt, &tv);
3482			if (error != 0)
3483				goto out;
3484
3485			switch (sopt->sopt_name) {
3486			case SO_SNDTIMEO:
3487				so->so_snd.sb_timeo = tv;
3488				break;
3489			case SO_RCVTIMEO:
3490				so->so_rcv.sb_timeo = tv;
3491				break;
3492			}
3493			break;
3494
3495		case SO_NKE: {
3496			struct so_nke nke;
3497
3498			error = sooptcopyin(sopt, &nke, sizeof (nke),
3499			    sizeof (nke));
3500			if (error != 0)
3501				goto out;
3502
3503			error = sflt_attach_internal(so, nke.nke_handle);
3504			break;
3505		}
3506
3507		case SO_NOSIGPIPE:
3508			error = sooptcopyin(sopt, &optval, sizeof (optval),
3509			    sizeof (optval));
3510			if (error != 0)
3511				goto out;
3512			if (optval != 0)
3513				so->so_flags |= SOF_NOSIGPIPE;
3514			else
3515				so->so_flags &= ~SOF_NOSIGPIPE;
3516			break;
3517
3518		case SO_NOADDRERR:
3519			error = sooptcopyin(sopt, &optval, sizeof (optval),
3520			    sizeof (optval));
3521			if (error != 0)
3522				goto out;
3523			if (optval != 0)
3524				so->so_flags |= SOF_NOADDRAVAIL;
3525			else
3526				so->so_flags &= ~SOF_NOADDRAVAIL;
3527			break;
3528
3529		case SO_REUSESHAREUID:
3530			error = sooptcopyin(sopt, &optval, sizeof (optval),
3531			    sizeof (optval));
3532			if (error != 0)
3533				goto out;
3534			if (optval != 0)
3535				so->so_flags |= SOF_REUSESHAREUID;
3536			else
3537				so->so_flags &= ~SOF_REUSESHAREUID;
3538			break;
3539
3540		case SO_NOTIFYCONFLICT:
3541			if (kauth_cred_issuser(kauth_cred_get()) == 0) {
3542				error = EPERM;
3543				goto out;
3544			}
3545			error = sooptcopyin(sopt, &optval, sizeof (optval),
3546			    sizeof (optval));
3547			if (error != 0)
3548				goto out;
3549			if (optval != 0)
3550				so->so_flags |= SOF_NOTIFYCONFLICT;
3551			else
3552				so->so_flags &= ~SOF_NOTIFYCONFLICT;
3553			break;
3554
3555		case SO_RESTRICTIONS:
3556			error = sooptcopyin(sopt, &optval, sizeof (optval),
3557			    sizeof (optval));
3558			if (error != 0)
3559				goto out;
3560
3561			error = so_set_restrictions(so, optval);
3562			break;
3563
3564		case SO_LABEL:
3565#if CONFIG_MACF_SOCKET
3566			if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
3567			    sizeof (extmac))) != 0)
3568				goto out;
3569
3570			error = mac_setsockopt_label(proc_ucred(sopt->sopt_p),
3571			    so, &extmac);
3572#else
3573			error = EOPNOTSUPP;
#endif /* CONFIG_MACF_SOCKET */
3575			break;
3576
3577		case SO_UPCALLCLOSEWAIT:
3578			error = sooptcopyin(sopt, &optval, sizeof (optval),
3579			    sizeof (optval));
3580			if (error != 0)
3581				goto out;
3582			if (optval != 0)
3583				so->so_flags |= SOF_UPCALLCLOSEWAIT;
3584			else
3585				so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
3586			break;
3587
3588		case SO_RANDOMPORT:
3589			error = sooptcopyin(sopt, &optval, sizeof (optval),
3590			    sizeof (optval));
3591			if (error != 0)
3592				goto out;
3593			if (optval != 0)
3594				so->so_flags |= SOF_BINDRANDOMPORT;
3595			else
3596				so->so_flags &= ~SOF_BINDRANDOMPORT;
3597			break;
3598
3599		case SO_NP_EXTENSIONS: {
3600			struct so_np_extensions sonpx;
3601
3602			error = sooptcopyin(sopt, &sonpx, sizeof (sonpx),
3603			    sizeof (sonpx));
3604			if (error != 0)
3605				goto out;
3606			if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
3607				error = EINVAL;
3608				goto out;
3609			}
3610			/*
3611			 * Only one bit defined for now
3612			 */
3613			if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
3614				if ((sonpx.npx_flags & SONPX_SETOPTSHUT))
3615					so->so_flags |= SOF_NPX_SETOPTSHUT;
3616				else
3617					so->so_flags &= ~SOF_NPX_SETOPTSHUT;
3618			}
3619			break;
3620		}
3621
3622		case SO_TRAFFIC_CLASS: {
3623			error = sooptcopyin(sopt, &optval, sizeof (optval),
3624			    sizeof (optval));
3625			if (error != 0)
3626				goto out;
3627			error = so_set_traffic_class(so, optval);
3628			if (error != 0)
3629				goto out;
3630			break;
3631		}
3632
3633		case SO_RECV_TRAFFIC_CLASS: {
3634			error = sooptcopyin(sopt, &optval, sizeof (optval),
3635			    sizeof (optval));
3636			if (error != 0)
3637				goto out;
3638			if (optval == 0)
3639				so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
3640			else
3641				so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
3642			break;
3643		}
3644
3645		case SO_TRAFFIC_CLASS_DBG: {
3646			struct so_tcdbg so_tcdbg;
3647
3648			error = sooptcopyin(sopt, &so_tcdbg,
3649			    sizeof (struct so_tcdbg), sizeof (struct so_tcdbg));
3650			if (error != 0)
3651				goto out;
3652			error = so_set_tcdbg(so, &so_tcdbg);
3653			if (error != 0)
3654				goto out;
3655			break;
3656		}
3657
3658		case SO_PRIVILEGED_TRAFFIC_CLASS:
3659			error = priv_check_cred(kauth_cred_get(),
3660			    PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
3661			if (error != 0)
3662				goto out;
3663			error = sooptcopyin(sopt, &optval, sizeof (optval),
3664			    sizeof (optval));
3665			if (error != 0)
3666				goto out;
3667			if (optval == 0)
3668				so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
3669			else
3670				so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
3671			break;
3672
3673		case SO_DEFUNCTOK:
3674			error = sooptcopyin(sopt, &optval, sizeof (optval),
3675			    sizeof (optval));
3676			if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
3677				if (error == 0)
3678					error = EBADF;
3679				goto out;
3680			}
3681			/*
3682			 * Any process can set SO_DEFUNCTOK (clear
3683			 * SOF_NODEFUNCT), but only root can clear
3684			 * SO_DEFUNCTOK (set SOF_NODEFUNCT).
3685			 */
3686			if (optval == 0 &&
3687			    kauth_cred_issuser(kauth_cred_get()) == 0) {
3688				error = EPERM;
3689				goto out;
3690			}
3691			if (optval)
3692				so->so_flags &= ~SOF_NODEFUNCT;
3693			else
3694				so->so_flags |= SOF_NODEFUNCT;
3695
3696			if (SOCK_DOM(so) == PF_INET ||
3697			    SOCK_DOM(so) == PF_INET6) {
3698				char s[MAX_IPv6_STR_LEN];
3699				char d[MAX_IPv6_STR_LEN];
3700				struct inpcb *inp = sotoinpcb(so);
3701
3702				SODEFUNCTLOG(("%s[%d]: so 0x%llx [%s %s:%d -> "
3703				    "%s:%d] is now marked as %seligible for "
3704				    "defunct\n", __func__, proc_selfpid(),
3705				    (uint64_t)VM_KERNEL_ADDRPERM(so),
3706				    (SOCK_TYPE(so) == SOCK_STREAM) ?
3707				    "TCP" : "UDP", inet_ntop(SOCK_DOM(so),
3708				    ((SOCK_DOM(so) == PF_INET) ?
3709				    (void *)&inp->inp_laddr.s_addr :
3710				    (void *)&inp->in6p_laddr), s, sizeof (s)),
3711				    ntohs(inp->in6p_lport),
3712				    inet_ntop(SOCK_DOM(so),
3713				    (SOCK_DOM(so) == PF_INET) ?
3714				    (void *)&inp->inp_faddr.s_addr :
3715				    (void *)&inp->in6p_faddr, d, sizeof (d)),
3716				    ntohs(inp->in6p_fport),
3717				    (so->so_flags & SOF_NODEFUNCT) ?
3718				    "not " : ""));
3719			} else {
3720				SODEFUNCTLOG(("%s[%d]: so 0x%llx [%d,%d] is "
3721				    "now marked as %seligible for defunct\n",
3722				    __func__, proc_selfpid(),
3723				    (uint64_t)VM_KERNEL_ADDRPERM(so),
3724				    SOCK_DOM(so), SOCK_TYPE(so),
3725				    (so->so_flags & SOF_NODEFUNCT) ?
3726				    "not " : ""));
3727			}
3728			break;
3729
3730		case SO_ISDEFUNCT:
3731			/* This option is not settable */
3732			error = EINVAL;
3733			break;
3734
3735		case SO_OPPORTUNISTIC:
3736			error = sooptcopyin(sopt, &optval, sizeof (optval),
3737			    sizeof (optval));
3738			if (error == 0)
3739				error = so_set_opportunistic(so, optval);
3740			break;
3741
3742		case SO_FLUSH:
3743			/* This option is handled by lower layer(s) */
3744			error = 0;
3745			break;
3746
3747		case SO_RECV_ANYIF:
3748			error = sooptcopyin(sopt, &optval, sizeof (optval),
3749			    sizeof (optval));
3750			if (error == 0)
3751				error = so_set_recv_anyif(so, optval);
3752			break;
3753
3754		case SO_TRAFFIC_MGT_BACKGROUND: {
3755			/* This option is handled by lower layer(s) */
3756			error = 0;
3757			break;
3758		}
3759
3760#if FLOW_DIVERT
3761		case SO_FLOW_DIVERT_TOKEN:
3762			error = flow_divert_token_set(so, sopt);
3763			break;
3764#endif	/* FLOW_DIVERT */
3765
3766
3767		case SO_DELEGATED:
3768			if ((error = sooptcopyin(sopt, &optval, sizeof (optval),
3769			    sizeof (optval))) != 0)
3770				break;
3771
3772			error = so_set_effective_pid(so, optval, sopt->sopt_p);
3773			break;
3774
3775		case SO_DELEGATED_UUID: {
3776			uuid_t euuid;
3777
3778			if ((error = sooptcopyin(sopt, &euuid, sizeof (euuid),
3779			    sizeof (euuid))) != 0)
3780				break;
3781
3782			error = so_set_effective_uuid(so, euuid, sopt->sopt_p);
3783			break;
3784		}
3785
3786		default:
3787			error = ENOPROTOOPT;
3788			break;
3789		}
3790		if (error == 0 && so->so_proto != NULL &&
3791		    so->so_proto->pr_ctloutput != NULL) {
3792			(void) so->so_proto->pr_ctloutput(so, sopt);
3793		}
3794	}
3795out:
3796	if (dolock)
3797		socket_unlock(so, 1);
3798	return (error);
3799}
3800
3801/* Helper routines for getsockopt */
3802int
3803sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
3804{
3805	int	error;
3806	size_t	valsize;
3807
3808	error = 0;
3809
3810	/*
3811	 * Documented get behavior is that we always return a value,
3812	 * possibly truncated to fit in the user's buffer.
3813	 * Traditional behavior is that we always tell the user
3814	 * precisely how much we copied, rather than something useful
3815	 * like the total amount we had available for her.
	 * Note that this interface is not idempotent; the entire answer
	 * must be generated ahead of time.
3818	 */
3819	valsize = min(len, sopt->sopt_valsize);
3820	sopt->sopt_valsize = valsize;
3821	if (sopt->sopt_val != USER_ADDR_NULL) {
3822		if (sopt->sopt_p != kernproc)
3823			error = copyout(buf, sopt->sopt_val, valsize);
3824		else
3825			bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
3826	}
3827	return (error);
3828}
3829
3830static int
3831sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
3832{
3833	int			error;
3834	size_t			len;
3835	struct user64_timeval	tv64;
3836	struct user32_timeval	tv32;
3837	const void *		val;
3838	size_t			valsize;
3839
3840	error = 0;
3841	if (proc_is64bit(sopt->sopt_p)) {
3842		len = sizeof (tv64);
3843		tv64.tv_sec = tv_p->tv_sec;
3844		tv64.tv_usec = tv_p->tv_usec;
3845		val = &tv64;
3846	} else {
3847		len = sizeof (tv32);
3848		tv32.tv_sec = tv_p->tv_sec;
3849		tv32.tv_usec = tv_p->tv_usec;
3850		val = &tv32;
3851	}
3852	valsize = min(len, sopt->sopt_valsize);
3853	sopt->sopt_valsize = valsize;
3854	if (sopt->sopt_val != USER_ADDR_NULL) {
3855		if (sopt->sopt_p != kernproc)
3856			error = copyout(val, sopt->sopt_val, valsize);
3857		else
3858			bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
3859	}
3860	return (error);
3861}
3862
3863/*
3864 * Return:	0			Success
3865 *		ENOPROTOOPT
3866 *	<pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
3867 *	<pr_ctloutput>:???
3868 *	<sf_getoption>:???
3869 */
3870int
3871sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
3872{
3873	int	error, optval;
3874	struct	linger l;
3875	struct	timeval tv;
3876#if CONFIG_MACF_SOCKET
3877	struct mac extmac;
#endif /* CONFIG_MACF_SOCKET */
3879
3880	if (sopt->sopt_dir != SOPT_GET)
3881		sopt->sopt_dir = SOPT_GET;
3882
3883	if (dolock)
3884		socket_lock(so, 1);
3885
3886	error = sflt_getsockopt(so, sopt);
3887	if (error != 0) {
3888		if (error == EJUSTRETURN)
3889			error = 0;
3890		goto out;
3891	}
3892
3893	if (sopt->sopt_level != SOL_SOCKET) {
3894		if (so->so_proto != NULL &&
3895		    so->so_proto->pr_ctloutput != NULL) {
3896			error = (*so->so_proto->pr_ctloutput)(so, sopt);
3897			goto out;
3898		}
3899		error = ENOPROTOOPT;
3900	} else {
3901		/*
3902		 * Allow socket-level (SOL_SOCKET) options to be filtered by
3903		 * the protocol layer, if needed.  A zero value returned from
3904		 * the handler means use default socket-level processing as
3905		 * done by the rest of this routine.  Otherwise, any other
3906		 * return value indicates that the option is unsupported.
3907		 */
3908		if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
3909		    pru_socheckopt(so, sopt)) != 0)
3910			goto out;
3911
3912		error = 0;
3913		switch (sopt->sopt_name) {
3914		case SO_LINGER:
3915		case SO_LINGER_SEC:
3916			l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
3917			l.l_linger = (sopt->sopt_name == SO_LINGER) ?
3918			    so->so_linger : so->so_linger / hz;
3919			error = sooptcopyout(sopt, &l, sizeof (l));
3920			break;
3921
3922		case SO_USELOOPBACK:
3923		case SO_DONTROUTE:
3924		case SO_DEBUG:
3925		case SO_KEEPALIVE:
3926		case SO_REUSEADDR:
3927		case SO_REUSEPORT:
3928		case SO_BROADCAST:
3929		case SO_OOBINLINE:
3930		case SO_TIMESTAMP:
3931		case SO_TIMESTAMP_MONOTONIC:
3932		case SO_DONTTRUNC:
3933		case SO_WANTMORE:
3934		case SO_WANTOOBFLAG:
3935			optval = so->so_options & sopt->sopt_name;
3936integer:
3937			error = sooptcopyout(sopt, &optval, sizeof (optval));
3938			break;
3939
3940		case SO_TYPE:
3941			optval = so->so_type;
3942			goto integer;
3943
3944		case SO_NREAD:
3945			if (so->so_proto->pr_flags & PR_ATOMIC) {
3946				int pkt_total;
3947				struct mbuf *m1;
3948
3949				pkt_total = 0;
3950				m1 = so->so_rcv.sb_mb;
3951				while (m1 != NULL) {
3952					if (m1->m_type == MT_DATA ||
3953					    m1->m_type == MT_HEADER ||
3954					    m1->m_type == MT_OOBDATA)
3955						pkt_total += m1->m_len;
3956					m1 = m1->m_next;
3957				}
3958				optval = pkt_total;
3959			} else {
3960				optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
3961			}
3962			goto integer;
3963
3964		case SO_NWRITE:
3965			optval = so->so_snd.sb_cc;
3966			goto integer;
3967
3968		case SO_ERROR:
3969			optval = so->so_error;
3970			so->so_error = 0;
3971			goto integer;
3972
3973		case SO_SNDBUF:
3974			optval = so->so_snd.sb_hiwat;
3975			goto integer;
3976
3977		case SO_RCVBUF:
3978			optval = so->so_rcv.sb_hiwat;
3979			goto integer;
3980
3981		case SO_SNDLOWAT:
3982			optval = so->so_snd.sb_lowat;
3983			goto integer;
3984
3985		case SO_RCVLOWAT:
3986			optval = so->so_rcv.sb_lowat;
3987			goto integer;
3988
3989		case SO_SNDTIMEO:
3990		case SO_RCVTIMEO:
3991			tv = (sopt->sopt_name == SO_SNDTIMEO ?
3992			    so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
3993
3994			error = sooptcopyout_timeval(sopt, &tv);
3995			break;
3996
3997		case SO_NOSIGPIPE:
3998			optval = (so->so_flags & SOF_NOSIGPIPE);
3999			goto integer;
4000
4001		case SO_NOADDRERR:
4002			optval = (so->so_flags & SOF_NOADDRAVAIL);
4003			goto integer;
4004
4005		case SO_REUSESHAREUID:
4006			optval = (so->so_flags & SOF_REUSESHAREUID);
4007			goto integer;
4008
4009
4010		case SO_NOTIFYCONFLICT:
4011			optval = (so->so_flags & SOF_NOTIFYCONFLICT);
4012			goto integer;
4013
4014		case SO_RESTRICTIONS:
4015			optval = so_get_restrictions(so);
4016			goto integer;
4017
4018		case SO_LABEL:
4019#if CONFIG_MACF_SOCKET
4020			if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
4021			    sizeof (extmac))) != 0 ||
4022			    (error = mac_socket_label_get(proc_ucred(
4023			    sopt->sopt_p), so, &extmac)) != 0)
4024				break;
4025
4026			error = sooptcopyout(sopt, &extmac, sizeof (extmac));
4027#else
4028			error = EOPNOTSUPP;
#endif /* CONFIG_MACF_SOCKET */
4030			break;
4031
4032		case SO_PEERLABEL:
4033#if CONFIG_MACF_SOCKET
4034			if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
4035			    sizeof (extmac))) != 0 ||
4036			    (error = mac_socketpeer_label_get(proc_ucred(
4037			    sopt->sopt_p), so, &extmac)) != 0)
4038				break;
4039
4040			error = sooptcopyout(sopt, &extmac, sizeof (extmac));
4041#else
4042			error = EOPNOTSUPP;
#endif /* CONFIG_MACF_SOCKET */
4044			break;
4045
4046#ifdef __APPLE_API_PRIVATE
4047		case SO_UPCALLCLOSEWAIT:
4048			optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
4049			goto integer;
4050#endif
4051		case SO_RANDOMPORT:
4052			optval = (so->so_flags & SOF_BINDRANDOMPORT);
4053			goto integer;
4054
4055		case SO_NP_EXTENSIONS: {
4056			struct so_np_extensions sonpx;
4057
4058			sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
4059			    SONPX_SETOPTSHUT : 0;
4060			sonpx.npx_mask = SONPX_MASK_VALID;
4061
4062			error = sooptcopyout(sopt, &sonpx,
4063			    sizeof (struct so_np_extensions));
4064			break;
4065		}
4066
4067		case SO_TRAFFIC_CLASS:
4068			optval = so->so_traffic_class;
4069			goto integer;
4070
4071		case SO_RECV_TRAFFIC_CLASS:
4072			optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
4073			goto integer;
4074
4075		case SO_TRAFFIC_CLASS_STATS:
4076			error = sooptcopyout(sopt, &so->so_tc_stats,
4077			    sizeof (so->so_tc_stats));
4078			break;
4079
4080		case SO_TRAFFIC_CLASS_DBG:
4081			error = sogetopt_tcdbg(so, sopt);
4082			break;
4083
4084		case SO_PRIVILEGED_TRAFFIC_CLASS:
4085			optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
4086			goto integer;
4087
4088		case SO_DEFUNCTOK:
4089			optval = !(so->so_flags & SOF_NODEFUNCT);
4090			goto integer;
4091
4092		case SO_ISDEFUNCT:
4093			optval = (so->so_flags & SOF_DEFUNCT);
4094			goto integer;
4095
4096		case SO_OPPORTUNISTIC:
4097			optval = so_get_opportunistic(so);
4098			goto integer;
4099
4100		case SO_FLUSH:
4101			/* This option is not gettable */
4102			error = EINVAL;
4103			break;
4104
4105		case SO_RECV_ANYIF:
4106			optval = so_get_recv_anyif(so);
4107			goto integer;
4108
4109		case SO_TRAFFIC_MGT_BACKGROUND:
4110			/* This option is handled by lower layer(s) */
4111			if (so->so_proto != NULL &&
4112			    so->so_proto->pr_ctloutput != NULL) {
4113				(void) so->so_proto->pr_ctloutput(so, sopt);
4114			}
4115			break;
4116
4117#if FLOW_DIVERT
4118		case SO_FLOW_DIVERT_TOKEN:
4119			error = flow_divert_token_get(so, sopt);
4120			break;
4121#endif	/* FLOW_DIVERT */
4122
4123		default:
4124			error = ENOPROTOOPT;
4125			break;
4126		}
4127	}
4128out:
4129	if (dolock)
4130		socket_unlock(so, 1);
4131	return (error);
4132}
4133
4134/*
 * The size limit on our soopt_getm() differs from that on FreeBSD:
 * we limit the size of options to MCLBYTES.  This will have to change
 * if we ever need to define options that require more space than MCLBYTES.
4138 */
4139int
4140soopt_getm(struct sockopt *sopt, struct mbuf **mp)
4141{
4142	struct mbuf *m, *m_prev;
4143	int sopt_size = sopt->sopt_valsize;
4144	int how;
4145
4146	if (sopt_size <= 0 || sopt_size > MCLBYTES)
4147		return (EMSGSIZE);
4148
4149	how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
4150	MGET(m, how, MT_DATA);
4151	if (m == NULL)
4152		return (ENOBUFS);
4153	if (sopt_size > MLEN) {
4154		MCLGET(m, how);
4155		if ((m->m_flags & M_EXT) == 0) {
4156			m_free(m);
4157			return (ENOBUFS);
4158		}
4159		m->m_len = min(MCLBYTES, sopt_size);
4160	} else {
4161		m->m_len = min(MLEN, sopt_size);
4162	}
4163	sopt_size -= m->m_len;
4164	*mp = m;
4165	m_prev = m;
4166
4167	while (sopt_size > 0) {
4168		MGET(m, how, MT_DATA);
4169		if (m == NULL) {
4170			m_freem(*mp);
4171			return (ENOBUFS);
4172		}
4173		if (sopt_size > MLEN) {
4174			MCLGET(m, how);
4175			if ((m->m_flags & M_EXT) == 0) {
4176				m_freem(*mp);
4177				m_freem(m);
4178				return (ENOBUFS);
4179			}
4180			m->m_len = min(MCLBYTES, sopt_size);
4181		} else {
4182			m->m_len = min(MLEN, sopt_size);
4183		}
4184		sopt_size -= m->m_len;
4185		m_prev->m_next = m;
4186		m_prev = m;
4187	}
4188	return (0);
4189}
4190
4191/* copyin sopt data into mbuf chain */
4192int
4193soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
4194{
4195	struct mbuf *m0 = m;
4196
4197	if (sopt->sopt_val == USER_ADDR_NULL)
4198		return (0);
4199	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
4200		if (sopt->sopt_p != kernproc) {
4201			int error;
4202
4203			error = copyin(sopt->sopt_val, mtod(m, char *),
4204			    m->m_len);
4205			if (error != 0) {
4206				m_freem(m0);
4207				return (error);
4208			}
4209		} else {
4210			bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
4211			    mtod(m, char *), m->m_len);
4212		}
4213		sopt->sopt_valsize -= m->m_len;
4214		sopt->sopt_val += m->m_len;
4215		m = m->m_next;
4216	}
	/* should have been allocated with enough space at ip6_sooptmcopyin() */
4218	if (m != NULL) {
4219		panic("soopt_mcopyin");
4220		/* NOTREACHED */
4221	}
4222	return (0);
4223}
4224
4225/* copyout mbuf chain data into soopt */
4226int
4227soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
4228{
4229	struct mbuf *m0 = m;
4230	size_t valsize = 0;
4231
4232	if (sopt->sopt_val == USER_ADDR_NULL)
4233		return (0);
4234	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
4235		if (sopt->sopt_p != kernproc) {
4236			int error;
4237
4238			error = copyout(mtod(m, char *), sopt->sopt_val,
4239			    m->m_len);
4240			if (error != 0) {
4241				m_freem(m0);
4242				return (error);
4243			}
4244		} else {
4245			bcopy(mtod(m, char *),
4246			    CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
4247		}
4248		sopt->sopt_valsize -= m->m_len;
4249		sopt->sopt_val += m->m_len;
4250		valsize += m->m_len;
4251		m = m->m_next;
4252	}
4253	if (m != NULL) {
		/* user-land should have supplied a large enough soopt buffer */
4255		m_freem(m0);
4256		return (EINVAL);
4257	}
4258	sopt->sopt_valsize = valsize;
4259	return (0);
4260}
4261
4262void
4263sohasoutofband(struct socket *so)
4264{
4265	if (so->so_pgid < 0)
4266		gsignal(-so->so_pgid, SIGURG);
4267	else if (so->so_pgid > 0)
4268		proc_signal(so->so_pgid, SIGURG);
4269	selwakeup(&so->so_rcv.sb_sel);
4270}
4271
4272int
4273sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql)
4274{
4275#pragma unused(cred)
4276	struct proc *p = current_proc();
4277	int revents = 0;
4278
4279	socket_lock(so, 1);
4280	so_update_last_owner_locked(so, PROC_NULL);
4281	so_update_policy(so);
4282
4283	if (events & (POLLIN | POLLRDNORM))
4284		if (soreadable(so))
4285			revents |= events & (POLLIN | POLLRDNORM);
4286
4287	if (events & (POLLOUT | POLLWRNORM))
4288		if (sowriteable(so))
4289			revents |= events & (POLLOUT | POLLWRNORM);
4290
4291	if (events & (POLLPRI | POLLRDBAND))
4292		if (so->so_oobmark || (so->so_state & SS_RCVATMARK))
4293			revents |= events & (POLLPRI | POLLRDBAND);
4294
4295	if (revents == 0) {
4296		if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
4297			/*
4298			 * Darwin sets the flag first,
4299			 * BSD calls selrecord first
4300			 */
4301			so->so_rcv.sb_flags |= SB_SEL;
4302			selrecord(p, &so->so_rcv.sb_sel, wql);
4303		}
4304
4305		if (events & (POLLOUT | POLLWRNORM)) {
4306			/*
4307			 * Darwin sets the flag first,
4308			 * BSD calls selrecord first
4309			 */
4310			so->so_snd.sb_flags |= SB_SEL;
4311			selrecord(p, &so->so_snd.sb_sel, wql);
4312		}
4313	}
4314
4315	socket_unlock(so, 1);
4316	return (revents);
4317}
4318
4319int
4320soo_kqfilter(struct fileproc *fp, struct knote *kn, vfs_context_t ctx)
4321{
4322#pragma unused(fp)
4323#if !CONFIG_MACF_SOCKET
4324#pragma unused(ctx)
#endif /* !CONFIG_MACF_SOCKET */
4326	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
4327	struct klist *skl;
4328
4329	socket_lock(so, 1);
4330	so_update_last_owner_locked(so, PROC_NULL);
4331	so_update_policy(so);
4332
4333#if CONFIG_MACF_SOCKET
4334	if (mac_socket_check_kqfilter(proc_ucred(vfs_context_proc(ctx)),
4335	    kn, so) != 0) {
4336		socket_unlock(so, 1);
4337		return (1);
4338	}
#endif /* CONFIG_MACF_SOCKET */
4340
4341	switch (kn->kn_filter) {
4342	case EVFILT_READ:
4343		kn->kn_fop = &soread_filtops;
4344		skl = &so->so_rcv.sb_sel.si_note;
4345		break;
4346	case EVFILT_WRITE:
4347		kn->kn_fop = &sowrite_filtops;
4348		skl = &so->so_snd.sb_sel.si_note;
4349		break;
4350	case EVFILT_SOCK:
4351		kn->kn_fop = &sock_filtops;
4352		skl = &so->so_klist;
4353		break;
4354	default:
4355		socket_unlock(so, 1);
4356		return (1);
4357	}
4358
4359	if (KNOTE_ATTACH(skl, kn)) {
4360		switch (kn->kn_filter) {
4361		case EVFILT_READ:
4362			so->so_rcv.sb_flags |= SB_KNOTE;
4363			break;
4364		case EVFILT_WRITE:
4365			so->so_snd.sb_flags |= SB_KNOTE;
4366			break;
4367		case EVFILT_SOCK:
4368			so->so_flags |= SOF_KNOTE;
4369			break;
4370		default:
4371			socket_unlock(so, 1);
4372			return (1);
4373		}
4374	}
4375	socket_unlock(so, 1);
4376	return (0);
4377}
4378
4379static void
4380filt_sordetach(struct knote *kn)
4381{
4382	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
4383
4384	socket_lock(so, 1);
4385	if (so->so_rcv.sb_flags & SB_KNOTE)
4386		if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn))
4387			so->so_rcv.sb_flags &= ~SB_KNOTE;
4388	socket_unlock(so, 1);
4389}
4390
4391/*ARGSUSED*/
4392static int
4393filt_soread(struct knote *kn, long hint)
4394{
4395	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
4396
4397	if ((hint & SO_FILT_HINT_LOCKED) == 0)
4398		socket_lock(so, 1);
4399
4400	if (so->so_options & SO_ACCEPTCONN) {
		int is_not_empty;
4402
4403		/*
		 * Radar 6615193: handle the listen case dynamically for
		 * the kqueue read filter.  This allows listen() to be
		 * called after the kqueue EVFILT_READ filter has been
		 * registered.
4407		 */
4408
4409		kn->kn_data = so->so_qlen;
		is_not_empty = !TAILQ_EMPTY(&so->so_comp);
4411
4412		if ((hint & SO_FILT_HINT_LOCKED) == 0)
4413			socket_unlock(so, 1);
4414
		return (is_not_empty);
4416	}
4417
4418	/* socket isn't a listener */
4419
4420	kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
4421
4422	if (so->so_oobmark) {
4423		if (kn->kn_flags & EV_OOBAND) {
4424			kn->kn_data -= so->so_oobmark;
4425			if ((hint & SO_FILT_HINT_LOCKED) == 0)
4426				socket_unlock(so, 1);
4427			return (1);
4428		}
4429		kn->kn_data = so->so_oobmark;
4430		kn->kn_flags |= EV_OOBAND;
4431	} else {
4432		if (so->so_state & SS_CANTRCVMORE) {
4433			kn->kn_flags |= EV_EOF;
4434			kn->kn_fflags = so->so_error;
4435			if ((hint & SO_FILT_HINT_LOCKED) == 0)
4436				socket_unlock(so, 1);
4437			return (1);
4438		}
4439	}
4440
4441	if (so->so_state & SS_RCVATMARK) {
4442		if (kn->kn_flags & EV_OOBAND) {
4443			if ((hint & SO_FILT_HINT_LOCKED) == 0)
4444				socket_unlock(so, 1);
4445			return (1);
4446		}
4447		kn->kn_flags |= EV_OOBAND;
4448	} else if (kn->kn_flags & EV_OOBAND) {
4449		kn->kn_data = 0;
4450		if ((hint & SO_FILT_HINT_LOCKED) == 0)
4451			socket_unlock(so, 1);
4452		return (0);
4453	}
4454
4455	if (so->so_error) {	/* temporary udp error */
4456		if ((hint & SO_FILT_HINT_LOCKED) == 0)
4457			socket_unlock(so, 1);
4458		return (1);
4459	}
4460
4461	int64_t	lowwat = so->so_rcv.sb_lowat;
4462	if (kn->kn_sfflags & NOTE_LOWAT) {
4463		if (kn->kn_sdata > so->so_rcv.sb_hiwat)
4464			lowwat = so->so_rcv.sb_hiwat;
4465		else if (kn->kn_sdata > lowwat)
4466			lowwat = kn->kn_sdata;
4467	}
4468
4469	if ((hint & SO_FILT_HINT_LOCKED) == 0)
4470		socket_unlock(so, 1);
4471
4472	return ((kn->kn_flags & EV_OOBAND) || kn->kn_data >= lowwat);
4473}
4474
4475static void
4476filt_sowdetach(struct knote *kn)
4477{
4478	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
4479	socket_lock(so, 1);
4480
4481	if (so->so_snd.sb_flags & SB_KNOTE)
4482		if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn))
4483			so->so_snd.sb_flags &= ~SB_KNOTE;
4484	socket_unlock(so, 1);
4485}
4486
4487int
4488so_wait_for_if_feedback(struct socket *so)
4489{
4490	if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
4491	    (so->so_state & SS_ISCONNECTED)) {
4492		struct inpcb *inp = sotoinpcb(so);
4493		if (INP_WAIT_FOR_IF_FEEDBACK(inp))
4494			return (1);
4495	}
4496	return (0);
4497}
4498
4499/*ARGSUSED*/
4500static int
4501filt_sowrite(struct knote *kn, long hint)
4502{
4503	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
4504	int ret = 0;
4505
4506	if ((hint & SO_FILT_HINT_LOCKED) == 0)
4507		socket_lock(so, 1);
4508
4509	kn->kn_data = sbspace(&so->so_snd);
4510	if (so->so_state & SS_CANTSENDMORE) {
4511		kn->kn_flags |= EV_EOF;
4512		kn->kn_fflags = so->so_error;
4513		ret = 1;
4514		goto out;
4515	}
4516	if (so->so_error) {	/* temporary udp error */
4517		ret = 1;
4518		goto out;
4519	}
4520	if (((so->so_state & SS_ISCONNECTED) == 0) &&
4521	    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
4522		ret = 0;
4523		goto out;
4524	}
4525	int64_t	lowwat = so->so_snd.sb_lowat;
4526	if (kn->kn_sfflags & NOTE_LOWAT) {
4527		if (kn->kn_sdata > so->so_snd.sb_hiwat)
4528			lowwat = so->so_snd.sb_hiwat;
4529		else if (kn->kn_sdata > lowwat)
4530			lowwat = kn->kn_sdata;
4531	}
4532	if (kn->kn_data >= lowwat) {
4533		if ((so->so_flags & SOF_NOTSENT_LOWAT) != 0) {
4534			ret = tcp_notsent_lowat_check(so);
4535		} else {
4536			ret = 1;
4537		}
4538	}
4539	if (so_wait_for_if_feedback(so))
4540		ret = 0;
4541out:
4542	if ((hint & SO_FILT_HINT_LOCKED) == 0)
4543		socket_unlock(so, 1);
4544	return (ret);
4545}
4546
4547static void
4548filt_sockdetach(struct knote *kn)
4549{
4550	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
4551	socket_lock(so, 1);
4552
4553	if ((so->so_flags & SOF_KNOTE) != 0)
4554		if (KNOTE_DETACH(&so->so_klist, kn))
4555			so->so_flags &= ~SOF_KNOTE;
4556	socket_unlock(so, 1);
4557}
4558
4559static int
4560filt_sockev(struct knote *kn, long hint)
4561{
4562	int ret = 0, locked = 0;
4563	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
4564	long ev_hint = (hint & SO_FILT_HINT_EV);
4565
4566	if ((hint & SO_FILT_HINT_LOCKED) == 0) {
4567		socket_lock(so, 1);
4568		locked = 1;
4569	}
4570
4571	if (ev_hint & SO_FILT_HINT_CONNRESET) {
4572		if (kn->kn_sfflags & NOTE_CONNRESET)
4573			kn->kn_fflags |= NOTE_CONNRESET;
4574	}
4575	if (ev_hint & SO_FILT_HINT_TIMEOUT) {
4576		if (kn->kn_sfflags & NOTE_TIMEOUT)
4577			kn->kn_fflags |= NOTE_TIMEOUT;
4578	}
4579	if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
4580		if (kn->kn_sfflags & NOTE_NOSRCADDR)
4581			kn->kn_fflags |= NOTE_NOSRCADDR;
4582	}
4583	if (ev_hint & SO_FILT_HINT_IFDENIED) {
4584		if ((kn->kn_sfflags & NOTE_IFDENIED))
4585			kn->kn_fflags |= NOTE_IFDENIED;
4586	}
4587	if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
4588		if (kn->kn_sfflags & NOTE_KEEPALIVE)
4589			kn->kn_fflags |= NOTE_KEEPALIVE;
4590	}
4591	if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
4592		if (kn->kn_sfflags & NOTE_ADAPTIVE_WTIMO)
4593			kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
4594	}
4595	if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
4596		if (kn->kn_sfflags & NOTE_ADAPTIVE_RTIMO)
4597			kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
4598	}
4599	if (ev_hint & SO_FILT_HINT_CONNECTED) {
4600		if (kn->kn_sfflags & NOTE_CONNECTED)
4601			kn->kn_fflags |= NOTE_CONNECTED;
4602	}
4603	if (ev_hint & SO_FILT_HINT_DISCONNECTED) {
4604		if (kn->kn_sfflags & NOTE_DISCONNECTED)
4605			kn->kn_fflags |= NOTE_DISCONNECTED;
4606	}
4607	if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
4608		if (so->so_proto != NULL &&
4609		    (so->so_proto->pr_flags & PR_EVCONNINFO) &&
4610		    (kn->kn_sfflags & NOTE_CONNINFO_UPDATED))
4611			kn->kn_fflags |= NOTE_CONNINFO_UPDATED;
4612	}
4613
4614	if ((kn->kn_sfflags & NOTE_READCLOSED) &&
4615	    (so->so_state & SS_CANTRCVMORE))
4616		kn->kn_fflags |= NOTE_READCLOSED;
4617
4618	if ((kn->kn_sfflags & NOTE_WRITECLOSED) &&
4619	    (so->so_state & SS_CANTSENDMORE))
4620		kn->kn_fflags |= NOTE_WRITECLOSED;
4621
4622	if ((kn->kn_sfflags & NOTE_SUSPEND) &&
4623	    ((ev_hint & SO_FILT_HINT_SUSPEND) ||
4624	    (so->so_flags & SOF_SUSPENDED))) {
4625		kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
4626		kn->kn_fflags |= NOTE_SUSPEND;
4627	}
4628
4629	if ((kn->kn_sfflags & NOTE_RESUME) &&
4630	    ((ev_hint & SO_FILT_HINT_RESUME) ||
4631	    (so->so_flags & SOF_SUSPENDED) == 0)) {
4632		kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
4633		kn->kn_fflags |= NOTE_RESUME;
4634	}
4635
4636	if (so->so_error != 0) {
4637		ret = 1;
4638		kn->kn_data = so->so_error;
4639		kn->kn_flags |= EV_EOF;
4640	} else {
4641		get_sockev_state(so, (u_int32_t *)&(kn->kn_data));
4642	}
4643
4644	if (kn->kn_fflags != 0)
4645		ret = 1;
4646
4647	if (locked)
4648		socket_unlock(so, 1);
4649
4650	return (ret);
4651}
4652
4653void
4654get_sockev_state(struct socket *so, u_int32_t *statep)
4655{
4656	u_int32_t state = *(statep);
4657
4658	if (so->so_state & SS_ISCONNECTED)
4659		state |= SOCKEV_CONNECTED;
4660	else
4661		state &= ~(SOCKEV_CONNECTED);
4662	state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0);
4663	*(statep) = state;
4664}
4665
4666#define	SO_LOCK_HISTORY_STR_LEN \
4667	(2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)
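
/*
 * Sizing note for the macro above: the history records 2 * SO_LCKDBG_MAX
 * addresses in all (one lock and one unlock site per slot), each printed
 * by "%p" as "0x" followed by at most 2 * sizeof (void *) hex digits,
 * plus a one-character separator (':' or ' ') per address, with one final
 * byte for the terminating NUL.
 */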
4668
4669__private_extern__ const char *
4670solockhistory_nr(struct socket *so)
4671{
4672	size_t n = 0;
4673	int i;
4674	static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];
4675
4676	bzero(lock_history_str, sizeof (lock_history_str));
4677	for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
4678		n += snprintf(lock_history_str + n,
4679		    SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
4680		    so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
4681		    so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
4682	}
4683	return (lock_history_str);
4684}
4685
4686int
4687socket_lock(struct socket *so, int refcount)
4688{
4689	int error = 0;
4690	void *lr_saved;
4691
4692	lr_saved = __builtin_return_address(0);
4693
4694	if (so->so_proto->pr_lock) {
4695		error = (*so->so_proto->pr_lock)(so, refcount, lr_saved);
4696	} else {
4697#ifdef MORE_LOCKING_DEBUG
4698		lck_mtx_assert(so->so_proto->pr_domain->dom_mtx,
4699		    LCK_MTX_ASSERT_NOTOWNED);
4700#endif
4701		lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
4702		if (refcount)
4703			so->so_usecount++;
4704		so->lock_lr[so->next_lock_lr] = lr_saved;
4705		so->next_lock_lr = (so->next_lock_lr+1) % SO_LCKDBG_MAX;
4706	}
4707
4708	return (error);
4709}
4710
4711int
4712socket_unlock(struct socket *so, int refcount)
4713{
4714	int error = 0;
4715	void *lr_saved;
4716	lck_mtx_t *mutex_held;
4717
4718	lr_saved = __builtin_return_address(0);
4719
4720	if (so->so_proto == NULL) {
4721		panic("%s: null so_proto so=%p\n", __func__, so);
4722		/* NOTREACHED */
4723	}
4724
4725	if (so && so->so_proto->pr_unlock) {
4726		error = (*so->so_proto->pr_unlock)(so, refcount, lr_saved);
4727	} else {
4728		mutex_held = so->so_proto->pr_domain->dom_mtx;
4729#ifdef MORE_LOCKING_DEBUG
4730		lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
4731#endif
4732		so->unlock_lr[so->next_unlock_lr] = lr_saved;
4733		so->next_unlock_lr = (so->next_unlock_lr+1) % SO_LCKDBG_MAX;
4734
4735		if (refcount) {
4736			if (so->so_usecount <= 0) {
4737				panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
4738				    "lrh=%s", __func__, so->so_usecount, so,
4739				    SOCK_DOM(so), so->so_type,
4740				    SOCK_PROTO(so), solockhistory_nr(so));
4741				/* NOTREACHED */
4742			}
4743
4744			so->so_usecount--;
4745			if (so->so_usecount == 0)
4746				sofreelastref(so, 1);
4747		}
4748		lck_mtx_unlock(mutex_held);
4749	}
4750
4751	return (error);
4752}
4753
4754/* Called with socket locked, will unlock socket */
4755void
4756sofree(struct socket *so)
4757{
4758	lck_mtx_t *mutex_held;
4759
4760	if (so->so_proto->pr_getlock != NULL)
4761		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
4762	else
4763		mutex_held = so->so_proto->pr_domain->dom_mtx;
4764	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
4765
4766	sofreelastref(so, 0);
4767}
4768
4769void
4770soreference(struct socket *so)
4771{
	socket_lock(so, 1);	/* lock and take one reference on the socket */
	socket_unlock(so, 0);	/* unlock only, keep the reference */
4774}
4775
4776void
4777sodereference(struct socket *so)
4778{
4779	socket_lock(so, 0);
4780	socket_unlock(so, 1);
4781}
4782
4783/*
4784 * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
4785 * possibility of using jumbo clusters.  Caller must ensure to hold
4786 * the socket lock.
4787 */
4788void
4789somultipages(struct socket *so, boolean_t set)
4790{
4791	if (set)
4792		so->so_flags |= SOF_MULTIPAGES;
4793	else
4794		so->so_flags &= ~SOF_MULTIPAGES;
4795}
4796
4797int
4798so_isdstlocal(struct socket *so) {
4799
4800	struct inpcb *inp = (struct inpcb *)so->so_pcb;
4801
4802	if (SOCK_DOM(so) == PF_INET)
4803		return (inaddr_local(inp->inp_faddr));
4804	else if (SOCK_DOM(so) == PF_INET6)
4805		return (in6addr_local(&inp->in6p_faddr));
4806
4807	return (0);
4808}
4809
4810int
4811sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
4812{
4813	struct sockbuf *rcv, *snd;
4814	int err = 0, defunct;
4815
4816	rcv = &so->so_rcv;
4817	snd = &so->so_snd;
4818
4819	defunct = (so->so_flags & SOF_DEFUNCT);
4820	if (defunct) {
4821		if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) {
4822			panic("%s: SB_DROP not set", __func__);
4823			/* NOTREACHED */
4824		}
4825		goto done;
4826	}
4827
4828	if (so->so_flags & SOF_NODEFUNCT) {
4829		if (noforce) {
4830			err = EOPNOTSUPP;
4831			SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) "
4832			    "so 0x%llx [%d,%d] is not eligible for defunct "
4833			    "(%d)\n", __func__, proc_selfpid(), proc_pid(p),
4834			    level, (uint64_t)VM_KERNEL_ADDRPERM(so),
4835			    SOCK_DOM(so), SOCK_TYPE(so), err));
4836			return (err);
4837		}
4838		so->so_flags &= ~SOF_NODEFUNCT;
4839		SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx "
4840		    "[%d,%d] defunct by force\n", __func__, proc_selfpid(),
4841		    proc_pid(p), level, (uint64_t)VM_KERNEL_ADDRPERM(so),
4842		    SOCK_DOM(so), SOCK_TYPE(so)));
4843	}
4844
4845	so->so_flags |= SOF_DEFUNCT;
4846
4847	/* Prevent further data from being appended to the socket buffers */
4848	snd->sb_flags |= SB_DROP;
4849	rcv->sb_flags |= SB_DROP;
4850
4851	/* Flush any existing data in the socket buffers */
4852	if (rcv->sb_cc != 0) {
4853		rcv->sb_flags &= ~SB_SEL;
4854		selthreadclear(&rcv->sb_sel);
4855		sbrelease(rcv);
4856	}
4857	if (snd->sb_cc != 0) {
4858		snd->sb_flags &= ~SB_SEL;
4859		selthreadclear(&snd->sb_sel);
4860		sbrelease(snd);
4861	}
4862
4863done:
4864	SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx [%d,%d] %s "
4865	    "defunct\n", __func__, proc_selfpid(), proc_pid(p), level,
4866	    (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so), SOCK_TYPE(so),
4867	    defunct ? "is already" : "marked as"));
4868
4869	return (err);
4870}
4871
4872int
4873sodefunct(struct proc *p, struct socket *so, int level)
4874{
4875	struct sockbuf *rcv, *snd;
4876
4877	if (!(so->so_flags & SOF_DEFUNCT)) {
4878		panic("%s improperly called", __func__);
4879		/* NOTREACHED */
4880	}
4881	if (so->so_state & SS_DEFUNCT)
4882		goto done;
4883
4884	rcv = &so->so_rcv;
4885	snd = &so->so_snd;
4886
4887	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
4888		char s[MAX_IPv6_STR_LEN];
4889		char d[MAX_IPv6_STR_LEN];
4890		struct inpcb *inp = sotoinpcb(so);
4891
4892		SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx [%s "
4893		    "%s:%d -> %s:%d] is now defunct [rcv_si 0x%x, snd_si 0x%x, "
4894		    "rcv_fl 0x%x, snd_fl 0x%x]\n", __func__, proc_selfpid(),
4895		    proc_pid(p), level, (uint64_t)VM_KERNEL_ADDRPERM(so),
4896		    (SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
4897		    inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
4898		    (void *)&inp->inp_laddr.s_addr : (void *)&inp->in6p_laddr),
4899		    s, sizeof (s)), ntohs(inp->in6p_lport),
4900		    inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
4901		    (void *)&inp->inp_faddr.s_addr : (void *)&inp->in6p_faddr,
4902		    d, sizeof (d)), ntohs(inp->in6p_fport),
4903		    (uint32_t)rcv->sb_sel.si_flags,
4904		    (uint32_t)snd->sb_sel.si_flags,
4905		    rcv->sb_flags, snd->sb_flags));
4906	} else {
4907		SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx "
4908		    "[%d,%d] is now defunct [rcv_si 0x%x, snd_si 0x%x, "
4909		    "rcv_fl 0x%x, snd_fl 0x%x]\n", __func__, proc_selfpid(),
4910		    proc_pid(p), level, (uint64_t)VM_KERNEL_ADDRPERM(so),
4911		    SOCK_DOM(so), SOCK_TYPE(so), (uint32_t)rcv->sb_sel.si_flags,
4912		    (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
4913		    snd->sb_flags));
4914	}
4915
4916	/*
4917	 * Unwedge threads blocked on sbwait() and sb_lock().
4918	 */
4919	sbwakeup(rcv);
4920	sbwakeup(snd);
4921
4922	if (rcv->sb_flags & SB_LOCK)
4923		sbunlock(rcv, TRUE);	/* keep socket locked */
4924	if (snd->sb_flags & SB_LOCK)
4925		sbunlock(snd, TRUE);	/* keep socket locked */
4926
4927	/*
4928	 * Flush the buffers and disconnect.  We explicitly call shutdown
4929	 * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
4930	 * states are set for the socket.  This would also flush out data
4931	 * hanging off the receive list of this socket.
4932	 */
4933	(void) soshutdownlock(so, SHUT_RD);
4934	(void) soshutdownlock(so, SHUT_WR);
4935	(void) sodisconnectlocked(so);
4936
4937	/*
4938	 * Explicitly handle connectionless-protocol disconnection
4939	 * and release any remaining data in the socket buffers.
4940	 */
	if (!(so->so_state & SS_ISDISCONNECTED))
4942		(void) soisdisconnected(so);
4943
4944	if (so->so_error == 0)
4945		so->so_error = EBADF;
4946
4947	if (rcv->sb_cc != 0) {
4948		rcv->sb_flags &= ~SB_SEL;
4949		selthreadclear(&rcv->sb_sel);
4950		sbrelease(rcv);
4951	}
4952	if (snd->sb_cc != 0) {
4953		snd->sb_flags &= ~SB_SEL;
4954		selthreadclear(&snd->sb_sel);
4955		sbrelease(snd);
4956	}
4957	so->so_state |= SS_DEFUNCT;
4958
4959done:
4960	return (0);
4961}
4962
4963__private_extern__ int
4964so_set_recv_anyif(struct socket *so, int optval)
4965{
4966	int ret = 0;
4967
4968#if INET6
4969	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
4970#else
4971	if (SOCK_DOM(so) == PF_INET) {
4972#endif /* !INET6 */
4973		if (optval)
4974			sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
4975		else
4976			sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
4977	}
4978
4979	return (ret);
4980}
4981
4982__private_extern__ int
4983so_get_recv_anyif(struct socket *so)
4984{
4985	int ret = 0;
4986
4987#if INET6
4988	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
4989#else
4990	if (SOCK_DOM(so) == PF_INET) {
4991#endif /* !INET6 */
4992		ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
4993	}
4994
4995	return (ret);
4996}
4997
4998int
4999so_set_restrictions(struct socket *so, uint32_t vals)
5000{
5001	int nocell_old, nocell_new;
5002	int ret = 0;
5003
5004	/*
5005	 * Deny-type restrictions are trapdoors; once set they cannot be
5006	 * unset for the lifetime of the socket.  This allows them to be
5007	 * issued by a framework on behalf of the application without
5008	 * having to worry that they can be undone.
5009	 *
	 * Note here that socket-level restrictions override any protocol
	 * level restrictions.  For instance, an SO_RESTRICT_DENY_CELLULAR
	 * socket restriction issued on the socket has a higher precedence
5013	 * than INP_NO_IFT_CELLULAR.  The latter is affected by the UUID
5014	 * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
5015	 * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
5016	 */
5017	nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
5018	so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
5019	    SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR));
5020	nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
5021
5022	/* other than deny cellular, there's nothing more to do */
5023	if ((nocell_new - nocell_old) == 0)
5024		return (ret);
5025
5026	/* we can only set, not clear restrictions */
5027	VERIFY((nocell_new - nocell_old) > 0);
5028
5029#if INET6
5030	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
5031#else
5032	if (SOCK_DOM(so) == PF_INET) {
5033#endif /* !INET6 */
5034		/* if deny cellular is now set, do what's needed for INPCB */
5035		inp_set_nocellular(sotoinpcb(so));
5036	}
5037
5038	return (ret);
5039}
5040
5041uint32_t
5042so_get_restrictions(struct socket *so)
5043{
5044	return (so->so_restrictions & (SO_RESTRICT_DENY_IN |
5045	    SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR));
5046}
5047
5048struct sockaddr_entry *
5049sockaddrentry_alloc(int how)
5050{
5051	struct sockaddr_entry *se;
5052
5053	se = (how == M_WAITOK) ? zalloc(se_zone) : zalloc_noblock(se_zone);
5054	if (se != NULL)
5055		bzero(se, se_zone_size);
5056
5057	return (se);
5058}
5059
5060void
5061sockaddrentry_free(struct sockaddr_entry *se)
5062{
5063	if (se->se_addr != NULL) {
5064		FREE(se->se_addr, M_SONAME);
5065		se->se_addr = NULL;
5066	}
5067	zfree(se_zone, se);
5068}
5069
5070struct sockaddr_entry *
5071sockaddrentry_dup(const struct sockaddr_entry *src_se, int how)
5072{
5073	struct sockaddr_entry *dst_se;
5074
5075	dst_se = sockaddrentry_alloc(how);
5076	if (dst_se != NULL) {
5077		int len = src_se->se_addr->sa_len;
5078
5079		MALLOC(dst_se->se_addr, struct sockaddr *,
5080		    len, M_SONAME, how | M_ZERO);
5081		if (dst_se->se_addr != NULL) {
5082			bcopy(src_se->se_addr, dst_se->se_addr, len);
5083		} else {
5084			sockaddrentry_free(dst_se);
5085			dst_se = NULL;
5086		}
5087	}
5088
5089	return (dst_se);
5090}
5091
5092struct sockaddr_list *
5093sockaddrlist_alloc(int how)
5094{
5095	struct sockaddr_list *sl;
5096
5097	sl = (how == M_WAITOK) ? zalloc(sl_zone) : zalloc_noblock(sl_zone);
5098	if (sl != NULL) {
5099		bzero(sl, sl_zone_size);
5100		TAILQ_INIT(&sl->sl_head);
5101	}
5102	return (sl);
5103}
5104
5105void
5106sockaddrlist_free(struct sockaddr_list *sl)
5107{
5108	struct sockaddr_entry *se, *tse;
5109
5110	TAILQ_FOREACH_SAFE(se, &sl->sl_head, se_link, tse) {
5111		sockaddrlist_remove(sl, se);
5112		sockaddrentry_free(se);
5113	}
5114	VERIFY(sl->sl_cnt == 0 && TAILQ_EMPTY(&sl->sl_head));
5115	zfree(sl_zone, sl);
5116}
5117
5118void
5119sockaddrlist_insert(struct sockaddr_list *sl, struct sockaddr_entry *se)
5120{
5121	VERIFY(!(se->se_flags & SEF_ATTACHED));
5122	se->se_flags |= SEF_ATTACHED;
5123	TAILQ_INSERT_TAIL(&sl->sl_head, se, se_link);
5124	sl->sl_cnt++;
5125	VERIFY(sl->sl_cnt != 0);
5126}
5127
5128void
5129sockaddrlist_remove(struct sockaddr_list *sl, struct sockaddr_entry *se)
5130{
5131	VERIFY(se->se_flags & SEF_ATTACHED);
5132	se->se_flags &= ~SEF_ATTACHED;
5133	VERIFY(sl->sl_cnt != 0);
5134	sl->sl_cnt--;
5135	TAILQ_REMOVE(&sl->sl_head, se, se_link);
5136}
5137
5138struct sockaddr_list *
5139sockaddrlist_dup(const struct sockaddr_list *src_sl, int how)
5140{
5141	struct sockaddr_entry *src_se, *tse;
5142	struct sockaddr_list *dst_sl;
5143
5144	dst_sl = sockaddrlist_alloc(how);
5145	if (dst_sl == NULL)
5146		return (NULL);
5147
5148	TAILQ_FOREACH_SAFE(src_se, &src_sl->sl_head, se_link, tse) {
5149		struct sockaddr_entry *dst_se;
5150
5151		if (src_se->se_addr == NULL)
5152			continue;
5153
5154		dst_se = sockaddrentry_dup(src_se, how);
5155		if (dst_se == NULL) {
5156			sockaddrlist_free(dst_sl);
5157			return (NULL);
5158		}
5159
5160		sockaddrlist_insert(dst_sl, dst_se);
5161	}
5162	VERIFY(src_sl->sl_cnt == dst_sl->sl_cnt);
5163
5164	return (dst_sl);
5165}
5166
5167int
5168so_set_effective_pid(struct socket *so, int epid, struct proc *p)
5169{
5170	struct proc *ep = PROC_NULL;
5171	int error = 0;
5172
5173	/* pid 0 is reserved for kernel */
5174	if (epid == 0) {
5175		error = EINVAL;
5176		goto done;
5177	}
5178
5179	/*
5180	 * If this is an in-kernel socket, prevent its delegate
5181	 * association from changing unless the socket option is
5182	 * coming from within the kernel itself.
5183	 */
5184	if (so->last_pid == 0 && p != kernproc) {
5185		error = EACCES;
5186		goto done;
5187	}
5188
5189	/*
5190	 * If this is issued by a process that's recorded as the
5191	 * real owner of the socket, or if the pid is the same as
5192	 * the process's own pid, then proceed.  Otherwise ensure
5193	 * that the issuing process has the necessary privileges.
5194	 */
5195	if (epid != so->last_pid || epid != proc_pid(p)) {
5196		if ((error = priv_check_cred(kauth_cred_get(),
5197		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
5198			error = EACCES;
5199			goto done;
5200		}
5201	}
5202
5203	/* Find the process that corresponds to the effective pid */
5204	if ((ep = proc_find(epid)) == PROC_NULL) {
5205		error = ESRCH;
5206		goto done;
5207	}
5208
5209	/*
5210	 * If a process tries to delegate the socket to itself, then
5211	 * there's really nothing to do; treat it as a way for the
5212	 * delegate association to be cleared.  Note that we check
5213	 * the passed-in proc rather than calling proc_selfpid(),
5214	 * as we need to check the process issuing the socket option
5215	 * which could be kernproc.  Given that we don't allow 0 for
5216	 * effective pid, it means that a delegated in-kernel socket
5217	 * stays delegated during its lifetime (which is probably OK.)
5218	 */
5219	if (epid == proc_pid(p)) {
5220		so->so_flags &= ~SOF_DELEGATED;
5221		so->e_upid = 0;
5222		so->e_pid = 0;
5223		uuid_clear(so->e_uuid);
5224	} else {
5225		so->so_flags |= SOF_DELEGATED;
5226		so->e_upid = proc_uniqueid(ep);
5227		so->e_pid = proc_pid(ep);
5228		proc_getexecutableuuid(ep, so->e_uuid, sizeof (so->e_uuid));
5229	}
5230
5231done:
5232	if (error == 0 && net_io_policy_log) {
5233		uuid_string_t buf;
5234
5235		uuid_unparse(so->e_uuid, buf);
5236		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
5237		    "euuid %s%s\n", __func__, proc_name_address(p),
5238		    proc_pid(p), (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so),
5239		    SOCK_TYPE(so), so->e_pid, proc_name_address(ep), buf,
5240		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
5241	} else if (error != 0 && net_io_policy_log) {
5242		log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
5243		    "ERROR (%d)\n", __func__, proc_name_address(p),
5244		    proc_pid(p), (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so),
5245		    SOCK_TYPE(so), epid, (ep == PROC_NULL) ? "PROC_NULL" :
5246		    proc_name_address(ep), error);
5247	}
5248
5249	if (ep != PROC_NULL)
5250		proc_rele(ep);
5251
5252	return (error);
5253}
5254
5255int
5256so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p)
5257{
5258	uuid_string_t buf;
5259	uuid_t uuid;
5260	int error = 0;
5261
5262	/* UUID must not be all-zeroes (reserved for kernel) */
5263	if (uuid_is_null(euuid)) {
5264		error = EINVAL;
		goto done;
5266	}
5267
5268	/*
5269	 * If this is an in-kernel socket, prevent its delegate
5270	 * association from changing unless the socket option is
5271	 * coming from within the kernel itself.
5272	 */
5273	if (so->last_pid == 0 && p != kernproc) {
5274		error = EACCES;
5275		goto done;
5276	}
5277
5278	/* Get the UUID of the issuing process */
5279	proc_getexecutableuuid(p, uuid, sizeof (uuid));
5280
5281	/*
5282	 * If this is issued by a process that's recorded as the
5283	 * real owner of the socket, or if the uuid is the same as
5284	 * the process's own uuid, then proceed.  Otherwise ensure
5285	 * that the issuing process has the necessary privileges.
5286	 */
5287	if (uuid_compare(euuid, so->last_uuid) != 0 ||
5288	    uuid_compare(euuid, uuid) != 0) {
5289		if ((error = priv_check_cred(kauth_cred_get(),
5290		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
5291			error = EACCES;
5292			goto done;
5293		}
5294	}
5295
5296	/*
5297	 * If a process tries to delegate the socket to itself, then
5298	 * there's really nothing to do; treat it as a way for the
5299	 * delegate association to be cleared.  Note that we check
5300	 * the uuid of the passed-in proc rather than that of the
5301	 * current process, as we need to check the process issuing
5302	 * the socket option which could be kernproc itself.  Given
5303	 * that we don't allow 0 for effective uuid, it means that
5304	 * a delegated in-kernel socket stays delegated during its
5305	 * lifetime (which is okay.)
5306	 */
5307	if (uuid_compare(euuid, uuid) == 0) {
5308		so->so_flags &= ~SOF_DELEGATED;
5309		so->e_upid = 0;
5310		so->e_pid = 0;
5311		uuid_clear(so->e_uuid);
5312	} else {
5313		so->so_flags |= SOF_DELEGATED;
5314		/*
5315		 * Unlike so_set_effective_pid(), we only have the UUID
5316		 * here and the process ID is not known.  Inherit the
5317		 * real {pid,upid} of the socket.
5318		 */
5319		so->e_upid = so->last_upid;
5320		so->e_pid = so->last_pid;
5321		uuid_copy(so->e_uuid, euuid);
5322	}
5323
5324done:
5325	if (error == 0 && net_io_policy_log) {
5326		uuid_unparse(so->e_uuid, buf);
5327		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
5328		    "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
5329		    (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so),
5330		    SOCK_TYPE(so), so->e_pid, buf,
5331		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
5332	} else if (error != 0 && net_io_policy_log) {
5333		uuid_unparse(euuid, buf);
5334		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
5335		    "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
5336		    (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so),
5337		    SOCK_TYPE(so), buf, error);
5338	}
5339
5340	return (error);
5341}
5342
5343void
5344netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
5345    uint32_t ev_datalen)
5346{
5347	struct kev_msg ev_msg;
5348
5349	/*
5350	 * A netpolicy event always starts with a netpolicy_event_data
	 * structure, but the caller can provide a longer event structure
	 * to post, depending on the event code.
5353	 */
5354	VERIFY(ev_data != NULL && ev_datalen >= sizeof (*ev_data));
5355
5356	bzero(&ev_msg, sizeof (ev_msg));
5357	ev_msg.vendor_code	= KEV_VENDOR_APPLE;
5358	ev_msg.kev_class	= KEV_NETWORK_CLASS;
5359	ev_msg.kev_subclass	= KEV_NETPOLICY_SUBCLASS;
5360	ev_msg.event_code	= ev_code;
5361
5362	ev_msg.dv[0].data_ptr	= ev_data;
5363	ev_msg.dv[0].data_length = ev_datalen;
5364
5365	kev_post_msg(&ev_msg);
5366}
5367