1/*
2 * Copyright (c) 1998-2014 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29/*
30 * Copyright (c) 1982, 1986, 1988, 1990, 1993
31 *	The Regents of the University of California.  All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 *    notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 *    notice, this list of conditions and the following disclaimer in the
40 *    documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 *    must display the following acknowledgement:
43 *	This product includes software developed by the University of
44 *	California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 *    may be used to endorse or promote products derived from this software
47 *    without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
62 */
63/*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections.  This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70#include <sys/param.h>
71#include <sys/systm.h>
72#include <sys/filedesc.h>
73#include <sys/proc.h>
74#include <sys/proc_internal.h>
75#include <sys/kauth.h>
76#include <sys/file_internal.h>
77#include <sys/fcntl.h>
78#include <sys/malloc.h>
79#include <sys/mbuf.h>
80#include <sys/domain.h>
81#include <sys/kernel.h>
82#include <sys/event.h>
83#include <sys/poll.h>
84#include <sys/protosw.h>
85#include <sys/socket.h>
86#include <sys/socketvar.h>
87#include <sys/resourcevar.h>
88#include <sys/signalvar.h>
89#include <sys/sysctl.h>
90#include <sys/syslog.h>
91#include <sys/uio.h>
92#include <sys/uio_internal.h>
93#include <sys/ev.h>
94#include <sys/kdebug.h>
95#include <sys/un.h>
96#include <sys/user.h>
97#include <sys/priv.h>
98#include <sys/kern_event.h>
99#include <net/route.h>
100#include <net/init.h>
101#include <net/ntstat.h>
102#include <net/content_filter.h>
103#include <netinet/in.h>
104#include <netinet/in_pcb.h>
105#include <netinet/ip6.h>
106#include <netinet6/ip6_var.h>
107#include <netinet/flow_divert.h>
108#include <kern/zalloc.h>
109#include <kern/locks.h>
110#include <machine/limits.h>
111#include <libkern/OSAtomic.h>
112#include <pexpert/pexpert.h>
113#include <kern/assert.h>
114#include <kern/task.h>
115#include <sys/kpi_mbuf.h>
116#include <sys/mcache.h>
117#include <sys/unpcb.h>
118
119#if CONFIG_MACF
120#include <security/mac.h>
121#include <security/mac_framework.h>
122#endif /* MAC */
123
124#if MULTIPATH
125#include <netinet/mp_pcb.h>
126#include <netinet/mptcp_var.h>
127#endif /* MULTIPATH */
128
129/* TODO: this should be in a header file somewhere */
130extern char *proc_name_address(void *p);
131
132static u_int32_t	so_cache_hw;	/* High water mark for socache */
133static u_int32_t	so_cache_timeouts;	/* number of timeouts */
134static u_int32_t	so_cache_max_freed;	/* max freed per timeout */
135static u_int32_t	cached_sock_count = 0;
136STAILQ_HEAD(, socket)	so_cache_head;
137int	max_cached_sock_count = MAX_CACHED_SOCKETS;
138static u_int32_t	so_cache_time;
139static int		socketinit_done;
140static struct zone	*so_cache_zone;
141
142static lck_grp_t	*so_cache_mtx_grp;
143static lck_attr_t	*so_cache_mtx_attr;
144static lck_grp_attr_t	*so_cache_mtx_grp_attr;
145static lck_mtx_t	*so_cache_mtx;
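/*
 * so_cache_mtx protects the cached-socket free list (so_cache_head) and the
 * counters above.
 */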
146
149static void	filt_sordetach(struct knote *kn);
150static int	filt_soread(struct knote *kn, long hint);
151static void	filt_sowdetach(struct knote *kn);
152static int	filt_sowrite(struct knote *kn, long hint);
153static void	filt_sockdetach(struct knote *kn);
154static int	filt_sockev(struct knote *kn, long hint);
155
156static int sooptcopyin_timeval(struct sockopt *, struct timeval *);
157static int sooptcopyout_timeval(struct sockopt *, const struct timeval *);
158
159static struct filterops soread_filtops = {
160	.f_isfd = 1,
161	.f_detach = filt_sordetach,
162	.f_event = filt_soread,
163};
164
165static struct filterops sowrite_filtops = {
166	.f_isfd = 1,
167	.f_detach = filt_sowdetach,
168	.f_event = filt_sowrite,
169};
170
171static struct filterops sock_filtops = {
172	.f_isfd = 1,
173	.f_detach = filt_sockdetach,
174	.f_event = filt_sockev,
175};
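
/*
 * Filter operations backing the socket kqueue filters (read, write and
 * EVFILT_SOCK socket-event knotes).
 */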
176
177SYSCTL_DECL(_kern_ipc);
178
179#define	EVEN_MORE_LOCKING_DEBUG 0
180
181int socket_debug = 0;
182SYSCTL_INT(_kern_ipc, OID_AUTO, socket_debug,
183	CTLFLAG_RW | CTLFLAG_LOCKED, &socket_debug, 0, "");
184
185static int socket_zone = M_SOCKET;
186so_gen_t	so_gencnt;	/* generation count for sockets */
187
188MALLOC_DEFINE(M_SONAME, "soname", "socket name");
189MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
190
191#define	DBG_LAYER_IN_BEG	NETDBG_CODE(DBG_NETSOCK, 0)
192#define	DBG_LAYER_IN_END	NETDBG_CODE(DBG_NETSOCK, 2)
193#define	DBG_LAYER_OUT_BEG	NETDBG_CODE(DBG_NETSOCK, 1)
194#define	DBG_LAYER_OUT_END	NETDBG_CODE(DBG_NETSOCK, 3)
195#define	DBG_FNC_SOSEND		NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
196#define	DBG_FNC_SOSEND_LIST	NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 3)
197#define	DBG_FNC_SORECEIVE	NETDBG_CODE(DBG_NETSOCK, (8 << 8))
198#define	DBG_FNC_SORECEIVE_LIST	NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3)
199#define	DBG_FNC_SOSHUTDOWN	NETDBG_CODE(DBG_NETSOCK, (9 << 8))
200
201#define	MAX_SOOPTGETM_SIZE	(128 * MCLBYTES)
202
203int somaxconn = SOMAXCONN;
204SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
205	CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");
206
207/* Should we get a maximum also ??? */
208static int sosendmaxchain = 65536;
209static int sosendminchain = 16384;
210static int sorecvmincopy  = 16384;
211SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
212	CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
213SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
214	CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");
215
216/*
217 * Set to enable jumbo clusters (if available) for large writes when
218 * the socket is marked with SOF_MULTIPAGES; see below.
219 */
220int sosendjcl = 1;
221SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl,
222	CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");
223
224/*
225 * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
226 * writes on the socket for all protocols on any network interfaces,
227 * depending upon sosendjcl above.  Be extra careful when setting this
 * to 1, because sending packets that cross physical pages down to
229 * broken drivers (those that falsely assume that the physical pages
230 * are contiguous) might lead to system panics or silent data corruption.
231 * When set to 0, the system will respect SOF_MULTIPAGES, which is set
232 * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
233 * capable.  Set this to 1 only for testing/debugging purposes.
234 */
235int sosendjcl_ignore_capab = 0;
236SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab,
237	CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, "");
238
239int sosendbigcl_ignore_capab = 0;
240SYSCTL_INT(_kern_ipc, OID_AUTO, sosendbigcl_ignore_capab,
241	CTLFLAG_RW | CTLFLAG_LOCKED, &sosendbigcl_ignore_capab, 0, "");
242
243int sodefunctlog = 0;
244SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
245	&sodefunctlog, 0, "");
246
247int sothrottlelog = 0;
248SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
249	&sothrottlelog, 0, "");
250
251int sorestrictrecv = 1;
252SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
253	&sorestrictrecv, 0, "Enable inbound interface restrictions");
254
255int sorestrictsend = 1;
256SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictsend, CTLFLAG_RW | CTLFLAG_LOCKED,
257	&sorestrictsend, 0, "Enable outbound interface restrictions");
258
259extern struct inpcbinfo tcbinfo;
260
261/* TODO: these should be in header file */
262extern int get_inpcb_str_size(void);
263extern int get_tcp_str_size(void);
264
265static unsigned int sl_zone_size;		/* size of sockaddr_list */
266static struct zone *sl_zone;			/* zone for sockaddr_list */
267
268static unsigned int se_zone_size;		/* size of sockaddr_entry */
269static struct zone *se_zone;			/* zone for sockaddr_entry */
270
271vm_size_t	so_cache_zone_element_size;
272
273static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **, user_ssize_t *);
274static void cached_sock_alloc(struct socket **, int);
275static void cached_sock_free(struct socket *);
276
277/*
278 * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
279 * setting the DSCP code on the packet based on the service class; see
280 * <rdar://problem/11277343> for details.
281 */
282__private_extern__ u_int32_t sotcdb = SOTCDB_NO_DSCP;
283SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
284	&sotcdb, 0, "");
285
286void
287socketinit(void)
288{
289	_CASSERT(sizeof(so_gencnt) == sizeof(uint64_t));
290	VERIFY(IS_P2ALIGNED(&so_gencnt, sizeof(uint32_t)));
291
292	if (socketinit_done) {
293		printf("socketinit: already called...\n");
294		return;
295	}
296	socketinit_done = 1;
297
298	PE_parse_boot_argn("socket_debug", &socket_debug,
299	    sizeof (socket_debug));
300
301	/*
302	 * allocate lock group attribute and group for socket cache mutex
303	 */
304	so_cache_mtx_grp_attr = lck_grp_attr_alloc_init();
305	so_cache_mtx_grp = lck_grp_alloc_init("so_cache",
306	    so_cache_mtx_grp_attr);
307
308	/*
309	 * allocate the lock attribute for socket cache mutex
310	 */
311	so_cache_mtx_attr = lck_attr_alloc_init();
312
313	/* cached sockets mutex */
314	so_cache_mtx = lck_mtx_alloc_init(so_cache_mtx_grp, so_cache_mtx_attr);
315	if (so_cache_mtx == NULL) {
316		panic("%s: unable to allocate so_cache_mtx\n", __func__);
317		/* NOTREACHED */
318	}
319	STAILQ_INIT(&so_cache_head);
320
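	/*
	 * Each cache element packs a struct socket, an inpcb and a TCP PCB
	 * into one zone allocation; the 4-byte pads leave room for the
	 * longword alignment done in cached_sock_alloc().
	 */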
321	so_cache_zone_element_size = (vm_size_t)(sizeof (struct socket) + 4
322	    + get_inpcb_str_size() + 4 + get_tcp_str_size());
323
324	so_cache_zone = zinit(so_cache_zone_element_size,
325	    (120000 * so_cache_zone_element_size), 8192, "socache zone");
326	zone_change(so_cache_zone, Z_CALLERACCT, FALSE);
327	zone_change(so_cache_zone, Z_NOENCRYPT, TRUE);
328
329	sl_zone_size = sizeof (struct sockaddr_list);
330	if ((sl_zone = zinit(sl_zone_size, 1024 * sl_zone_size, 1024,
331	    "sockaddr_list")) == NULL) {
332		panic("%s: unable to allocate sockaddr_list zone\n", __func__);
333		/* NOTREACHED */
334	}
335	zone_change(sl_zone, Z_CALLERACCT, FALSE);
336	zone_change(sl_zone, Z_EXPAND, TRUE);
337
338	se_zone_size = sizeof (struct sockaddr_entry);
339	if ((se_zone = zinit(se_zone_size, 1024 * se_zone_size, 1024,
340	    "sockaddr_entry")) == NULL) {
341		panic("%s: unable to allocate sockaddr_entry zone\n", __func__);
342		/* NOTREACHED */
343	}
344	zone_change(se_zone, Z_CALLERACCT, FALSE);
345	zone_change(se_zone, Z_EXPAND, TRUE);
346
347
348	in_pcbinit();
349	sflt_init();
350	socket_tclass_init();
351#if MULTIPATH
352	mp_pcbinit();
353#endif /* MULTIPATH */
354}
355
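/*
 * Hand out a socket from the cache when one is available; otherwise carve a
 * fresh element out of so_cache_zone and point the saved PCB fields at the
 * extra space following the socket in the same allocation.
 */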
356static void
357cached_sock_alloc(struct socket **so, int waitok)
358{
359	caddr_t	temp;
360	uintptr_t offset;
361
362	lck_mtx_lock(so_cache_mtx);
363
364	if (!STAILQ_EMPTY(&so_cache_head)) {
365		VERIFY(cached_sock_count > 0);
366
367		*so = STAILQ_FIRST(&so_cache_head);
368		STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
369		STAILQ_NEXT((*so), so_cache_ent) = NULL;
370
371		cached_sock_count--;
372		lck_mtx_unlock(so_cache_mtx);
373
374		temp = (*so)->so_saved_pcb;
375		bzero((caddr_t)*so, sizeof (struct socket));
376
377		(*so)->so_saved_pcb = temp;
378	} else {
379
380		lck_mtx_unlock(so_cache_mtx);
381
382		if (waitok)
383			*so = (struct socket *)zalloc(so_cache_zone);
384		else
385			*so = (struct socket *)zalloc_noblock(so_cache_zone);
386
387		if (*so == NULL)
388			return;
389
390		bzero((caddr_t)*so, sizeof (struct socket));
391
392		/*
393		 * Define offsets for extra structures into our
394		 * single block of memory. Align extra structures
395		 * on longword boundaries.
396		 */
397
398		offset = (uintptr_t)*so;
399		offset += sizeof (struct socket);
400
401		offset = ALIGN(offset);
402
403		(*so)->so_saved_pcb = (caddr_t)offset;
404		offset += get_inpcb_str_size();
405
406		offset = ALIGN(offset);
407
408		((struct inpcb *)(void *)(*so)->so_saved_pcb)->inp_saved_ppcb =
409		    (caddr_t)offset;
410	}
411
412	(*so)->cached_in_sock_layer = true;
413}
414
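/*
 * Return a socket to the cache, or free it back to so_cache_zone once
 * max_cached_sock_count is exceeded; the timestamp lets so_cache_timer()
 * reclaim entries that linger past SO_CACHE_TIME_LIMIT.
 */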
415static void
416cached_sock_free(struct socket *so)
417{
418
419	lck_mtx_lock(so_cache_mtx);
420
421	so_cache_time = net_uptime();
422	if (++cached_sock_count > max_cached_sock_count) {
423		--cached_sock_count;
424		lck_mtx_unlock(so_cache_mtx);
425		zfree(so_cache_zone, so);
426	} else {
427		if (so_cache_hw < cached_sock_count)
428			so_cache_hw = cached_sock_count;
429
430		STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent);
431
432		so->cache_timestamp = so_cache_time;
433		lck_mtx_unlock(so_cache_mtx);
434	}
435}
436
437void
438so_update_last_owner_locked(struct socket *so, proc_t self)
439{
440	if (so->last_pid != 0) {
441		/*
442		 * last_pid and last_upid should remain zero for sockets
443		 * created using sock_socket. The check above achieves that
444		 */
445		if (self == PROC_NULL)
446			self = current_proc();
447
448		if (so->last_upid != proc_uniqueid(self) ||
449		    so->last_pid != proc_pid(self)) {
450			so->last_upid = proc_uniqueid(self);
451			so->last_pid = proc_pid(self);
452			proc_getexecutableuuid(self, so->last_uuid,
453			    sizeof (so->last_uuid));
454		}
455		proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
456	}
457}
458
459void
460so_update_policy(struct socket *so)
461{
462	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6)
463		(void) inp_update_policy(sotoinpcb(so));
464}
465
466#if NECP
467static void
468so_update_necp_policy(struct socket *so, struct sockaddr *override_local_addr, struct sockaddr *override_remote_addr)
469{
470	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6)
471		inp_update_necp_policy(sotoinpcb(so), override_local_addr, override_remote_addr, 0);
472}
473#endif /* NECP */
474
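/*
 * Reclaim cached sockets that have been idle longer than SO_CACHE_TIME_LIMIT;
 * returns TRUE when entries remain so the caller can reschedule the timer.
 */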
475boolean_t
476so_cache_timer(void)
477{
478	struct socket	*p;
479	int		n_freed = 0;
480	boolean_t rc = FALSE;
481
482	lck_mtx_lock(so_cache_mtx);
483	so_cache_timeouts++;
484	so_cache_time = net_uptime();
485
486	while (!STAILQ_EMPTY(&so_cache_head)) {
487		VERIFY(cached_sock_count > 0);
488		p = STAILQ_FIRST(&so_cache_head);
489		if ((so_cache_time - p->cache_timestamp) <
490			SO_CACHE_TIME_LIMIT)
491			break;
492
493		STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
494		--cached_sock_count;
495
496		zfree(so_cache_zone, p);
497
498		if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
499			so_cache_max_freed++;
500			break;
501		}
502	}
503
	/* Schedule again if there is more to clean up */
505	if (!STAILQ_EMPTY(&so_cache_head))
506		rc = TRUE;
507
508	lck_mtx_unlock(so_cache_mtx);
509	return (rc);
510}
511
512/*
513 * Get a socket structure from our zone, and initialize it.
514 * We don't implement `waitok' yet (see comments in uipc_domain.c).
515 * Note that it would probably be better to allocate socket
516 * and PCB at the same time, but I'm not convinced that all
517 * the protocols can be easily modified to do this.
518 */
519struct socket *
520soalloc(int waitok, int dom, int type)
521{
522	struct socket *so;
523
524	if ((dom == PF_INET) && (type == SOCK_STREAM)) {
525		cached_sock_alloc(&so, waitok);
526	} else {
527		MALLOC_ZONE(so, struct socket *, sizeof (*so), socket_zone,
528		    M_WAITOK);
529		if (so != NULL)
530			bzero(so, sizeof (*so));
531	}
532	if (so != NULL) {
533		so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
534		so->so_zone = socket_zone;
535#if CONFIG_MACF_SOCKET
536		/* Convert waitok to  M_WAITOK/M_NOWAIT for MAC Framework. */
537		if (mac_socket_label_init(so, !waitok) != 0) {
538			sodealloc(so);
539			return (NULL);
540		}
541#endif /* MAC_SOCKET */
542	}
543
544	return (so);
545}
546
547int
548socreate_internal(int dom, struct socket **aso, int type, int proto,
549    struct proc *p, uint32_t flags, struct proc *ep)
550{
551	struct protosw *prp;
552	struct socket *so;
553	int error = 0;
554
555#if TCPDEBUG
556	extern int tcpconsdebug;
557#endif
558
559	VERIFY(aso != NULL);
560	*aso = NULL;
561
562	if (proto != 0)
563		prp = pffindproto(dom, proto, type);
564	else
565		prp = pffindtype(dom, type);
566
567	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) {
568		if (pffinddomain(dom) == NULL)
569			return (EAFNOSUPPORT);
570		if (proto != 0) {
571			if (pffindprotonotype(dom, proto) != NULL)
572				return (EPROTOTYPE);
573		}
574		return (EPROTONOSUPPORT);
575	}
576	if (prp->pr_type != type)
577		return (EPROTOTYPE);
578	so = soalloc(1, dom, type);
579	if (so == NULL)
580		return (ENOBUFS);
581
582	if (flags & SOCF_ASYNC)
583		so->so_state |= SS_NBIO;
584#if MULTIPATH
585	if (flags & SOCF_MP_SUBFLOW) {
586		/*
587		 * A multipath subflow socket is used internally in the kernel,
		 * therefore it does not have a file descriptor associated by
589		 * default.
590		 */
591		so->so_state |= SS_NOFDREF;
592		so->so_flags |= SOF_MP_SUBFLOW;
593	}
594#endif /* MULTIPATH */
595
596	TAILQ_INIT(&so->so_incomp);
597	TAILQ_INIT(&so->so_comp);
598	so->so_type = type;
599	so->last_upid = proc_uniqueid(p);
600	so->last_pid = proc_pid(p);
601	proc_getexecutableuuid(p, so->last_uuid, sizeof (so->last_uuid));
602	proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
603
604	if (ep != PROC_NULL && ep != p) {
605		so->e_upid = proc_uniqueid(ep);
606		so->e_pid = proc_pid(ep);
607		proc_getexecutableuuid(ep, so->e_uuid, sizeof (so->e_uuid));
608		so->so_flags |= SOF_DELEGATED;
609	}
610
611	so->so_cred = kauth_cred_proc_ref(p);
612	if (!suser(kauth_cred_get(), NULL))
613		so->so_state |= SS_PRIV;
614
615	so->so_proto = prp;
616	so->so_rcv.sb_flags |= SB_RECV;
617	so->so_rcv.sb_so = so->so_snd.sb_so = so;
618	so->next_lock_lr = 0;
619	so->next_unlock_lr = 0;
620
621#if CONFIG_MACF_SOCKET
622	mac_socket_label_associate(kauth_cred_get(), so);
623#endif /* MAC_SOCKET */
624
625	/*
	 * Attachment will create the per-PCB lock if necessary and
	 * increase the refcount for creation; make sure it's done before
	 * the socket is inserted in any lists.
629	 */
630	so->so_usecount++;
631
632	error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
633	if (error != 0) {
634		/*
635		 * Warning:
636		 * If so_pcb is not zero, the socket will be leaked,
		 * so the protocol attachment handler must be coded carefully.
638		 */
639		so->so_state |= SS_NOFDREF;
640		so->so_usecount--;
641		sofreelastref(so, 1);	/* will deallocate the socket */
642		return (error);
643	}
644
645	atomic_add_32(&prp->pr_domain->dom_refs, 1);
646	TAILQ_INIT(&so->so_evlist);
647
648	/* Attach socket filters for this protocol */
649	sflt_initsock(so);
650#if TCPDEBUG
651	if (tcpconsdebug == 2)
652		so->so_options |= SO_DEBUG;
653#endif
654	so_set_default_traffic_class(so);
655
656	/*
657	 * If this thread or task is marked to create backgrounded sockets,
658	 * mark the socket as background.
659	 */
660	if (proc_get_effective_thread_policy(current_thread(), TASK_POLICY_NEW_SOCKETS_BG)) {
661		socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
662		so->so_background_thread = current_thread();
663	}
664
665	switch (dom) {
666	/*
667	 * Don't mark Unix domain, system or multipath sockets as
668	 * eligible for defunct by default.
669	 */
670	case PF_LOCAL:
671	case PF_SYSTEM:
672	case PF_MULTIPATH:
673		so->so_flags |= SOF_NODEFUNCT;
674		break;
675	default:
676		break;
677	}
678
679	/*
680	 * Entitlements can't be checked at socket creation time except if the
681	 * application requested a feature guarded by a privilege (c.f., socket
682	 * delegation).
683	 * The priv(9) and the Sandboxing APIs are designed with the idea that
684	 * a privilege check should only be triggered by a userland request.
685	 * A privilege check at socket creation time is time consuming and
686	 * could trigger many authorisation error messages from the security
687	 * APIs.
688	 */
689
690	*aso = so;
691
692	return (0);
693}
694
695/*
696 * Returns:	0			Success
697 *		EAFNOSUPPORT
698 *		EPROTOTYPE
699 *		EPROTONOSUPPORT
700 *		ENOBUFS
701 *	<pru_attach>:ENOBUFS[AF_UNIX]
702 *	<pru_attach>:ENOBUFS[TCP]
703 *	<pru_attach>:ENOMEM[TCP]
704 *	<pru_attach>:???		[other protocol families, IPSEC]
705 */
706int
707socreate(int dom, struct socket **aso, int type, int proto)
708{
709	return (socreate_internal(dom, aso, type, proto, current_proc(), 0,
710	    PROC_NULL));
711}
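
/*
 * Example (sketch of an in-kernel caller, not taken from this file): a TCP
 * socket would typically be obtained with
 *
 *	struct socket *so;
 *	int error = socreate(PF_INET, &so, SOCK_STREAM, IPPROTO_TCP);
 *
 * Passing proto as 0 picks the default protocol registered for the
 * domain/type pair via pffindtype().
 */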
712
713int
714socreate_delegate(int dom, struct socket **aso, int type, int proto, pid_t epid)
715{
716	int error = 0;
717	struct proc *ep = PROC_NULL;
718
719	if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) {
720		error = ESRCH;
721		goto done;
722	}
723
724	error = socreate_internal(dom, aso, type, proto, current_proc(), 0, ep);
725
726	/*
727	 * It might not be wise to hold the proc reference when calling
728	 * socreate_internal since it calls soalloc with M_WAITOK
729	 */
730done:
731	if (ep != PROC_NULL)
732		proc_rele(ep);
733
734	return (error);
735}
736
737/*
738 * Returns:	0			Success
739 *	<pru_bind>:EINVAL		Invalid argument [COMMON_START]
740 *	<pru_bind>:EAFNOSUPPORT		Address family not supported
741 *	<pru_bind>:EADDRNOTAVAIL	Address not available.
742 *	<pru_bind>:EINVAL		Invalid argument
743 *	<pru_bind>:EAFNOSUPPORT		Address family not supported [notdef]
744 *	<pru_bind>:EACCES		Permission denied
745 *	<pru_bind>:EADDRINUSE		Address in use
746 *	<pru_bind>:EAGAIN		Resource unavailable, try again
747 *	<pru_bind>:EPERM		Operation not permitted
748 *	<pru_bind>:???
749 *	<sf_bind>:???
750 *
751 * Notes:	It's not possible to fully enumerate the return codes above,
752 *		since socket filter authors and protocol family authors may
753 *		not choose to limit their error returns to those listed, even
754 *		though this may result in some software operating incorrectly.
755 *
756 *		The error codes which are enumerated above are those known to
757 *		be returned by the tcp_usr_bind function supplied.
758 */
759int
760sobindlock(struct socket *so, struct sockaddr *nam, int dolock)
761{
762	struct proc *p = current_proc();
763	int error = 0;
764
765	if (dolock)
766		socket_lock(so, 1);
767	VERIFY(so->so_usecount > 1);
768
769	so_update_last_owner_locked(so, p);
770	so_update_policy(so);
771
772#if NECP
773	so_update_necp_policy(so, nam, NULL);
774#endif /* NECP */
775
776	/*
777	 * If this is a bind request on a socket that has been marked
778	 * as inactive, reject it now before we go any further.
779	 */
780	if (so->so_flags & SOF_DEFUNCT) {
781		error = EINVAL;
782		SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n",
783		    __func__, proc_pid(p), (uint64_t)VM_KERNEL_ADDRPERM(so),
784		    SOCK_DOM(so), SOCK_TYPE(so), error));
785		goto out;
786	}
787
788	/* Socket filter */
789	error = sflt_bind(so, nam);
790
791	if (error == 0)
792		error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
793out:
794	if (dolock)
795		socket_unlock(so, 1);
796
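	/*
	 * EJUSTRETURN from a socket filter means the filter has taken
	 * responsibility for the bind; report success to the caller.
	 */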
797	if (error == EJUSTRETURN)
798		error = 0;
799
800	return (error);
801}
802
803void
804sodealloc(struct socket *so)
805{
806	kauth_cred_unref(&so->so_cred);
807
808	/* Remove any filters */
809	sflt_termsock(so);
810
811#if CONTENT_FILTER
812	cfil_sock_detach(so);
813#endif /* CONTENT_FILTER */
814
815	/* Delete the state allocated for msg queues on a socket */
816	if (so->so_flags & SOF_ENABLE_MSGS) {
817		FREE(so->so_msg_state, M_TEMP);
818		so->so_msg_state = NULL;
819	}
820	VERIFY(so->so_msg_state == NULL);
821
822	so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
823
824#if CONFIG_MACF_SOCKET
825	mac_socket_label_destroy(so);
826#endif /* MAC_SOCKET */
827
828	if (so->cached_in_sock_layer) {
829		cached_sock_free(so);
830	} else {
831		FREE_ZONE(so, sizeof (*so), so->so_zone);
832	}
833}
834
835/*
836 * Returns:	0			Success
837 *		EINVAL
838 *		EOPNOTSUPP
839 *	<pru_listen>:EINVAL[AF_UNIX]
840 *	<pru_listen>:EINVAL[TCP]
841 *	<pru_listen>:EADDRNOTAVAIL[TCP]	Address not available.
842 *	<pru_listen>:EINVAL[TCP]	Invalid argument
843 *	<pru_listen>:EAFNOSUPPORT[TCP]	Address family not supported [notdef]
844 *	<pru_listen>:EACCES[TCP]	Permission denied
845 *	<pru_listen>:EADDRINUSE[TCP]	Address in use
846 *	<pru_listen>:EAGAIN[TCP]	Resource unavailable, try again
847 *	<pru_listen>:EPERM[TCP]		Operation not permitted
848 *	<sf_listen>:???
849 *
850 * Notes:	Other <pru_listen> returns depend on the protocol family; all
851 *		<sf_listen> returns depend on what the filter author causes
852 *		their filter to return.
853 */
854int
855solisten(struct socket *so, int backlog)
856{
857	struct proc *p = current_proc();
858	int error = 0;
859
860	socket_lock(so, 1);
861
862	so_update_last_owner_locked(so, p);
863	so_update_policy(so);
864
865#if NECP
866	so_update_necp_policy(so, NULL, NULL);
867#endif /* NECP */
868
869	if (so->so_proto == NULL) {
870		error = EINVAL;
871		goto out;
872	}
873	if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
874		error = EOPNOTSUPP;
875		goto out;
876	}
877
878	/*
879	 * If the listen request is made on a socket that is not fully
880	 * disconnected, or on a socket that has been marked as inactive,
881	 * reject the request now.
882	 */
883	if ((so->so_state &
884	    (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) ||
885	    (so->so_flags & SOF_DEFUNCT)) {
886		error = EINVAL;
887		if (so->so_flags & SOF_DEFUNCT) {
888			SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] "
889			    "(%d)\n", __func__, proc_pid(p),
890			    (uint64_t)VM_KERNEL_ADDRPERM(so),
891			    SOCK_DOM(so), SOCK_TYPE(so), error));
892		}
893		goto out;
894	}
895
896	if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) {
897		error = EPERM;
898		goto out;
899	}
900
901	error = sflt_listen(so);
902	if (error == 0)
903		error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
904
905	if (error) {
906		if (error == EJUSTRETURN)
907			error = 0;
908		goto out;
909	}
910
911	if (TAILQ_EMPTY(&so->so_comp))
912		so->so_options |= SO_ACCEPTCONN;
913	/*
914	 * POSIX: The implementation may have an upper limit on the length of
 * the listen queue, either global or per accepting socket. If backlog
916	 * exceeds this limit, the length of the listen queue is set to the
917	 * limit.
918	 *
919	 * If listen() is called with a backlog argument value that is less
920	 * than 0, the function behaves as if it had been called with a backlog
921	 * argument value of 0.
922	 *
923	 * A backlog argument of 0 may allow the socket to accept connections,
924	 * in which case the length of the listen queue may be set to an
925	 * implementation-defined minimum value.
926	 */
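	/*
	 * Here both cases collapse to the system-wide default: for example, a
	 * backlog of 0 or -1 leaves so_qlimit at somaxconn (128 by default,
	 * tunable via kern.ipc.somaxconn).
	 */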
927	if (backlog <= 0 || backlog > somaxconn)
928		backlog = somaxconn;
929
930	so->so_qlimit = backlog;
931out:
932	socket_unlock(so, 1);
933	return (error);
934}
935
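/*
 * Release the last reference on a socket.  Buffers are flushed and, when
 * `dealloc' is set, the socket is freed; sockets still sitting on a
 * listening socket's queues are handled specially (see the comments below)
 * so a pending accept(2) does not break.
 */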
936void
937sofreelastref(struct socket *so, int dealloc)
938{
939	struct socket *head = so->so_head;
940
941	/* Assume socket is locked */
942
943	if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) {
944		selthreadclear(&so->so_snd.sb_sel);
945		selthreadclear(&so->so_rcv.sb_sel);
946		so->so_rcv.sb_flags &= ~(SB_SEL|SB_UPCALL);
947		so->so_snd.sb_flags &= ~(SB_SEL|SB_UPCALL);
948		so->so_event = sonullevent;
949		return;
950	}
951	if (head != NULL) {
952		socket_lock(head, 1);
953		if (so->so_state & SS_INCOMP) {
954			TAILQ_REMOVE(&head->so_incomp, so, so_list);
955			head->so_incqlen--;
956		} else if (so->so_state & SS_COMP) {
957			/*
958			 * We must not decommission a socket that's
959			 * on the accept(2) queue.  If we do, then
960			 * accept(2) may hang after select(2) indicated
961			 * that the listening socket was ready.
962			 */
963			selthreadclear(&so->so_snd.sb_sel);
964			selthreadclear(&so->so_rcv.sb_sel);
965			so->so_rcv.sb_flags &= ~(SB_SEL|SB_UPCALL);
966			so->so_snd.sb_flags &= ~(SB_SEL|SB_UPCALL);
967			so->so_event = sonullevent;
968			socket_unlock(head, 1);
969			return;
970		} else {
971			panic("sofree: not queued");
972		}
973		head->so_qlen--;
974		so->so_state &= ~SS_INCOMP;
975		so->so_head = NULL;
976		socket_unlock(head, 1);
977	}
978	sowflush(so);
979	sorflush(so);
980
981#if FLOW_DIVERT
982	if (so->so_flags & SOF_FLOW_DIVERT) {
983		flow_divert_detach(so);
984	}
985#endif	/* FLOW_DIVERT */
986
987	/* 3932268: disable upcall */
988	so->so_rcv.sb_flags &= ~SB_UPCALL;
989	so->so_snd.sb_flags &= ~SB_UPCALL;
990	so->so_event = sonullevent;
991
992	if (dealloc)
993		sodealloc(so);
994}
995
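/*
 * Wait for any outstanding socket upcalls to drain before closing; this is
 * only done when the protocol opted in via SOF_UPCALLCLOSEWAIT.
 */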
996void
997soclose_wait_locked(struct socket *so)
998{
999	lck_mtx_t *mutex_held;
1000
1001	if (so->so_proto->pr_getlock != NULL)
1002		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1003	else
1004		mutex_held = so->so_proto->pr_domain->dom_mtx;
1005	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
1006
1007	/*
1008	 * Double check here and return if there's no outstanding upcall;
1009	 * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
1010	 */
1011	if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT))
1012		return;
1013	so->so_rcv.sb_flags &= ~SB_UPCALL;
1014	so->so_snd.sb_flags &= ~SB_UPCALL;
1015	so->so_flags |= SOF_CLOSEWAIT;
1016	(void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - 1),
1017	    "soclose_wait_locked", NULL);
1018	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
1019	so->so_flags &= ~SOF_CLOSEWAIT;
1020}
1021
1022/*
1023 * Close a socket on last file table reference removal.
1024 * Initiate disconnect if connected.
1025 * Free socket when disconnect complete.
1026 */
1027int
1028soclose_locked(struct socket *so)
1029{
1030	int error = 0;
1031	lck_mtx_t *mutex_held;
1032	struct timespec ts;
1033
1034	if (so->so_usecount == 0) {
1035		panic("soclose: so=%p refcount=0\n", so);
1036		/* NOTREACHED */
1037	}
1038
1039	sflt_notify(so, sock_evt_closing, NULL);
1040
1041	if (so->so_upcallusecount)
1042		soclose_wait_locked(so);
1043
1044#if CONTENT_FILTER
1045	/*
1046	 * We have to wait until the content filters are done
1047	 */
1048	if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
1049		cfil_sock_close_wait(so);
1050		cfil_sock_is_closed(so);
1051		cfil_sock_detach(so);
1052	}
1053#endif /* CONTENT_FILTER */
1054
1055	if ((so->so_options & SO_ACCEPTCONN)) {
1056		struct socket *sp, *sonext;
1057		int socklock = 0;
1058
1059		/*
		 * We do not want new connections to be added
		 * to the connection queues.
1062		 */
1063		so->so_options &= ~SO_ACCEPTCONN;
1064
1065		for (sp = TAILQ_FIRST(&so->so_incomp);
1066		    sp != NULL; sp = sonext) {
1067			sonext = TAILQ_NEXT(sp, so_list);
1068
1069			/*
1070			 * Radar 5350314
			 * Skip sockets thrown away by tcp_dropdropablreq();
			 * they will get cleaned up by the garbage collection.
			 * Otherwise, remove the incomplete socket from the
			 * queue and let soabort() trigger the appropriate
			 * cleanup.
1075			 */
1076			if (sp->so_flags & SOF_OVERFLOW)
1077				continue;
1078
1079			if (so->so_proto->pr_getlock != NULL) {
1080				/*
				 * For lock ordering consistency with the rest
				 * of the stack, we lock the socket first and
				 * then grab the head.
1084				 */
1085				socket_unlock(so, 0);
1086				socket_lock(sp, 1);
1087				socket_lock(so, 0);
1088				socklock = 1;
1089			}
1090
1091			TAILQ_REMOVE(&so->so_incomp, sp, so_list);
1092			so->so_incqlen--;
1093
1094			if (sp->so_state & SS_INCOMP) {
1095				sp->so_state &= ~SS_INCOMP;
1096				sp->so_head = NULL;
1097
1098				(void) soabort(sp);
1099			}
1100
1101			if (socklock)
1102				socket_unlock(sp, 1);
1103		}
1104
1105		while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
1106			/* Dequeue from so_comp since sofree() won't do it */
1107			TAILQ_REMOVE(&so->so_comp, sp, so_list);
1108			so->so_qlen--;
1109
1110			if (so->so_proto->pr_getlock != NULL) {
1111				socket_unlock(so, 0);
1112				socket_lock(sp, 1);
1113			}
1114
1115			if (sp->so_state & SS_COMP) {
1116				sp->so_state &= ~SS_COMP;
1117				sp->so_head = NULL;
1118
1119				(void) soabort(sp);
1120			}
1121
1122			if (so->so_proto->pr_getlock != NULL) {
1123				socket_unlock(sp, 1);
1124				socket_lock(so, 0);
1125			}
1126		}
1127	}
1128	if (so->so_pcb == NULL) {
1129		/* 3915887: mark the socket as ready for dealloc */
1130		so->so_flags |= SOF_PCBCLEARING;
1131		goto discard;
1132	}
1133	if (so->so_state & SS_ISCONNECTED) {
1134		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
1135			error = sodisconnectlocked(so);
1136			if (error)
1137				goto drop;
1138		}
1139		if (so->so_options & SO_LINGER) {
1140			if ((so->so_state & SS_ISDISCONNECTING) &&
1141			    (so->so_state & SS_NBIO))
1142				goto drop;
1143			if (so->so_proto->pr_getlock != NULL)
1144				mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1145			else
1146				mutex_held = so->so_proto->pr_domain->dom_mtx;
1147			while (so->so_state & SS_ISCONNECTED) {
1148				ts.tv_sec = (so->so_linger/100);
1149				ts.tv_nsec = (so->so_linger % 100) *
1150				    NSEC_PER_USEC * 1000 * 10;
1151				error = msleep((caddr_t)&so->so_timeo,
1152				    mutex_held, PSOCK | PCATCH, "soclose", &ts);
1153				if (error) {
1154					/*
					 * It's OK when the timer fires;
					 * don't report an error.
1157					 */
1158					if (error == EWOULDBLOCK)
1159						error = 0;
1160					break;
1161				}
1162			}
1163		}
1164	}
1165drop:
1166	if (so->so_usecount == 0) {
1167		panic("soclose: usecount is zero so=%p\n", so);
1168		/* NOTREACHED */
1169	}
1170	if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) {
1171		int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
1172		if (error == 0)
1173			error = error2;
1174	}
1175	if (so->so_usecount <= 0) {
1176		panic("soclose: usecount is zero so=%p\n", so);
1177		/* NOTREACHED */
1178	}
1179discard:
1180	if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
1181	    (so->so_state & SS_NOFDREF)) {
1182		panic("soclose: NOFDREF");
1183		/* NOTREACHED */
1184	}
1185	so->so_state |= SS_NOFDREF;
1186
1187	if (so->so_flags & SOF_MP_SUBFLOW)
1188		so->so_flags &= ~SOF_MP_SUBFLOW;
1189
1190	if ((so->so_flags & SOF_KNOTE) != 0)
1191		KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);
1192
1193	atomic_add_32(&so->so_proto->pr_domain->dom_refs, -1);
1194	evsofree(so);
1195
1196	so->so_usecount--;
1197	sofree(so);
1198	return (error);
1199}
1200
1201int
1202soclose(struct socket *so)
1203{
1204	int error = 0;
1205	socket_lock(so, 1);
1206
1207	if (so->so_retaincnt == 0) {
1208		error = soclose_locked(so);
1209	} else {
1210		/*
		 * If the FD is going away but the socket is
		 * retained in the kernel, remove its reference.
1213		 */
1214		so->so_usecount--;
1215		if (so->so_usecount < 2)
1216			panic("soclose: retaincnt non null and so=%p "
1217			    "usecount=%d\n", so, so->so_usecount);
1218	}
1219	socket_unlock(so, 1);
1220	return (error);
1221}
1222
1223/*
1224 * Must be called at splnet...
1225 */
1226/* Should already be locked */
1227int
1228soabort(struct socket *so)
1229{
1230	int error;
1231
1232#ifdef MORE_LOCKING_DEBUG
1233	lck_mtx_t *mutex_held;
1234
1235	if (so->so_proto->pr_getlock != NULL)
1236		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1237	else
1238		mutex_held = so->so_proto->pr_domain->dom_mtx;
1239	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
1240#endif
1241
1242	if ((so->so_flags & SOF_ABORTED) == 0) {
1243		so->so_flags |= SOF_ABORTED;
1244		error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
1245		if (error) {
1246			sofree(so);
1247			return (error);
1248		}
1249	}
1250	return (0);
1251}
1252
1253int
1254soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
1255{
1256	int error;
1257
1258	if (dolock)
1259		socket_lock(so, 1);
1260
1261	so_update_last_owner_locked(so, PROC_NULL);
1262	so_update_policy(so);
1263#if NECP
1264	so_update_necp_policy(so, NULL, NULL);
1265#endif /* NECP */
1266
1267	if ((so->so_state & SS_NOFDREF) == 0)
1268		panic("soaccept: !NOFDREF");
1269	so->so_state &= ~SS_NOFDREF;
1270	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
1271
1272	if (dolock)
1273		socket_unlock(so, 1);
1274	return (error);
1275}
1276
1277int
1278soaccept(struct socket *so, struct sockaddr **nam)
1279{
1280	return (soacceptlock(so, nam, 1));
1281}
1282
1283int
1284soacceptfilter(struct socket *so)
1285{
1286	struct sockaddr *local = NULL, *remote = NULL;
1287	int error = 0;
1288	struct socket *head = so->so_head;
1289
1290	/*
1291	 * Hold the lock even if this socket has not been made visible
1292	 * to the filter(s).  For sockets with global locks, this protects
1293	 * against the head or peer going away
1294	 */
1295	socket_lock(so, 1);
1296	if (sogetaddr_locked(so, &remote, 1) != 0 ||
1297	    sogetaddr_locked(so, &local, 0) != 0) {
1298		so->so_state &= ~(SS_NOFDREF | SS_COMP);
1299		so->so_head = NULL;
1300		socket_unlock(so, 1);
1301		soclose(so);
1302		/* Out of resources; try it again next time */
1303		error = ECONNABORTED;
1304		goto done;
1305	}
1306
1307	error = sflt_accept(head, so, local, remote);
1308
1309	/*
1310	 * If we get EJUSTRETURN from one of the filters, mark this socket
1311	 * as inactive and return it anyway.  This newly accepted socket
1312	 * will be disconnected later before we hand it off to the caller.
1313	 */
1314	if (error == EJUSTRETURN) {
1315		error = 0;
1316		(void) sosetdefunct(current_proc(), so,
1317		    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
1318	}
1319
1320	if (error != 0) {
1321		/*
1322		 * This may seem like a duplication to the above error
1323		 * handling part when we return ECONNABORTED, except
1324		 * the following is done while holding the lock since
1325		 * the socket has been exposed to the filter(s) earlier.
1326		 */
1327		so->so_state &= ~(SS_NOFDREF | SS_COMP);
1328		so->so_head = NULL;
1329		socket_unlock(so, 1);
1330		soclose(so);
1331		/* Propagate socket filter's error code to the caller */
1332	} else {
1333		socket_unlock(so, 1);
1334	}
1335done:
1336	/* Callee checks for NULL pointer */
1337	sock_freeaddr(remote);
1338	sock_freeaddr(local);
1339	return (error);
1340}
1341
1342/*
1343 * Returns:	0			Success
1344 *		EOPNOTSUPP		Operation not supported on socket
1345 *		EISCONN			Socket is connected
1346 *	<pru_connect>:EADDRNOTAVAIL	Address not available.
1347 *	<pru_connect>:EINVAL		Invalid argument
1348 *	<pru_connect>:EAFNOSUPPORT	Address family not supported [notdef]
1349 *	<pru_connect>:EACCES		Permission denied
1350 *	<pru_connect>:EADDRINUSE	Address in use
1351 *	<pru_connect>:EAGAIN		Resource unavailable, try again
1352 *	<pru_connect>:EPERM		Operation not permitted
1353 *	<sf_connect_out>:???		[anything a filter writer might set]
1354 */
1355int
1356soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
1357{
1358	int error;
1359	struct proc *p = current_proc();
1360
1361	if (dolock)
1362		socket_lock(so, 1);
1363
1364	so_update_last_owner_locked(so, p);
1365	so_update_policy(so);
1366
1367#if NECP
1368	so_update_necp_policy(so, NULL, nam);
1369#endif /* NECP */
1370
1371	/*
1372	 * If this is a listening socket or if this is a previously-accepted
1373	 * socket that has been marked as inactive, reject the connect request.
1374	 */
1375	if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1376		error = EOPNOTSUPP;
1377		if (so->so_flags & SOF_DEFUNCT) {
1378			SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] "
1379			    "(%d)\n", __func__, proc_pid(p),
1380			    (uint64_t)VM_KERNEL_ADDRPERM(so),
1381			    SOCK_DOM(so), SOCK_TYPE(so), error));
1382		}
1383		if (dolock)
1384			socket_unlock(so, 1);
1385		return (error);
1386	}
1387
1388	if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
1389		if (dolock)
1390			socket_unlock(so, 1);
1391		return (EPERM);
1392	}
1393
1394	/*
1395	 * If protocol is connection-based, can only connect once.
1396	 * Otherwise, if connected, try to disconnect first.
1397	 * This allows user to disconnect by connecting to, e.g.,
1398	 * a null address.
1399	 */
1400	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
1401	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1402	    (error = sodisconnectlocked(so)))) {
1403		error = EISCONN;
1404	} else {
1405		/*
1406		 * Run connect filter before calling protocol:
1407		 *  - non-blocking connect returns before completion;
1408		 */
1409		error = sflt_connectout(so, nam);
1410		if (error != 0) {
1411			if (error == EJUSTRETURN)
1412				error = 0;
1413		} else {
1414			error = (*so->so_proto->pr_usrreqs->pru_connect)
1415			    (so, nam, p);
1416		}
1417	}
1418	if (dolock)
1419		socket_unlock(so, 1);
1420	return (error);
1421}
1422
1423int
1424soconnect(struct socket *so, struct sockaddr *nam)
1425{
1426	return (soconnectlock(so, nam, 1));
1427}
1428
1429/*
1430 * Returns:	0			Success
1431 *	<pru_connect2>:EINVAL[AF_UNIX]
1432 *	<pru_connect2>:EPROTOTYPE[AF_UNIX]
1433 *	<pru_connect2>:???		[other protocol families]
1434 *
1435 * Notes:	<pru_connect2> is not supported by [TCP].
1436 */
1437int
1438soconnect2(struct socket *so1, struct socket *so2)
1439{
1440	int error;
1441
1442	socket_lock(so1, 1);
1443	if (so2->so_proto->pr_lock)
1444		socket_lock(so2, 1);
1445
1446	error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
1447
1448	socket_unlock(so1, 1);
1449	if (so2->so_proto->pr_lock)
1450		socket_unlock(so2, 1);
1451	return (error);
1452}
1453
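/*
 * Connect-with-associations support (used by the connectx(2) path): like
 * soconnectlock(), but takes source and destination address lists plus an
 * association id, and assumes the caller already holds the socket lock.
 */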
1454int
1455soconnectxlocked(struct socket *so, struct sockaddr_list **src_sl,
1456    struct sockaddr_list **dst_sl, struct proc *p, uint32_t ifscope,
1457    associd_t aid, connid_t *pcid, uint32_t flags, void *arg,
1458    uint32_t arglen)
1459{
1460	int error;
1461
1462	so_update_last_owner_locked(so, p);
1463	so_update_policy(so);
1464
1465	/*
1466	 * If this is a listening socket or if this is a previously-accepted
1467	 * socket that has been marked as inactive, reject the connect request.
1468	 */
1469	if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1470		error = EOPNOTSUPP;
1471		if (so->so_flags & SOF_DEFUNCT) {
1472			SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] "
1473			    "(%d)\n", __func__, proc_pid(p),
1474			    (uint64_t)VM_KERNEL_ADDRPERM(so),
1475			    SOCK_DOM(so), SOCK_TYPE(so), error));
1476		}
1477		return (error);
1478	}
1479
1480	if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0)
1481		return (EPERM);
1482
1483	/*
1484	 * If protocol is connection-based, can only connect once
1485	 * unless PR_MULTICONN is set.  Otherwise, if connected,
1486	 * try to disconnect first.  This allows user to disconnect
1487	 * by connecting to, e.g., a null address.
1488	 */
1489	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) &&
1490	    !(so->so_proto->pr_flags & PR_MULTICONN) &&
1491	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1492	    (error = sodisconnectlocked(so)) != 0)) {
1493		error = EISCONN;
1494	} else {
1495		/*
1496		 * Run connect filter before calling protocol:
1497		 *  - non-blocking connect returns before completion;
1498		 */
1499		error = sflt_connectxout(so, dst_sl);
1500		if (error != 0) {
1501			if (error == EJUSTRETURN)
1502				error = 0;
1503		} else {
1504			error = (*so->so_proto->pr_usrreqs->pru_connectx)
1505			    (so, src_sl, dst_sl, p, ifscope, aid, pcid,
1506			    flags, arg, arglen);
1507		}
1508	}
1509
1510	return (error);
1511}
1512
1513int
1514sodisconnectlocked(struct socket *so)
1515{
1516	int error;
1517
1518	if ((so->so_state & SS_ISCONNECTED) == 0) {
1519		error = ENOTCONN;
1520		goto bad;
1521	}
1522	if (so->so_state & SS_ISDISCONNECTING) {
1523		error = EALREADY;
1524		goto bad;
1525	}
1526
1527	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1528	if (error == 0)
1529		sflt_notify(so, sock_evt_disconnected, NULL);
1530
1531bad:
1532	return (error);
1533}
1534
1535/* Locking version */
1536int
1537sodisconnect(struct socket *so)
1538{
1539	int error;
1540
1541	socket_lock(so, 1);
1542	error = sodisconnectlocked(so);
1543	socket_unlock(so, 1);
1544	return (error);
1545}
1546
1547int
1548sodisconnectxlocked(struct socket *so, associd_t aid, connid_t cid)
1549{
1550	int error;
1551
1552	/*
1553	 * Call the protocol disconnectx handler; let it handle all
1554	 * matters related to the connection state of this session.
1555	 */
1556	error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid);
1557	if (error == 0) {
1558		/*
1559		 * The event applies only for the session, not for
1560		 * the disconnection of individual subflows.
1561		 */
1562		if (so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED))
1563			sflt_notify(so, sock_evt_disconnected, NULL);
1564	}
1565	return (error);
1566}
1567
1568int
1569sodisconnectx(struct socket *so, associd_t aid, connid_t cid)
1570{
1571	int error;
1572
1573	socket_lock(so, 1);
1574	error = sodisconnectxlocked(so, aid, cid);
1575	socket_unlock(so, 1);
1576	return (error);
1577}
1578
1579int
1580sopeelofflocked(struct socket *so, associd_t aid, struct socket **psop)
1581{
1582	return ((*so->so_proto->pr_usrreqs->pru_peeloff)(so, aid, psop));
1583}
1584
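/*
 * MSG_DONTWAIT means the sockbuf lock should be tried without sleeping;
 * otherwise sblock() is allowed to wait (SBL_WAIT).
 */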
1585#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
1586
1587/*
1588 * sosendcheck will lock the socket buffer if it isn't locked and
1589 * verify that there is space for the data being inserted.
1590 *
1591 * Returns:	0			Success
1592 *		EPIPE
1593 *	sblock:EWOULDBLOCK
1594 *	sblock:EINTR
1595 *	sbwait:EBADF
1596 *	sbwait:EINTR
1597 *	[so_error]:???
1598 */
1599int
1600sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid,
1601    int32_t clen, int32_t atomic, int flags, int *sblocked,
1602    struct mbuf *control)
1603{
1604	int	error = 0;
1605	int32_t space;
1606	int	assumelock = 0;
1607
1608restart:
1609	if (*sblocked == 0) {
1610		if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
1611		    so->so_send_filt_thread != 0 &&
1612		    so->so_send_filt_thread == current_thread()) {
1613			/*
1614			 * We're being called recursively from a filter,
1615			 * allow this to continue. Radar 4150520.
1616			 * Don't set sblocked because we don't want
1617			 * to perform an unlock later.
1618			 */
1619			assumelock = 1;
1620		} else {
1621			error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1622			if (error) {
1623				if (so->so_flags & SOF_DEFUNCT)
1624					goto defunct;
1625				return (error);
1626			}
1627			*sblocked = 1;
1628		}
1629	}
1630
1631	/*
1632	 * If a send attempt is made on a socket that has been marked
1633	 * as inactive (disconnected), reject the request.
1634	 */
1635	if (so->so_flags & SOF_DEFUNCT) {
1636defunct:
1637		error = EPIPE;
1638		SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n",
1639		    __func__, proc_selfpid(), (uint64_t)VM_KERNEL_ADDRPERM(so),
1640		    SOCK_DOM(so), SOCK_TYPE(so), error));
1641		return (error);
1642	}
1643
1644	if (so->so_state & SS_CANTSENDMORE) {
1645#if CONTENT_FILTER
1646		/*
		 * Can re-inject data of half-closed connections.
1648		 */
1649		if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
1650			so->so_snd.sb_cfil_thread == current_thread() &&
1651			cfil_sock_data_pending(&so->so_snd) != 0)
1652			CFIL_LOG(LOG_INFO,
1653				"so %llx ignore SS_CANTSENDMORE",
1654				(uint64_t)VM_KERNEL_ADDRPERM(so));
1655		else
1656#endif /* CONTENT_FILTER */
1657			return (EPIPE);
1658	}
1659	if (so->so_error) {
1660		error = so->so_error;
1661		so->so_error = 0;
1662		return (error);
1663	}
1664
1665	if ((so->so_state & SS_ISCONNECTED) == 0) {
1666		if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
1667			if (((so->so_state & SS_ISCONFIRMING) == 0) &&
1668			    (resid != 0 || clen == 0)) {
1669#if MPTCP
1670				/*
1671				 * MPTCP Fast Join sends data before the
1672				 * socket is truly connected.
1673				 */
1674				if ((so->so_flags & (SOF_MP_SUBFLOW |
1675					SOF_MPTCP_FASTJOIN)) !=
1676				    (SOF_MP_SUBFLOW | SOF_MPTCP_FASTJOIN))
1677#endif /* MPTCP */
1678				return (ENOTCONN);
1679			}
1680		} else if (addr == 0 && !(flags&MSG_HOLD)) {
1681			return ((so->so_proto->pr_flags & PR_CONNREQUIRED) ?
1682			    ENOTCONN : EDESTADDRREQ);
1683		}
1684	}
1685	if (so->so_flags & SOF_ENABLE_MSGS)
1686		space = msgq_sbspace(so, control);
1687	else
1688		space = sbspace(&so->so_snd);
1689
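	/*
	 * Out-of-band sends get a small allowance beyond the normal send
	 * buffer space.
	 */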
1690	if (flags & MSG_OOB)
1691		space += 1024;
1692	if ((atomic && resid > so->so_snd.sb_hiwat) ||
1693	    clen > so->so_snd.sb_hiwat)
1694		return (EMSGSIZE);
1695
1696	if ((space < resid + clen &&
1697	    (atomic || space < (int32_t)so->so_snd.sb_lowat || space < clen)) ||
1698	    (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
1699		if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
1700		    assumelock) {
1701			return (EWOULDBLOCK);
1702		}
1703		sbunlock(&so->so_snd, TRUE);	/* keep socket locked */
1704		*sblocked = 0;
1705		error = sbwait(&so->so_snd);
1706		if (error) {
1707			if (so->so_flags & SOF_DEFUNCT)
1708				goto defunct;
1709			return (error);
1710		}
1711		goto restart;
1712	}
1713	return (0);
1714}
1715
1716/*
1717 * Send on a socket.
1718 * If send must go all at once and message is larger than
1719 * send buffering, then hard error.
1720 * Lock against other senders.
1721 * If must go all at once and not enough room now, then
1722 * inform user that this would block and do nothing.
1723 * Otherwise, if nonblocking, send as much as possible.
1724 * The data to be sent is described by "uio" if nonzero,
1725 * otherwise by the mbuf chain "top" (which must be null
1726 * if uio is not).  Data provided in mbuf chain must be small
1727 * enough to send all at once.
1728 *
1729 * Returns nonzero on error, timeout or signal; callers
1730 * must check for short counts if EINTR/ERESTART are returned.
1731 * Data and control buffers are freed on return.
1732 * Experiment:
1733 * MSG_HOLD: go thru most of sosend(), but just enqueue the mbuf
1734 * MSG_SEND: go thru as for MSG_HOLD on current fragment, then
1735 *  point at the mbuf chain being constructed and go from there.
1736 *
1737 * Returns:	0			Success
1738 *		EOPNOTSUPP
1739 *		EINVAL
1740 *		ENOBUFS
1741 *	uiomove:EFAULT
1742 *	sosendcheck:EPIPE
1743 *	sosendcheck:EWOULDBLOCK
1744 *	sosendcheck:EINTR
1745 *	sosendcheck:EBADF
1746 *	sosendcheck:EINTR
1747 *	sosendcheck:???			[value from so_error]
1748 *	<pru_send>:ECONNRESET[TCP]
1749 *	<pru_send>:EINVAL[TCP]
1750 *	<pru_send>:ENOBUFS[TCP]
1751 *	<pru_send>:EADDRINUSE[TCP]
1752 *	<pru_send>:EADDRNOTAVAIL[TCP]
1753 *	<pru_send>:EAFNOSUPPORT[TCP]
1754 *	<pru_send>:EACCES[TCP]
1755 *	<pru_send>:EAGAIN[TCP]
1756 *	<pru_send>:EPERM[TCP]
1757 *	<pru_send>:EMSGSIZE[TCP]
1758 *	<pru_send>:EHOSTUNREACH[TCP]
1759 *	<pru_send>:ENETUNREACH[TCP]
1760 *	<pru_send>:ENETDOWN[TCP]
1761 *	<pru_send>:ENOMEM[TCP]
1762 *	<pru_send>:ENOBUFS[TCP]
1763 *	<pru_send>:???[TCP]		[ignorable: mostly IPSEC/firewall/DLIL]
1764 *	<pru_send>:EINVAL[AF_UNIX]
1765 *	<pru_send>:EOPNOTSUPP[AF_UNIX]
1766 *	<pru_send>:EPIPE[AF_UNIX]
1767 *	<pru_send>:ENOTCONN[AF_UNIX]
1768 *	<pru_send>:EISCONN[AF_UNIX]
1769 *	<pru_send>:???[AF_UNIX]		[whatever a filter author chooses]
1770 *	<sf_data_out>:???		[whatever a filter author chooses]
1771 *
1772 * Notes:	Other <pru_send> returns depend on the protocol family; all
1773 *		<sf_data_out> returns depend on what the filter author causes
1774 *		their filter to return.
1775 */
1776int
1777sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
1778    struct mbuf *top, struct mbuf *control, int flags)
1779{
1780	struct mbuf **mp;
1781	struct mbuf *m, *freelist = NULL;
1782	user_ssize_t space, len, resid;
1783	int clen = 0, error, dontroute, mlen, sendflags;
1784	int atomic = sosendallatonce(so) || top;
1785	int sblocked = 0;
1786	struct proc *p = current_proc();
1787	struct mbuf *control_copy = NULL;
1788
1789	if (uio != NULL)
1790		resid = uio_resid(uio);
1791	else
1792		resid = top->m_pkthdr.len;
1793
1794	KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
1795	    so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
1796
1797	socket_lock(so, 1);
1798
1799	/*
1800	 * Re-injection should not affect process accounting
1801	 */
	if ((flags & MSG_SKIPCFIL) == 0) {
		so_update_last_owner_locked(so, p);
		so_update_policy(so);

#if NECP
		so_update_necp_policy(so, NULL, addr);
#endif /* NECP */
	}
1810
1811	if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
1812		error = EOPNOTSUPP;
1813		socket_unlock(so, 1);
1814		goto out;
1815	}
1816
1817	/*
1818	 * In theory resid should be unsigned.
1819	 * However, space must be signed, as it might be less than 0
1820	 * if we over-committed, and we must use a signed comparison
1821	 * of space and resid.  On the other hand, a negative resid
1822	 * causes us to loop sending 0-length segments to the protocol.
1823	 *
1824	 * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
1825	 * But it will be used by sockets doing message delivery.
1826	 *
1827	 * Note: We limit resid to be a positive int value as we use
 * imin() to set bytes_to_copy; see rdar://14558484.
1829	 */
1830	if (resid < 0 || resid > INT_MAX || (so->so_type == SOCK_STREAM &&
1831	    !(so->so_flags & SOF_ENABLE_MSGS) && (flags & MSG_EOR))) {
1832		error = EINVAL;
1833		socket_unlock(so, 1);
1834		goto out;
1835	}
1836
1837	dontroute = (flags & MSG_DONTROUTE) &&
1838	    (so->so_options & SO_DONTROUTE) == 0 &&
1839	    (so->so_proto->pr_flags & PR_ATOMIC);
1840	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
1841
1842	if (control != NULL)
1843		clen = control->m_len;
1844
1845	do {
1846		error = sosendcheck(so, addr, resid, clen, atomic, flags,
1847		    &sblocked, control);
1848		if (error)
1849			goto release;
1850
1851		mp = &top;
1852		if (so->so_flags & SOF_ENABLE_MSGS)
1853			space = msgq_sbspace(so, control);
1854		else
1855			space = sbspace(&so->so_snd) - clen;
1856		space += ((flags & MSG_OOB) ? 1024 : 0);
1857
1858		do {
1859			if (uio == NULL) {
1860				/*
1861				 * Data is prepackaged in "top".
1862				 */
1863				resid = 0;
1864				if (flags & MSG_EOR)
1865					top->m_flags |= M_EOR;
1866			} else {
1867				int chainlength;
1868				int bytes_to_copy;
1869				boolean_t jumbocl;
1870				boolean_t bigcl;
1871
1872				bytes_to_copy = imin(resid, space);
1873
1874				if (sosendminchain > 0)
1875					chainlength = 0;
1876				else
1877					chainlength = sosendmaxchain;
1878
1879				/*
				 * Use big 4 KB clusters only when the outgoing
				 * interface does not prefer 2 KB clusters
1882				 */
1883				bigcl =
1884				    !(so->so_flags1 & SOF1_IF_2KCL) ||
1885				    sosendbigcl_ignore_capab;
1886
1887				/*
1888				 * Attempt to use larger than system page-size
1889				 * clusters for large writes only if there is
1890				 * a jumbo cluster pool and if the socket is
1891				 * marked accordingly.
1892				 */
1893				jumbocl = sosendjcl && njcl > 0 &&
1894				    ((so->so_flags & SOF_MULTIPAGES) ||
1895				    sosendjcl_ignore_capab) &&
1896				    bigcl;
1897
1898				socket_unlock(so, 0);
1899
1900				do {
1901					int num_needed;
1902					int hdrs_needed = (top == NULL) ? 1 : 0;
1903
1904					/*
					 * Try to maintain a local cache of
					 * mbuf clusters needed to complete
					 * this write.  The list is further
					 * limited to the number currently
					 * needed to fill the socket.  This
					 * mechanism allows a large number of
					 * mbufs/clusters to be grabbed under
					 * a single mbuf lock... if we can't
					 * get any clusters, then fall back to
					 * trying for mbufs.  If we fail early
					 * (or miscalculate the number needed)
					 * make sure to release any clusters
					 * we haven't yet consumed.
1918					 */
1919					if (freelist == NULL &&
1920					    bytes_to_copy > MBIGCLBYTES &&
1921					    jumbocl) {
1922						num_needed =
1923						    bytes_to_copy / M16KCLBYTES;
1924
1925						if ((bytes_to_copy -
1926						    (num_needed * M16KCLBYTES))
1927						    >= MINCLSIZE)
1928							num_needed++;
1929
1930						freelist =
1931						    m_getpackets_internal(
1932						    (unsigned int *)&num_needed,
1933						    hdrs_needed, M_WAIT, 0,
1934						    M16KCLBYTES);
1935						/*
1936						 * Fall back to 4K cluster size
1937						 * if allocation failed
1938						 */
1939					}
1940
1941					if (freelist == NULL &&
1942					    bytes_to_copy > MCLBYTES &&
1943					    bigcl) {
1944						num_needed =
1945						    bytes_to_copy / MBIGCLBYTES;
1946
1947						if ((bytes_to_copy -
1948						    (num_needed * MBIGCLBYTES)) >=
1949						    MINCLSIZE)
1950							num_needed++;
1951
1952						freelist =
1953						    m_getpackets_internal(
1954						    (unsigned int *)&num_needed,
1955						    hdrs_needed, M_WAIT, 0,
1956						    MBIGCLBYTES);
1957						/*
1958						 * Fall back to cluster size
1959						 * if allocation failed
1960						 */
1961					}
1962
1963					if (freelist == NULL &&
1964					    bytes_to_copy > MINCLSIZE) {
1965						num_needed =
1966						    bytes_to_copy / MCLBYTES;
1967
1968						if ((bytes_to_copy -
1969						    (num_needed * MCLBYTES)) >=
1970						    MINCLSIZE)
1971							num_needed++;
1972
1973						freelist =
1974						    m_getpackets_internal(
1975						    (unsigned int *)&num_needed,
1976						    hdrs_needed, M_WAIT, 0,
1977						    MCLBYTES);
1978						/*
1979						 * Fall back to a single mbuf
1980						 * if allocation failed
1981						 */
1982					}
1983
1984					if (freelist == NULL) {
1985						if (top == NULL)
1986							MGETHDR(freelist,
1987							    M_WAIT, MT_DATA);
1988						else
1989							MGET(freelist,
1990							    M_WAIT, MT_DATA);
1991
1992						if (freelist == NULL) {
1993							error = ENOBUFS;
1994							socket_lock(so, 0);
1995							goto release;
1996						}
1997						/*
1998						 * For datagram protocols,
1999						 * leave room for protocol
2000						 * headers in first mbuf.
2001						 */
2002						if (atomic && top == NULL &&
2003						    bytes_to_copy < MHLEN) {
2004							MH_ALIGN(freelist,
2005							    bytes_to_copy);
2006						}
2007					}
2008					m = freelist;
2009					freelist = m->m_next;
2010					m->m_next = NULL;
2011
2012					if ((m->m_flags & M_EXT))
2013						mlen = m->m_ext.ext_size;
2014					else if ((m->m_flags & M_PKTHDR))
2015						mlen =
2016						    MHLEN - m_leadingspace(m);
2017					else
2018						mlen = MLEN;
2019					len = imin(mlen, bytes_to_copy);
2020
2021					chainlength += len;
2022
2023					space -= len;
2024
2025					error = uiomove(mtod(m, caddr_t),
2026					    len, uio);
2027
2028					resid = uio_resid(uio);
2029
2030					m->m_len = len;
2031					*mp = m;
2032					top->m_pkthdr.len += len;
2033					if (error)
2034						break;
2035					mp = &m->m_next;
2036					if (resid <= 0) {
2037						if (flags & MSG_EOR)
2038							top->m_flags |= M_EOR;
2039						break;
2040					}
2041					bytes_to_copy = min(resid, space);
2042
2043				} while (space > 0 &&
2044				    (chainlength < sosendmaxchain || atomic ||
2045				    resid < MINCLSIZE));
2046
2047				socket_lock(so, 0);
2048
2049				if (error)
2050					goto release;
2051			}
2052
2053			if (flags & (MSG_HOLD|MSG_SEND)) {
2054				/* Enqueue for later, go away if HOLD */
2055				struct mbuf *mb1;
2056				if (so->so_temp && (flags & MSG_FLUSH)) {
2057					m_freem(so->so_temp);
2058					so->so_temp = NULL;
2059				}
2060				if (so->so_temp)
2061					so->so_tail->m_next = top;
2062				else
2063					so->so_temp = top;
2064				mb1 = top;
2065				while (mb1->m_next)
2066					mb1 = mb1->m_next;
2067				so->so_tail = mb1;
2068				if (flags & MSG_HOLD) {
2069					top = NULL;
2070					goto release;
2071				}
2072				top = so->so_temp;
2073			}
2074			if (dontroute)
2075				so->so_options |= SO_DONTROUTE;
2076
2077			/* Compute flags here, for pru_send and NKEs */
2078			sendflags = (flags & MSG_OOB) ? PRUS_OOB :
2079			    /*
			     * If the user set MSG_EOF, the protocol
			     * understands this flag, and there is nothing
			     * left to send, then use PRU_SEND_EOF instead
			     * of PRU_SEND.
2083			     */
2084			    ((flags & MSG_EOF) &&
2085			     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
2086			     (resid <= 0)) ? PRUS_EOF :
2087			     /* If there is more to send set PRUS_MORETOCOME */
2088			     (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
2089
2090			if ((flags & MSG_SKIPCFIL) == 0) {
2091				/*
2092				 * Socket filter processing
2093				 */
2094				error = sflt_data_out(so, addr, &top,
2095				    &control, (sendflags & MSG_OOB) ?
2096				    sock_data_filt_flag_oob : 0);
2097				if (error) {
2098					if (error == EJUSTRETURN) {
2099						error = 0;
2100						clen = 0;
2101						control = NULL;
2102						top = NULL;
2103					}
2104					goto release;
2105				}
2106#if CONTENT_FILTER
2107				/*
2108				 * Content filter processing
2109				 */
2110				error = cfil_sock_data_out(so, addr, top,
2111				   control, (sendflags & MSG_OOB) ?
2112				    sock_data_filt_flag_oob : 0);
2113				if (error) {
2114					if (error == EJUSTRETURN) {
2115						error = 0;
2116						clen = 0;
2117						control = NULL;
2118						top = NULL;
2119						}
2120					goto release;
2121				}
2122#endif /* CONTENT_FILTER */
2123			}
2124			if (so->so_flags & SOF_ENABLE_MSGS) {
2125				/*
2126				 * Make a copy of control mbuf,
2127				 * so that msg priority can be
2128				 * passed to subsequent mbufs.
2129				 */
2130				control_copy = m_dup(control, M_NOWAIT);
2131			}
2132			error = (*so->so_proto->pr_usrreqs->pru_send)
2133			    (so, sendflags, top, addr, control, p);
2134
2135			if (flags & MSG_SEND)
2136				so->so_temp = NULL;
2137
2138			if (dontroute)
2139				so->so_options &= ~SO_DONTROUTE;
2140
2141			clen = 0;
2142			control = control_copy;
2143			control_copy = NULL;
2144			top = NULL;
2145			mp = &top;
2146			if (error)
2147				goto release;
2148		} while (resid && space > 0);
2149	} while (resid);
2150
2151release:
2152	if (sblocked)
2153		sbunlock(&so->so_snd, FALSE);	/* will unlock socket */
2154	else
2155		socket_unlock(so, 1);
2156out:
2157	if (top != NULL)
2158		m_freem(top);
2159	if (control != NULL)
2160		m_freem(control);
2161	if (freelist != NULL)
2162		m_freem_list(freelist);
2163	if (control_copy != NULL)
2164		m_freem(control_copy);
2165
2166	KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid, so->so_snd.sb_cc,
2167	    space, error);
2168
2169	return (error);
2170}
2171
2172int
2173sosend_list(struct socket *so, struct sockaddr *addr, struct uio **uioarray,
2174     u_int uiocnt, struct mbuf *top, struct mbuf *control, int flags)
2175{
2176	struct mbuf *m, *freelist = NULL;
2177	user_ssize_t len, resid;
2178	int clen = 0, error, dontroute, mlen;
2179	int atomic = sosendallatonce(so) || top;
2180	int sblocked = 0;
2181	struct proc *p = current_proc();
2182	u_int uiofirst = 0;
2183	u_int uiolast = 0;
2184
2185	KERNEL_DEBUG((DBG_FNC_SOSEND_LIST | DBG_FUNC_START), so, uiocnt,
2186	    so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2187
2188	if (so->so_type != SOCK_DGRAM) {
2189		error = EINVAL;
2190		goto out;
2191	}
2192	if (atomic == 0) {
2193		error = EINVAL;
2194		goto out;
2195	}
2196	if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
2197		error = EPROTONOSUPPORT;
2198		goto out;
2199	}
2200	if (flags & ~(MSG_DONTWAIT | MSG_NBIO)) {
2201		error = EINVAL;
2202		goto out;
2203	}
2204	if (uioarray != NULL)
2205		resid = uio_array_resid(uioarray, uiocnt);
2206	else
2207		resid = mbuf_pkt_list_len(top);
2208
2209	/*
2210	 * In theory resid should be unsigned.
2211	 * However, space must be signed, as it might be less than 0
2212	 * if we over-committed, and we must use a signed comparison
2213	 * of space and resid.  On the other hand, a negative resid
2214	 * causes us to loop sending 0-length segments to the protocol.
2215	 *
2216	 * Note: We limit resid to be a positive int value as we use
2217	 * imin() to set bytes_to_copy -- radr://14558484
2218	 */
2219	if (resid < 0 || resid > INT_MAX) {
2220		error = EINVAL;
2221		goto out;
2222	}
2223	/*
2224	 * Disallow functionality not currently supported
	 * Note: Will need to handle arrays of addresses and controls
2226	 */
2227	if (addr != NULL) {
2228		printf("%s addr not supported\n", __func__);
2229		error = EOPNOTSUPP;
2230		goto out;
2231	}
2232	if (control != NULL) {
2233		printf("%s control not supported\n", __func__);
2234		error = EOPNOTSUPP;
2235		goto out;
2236	}
2237
2238	socket_lock(so, 1);
2239	so_update_last_owner_locked(so, p);
2240	so_update_policy(so);
2241
2242#if NECP
2243	so_update_necp_policy(so, NULL, addr);
2244#endif /* NECP */
2245
2246	dontroute = (flags & MSG_DONTROUTE) &&
2247	    (so->so_options & SO_DONTROUTE) == 0 &&
2248	    (so->so_proto->pr_flags & PR_ATOMIC);
2249	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2250
2251	if (control != NULL)
2252		clen = control->m_len;
2253
2254	error = sosendcheck(so, addr, resid, clen, atomic, flags,
2255	    &sblocked, control);
2256	if (error)
2257		goto release;
2258
2259	do {
2260		int i;
2261
2262		if (uioarray == NULL) {
2263			/*
2264			 * Data is prepackaged in "top".
2265			 */
2266			resid = 0;
2267		} else {
2268			int num_needed = 0;
2269			int chainlength;
2270			size_t maxpktlen = 0;
2271
2272			if (sosendminchain > 0)
2273				chainlength = 0;
2274			else
2275				chainlength = sosendmaxchain;
2276
2277			socket_unlock(so, 0);
2278
2279			/*
			 * Find a set of uios that fits in a reasonable number
2281			 * of mbuf packets
2282			 */
2283			for (i = uiofirst; i < uiocnt; i++) {
2284				struct uio *auio = uioarray[i];
2285
2286				len = uio_resid(auio);
2287
2288				/* Do nothing for empty messages */
2289				if (len == 0)
2290					continue;
2291
2292				num_needed += 1;
2293				uiolast += 1;
2294
2295				if (len > maxpktlen)
2296					maxpktlen = len;
2297
2298				chainlength += len;
2299				if (chainlength > sosendmaxchain)
2300					break;
2301			}
2302			/*
2303			 * Nothing left to send
2304			 */
2305			if (num_needed == 0) {
2306				socket_lock(so, 0);
2307				break;
2308			}
2309			/*
2310			 * Allocate the mbuf packets at once
2311			 */
2312			freelist = m_allocpacket_internal(
2313			    (unsigned int *)&num_needed,
2314			    maxpktlen, NULL, M_WAIT, 1, 0);
2315
2316			if (freelist == NULL) {
2317				socket_lock(so, 0);
2318				error = ENOMEM;
2319				goto release;
2320			}
2321			/*
2322			 * Copy each uio of the set into its own mbuf packet
2323			 */
2324			for (i = uiofirst, m = freelist;
2325			    i < uiolast && m != NULL;
2326			    i++) {
2327				int bytes_to_copy;
2328				struct mbuf *n;
2329				struct uio *auio = uioarray[i];
2330
2331				bytes_to_copy = uio_resid(auio);
2332
2333				/* Do nothing for empty messages */
2334				if (bytes_to_copy == 0)
2335					continue;
2336
2337				for (n = m; n != NULL; n = n->m_next) {
2338					mlen = mbuf_maxlen(n);
2339
2340					len = imin(mlen, bytes_to_copy);
2341
2342					/*
2343					 * Note: uiomove() decrements the iovec
2344					 * length
2345					 */
2346					error = uiomove(mtod(n, caddr_t),
2347					    len, auio);
2348					if (error != 0)
2349						break;
2350					n->m_len = len;
2351					m->m_pkthdr.len += len;
2352
2353					VERIFY(m->m_pkthdr.len <= maxpktlen);
2354
2355					bytes_to_copy -= len;
2356					resid -= len;
2357				}
2358				if (m->m_pkthdr.len == 0) {
2359					printf("%s so %llx pkt %llx len null\n",
2360					    __func__,
2361					    (uint64_t)VM_KERNEL_ADDRPERM(so),
2362					    (uint64_t)VM_KERNEL_ADDRPERM(m));
2363				}
2364				if (error != 0)
2365					break;
2366				m = m->m_nextpkt;
2367			}
2368
2369			socket_lock(so, 0);
2370
2371			if (error)
2372				goto release;
2373			top = freelist;
2374			freelist = NULL;
2375		}
2376
2377		if (dontroute)
2378			so->so_options |= SO_DONTROUTE;
2379
2380		if ((flags & MSG_SKIPCFIL) == 0) {
2381			struct mbuf **prevnextp = NULL;
2382
2383			for (i = uiofirst, m = top;
2384			    i < uiolast && m != NULL;
2385			    i++) {
2386				struct mbuf *nextpkt = m->m_nextpkt;
2387
2388				/*
2389				 * Socket filter processing
2390				 */
2391				error = sflt_data_out(so, addr, &m,
2392				    &control, 0);
2393				if (error != 0 && error != EJUSTRETURN)
2394					goto release;
2395
2396#if CONTENT_FILTER
2397				if (error == 0) {
2398					/*
2399					 * Content filter processing
2400					 */
2401					error = cfil_sock_data_out(so, addr, m,
2402					   control, 0);
2403					if (error != 0 && error != EJUSTRETURN)
2404						goto release;
2405				}
2406#endif /* CONTENT_FILTER */
2407				/*
2408				 * Remove packet from the list when
2409				 * swallowed by a filter
2410				 */
2411				if (error == EJUSTRETURN) {
2412					error = 0;
2413					if (prevnextp != NULL)
2414						*prevnextp = nextpkt;
2415					else
2416						top = nextpkt;
2417				}
2418
2419				m = nextpkt;
2420				if (m != NULL)
2421					prevnextp = &m->m_nextpkt;
2422			}
2423		}
2424		if (top != NULL)
2425			error = (*so->so_proto->pr_usrreqs->pru_send_list)
2426			    (so, 0, top, addr, control, p);
2427
2428		if (dontroute)
2429			so->so_options &= ~SO_DONTROUTE;
2430
2431		clen = 0;
2432		top = NULL;
2433		uiofirst = uiolast;
2434	} while (resid > 0 && error == 0);
2435release:
2436	if (sblocked)
2437		sbunlock(&so->so_snd, FALSE);	/* will unlock socket */
2438	else
2439		socket_unlock(so, 1);
2440out:
2441	if (top != NULL)
2442		m_freem(top);
2443	if (control != NULL)
2444		m_freem(control);
2445	if (freelist != NULL)
2446		m_freem_list(freelist);
2447
2448	KERNEL_DEBUG(DBG_FNC_SOSEND_LIST | DBG_FUNC_END, so, resid,
2449	    so->so_snd.sb_cc, 0, error);
2450
2451	return (error);
2452}
2453
2454/*
2455 * Implement receive operations on a socket.
2456 * We depend on the way that records are added to the sockbuf
2457 * by sbappend*.  In particular, each record (mbufs linked through m_next)
2458 * must begin with an address if the protocol so specifies,
2459 * followed by an optional mbuf or mbufs containing ancillary data,
2460 * and then zero or more mbufs of data.
2461 * In order to avoid blocking network interrupts for the entire time here,
2462 * we splx() while doing the actual copy to user space.
2463 * Although the sockbuf is locked, new data may still be appended,
2464 * and thus we must maintain consistency of the sockbuf during that time.
2465 *
2466 * The caller may receive the data as a single mbuf chain by supplying
2467 * an mbuf **mp0 for use in returning the chain.  The uio is then used
2468 * only for the count in uio_resid.
2469 *
2470 * Returns:	0			Success
2471 *		ENOBUFS
2472 *		ENOTCONN
2473 *		EWOULDBLOCK
2474 *	uiomove:EFAULT
2475 *	sblock:EWOULDBLOCK
2476 *	sblock:EINTR
2477 *	sbwait:EBADF
2478 *	sbwait:EINTR
2479 *	sodelayed_copy:EFAULT
2480 *	<pru_rcvoob>:EINVAL[TCP]
2481 *	<pru_rcvoob>:EWOULDBLOCK[TCP]
2482 *	<pru_rcvoob>:???
2483 *	<pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
2484 *	<pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
2485 *	<pr_domain->dom_externalize>:???
2486 *
2487 * Notes:	Additional return values from calls through <pru_rcvoob> and
2488 *		<pr_domain->dom_externalize> depend on protocols other than
2489 *		TCP or AF_UNIX, which are documented above.
2490 */
2491int
2492soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
2493    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2494{
2495	struct mbuf *m, **mp, *ml = NULL;
2496	struct mbuf *nextrecord, *free_list;
2497	int flags, error, offset;
2498	user_ssize_t len;
2499	struct protosw *pr = so->so_proto;
	int moff, type = 0;
2501	user_ssize_t orig_resid = uio_resid(uio);
2502	user_ssize_t delayed_copy_len;
2503	int can_delay;
2504	int need_event;
2505	struct proc *p = current_proc();
2506
2507	KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so, uio_resid(uio),
2508	    so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat);
2509
2510	/*
2511	 * Sanity check on the length passed by caller as we are making 'int'
2512	 * comparisons
2513	 */
2514	if (orig_resid < 0 || orig_resid > INT_MAX)
2515		return (EINVAL);
2516
2517	socket_lock(so, 1);
2518	so_update_last_owner_locked(so, p);
2519	so_update_policy(so);
2520
2521#ifdef MORE_LOCKING_DEBUG
2522	if (so->so_usecount == 1) {
2523		panic("%s: so=%x no other reference on socket\n", __func__, so);
2524		/* NOTREACHED */
2525	}
2526#endif
2527	mp = mp0;
2528	if (psa != NULL)
2529		*psa = NULL;
2530	if (controlp != NULL)
2531		*controlp = NULL;
2532	if (flagsp != NULL)
2533		flags = *flagsp &~ MSG_EOR;
2534	else
2535		flags = 0;
2536
2537	/*
2538	 * If a recv attempt is made on a previously-accepted socket
2539	 * that has been marked as inactive (disconnected), reject
2540	 * the request.
2541	 */
2542	if (so->so_flags & SOF_DEFUNCT) {
2543		struct sockbuf *sb = &so->so_rcv;
2544
2545		error = ENOTCONN;
2546		SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n",
2547		    __func__, proc_pid(p), (uint64_t)VM_KERNEL_ADDRPERM(so),
2548		    SOCK_DOM(so), SOCK_TYPE(so), error));
2549		/*
2550		 * This socket should have been disconnected and flushed
2551		 * prior to being returned from sodefunct(); there should
2552		 * be no data on its receive list, so panic otherwise.
2553		 */
2554		if (so->so_state & SS_DEFUNCT)
2555			sb_empty_assert(sb, __func__);
2556		socket_unlock(so, 1);
2557		return (error);
2558	}
2559
2560	/*
2561	 * When SO_WANTOOBFLAG is set we try to get out-of-band data
	 * regardless of the flags argument. Here is the case where
2563	 * out-of-band data is not inline.
2564	 */
2565	if ((flags & MSG_OOB) ||
2566	    ((so->so_options & SO_WANTOOBFLAG) != 0 &&
2567	    (so->so_options & SO_OOBINLINE) == 0 &&
2568	    (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
2569		m = m_get(M_WAIT, MT_DATA);
2570		if (m == NULL) {
2571			socket_unlock(so, 1);
2572			KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
2573			    ENOBUFS, 0, 0, 0, 0);
2574			return (ENOBUFS);
2575		}
2576		error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
2577		if (error)
2578			goto bad;
2579		socket_unlock(so, 0);
2580		do {
2581			error = uiomove(mtod(m, caddr_t),
2582			    imin(uio_resid(uio), m->m_len), uio);
2583			m = m_free(m);
2584		} while (uio_resid(uio) && error == 0 && m != NULL);
2585		socket_lock(so, 0);
2586bad:
2587		if (m != NULL)
2588			m_freem(m);
2589
2590		if ((so->so_options & SO_WANTOOBFLAG) != 0) {
2591			if (error == EWOULDBLOCK || error == EINVAL) {
2592				/*
2593				 * Let's try to get normal data:
2594				 * EWOULDBLOCK: out-of-band data not
				 * received yet. EINVAL: out-of-band data
2596				 * already read.
2597				 */
2598				error = 0;
2599				goto nooob;
2600			} else if (error == 0 && flagsp != NULL) {
2601				*flagsp |= MSG_OOB;
2602			}
2603		}
2604		socket_unlock(so, 1);
2605		KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
2606		    0, 0, 0, 0);
2607
2608		return (error);
2609	}
2610nooob:
2611	if (mp != NULL)
2612		*mp = NULL;
2613
2614	if (so->so_state & SS_ISCONFIRMING && uio_resid(uio)) {
2615		(*pr->pr_usrreqs->pru_rcvd)(so, 0);
2616	}
2617
2618	free_list = NULL;
2619	delayed_copy_len = 0;
2620restart:
2621#ifdef MORE_LOCKING_DEBUG
2622	if (so->so_usecount <= 1)
2623		printf("soreceive: sblock so=0x%llx ref=%d on socket\n",
2624		    (uint64_t)VM_KERNEL_ADDRPERM(so), so->so_usecount);
2625#endif
2626	/*
2627	 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
2628	 * and if so just return to the caller.  This could happen when
2629	 * soreceive() is called by a socket upcall function during the
2630	 * time the socket is freed.  The socket buffer would have been
2631	 * locked across the upcall, therefore we cannot put this thread
2632	 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
2633	 * we may livelock), because the lock on the socket buffer will
2634	 * only be released when the upcall routine returns to its caller.
2635	 * Because the socket has been officially closed, there can be
2636	 * no further read on it.
2637	 *
2638	 * A multipath subflow socket would have its SS_NOFDREF set by
2639	 * default, so check for SOF_MP_SUBFLOW socket flag; when the
2640	 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
2641	 */
2642	if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
2643	    (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
2644		socket_unlock(so, 1);
2645		return (0);
2646	}
2647
2648	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
2649	if (error) {
2650		socket_unlock(so, 1);
2651		KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
2652		    0, 0, 0, 0);
2653		return (error);
2654	}
2655
2656	m = so->so_rcv.sb_mb;
2657	/*
2658	 * If we have less data than requested, block awaiting more
2659	 * (subject to any timeout) if:
2660	 *   1. the current count is less than the low water mark, or
2661	 *   2. MSG_WAITALL is set, and it is possible to do the entire
2662	 *	receive operation at once if we block (resid <= hiwat).
2663	 *   3. MSG_DONTWAIT is not set
2664	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
2665	 * we have to do the receive in sections, and thus risk returning
2666	 * a short count if a timeout or signal occurs after we start.
2667	 */
2668	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
2669	    so->so_rcv.sb_cc < uio_resid(uio)) &&
2670	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
2671	    ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat)) &&
2672	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
2673		/*
2674		 * Panic if we notice inconsistencies in the socket's
2675		 * receive list; both sb_mb and sb_cc should correctly
2676		 * reflect the contents of the list, otherwise we may
2677		 * end up with false positives during select() or poll()
2678		 * which could put the application in a bad state.
2679		 */
2680		SB_MB_CHECK(&so->so_rcv);
2681
2682		if (so->so_error) {
2683			if (m != NULL)
2684				goto dontblock;
2685			error = so->so_error;
2686			if ((flags & MSG_PEEK) == 0)
2687				so->so_error = 0;
2688			goto release;
2689		}
2690		if (so->so_state & SS_CANTRCVMORE) {
2691#if CONTENT_FILTER
2692			/*
2693			 * Deal with half closed connections
2694			 */
2695			if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
2696				cfil_sock_data_pending(&so->so_rcv) != 0)
2697				CFIL_LOG(LOG_INFO,
2698					"so %llx ignore SS_CANTRCVMORE",
2699					(uint64_t)VM_KERNEL_ADDRPERM(so));
2700			else
2701#endif /* CONTENT_FILTER */
2702			if (m != NULL)
2703				goto dontblock;
2704			else
2705				goto release;
2706		}
2707		for (; m != NULL; m = m->m_next)
2708			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
2709				m = so->so_rcv.sb_mb;
2710				goto dontblock;
2711			}
2712		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
2713		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
2714			error = ENOTCONN;
2715			goto release;
2716		}
2717		if (uio_resid(uio) == 0)
2718			goto release;
2719		if ((so->so_state & SS_NBIO) ||
2720		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
2721			error = EWOULDBLOCK;
2722			goto release;
2723		}
2724		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
2725		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
2726		sbunlock(&so->so_rcv, TRUE);	/* keep socket locked */
2727#if EVEN_MORE_LOCKING_DEBUG
2728		if (socket_debug)
2729			printf("Waiting for socket data\n");
2730#endif
2731
2732		error = sbwait(&so->so_rcv);
2733#if EVEN_MORE_LOCKING_DEBUG
2734		if (socket_debug)
2735			printf("SORECEIVE - sbwait returned %d\n", error);
2736#endif
2737		if (so->so_usecount < 1) {
2738			panic("%s: after 2nd sblock so=%p ref=%d on socket\n",
2739			    __func__, so, so->so_usecount);
2740			/* NOTREACHED */
2741		}
2742		if (error) {
2743			socket_unlock(so, 1);
2744			KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
2745			    0, 0, 0, 0);
2746			return (error);
2747		}
2748		goto restart;
2749	}
2750dontblock:
2751	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
2752	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
2753	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
2754	nextrecord = m->m_nextpkt;
2755	if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
2756		KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
2757#if CONFIG_MACF_SOCKET_SUBSET
2758		/*
2759		 * Call the MAC framework for policy checking if we're in
2760		 * the user process context and the socket isn't connected.
2761		 */
2762		if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
2763			struct mbuf *m0 = m;
2764			/*
2765			 * Dequeue this record (temporarily) from the receive
2766			 * list since we're about to drop the socket's lock
2767			 * where a new record may arrive and be appended to
2768			 * the list.  Upon MAC policy failure, the record
2769			 * will be freed.  Otherwise, we'll add it back to
2770			 * the head of the list.  We cannot rely on SB_LOCK
2771			 * because append operation uses the socket's lock.
2772			 */
2773			do {
2774				m->m_nextpkt = NULL;
2775				sbfree(&so->so_rcv, m);
2776				m = m->m_next;
2777			} while (m != NULL);
2778			m = m0;
2779			so->so_rcv.sb_mb = nextrecord;
2780			SB_EMPTY_FIXUP(&so->so_rcv);
2781			SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
2782			SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
2783			socket_unlock(so, 0);
2784
2785			if (mac_socket_check_received(proc_ucred(p), so,
2786			    mtod(m, struct sockaddr *)) != 0) {
2787				/*
2788				 * MAC policy failure; free this record and
2789				 * process the next record (or block until
2790				 * one is available).  We have adjusted sb_cc
2791				 * and sb_mbcnt above so there is no need to
2792				 * call sbfree() again.
2793				 */
2794				do {
2795					m = m_free(m);
2796				} while (m != NULL);
2797				/*
2798				 * Clear SB_LOCK but don't unlock the socket.
2799				 * Process the next record or wait for one.
2800				 */
2801				socket_lock(so, 0);
2802				sbunlock(&so->so_rcv, TRUE); /* stay locked */
2803				goto restart;
2804			}
2805			socket_lock(so, 0);
2806			/*
2807			 * If the socket has been defunct'd, drop it.
2808			 */
2809			if (so->so_flags & SOF_DEFUNCT) {
2810				m_freem(m);
2811				error = ENOTCONN;
2812				goto release;
2813			}
2814			/*
2815			 * Re-adjust the socket receive list and re-enqueue
2816			 * the record in front of any packets which may have
2817			 * been appended while we dropped the lock.
2818			 */
2819			for (m = m0; m->m_next != NULL; m = m->m_next)
2820				sballoc(&so->so_rcv, m);
2821			sballoc(&so->so_rcv, m);
2822			if (so->so_rcv.sb_mb == NULL) {
2823				so->so_rcv.sb_lastrecord = m0;
2824				so->so_rcv.sb_mbtail = m;
2825			}
2826			m = m0;
2827			nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
2828			so->so_rcv.sb_mb = m;
2829			SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
2830			SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
2831		}
2832#endif /* CONFIG_MACF_SOCKET_SUBSET */
2833		orig_resid = 0;
2834		if (psa != NULL) {
2835			*psa = dup_sockaddr(mtod(m, struct sockaddr *),
2836			    mp0 == NULL);
2837			if ((*psa == NULL) && (flags & MSG_NEEDSA)) {
2838				error = EWOULDBLOCK;
2839				goto release;
2840			}
2841		}
2842		if (flags & MSG_PEEK) {
2843			m = m->m_next;
2844		} else {
2845			sbfree(&so->so_rcv, m);
2846			if (m->m_next == NULL && so->so_rcv.sb_cc != 0) {
2847				panic("%s: about to create invalid socketbuf",
2848				    __func__);
2849				/* NOTREACHED */
2850			}
2851			MFREE(m, so->so_rcv.sb_mb);
2852			m = so->so_rcv.sb_mb;
2853			if (m != NULL) {
2854				m->m_nextpkt = nextrecord;
2855			} else {
2856				so->so_rcv.sb_mb = nextrecord;
2857				SB_EMPTY_FIXUP(&so->so_rcv);
2858			}
2859		}
2860	}
2861
2862	/*
2863	 * Process one or more MT_CONTROL mbufs present before any data mbufs
2864	 * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
2865	 * just copy the data; if !MSG_PEEK, we call into the protocol to
2866	 * perform externalization.
2867	 */
2868	if (m != NULL && m->m_type == MT_CONTROL) {
2869		struct mbuf *cm = NULL, *cmn;
2870		struct mbuf **cme = &cm;
2871		struct sockbuf *sb_rcv = &so->so_rcv;
2872		struct mbuf **msgpcm = NULL;
2873
2874		/*
2875		 * Externalizing the control messages would require us to
2876		 * drop the socket's lock below.  Once we re-acquire the
2877		 * lock, the mbuf chain might change.  In order to preserve
2878		 * consistency, we unlink all control messages from the
2879		 * first mbuf chain in one shot and link them separately
2880		 * onto a different chain.
2881		 */
2882		do {
2883			if (flags & MSG_PEEK) {
2884				if (controlp != NULL) {
2885					if (*controlp == NULL) {
2886						msgpcm = controlp;
2887					}
2888					*controlp = m_copy(m, 0, m->m_len);
2889
2890					/*
2891					 * If we failed to allocate an mbuf,
2892					 * release any previously allocated
2893					 * mbufs for control data. Return
2894					 * an error. Keep the mbufs in the
					 * socket, as the MSG_PEEK flag is
					 * set.
2897					 */
2898					if (*controlp == NULL) {
2899						m_freem(*msgpcm);
2900						error = ENOBUFS;
2901						goto release;
2902					}
2903					controlp = &(*controlp)->m_next;
2904				}
2905				m = m->m_next;
2906			} else {
2907				m->m_nextpkt = NULL;
2908				sbfree(sb_rcv, m);
2909				sb_rcv->sb_mb = m->m_next;
2910				m->m_next = NULL;
2911				*cme = m;
2912				cme = &(*cme)->m_next;
2913				m = sb_rcv->sb_mb;
2914			}
2915		} while (m != NULL && m->m_type == MT_CONTROL);
2916
2917		if (!(flags & MSG_PEEK)) {
2918			if (sb_rcv->sb_mb != NULL) {
2919				sb_rcv->sb_mb->m_nextpkt = nextrecord;
2920			} else {
2921				sb_rcv->sb_mb = nextrecord;
2922				SB_EMPTY_FIXUP(sb_rcv);
2923			}
2924			if (nextrecord == NULL)
2925				sb_rcv->sb_lastrecord = m;
2926		}
2927
2928		SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
2929		SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");
2930
2931		while (cm != NULL) {
2932			int cmsg_type;
2933
2934			cmn = cm->m_next;
2935			cm->m_next = NULL;
2936			cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;
2937
2938			/*
2939			 * Call the protocol to externalize SCM_RIGHTS message
2940			 * and return the modified message to the caller upon
2941			 * success.  Otherwise, all other control messages are
2942			 * returned unmodified to the caller.  Note that we
2943			 * only get into this loop if MSG_PEEK is not set.
2944			 */
2945			if (pr->pr_domain->dom_externalize != NULL &&
2946			    cmsg_type == SCM_RIGHTS) {
2947				/*
2948				 * Release socket lock: see 3903171.  This
2949				 * would also allow more records to be appended
2950				 * to the socket buffer.  We still have SB_LOCK
2951				 * set on it, so we can be sure that the head
2952				 * of the mbuf chain won't change.
2953				 */
2954				socket_unlock(so, 0);
2955				error = (*pr->pr_domain->dom_externalize)(cm);
2956				socket_lock(so, 0);
2957			} else {
2958				error = 0;
2959			}
2960
2961			if (controlp != NULL && error == 0) {
2962				*controlp = cm;
2963				controlp = &(*controlp)->m_next;
2964				orig_resid = 0;
2965			} else {
2966				(void) m_free(cm);
2967			}
2968			cm = cmn;
2969		}
2970		/*
2971		 * Update the value of nextrecord in case we received new
2972		 * records when the socket was unlocked above for
2973		 * externalizing SCM_RIGHTS.
2974		 */
2975		if (m != NULL)
2976			nextrecord = sb_rcv->sb_mb->m_nextpkt;
2977		else
2978			nextrecord = sb_rcv->sb_mb;
2979		orig_resid = 0;
2980	}
2981
2982	/*
2983	 * If the socket is a TCP socket with message delivery
2984	 * enabled, then create a control msg to deliver the
2985	 * relative TCP sequence number for this data. Waiting
2986	 * until this point will protect against failures to
2987	 * allocate an mbuf for control msgs.
2988	 */
2989	if (so->so_type == SOCK_STREAM && SOCK_PROTO(so) == IPPROTO_TCP &&
2990	    (so->so_flags & SOF_ENABLE_MSGS) && controlp != NULL) {
2991		struct mbuf *seq_cm;
2992
2993		seq_cm = sbcreatecontrol((caddr_t)&m->m_pkthdr.msg_seq,
2994		    sizeof (uint32_t), SCM_SEQNUM, SOL_SOCKET);
2995		if (seq_cm == NULL) {
2996			/* unable to allocate a control mbuf */
2997			error = ENOBUFS;
2998			goto release;
2999		}
3000		*controlp = seq_cm;
3001		controlp = &seq_cm->m_next;
3002	}
3003
3004	if (m != NULL) {
3005		if (!(flags & MSG_PEEK)) {
3006			/*
3007			 * We get here because m points to an mbuf following
3008			 * any MT_SONAME or MT_CONTROL mbufs which have been
3009			 * processed above.  In any case, m should be pointing
3010			 * to the head of the mbuf chain, and the nextrecord
3011			 * should be either NULL or equal to m->m_nextpkt.
3012			 * See comments above about SB_LOCK.
3013			 */
3014			if (m != so->so_rcv.sb_mb ||
3015			    m->m_nextpkt != nextrecord) {
3016				panic("%s: post-control !sync so=%p m=%p "
3017				    "nextrecord=%p\n", __func__, so, m,
3018				    nextrecord);
3019				/* NOTREACHED */
3020			}
3021			if (nextrecord == NULL)
3022				so->so_rcv.sb_lastrecord = m;
3023		}
3024		type = m->m_type;
3025		if (type == MT_OOBDATA)
3026			flags |= MSG_OOB;
3027	} else {
3028		if (!(flags & MSG_PEEK)) {
3029			SB_EMPTY_FIXUP(&so->so_rcv);
3030		}
3031	}
3032	SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
3033	SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
3034
3035	moff = 0;
3036	offset = 0;
3037
3038	if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy)
3039		can_delay = 1;
3040	else
3041		can_delay = 0;
3042
3043	need_event = 0;
3044
3045	while (m != NULL &&
3046	    (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
3047		if (m->m_type == MT_OOBDATA) {
3048			if (type != MT_OOBDATA)
3049				break;
3050		} else if (type == MT_OOBDATA) {
3051			break;
3052		}
3053		/*
		 * Make sure to always set the MSG_OOB event when getting
		 * out-of-band data inline.
3056		 */
3057		if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3058		    (so->so_options & SO_OOBINLINE) != 0 &&
3059		    (so->so_state & SS_RCVATMARK) != 0) {
3060			flags |= MSG_OOB;
3061		}
3062		so->so_state &= ~SS_RCVATMARK;
3063		len = uio_resid(uio) - delayed_copy_len;
3064		if (so->so_oobmark && len > so->so_oobmark - offset)
3065			len = so->so_oobmark - offset;
3066		if (len > m->m_len - moff)
3067			len = m->m_len - moff;
3068		/*
3069		 * If mp is set, just pass back the mbufs.
3070		 * Otherwise copy them out via the uio, then free.
3071		 * Sockbuf must be consistent here (points to current mbuf,
3072		 * it points to next record) when we drop priority;
3073		 * we must note any additions to the sockbuf when we
3074		 * block interrupts again.
3075		 */
3076		if (mp == NULL) {
3077			SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
3078			SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
3079			if (can_delay && len == m->m_len) {
3080				/*
				 * Only delay the copy if we're consuming the
				 * mbuf, we're NOT in MSG_PEEK mode, and we
				 * have enough data to make it worthwhile to
				 * drop and retake the lock... can_delay
				 * reflects the latter two constraints; moff
				 * should always be zero in these cases.
3088				 */
3089				delayed_copy_len += len;
3090			} else {
3091				if (delayed_copy_len) {
3092					error = sodelayed_copy(so, uio,
3093					    &free_list, &delayed_copy_len);
3094
3095					if (error) {
3096						goto release;
3097					}
3098					/*
					 * We can only get here if MSG_PEEK is
					 * not set; therefore, m should point
					 * at the head of the rcv queue.  If it
					 * doesn't, something changed
					 * drastically while we were out from
					 * behind the lock in sodelayed_copy(),
					 * perhaps a RST on the stream.  In any
					 * event, the stream has been
					 * interrupted; it's probably best just
					 * to return whatever data we've moved
					 * and let the caller sort it out...
3110					 */
3111					if (m != so->so_rcv.sb_mb) {
3112						break;
3113					}
3114				}
3115				socket_unlock(so, 0);
3116				error = uiomove(mtod(m, caddr_t) + moff,
3117				    (int)len, uio);
3118				socket_lock(so, 0);
3119
3120				if (error)
3121					goto release;
3122			}
3123		} else {
3124			uio_setresid(uio, (uio_resid(uio) - len));
3125		}
3126		if (len == m->m_len - moff) {
3127			if (m->m_flags & M_EOR)
3128				flags |= MSG_EOR;
3129			if (flags & MSG_PEEK) {
3130				m = m->m_next;
3131				moff = 0;
3132			} else {
3133				nextrecord = m->m_nextpkt;
3134				sbfree(&so->so_rcv, m);
3135				m->m_nextpkt = NULL;
3136
3137				/*
3138				 * If this packet is an unordered packet
3139				 * (indicated by M_UNORDERED_DATA flag), remove
3140				 * the additional bytes added to the
3141				 * receive socket buffer size.
3142				 */
3143				if ((so->so_flags & SOF_ENABLE_MSGS) &&
3144				    m->m_len &&
3145				    (m->m_flags & M_UNORDERED_DATA) &&
3146				    sbreserve(&so->so_rcv,
3147				    so->so_rcv.sb_hiwat - m->m_len)) {
3148					if (so->so_msg_state->msg_uno_bytes >
3149					    m->m_len) {
3150						so->so_msg_state->
3151						    msg_uno_bytes -= m->m_len;
3152					} else {
3153						so->so_msg_state->
3154						    msg_uno_bytes = 0;
3155					}
3156					m->m_flags &= ~M_UNORDERED_DATA;
3157				}
3158
3159				if (mp != NULL) {
3160					*mp = m;
3161					mp = &m->m_next;
3162					so->so_rcv.sb_mb = m = m->m_next;
3163					*mp = NULL;
3164				} else {
3165					if (free_list == NULL)
3166						free_list = m;
3167					else
3168						ml->m_next = m;
3169					ml = m;
3170					so->so_rcv.sb_mb = m = m->m_next;
3171					ml->m_next = NULL;
3172				}
3173				if (m != NULL) {
3174					m->m_nextpkt = nextrecord;
3175					if (nextrecord == NULL)
3176						so->so_rcv.sb_lastrecord = m;
3177				} else {
3178					so->so_rcv.sb_mb = nextrecord;
3179					SB_EMPTY_FIXUP(&so->so_rcv);
3180				}
3181				SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
3182				SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
3183			}
3184		} else {
3185			if (flags & MSG_PEEK) {
3186				moff += len;
3187			} else {
3188				if (mp != NULL) {
3189					int copy_flag;
3190
3191					if (flags & MSG_DONTWAIT)
3192						copy_flag = M_DONTWAIT;
3193					else
3194						copy_flag = M_WAIT;
3195					*mp = m_copym(m, 0, len, copy_flag);
3196					/*
3197					 * Failed to allocate an mbuf?
3198					 * Adjust uio_resid back, it was
3199					 * adjusted down by len bytes which
3200					 * we didn't copy over.
3201					 */
3202					if (*mp == NULL) {
3203						uio_setresid(uio,
3204						    (uio_resid(uio) + len));
3205						break;
3206					}
3207				}
3208				m->m_data += len;
3209				m->m_len -= len;
3210				so->so_rcv.sb_cc -= len;
3211			}
3212		}
3213		if (so->so_oobmark) {
3214			if ((flags & MSG_PEEK) == 0) {
3215				so->so_oobmark -= len;
3216				if (so->so_oobmark == 0) {
3217					so->so_state |= SS_RCVATMARK;
3218					/*
3219					 * delay posting the actual event until
3220					 * after any delayed copy processing
3221					 * has finished
3222					 */
3223					need_event = 1;
3224					break;
3225				}
3226			} else {
3227				offset += len;
3228				if (offset == so->so_oobmark)
3229					break;
3230			}
3231		}
3232		if (flags & MSG_EOR)
3233			break;
3234		/*
3235		 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
3236		 * (for non-atomic socket), we must not quit until
3237		 * "uio->uio_resid == 0" or an error termination.
3238		 * If a signal/timeout occurs, return with a short
3239		 * count but without error.  Keep sockbuf locked
3240		 * against other readers.
3241		 */
3242		while (flags & (MSG_WAITALL|MSG_WAITSTREAM) && m == NULL &&
3243		    (uio_resid(uio) - delayed_copy_len) > 0 &&
3244		    !sosendallatonce(so) && !nextrecord) {
3245			if (so->so_error || ((so->so_state & SS_CANTRCVMORE)
3246#if CONTENT_FILTER
3247			    && cfil_sock_data_pending(&so->so_rcv) == 0
3248#endif /* CONTENT_FILTER */
3249			    ))
3250				goto release;
3251
3252			/*
3253			 * Depending on the protocol (e.g. TCP), the following
3254			 * might cause the socket lock to be dropped and later
3255			 * be reacquired, and more data could have arrived and
3256			 * have been appended to the receive socket buffer by
3257			 * the time it returns.  Therefore, we only sleep in
3258			 * sbwait() below if and only if the socket buffer is
3259			 * empty, in order to avoid a false sleep.
3260			 */
3261			if (pr->pr_flags & PR_WANTRCVD && so->so_pcb &&
3262			    (((struct inpcb *)so->so_pcb)->inp_state !=
3263			    INPCB_STATE_DEAD))
3264				(*pr->pr_usrreqs->pru_rcvd)(so, flags);
3265
3266			SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
3267			SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
3268
3269			if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
3270				error = 0;
3271				goto release;
3272			}
3273			/*
			 * We have to wait until after we get back from the
			 * sbwait to do the copy, because we will drop the
			 * lock if we have enough data that has been
			 * delayed... by dropping the lock we open up a window
			 * allowing the netisr thread to process the incoming
			 * packets and to change the state of this socket...
			 * we're issuing the sbwait because the socket is
			 * empty and we're expecting the netisr thread to wake
			 * us up when more packets arrive; if we allow that
			 * processing to happen and then sbwait, we could
			 * stall forever with packets sitting in the socket if
			 * no further packets arrive from the remote side.
			 *
			 * We want to copy before we've collected all the data
			 * to satisfy this request, to allow the copy to
			 * overlap the incoming packet processing on an MP
			 * system.
3290			 */
3291			if (delayed_copy_len > sorecvmincopy &&
3292			    (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
3293				error = sodelayed_copy(so, uio,
3294				    &free_list, &delayed_copy_len);
3295
3296				if (error)
3297					goto release;
3298			}
3299			m = so->so_rcv.sb_mb;
3300			if (m != NULL) {
3301				nextrecord = m->m_nextpkt;
3302			}
3303			SB_MB_CHECK(&so->so_rcv);
3304		}
3305	}
3306#ifdef MORE_LOCKING_DEBUG
3307	if (so->so_usecount <= 1) {
3308		panic("%s: after big while so=%p ref=%d on socket\n",
3309		    __func__, so, so->so_usecount);
3310		/* NOTREACHED */
3311	}
3312#endif
3313
3314	if (m != NULL && pr->pr_flags & PR_ATOMIC) {
3315		if (so->so_options & SO_DONTTRUNC) {
3316			flags |= MSG_RCVMORE;
3317		} else {
3318			flags |= MSG_TRUNC;
3319			if ((flags & MSG_PEEK) == 0)
3320				(void) sbdroprecord(&so->so_rcv);
3321		}
3322	}
3323
3324	/*
3325	 * pru_rcvd below (for TCP) may cause more data to be received
3326	 * if the socket lock is dropped prior to sending the ACK; some
3327	 * legacy OpenTransport applications don't handle this well
3328	 * (if it receives less data than requested while MSG_HAVEMORE
3329	 * is set), and so we set the flag now based on what we know
3330	 * prior to calling pru_rcvd.
3331	 */
3332	if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0)
3333		flags |= MSG_HAVEMORE;
3334
3335	if ((flags & MSG_PEEK) == 0) {
3336		if (m == NULL) {
3337			so->so_rcv.sb_mb = nextrecord;
3338			/*
3339			 * First part is an inline SB_EMPTY_FIXUP().  Second
3340			 * part makes sure sb_lastrecord is up-to-date if
3341			 * there is still data in the socket buffer.
3342			 */
3343			if (so->so_rcv.sb_mb == NULL) {
3344				so->so_rcv.sb_mbtail = NULL;
3345				so->so_rcv.sb_lastrecord = NULL;
3346			} else if (nextrecord->m_nextpkt == NULL) {
3347				so->so_rcv.sb_lastrecord = nextrecord;
3348			}
3349			SB_MB_CHECK(&so->so_rcv);
3350		}
3351		SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
3352		SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
3353		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
3354			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
3355	}
3356
3357	if (delayed_copy_len) {
3358		error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
3359		if (error)
3360			goto release;
3361	}
3362	if (free_list != NULL) {
3363		m_freem_list(free_list);
3364		free_list = NULL;
3365	}
3366	if (need_event)
3367		postevent(so, 0, EV_OOB);
3368
3369	if (orig_resid == uio_resid(uio) && orig_resid &&
3370	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
3371		sbunlock(&so->so_rcv, TRUE);	/* keep socket locked */
3372		goto restart;
3373	}
3374
3375	if (flagsp != NULL)
3376		*flagsp |= flags;
3377release:
3378#ifdef MORE_LOCKING_DEBUG
3379	if (so->so_usecount <= 1) {
3380		panic("%s: release so=%p ref=%d on socket\n", __func__,
3381		    so, so->so_usecount);
3382		/* NOTREACHED */
3383	}
3384#endif
3385	if (delayed_copy_len)
3386		error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
3387
3388	if (free_list != NULL)
3389		m_freem_list(free_list);
3390
3391	sbunlock(&so->so_rcv, FALSE);	/* will unlock socket */
3392
3393	KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
3394	    so->so_rcv.sb_cc, 0, error);
3395
3396	return (error);
3397}
3398
3399/*
3400 * Returns:	0			Success
3401 *	uiomove:EFAULT
3402 */
3403static int
3404sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
3405    user_ssize_t *resid)
3406{
3407	int error = 0;
3408	struct mbuf *m;
3409
3410	m = *free_list;
3411
3412	socket_unlock(so, 0);
3413
3414	while (m != NULL && error == 0) {
3415		error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
3416		m = m->m_next;
3417	}
3418	m_freem_list(*free_list);
3419
3420	*free_list = NULL;
3421	*resid = 0;
3422
3423	socket_lock(so, 0);
3424
3425	return (error);
3426}
3427
3428int
3429soreceive_list(struct socket *so, struct sockaddr **psa, struct uio **uioarray,
3430	u_int uiocnt, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
3431{
3432	struct mbuf *m, **mp;
3433	struct mbuf *nextrecord;
3434	struct mbuf *ml = NULL, *free_list = NULL;
3435	int flags, error, offset;
3436	user_ssize_t len;
3437	struct protosw *pr = so->so_proto;
3438	user_ssize_t orig_resid, resid;
3439	struct proc *p = current_proc();
3440	struct uio *auio = NULL;
3441	int i = 0;
3442	int sblocked = 0;
3443
3444	KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_START,
3445	    so, uiocnt,
3446	    so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat);
3447
3448	mp = mp0;
3449	if (psa != NULL)
3450		*psa = NULL;
3451	if (controlp != NULL)
3452		*controlp = NULL;
3453	if (flagsp != NULL)
3454		flags = *flagsp &~ MSG_EOR;
3455	else
3456		flags = 0;
3457	/*
3458	 * Disallow functionality not currently supported
3459	 */
3460	if (mp0 != NULL) {
3461		printf("%s mp0 not supported\n", __func__);
3462		error = EOPNOTSUPP;
3463		goto out;
3464	}
3465	if (psa != NULL) {
3466		printf("%s sockaddr not supported\n", __func__);
3467		error = EOPNOTSUPP;
3468		goto out;
3469	}
3470	if (controlp != NULL) {
3471		printf("%s control not supported\n", __func__);
3472		error = EOPNOTSUPP;
3473		goto out;
3474	}
3475
3476	/*
3477	 * Sanity checks:
	 * - Only supports don't-wait flags
	 * - Only supports datagram sockets (could be extended to raw)
	 * - Must be atomic
	 * - Protocol must support packet chains
	 * - The uio array must not be NULL (should we panic if it is?)
3483	 */
3484	if (flags & ~(MSG_DONTWAIT | MSG_NBIO)) {
3485		printf("%s flags not supported\n", __func__);
3486		error = EOPNOTSUPP;
3487		goto out;
3488	}
3489	if (so->so_type != SOCK_DGRAM) {
3490		error = EINVAL;
3491		goto out;
3492	}
3493	if (sosendallatonce(so) == 0) {
3494		error = EINVAL;
3495		goto out;
3496	}
3497	if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
3498		error = EPROTONOSUPPORT;
3499		goto out;
3500	}
3501	if (uioarray == NULL) {
3502		printf("%s uioarray is NULL\n", __func__);
3503		error = EINVAL;
3504		goto out;
3505	}
3506	if (uiocnt == 0) {
3507		printf("%s uiocnt is 0\n", __func__);
3508		error = EINVAL;
3509		goto out;
3510	}
3511	/*
3512	 * Sanity check on the length passed by caller as we are making 'int'
3513	 * comparisons
3514	 */
3515	resid = orig_resid = uio_array_resid(uioarray, uiocnt);
3516	if (orig_resid < 0 || orig_resid > INT_MAX) {
3517		error = EINVAL;
3518		goto out;
3519	}
3520
3521	socket_lock(so, 1);
3522	so_update_last_owner_locked(so, p);
3523	so_update_policy(so);
3524
3525#if NECP
3526	so_update_necp_policy(so, NULL, NULL);
3527#endif /* NECP */
3528
3529	/*
3530	 * If a recv attempt is made on a previously-accepted socket
3531	 * that has been marked as inactive (disconnected), reject
3532	 * the request.
3533	 */
3534	if (so->so_flags & SOF_DEFUNCT) {
3535		struct sockbuf *sb = &so->so_rcv;
3536
3537		error = ENOTCONN;
3538		SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n",
3539		    __func__, proc_pid(p), (uint64_t)VM_KERNEL_ADDRPERM(so),
3540		    SOCK_DOM(so), SOCK_TYPE(so), error));
3541		/*
3542		 * This socket should have been disconnected and flushed
3543		 * prior to being returned from sodefunct(); there should
3544		 * be no data on its receive list, so panic otherwise.
3545		 */
3546		if (so->so_state & SS_DEFUNCT)
3547			sb_empty_assert(sb, __func__);
3548		goto release;
3549	}
3550	if (mp != NULL)
3551		*mp = NULL;
3552restart:
3553	/*
3554	 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
3555	 * and if so just return to the caller.  This could happen when
3556	 * soreceive() is called by a socket upcall function during the
3557	 * time the socket is freed.  The socket buffer would have been
3558	 * locked across the upcall, therefore we cannot put this thread
3559	 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
3560	 * we may livelock), because the lock on the socket buffer will
3561	 * only be released when the upcall routine returns to its caller.
3562	 * Because the socket has been officially closed, there can be
3563	 * no further read on it.
3564	 */
3565	if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
3566	    (SS_NOFDREF | SS_CANTRCVMORE)) {
3567		error = 0;
3568		goto release;
3569	}
3570
3571	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
3572	if (error) {
3573		goto release;
3574	}
3575	sblocked = 1;
3576
3577	/*
3578	 * Skip empty uio
3579	 */
	auio = uioarray[i];
	while (uio_resid(auio) == 0) {
		i++;
		if (i >= uiocnt) {
			error = 0;
			goto release;
		}
		auio = uioarray[i];
	}
3588
3589	m = so->so_rcv.sb_mb;
3590	/*
3591	 * Block awaiting more datagram if needed
3592	 */
3593	if (m == NULL) {
3594		/*
3595		 * Panic if we notice inconsistencies in the socket's
3596		 * receive list; both sb_mb and sb_cc should correctly
3597		 * reflect the contents of the list, otherwise we may
3598		 * end up with false positives during select() or poll()
3599		 * which could put the application in a bad state.
3600		 */
3601		SB_MB_CHECK(&so->so_rcv);
3602
3603		if (so->so_error) {
3604			error = so->so_error;
3605			goto release;
3606		}
3607		if (so->so_state & SS_CANTRCVMORE) {
3608			goto release;
3609		}
3610		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
3611		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
3612			error = ENOTCONN;
3613			goto release;
3614		}
3615		if ((so->so_state & SS_NBIO) ||
3616		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
3617			error = EWOULDBLOCK;
3618			goto release;
3619		}
3620		/*
3621		 * Do not block if we got some data
3622		 * Note: We could use MSG_WAITALL to wait
3623		 */
3624		resid = uio_array_resid(uioarray, uiocnt);
3625		if (resid != orig_resid) {
3626			error = 0;
3627			goto release;
3628		}
3629
3630		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
3631		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
3632
3633		sbunlock(&so->so_rcv, TRUE);	/* keep socket locked */
3634		sblocked = 0;
3635
3636		error = sbwait(&so->so_rcv);
3637		if (error) {
3638			goto release;
3639		}
3640		goto restart;
3641	}
3642
3643	if (m->m_pkthdr.len == 0) {
3644		printf("%s so %llx pkt %llx len is null\n",
3645			__func__,
3646			(uint64_t)VM_KERNEL_ADDRPERM(so),
3647			(uint64_t)VM_KERNEL_ADDRPERM(m));
3648		goto restart;
3649	}
3650	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
3651	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
3652	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
3653
3654	/*
3655	 * Consume the current uio index as we have a datagram
3656	 */
3657	i += 1;
3658	nextrecord = m->m_nextpkt;
3659
3660#if SO_RECEIVE_LIST_SOCKADDR_NOT_YET
3661	if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
3662		/*
3663		 * to be adapted from soreceive()
3664		 */
3665	}
3666#endif /* SO_RECEIVE_LIST_SOCKADDR_NOT_YET */
3667
3668#if SO_RECEIVE_LIST_CONTROL_NOT_YET
3669	/*
3670	 * Process one or more MT_CONTROL mbufs present before any data mbufs
3671	 * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
3672	 * just copy the data; if !MSG_PEEK, we call into the protocol to
3673	 * perform externalization.
3674	 */
3675	if (m != NULL && m->m_type == MT_CONTROL) {
3676		/*
3677		 * to be adapted from soreceive()
3678		 */
3679	}
3680#endif /* SO_RECEIVE_LIST_CONTROL_NOT_YET */
3681
3682	offset = 0;
3683
3684	/*
3685	 * Loop to copy out the mbufs of the current record
3686	 */
3687	while (m != NULL && uio_resid(auio) > 0 && error == 0) {
3688		len = uio_resid(auio);
3689
3690		if (m->m_len == 0)
3691			printf("%s: so %llx m %llx m_len is 0\n",
3692				__func__,
3693				(uint64_t)VM_KERNEL_ADDRPERM(so),
3694				(uint64_t)VM_KERNEL_ADDRPERM(m));
3695
3696		/*
3697		 * Clip to the residual length
3698		 */
3699		if (len > m->m_len)
3700			len = m->m_len;
3701		/*
3702		 * If mp is set, just pass back the mbufs.
3703		 * Otherwise copy them out via the uio, then free.
3704		 * Sockbuf must be consistent here (points to current mbuf,
3705		 * it points to next record) when we drop priority;
3706		 * we must note any additions to the sockbuf when we
3707		 * block interrupts again.
3708		 */
3709		if (mp != NULL) {
3710			uio_setresid(auio, (uio_resid(auio) - len));
3711		} else {
3712			SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
3713			SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
3714
3715			socket_unlock(so, 0);
3716			error = uiomove(mtod(m, caddr_t), (int)len, auio);
3717			socket_lock(so, 0);
3718
3719			if (error)
3720				goto release;
3721		}
3722		if (len == m->m_len) {
3723			/*
3724			 * m was entirely copied
3725			 */
3726			nextrecord = m->m_nextpkt;
3727			sbfree(&so->so_rcv, m);
3728			m->m_nextpkt = NULL;
3729
3730			/*
3731			 * Move to m_next
3732			 */
3733			if (mp != NULL) {
3734				*mp = m;
3735				mp = &m->m_next;
3736				so->so_rcv.sb_mb = m = m->m_next;
3737				*mp = NULL;
3738			} else {
3739				if (free_list == NULL)
3740					free_list = m;
3741				else
3742					ml->m_next = m;
3743				ml = m;
3744				so->so_rcv.sb_mb = m = m->m_next;
3745				ml->m_next = NULL;
3746				ml->m_nextpkt = NULL;
3747			}
3748			if (m != NULL) {
3749				m->m_nextpkt = nextrecord;
3750				if (nextrecord == NULL)
3751					so->so_rcv.sb_lastrecord = m;
3752			} else {
3753				so->so_rcv.sb_mb = nextrecord;
3754				SB_EMPTY_FIXUP(&so->so_rcv);
3755			}
3756			SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
3757			SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
3758		} else {
3759			/*
3760			 * Stop the loop on partial copy
3761			 */
3762			if (mp != NULL) {
3763				int copy_flag;
3764
3765				if (flags & MSG_DONTWAIT)
3766					copy_flag = M_DONTWAIT;
3767				else
3768					copy_flag = M_WAIT;
3769				*mp = m_copym(m, 0, len, copy_flag);
3770				/*
3771				 * Failed to allocate an mbuf?
3772				 * Adjust uio_resid back, it was
3773				 * adjusted down by len bytes which
3774				 * we didn't copy over.
3775				 */
3776				if (*mp == NULL) {
3777					uio_setresid(auio,
3778					    (uio_resid(auio) + len));
3779					error = ENOMEM;
3780					break;
3781				}
3782			}
3783			break;
3784		}
3785	}
3786#ifdef MORE_LOCKING_DEBUG
3787	if (so->so_usecount <= 1) {
3788		panic("%s: after big while so=%llx ref=%d on socket\n",
3789		    __func__,
3790		    (uint64_t)VM_KERNEL_ADDRPERM(so), so->so_usecount);
3791		/* NOTREACHED */
3792	}
3793#endif
3794	/*
3795	 * Tell the caller we made a partial copy
3796	 */
3797	if (m != NULL) {
3798		if (so->so_options & SO_DONTTRUNC) {
3799			m->m_data += len;
3800			m->m_len -= len;
3801			so->so_rcv.sb_cc -= len;
3802			flags |= MSG_RCVMORE;
3803		} else {
3804			(void) sbdroprecord(&so->so_rcv);
3805			nextrecord = so->so_rcv.sb_mb;
3806			m = NULL;
3807			flags |= MSG_TRUNC;
3808		}
3809	}
3810
3811	if (m == NULL) {
3812		so->so_rcv.sb_mb = nextrecord;
3813		/*
3814		 * First part is an inline SB_EMPTY_FIXUP().  Second
3815		 * part makes sure sb_lastrecord is up-to-date if
3816		 * there is still data in the socket buffer.
3817		 */
3818		if (so->so_rcv.sb_mb == NULL) {
3819			so->so_rcv.sb_mbtail = NULL;
3820			so->so_rcv.sb_lastrecord = NULL;
3821		} else if (nextrecord->m_nextpkt == NULL) {
3822			so->so_rcv.sb_lastrecord = nextrecord;
3823		}
3824		SB_MB_CHECK(&so->so_rcv);
3825	}
3826	SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
3827	SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
3828
3829	/*
3830	 * We can continue to the next packet as long as:
3831	 * - We haven't exhausted the uio array
3832	 * - There was no error
3833	 * - A packet was not truncated
3834	 * - We can still receive more data
3835	 */
3836	if (i < uiocnt && error == 0 &&
3837	    (flags & (MSG_RCVMORE | MSG_TRUNC)) == 0
3838	    && (so->so_state & SS_CANTRCVMORE) == 0) {
3839		sbunlock(&so->so_rcv, TRUE);	/* keep socket locked */
3840		sblocked = 0;
3841
3842		goto restart;
3843	}
3844
3845release:
3846	/*
3847	 * pru_rcvd may cause more data to be received if the socket lock
3848	 * is dropped, so we set MSG_HAVEMORE now based on what we know.  That way
3849	 * the caller won't be surprised if it receives less data than requested.
3850	 */
3851	if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0)
3852		flags |= MSG_HAVEMORE;
3853
3854	if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
3855		(*pr->pr_usrreqs->pru_rcvd)(so, flags);
3856
3857	if (flagsp != NULL)
3858		*flagsp |= flags;
3859	if (sblocked)
3860		sbunlock(&so->so_rcv, FALSE);	/* will unlock socket */
3861	else
3862		socket_unlock(so, 1);
3863out:
3864	/*
3865	 * Amortize the cost of freeing the mbufs by freeing the list at once
3866	 */
3867	if (free_list != NULL)
3868		m_freem_list(free_list);
3869
3870	KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_END, error,
3871	    0, 0, 0, 0);
3872	return (error);
3873}
3874
3875/*
3876 * Returns:	0			Success
3877 *		EINVAL
3878 *		ENOTCONN
3879 *	<pru_shutdown>:EINVAL
3880 *	<pru_shutdown>:EADDRNOTAVAIL[TCP]
3881 *	<pru_shutdown>:ENOBUFS[TCP]
3882 *	<pru_shutdown>:EMSGSIZE[TCP]
3883 *	<pru_shutdown>:EHOSTUNREACH[TCP]
3884 *	<pru_shutdown>:ENETUNREACH[TCP]
3885 *	<pru_shutdown>:ENETDOWN[TCP]
3886 *	<pru_shutdown>:ENOMEM[TCP]
3887 *	<pru_shutdown>:EACCES[TCP]
3890 *	<pru_shutdown>:???[TCP]		[ignorable: mostly IPSEC/firewall/DLIL]
3891 *	<pru_shutdown>:???		[other protocol families]
3892 */
3893int
3894soshutdown(struct socket *so, int how)
3895{
3896	int error;
3897
3898	KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_START, how, 0, 0, 0, 0);
3899
3900	switch (how) {
3901	case SHUT_RD:
3902	case SHUT_WR:
3903	case SHUT_RDWR:
3904		socket_lock(so, 1);
3905		if ((so->so_state &
3906		    (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) == 0) {
3907			error = ENOTCONN;
3908		} else {
3909			error = soshutdownlock(so, how);
3910		}
3911		socket_unlock(so, 1);
3912		break;
3913	default:
3914		error = EINVAL;
3915		break;
3916	}
3917
3918	KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, how, error, 0, 0, 0);
3919
3920	return (error);
3921}
3922
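/*
 * Common portion of shutdown(2) once any content-filter processing is
 * complete: notify the socket filters, then disable and flush the read
 * side for SHUT_RD/SHUT_RDWR and invoke the protocol's pru_shutdown for
 * SHUT_WR/SHUT_RDWR.  ENOTCONN is returned if the requested direction
 * has already been shut down.
 */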
3923int
3924soshutdownlock_final(struct socket *so, int how)
3925{
3926	struct protosw *pr = so->so_proto;
3927	int error = 0;
3928
3929	sflt_notify(so, sock_evt_shutdown, &how);
3930
3931	if (how != SHUT_WR) {
3932		if ((so->so_state & SS_CANTRCVMORE) != 0) {
3933			/* read already shut down */
3934			error = ENOTCONN;
3935			goto done;
3936		}
3937		sorflush(so);
3938		postevent(so, 0, EV_RCLOSED);
3939	}
3940	if (how != SHUT_RD) {
3941		if ((so->so_state & SS_CANTSENDMORE) != 0) {
3942			/* write already shut down */
3943			error = ENOTCONN;
3944			goto done;
3945		}
3946		error = (*pr->pr_usrreqs->pru_shutdown)(so);
3947		postevent(so, 0, EV_WCLOSED);
3948	}
3949done:
3950	KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN, how, 1, 0, 0, 0);
3951	return (error);
3952}
3953
3954int
3955soshutdownlock(struct socket *so, int how)
3956{
3957	int error = 0;
3958
3959#if CONTENT_FILTER
3960	/*
3961	 * A content filter may delay the actual shutdown until it
3962	 * has processed the pending data.
3963	 */
3964	if (so->so_flags & SOF_CONTENT_FILTER) {
3965		error = cfil_sock_shutdown(so, &how);
3966		if (error == EJUSTRETURN) {
3967			error = 0;
3968			goto done;
3969		} else if (error != 0) {
3970			goto done;
3971		}
3972	}
3973#endif /* CONTENT_FILTER */
3974
3975	error = soshutdownlock_final(so, how);
3976
3977done:
3978	return (error);
3979}
3980
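/*
 * Flush the send side of a socket that is being torn down: disable
 * select and upcall notification on the send buffer, mark it SB_DROP so
 * that nothing further is appended, clear any select records, and
 * release the mbufs and space it holds.
 */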
3981void
3982sowflush(struct socket *so)
3983{
3984	struct sockbuf *sb = &so->so_snd;
3985#ifdef notyet
3986	lck_mtx_t *mutex_held;
3987	/*
3988	 * XXX: This code is currently commented out, because we may get here
3989	 * as part of sofreelastref(), and at that time, pr_getlock() may no
3990	 * longer be able to return us the lock; this will be fixed in the future.
3991	 */
3992	if (so->so_proto->pr_getlock != NULL)
3993		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
3994	else
3995		mutex_held = so->so_proto->pr_domain->dom_mtx;
3996
3997	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
3998#endif /* notyet */
3999
4000	/*
4001	 * Obtain lock on the socket buffer (SB_LOCK).  This is required
4002	 * to prevent the socket buffer from being unexpectedly altered
4003	 * while it is used by another thread in socket send/receive.
4004	 *
4005	 * sblock() must not fail here, hence the assertion.
4006	 */
4007	(void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4008	VERIFY(sb->sb_flags & SB_LOCK);
4009
4010	sb->sb_flags		&= ~(SB_SEL|SB_UPCALL);
4011	sb->sb_flags		|= SB_DROP;
4012	sb->sb_upcall		= NULL;
4013	sb->sb_upcallarg	= NULL;
4014
4015	sbunlock(sb, TRUE);	/* keep socket locked */
4016
4017	selthreadclear(&sb->sb_sel);
4018	sbrelease(sb);
4019}
4020
4021void
4022sorflush(struct socket *so)
4023{
4024	struct sockbuf *sb = &so->so_rcv;
4025	struct protosw *pr = so->so_proto;
4026	struct sockbuf asb;
4027#ifdef notyet
4028	lck_mtx_t *mutex_held;
4029	/*
4030	 * XXX: This code is currently commented out, because we may get here
4031	 * as part of sofreelastref(), and at that time, pr_getlock() may no
4032	 * longer be able to return us the lock; this will be fixed in the future.
4033	 */
4034	if (so->so_proto->pr_getlock != NULL)
4035		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
4036	else
4037		mutex_held = so->so_proto->pr_domain->dom_mtx;
4038
4039	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
4040#endif /* notyet */
4041
4042	sflt_notify(so, sock_evt_flush_read, NULL);
4043
4044	socantrcvmore(so);
4045
4046	/*
4047	 * Obtain lock on the socket buffer (SB_LOCK).  This is required
4048	 * to prevent the socket buffer from being unexpectedly altered
4049	 * while it is used by another thread in socket send/receive.
4050	 *
4051	 * sblock() must not fail here, hence the assertion.
4052	 */
4053	(void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4054	VERIFY(sb->sb_flags & SB_LOCK);
4055
4056	/*
4057	 * Copy only the relevant fields from "sb" to "asb" which we
4058	 * need for sbrelease() to function.  In particular, skip
4059	 * sb_sel as it contains the wait queue linkage, which would
4060	 * wreak havoc if we were to issue selthreadclear() on "asb".
4061	 * Make sure to not carry over SB_LOCK in "asb", as we need
4062	 * to acquire it later as part of sbrelease().
4063	 */
4064	bzero(&asb, sizeof (asb));
4065	asb.sb_cc		= sb->sb_cc;
4066	asb.sb_hiwat		= sb->sb_hiwat;
4067	asb.sb_mbcnt		= sb->sb_mbcnt;
4068	asb.sb_mbmax		= sb->sb_mbmax;
4069	asb.sb_ctl		= sb->sb_ctl;
4070	asb.sb_lowat		= sb->sb_lowat;
4071	asb.sb_mb		= sb->sb_mb;
4072	asb.sb_mbtail		= sb->sb_mbtail;
4073	asb.sb_lastrecord	= sb->sb_lastrecord;
4074	asb.sb_so		= sb->sb_so;
4075	asb.sb_flags		= sb->sb_flags;
4076	asb.sb_flags		&= ~(SB_LOCK|SB_SEL|SB_KNOTE|SB_UPCALL);
4077	asb.sb_flags		|= SB_DROP;
4078
4079	/*
4080	 * Ideally we'd bzero() these and preserve the ones we need;
4081	 * but to do that we'd need to shuffle things around in the
4082	 * sockbuf, and we can't do it now because there are KEXTs
4083	 * that directly refer to the socket structure.
4084	 *
4085	 * Setting SB_DROP acts as a barrier to prevent further appends.
4086	 * Clearing SB_SEL is done for selthreadclear() below.
4087	 */
4088	sb->sb_cc		= 0;
4089	sb->sb_hiwat		= 0;
4090	sb->sb_mbcnt		= 0;
4091	sb->sb_mbmax		= 0;
4092	sb->sb_ctl		= 0;
4093	sb->sb_lowat		= 0;
4094	sb->sb_mb		= NULL;
4095	sb->sb_mbtail		= NULL;
4096	sb->sb_lastrecord	= NULL;
4097	sb->sb_timeo.tv_sec	= 0;
4098	sb->sb_timeo.tv_usec	= 0;
4099	sb->sb_upcall		= NULL;
4100	sb->sb_upcallarg	= NULL;
4101	sb->sb_flags		&= ~(SB_SEL|SB_UPCALL);
4102	sb->sb_flags		|= SB_DROP;
4103
4104	sbunlock(sb, TRUE);	/* keep socket locked */
4105
4106	/*
4107	 * Note that selthreadclear() is called on the original "sb" and
4108	 * not the local "asb" because of the way wait queue linkage is
4109	 * implemented.  Given that selwakeup() may be triggered, SB_SEL
4110	 * should no longer be set (cleared above.)
4111	 */
4112	selthreadclear(&sb->sb_sel);
4113
4114	if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose)
4115		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
4116
4117	sbrelease(&asb);
4118}
4119
4120/*
4121 * Perhaps this routine, and sooptcopyout(), below, ought to come in
4122 * an additional variant to handle the case where the option value needs
4123 * to be some kind of integer, but not a specific size.
4124 * In addition to their use here, these functions are also called by the
4125 * protocol-level pr_ctloutput() routines.
4126 *
4127 * Returns:	0			Success
4128 *		EINVAL
4129 *	copyin:EFAULT
4130 */
4131int
4132sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
4133{
4134	size_t	valsize;
4135
4136	/*
4137	 * If the user gives us more than we wanted, we ignore it,
4138	 * but if we don't get the minimum length the caller
4139	 * wants, we return EINVAL.  On success, sopt->sopt_valsize
4140	 * is set to however much we actually retrieved.
4141	 */
4142	if ((valsize = sopt->sopt_valsize) < minlen)
4143		return (EINVAL);
4144	if (valsize > len)
4145		sopt->sopt_valsize = valsize = len;
4146
4147	if (sopt->sopt_p != kernproc)
4148		return (copyin(sopt->sopt_val, buf, valsize));
4149
4150	bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
4151	return (0);
4152}
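
/*
 * Illustrative sketch of the typical calling pattern: an integer-valued
 * socket option in this file (or in a protocol's pr_ctloutput() handler)
 * is retrieved with
 *
 *	int optval;
 *	error = sooptcopyin(sopt, &optval, sizeof (optval), sizeof (optval));
 *
 * which returns EINVAL if the caller supplied fewer than sizeof (optval)
 * bytes and silently ignores anything beyond that size.
 */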
4153
4154/*
4155 * sooptcopyin_timeval
4156 *   Copy in a timeval value to tv_p, taking into account whether the
4157 *   calling process is 64-bit or 32-bit.  Moved the sanity checking
4158 *   code here so that we can verify the 64-bit tv_sec value before we lose
4159 *   the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
4160 */
4161static int
4162sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
4163{
4164	int			error;
4165
4166	if (proc_is64bit(sopt->sopt_p)) {
4167		struct user64_timeval	tv64;
4168
4169		if (sopt->sopt_valsize < sizeof (tv64))
4170			return (EINVAL);
4171
4172		sopt->sopt_valsize = sizeof (tv64);
4173		if (sopt->sopt_p != kernproc) {
4174			error = copyin(sopt->sopt_val, &tv64, sizeof (tv64));
4175			if (error != 0)
4176				return (error);
4177		} else {
4178			bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
4179			    sizeof (tv64));
4180		}
4181		if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
4182		    tv64.tv_usec < 0 || tv64.tv_usec >= 1000000)
4183			return (EDOM);
4184
4185		tv_p->tv_sec = tv64.tv_sec;
4186		tv_p->tv_usec = tv64.tv_usec;
4187	} else {
4188		struct user32_timeval	tv32;
4189
4190		if (sopt->sopt_valsize < sizeof (tv32))
4191			return (EINVAL);
4192
4193		sopt->sopt_valsize = sizeof (tv32);
4194		if (sopt->sopt_p != kernproc) {
4195			error = copyin(sopt->sopt_val, &tv32, sizeof (tv32));
4196			if (error != 0) {
4197				return (error);
4198			}
4199		} else {
4200			bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
4201			    sizeof (tv32));
4202		}
4203#ifndef __LP64__
4204		/*
4205		 * K64todo "comparison is always false due to
4206		 * limited range of data type"
4207		 */
4208		if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
4209		    tv32.tv_usec < 0 || tv32.tv_usec >= 1000000)
4210			return (EDOM);
4211#endif
4212		tv_p->tv_sec = tv32.tv_sec;
4213		tv_p->tv_usec = tv32.tv_usec;
4214	}
4215	return (0);
4216}
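
/*
 * For illustration only: the values accepted above correspond to a
 * user-space call such as
 *
 *	struct timeval tv = { .tv_sec = 5, .tv_usec = 0 };
 *	setsockopt(s, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof (tv));
 *
 * where "s" is a socket descriptor.  Negative fields, tv_usec values of
 * 1000000 or more, and tv_sec values larger than LONG_MAX are rejected
 * with EDOM before the timeout is stored by sosetoptlock().
 */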
4217
4218/*
4219 * Returns:	0			Success
4220 *		EINVAL
4221 *		ENOPROTOOPT
4222 *		ENOBUFS
4223 *		EDOM
4224 *	sooptcopyin:EINVAL
4225 *	sooptcopyin:EFAULT
4226 *	sooptcopyin_timeval:EINVAL
4227 *	sooptcopyin_timeval:EFAULT
4228 *	sooptcopyin_timeval:EDOM
4229 *	<pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
4230 *	<pr_ctloutput>:???
4231 *	sflt_attach_private:???		[whatever a filter author chooses]
4232 *	<sf_setoption>:???		[whatever a filter author chooses]
4233 *
4234 * Notes:	Other <pr_ctloutput> returns depend on the protocol family; all
4235 *		<sf_setoption> returns depend on what the filter author causes
4236 *		their filter to return.
4237 */
4238int
4239sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
4240{
4241	int	error, optval;
4242	struct	linger l;
4243	struct	timeval tv;
4244#if CONFIG_MACF_SOCKET
4245	struct mac extmac;
4246#endif /* MAC_SOCKET */
4247
4248	if (sopt->sopt_dir != SOPT_SET)
4249		sopt->sopt_dir = SOPT_SET;
4250
4251	if (dolock)
4252		socket_lock(so, 1);
4253
4254	if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
4255	    (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
4256	    (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
4257		/* the socket has been shutdown, no more sockopt's */
4258		error = EINVAL;
4259		goto out;
4260	}
4261
4262	error = sflt_setsockopt(so, sopt);
4263	if (error != 0) {
4264		if (error == EJUSTRETURN)
4265			error = 0;
4266		goto out;
4267	}
4268
4269	if (sopt->sopt_level != SOL_SOCKET) {
4270		if (so->so_proto != NULL &&
4271		    so->so_proto->pr_ctloutput != NULL) {
4272			error = (*so->so_proto->pr_ctloutput)(so, sopt);
4273			goto out;
4274		}
4275		error = ENOPROTOOPT;
4276	} else {
4277		/*
4278		 * Allow socket-level (SOL_SOCKET) options to be filtered by
4279		 * the protocol layer, if needed.  A zero value returned from
4280		 * the handler means use default socket-level processing as
4281		 * done by the rest of this routine.  Otherwise, any other
4282		 * return value indicates that the option is unsupported.
4283		 */
4284		if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
4285		    pru_socheckopt(so, sopt)) != 0)
4286			goto out;
4287
4288		error = 0;
4289		switch (sopt->sopt_name) {
4290		case SO_LINGER:
4291		case SO_LINGER_SEC:
4292			error = sooptcopyin(sopt, &l, sizeof (l), sizeof (l));
4293			if (error != 0)
4294				goto out;
4295
4296			so->so_linger = (sopt->sopt_name == SO_LINGER) ?
4297			    l.l_linger : l.l_linger * hz;
4298			if (l.l_onoff != 0)
4299				so->so_options |= SO_LINGER;
4300			else
4301				so->so_options &= ~SO_LINGER;
4302			break;
4303
4304		case SO_DEBUG:
4305		case SO_KEEPALIVE:
4306		case SO_DONTROUTE:
4307		case SO_USELOOPBACK:
4308		case SO_BROADCAST:
4309		case SO_REUSEADDR:
4310		case SO_REUSEPORT:
4311		case SO_OOBINLINE:
4312		case SO_TIMESTAMP:
4313		case SO_TIMESTAMP_MONOTONIC:
4314		case SO_DONTTRUNC:
4315		case SO_WANTMORE:
4316		case SO_WANTOOBFLAG:
4317		case SO_NOWAKEFROMSLEEP:
4318			error = sooptcopyin(sopt, &optval, sizeof (optval),
4319			    sizeof (optval));
4320			if (error != 0)
4321				goto out;
4322			if (optval)
4323				so->so_options |= sopt->sopt_name;
4324			else
4325				so->so_options &= ~sopt->sopt_name;
4326			break;
4327
4328		case SO_SNDBUF:
4329		case SO_RCVBUF:
4330		case SO_SNDLOWAT:
4331		case SO_RCVLOWAT:
4332			error = sooptcopyin(sopt, &optval, sizeof (optval),
4333			    sizeof (optval));
4334			if (error != 0)
4335				goto out;
4336
4337			/*
4338			 * Values < 1 make no sense for any of these
4339			 * options, so disallow them.
4340			 */
4341			if (optval < 1) {
4342				error = EINVAL;
4343				goto out;
4344			}
4345
4346			switch (sopt->sopt_name) {
4347			case SO_SNDBUF:
4348			case SO_RCVBUF: {
4349				struct sockbuf *sb =
4350				    (sopt->sopt_name == SO_SNDBUF) ?
4351				    &so->so_snd : &so->so_rcv;
4352				if (sbreserve(sb, (u_int32_t)optval) == 0) {
4353					error = ENOBUFS;
4354					goto out;
4355				}
4356				sb->sb_flags |= SB_USRSIZE;
4357				sb->sb_flags &= ~SB_AUTOSIZE;
4358				sb->sb_idealsize = (u_int32_t)optval;
4359				break;
4360			}
4361			/*
4362			 * Make sure the low-water is never greater than
4363			 * the high-water.
4364			 */
4365			case SO_SNDLOWAT: {
4366				int space = sbspace(&so->so_snd);
4367				u_int32_t hiwat = so->so_snd.sb_hiwat;
4368
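				/*
				 * For a connected AF_UNIX socket the queued
				 * data actually sits in the peer's receive
				 * buffer, so the effective high-water mark
				 * used to clamp the low-water value includes
				 * the bytes currently held by the peer
				 * (unp_conn->unp_cc).
				 */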
4369				if (so->so_snd.sb_flags & SB_UNIX) {
4370					struct unpcb *unp =
4371					    (struct unpcb *)(so->so_pcb);
4372					if (unp != NULL && unp->unp_conn != NULL) {
4373						hiwat += unp->unp_conn->unp_cc;
4374					}
4375				}
4376
4377				so->so_snd.sb_lowat =
4378				    (optval > hiwat) ?
4379				    hiwat : optval;
4380
4381				if (space >= so->so_snd.sb_lowat) {
4382					sowwakeup(so);
4383				}
4384				break;
4385			}
4386			case SO_RCVLOWAT: {
4387				int64_t data_len;
4388				so->so_rcv.sb_lowat =
4389				    (optval > so->so_rcv.sb_hiwat) ?
4390				    so->so_rcv.sb_hiwat : optval;
4391				data_len = so->so_rcv.sb_cc
4392				    - so->so_rcv.sb_ctl;
4393				if (data_len >= so->so_rcv.sb_lowat)
4394				    sorwakeup(so);
4395				break;
4396			}
4397			}
4398			break;
4399
4400		case SO_SNDTIMEO:
4401		case SO_RCVTIMEO:
4402			error = sooptcopyin_timeval(sopt, &tv);
4403			if (error != 0)
4404				goto out;
4405
4406			switch (sopt->sopt_name) {
4407			case SO_SNDTIMEO:
4408				so->so_snd.sb_timeo = tv;
4409				break;
4410			case SO_RCVTIMEO:
4411				so->so_rcv.sb_timeo = tv;
4412				break;
4413			}
4414			break;
4415
4416		case SO_NKE: {
4417			struct so_nke nke;
4418
4419			error = sooptcopyin(sopt, &nke, sizeof (nke),
4420			    sizeof (nke));
4421			if (error != 0)
4422				goto out;
4423
4424			error = sflt_attach_internal(so, nke.nke_handle);
4425			break;
4426		}
4427
4428		case SO_NOSIGPIPE:
4429			error = sooptcopyin(sopt, &optval, sizeof (optval),
4430			    sizeof (optval));
4431			if (error != 0)
4432				goto out;
4433			if (optval != 0)
4434				so->so_flags |= SOF_NOSIGPIPE;
4435			else
4436				so->so_flags &= ~SOF_NOSIGPIPE;
4437			break;
4438
4439		case SO_NOADDRERR:
4440			error = sooptcopyin(sopt, &optval, sizeof (optval),
4441			    sizeof (optval));
4442			if (error != 0)
4443				goto out;
4444			if (optval != 0)
4445				so->so_flags |= SOF_NOADDRAVAIL;
4446			else
4447				so->so_flags &= ~SOF_NOADDRAVAIL;
4448			break;
4449
4450		case SO_REUSESHAREUID:
4451			error = sooptcopyin(sopt, &optval, sizeof (optval),
4452			    sizeof (optval));
4453			if (error != 0)
4454				goto out;
4455			if (optval != 0)
4456				so->so_flags |= SOF_REUSESHAREUID;
4457			else
4458				so->so_flags &= ~SOF_REUSESHAREUID;
4459			break;
4460
4461		case SO_NOTIFYCONFLICT:
4462			if (kauth_cred_issuser(kauth_cred_get()) == 0) {
4463				error = EPERM;
4464				goto out;
4465			}
4466			error = sooptcopyin(sopt, &optval, sizeof (optval),
4467			    sizeof (optval));
4468			if (error != 0)
4469				goto out;
4470			if (optval != 0)
4471				so->so_flags |= SOF_NOTIFYCONFLICT;
4472			else
4473				so->so_flags &= ~SOF_NOTIFYCONFLICT;
4474			break;
4475
4476		case SO_RESTRICTIONS:
4477			error = sooptcopyin(sopt, &optval, sizeof (optval),
4478			    sizeof (optval));
4479			if (error != 0)
4480				goto out;
4481
4482			error = so_set_restrictions(so, optval);
4483			break;
4484
4485		case SO_AWDL_UNRESTRICTED:
4486			if (SOCK_DOM(so) != PF_INET &&
4487			    SOCK_DOM(so) != PF_INET6) {
4488				error = EOPNOTSUPP;
4489				goto out;
4490			}
4491			error = sooptcopyin(sopt, &optval, sizeof(optval),
4492			    sizeof(optval));
4493			if (error != 0)
4494				goto out;
4495			if (optval != 0) {
4496				kauth_cred_t cred = NULL;
4497				proc_t ep = PROC_NULL;
4498
4499				if (so->so_flags & SOF_DELEGATED) {
4500					ep = proc_find(so->e_pid);
4501					if (ep)
4502						cred = kauth_cred_proc_ref(ep);
4503				}
4504				error = priv_check_cred(
4505				    cred ? cred : so->so_cred,
4506				    PRIV_NET_RESTRICTED_AWDL, 0);
4507				if (error == 0)
4508					inp_set_awdl_unrestricted(
4509					    sotoinpcb(so));
4510				if (cred)
4511					kauth_cred_unref(&cred);
4512				if (ep != PROC_NULL)
4513					proc_rele(ep);
4514			} else
4515				inp_clear_awdl_unrestricted(sotoinpcb(so));
4516			break;
4517
4518		case SO_LABEL:
4519#if CONFIG_MACF_SOCKET
4520			if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
4521			    sizeof (extmac))) != 0)
4522				goto out;
4523
4524			error = mac_setsockopt_label(proc_ucred(sopt->sopt_p),
4525			    so, &extmac);
4526#else
4527			error = EOPNOTSUPP;
4528#endif /* MAC_SOCKET */
4529			break;
4530
4531		case SO_UPCALLCLOSEWAIT:
4532			error = sooptcopyin(sopt, &optval, sizeof (optval),
4533			    sizeof (optval));
4534			if (error != 0)
4535				goto out;
4536			if (optval != 0)
4537				so->so_flags |= SOF_UPCALLCLOSEWAIT;
4538			else
4539				so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
4540			break;
4541
4542		case SO_RANDOMPORT:
4543			error = sooptcopyin(sopt, &optval, sizeof (optval),
4544			    sizeof (optval));
4545			if (error != 0)
4546				goto out;
4547			if (optval != 0)
4548				so->so_flags |= SOF_BINDRANDOMPORT;
4549			else
4550				so->so_flags &= ~SOF_BINDRANDOMPORT;
4551			break;
4552
4553		case SO_NP_EXTENSIONS: {
4554			struct so_np_extensions sonpx;
4555
4556			error = sooptcopyin(sopt, &sonpx, sizeof (sonpx),
4557			    sizeof (sonpx));
4558			if (error != 0)
4559				goto out;
4560			if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
4561				error = EINVAL;
4562				goto out;
4563			}
4564			/*
4565			 * Only one bit defined for now
4566			 */
4567			if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
4568				if ((sonpx.npx_flags & SONPX_SETOPTSHUT))
4569					so->so_flags |= SOF_NPX_SETOPTSHUT;
4570				else
4571					so->so_flags &= ~SOF_NPX_SETOPTSHUT;
4572			}
4573			break;
4574		}
4575
4576		case SO_TRAFFIC_CLASS: {
4577			error = sooptcopyin(sopt, &optval, sizeof (optval),
4578			    sizeof (optval));
4579			if (error != 0)
4580				goto out;
4581			error = so_set_traffic_class(so, optval);
4582			if (error != 0)
4583				goto out;
4584			break;
4585		}
4586
4587		case SO_RECV_TRAFFIC_CLASS: {
4588			error = sooptcopyin(sopt, &optval, sizeof (optval),
4589			    sizeof (optval));
4590			if (error != 0)
4591				goto out;
4592			if (optval == 0)
4593				so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
4594			else
4595				so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
4596			break;
4597		}
4598
4599		case SO_TRAFFIC_CLASS_DBG: {
4600			struct so_tcdbg so_tcdbg;
4601
4602			error = sooptcopyin(sopt, &so_tcdbg,
4603			    sizeof (struct so_tcdbg), sizeof (struct so_tcdbg));
4604			if (error != 0)
4605				goto out;
4606			error = so_set_tcdbg(so, &so_tcdbg);
4607			if (error != 0)
4608				goto out;
4609			break;
4610		}
4611
4612		case SO_PRIVILEGED_TRAFFIC_CLASS:
4613			error = priv_check_cred(kauth_cred_get(),
4614			    PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
4615			if (error != 0)
4616				goto out;
4617			error = sooptcopyin(sopt, &optval, sizeof (optval),
4618			    sizeof (optval));
4619			if (error != 0)
4620				goto out;
4621			if (optval == 0)
4622				so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
4623			else
4624				so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
4625			break;
4626
4627		case SO_DEFUNCTOK:
4628			error = sooptcopyin(sopt, &optval, sizeof (optval),
4629			    sizeof (optval));
4630			if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
4631				if (error == 0)
4632					error = EBADF;
4633				goto out;
4634			}
4635			/*
4636			 * Any process can set SO_DEFUNCTOK (clear
4637			 * SOF_NODEFUNCT), but only root can clear
4638			 * SO_DEFUNCTOK (set SOF_NODEFUNCT).
4639			 */
4640			if (optval == 0 &&
4641			    kauth_cred_issuser(kauth_cred_get()) == 0) {
4642				error = EPERM;
4643				goto out;
4644			}
4645			if (optval)
4646				so->so_flags &= ~SOF_NODEFUNCT;
4647			else
4648				so->so_flags |= SOF_NODEFUNCT;
4649
4650			if (SOCK_DOM(so) == PF_INET ||
4651			    SOCK_DOM(so) == PF_INET6) {
4652				char s[MAX_IPv6_STR_LEN];
4653				char d[MAX_IPv6_STR_LEN];
4654				struct inpcb *inp = sotoinpcb(so);
4655
4656				SODEFUNCTLOG(("%s[%d]: so 0x%llx [%s %s:%d -> "
4657				    "%s:%d] is now marked as %seligible for "
4658				    "defunct\n", __func__, proc_selfpid(),
4659				    (uint64_t)VM_KERNEL_ADDRPERM(so),
4660				    (SOCK_TYPE(so) == SOCK_STREAM) ?
4661				    "TCP" : "UDP", inet_ntop(SOCK_DOM(so),
4662				    ((SOCK_DOM(so) == PF_INET) ?
4663				    (void *)&inp->inp_laddr.s_addr :
4664				    (void *)&inp->in6p_laddr), s, sizeof (s)),
4665				    ntohs(inp->in6p_lport),
4666				    inet_ntop(SOCK_DOM(so),
4667				    (SOCK_DOM(so) == PF_INET) ?
4668				    (void *)&inp->inp_faddr.s_addr :
4669				    (void *)&inp->in6p_faddr, d, sizeof (d)),
4670				    ntohs(inp->in6p_fport),
4671				    (so->so_flags & SOF_NODEFUNCT) ?
4672				    "not " : ""));
4673			} else {
4674				SODEFUNCTLOG(("%s[%d]: so 0x%llx [%d,%d] is "
4675				    "now marked as %seligible for defunct\n",
4676				    __func__, proc_selfpid(),
4677				    (uint64_t)VM_KERNEL_ADDRPERM(so),
4678				    SOCK_DOM(so), SOCK_TYPE(so),
4679				    (so->so_flags & SOF_NODEFUNCT) ?
4680				    "not " : ""));
4681			}
4682			break;
4683
4684		case SO_ISDEFUNCT:
4685			/* This option is not settable */
4686			error = EINVAL;
4687			break;
4688
4689		case SO_OPPORTUNISTIC:
4690			error = sooptcopyin(sopt, &optval, sizeof (optval),
4691			    sizeof (optval));
4692			if (error == 0)
4693				error = so_set_opportunistic(so, optval);
4694			break;
4695
4696		case SO_FLUSH:
4697			/* This option is handled by lower layer(s) */
4698			error = 0;
4699			break;
4700
4701		case SO_RECV_ANYIF:
4702			error = sooptcopyin(sopt, &optval, sizeof (optval),
4703			    sizeof (optval));
4704			if (error == 0)
4705				error = so_set_recv_anyif(so, optval);
4706			break;
4707
4708		case SO_TRAFFIC_MGT_BACKGROUND: {
4709			/* This option is handled by lower layer(s) */
4710			error = 0;
4711			break;
4712		}
4713
4714#if FLOW_DIVERT
4715		case SO_FLOW_DIVERT_TOKEN:
4716			error = flow_divert_token_set(so, sopt);
4717			break;
4718#endif	/* FLOW_DIVERT */
4719
4721		case SO_DELEGATED:
4722			if ((error = sooptcopyin(sopt, &optval, sizeof (optval),
4723			    sizeof (optval))) != 0)
4724				break;
4725
4726			error = so_set_effective_pid(so, optval, sopt->sopt_p);
4727			break;
4728
4729		case SO_DELEGATED_UUID: {
4730			uuid_t euuid;
4731
4732			if ((error = sooptcopyin(sopt, &euuid, sizeof (euuid),
4733			    sizeof (euuid))) != 0)
4734				break;
4735
4736			error = so_set_effective_uuid(so, euuid, sopt->sopt_p);
4737			break;
4738		}
4739
4740#if NECP
4741		case SO_NECP_ATTRIBUTES:
4742			error = necp_set_socket_attributes(so, sopt);
4743			break;
4744#endif /* NECP */
4745
4746#if MPTCP
4747		case SO_MPTCP_FASTJOIN:
4748			if (!((so->so_flags & SOF_MP_SUBFLOW) ||
4749			    ((SOCK_CHECK_DOM(so, PF_MULTIPATH)) &&
4750			    (SOCK_CHECK_PROTO(so, IPPROTO_TCP))))) {
4751				error = ENOPROTOOPT;
4752				break;
4753			}
4754
4755			error = sooptcopyin(sopt, &optval, sizeof (optval),
4756			    sizeof (optval));
4757			if (error != 0)
4758				goto out;
4759			if (optval == 0)
4760				so->so_flags &= ~SOF_MPTCP_FASTJOIN;
4761			else
4762				so->so_flags |= SOF_MPTCP_FASTJOIN;
4763			break;
4764#endif /* MPTCP */
4765
4766		default:
4767			error = ENOPROTOOPT;
4768			break;
4769		}
4770		if (error == 0 && so->so_proto != NULL &&
4771		    so->so_proto->pr_ctloutput != NULL) {
4772			(void) so->so_proto->pr_ctloutput(so, sopt);
4773		}
4774	}
4775out:
4776	if (dolock)
4777		socket_unlock(so, 1);
4778	return (error);
4779}
4780
4781/* Helper routines for getsockopt */
4782int
4783sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
4784{
4785	int	error;
4786	size_t	valsize;
4787
4788	error = 0;
4789
4790	/*
4791	 * Documented get behavior is that we always return a value,
4792	 * possibly truncated to fit in the user's buffer.
4793	 * Traditional behavior is that we always tell the user
4794	 * precisely how much we copied, rather than something useful
4795	 * like the total amount we had available for the caller.
4796	 * Note that this interface is not idempotent; the entire answer
4797	 * must be generated ahead of time.
4798	 */
4799	valsize = min(len, sopt->sopt_valsize);
4800	sopt->sopt_valsize = valsize;
4801	if (sopt->sopt_val != USER_ADDR_NULL) {
4802		if (sopt->sopt_p != kernproc)
4803			error = copyout(buf, sopt->sopt_val, valsize);
4804		else
4805			bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
4806	}
4807	return (error);
4808}
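
/*
 * Illustrative note: a user buffer smaller than the option value, e.g.
 *
 *	struct linger l;
 *	socklen_t optlen = sizeof (l) / 2;
 *	getsockopt(s, SOL_SOCKET, SO_LINGER, &l, &optlen);
 *
 * gets a silently truncated value back, with optlen reporting the
 * number of bytes actually copied, per the behavior described above.
 */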
4809
4810static int
4811sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
4812{
4813	int			error;
4814	size_t			len;
4815	struct user64_timeval	tv64;
4816	struct user32_timeval	tv32;
4817	const void *		val;
4818	size_t			valsize;
4819
4820	error = 0;
4821	if (proc_is64bit(sopt->sopt_p)) {
4822		len = sizeof (tv64);
4823		tv64.tv_sec = tv_p->tv_sec;
4824		tv64.tv_usec = tv_p->tv_usec;
4825		val = &tv64;
4826	} else {
4827		len = sizeof (tv32);
4828		tv32.tv_sec = tv_p->tv_sec;
4829		tv32.tv_usec = tv_p->tv_usec;
4830		val = &tv32;
4831	}
4832	valsize = min(len, sopt->sopt_valsize);
4833	sopt->sopt_valsize = valsize;
4834	if (sopt->sopt_val != USER_ADDR_NULL) {
4835		if (sopt->sopt_p != kernproc)
4836			error = copyout(val, sopt->sopt_val, valsize);
4837		else
4838			bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
4839	}
4840	return (error);
4841}
4842
4843/*
4844 * Return:	0			Success
4845 *		ENOPROTOOPT
4846 *	<pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
4847 *	<pr_ctloutput>:???
4848 *	<sf_getoption>:???
4849 */
4850int
4851sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
4852{
4853	int	error, optval;
4854	struct	linger l;
4855	struct	timeval tv;
4856#if CONFIG_MACF_SOCKET
4857	struct mac extmac;
4858#endif /* MAC_SOCKET */
4859
4860	if (sopt->sopt_dir != SOPT_GET)
4861		sopt->sopt_dir = SOPT_GET;
4862
4863	if (dolock)
4864		socket_lock(so, 1);
4865
4866	error = sflt_getsockopt(so, sopt);
4867	if (error != 0) {
4868		if (error == EJUSTRETURN)
4869			error = 0;
4870		goto out;
4871	}
4872
4873	if (sopt->sopt_level != SOL_SOCKET) {
4874		if (so->so_proto != NULL &&
4875		    so->so_proto->pr_ctloutput != NULL) {
4876			error = (*so->so_proto->pr_ctloutput)(so, sopt);
4877			goto out;
4878		}
4879		error = ENOPROTOOPT;
4880	} else {
4881		/*
4882		 * Allow socket-level (SOL_SOCKET) options to be filtered by
4883		 * the protocol layer, if needed.  A zero value returned from
4884		 * the handler means use default socket-level processing as
4885		 * done by the rest of this routine.  Otherwise, any other
4886		 * return value indicates that the option is unsupported.
4887		 */
4888		if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
4889		    pru_socheckopt(so, sopt)) != 0)
4890			goto out;
4891
4892		error = 0;
4893		switch (sopt->sopt_name) {
4894		case SO_LINGER:
4895		case SO_LINGER_SEC:
4896			l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
4897			l.l_linger = (sopt->sopt_name == SO_LINGER) ?
4898			    so->so_linger : so->so_linger / hz;
4899			error = sooptcopyout(sopt, &l, sizeof (l));
4900			break;
4901
4902		case SO_USELOOPBACK:
4903		case SO_DONTROUTE:
4904		case SO_DEBUG:
4905		case SO_KEEPALIVE:
4906		case SO_REUSEADDR:
4907		case SO_REUSEPORT:
4908		case SO_BROADCAST:
4909		case SO_OOBINLINE:
4910		case SO_TIMESTAMP:
4911		case SO_TIMESTAMP_MONOTONIC:
4912		case SO_DONTTRUNC:
4913		case SO_WANTMORE:
4914		case SO_WANTOOBFLAG:
4915		case SO_NOWAKEFROMSLEEP:
4916			optval = so->so_options & sopt->sopt_name;
4917integer:
4918			error = sooptcopyout(sopt, &optval, sizeof (optval));
4919			break;
4920
4921		case SO_TYPE:
4922			optval = so->so_type;
4923			goto integer;
4924
4925		case SO_NREAD:
4926			if (so->so_proto->pr_flags & PR_ATOMIC) {
4927				int pkt_total;
4928				struct mbuf *m1;
4929
4930				pkt_total = 0;
4931				m1 = so->so_rcv.sb_mb;
4932				while (m1 != NULL) {
4933					if (m1->m_type == MT_DATA ||
4934					    m1->m_type == MT_HEADER ||
4935					    m1->m_type == MT_OOBDATA)
4936						pkt_total += m1->m_len;
4937					m1 = m1->m_next;
4938				}
4939				optval = pkt_total;
4940			} else {
4941				optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
4942			}
4943			goto integer;
4944
4945		case SO_NUMRCVPKT:
4946			if (so->so_proto->pr_flags & PR_ATOMIC) {
4947				int cnt = 0;
4948				struct mbuf *m1;
4949
4950				m1 = so->so_rcv.sb_mb;
4951				while (m1 != NULL) {
4952					if (m1->m_type == MT_DATA ||
4953					    m1->m_type == MT_HEADER ||
4954					    m1->m_type == MT_OOBDATA)
4955						cnt += 1;
4956					m1 = m1->m_nextpkt;
4957				}
4958				optval = cnt;
4959				goto integer;
4960			} else {
4961				error = EINVAL;
4962				break;
4963			}
4964
4965		case SO_NWRITE:
4966			optval = so->so_snd.sb_cc;
4967			goto integer;
4968
4969		case SO_ERROR:
4970			optval = so->so_error;
4971			so->so_error = 0;
4972			goto integer;
4973
4974		case SO_SNDBUF: {
4975			u_int32_t hiwat = so->so_snd.sb_hiwat;
4976
4977			if (so->so_snd.sb_flags & SB_UNIX) {
4978				struct unpcb *unp =
4979				    (struct unpcb *)(so->so_pcb);
4980				if (unp != NULL && unp->unp_conn != NULL) {
4981					hiwat += unp->unp_conn->unp_cc;
4982				}
4983			}
4984
4985			optval = hiwat;
4986			goto integer;
4987		}
4988		case SO_RCVBUF:
4989			optval = so->so_rcv.sb_hiwat;
4990			goto integer;
4991
4992		case SO_SNDLOWAT:
4993			optval = so->so_snd.sb_lowat;
4994			goto integer;
4995
4996		case SO_RCVLOWAT:
4997			optval = so->so_rcv.sb_lowat;
4998			goto integer;
4999
5000		case SO_SNDTIMEO:
5001		case SO_RCVTIMEO:
5002			tv = (sopt->sopt_name == SO_SNDTIMEO ?
5003			    so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
5004
5005			error = sooptcopyout_timeval(sopt, &tv);
5006			break;
5007
5008		case SO_NOSIGPIPE:
5009			optval = (so->so_flags & SOF_NOSIGPIPE);
5010			goto integer;
5011
5012		case SO_NOADDRERR:
5013			optval = (so->so_flags & SOF_NOADDRAVAIL);
5014			goto integer;
5015
5016		case SO_REUSESHAREUID:
5017			optval = (so->so_flags & SOF_REUSESHAREUID);
5018			goto integer;
5019
5021		case SO_NOTIFYCONFLICT:
5022			optval = (so->so_flags & SOF_NOTIFYCONFLICT);
5023			goto integer;
5024
5025		case SO_RESTRICTIONS:
5026			optval = so_get_restrictions(so);
5027			goto integer;
5028
5029		case SO_AWDL_UNRESTRICTED:
5030			if (SOCK_DOM(so) == PF_INET ||
5031			    SOCK_DOM(so) == PF_INET6) {
5032				optval = inp_get_awdl_unrestricted(
5033				    sotoinpcb(so));
5034				goto integer;
5035			} else
5036				error = EOPNOTSUPP;
5037			break;
5038
5039		case SO_LABEL:
5040#if CONFIG_MACF_SOCKET
5041			if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
5042			    sizeof (extmac))) != 0 ||
5043			    (error = mac_socket_label_get(proc_ucred(
5044			    sopt->sopt_p), so, &extmac)) != 0)
5045				break;
5046
5047			error = sooptcopyout(sopt, &extmac, sizeof (extmac));
5048#else
5049			error = EOPNOTSUPP;
5050#endif /* MAC_SOCKET */
5051			break;
5052
5053		case SO_PEERLABEL:
5054#if CONFIG_MACF_SOCKET
5055			if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
5056			    sizeof (extmac))) != 0 ||
5057			    (error = mac_socketpeer_label_get(proc_ucred(
5058			    sopt->sopt_p), so, &extmac)) != 0)
5059				break;
5060
5061			error = sooptcopyout(sopt, &extmac, sizeof (extmac));
5062#else
5063			error = EOPNOTSUPP;
5064#endif /* MAC_SOCKET */
5065			break;
5066
5067#ifdef __APPLE_API_PRIVATE
5068		case SO_UPCALLCLOSEWAIT:
5069			optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
5070			goto integer;
5071#endif
5072		case SO_RANDOMPORT:
5073			optval = (so->so_flags & SOF_BINDRANDOMPORT);
5074			goto integer;
5075
5076		case SO_NP_EXTENSIONS: {
5077			struct so_np_extensions sonpx;
5078
5079			sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
5080			    SONPX_SETOPTSHUT : 0;
5081			sonpx.npx_mask = SONPX_MASK_VALID;
5082
5083			error = sooptcopyout(sopt, &sonpx,
5084			    sizeof (struct so_np_extensions));
5085			break;
5086		}
5087
5088		case SO_TRAFFIC_CLASS:
5089			optval = so->so_traffic_class;
5090			goto integer;
5091
5092		case SO_RECV_TRAFFIC_CLASS:
5093			optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
5094			goto integer;
5095
5096		case SO_TRAFFIC_CLASS_STATS:
5097			error = sooptcopyout(sopt, &so->so_tc_stats,
5098			    sizeof (so->so_tc_stats));
5099			break;
5100
5101		case SO_TRAFFIC_CLASS_DBG:
5102			error = sogetopt_tcdbg(so, sopt);
5103			break;
5104
5105		case SO_PRIVILEGED_TRAFFIC_CLASS:
5106			optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
5107			goto integer;
5108
5109		case SO_DEFUNCTOK:
5110			optval = !(so->so_flags & SOF_NODEFUNCT);
5111			goto integer;
5112
5113		case SO_ISDEFUNCT:
5114			optval = (so->so_flags & SOF_DEFUNCT);
5115			goto integer;
5116
5117		case SO_OPPORTUNISTIC:
5118			optval = so_get_opportunistic(so);
5119			goto integer;
5120
5121		case SO_FLUSH:
5122			/* This option is not gettable */
5123			error = EINVAL;
5124			break;
5125
5126		case SO_RECV_ANYIF:
5127			optval = so_get_recv_anyif(so);
5128			goto integer;
5129
5130		case SO_TRAFFIC_MGT_BACKGROUND:
5131			/* This option is handled by lower layer(s) */
5132			if (so->so_proto != NULL &&
5133			    so->so_proto->pr_ctloutput != NULL) {
5134				(void) so->so_proto->pr_ctloutput(so, sopt);
5135			}
5136			break;
5137
5138#if FLOW_DIVERT
5139		case SO_FLOW_DIVERT_TOKEN:
5140			error = flow_divert_token_get(so, sopt);
5141			break;
5142#endif	/* FLOW_DIVERT */
5143
5144#if NECP
5145		case SO_NECP_ATTRIBUTES:
5146			error = necp_get_socket_attributes(so, sopt);
5147			break;
5148#endif /* NECP */
5149
5150#if CONTENT_FILTER
5151		case SO_CFIL_SOCK_ID: {
5152			cfil_sock_id_t sock_id;
5153
5154			sock_id = cfil_sock_id_from_socket(so);
5155
5156			error = sooptcopyout(sopt, &sock_id,
5157				sizeof(cfil_sock_id_t));
5158			break;
5159		}
5160#endif	/* CONTENT_FILTER */
5161
5162#if MPTCP
5163		case SO_MPTCP_FASTJOIN:
5164			if (!((so->so_flags & SOF_MP_SUBFLOW) ||
5165			    ((SOCK_CHECK_DOM(so, PF_MULTIPATH)) &&
5166			    (SOCK_CHECK_PROTO(so, IPPROTO_TCP))))) {
5167				error = ENOPROTOOPT;
5168				break;
5169			}
5170			optval = (so->so_flags & SOF_MPTCP_FASTJOIN);
5171			break;
5172#endif /* MPTCP */
5173
5174		default:
5175			error = ENOPROTOOPT;
5176			break;
5177		}
5178	}
5179out:
5180	if (dolock)
5181		socket_unlock(so, 1);
5182	return (error);
5183}
5184
5185/*
5186 * The size limit on our soopt_getm() is different from that on FreeBSD.
5187 * We limit the size of options to MCLBYTES.  This will have to change
5188 * if we ever need to define options that require more space than MCLBYTES.
5189 */
5190int
5191soopt_getm(struct sockopt *sopt, struct mbuf **mp)
5192{
5193	struct mbuf *m, *m_prev;
5194	int sopt_size = sopt->sopt_valsize;
5195	int how;
5196
5197	if (sopt_size <= 0 || sopt_size > MCLBYTES)
5198		return (EMSGSIZE);
5199
5200	how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
5201	MGET(m, how, MT_DATA);
5202	if (m == NULL)
5203		return (ENOBUFS);
5204	if (sopt_size > MLEN) {
5205		MCLGET(m, how);
5206		if ((m->m_flags & M_EXT) == 0) {
5207			m_free(m);
5208			return (ENOBUFS);
5209		}
5210		m->m_len = min(MCLBYTES, sopt_size);
5211	} else {
5212		m->m_len = min(MLEN, sopt_size);
5213	}
5214	sopt_size -= m->m_len;
5215	*mp = m;
5216	m_prev = m;
5217
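	/*
	 * With the MCLBYTES cap enforced above, the first mbuf (backed by a
	 * cluster when needed) already holds the entire option value, so this
	 * loop does not normally execute; it only matters if the size limit
	 * is ever raised.
	 */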
5218	while (sopt_size > 0) {
5219		MGET(m, how, MT_DATA);
5220		if (m == NULL) {
5221			m_freem(*mp);
5222			return (ENOBUFS);
5223		}
5224		if (sopt_size > MLEN) {
5225			MCLGET(m, how);
5226			if ((m->m_flags & M_EXT) == 0) {
5227				m_freem(*mp);
5228				m_freem(m);
5229				return (ENOBUFS);
5230			}
5231			m->m_len = min(MCLBYTES, sopt_size);
5232		} else {
5233			m->m_len = min(MLEN, sopt_size);
5234		}
5235		sopt_size -= m->m_len;
5236		m_prev->m_next = m;
5237		m_prev = m;
5238	}
5239	return (0);
5240}
5241
5242/* copyin sopt data into mbuf chain */
5243int
5244soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
5245{
5246	struct mbuf *m0 = m;
5247
5248	if (sopt->sopt_val == USER_ADDR_NULL)
5249		return (0);
5250	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
5251		if (sopt->sopt_p != kernproc) {
5252			int error;
5253
5254			error = copyin(sopt->sopt_val, mtod(m, char *),
5255			    m->m_len);
5256			if (error != 0) {
5257				m_freem(m0);
5258				return (error);
5259			}
5260		} else {
5261			bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
5262			    mtod(m, char *), m->m_len);
5263		}
5264		sopt->sopt_valsize -= m->m_len;
5265		sopt->sopt_val += m->m_len;
5266		m = m->m_next;
5267	}
5268	/* enough mbuf space should have been allocated at ip6_sooptmcopyin() */
5269	if (m != NULL) {
5270		panic("soopt_mcopyin");
5271		/* NOTREACHED */
5272	}
5273	return (0);
5274}
5275
5276/* copyout mbuf chain data into soopt */
5277int
5278soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
5279{
5280	struct mbuf *m0 = m;
5281	size_t valsize = 0;
5282
5283	if (sopt->sopt_val == USER_ADDR_NULL)
5284		return (0);
5285	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
5286		if (sopt->sopt_p != kernproc) {
5287			int error;
5288
5289			error = copyout(mtod(m, char *), sopt->sopt_val,
5290			    m->m_len);
5291			if (error != 0) {
5292				m_freem(m0);
5293				return (error);
5294			}
5295		} else {
5296			bcopy(mtod(m, char *),
5297			    CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
5298		}
5299		sopt->sopt_valsize -= m->m_len;
5300		sopt->sopt_val += m->m_len;
5301		valsize += m->m_len;
5302		m = m->m_next;
5303	}
5304	if (m != NULL) {
5305		/* the caller should have supplied a large enough soopt buffer */
5306		m_freem(m0);
5307		return (EINVAL);
5308	}
5309	sopt->sopt_valsize = valsize;
5310	return (0);
5311}
5312
5313void
5314sohasoutofband(struct socket *so)
5315{
5316	if (so->so_pgid < 0)
5317		gsignal(-so->so_pgid, SIGURG);
5318	else if (so->so_pgid > 0)
5319		proc_signal(so->so_pgid, SIGURG);
5320	selwakeup(&so->so_rcv.sb_sel);
5321}
5322
5323int
5324sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql)
5325{
5326#pragma unused(cred)
5327	struct proc *p = current_proc();
5328	int revents = 0;
5329
5330	socket_lock(so, 1);
5331	so_update_last_owner_locked(so, PROC_NULL);
5332	so_update_policy(so);
5333
5334	if (events & (POLLIN | POLLRDNORM))
5335		if (soreadable(so))
5336			revents |= events & (POLLIN | POLLRDNORM);
5337
5338	if (events & (POLLOUT | POLLWRNORM))
5339		if (sowriteable(so))
5340			revents |= events & (POLLOUT | POLLWRNORM);
5341
5342	if (events & (POLLPRI | POLLRDBAND))
5343		if (so->so_oobmark || (so->so_state & SS_RCVATMARK))
5344			revents |= events & (POLLPRI | POLLRDBAND);
5345
5346	if (revents == 0) {
5347		if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
5348			/*
5349			 * Darwin sets the flag first,
5350			 * BSD calls selrecord first
5351			 */
5352			so->so_rcv.sb_flags |= SB_SEL;
5353			selrecord(p, &so->so_rcv.sb_sel, wql);
5354		}
5355
5356		if (events & (POLLOUT | POLLWRNORM)) {
5357			/*
5358			 * Darwin sets the flag first,
5359			 * BSD calls selrecord first
5360			 */
5361			so->so_snd.sb_flags |= SB_SEL;
5362			selrecord(p, &so->so_snd.sb_sel, wql);
5363		}
5364	}
5365
5366	socket_unlock(so, 1);
5367	return (revents);
5368}
5369
5370int
5371soo_kqfilter(struct fileproc *fp, struct knote *kn, vfs_context_t ctx)
5372{
5373#pragma unused(fp)
5374#if !CONFIG_MACF_SOCKET
5375#pragma unused(ctx)
5376#endif /* MAC_SOCKET */
5377	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
5378	struct klist *skl;
5379
5380	socket_lock(so, 1);
5381	so_update_last_owner_locked(so, PROC_NULL);
5382	so_update_policy(so);
5383
5384#if CONFIG_MACF_SOCKET
5385	if (mac_socket_check_kqfilter(proc_ucred(vfs_context_proc(ctx)),
5386	    kn, so) != 0) {
5387		socket_unlock(so, 1);
5388		return (1);
5389	}
5390#endif /* MAC_SOCKET */
5391
5392	switch (kn->kn_filter) {
5393	case EVFILT_READ:
5394		kn->kn_fop = &soread_filtops;
5395		skl = &so->so_rcv.sb_sel.si_note;
5396		break;
5397	case EVFILT_WRITE:
5398		kn->kn_fop = &sowrite_filtops;
5399		skl = &so->so_snd.sb_sel.si_note;
5400		break;
5401	case EVFILT_SOCK:
5402		kn->kn_fop = &sock_filtops;
5403		skl = &so->so_klist;
5404		break;
5405	default:
5406		socket_unlock(so, 1);
5407		return (1);
5408	}
5409
5410	if (KNOTE_ATTACH(skl, kn)) {
5411		switch (kn->kn_filter) {
5412		case EVFILT_READ:
5413			so->so_rcv.sb_flags |= SB_KNOTE;
5414			break;
5415		case EVFILT_WRITE:
5416			so->so_snd.sb_flags |= SB_KNOTE;
5417			break;
5418		case EVFILT_SOCK:
5419			so->so_flags |= SOF_KNOTE;
5420			break;
5421		default:
5422			socket_unlock(so, 1);
5423			return (1);
5424		}
5425	}
5426	socket_unlock(so, 1);
5427	return (0);
5428}
5429
5430static void
5431filt_sordetach(struct knote *kn)
5432{
5433	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
5434
5435	socket_lock(so, 1);
5436	if (so->so_rcv.sb_flags & SB_KNOTE)
5437		if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn))
5438			so->so_rcv.sb_flags &= ~SB_KNOTE;
5439	socket_unlock(so, 1);
5440}
5441
5442/*ARGSUSED*/
5443static int
5444filt_soread(struct knote *kn, long hint)
5445{
5446	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
5447
5448	if ((hint & SO_FILT_HINT_LOCKED) == 0)
5449		socket_lock(so, 1);
5450
5451	if (so->so_options & SO_ACCEPTCONN) {
5452		int isempty;
5453
5454		/*
5455		 * Radar 6615193: handle the listen case dynamically
5456		 * for the kqueue read filter.  This allows listen() to be
5457		 * called after the kqueue EVFILT_READ is registered.
5458		 */
5459
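		/*
		 * Report the current listen queue depth in kn_data and fire
		 * when the completed connection queue is non-empty, i.e.
		 * when there is a connection ready to be accepted.  (Despite
		 * its name, "isempty" below is set when so_comp is not
		 * empty.)
		 */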
5460		kn->kn_data = so->so_qlen;
5461		isempty = ! TAILQ_EMPTY(&so->so_comp);
5462
5463		if ((hint & SO_FILT_HINT_LOCKED) == 0)
5464			socket_unlock(so, 1);
5465
5466		return (isempty);
5467	}
5468
5469	/* socket isn't a listener */
5470
5471	kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
5472
5473	if (so->so_oobmark) {
5474		if (kn->kn_flags & EV_OOBAND) {
5475			kn->kn_data -= so->so_oobmark;
5476			if ((hint & SO_FILT_HINT_LOCKED) == 0)
5477				socket_unlock(so, 1);
5478			return (1);
5479		}
5480		kn->kn_data = so->so_oobmark;
5481		kn->kn_flags |= EV_OOBAND;
5482	} else {
5483		if ((so->so_state & SS_CANTRCVMORE)
5484#if CONTENT_FILTER
5485		&& cfil_sock_data_pending(&so->so_rcv) == 0
5486#endif /* CONTENT_FILTER */
5487		) {
5488			kn->kn_flags |= EV_EOF;
5489			kn->kn_fflags = so->so_error;
5490			if ((hint & SO_FILT_HINT_LOCKED) == 0)
5491				socket_unlock(so, 1);
5492			return (1);
5493		}
5494	}
5495
5496	if (so->so_state & SS_RCVATMARK) {
5497		if (kn->kn_flags & EV_OOBAND) {
5498			if ((hint & SO_FILT_HINT_LOCKED) == 0)
5499				socket_unlock(so, 1);
5500			return (1);
5501		}
5502		kn->kn_flags |= EV_OOBAND;
5503	} else if (kn->kn_flags & EV_OOBAND) {
5504		kn->kn_data = 0;
5505		if ((hint & SO_FILT_HINT_LOCKED) == 0)
5506			socket_unlock(so, 1);
5507		return (0);
5508	}
5509
5510	if (so->so_error) {	/* temporary udp error */
5511		if ((hint & SO_FILT_HINT_LOCKED) == 0)
5512			socket_unlock(so, 1);
5513		return (1);
5514	}
5515
5516	int64_t	lowwat = so->so_rcv.sb_lowat;
5517	if (kn->kn_sfflags & NOTE_LOWAT) {
5518		if (kn->kn_sdata > so->so_rcv.sb_hiwat)
5519			lowwat = so->so_rcv.sb_hiwat;
5520		else if (kn->kn_sdata > lowwat)
5521			lowwat = kn->kn_sdata;
5522	}
5523
5524	if ((hint & SO_FILT_HINT_LOCKED) == 0)
5525		socket_unlock(so, 1);
5526
5527	return ((kn->kn_flags & EV_OOBAND) || kn->kn_data >= lowwat);
5528}
5529
5530static void
5531filt_sowdetach(struct knote *kn)
5532{
5533	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
5534	socket_lock(so, 1);
5535
5536	if (so->so_snd.sb_flags & SB_KNOTE)
5537		if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn))
5538			so->so_snd.sb_flags &= ~SB_KNOTE;
5539	socket_unlock(so, 1);
5540}
5541
5542int
5543so_wait_for_if_feedback(struct socket *so)
5544{
5545	if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
5546	    (so->so_state & SS_ISCONNECTED)) {
5547		struct inpcb *inp = sotoinpcb(so);
5548		if (INP_WAIT_FOR_IF_FEEDBACK(inp))
5549			return (1);
5550	}
5551	return (0);
5552}
5553
5554/*ARGSUSED*/
5555static int
5556filt_sowrite(struct knote *kn, long hint)
5557{
5558	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
5559	int ret = 0;
5560
5561	if ((hint & SO_FILT_HINT_LOCKED) == 0)
5562		socket_lock(so, 1);
5563
5564	kn->kn_data = sbspace(&so->so_snd);
5565	if (so->so_state & SS_CANTSENDMORE) {
5566		kn->kn_flags |= EV_EOF;
5567		kn->kn_fflags = so->so_error;
5568		ret = 1;
5569		goto out;
5570	}
5571	if (so->so_error) {	/* temporary udp error */
5572		ret = 1;
5573		goto out;
5574	}
5575	if (((so->so_state & SS_ISCONNECTED) == 0) &&
5576	    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
5577		ret = 0;
5578		goto out;
5579	}
5580	int64_t	lowwat = so->so_snd.sb_lowat;
5581	if (kn->kn_sfflags & NOTE_LOWAT) {
5582		if (kn->kn_sdata > so->so_snd.sb_hiwat)
5583			lowwat = so->so_snd.sb_hiwat;
5584		else if (kn->kn_sdata > lowwat)
5585			lowwat = kn->kn_sdata;
5586	}
5587	if (kn->kn_data >= lowwat) {
5588		if (so->so_flags & SOF_NOTSENT_LOWAT) {
5589			if ((SOCK_DOM(so) == PF_INET
5590			    || SOCK_DOM(so) == PF_INET6)
5591			    && so->so_type == SOCK_STREAM) {
5592				ret = tcp_notsent_lowat_check(so);
5593			}
5594#if MPTCP
5595			else if ((SOCK_DOM(so) == PF_MULTIPATH) &&
5596			    (SOCK_PROTO(so) == IPPROTO_TCP)) {
5597				ret = mptcp_notsent_lowat_check(so);
5598			}
5599#endif
5600			else {
5601				ret = 1;
5602			}
5603		} else {
5604			ret = 1;
5605		}
5606	}
5607	if (so_wait_for_if_feedback(so))
5608		ret = 0;
5609out:
5610	if ((hint & SO_FILT_HINT_LOCKED) == 0)
5611		socket_unlock(so, 1);
5612	return (ret);
5613}
5614
5615static void
5616filt_sockdetach(struct knote *kn)
5617{
5618	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
5619	socket_lock(so, 1);
5620
5621	if ((so->so_flags & SOF_KNOTE) != 0)
5622		if (KNOTE_DETACH(&so->so_klist, kn))
5623			so->so_flags &= ~SOF_KNOTE;
5624	socket_unlock(so, 1);
5625}
5626
5627static int
5628filt_sockev(struct knote *kn, long hint)
5629{
5630	int ret = 0, locked = 0;
5631	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
5632	long ev_hint = (hint & SO_FILT_HINT_EV);
5633
5634	if ((hint & SO_FILT_HINT_LOCKED) == 0) {
5635		socket_lock(so, 1);
5636		locked = 1;
5637	}
5638
5639	if (ev_hint & SO_FILT_HINT_CONNRESET) {
5640		if (kn->kn_sfflags & NOTE_CONNRESET)
5641			kn->kn_fflags |= NOTE_CONNRESET;
5642	}
5643	if (ev_hint & SO_FILT_HINT_TIMEOUT) {
5644		if (kn->kn_sfflags & NOTE_TIMEOUT)
5645			kn->kn_fflags |= NOTE_TIMEOUT;
5646	}
5647	if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
5648		if (kn->kn_sfflags & NOTE_NOSRCADDR)
5649			kn->kn_fflags |= NOTE_NOSRCADDR;
5650	}
5651	if (ev_hint & SO_FILT_HINT_IFDENIED) {
5652		if ((kn->kn_sfflags & NOTE_IFDENIED))
5653			kn->kn_fflags |= NOTE_IFDENIED;
5654	}
5655	if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
5656		if (kn->kn_sfflags & NOTE_KEEPALIVE)
5657			kn->kn_fflags |= NOTE_KEEPALIVE;
5658	}
5659	if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
5660		if (kn->kn_sfflags & NOTE_ADAPTIVE_WTIMO)
5661			kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
5662	}
5663	if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
5664		if (kn->kn_sfflags & NOTE_ADAPTIVE_RTIMO)
5665			kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
5666	}
5667	if (ev_hint & SO_FILT_HINT_CONNECTED) {
5668		if (kn->kn_sfflags & NOTE_CONNECTED)
5669			kn->kn_fflags |= NOTE_CONNECTED;
5670	}
5671	if (ev_hint & SO_FILT_HINT_DISCONNECTED) {
5672		if (kn->kn_sfflags & NOTE_DISCONNECTED)
5673			kn->kn_fflags |= NOTE_DISCONNECTED;
5674	}
5675	if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
5676		if (so->so_proto != NULL &&
5677		    (so->so_proto->pr_flags & PR_EVCONNINFO) &&
5678		    (kn->kn_sfflags & NOTE_CONNINFO_UPDATED))
5679			kn->kn_fflags |= NOTE_CONNINFO_UPDATED;
5680	}
5681
5682	if ((kn->kn_sfflags & NOTE_READCLOSED) &&
5683	    (so->so_state & SS_CANTRCVMORE)
5684#if CONTENT_FILTER
5685		&& cfil_sock_data_pending(&so->so_rcv) == 0
5686#endif /* CONTENT_FILTER */
5687		)
5688		kn->kn_fflags |= NOTE_READCLOSED;
5689
5690	if ((kn->kn_sfflags & NOTE_WRITECLOSED) &&
5691	    (so->so_state & SS_CANTSENDMORE))
5692		kn->kn_fflags |= NOTE_WRITECLOSED;
5693
5694	if ((kn->kn_sfflags & NOTE_SUSPEND) &&
5695	    ((ev_hint & SO_FILT_HINT_SUSPEND) ||
5696	    (so->so_flags & SOF_SUSPENDED))) {
5697		kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
5698		kn->kn_fflags |= NOTE_SUSPEND;
5699	}
5700
5701	if ((kn->kn_sfflags & NOTE_RESUME) &&
5702	    ((ev_hint & SO_FILT_HINT_RESUME) ||
5703	    (so->so_flags & SOF_SUSPENDED) == 0)) {
5704		kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
5705		kn->kn_fflags |= NOTE_RESUME;
5706	}
5707
5708	if (so->so_error != 0) {
5709		ret = 1;
5710		kn->kn_data = so->so_error;
5711		kn->kn_flags |= EV_EOF;
5712	} else {
5713		get_sockev_state(so, (u_int32_t *)&(kn->kn_data));
5714	}
5715
5716	if (kn->kn_fflags != 0)
5717		ret = 1;
5718
5719	if (locked)
5720		socket_unlock(so, 1);
5721
5722	return (ret);
5723}
5724
5725void
5726get_sockev_state(struct socket *so, u_int32_t *statep)
5727{
5728	u_int32_t state = *(statep);
5729
5730	if (so->so_state & SS_ISCONNECTED)
5731		state |= SOCKEV_CONNECTED;
5732	else
5733		state &= ~(SOCKEV_CONNECTED);
5734	state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0);
5735	*(statep) = state;
5736}
5737
5738#define	SO_LOCK_HISTORY_STR_LEN \
5739	(2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)
5740
5741__private_extern__ const char *
5742solockhistory_nr(struct socket *so)
5743{
5744	size_t n = 0;
5745	int i;
5746	static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];
5747
5748	bzero(lock_history_str, sizeof (lock_history_str));
5749	for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
5750		n += snprintf(lock_history_str + n,
5751		    SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
5752		    so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
5753		    so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
5754	}
5755	return (lock_history_str);
5756}
5757
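/*
 * Lock a socket: use the protocol's pr_lock routine if it provides one,
 * otherwise take the domain mutex directly.  In the latter case, when
 * "refcount" is non-zero the socket's use count is bumped so it cannot
 * be freed out from under the caller, and the caller's return address
 * is recorded for lock debugging (see solockhistory_nr() above).
 */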
5758int
5759socket_lock(struct socket *so, int refcount)
5760{
5761	int error = 0;
5762	void *lr_saved;
5763
5764	lr_saved = __builtin_return_address(0);
5765
5766	if (so->so_proto->pr_lock) {
5767		error = (*so->so_proto->pr_lock)(so, refcount, lr_saved);
5768	} else {
5769#ifdef MORE_LOCKING_DEBUG
5770		lck_mtx_assert(so->so_proto->pr_domain->dom_mtx,
5771		    LCK_MTX_ASSERT_NOTOWNED);
5772#endif
5773		lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
5774		if (refcount)
5775			so->so_usecount++;
5776		so->lock_lr[so->next_lock_lr] = lr_saved;
5777		so->next_lock_lr = (so->next_lock_lr+1) % SO_LCKDBG_MAX;
5778	}
5779
5780	return (error);
5781}
5782
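/*
 * Unlock a socket previously locked by socket_lock().  When "refcount"
 * is non-zero the use count taken at lock time is dropped; in the
 * non-protocol-lock path, dropping the last reference causes
 * sofreelastref() to dispose of the socket before the domain mutex is
 * released.
 */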
5783int
5784socket_unlock(struct socket *so, int refcount)
5785{
5786	int error = 0;
5787	void *lr_saved;
5788	lck_mtx_t *mutex_held;
5789
5790	lr_saved = __builtin_return_address(0);
5791
5792	if (so->so_proto == NULL) {
5793		panic("%s: null so_proto so=%p\n", __func__, so);
5794		/* NOTREACHED */
5795	}
5796
5797	if (so && so->so_proto->pr_unlock) {
5798		error = (*so->so_proto->pr_unlock)(so, refcount, lr_saved);
5799	} else {
5800		mutex_held = so->so_proto->pr_domain->dom_mtx;
5801#ifdef MORE_LOCKING_DEBUG
5802		lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
5803#endif
5804		so->unlock_lr[so->next_unlock_lr] = lr_saved;
5805		so->next_unlock_lr = (so->next_unlock_lr+1) % SO_LCKDBG_MAX;
5806
5807		if (refcount) {
5808			if (so->so_usecount <= 0) {
5809				panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
5810				    "lrh=%s", __func__, so->so_usecount, so,
5811				    SOCK_DOM(so), so->so_type,
5812				    SOCK_PROTO(so), solockhistory_nr(so));
5813				/* NOTREACHED */
5814			}
5815
5816			so->so_usecount--;
5817			if (so->so_usecount == 0)
5818				sofreelastref(so, 1);
5819		}
5820		lck_mtx_unlock(mutex_held);
5821	}
5822
5823	return (error);
5824}
5825
5826/* Called with socket locked, will unlock socket */
5827void
5828sofree(struct socket *so)
5829{
5830	lck_mtx_t *mutex_held;
5831
5832	if (so->so_proto->pr_getlock != NULL)
5833		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
5834	else
5835		mutex_held = so->so_proto->pr_domain->dom_mtx;
5836	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
5837
5838	sofreelastref(so, 0);
5839}
5840
5841void
5842soreference(struct socket *so)
5843{
5844	socket_lock(so, 1);	/* lock and take one reference on the socket */
5845	socket_unlock(so, 0);	/* unlock only */
5846}
5847
5848void
5849sodereference(struct socket *so)
5850{
5851	socket_lock(so, 0);
5852	socket_unlock(so, 1);
5853}
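
/*
 * Usage sketch (illustrative only; hypothetical caller).
 * soreference()/sodereference() keep a socket alive across a region
 * where its lock cannot be held, for example around a call-out that
 * may block:
 *
 *	soreference(so);	(so_usecount is now pinned above zero)
 *	... call out with the socket unlocked ...
 *	sodereference(so);	(drop the extra reference again)
 */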
5854
5855/*
5856 * Set or clear SOF_MULTIPAGES on the socket to enable or disable
5857 * the possibility of using jumbo clusters.  The caller must hold
5858 * the socket lock.
5859 */
5860void
5861somultipages(struct socket *so, boolean_t set)
5862{
5863	if (set)
5864		so->so_flags |= SOF_MULTIPAGES;
5865	else
5866		so->so_flags &= ~SOF_MULTIPAGES;
5867}
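
/*
 * Usage sketch (illustrative only; a hypothetical attach path that has
 * verified jumbo-cluster support, not a call site from this file):
 *
 *	socket_lock(so, 1);
 *	somultipages(so, TRUE);
 *	socket_unlock(so, 1);
 */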
5868
5869void
5870soif2kcl(struct socket *so, boolean_t set)
5871{
5872	if (set)
5873		so->so_flags1 |= SOF1_IF_2KCL;
5874	else
5875		so->so_flags1 &= ~SOF1_IF_2KCL;
5876}
5877
5878int
5879so_isdstlocal(struct socket *so)
5880{
5881	struct inpcb *inp = (struct inpcb *)so->so_pcb;
5882
5883	if (SOCK_DOM(so) == PF_INET)
5884		return (inaddr_local(inp->inp_faddr));
5885	else if (SOCK_DOM(so) == PF_INET6)
5886		return (in6addr_local(&inp->in6p_faddr));
5887
5888	return (0);
5889}
5890
5891int
5892sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
5893{
5894	struct sockbuf *rcv, *snd;
5895	int err = 0, defunct;
5896
5897	rcv = &so->so_rcv;
5898	snd = &so->so_snd;
5899
5900	defunct = (so->so_flags & SOF_DEFUNCT);
5901	if (defunct) {
5902		if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) {
5903			panic("%s: SB_DROP not set", __func__);
5904			/* NOTREACHED */
5905		}
5906		goto done;
5907	}
5908
5909	if (so->so_flags & SOF_NODEFUNCT) {
5910		if (noforce) {
5911			err = EOPNOTSUPP;
5912			SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) "
5913			    "so 0x%llx [%d,%d] is not eligible for defunct "
5914			    "(%d)\n", __func__, proc_selfpid(), proc_pid(p),
5915			    level, (uint64_t)VM_KERNEL_ADDRPERM(so),
5916			    SOCK_DOM(so), SOCK_TYPE(so), err));
5917			return (err);
5918		}
5919		so->so_flags &= ~SOF_NODEFUNCT;
5920		SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx "
5921		    "[%d,%d] defunct by force\n", __func__, proc_selfpid(),
5922		    proc_pid(p), level, (uint64_t)VM_KERNEL_ADDRPERM(so),
5923		    SOCK_DOM(so), SOCK_TYPE(so)));
5924	}
5925
5926	so->so_flags |= SOF_DEFUNCT;
5927
5928	/* Prevent further data from being appended to the socket buffers */
5929	snd->sb_flags |= SB_DROP;
5930	rcv->sb_flags |= SB_DROP;
5931
5932	/* Flush any existing data in the socket buffers */
5933	if (rcv->sb_cc != 0) {
5934		rcv->sb_flags &= ~SB_SEL;
5935		selthreadclear(&rcv->sb_sel);
5936		sbrelease(rcv);
5937	}
5938	if (snd->sb_cc != 0) {
5939		snd->sb_flags &= ~SB_SEL;
5940		selthreadclear(&snd->sb_sel);
5941		sbrelease(snd);
5942	}
5943
5944done:
5945	SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx [%d,%d] %s "
5946	    "defunct\n", __func__, proc_selfpid(), proc_pid(p), level,
5947	    (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so), SOCK_TYPE(so),
5948	    defunct ? "is already" : "marked as"));
5949
5950	return (err);
5951}
5952
5953int
5954sodefunct(struct proc *p, struct socket *so, int level)
5955{
5956	struct sockbuf *rcv, *snd;
5957
5958	if (!(so->so_flags & SOF_DEFUNCT)) {
5959		panic("%s improperly called", __func__);
5960		/* NOTREACHED */
5961	}
5962	if (so->so_state & SS_DEFUNCT)
5963		goto done;
5964
5965	rcv = &so->so_rcv;
5966	snd = &so->so_snd;
5967
5968	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
5969		char s[MAX_IPv6_STR_LEN];
5970		char d[MAX_IPv6_STR_LEN];
5971		struct inpcb *inp = sotoinpcb(so);
5972
5973		SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx [%s "
5974		    "%s:%d -> %s:%d] is now defunct [rcv_si 0x%x, snd_si 0x%x, "
5975		    "rcv_fl 0x%x, snd_fl 0x%x]\n", __func__, proc_selfpid(),
5976		    proc_pid(p), level, (uint64_t)VM_KERNEL_ADDRPERM(so),
5977		    (SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
5978		    inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
5979		    (void *)&inp->inp_laddr.s_addr : (void *)&inp->in6p_laddr),
5980		    s, sizeof (s)), ntohs(inp->in6p_lport),
5981		    inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
5982		    (void *)&inp->inp_faddr.s_addr : (void *)&inp->in6p_faddr,
5983		    d, sizeof (d)), ntohs(inp->in6p_fport),
5984		    (uint32_t)rcv->sb_sel.si_flags,
5985		    (uint32_t)snd->sb_sel.si_flags,
5986		    rcv->sb_flags, snd->sb_flags));
5987	} else {
5988		SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx "
5989		    "[%d,%d] is now defunct [rcv_si 0x%x, snd_si 0x%x, "
5990		    "rcv_fl 0x%x, snd_fl 0x%x]\n", __func__, proc_selfpid(),
5991		    proc_pid(p), level, (uint64_t)VM_KERNEL_ADDRPERM(so),
5992		    SOCK_DOM(so), SOCK_TYPE(so), (uint32_t)rcv->sb_sel.si_flags,
5993		    (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
5994		    snd->sb_flags));
5995	}
5996
5997	/*
5998	 * Unwedge threads blocked on sbwait() and sb_lock().
5999	 */
6000	sbwakeup(rcv);
6001	sbwakeup(snd);
6002
6003	so->so_flags1 |= SOF1_DEFUNCTINPROG;
6004	if (rcv->sb_flags & SB_LOCK)
6005		sbunlock(rcv, TRUE);	/* keep socket locked */
6006	if (snd->sb_flags & SB_LOCK)
6007		sbunlock(snd, TRUE);	/* keep socket locked */
6008
6009	/*
6010	 * Flush the buffers and disconnect.  We explicitly call shutdown
6011	 * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
6012	 * states are set for the socket.  This would also flush out data
6013	 * hanging off the receive list of this socket.
6014	 */
6015	(void) soshutdownlock_final(so, SHUT_RD);
6016	(void) soshutdownlock_final(so, SHUT_WR);
6017	(void) sodisconnectlocked(so);
6018
6019	/*
6020	 * Explicitly handle connectionless-protocol disconnection
6021	 * and release any remaining data in the socket buffers.
6022	 */
6023	if (!(so->so_state & SS_ISDISCONNECTED))
6024		(void) soisdisconnected(so);
6025
6026	if (so->so_error == 0)
6027		so->so_error = EBADF;
6028
6029	if (rcv->sb_cc != 0) {
6030		rcv->sb_flags &= ~SB_SEL;
6031		selthreadclear(&rcv->sb_sel);
6032		sbrelease(rcv);
6033	}
6034	if (snd->sb_cc != 0) {
6035		snd->sb_flags &= ~SB_SEL;
6036		selthreadclear(&snd->sb_sel);
6037		sbrelease(snd);
6038	}
6039	so->so_state |= SS_DEFUNCT;
6040
6041done:
6042	return (0);
6043}
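
/*
 * Usage sketch (illustrative only; "p" and "level" stand for whatever
 * process and defunct level a hypothetical caller acts on).  Defuncting
 * is a two-step sequence performed with the socket locked: the first
 * step marks the socket and arms SB_DROP, the second flushes, shuts
 * down and disconnects it:
 *
 *	socket_lock(so, 1);
 *	error = sosetdefunct(p, so, level, TRUE);
 *	if (error == 0)
 *		error = sodefunct(p, so, level);
 *	socket_unlock(so, 1);
 *
 * With noforce == TRUE, sosetdefunct() refuses SOF_NODEFUNCT sockets
 * and returns EOPNOTSUPP instead of defuncting them by force.
 */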
6044
6045__private_extern__ int
6046so_set_recv_anyif(struct socket *so, int optval)
6047{
6048	int ret = 0;
6049
6050#if INET6
6051	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6052#else
6053	if (SOCK_DOM(so) == PF_INET) {
6054#endif /* !INET6 */
6055		if (optval)
6056			sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
6057		else
6058			sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
6059	}
6060
6061	return (ret);
6062}
6063
6064__private_extern__ int
6065so_get_recv_anyif(struct socket *so)
6066{
6067	int ret = 0;
6068
6069#if INET6
6070	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6071#else
6072	if (SOCK_DOM(so) == PF_INET) {
6073#endif /* !INET6 */
6074		ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
6075	}
6076
6077	return (ret);
6078}
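
/*
 * Usage sketch (illustrative only; the socket-option plumbing lives
 * elsewhere).  A boolean option handler would simply forward the value
 * and read the current setting back:
 *
 *	error = so_set_recv_anyif(so, optval);	(nonzero enables)
 *	optval = so_get_recv_anyif(so);		(returns 0 or 1)
 *
 * Both helpers touch the INPCB only for PF_INET/PF_INET6 sockets.
 */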
6079
6080int
6081so_set_restrictions(struct socket *so, uint32_t vals)
6082{
6083	int nocell_old, nocell_new;
6084	int noexpensive_old, noexpensive_new;
6085
6086	/*
6087	 * Deny-type restrictions are trapdoors; once set they cannot be
6088	 * unset for the lifetime of the socket.  This allows them to be
6089	 * issued by a framework on behalf of the application without
6090	 * having to worry that they can be undone.
6091	 *
6092	 * Note here that socket-level restrictions override any protocol-
6093	 * level restrictions.  For instance, the SO_RESTRICT_DENY_CELLULAR
6094	 * restriction issued on the socket takes precedence over
6095	 * INP_NO_IFT_CELLULAR.  The latter is affected by the UUID
6096	 * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
6097	 * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
6098	 */
6099	nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
6100	noexpensive_old = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
6101	so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
6102	    SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR |
6103	    SO_RESTRICT_DENY_EXPENSIVE));
6104	nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
6105	noexpensive_new = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
6106
6107	/* we can only set, not clear restrictions */
6108	if ((nocell_new - nocell_old) == 0 &&
6109	    (noexpensive_new - noexpensive_old) == 0)
6110		return (0);
6111#if INET6
6112	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6113#else
6114	if (SOCK_DOM(so) == PF_INET) {
6115#endif /* !INET6 */
6116		if (nocell_new - nocell_old != 0) {
6117			/* deny cellular is now set; update the INPCB */
6118			inp_set_nocellular(sotoinpcb(so));
6119		}
6120		if (noexpensive_new - noexpensive_old != 0) {
6121			inp_set_noexpensive(sotoinpcb(so));
6122		}
6123	}
6124
6125	return (0);
6126}
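
/*
 * Usage sketch (illustrative only; hypothetical framework caller).
 * Because the deny bits are trapdoors, a later call cannot clear what
 * an earlier call set:
 *
 *	(void) so_set_restrictions(so, SO_RESTRICT_DENY_CELLULAR);
 *	...
 *	(void) so_set_restrictions(so, 0);	(no effect; bits only OR in)
 *	vals = so_get_restrictions(so);		(still includes
 *						 SO_RESTRICT_DENY_CELLULAR)
 */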
6127
6128uint32_t
6129so_get_restrictions(struct socket *so)
6130{
6131	return (so->so_restrictions & (SO_RESTRICT_DENY_IN |
6132	    SO_RESTRICT_DENY_OUT |
6133	    SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE));
6134}
6135
6136struct sockaddr_entry *
6137sockaddrentry_alloc(int how)
6138{
6139	struct sockaddr_entry *se;
6140
6141	se = (how == M_WAITOK) ? zalloc(se_zone) : zalloc_noblock(se_zone);
6142	if (se != NULL)
6143		bzero(se, se_zone_size);
6144
6145	return (se);
6146}
6147
6148void
6149sockaddrentry_free(struct sockaddr_entry *se)
6150{
6151	if (se->se_addr != NULL) {
6152		FREE(se->se_addr, M_SONAME);
6153		se->se_addr = NULL;
6154	}
6155	zfree(se_zone, se);
6156}
6157
6158struct sockaddr_entry *
6159sockaddrentry_dup(const struct sockaddr_entry *src_se, int how)
6160{
6161	struct sockaddr_entry *dst_se;
6162
6163	dst_se = sockaddrentry_alloc(how);
6164	if (dst_se != NULL) {
6165		int len = src_se->se_addr->sa_len;
6166
6167		MALLOC(dst_se->se_addr, struct sockaddr *,
6168		    len, M_SONAME, how | M_ZERO);
6169		if (dst_se->se_addr != NULL) {
6170			bcopy(src_se->se_addr, dst_se->se_addr, len);
6171		} else {
6172			sockaddrentry_free(dst_se);
6173			dst_se = NULL;
6174		}
6175	}
6176
6177	return (dst_se);
6178}
6179
6180struct sockaddr_list *
6181sockaddrlist_alloc(int how)
6182{
6183	struct sockaddr_list *sl;
6184
6185	sl = (how == M_WAITOK) ? zalloc(sl_zone) : zalloc_noblock(sl_zone);
6186	if (sl != NULL) {
6187		bzero(sl, sl_zone_size);
6188		TAILQ_INIT(&sl->sl_head);
6189	}
6190	return (sl);
6191}
6192
6193void
6194sockaddrlist_free(struct sockaddr_list *sl)
6195{
6196	struct sockaddr_entry *se, *tse;
6197
6198	TAILQ_FOREACH_SAFE(se, &sl->sl_head, se_link, tse) {
6199		sockaddrlist_remove(sl, se);
6200		sockaddrentry_free(se);
6201	}
6202	VERIFY(sl->sl_cnt == 0 && TAILQ_EMPTY(&sl->sl_head));
6203	zfree(sl_zone, sl);
6204}
6205
6206void
6207sockaddrlist_insert(struct sockaddr_list *sl, struct sockaddr_entry *se)
6208{
6209	VERIFY(!(se->se_flags & SEF_ATTACHED));
6210	se->se_flags |= SEF_ATTACHED;
6211	TAILQ_INSERT_TAIL(&sl->sl_head, se, se_link);
6212	sl->sl_cnt++;
6213	VERIFY(sl->sl_cnt != 0);
6214}
6215
6216void
6217sockaddrlist_remove(struct sockaddr_list *sl, struct sockaddr_entry *se)
6218{
6219	VERIFY(se->se_flags & SEF_ATTACHED);
6220	se->se_flags &= ~SEF_ATTACHED;
6221	VERIFY(sl->sl_cnt != 0);
6222	sl->sl_cnt--;
6223	TAILQ_REMOVE(&sl->sl_head, se, se_link);
6224}
6225
6226struct sockaddr_list *
6227sockaddrlist_dup(const struct sockaddr_list *src_sl, int how)
6228{
6229	struct sockaddr_entry *src_se, *tse;
6230	struct sockaddr_list *dst_sl;
6231
6232	dst_sl = sockaddrlist_alloc(how);
6233	if (dst_sl == NULL)
6234		return (NULL);
6235
6236	TAILQ_FOREACH_SAFE(src_se, &src_sl->sl_head, se_link, tse) {
6237		struct sockaddr_entry *dst_se;
6238
6239		if (src_se->se_addr == NULL)
6240			continue;
6241
6242		dst_se = sockaddrentry_dup(src_se, how);
6243		if (dst_se == NULL) {
6244			sockaddrlist_free(dst_sl);
6245			return (NULL);
6246		}
6247
6248		sockaddrlist_insert(dst_sl, dst_se);
6249	}
6250	VERIFY(src_sl->sl_cnt == dst_sl->sl_cnt);
6251
6252	return (dst_sl);
6253}
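
/*
 * Usage sketch (illustrative only; "sa" stands for a sockaddr that was
 * MALLOC'ed with type M_SONAME, which is what sockaddrentry_free()
 * expects to release).  A hypothetical caller keeping a private copy of
 * an address list would build it from entries and duplicate it whole:
 *
 *	struct sockaddr_list *sl, *copy;
 *	struct sockaddr_entry *se;
 *
 *	sl = sockaddrlist_alloc(M_WAITOK);
 *	se = sockaddrentry_alloc(M_WAITOK);
 *	se->se_addr = sa;
 *	sockaddrlist_insert(sl, se);
 *	copy = sockaddrlist_dup(sl, M_WAITOK);
 *	...
 *	sockaddrlist_free(copy);
 *	sockaddrlist_free(sl);
 */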
6254
6255int
6256so_set_effective_pid(struct socket *so, int epid, struct proc *p)
6257{
6258	struct proc *ep = PROC_NULL;
6259	int error = 0;
6260
6261	/* pid 0 is reserved for kernel */
6262	if (epid == 0) {
6263		error = EINVAL;
6264		goto done;
6265	}
6266
6267	/*
6268	 * If this is an in-kernel socket, prevent its delegate
6269	 * association from changing unless the socket option is
6270	 * coming from within the kernel itself.
6271	 */
6272	if (so->last_pid == 0 && p != kernproc) {
6273		error = EACCES;
6274		goto done;
6275	}
6276
6277	/*
6278	 * If this is issued by a process that's recorded as the
6279	 * real owner of the socket, or if the pid is the same as
6280	 * the process's own pid, then proceed.  Otherwise ensure
6281	 * that the issuing process has the necessary privileges.
6282	 */
6283	if (epid != so->last_pid || epid != proc_pid(p)) {
6284		if ((error = priv_check_cred(kauth_cred_get(),
6285		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
6286			error = EACCES;
6287			goto done;
6288		}
6289	}
6290
6291	/* Find the process that corresponds to the effective pid */
6292	if ((ep = proc_find(epid)) == PROC_NULL) {
6293		error = ESRCH;
6294		goto done;
6295	}
6296
6297	/*
6298	 * If a process tries to delegate the socket to itself, then
6299	 * there's really nothing to do; treat it as a way for the
6300	 * delegate association to be cleared.  Note that we check
6301	 * the passed-in proc rather than calling proc_selfpid(),
6302	 * as we need to check the process issuing the socket option
6303	 * which could be kernproc.  Given that we don't allow 0 for
6304	 * effective pid, it means that a delegated in-kernel socket
6305	 * stays delegated during its lifetime (which is probably OK.)
6306	 */
6307	if (epid == proc_pid(p)) {
6308		so->so_flags &= ~SOF_DELEGATED;
6309		so->e_upid = 0;
6310		so->e_pid = 0;
6311		uuid_clear(so->e_uuid);
6312	} else {
6313		so->so_flags |= SOF_DELEGATED;
6314		so->e_upid = proc_uniqueid(ep);
6315		so->e_pid = proc_pid(ep);
6316		proc_getexecutableuuid(ep, so->e_uuid, sizeof (so->e_uuid));
6317	}
6318done:
6319	if (error == 0 && net_io_policy_log) {
6320		uuid_string_t buf;
6321
6322		uuid_unparse(so->e_uuid, buf);
6323		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
6324		    "euuid %s%s\n", __func__, proc_name_address(p),
6325		    proc_pid(p), (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so),
6326		    SOCK_TYPE(so), so->e_pid, proc_name_address(ep), buf,
6327		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
6328	} else if (error != 0 && net_io_policy_log) {
6329		log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
6330		    "ERROR (%d)\n", __func__, proc_name_address(p),
6331		    proc_pid(p), (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so),
6332		    SOCK_TYPE(so), epid, (ep == PROC_NULL) ? "PROC_NULL" :
6333		    proc_name_address(ep), error);
6334	}
6335
6336	/* Update this socket's policy upon success */
6337	if (error == 0) {
6338		so->so_policy_gencnt *= -1;
6339		so_update_policy(so);
6340#if NECP
6341		so_update_necp_policy(so, NULL, NULL);
6342#endif /* NECP */
6343	}
6344
6345	if (ep != PROC_NULL)
6346		proc_rele(ep);
6347
6348	return (error);
6349}
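
/*
 * Usage sketch (illustrative only; "p" is the process issuing the
 * request and "epid" the pid of the process the socket is delegated
 * to):
 *
 *	error = so_set_effective_pid(so, epid, p);
 *
 * Passing the issuer's own pid clears the delegation again:
 *
 *	error = so_set_effective_pid(so, proc_pid(p), p);
 *
 * Delegation may require the PRIV_NET_PRIVILEGED_SOCKET_DELEGATE
 * privilege, as checked above.
 */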
6350
6351int
6352so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p)
6353{
6354	uuid_string_t buf;
6355	uuid_t uuid;
6356	int error = 0;
6357
6358	/* UUID must not be all-zeroes (reserved for kernel) */
6359	if (uuid_is_null(euuid)) {
6360		error = EINVAL;
6361		goto done;
6362	}
6363
6364	/*
6365	 * If this is an in-kernel socket, prevent its delegate
6366	 * association from changing unless the socket option is
6367	 * coming from within the kernel itself.
6368	 */
6369	if (so->last_pid == 0 && p != kernproc) {
6370		error = EACCES;
6371		goto done;
6372	}
6373
6374	/* Get the UUID of the issuing process */
6375	proc_getexecutableuuid(p, uuid, sizeof (uuid));
6376
6377	/*
6378	 * If this is issued by a process that's recorded as the
6379	 * real owner of the socket, or if the uuid is the same as
6380	 * the process's own uuid, then proceed.  Otherwise ensure
6381	 * that the issuing process has the necessary privileges.
6382	 */
6383	if (uuid_compare(euuid, so->last_uuid) != 0 ||
6384	    uuid_compare(euuid, uuid) != 0) {
6385		if ((error = priv_check_cred(kauth_cred_get(),
6386		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
6387			error = EACCES;
6388			goto done;
6389		}
6390	}
6391
6392	/*
6393	 * If a process tries to delegate the socket to itself, then
6394	 * there's really nothing to do; treat it as a way for the
6395	 * delegate association to be cleared.  Note that we check
6396	 * the uuid of the passed-in proc rather than that of the
6397	 * current process, as we need to check the process issuing
6398	 * the socket option which could be kernproc itself.  Given
6399	 * that we don't allow 0 for effective uuid, it means that
6400	 * a delegated in-kernel socket stays delegated during its
6401	 * lifetime (which is okay.)
6402	 */
6403	if (uuid_compare(euuid, uuid) == 0) {
6404		so->so_flags &= ~SOF_DELEGATED;
6405		so->e_upid = 0;
6406		so->e_pid = 0;
6407		uuid_clear(so->e_uuid);
6408	} else {
6409		so->so_flags |= SOF_DELEGATED;
6410		/*
6411		 * Unlike so_set_effective_pid(), we only have the UUID
6412		 * here and the process ID is not known.  Inherit the
6413		 * real {pid,upid} of the socket.
6414		 */
6415		so->e_upid = so->last_upid;
6416		so->e_pid = so->last_pid;
6417		uuid_copy(so->e_uuid, euuid);
6418	}
6419
6420done:
6421	if (error == 0 && net_io_policy_log) {
6422		uuid_unparse(so->e_uuid, buf);
6423		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
6424		    "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
6425		    (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so),
6426		    SOCK_TYPE(so), so->e_pid, buf,
6427		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
6428	} else if (error != 0 && net_io_policy_log) {
6429		uuid_unparse(euuid, buf);
6430		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
6431		    "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
6432		    (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so),
6433		    SOCK_TYPE(so), buf, error);
6434	}
6435
6436	/* Update this socket's policy upon success */
6437	if (error == 0) {
6438		so->so_policy_gencnt *= -1;
6439		so_update_policy(so);
6440#if NECP
6441		so_update_necp_policy(so, NULL, NULL);
6442#endif /* NECP */
6443	}
6444
6445	return (error);
6446}
6447
6448void
6449netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
6450    uint32_t ev_datalen)
6451{
6452	struct kev_msg ev_msg;
6453
6454	/*
6455	 * A netpolicy event always starts with a netpolicy_event_data
6456	 * structure, but the caller can provide for a longer event
6457	 * structure to post, depending on the event code.
6458	 */
6459	VERIFY(ev_data != NULL && ev_datalen >= sizeof (*ev_data));
6460
6461	bzero(&ev_msg, sizeof (ev_msg));
6462	ev_msg.vendor_code	= KEV_VENDOR_APPLE;
6463	ev_msg.kev_class	= KEV_NETWORK_CLASS;
6464	ev_msg.kev_subclass	= KEV_NETPOLICY_SUBCLASS;
6465	ev_msg.event_code	= ev_code;
6466
6467	ev_msg.dv[0].data_ptr	= ev_data;
6468	ev_msg.dv[0].data_length = ev_datalen;
6469
6470	kev_post_msg(&ev_msg);
6471}
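
/*
 * Usage sketch (illustrative only).  A hypothetical policy event that
 * carries nothing beyond the common header would be posted as:
 *
 *	struct netpolicy_event_data ev_data;
 *
 *	bzero(&ev_data, sizeof (ev_data));
 *	... fill in the common fields ...
 *	netpolicy_post_msg(ev_code, &ev_data, sizeof (ev_data));
 *
 * "ev_code" stands for one of the KEV_NETPOLICY_* event codes belonging
 * to KEV_NETPOLICY_SUBCLASS; the codes themselves are not restated here.
 */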
6472
6473void
6474socket_post_kev_msg(uint32_t ev_code,
6475    struct kev_socket_event_data *ev_data,
6476    uint32_t ev_datalen)
6477{
6478	struct kev_msg ev_msg;
6479
6480	bzero(&ev_msg, sizeof(ev_msg));
6481	ev_msg.vendor_code = KEV_VENDOR_APPLE;
6482	ev_msg.kev_class = KEV_NETWORK_CLASS;
6483	ev_msg.kev_subclass = KEV_SOCKET_SUBCLASS;
6484	ev_msg.event_code = ev_code;
6485
6486	ev_msg.dv[0].data_ptr = ev_data;
6487	ev_msg.dv[0].data_length = ev_datalen;
6488
6489	kev_post_msg(&ev_msg);
6490}
6491
6492void
6493socket_post_kev_msg_closed(struct socket *so)
6494{
6495	struct kev_socket_closed ev;
6496	struct sockaddr *socksa = NULL, *peersa = NULL;
6497	int err;
6498	bzero(&ev, sizeof(ev));
6499	err = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &socksa);
6500	if (err == 0) {
6501		err = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so,
6502		    &peersa);
6503		if (err == 0) {
6504			memcpy(&ev.ev_data.kev_sockname, socksa,
6505			    min(socksa->sa_len,
6506			    sizeof (ev.ev_data.kev_sockname)));
6507			memcpy(&ev.ev_data.kev_peername, peersa,
6508			    min(peersa->sa_len,
6509			    sizeof (ev.ev_data.kev_peername)));
6510			socket_post_kev_msg(KEV_SOCKET_CLOSED,
6511			    &ev.ev_data, sizeof (ev));
6512		}
6513	}
6514	if (socksa != NULL)
6515		FREE(socksa, M_SONAME);
6516	if (peersa != NULL)
6517		FREE(peersa, M_SONAME);
6518}
6519