/*
 * Copyright (c) 1998-2014 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 */
/*
 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
 * support for mandatory and extensible security protections.
 * This notice is included in support of clause 2.2 (b) of the Apple Public
 * License, Version 2.0.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/proc.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/file_internal.h>
#include <sys/fcntl.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/event.h>
#include <sys/poll.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/uio.h>
#include <sys/uio_internal.h>
#include <sys/ev.h>
#include <sys/kdebug.h>
#include <sys/un.h>
#include <sys/user.h>
#include <sys/priv.h>
#include <sys/kern_event.h>
#include <net/route.h>
#include <net/init.h>
#include <net/ntstat.h>
#include <net/content_filter.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet/flow_divert.h>
#include <kern/zalloc.h>
#include <kern/locks.h>
#include <machine/limits.h>
#include <libkern/OSAtomic.h>
#include <pexpert/pexpert.h>
#include <kern/assert.h>
#include <kern/task.h>
#include <sys/kpi_mbuf.h>
#include <sys/mcache.h>
#include <sys/unpcb.h>

#if CONFIG_MACF
#include <security/mac.h>
#include <security/mac_framework.h>
#endif /* MAC */

#if MULTIPATH
#include <netinet/mp_pcb.h>
#include <netinet/mptcp_var.h>
#endif /* MULTIPATH */

/* TODO: this should be in a header file somewhere */
extern char *proc_name_address(void *p);

static u_int32_t	so_cache_hw;		/* High water mark for socache */
static u_int32_t	so_cache_timeouts;	/* number of timeouts */
static u_int32_t	so_cache_max_freed;	/* max freed per timeout */
static u_int32_t	cached_sock_count = 0;
STAILQ_HEAD(, socket)	so_cache_head;
int	max_cached_sock_count = MAX_CACHED_SOCKETS;
static u_int32_t	so_cache_time;
static int		socketinit_done;
static struct zone	*so_cache_zone;

static lck_grp_t	*so_cache_mtx_grp;
static lck_attr_t	*so_cache_mtx_attr;
static lck_grp_attr_t	*so_cache_mtx_grp_attr;
static lck_mtx_t	*so_cache_mtx;

#include <machine/limits.h>

static void	filt_sordetach(struct knote *kn);
static int	filt_soread(struct knote *kn, long hint);
static void	filt_sowdetach(struct knote *kn);
static int	filt_sowrite(struct knote *kn, long hint);
static void	filt_sockdetach(struct knote *kn);
static int	filt_sockev(struct knote *kn, long hint);

static int	sooptcopyin_timeval(struct sockopt *, struct timeval *);
static int	sooptcopyout_timeval(struct sockopt *, const struct timeval *);

static struct filterops soread_filtops = {
	.f_isfd = 1,
	.f_detach = filt_sordetach,
	.f_event = filt_soread,
};

static struct filterops sowrite_filtops = {
	.f_isfd = 1,
	.f_detach = filt_sowdetach,
	.f_event = filt_sowrite,
};

static struct filterops sock_filtops = {
	.f_isfd = 1,
	.f_detach = filt_sockdetach,
	.f_event = filt_sockev,
};

SYSCTL_DECL(_kern_ipc);

#define	EVEN_MORE_LOCKING_DEBUG	0

int socket_debug = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, socket_debug,
	CTLFLAG_RW | CTLFLAG_LOCKED, &socket_debug, 0, "");

static int socket_zone = M_SOCKET;
so_gen_t so_gencnt;	/* generation count for sockets */

MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

#define	DBG_LAYER_IN_BEG	NETDBG_CODE(DBG_NETSOCK, 0)
#define	DBG_LAYER_IN_END	NETDBG_CODE(DBG_NETSOCK, 2)
#define	DBG_LAYER_OUT_BEG	NETDBG_CODE(DBG_NETSOCK, 1)
#define	DBG_LAYER_OUT_END	NETDBG_CODE(DBG_NETSOCK, 3)
#define	DBG_FNC_SOSEND		NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
#define	DBG_FNC_SOSEND_LIST	NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 3)
#define	DBG_FNC_SORECEIVE	NETDBG_CODE(DBG_NETSOCK, (8 << 8))
#define	DBG_FNC_SORECEIVE_LIST	NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3)
#define	DBG_FNC_SOSHUTDOWN	NETDBG_CODE(DBG_NETSOCK, (9 << 8))

#define	MAX_SOOPTGETM_SIZE	(128 * MCLBYTES)

int somaxconn = SOMAXCONN;
SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
	CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");

/* Should we enforce a maximum as well? */
static int sosendmaxchain = 65536;
static int sosendminchain = 16384;
static int sorecvmincopy = 16384;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
	CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
	CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");

/*
 * Set to enable jumbo clusters (if available) for large writes when
 * the socket is marked with SOF_MULTIPAGES; see below.
 */
int sosendjcl = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl,
	CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");
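
/*
 * The tunables above and below are exported under kern.ipc; for example,
 * jumbo-cluster use would typically be toggled from a root shell with
 * something like (illustrative only):
 *
 *	sysctl -w kern.ipc.sosendjcl=0
 *	sysctl -w kern.ipc.sosendjcl_ignore_capab=1
 */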
/*
 * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
 * writes on the socket for all protocols on any network interfaces,
 * depending upon sosendjcl above.  Be extra careful when setting this
 * to 1, because sending down packets that cross physical pages down to
 * broken drivers (those that falsely assume that the physical pages
 * are contiguous) might lead to system panics or silent data corruption.
 * When set to 0, the system will respect SOF_MULTIPAGES, which is set
 * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
 * capable.  Set this to 1 only for testing/debugging purposes.
 */
int sosendjcl_ignore_capab = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab,
	CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, "");

int sosendbigcl_ignore_capab = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendbigcl_ignore_capab,
	CTLFLAG_RW | CTLFLAG_LOCKED, &sosendbigcl_ignore_capab, 0, "");

int sodefunctlog = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
	&sodefunctlog, 0, "");

int sothrottlelog = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
	&sothrottlelog, 0, "");

int sorestrictrecv = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
	&sorestrictrecv, 0, "Enable inbound interface restrictions");

int sorestrictsend = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictsend, CTLFLAG_RW | CTLFLAG_LOCKED,
	&sorestrictsend, 0, "Enable outbound interface restrictions");

extern struct inpcbinfo tcbinfo;

/* TODO: these should be in a header file somewhere */
extern int get_inpcb_str_size(void);
extern int get_tcp_str_size(void);

static unsigned int sl_zone_size;	/* size of sockaddr_list */
static struct zone *sl_zone;		/* zone for sockaddr_list */

static unsigned int se_zone_size;	/* size of sockaddr_entry */
static struct zone *se_zone;		/* zone for sockaddr_entry */

vm_size_t	so_cache_zone_element_size;

static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **,
    user_ssize_t *);
static void cached_sock_alloc(struct socket **, int);
static void cached_sock_free(struct socket *);

/*
 * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
 * setting the DSCP code on the packet based on the service class; see
 * <rdar://problem/11277343> for details.
 */
__private_extern__ u_int32_t sotcdb = SOTCDB_NO_DSCP;
SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
	&sotcdb, 0, "");

void
socketinit(void)
{
	_CASSERT(sizeof(so_gencnt) == sizeof(uint64_t));
	VERIFY(IS_P2ALIGNED(&so_gencnt, sizeof(uint32_t)));

	if (socketinit_done) {
		printf("socketinit: already called...\n");
		return;
	}
	socketinit_done = 1;

	PE_parse_boot_argn("socket_debug", &socket_debug,
	    sizeof (socket_debug));

	/*
	 * allocate lock group attribute and group for socket cache mutex
	 */
	so_cache_mtx_grp_attr = lck_grp_attr_alloc_init();
	so_cache_mtx_grp = lck_grp_alloc_init("so_cache",
	    so_cache_mtx_grp_attr);

	/*
	 * allocate the lock attribute for socket cache mutex
	 */
	so_cache_mtx_attr = lck_attr_alloc_init();

	/* cached sockets mutex */
	so_cache_mtx = lck_mtx_alloc_init(so_cache_mtx_grp, so_cache_mtx_attr);
	if (so_cache_mtx == NULL) {
		panic("%s: unable to allocate so_cache_mtx\n", __func__);
		/* NOTREACHED */
	}
	STAILQ_INIT(&so_cache_head);

	so_cache_zone_element_size = (vm_size_t)(sizeof (struct socket) + 4
	    + get_inpcb_str_size() + 4 + get_tcp_str_size());

	so_cache_zone = zinit(so_cache_zone_element_size,
	    (120000 * so_cache_zone_element_size), 8192, "socache zone");
	zone_change(so_cache_zone, Z_CALLERACCT, FALSE);
	zone_change(so_cache_zone, Z_NOENCRYPT, TRUE);

	sl_zone_size = sizeof (struct sockaddr_list);
	if ((sl_zone = zinit(sl_zone_size, 1024 * sl_zone_size, 1024,
	    "sockaddr_list")) == NULL) {
		panic("%s: unable to allocate sockaddr_list zone\n", __func__);
		/* NOTREACHED */
	}
	zone_change(sl_zone, Z_CALLERACCT, FALSE);
	zone_change(sl_zone, Z_EXPAND, TRUE);

	se_zone_size = sizeof (struct sockaddr_entry);
	if ((se_zone = zinit(se_zone_size, 1024 * se_zone_size, 1024,
	    "sockaddr_entry")) == NULL) {
		panic("%s: unable to allocate sockaddr_entry zone\n", __func__);
		/* NOTREACHED */
	}
	zone_change(se_zone, Z_CALLERACCT, FALSE);
	zone_change(se_zone, Z_EXPAND, TRUE);

	in_pcbinit();
	sflt_init();
	socket_tclass_init();
#if MULTIPATH
	mp_pcbinit();
#endif /* MULTIPATH */
}

static void
cached_sock_alloc(struct socket **so, int waitok)
{
	caddr_t	temp;
	uintptr_t offset;

	lck_mtx_lock(so_cache_mtx);

	if (!STAILQ_EMPTY(&so_cache_head)) {
		VERIFY(cached_sock_count > 0);

		*so = STAILQ_FIRST(&so_cache_head);
		STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
		STAILQ_NEXT((*so), so_cache_ent) = NULL;

		cached_sock_count--;
		lck_mtx_unlock(so_cache_mtx);

		temp = (*so)->so_saved_pcb;
		bzero((caddr_t)*so, sizeof (struct socket));

		(*so)->so_saved_pcb = temp;
	} else {

		lck_mtx_unlock(so_cache_mtx);

		if (waitok)
			*so = (struct socket *)zalloc(so_cache_zone);
		else
			*so = (struct socket *)zalloc_noblock(so_cache_zone);

		if (*so == NULL)
			return;

		bzero((caddr_t)*so, sizeof (struct socket));

		/*
		 * Define offsets for extra structures into our
		 * single block of memory.  Align extra structures
		 * on longword boundaries.
		 */
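		/*
		 * Sketch of the resulting single-block layout (the two
		 * 4-byte pads come from the so_cache_zone_element_size
		 * computation in socketinit() above; ALIGN() may consume
		 * part of each pad to reach the next longword boundary):
		 *
		 *	+---------------+-----+-------------+-----+-----------+
		 *	| struct socket | pad | inpcb block | pad | tcp block |
		 *	+---------------+-----+-------------+-----+-----------+
		 *	so_saved_pcb ----^    inp_saved_ppcb ------^
		 */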

		offset = (uintptr_t)*so;
		offset += sizeof (struct socket);

		offset = ALIGN(offset);

		(*so)->so_saved_pcb = (caddr_t)offset;
		offset += get_inpcb_str_size();

		offset = ALIGN(offset);

		((struct inpcb *)(void *)(*so)->so_saved_pcb)->inp_saved_ppcb =
		    (caddr_t)offset;
	}

	(*so)->cached_in_sock_layer = true;
}

static void
cached_sock_free(struct socket *so)
{

	lck_mtx_lock(so_cache_mtx);

	so_cache_time = net_uptime();
	if (++cached_sock_count > max_cached_sock_count) {
		--cached_sock_count;
		lck_mtx_unlock(so_cache_mtx);
		zfree(so_cache_zone, so);
	} else {
		if (so_cache_hw < cached_sock_count)
			so_cache_hw = cached_sock_count;

		STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent);

		so->cache_timestamp = so_cache_time;
		lck_mtx_unlock(so_cache_mtx);
	}
}

void
so_update_last_owner_locked(struct socket *so, proc_t self)
{
	if (so->last_pid != 0) {
		/*
		 * last_pid and last_upid should remain zero for sockets
		 * created using sock_socket.  The check above achieves that.
		 */
		if (self == PROC_NULL)
			self = current_proc();

		if (so->last_upid != proc_uniqueid(self) ||
		    so->last_pid != proc_pid(self)) {
			so->last_upid = proc_uniqueid(self);
			so->last_pid = proc_pid(self);
			proc_getexecutableuuid(self, so->last_uuid,
			    sizeof (so->last_uuid));
		}
		proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
	}
}

void
so_update_policy(struct socket *so)
{
	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6)
		(void) inp_update_policy(sotoinpcb(so));
}

#if NECP
static void
so_update_necp_policy(struct socket *so, struct sockaddr *override_local_addr,
    struct sockaddr *override_remote_addr)
{
	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6)
		inp_update_necp_policy(sotoinpcb(so), override_local_addr,
		    override_remote_addr, 0);
}
#endif /* NECP */

boolean_t
so_cache_timer(void)
{
	struct socket	*p;
	int		n_freed = 0;
	boolean_t rc = FALSE;

	lck_mtx_lock(so_cache_mtx);
	so_cache_timeouts++;
	so_cache_time = net_uptime();

	while (!STAILQ_EMPTY(&so_cache_head)) {
		VERIFY(cached_sock_count > 0);
		p = STAILQ_FIRST(&so_cache_head);
		if ((so_cache_time - p->cache_timestamp) <
		    SO_CACHE_TIME_LIMIT)
			break;

		STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
		--cached_sock_count;

		zfree(so_cache_zone, p);

		if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
			so_cache_max_freed++;
			break;
		}
	}

	/* Schedule again if there is more to cleanup */
	if (!STAILQ_EMPTY(&so_cache_head))
		rc = TRUE;

	lck_mtx_unlock(so_cache_mtx);
	return (rc);
}
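
/*
 * Summary of the socket cache above: cached_sock_alloc() reuses an entry
 * from so_cache_head when one is available (preserving the saved PCB
 * block), cached_sock_free() returns a socket to the cache unless
 * max_cached_sock_count would be exceeded, and so_cache_timer() evicts
 * entries older than SO_CACHE_TIME_LIMIT, at most SO_CACHE_MAX_FREE_BATCH
 * per run, returning TRUE so its caller can reschedule while entries
 * remain.
 */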
/*
 * Get a socket structure from our zone, and initialize it.
 * We don't implement `waitok' yet (see comments in uipc_domain.c).
 * Note that it would probably be better to allocate socket
 * and PCB at the same time, but I'm not convinced that all
 * the protocols can be easily modified to do this.
 */
struct socket *
soalloc(int waitok, int dom, int type)
{
	struct socket *so;

	if ((dom == PF_INET) && (type == SOCK_STREAM)) {
		cached_sock_alloc(&so, waitok);
	} else {
		MALLOC_ZONE(so, struct socket *, sizeof (*so), socket_zone,
		    M_WAITOK);
		if (so != NULL)
			bzero(so, sizeof (*so));
	}
	if (so != NULL) {
		so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
		so->so_zone = socket_zone;
#if CONFIG_MACF_SOCKET
		/* Convert waitok to M_WAITOK/M_NOWAIT for MAC Framework. */
		if (mac_socket_label_init(so, !waitok) != 0) {
			sodealloc(so);
			return (NULL);
		}
#endif /* MAC_SOCKET */
	}

	return (so);
}

int
socreate_internal(int dom, struct socket **aso, int type, int proto,
    struct proc *p, uint32_t flags, struct proc *ep)
{
	struct protosw *prp;
	struct socket *so;
	int error = 0;

#if TCPDEBUG
	extern int tcpconsdebug;
#endif

	VERIFY(aso != NULL);
	*aso = NULL;

	if (proto != 0)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);

	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) {
		if (pffinddomain(dom) == NULL)
			return (EAFNOSUPPORT);
		if (proto != 0) {
			if (pffindprotonotype(dom, proto) != NULL)
				return (EPROTOTYPE);
		}
		return (EPROTONOSUPPORT);
	}
	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = soalloc(1, dom, type);
	if (so == NULL)
		return (ENOBUFS);

	if (flags & SOCF_ASYNC)
		so->so_state |= SS_NBIO;
#if MULTIPATH
	if (flags & SOCF_MP_SUBFLOW) {
		/*
		 * A multipath subflow socket is used internally in the kernel,
		 * therefore it does not have a file descriptor associated by
		 * default.
		 */
		so->so_state |= SS_NOFDREF;
		so->so_flags |= SOF_MP_SUBFLOW;
	}
#endif /* MULTIPATH */

	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = type;
	so->last_upid = proc_uniqueid(p);
	so->last_pid = proc_pid(p);
	proc_getexecutableuuid(p, so->last_uuid, sizeof (so->last_uuid));
	proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));

	if (ep != PROC_NULL && ep != p) {
		so->e_upid = proc_uniqueid(ep);
		so->e_pid = proc_pid(ep);
		proc_getexecutableuuid(ep, so->e_uuid, sizeof (so->e_uuid));
		so->so_flags |= SOF_DELEGATED;
	}

	so->so_cred = kauth_cred_proc_ref(p);
	if (!suser(kauth_cred_get(), NULL))
		so->so_state |= SS_PRIV;

	so->so_proto = prp;
	so->so_rcv.sb_flags |= SB_RECV;
	so->so_rcv.sb_so = so->so_snd.sb_so = so;
	so->next_lock_lr = 0;
	so->next_unlock_lr = 0;

#if CONFIG_MACF_SOCKET
	mac_socket_label_associate(kauth_cred_get(), so);
#endif /* MAC_SOCKET */

	/*
	 * Attachment will create the per-pcb lock if necessary and
	 * increase the refcount for creation; make sure it's done before
	 * the socket is inserted in the lists.
	 */
	so->so_usecount++;

	error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
	if (error != 0) {
		/*
		 * Warning:
		 * If so_pcb is not zero, the socket will be leaked,
		 * so the protocol attachment handler must be coded carefully.
		 */
		so->so_state |= SS_NOFDREF;
		so->so_usecount--;
		sofreelastref(so, 1);	/* will deallocate the socket */
		return (error);
	}

	atomic_add_32(&prp->pr_domain->dom_refs, 1);
	TAILQ_INIT(&so->so_evlist);

	/* Attach socket filters for this protocol */
	sflt_initsock(so);
#if TCPDEBUG
	if (tcpconsdebug == 2)
		so->so_options |= SO_DEBUG;
#endif
	so_set_default_traffic_class(so);

	/*
	 * If this thread or task is marked to create backgrounded sockets,
	 * mark the socket as background.
	 */
	if (proc_get_effective_thread_policy(current_thread(),
	    TASK_POLICY_NEW_SOCKETS_BG)) {
		socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
		so->so_background_thread = current_thread();
	}

	switch (dom) {
	/*
	 * Don't mark Unix domain, system or multipath sockets as
	 * eligible for defunct by default.
	 */
	case PF_LOCAL:
	case PF_SYSTEM:
	case PF_MULTIPATH:
		so->so_flags |= SOF_NODEFUNCT;
		break;
	default:
		break;
	}

	/*
	 * Entitlements can't be checked at socket creation time except if the
	 * application requested a feature guarded by a privilege (cf. socket
	 * delegation).
	 * The priv(9) and the Sandboxing APIs are designed with the idea that
	 * a privilege check should only be triggered by a userland request.
	 * A privilege check at socket creation time is time consuming and
	 * could trigger many authorisation error messages from the security
	 * APIs.
	 */

	*aso = so;

	return (0);
}

/*
 * Returns:	0			Success
 *		EAFNOSUPPORT
 *		EPROTOTYPE
 *		EPROTONOSUPPORT
 *		ENOBUFS
 *	<pru_attach>:ENOBUFS[AF_UNIX]
 *	<pru_attach>:ENOBUFS[TCP]
 *	<pru_attach>:ENOMEM[TCP]
 *	<pru_attach>:???		[other protocol families, IPSEC]
 */
int
socreate(int dom, struct socket **aso, int type, int proto)
{
	return (socreate_internal(dom, aso, type, proto, current_proc(), 0,
	    PROC_NULL));
}

int
socreate_delegate(int dom, struct socket **aso, int type, int proto,
    pid_t epid)
{
	int error = 0;
	struct proc *ep = PROC_NULL;

	if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) {
		error = ESRCH;
		goto done;
	}

	error = socreate_internal(dom, aso, type, proto, current_proc(), 0, ep);

	/*
	 * It might not be wise to hold the proc reference when calling
	 * socreate_internal since it calls soalloc with M_WAITOK.
	 */
done:
	if (ep != PROC_NULL)
		proc_rele(ep);

	return (error);
}
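
/*
 * Illustrative kernel-internal use of socreate() (a sketch based on the
 * signature above, not an excerpt from this file): create a TCP socket
 * owned by the current process and release it with soclose() when done.
 *
 *	struct socket *so = NULL;
 *	int error = socreate(PF_INET, &so, SOCK_STREAM, IPPROTO_TCP);
 *	if (error == 0) {
 *		... use so ...
 *		soclose(so);
 *	}
 */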
/*
 * Returns:	0			Success
 *	<pru_bind>:EINVAL		Invalid argument [COMMON_START]
 *	<pru_bind>:EAFNOSUPPORT		Address family not supported
 *	<pru_bind>:EADDRNOTAVAIL	Address not available.
 *	<pru_bind>:EINVAL		Invalid argument
 *	<pru_bind>:EAFNOSUPPORT		Address family not supported [notdef]
 *	<pru_bind>:EACCES		Permission denied
 *	<pru_bind>:EADDRINUSE		Address in use
 *	<pru_bind>:EAGAIN		Resource unavailable, try again
 *	<pru_bind>:EPERM		Operation not permitted
 *	<pru_bind>:???
 *	<sf_bind>:???
 *
 * Notes:	It's not possible to fully enumerate the return codes above,
 *		since socket filter authors and protocol family authors may
 *		not choose to limit their error returns to those listed, even
 *		though this may result in some software operating incorrectly.
 *
 *		The error codes which are enumerated above are those known to
 *		be returned by the tcp_usr_bind function supplied.
 */
int
sobindlock(struct socket *so, struct sockaddr *nam, int dolock)
{
	struct proc *p = current_proc();
	int error = 0;

	if (dolock)
		socket_lock(so, 1);
	VERIFY(so->so_usecount > 1);

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

#if NECP
	so_update_necp_policy(so, nam, NULL);
#endif /* NECP */

	/*
	 * If this is a bind request on a socket that has been marked
	 * as inactive, reject it now before we go any further.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
		error = EINVAL;
		SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n",
		    __func__, proc_pid(p), (uint64_t)VM_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so), error));
		goto out;
	}

	/* Socket filter */
	error = sflt_bind(so, nam);

	if (error == 0)
		error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
out:
	if (dolock)
		socket_unlock(so, 1);

	if (error == EJUSTRETURN)
		error = 0;

	return (error);
}

void
sodealloc(struct socket *so)
{
	kauth_cred_unref(&so->so_cred);

	/* Remove any filters */
	sflt_termsock(so);

#if CONTENT_FILTER
	cfil_sock_detach(so);
#endif /* CONTENT_FILTER */

	/* Delete the state allocated for msg queues on a socket */
	if (so->so_flags & SOF_ENABLE_MSGS) {
		FREE(so->so_msg_state, M_TEMP);
		so->so_msg_state = NULL;
	}
	VERIFY(so->so_msg_state == NULL);

	so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);

#if CONFIG_MACF_SOCKET
	mac_socket_label_destroy(so);
#endif /* MAC_SOCKET */

	if (so->cached_in_sock_layer) {
		cached_sock_free(so);
	} else {
		FREE_ZONE(so, sizeof (*so), so->so_zone);
	}
}
/*
 * Returns:	0			Success
 *		EINVAL
 *		EOPNOTSUPP
 *	<pru_listen>:EINVAL[AF_UNIX]
 *	<pru_listen>:EINVAL[TCP]
 *	<pru_listen>:EADDRNOTAVAIL[TCP]	Address not available.
 *	<pru_listen>:EINVAL[TCP]	Invalid argument
 *	<pru_listen>:EAFNOSUPPORT[TCP]	Address family not supported [notdef]
 *	<pru_listen>:EACCES[TCP]	Permission denied
 *	<pru_listen>:EADDRINUSE[TCP]	Address in use
 *	<pru_listen>:EAGAIN[TCP]	Resource unavailable, try again
 *	<pru_listen>:EPERM[TCP]		Operation not permitted
 *	<sf_listen>:???
 *
 * Notes:	Other <pru_listen> returns depend on the protocol family; all
 *		<sf_listen> returns depend on what the filter author causes
 *		their filter to return.
 */
int
solisten(struct socket *so, int backlog)
{
	struct proc *p = current_proc();
	int error = 0;

	socket_lock(so, 1);

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

#if NECP
	so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */

	if (so->so_proto == NULL) {
		error = EINVAL;
		goto out;
	}
	if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
		error = EOPNOTSUPP;
		goto out;
	}

	/*
	 * If the listen request is made on a socket that is not fully
	 * disconnected, or on a socket that has been marked as inactive,
	 * reject the request now.
	 */
	if ((so->so_state &
	    (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) ||
	    (so->so_flags & SOF_DEFUNCT)) {
		error = EINVAL;
		if (so->so_flags & SOF_DEFUNCT) {
			SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] "
			    "(%d)\n", __func__, proc_pid(p),
			    (uint64_t)VM_KERNEL_ADDRPERM(so),
			    SOCK_DOM(so), SOCK_TYPE(so), error));
		}
		goto out;
	}

	if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) {
		error = EPERM;
		goto out;
	}

	error = sflt_listen(so);
	if (error == 0)
		error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);

	if (error) {
		if (error == EJUSTRETURN)
			error = 0;
		goto out;
	}

	if (TAILQ_EMPTY(&so->so_comp))
		so->so_options |= SO_ACCEPTCONN;
	/*
	 * POSIX: The implementation may have an upper limit on the length of
	 * the listen queue, either global or per accepting socket.  If backlog
	 * exceeds this limit, the length of the listen queue is set to the
	 * limit.
	 *
	 * If listen() is called with a backlog argument value that is less
	 * than 0, the function behaves as if it had been called with a backlog
	 * argument value of 0.
	 *
	 * A backlog argument of 0 may allow the socket to accept connections,
	 * in which case the length of the listen queue may be set to an
	 * implementation-defined minimum value.
	 */
	if (backlog <= 0 || backlog > somaxconn)
		backlog = somaxconn;

	so->so_qlimit = backlog;
out:
	socket_unlock(so, 1);
	return (error);
}

void
sofreelastref(struct socket *so, int dealloc)
{
	struct socket *head = so->so_head;

	/* Assume socket is locked */

	if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) {
		selthreadclear(&so->so_snd.sb_sel);
		selthreadclear(&so->so_rcv.sb_sel);
		so->so_rcv.sb_flags &= ~(SB_SEL|SB_UPCALL);
		so->so_snd.sb_flags &= ~(SB_SEL|SB_UPCALL);
		so->so_event = sonullevent;
		return;
	}
	if (head != NULL) {
		socket_lock(head, 1);
		if (so->so_state & SS_INCOMP) {
			TAILQ_REMOVE(&head->so_incomp, so, so_list);
			head->so_incqlen--;
		} else if (so->so_state & SS_COMP) {
			/*
			 * We must not decommission a socket that's
			 * on the accept(2) queue.  If we do, then
			 * accept(2) may hang after select(2) indicated
			 * that the listening socket was ready.
			 */
			selthreadclear(&so->so_snd.sb_sel);
			selthreadclear(&so->so_rcv.sb_sel);
			so->so_rcv.sb_flags &= ~(SB_SEL|SB_UPCALL);
			so->so_snd.sb_flags &= ~(SB_SEL|SB_UPCALL);
			so->so_event = sonullevent;
			socket_unlock(head, 1);
			return;
		} else {
			panic("sofree: not queued");
		}
		head->so_qlen--;
		so->so_state &= ~SS_INCOMP;
		so->so_head = NULL;
		socket_unlock(head, 1);
	}
	sowflush(so);
	sorflush(so);

#if FLOW_DIVERT
	if (so->so_flags & SOF_FLOW_DIVERT) {
		flow_divert_detach(so);
	}
#endif /* FLOW_DIVERT */

	/* 3932268: disable upcall */
	so->so_rcv.sb_flags &= ~SB_UPCALL;
	so->so_snd.sb_flags &= ~SB_UPCALL;
	so->so_event = sonullevent;

	if (dealloc)
		sodealloc(so);
}

void
soclose_wait_locked(struct socket *so)
{
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL)
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	else
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);

	/*
	 * Double check here and return if there's no outstanding upcall;
	 * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
	 */
	if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT))
		return;
	so->so_rcv.sb_flags &= ~SB_UPCALL;
	so->so_snd.sb_flags &= ~SB_UPCALL;
	so->so_flags |= SOF_CLOSEWAIT;
	(void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - 1),
	    "soclose_wait_locked", NULL);
	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
	so->so_flags &= ~SOF_CLOSEWAIT;
}

/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 */
int
soclose_locked(struct socket *so)
{
	int error = 0;
	lck_mtx_t *mutex_held;
	struct timespec ts;

	if (so->so_usecount == 0) {
		panic("soclose: so=%p refcount=0\n", so);
		/* NOTREACHED */
	}

	sflt_notify(so, sock_evt_closing, NULL);

	if (so->so_upcallusecount)
		soclose_wait_locked(so);

#if CONTENT_FILTER
	/*
	 * We have to wait until the content filters are done
	 */
	if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
		cfil_sock_close_wait(so);
		cfil_sock_is_closed(so);
		cfil_sock_detach(so);
	}
#endif /* CONTENT_FILTER */

	if ((so->so_options & SO_ACCEPTCONN)) {
		struct socket *sp, *sonext;
		int socklock = 0;

		/*
		 * We do not want new connections to be added
		 * to the connection queues
		 */
		so->so_options &= ~SO_ACCEPTCONN;

		for (sp = TAILQ_FIRST(&so->so_incomp);
		    sp != NULL; sp = sonext) {
			sonext = TAILQ_NEXT(sp, so_list);

			/*
			 * Radar 5350314
			 * Skip sockets thrown away by tcp_dropdropablreq();
			 * they will get cleaned up by the garbage collection.
			 * Otherwise, remove the incomplete socket from the
			 * queue and let soabort() trigger the appropriate
			 * cleanup.
			 */
			if (sp->so_flags & SOF_OVERFLOW)
				continue;

			if (so->so_proto->pr_getlock != NULL) {
				/*
				 * Lock ordering for consistency with the
				 * rest of the stack: we lock the socket
				 * first and then grab the head.
				 */
				socket_unlock(so, 0);
				socket_lock(sp, 1);
				socket_lock(so, 0);
				socklock = 1;
			}

			TAILQ_REMOVE(&so->so_incomp, sp, so_list);
			so->so_incqlen--;

			if (sp->so_state & SS_INCOMP) {
				sp->so_state &= ~SS_INCOMP;
				sp->so_head = NULL;

				(void) soabort(sp);
			}

			if (socklock)
				socket_unlock(sp, 1);
		}

		while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
			/* Dequeue from so_comp since sofree() won't do it */
			TAILQ_REMOVE(&so->so_comp, sp, so_list);
			so->so_qlen--;

			if (so->so_proto->pr_getlock != NULL) {
				socket_unlock(so, 0);
				socket_lock(sp, 1);
			}

			if (sp->so_state & SS_COMP) {
				sp->so_state &= ~SS_COMP;
				sp->so_head = NULL;

				(void) soabort(sp);
			}

			if (so->so_proto->pr_getlock != NULL) {
				socket_unlock(sp, 1);
				socket_lock(so, 0);
			}
		}
	}
	if (so->so_pcb == NULL) {
		/* 3915887: mark the socket as ready for dealloc */
		so->so_flags |= SOF_PCBCLEARING;
		goto discard;
	}
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnectlocked(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO))
				goto drop;
			if (so->so_proto->pr_getlock != NULL)
				mutex_held = (*so->so_proto->pr_getlock)(so, 0);
			else
				mutex_held = so->so_proto->pr_domain->dom_mtx;
			while (so->so_state & SS_ISCONNECTED) {
				ts.tv_sec = (so->so_linger/100);
				ts.tv_nsec = (so->so_linger % 100) *
				    NSEC_PER_USEC * 1000 * 10;
				error = msleep((caddr_t)&so->so_timeo,
				    mutex_held, PSOCK | PCATCH, "soclose", &ts);
				if (error) {
					/*
					 * It's OK if the timer fires;
					 * don't report an error.
					 */
					if (error == EWOULDBLOCK)
						error = 0;
					break;
				}
			}
		}
	}
drop:
	if (so->so_usecount == 0) {
		panic("soclose: usecount is zero so=%p\n", so);
		/* NOTREACHED */
	}
	if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) {
		int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
		if (error == 0)
			error = error2;
	}
	if (so->so_usecount <= 0) {
		panic("soclose: usecount is zero so=%p\n", so);
		/* NOTREACHED */
	}
discard:
	if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
	    (so->so_state & SS_NOFDREF)) {
		panic("soclose: NOFDREF");
		/* NOTREACHED */
	}
	so->so_state |= SS_NOFDREF;

	if (so->so_flags & SOF_MP_SUBFLOW)
		so->so_flags &= ~SOF_MP_SUBFLOW;

	if ((so->so_flags & SOF_KNOTE) != 0)
		KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);

	atomic_add_32(&so->so_proto->pr_domain->dom_refs, -1);
	evsofree(so);

	so->so_usecount--;
	sofree(so);
	return (error);
}

int
soclose(struct socket *so)
{
	int error = 0;
	socket_lock(so, 1);

	if (so->so_retaincnt == 0) {
		error = soclose_locked(so);
	} else {
		/*
		 * If the FD is going away, but the socket is
		 * retained in the kernel, remove its reference.
		 */
		so->so_usecount--;
		if (so->so_usecount < 2)
			panic("soclose: retaincnt non-null and so=%p "
			    "usecount=%d\n", so, so->so_usecount);
	}
	socket_unlock(so, 1);
	return (error);
}
/*
 * Must be called at splnet...
 */
/* Should already be locked */
int
soabort(struct socket *so)
{
	int error;

#ifdef MORE_LOCKING_DEBUG
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL)
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	else
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
#endif

	if ((so->so_flags & SOF_ABORTED) == 0) {
		so->so_flags |= SOF_ABORTED;
		error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
		if (error) {
			sofree(so);
			return (error);
		}
	}
	return (0);
}

int
soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
{
	int error;

	if (dolock)
		socket_lock(so, 1);

	so_update_last_owner_locked(so, PROC_NULL);
	so_update_policy(so);
#if NECP
	so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */

	if ((so->so_state & SS_NOFDREF) == 0)
		panic("soaccept: !NOFDREF");
	so->so_state &= ~SS_NOFDREF;
	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);

	if (dolock)
		socket_unlock(so, 1);
	return (error);
}

int
soaccept(struct socket *so, struct sockaddr **nam)
{
	return (soacceptlock(so, nam, 1));
}

int
soacceptfilter(struct socket *so)
{
	struct sockaddr *local = NULL, *remote = NULL;
	int error = 0;
	struct socket *head = so->so_head;

	/*
	 * Hold the lock even if this socket has not been made visible
	 * to the filter(s).  For sockets with global locks, this protects
	 * against the head or peer going away.
	 */
	socket_lock(so, 1);
	if (sogetaddr_locked(so, &remote, 1) != 0 ||
	    sogetaddr_locked(so, &local, 0) != 0) {
		so->so_state &= ~(SS_NOFDREF | SS_COMP);
		so->so_head = NULL;
		socket_unlock(so, 1);
		soclose(so);
		/* Out of resources; try it again next time */
		error = ECONNABORTED;
		goto done;
	}

	error = sflt_accept(head, so, local, remote);

	/*
	 * If we get EJUSTRETURN from one of the filters, mark this socket
	 * as inactive and return it anyway.  This newly accepted socket
	 * will be disconnected later before we hand it off to the caller.
	 */
	if (error == EJUSTRETURN) {
		error = 0;
		(void) sosetdefunct(current_proc(), so,
		    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
	}

	if (error != 0) {
		/*
		 * This may seem like a duplication to the above error
		 * handling part when we return ECONNABORTED, except
		 * the following is done while holding the lock since
		 * the socket has been exposed to the filter(s) earlier.
		 */
		so->so_state &= ~(SS_NOFDREF | SS_COMP);
		so->so_head = NULL;
		socket_unlock(so, 1);
		soclose(so);
		/* Propagate socket filter's error code to the caller */
	} else {
		socket_unlock(so, 1);
	}
done:
	/* Callee checks for NULL pointer */
	sock_freeaddr(remote);
	sock_freeaddr(local);
	return (error);
}
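
/*
 * Summary of soacceptfilter() outcomes above: failure to fetch the peer
 * or local address closes the socket and returns ECONNABORTED (retryable);
 * EJUSTRETURN from a filter marks the socket defunct but still hands it to
 * the caller; any other filter error closes the socket and is propagated.
 */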
/*
 * Returns:	0			Success
 *		EOPNOTSUPP		Operation not supported on socket
 *		EISCONN			Socket is connected
 *	<pru_connect>:EADDRNOTAVAIL	Address not available.
 *	<pru_connect>:EINVAL		Invalid argument
 *	<pru_connect>:EAFNOSUPPORT	Address family not supported [notdef]
 *	<pru_connect>:EACCES		Permission denied
 *	<pru_connect>:EADDRINUSE	Address in use
 *	<pru_connect>:EAGAIN		Resource unavailable, try again
 *	<pru_connect>:EPERM		Operation not permitted
 *	<sf_connect_out>:???		[anything a filter writer might set]
 */
int
soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
{
	int error;
	struct proc *p = current_proc();

	if (dolock)
		socket_lock(so, 1);

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

#if NECP
	so_update_necp_policy(so, NULL, nam);
#endif /* NECP */

	/*
	 * If this is a listening socket or if this is a previously-accepted
	 * socket that has been marked as inactive, reject the connect request.
	 */
	if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
		error = EOPNOTSUPP;
		if (so->so_flags & SOF_DEFUNCT) {
			SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] "
			    "(%d)\n", __func__, proc_pid(p),
			    (uint64_t)VM_KERNEL_ADDRPERM(so),
			    SOCK_DOM(so), SOCK_TYPE(so), error));
		}
		if (dolock)
			socket_unlock(so, 1);
		return (error);
	}

	if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
		if (dolock)
			socket_unlock(so, 1);
		return (EPERM);
	}

	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnectlocked(so)))) {
		error = EISCONN;
	} else {
		/*
		 * Run connect filter before calling protocol:
		 *  - non-blocking connect returns before completion;
		 */
		error = sflt_connectout(so, nam);
		if (error != 0) {
			if (error == EJUSTRETURN)
				error = 0;
		} else {
			error = (*so->so_proto->pr_usrreqs->pru_connect)
			    (so, nam, p);
		}
	}
	if (dolock)
		socket_unlock(so, 1);
	return (error);
}

int
soconnect(struct socket *so, struct sockaddr *nam)
{
	return (soconnectlock(so, nam, 1));
}

/*
 * Returns:	0			Success
 *	<pru_connect2>:EINVAL[AF_UNIX]
 *	<pru_connect2>:EPROTOTYPE[AF_UNIX]
 *	<pru_connect2>:???		[other protocol families]
 *
 * Notes:	<pru_connect2> is not supported by [TCP].
 */
int
soconnect2(struct socket *so1, struct socket *so2)
{
	int error;

	socket_lock(so1, 1);
	if (so2->so_proto->pr_lock)
		socket_lock(so2, 1);

	error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);

	socket_unlock(so1, 1);
	if (so2->so_proto->pr_lock)
		socket_unlock(so2, 1);
	return (error);
}

int
soconnectxlocked(struct socket *so, struct sockaddr_list **src_sl,
    struct sockaddr_list **dst_sl, struct proc *p, uint32_t ifscope,
    associd_t aid, connid_t *pcid, uint32_t flags, void *arg,
    uint32_t arglen)
{
	int error;

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

	/*
	 * If this is a listening socket or if this is a previously-accepted
	 * socket that has been marked as inactive, reject the connect request.
	 */
	if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
		error = EOPNOTSUPP;
		if (so->so_flags & SOF_DEFUNCT) {
			SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] "
			    "(%d)\n", __func__, proc_pid(p),
			    (uint64_t)VM_KERNEL_ADDRPERM(so),
			    SOCK_DOM(so), SOCK_TYPE(so), error));
		}
		return (error);
	}

	if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0)
		return (EPERM);

	/*
	 * If protocol is connection-based, can only connect once
	 * unless PR_MULTICONN is set.  Otherwise, if connected,
	 * try to disconnect first.  This allows user to disconnect
	 * by connecting to, e.g., a null address.
	 */
	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) &&
	    !(so->so_proto->pr_flags & PR_MULTICONN) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnectlocked(so)) != 0)) {
		error = EISCONN;
	} else {
		/*
		 * Run connect filter before calling protocol:
		 *  - non-blocking connect returns before completion;
		 */
		error = sflt_connectxout(so, dst_sl);
		if (error != 0) {
			if (error == EJUSTRETURN)
				error = 0;
		} else {
			error = (*so->so_proto->pr_usrreqs->pru_connectx)
			    (so, src_sl, dst_sl, p, ifscope, aid, pcid,
			    flags, arg, arglen);
		}
	}

	return (error);
}

int
sodisconnectlocked(struct socket *so)
{
	int error;

	if ((so->so_state & SS_ISCONNECTED) == 0) {
		error = ENOTCONN;
		goto bad;
	}
	if (so->so_state & SS_ISDISCONNECTING) {
		error = EALREADY;
		goto bad;
	}

	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
	if (error == 0)
		sflt_notify(so, sock_evt_disconnected, NULL);

bad:
	return (error);
}

/* Locking version */
int
sodisconnect(struct socket *so)
{
	int error;

	socket_lock(so, 1);
	error = sodisconnectlocked(so);
	socket_unlock(so, 1);
	return (error);
}

int
sodisconnectxlocked(struct socket *so, associd_t aid, connid_t cid)
{
	int error;

	/*
	 * Call the protocol disconnectx handler; let it handle all
	 * matters related to the connection state of this session.
	 */
	error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid);
	if (error == 0) {
		/*
		 * The event applies only for the session, not for
		 * the disconnection of individual subflows.
		 */
		if (so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED))
			sflt_notify(so, sock_evt_disconnected, NULL);
	}
	return (error);
}

int
sodisconnectx(struct socket *so, associd_t aid, connid_t cid)
{
	int error;

	socket_lock(so, 1);
	error = sodisconnectxlocked(so, aid, cid);
	socket_unlock(so, 1);
	return (error);
}

int
sopeelofflocked(struct socket *so, associd_t aid, struct socket **psop)
{
	return ((*so->so_proto->pr_usrreqs->pru_peeloff)(so, aid, psop));
}

#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
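
/*
 * SBLOCKWAIT() maps MSG_DONTWAIT to a non-blocking sblock() attempt.
 * For reference, sosend() below acquires the send buffer lock through
 * sosendcheck() like this (excerpted from this file):
 *
 *	error = sosendcheck(so, addr, resid, clen, atomic, flags,
 *	    &sblocked, control);
 */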
/*
 * sosendcheck will lock the socket buffer if it isn't locked and
 * verify that there is space for the data being inserted.
 *
 * Returns:	0		Success
 *		EPIPE
 *	sblock:EWOULDBLOCK
 *	sblock:EINTR
 *	sbwait:EBADF
 *	sbwait:EINTR
 *	[so_error]:???
 */
int
sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid,
    int32_t clen, int32_t atomic, int flags, int *sblocked,
    struct mbuf *control)
{
	int	error = 0;
	int32_t	space;
	int	assumelock = 0;

restart:
	if (*sblocked == 0) {
		if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
		    so->so_send_filt_thread != 0 &&
		    so->so_send_filt_thread == current_thread()) {
			/*
			 * We're being called recursively from a filter,
			 * allow this to continue.  Radar 4150520.
			 * Don't set sblocked because we don't want
			 * to perform an unlock later.
			 */
			assumelock = 1;
		} else {
			error = sblock(&so->so_snd, SBLOCKWAIT(flags));
			if (error) {
				if (so->so_flags & SOF_DEFUNCT)
					goto defunct;
				return (error);
			}
			*sblocked = 1;
		}
	}

	/*
	 * If a send attempt is made on a socket that has been marked
	 * as inactive (disconnected), reject the request.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
defunct:
		error = EPIPE;
		SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n",
		    __func__, proc_selfpid(), (uint64_t)VM_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so), error));
		return (error);
	}

	if (so->so_state & SS_CANTSENDMORE) {
#if CONTENT_FILTER
		/*
		 * Can re-inject data of half closed connections
		 */
		if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
		    so->so_snd.sb_cfil_thread == current_thread() &&
		    cfil_sock_data_pending(&so->so_snd) != 0)
			CFIL_LOG(LOG_INFO,
			    "so %llx ignore SS_CANTSENDMORE",
			    (uint64_t)VM_KERNEL_ADDRPERM(so));
		else
#endif /* CONTENT_FILTER */
			return (EPIPE);
	}
	if (so->so_error) {
		error = so->so_error;
		so->so_error = 0;
		return (error);
	}

	if ((so->so_state & SS_ISCONNECTED) == 0) {
		if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
			if (((so->so_state & SS_ISCONFIRMING) == 0) &&
			    (resid != 0 || clen == 0)) {
#if MPTCP
				/*
				 * MPTCP Fast Join sends data before the
				 * socket is truly connected.
				 */
				if ((so->so_flags & (SOF_MP_SUBFLOW |
				    SOF_MPTCP_FASTJOIN)) !=
				    (SOF_MP_SUBFLOW | SOF_MPTCP_FASTJOIN))
#endif /* MPTCP */
					return (ENOTCONN);
			}
		} else if (addr == 0 && !(flags&MSG_HOLD)) {
			return ((so->so_proto->pr_flags & PR_CONNREQUIRED) ?
			    ENOTCONN : EDESTADDRREQ);
		}
	}
	if (so->so_flags & SOF_ENABLE_MSGS)
		space = msgq_sbspace(so, control);
	else
		space = sbspace(&so->so_snd);

	if (flags & MSG_OOB)
		space += 1024;
	if ((atomic && resid > so->so_snd.sb_hiwat) ||
	    clen > so->so_snd.sb_hiwat)
		return (EMSGSIZE);

	if ((space < resid + clen &&
	    (atomic || space < (int32_t)so->so_snd.sb_lowat ||
	    space < clen)) ||
	    (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
		if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
		    assumelock) {
			return (EWOULDBLOCK);
		}
		sbunlock(&so->so_snd, TRUE);	/* keep socket locked */
		*sblocked = 0;
		error = sbwait(&so->so_snd);
		if (error) {
			if (so->so_flags & SOF_DEFUNCT)
				goto defunct;
			return (error);
		}
		goto restart;
	}
	return (0);
}

/*
 * Send on a socket.
 * If send must go all at once and message is larger than
 * send buffering, then hard error.
 * Lock against other senders.
 * If must go all at once and not enough room now, then
 * inform user that this would block and do nothing.
 * Otherwise, if nonblocking, send as much as possible.
 * The data to be sent is described by "uio" if nonzero,
 * otherwise by the mbuf chain "top" (which must be null
 * if uio is not).  Data provided in mbuf chain must be small
 * enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers
 * must check for short counts if EINTR/ERESTART are returned.
 * Data and control buffers are freed on return.
 * Experiment:
 * MSG_HOLD: go thru most of sosend(), but just enqueue the mbuf
 * MSG_SEND: go thru as for MSG_HOLD on current fragment, then
 *	point at the mbuf chain being constructed and go from there.
 *
 * Returns:	0			Success
 *		EOPNOTSUPP
 *		EINVAL
 *		ENOBUFS
 *	uiomove:EFAULT
 *	sosendcheck:EPIPE
 *	sosendcheck:EWOULDBLOCK
 *	sosendcheck:EINTR
 *	sosendcheck:EBADF
 *	sosendcheck:EINTR
 *	sosendcheck:???			[value from so_error]
 *	<pru_send>:ECONNRESET[TCP]
 *	<pru_send>:EINVAL[TCP]
 *	<pru_send>:ENOBUFS[TCP]
 *	<pru_send>:EADDRINUSE[TCP]
 *	<pru_send>:EADDRNOTAVAIL[TCP]
 *	<pru_send>:EAFNOSUPPORT[TCP]
 *	<pru_send>:EACCES[TCP]
 *	<pru_send>:EAGAIN[TCP]
 *	<pru_send>:EPERM[TCP]
 *	<pru_send>:EMSGSIZE[TCP]
 *	<pru_send>:EHOSTUNREACH[TCP]
 *	<pru_send>:ENETUNREACH[TCP]
 *	<pru_send>:ENETDOWN[TCP]
 *	<pru_send>:ENOMEM[TCP]
 *	<pru_send>:ENOBUFS[TCP]
 *	<pru_send>:???[TCP]		[ignorable: mostly IPSEC/firewall/DLIL]
 *	<pru_send>:EINVAL[AF_UNIX]
 *	<pru_send>:EOPNOTSUPP[AF_UNIX]
 *	<pru_send>:EPIPE[AF_UNIX]
 *	<pru_send>:ENOTCONN[AF_UNIX]
 *	<pru_send>:EISCONN[AF_UNIX]
 *	<pru_send>:???[AF_UNIX]		[whatever a filter author chooses]
 *	<sf_data_out>:???		[whatever a filter author chooses]
 *
 * Notes:	Other <pru_send> returns depend on the protocol family; all
 *		<sf_data_out> returns depend on what the filter author causes
 *		their filter to return.
 */
int
sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags)
{
	struct mbuf **mp;
	struct mbuf *m, *freelist = NULL;
	user_ssize_t space, len, resid;
	int clen = 0, error, dontroute, mlen, sendflags;
	int atomic = sosendallatonce(so) || top;
	int sblocked = 0;
	struct proc *p = current_proc();
	struct mbuf *control_copy = NULL;

	if (uio != NULL)
		resid = uio_resid(uio);
	else
		resid = top->m_pkthdr.len;

	KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
	    so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);

	socket_lock(so, 1);

	/*
	 * Re-injection should not affect process accounting
	 */
	if ((flags & MSG_SKIPCFIL) == 0) {
		so_update_last_owner_locked(so, p);
		so_update_policy(so);

#if NECP
		so_update_necp_policy(so, NULL, addr);
#endif /* NECP */
	}

	if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
		error = EOPNOTSUPP;
		socket_unlock(so, 1);
		goto out;
	}

	/*
	 * In theory resid should be unsigned.
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid.
	 * On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
	 *
	 * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
	 * But it will be used by sockets doing message delivery.
	 *
	 * Note: We limit resid to be a positive int value as we use
	 * imin() to set bytes_to_copy -- radr://14558484
	 */
	if (resid < 0 || resid > INT_MAX || (so->so_type == SOCK_STREAM &&
	    !(so->so_flags & SOF_ENABLE_MSGS) && (flags & MSG_EOR))) {
		error = EINVAL;
		socket_unlock(so, 1);
		goto out;
	}

	dontroute = (flags & MSG_DONTROUTE) &&
	    (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);

	if (control != NULL)
		clen = control->m_len;

	do {
		error = sosendcheck(so, addr, resid, clen, atomic, flags,
		    &sblocked, control);
		if (error)
			goto release;

		mp = &top;
		if (so->so_flags & SOF_ENABLE_MSGS)
			space = msgq_sbspace(so, control);
		else
			space = sbspace(&so->so_snd) - clen;
		space += ((flags & MSG_OOB) ? 1024 : 0);

		do {
			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else {
				int chainlength;
				int bytes_to_copy;
				boolean_t jumbocl;
				boolean_t bigcl;

				bytes_to_copy = imin(resid, space);

				if (sosendminchain > 0)
					chainlength = 0;
				else
					chainlength = sosendmaxchain;

				/*
				 * Use big 4 KB clusters only when the
				 * outgoing interface does not prefer
				 * 2 KB clusters
				 */
				bigcl =
				    !(so->so_flags1 & SOF1_IF_2KCL) ||
				    sosendbigcl_ignore_capab;

				/*
				 * Attempt to use larger than system page-size
				 * clusters for large writes only if there is
				 * a jumbo cluster pool and if the socket is
				 * marked accordingly.
				 */
				jumbocl = sosendjcl && njcl > 0 &&
				    ((so->so_flags & SOF_MULTIPAGES) ||
				    sosendjcl_ignore_capab) &&
				    bigcl;

				socket_unlock(so, 0);

				do {
					int num_needed;
					int hdrs_needed = (top == NULL) ? 1 : 0;

					/*
					 * Try to maintain a local cache of
					 * mbuf clusters needed to complete
					 * this write; the list is further
					 * limited to the number that are
					 * currently needed to fill the
					 * socket.  This mechanism allows a
					 * large number of mbufs/clusters to
					 * be grabbed under a single mbuf
					 * lock.  If we can't get any
					 * clusters, then fall back to trying
					 * for mbufs.  If we fail early (or
					 * miscalculate the number needed),
					 * make sure to release any clusters
					 * we haven't yet consumed.
					 */
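					/*
					 * The three branches below try
					 * cluster sizes in decreasing order:
					 * 16 KB jumbo clusters
					 * (M16KCLBYTES), then 4 KB big
					 * clusters (MBIGCLBYTES), then 2 KB
					 * clusters (MCLBYTES), and finally a
					 * plain mbuf.
					 */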
					if (freelist == NULL &&
					    bytes_to_copy > MBIGCLBYTES &&
					    jumbocl) {
						num_needed =
						    bytes_to_copy / M16KCLBYTES;

						if ((bytes_to_copy -
						    (num_needed * M16KCLBYTES))
						    >= MINCLSIZE)
							num_needed++;

						freelist =
						    m_getpackets_internal(
						    (unsigned int *)&num_needed,
						    hdrs_needed, M_WAIT, 0,
						    M16KCLBYTES);
						/*
						 * Fall back to 4K cluster size
						 * if allocation failed
						 */
					}

					if (freelist == NULL &&
					    bytes_to_copy > MCLBYTES &&
					    bigcl) {
						num_needed =
						    bytes_to_copy / MBIGCLBYTES;

						if ((bytes_to_copy -
						    (num_needed * MBIGCLBYTES)) >=
						    MINCLSIZE)
							num_needed++;

						freelist =
						    m_getpackets_internal(
						    (unsigned int *)&num_needed,
						    hdrs_needed, M_WAIT, 0,
						    MBIGCLBYTES);
						/*
						 * Fall back to cluster size
						 * if allocation failed
						 */
					}

					if (freelist == NULL &&
					    bytes_to_copy > MINCLSIZE) {
						num_needed =
						    bytes_to_copy / MCLBYTES;

						if ((bytes_to_copy -
						    (num_needed * MCLBYTES)) >=
						    MINCLSIZE)
							num_needed++;

						freelist =
						    m_getpackets_internal(
						    (unsigned int *)&num_needed,
						    hdrs_needed, M_WAIT, 0,
						    MCLBYTES);
						/*
						 * Fall back to a single mbuf
						 * if allocation failed
						 */
					}

					if (freelist == NULL) {
						if (top == NULL)
							MGETHDR(freelist,
							    M_WAIT, MT_DATA);
						else
							MGET(freelist,
							    M_WAIT, MT_DATA);

						if (freelist == NULL) {
							error = ENOBUFS;
							socket_lock(so, 0);
							goto release;
						}
						/*
						 * For datagram protocols,
						 * leave room for protocol
						 * headers in first mbuf.
						 */
						if (atomic && top == NULL &&
						    bytes_to_copy < MHLEN) {
							MH_ALIGN(freelist,
							    bytes_to_copy);
						}
					}
					m = freelist;
					freelist = m->m_next;
					m->m_next = NULL;

					if ((m->m_flags & M_EXT))
						mlen = m->m_ext.ext_size;
					else if ((m->m_flags & M_PKTHDR))
						mlen =
						    MHLEN - m_leadingspace(m);
					else
						mlen = MLEN;
					len = imin(mlen, bytes_to_copy);

					chainlength += len;

					space -= len;

					error = uiomove(mtod(m, caddr_t),
					    len, uio);

					resid = uio_resid(uio);

					m->m_len = len;
					*mp = m;
					top->m_pkthdr.len += len;
					if (error)
						break;
					mp = &m->m_next;
					if (resid <= 0) {
						if (flags & MSG_EOR)
							top->m_flags |= M_EOR;
						break;
					}
					bytes_to_copy = min(resid, space);

				} while (space > 0 &&
				    (chainlength < sosendmaxchain || atomic ||
				    resid < MINCLSIZE));

				socket_lock(so, 0);

				if (error)
					goto release;
			}

			if (flags & (MSG_HOLD|MSG_SEND)) {
				/* Enqueue for later, go away if HOLD */
				struct mbuf *mb1;
				if (so->so_temp && (flags & MSG_FLUSH)) {
					m_freem(so->so_temp);
					so->so_temp = NULL;
				}
				if (so->so_temp)
					so->so_tail->m_next = top;
				else
					so->so_temp = top;
				mb1 = top;
				while (mb1->m_next)
					mb1 = mb1->m_next;
				so->so_tail = mb1;
				if (flags & MSG_HOLD) {
					top = NULL;
					goto release;
				}
				top = so->so_temp;
			}
			if (dontroute)
				so->so_options |= SO_DONTROUTE;

			/* Compute flags here, for pru_send and NKEs */
			sendflags = (flags & MSG_OOB) ? PRUS_OOB :
			    /*
			     * If the user set MSG_EOF, the protocol
			     * understands this flag, and there is nothing
			     * left to send, then use PRU_SEND_EOF instead
			     * of PRU_SEND.
 */
			    ((flags & MSG_EOF) &&
			    (so->so_proto->pr_flags & PR_IMPLOPCL) &&
			    (resid <= 0)) ? PRUS_EOF :
			    /* If there is more to send set PRUS_MORETOCOME */
			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;

			if ((flags & MSG_SKIPCFIL) == 0) {
				/*
				 * Socket filter processing
				 */
				error = sflt_data_out(so, addr, &top,
				    &control, (sendflags & MSG_OOB) ?
				    sock_data_filt_flag_oob : 0);
				if (error) {
					if (error == EJUSTRETURN) {
						error = 0;
						clen = 0;
						control = NULL;
						top = NULL;
					}
					goto release;
				}
#if CONTENT_FILTER
				/*
				 * Content filter processing
				 */
				error = cfil_sock_data_out(so, addr, top,
				    control, (sendflags & MSG_OOB) ?
				    sock_data_filt_flag_oob : 0);
				if (error) {
					if (error == EJUSTRETURN) {
						error = 0;
						clen = 0;
						control = NULL;
						top = NULL;
					}
					goto release;
				}
#endif /* CONTENT_FILTER */
			}
			if (so->so_flags & SOF_ENABLE_MSGS) {
				/*
				 * Make a copy of control mbuf,
				 * so that msg priority can be
				 * passed to subsequent mbufs.
				 */
				control_copy = m_dup(control, M_NOWAIT);
			}
			error = (*so->so_proto->pr_usrreqs->pru_send)
			    (so, sendflags, top, addr, control, p);

			if (flags & MSG_SEND)
				so->so_temp = NULL;

			if (dontroute)
				so->so_options &= ~SO_DONTROUTE;

			clen = 0;
			control = control_copy;
			control_copy = NULL;
			top = NULL;
			mp = &top;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	if (sblocked)
		sbunlock(&so->so_snd, FALSE);	/* will unlock socket */
	else
		socket_unlock(so, 1);
out:
	if (top != NULL)
		m_freem(top);
	if (control != NULL)
		m_freem(control);
	if (freelist != NULL)
		m_freem_list(freelist);
	if (control_copy != NULL)
		m_freem(control_copy);

	KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid, so->so_snd.sb_cc,
	    space, error);

	return (error);
}

int
sosend_list(struct socket *so, struct sockaddr *addr, struct uio **uioarray,
    u_int uiocnt, struct mbuf *top, struct mbuf *control, int flags)
{
	struct mbuf *m, *freelist = NULL;
	user_ssize_t len, resid;
	int clen = 0, error, dontroute, mlen;
	int atomic = sosendallatonce(so) || top;
	int sblocked = 0;
	struct proc *p = current_proc();
	u_int uiofirst = 0;
	u_int uiolast = 0;

	KERNEL_DEBUG((DBG_FNC_SOSEND_LIST | DBG_FUNC_START), so, uiocnt,
	    so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);

	if (so->so_type != SOCK_DGRAM) {
		error = EINVAL;
		goto out;
	}
	if (atomic == 0) {
		error = EINVAL;
		goto out;
	}
	if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
		error = EPROTONOSUPPORT;
		goto out;
	}
	if (flags & ~(MSG_DONTWAIT | MSG_NBIO)) {
		error = EINVAL;
		goto out;
	}
	if (uioarray != NULL)
		resid = uio_array_resid(uioarray, uiocnt);
	else
		resid = mbuf_pkt_list_len(top);

	/*
	 * In theory resid should be unsigned.
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid.  On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
 *
 * Note: We limit resid to be a positive int value as we use
 * imin() to set bytes_to_copy -- rdar://14558484
 */
	if (resid < 0 || resid > INT_MAX) {
		error = EINVAL;
		goto out;
	}
	/*
	 * Disallow functionality not currently supported.
	 * Note: Arrays of addresses and control messages will
	 * eventually need to be handled here.
	 */
	if (addr != NULL) {
		printf("%s addr not supported\n", __func__);
		error = EOPNOTSUPP;
		goto out;
	}
	if (control != NULL) {
		printf("%s control not supported\n", __func__);
		error = EOPNOTSUPP;
		goto out;
	}

	socket_lock(so, 1);
	so_update_last_owner_locked(so, p);
	so_update_policy(so);

#if NECP
	so_update_necp_policy(so, NULL, addr);
#endif /* NECP */

	dontroute = (flags & MSG_DONTROUTE) &&
	    (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);

	if (control != NULL)
		clen = control->m_len;

	error = sosendcheck(so, addr, resid, clen, atomic, flags,
	    &sblocked, control);
	if (error)
		goto release;

	do {
		int i;

		if (uioarray == NULL) {
			/*
			 * Data is prepackaged in "top".
			 */
			resid = 0;
		} else {
			int num_needed = 0;
			int chainlength;
			size_t maxpktlen = 0;

			if (sosendminchain > 0)
				chainlength = 0;
			else
				chainlength = sosendmaxchain;

			socket_unlock(so, 0);

			/*
			 * Find a set of uios that fit in a reasonable number
			 * of mbuf packets
			 */
			for (i = uiofirst; i < uiocnt; i++) {
				struct uio *auio = uioarray[i];

				len = uio_resid(auio);

				/* Do nothing for empty messages */
				if (len == 0)
					continue;

				num_needed += 1;
				uiolast += 1;

				if (len > maxpktlen)
					maxpktlen = len;

				chainlength += len;
				if (chainlength > sosendmaxchain)
					break;
			}
			/*
			 * Nothing left to send
			 */
			if (num_needed == 0) {
				socket_lock(so, 0);
				break;
			}
			/*
			 * Allocate the mbuf packets at once
			 */
			freelist = m_allocpacket_internal(
			    (unsigned int *)&num_needed,
			    maxpktlen, NULL, M_WAIT, 1, 0);

			if (freelist == NULL) {
				socket_lock(so, 0);
				error = ENOMEM;
				goto release;
			}
			/*
			 * Copy each uio of the set into its own mbuf packet
			 */
			for (i = uiofirst, m = freelist;
			    i < uiolast && m != NULL;
			    i++) {
				int bytes_to_copy;
				struct mbuf *n;
				struct uio *auio = uioarray[i];

				bytes_to_copy = uio_resid(auio);

				/* Do nothing for empty messages */
				if (bytes_to_copy == 0)
					continue;

				for (n = m; n != NULL; n = n->m_next) {
					mlen = mbuf_maxlen(n);

					len = imin(mlen, bytes_to_copy);

					/*
					 * Note: uiomove() decrements the iovec
					 * length
					 */
					error = uiomove(mtod(n, caddr_t),
					    len, auio);
					if (error != 0)
						break;
					n->m_len = len;
					m->m_pkthdr.len += len;

					VERIFY(m->m_pkthdr.len <= maxpktlen);

					bytes_to_copy -= len;
					resid -= len;
				}
				if (m->m_pkthdr.len == 0) {
					printf("%s so %llx pkt %llx len null\n",
					    __func__,
					    (uint64_t)VM_KERNEL_ADDRPERM(so),
					    (uint64_t)VM_KERNEL_ADDRPERM(m));
				}
				if (error != 0)
					break;
				m = m->m_nextpkt;
			}

			socket_lock(so, 0);

			if (error)
				goto release;
			top = freelist;
			freelist =
NULL; 2375 } 2376 2377 if (dontroute) 2378 so->so_options |= SO_DONTROUTE; 2379 2380 if ((flags & MSG_SKIPCFIL) == 0) { 2381 struct mbuf **prevnextp = NULL; 2382 2383 for (i = uiofirst, m = top; 2384 i < uiolast && m != NULL; 2385 i++) { 2386 struct mbuf *nextpkt = m->m_nextpkt; 2387 2388 /* 2389 * Socket filter processing 2390 */ 2391 error = sflt_data_out(so, addr, &m, 2392 &control, 0); 2393 if (error != 0 && error != EJUSTRETURN) 2394 goto release; 2395 2396#if CONTENT_FILTER 2397 if (error == 0) { 2398 /* 2399 * Content filter processing 2400 */ 2401 error = cfil_sock_data_out(so, addr, m, 2402 control, 0); 2403 if (error != 0 && error != EJUSTRETURN) 2404 goto release; 2405 } 2406#endif /* CONTENT_FILTER */ 2407 /* 2408 * Remove packet from the list when 2409 * swallowed by a filter 2410 */ 2411 if (error == EJUSTRETURN) { 2412 error = 0; 2413 if (prevnextp != NULL) 2414 *prevnextp = nextpkt; 2415 else 2416 top = nextpkt; 2417 } 2418 2419 m = nextpkt; 2420 if (m != NULL) 2421 prevnextp = &m->m_nextpkt; 2422 } 2423 } 2424 if (top != NULL) 2425 error = (*so->so_proto->pr_usrreqs->pru_send_list) 2426 (so, 0, top, addr, control, p); 2427 2428 if (dontroute) 2429 so->so_options &= ~SO_DONTROUTE; 2430 2431 clen = 0; 2432 top = NULL; 2433 uiofirst = uiolast; 2434 } while (resid > 0 && error == 0); 2435release: 2436 if (sblocked) 2437 sbunlock(&so->so_snd, FALSE); /* will unlock socket */ 2438 else 2439 socket_unlock(so, 1); 2440out: 2441 if (top != NULL) 2442 m_freem(top); 2443 if (control != NULL) 2444 m_freem(control); 2445 if (freelist != NULL) 2446 m_freem_list(freelist); 2447 2448 KERNEL_DEBUG(DBG_FNC_SOSEND_LIST | DBG_FUNC_END, so, resid, 2449 so->so_snd.sb_cc, 0, error); 2450 2451 return (error); 2452} 2453 2454/* 2455 * Implement receive operations on a socket. 2456 * We depend on the way that records are added to the sockbuf 2457 * by sbappend*. In particular, each record (mbufs linked through m_next) 2458 * must begin with an address if the protocol so specifies, 2459 * followed by an optional mbuf or mbufs containing ancillary data, 2460 * and then zero or more mbufs of data. 2461 * In order to avoid blocking network interrupts for the entire time here, 2462 * we splx() while doing the actual copy to user space. 2463 * Although the sockbuf is locked, new data may still be appended, 2464 * and thus we must maintain consistency of the sockbuf during that time. 2465 * 2466 * The caller may receive the data as a single mbuf chain by supplying 2467 * an mbuf **mp0 for use in returning the chain. The uio is then used 2468 * only for the count in uio_resid. 2469 * 2470 * Returns: 0 Success 2471 * ENOBUFS 2472 * ENOTCONN 2473 * EWOULDBLOCK 2474 * uiomove:EFAULT 2475 * sblock:EWOULDBLOCK 2476 * sblock:EINTR 2477 * sbwait:EBADF 2478 * sbwait:EINTR 2479 * sodelayed_copy:EFAULT 2480 * <pru_rcvoob>:EINVAL[TCP] 2481 * <pru_rcvoob>:EWOULDBLOCK[TCP] 2482 * <pru_rcvoob>:??? 2483 * <pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX] 2484 * <pr_domain->dom_externalize>:ENOBUFS[AF_UNIX] 2485 * <pr_domain->dom_externalize>:??? 2486 * 2487 * Notes: Additional return values from calls through <pru_rcvoob> and 2488 * <pr_domain->dom_externalize> depend on protocols other than 2489 * TCP or AF_UNIX, which are documented above. 
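 *
 * Illustrative userland view (a hypothetical sketch, not part of this
 * file): one record consumed through recvmsg(2) maps onto the layout
 * described above; the MT_SONAME mbuf fills msg_name, MT_CONTROL
 * mbufs fill msg_control, and the data mbufs are copied out through
 * msg_iov:
 *
 *	struct sockaddr_in from;
 *	char ctl[CMSG_SPACE(sizeof (int))], buf[2048];
 *	struct iovec iov = { .iov_base = buf, .iov_len = sizeof (buf) };
 *	struct msghdr msg = {
 *		.msg_name = &from, .msg_namelen = sizeof (from),
 *		.msg_iov = &iov, .msg_iovlen = 1,
 *		.msg_control = ctl, .msg_controllen = sizeof (ctl),
 *	};
 *	ssize_t n = recvmsg(s, &msg, 0);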
 */
int
soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	struct mbuf *m, **mp, *ml = NULL;
	struct mbuf *nextrecord, *free_list;
	int flags, error, offset;
	user_ssize_t len;
	struct protosw *pr = so->so_proto;
	int moff, type = 0;
	user_ssize_t orig_resid = uio_resid(uio);
	user_ssize_t delayed_copy_len;
	int can_delay;
	int need_event;
	struct proc *p = current_proc();

	KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so, uio_resid(uio),
	    so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat);

	/*
	 * Sanity check on the length passed by caller as we are making 'int'
	 * comparisons
	 */
	if (orig_resid < 0 || orig_resid > INT_MAX)
		return (EINVAL);

	socket_lock(so, 1);
	so_update_last_owner_locked(so, p);
	so_update_policy(so);

#ifdef MORE_LOCKING_DEBUG
	if (so->so_usecount == 1) {
		panic("%s: so=%x no other reference on socket\n", __func__, so);
		/* NOTREACHED */
	}
#endif
	mp = mp0;
	if (psa != NULL)
		*psa = NULL;
	if (controlp != NULL)
		*controlp = NULL;
	if (flagsp != NULL)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;

	/*
	 * If a recv attempt is made on a previously-accepted socket
	 * that has been marked as inactive (disconnected), reject
	 * the request.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
		struct sockbuf *sb = &so->so_rcv;

		error = ENOTCONN;
		SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n",
		    __func__, proc_pid(p), (uint64_t)VM_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so), error));
		/*
		 * This socket should have been disconnected and flushed
		 * prior to being returned from sodefunct(); there should
		 * be no data on its receive list, so panic otherwise.
		 */
		if (so->so_state & SS_DEFUNCT)
			sb_empty_assert(sb, __func__);
		socket_unlock(so, 1);
		return (error);
	}

	/*
	 * When SO_WANTOOBFLAG is set we try to get out-of-band data
	 * regardless of the flags argument.  Here is the case where
	 * out-of-band data is not inline.
	 */
	if ((flags & MSG_OOB) ||
	    ((so->so_options & SO_WANTOOBFLAG) != 0 &&
	    (so->so_options & SO_OOBINLINE) == 0 &&
	    (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
		m = m_get(M_WAIT, MT_DATA);
		if (m == NULL) {
			socket_unlock(so, 1);
			KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
			    ENOBUFS, 0, 0, 0, 0);
			return (ENOBUFS);
		}
		error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
		if (error)
			goto bad;
		socket_unlock(so, 0);
		do {
			error = uiomove(mtod(m, caddr_t),
			    imin(uio_resid(uio), m->m_len), uio);
			m = m_free(m);
		} while (uio_resid(uio) && error == 0 && m != NULL);
		socket_lock(so, 0);
bad:
		if (m != NULL)
			m_freem(m);

		if ((so->so_options & SO_WANTOOBFLAG) != 0) {
			if (error == EWOULDBLOCK || error == EINVAL) {
				/*
				 * Let's try to get normal data:
				 * EWOULDBLOCK: out-of-band data not
				 * received yet.  EINVAL: out-of-band data
				 * already read.
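 *
 * Illustrative sequence using the standard API (a sketch, not
 * code from this file): after the peer does
 *	send(s, "x", 1, MSG_OOB);
 * a receiver not using SO_OOBINLINE fetches the byte with
 *	char c; recv(s, &c, 1, MSG_OOB);
 * Repeating the call returns EINVAL (already read); issuing it
 * before the byte arrives returns EWOULDBLOCK.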
2597 */ 2598 error = 0; 2599 goto nooob; 2600 } else if (error == 0 && flagsp != NULL) { 2601 *flagsp |= MSG_OOB; 2602 } 2603 } 2604 socket_unlock(so, 1); 2605 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error, 2606 0, 0, 0, 0); 2607 2608 return (error); 2609 } 2610nooob: 2611 if (mp != NULL) 2612 *mp = NULL; 2613 2614 if (so->so_state & SS_ISCONFIRMING && uio_resid(uio)) { 2615 (*pr->pr_usrreqs->pru_rcvd)(so, 0); 2616 } 2617 2618 free_list = NULL; 2619 delayed_copy_len = 0; 2620restart: 2621#ifdef MORE_LOCKING_DEBUG 2622 if (so->so_usecount <= 1) 2623 printf("soreceive: sblock so=0x%llx ref=%d on socket\n", 2624 (uint64_t)VM_KERNEL_ADDRPERM(so), so->so_usecount); 2625#endif 2626 /* 2627 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE) 2628 * and if so just return to the caller. This could happen when 2629 * soreceive() is called by a socket upcall function during the 2630 * time the socket is freed. The socket buffer would have been 2631 * locked across the upcall, therefore we cannot put this thread 2632 * to sleep (else we will deadlock) or return EWOULDBLOCK (else 2633 * we may livelock), because the lock on the socket buffer will 2634 * only be released when the upcall routine returns to its caller. 2635 * Because the socket has been officially closed, there can be 2636 * no further read on it. 2637 * 2638 * A multipath subflow socket would have its SS_NOFDREF set by 2639 * default, so check for SOF_MP_SUBFLOW socket flag; when the 2640 * socket is closed for real, SOF_MP_SUBFLOW would be cleared. 2641 */ 2642 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) == 2643 (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) { 2644 socket_unlock(so, 1); 2645 return (0); 2646 } 2647 2648 error = sblock(&so->so_rcv, SBLOCKWAIT(flags)); 2649 if (error) { 2650 socket_unlock(so, 1); 2651 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error, 2652 0, 0, 0, 0); 2653 return (error); 2654 } 2655 2656 m = so->so_rcv.sb_mb; 2657 /* 2658 * If we have less data than requested, block awaiting more 2659 * (subject to any timeout) if: 2660 * 1. the current count is less than the low water mark, or 2661 * 2. MSG_WAITALL is set, and it is possible to do the entire 2662 * receive operation at once if we block (resid <= hiwat). 2663 * 3. MSG_DONTWAIT is not set 2664 * If MSG_WAITALL is set but resid is larger than the receive buffer, 2665 * we have to do the receive in sections, and thus risk returning 2666 * a short count if a timeout or signal occurs after we start. 2667 */ 2668 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 && 2669 so->so_rcv.sb_cc < uio_resid(uio)) && 2670 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat || 2671 ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat)) && 2672 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) { 2673 /* 2674 * Panic if we notice inconsistencies in the socket's 2675 * receive list; both sb_mb and sb_cc should correctly 2676 * reflect the contents of the list, otherwise we may 2677 * end up with false positives during select() or poll() 2678 * which could put the application in a bad state. 
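 * For example, a nonzero sb_cc paired with a NULL sb_mb would make
 * the socket appear readable to select() even though no data could
 * ever be returned, so we panic rather than let that state persist.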
2679 */ 2680 SB_MB_CHECK(&so->so_rcv); 2681 2682 if (so->so_error) { 2683 if (m != NULL) 2684 goto dontblock; 2685 error = so->so_error; 2686 if ((flags & MSG_PEEK) == 0) 2687 so->so_error = 0; 2688 goto release; 2689 } 2690 if (so->so_state & SS_CANTRCVMORE) { 2691#if CONTENT_FILTER 2692 /* 2693 * Deal with half closed connections 2694 */ 2695 if ((so->so_state & SS_ISDISCONNECTED) == 0 && 2696 cfil_sock_data_pending(&so->so_rcv) != 0) 2697 CFIL_LOG(LOG_INFO, 2698 "so %llx ignore SS_CANTRCVMORE", 2699 (uint64_t)VM_KERNEL_ADDRPERM(so)); 2700 else 2701#endif /* CONTENT_FILTER */ 2702 if (m != NULL) 2703 goto dontblock; 2704 else 2705 goto release; 2706 } 2707 for (; m != NULL; m = m->m_next) 2708 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { 2709 m = so->so_rcv.sb_mb; 2710 goto dontblock; 2711 } 2712 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 && 2713 (so->so_proto->pr_flags & PR_CONNREQUIRED)) { 2714 error = ENOTCONN; 2715 goto release; 2716 } 2717 if (uio_resid(uio) == 0) 2718 goto release; 2719 if ((so->so_state & SS_NBIO) || 2720 (flags & (MSG_DONTWAIT|MSG_NBIO))) { 2721 error = EWOULDBLOCK; 2722 goto release; 2723 } 2724 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1"); 2725 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1"); 2726 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */ 2727#if EVEN_MORE_LOCKING_DEBUG 2728 if (socket_debug) 2729 printf("Waiting for socket data\n"); 2730#endif 2731 2732 error = sbwait(&so->so_rcv); 2733#if EVEN_MORE_LOCKING_DEBUG 2734 if (socket_debug) 2735 printf("SORECEIVE - sbwait returned %d\n", error); 2736#endif 2737 if (so->so_usecount < 1) { 2738 panic("%s: after 2nd sblock so=%p ref=%d on socket\n", 2739 __func__, so, so->so_usecount); 2740 /* NOTREACHED */ 2741 } 2742 if (error) { 2743 socket_unlock(so, 1); 2744 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error, 2745 0, 0, 0, 0); 2746 return (error); 2747 } 2748 goto restart; 2749 } 2750dontblock: 2751 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv); 2752 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1"); 2753 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1"); 2754 nextrecord = m->m_nextpkt; 2755 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) { 2756 KASSERT(m->m_type == MT_SONAME, ("receive 1a")); 2757#if CONFIG_MACF_SOCKET_SUBSET 2758 /* 2759 * Call the MAC framework for policy checking if we're in 2760 * the user process context and the socket isn't connected. 2761 */ 2762 if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) { 2763 struct mbuf *m0 = m; 2764 /* 2765 * Dequeue this record (temporarily) from the receive 2766 * list since we're about to drop the socket's lock 2767 * where a new record may arrive and be appended to 2768 * the list. Upon MAC policy failure, the record 2769 * will be freed. Otherwise, we'll add it back to 2770 * the head of the list. We cannot rely on SB_LOCK 2771 * because append operation uses the socket's lock. 2772 */ 2773 do { 2774 m->m_nextpkt = NULL; 2775 sbfree(&so->so_rcv, m); 2776 m = m->m_next; 2777 } while (m != NULL); 2778 m = m0; 2779 so->so_rcv.sb_mb = nextrecord; 2780 SB_EMPTY_FIXUP(&so->so_rcv); 2781 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a"); 2782 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a"); 2783 socket_unlock(so, 0); 2784 2785 if (mac_socket_check_received(proc_ucred(p), so, 2786 mtod(m, struct sockaddr *)) != 0) { 2787 /* 2788 * MAC policy failure; free this record and 2789 * process the next record (or block until 2790 * one is available). 
We have adjusted sb_cc 2791 * and sb_mbcnt above so there is no need to 2792 * call sbfree() again. 2793 */ 2794 do { 2795 m = m_free(m); 2796 } while (m != NULL); 2797 /* 2798 * Clear SB_LOCK but don't unlock the socket. 2799 * Process the next record or wait for one. 2800 */ 2801 socket_lock(so, 0); 2802 sbunlock(&so->so_rcv, TRUE); /* stay locked */ 2803 goto restart; 2804 } 2805 socket_lock(so, 0); 2806 /* 2807 * If the socket has been defunct'd, drop it. 2808 */ 2809 if (so->so_flags & SOF_DEFUNCT) { 2810 m_freem(m); 2811 error = ENOTCONN; 2812 goto release; 2813 } 2814 /* 2815 * Re-adjust the socket receive list and re-enqueue 2816 * the record in front of any packets which may have 2817 * been appended while we dropped the lock. 2818 */ 2819 for (m = m0; m->m_next != NULL; m = m->m_next) 2820 sballoc(&so->so_rcv, m); 2821 sballoc(&so->so_rcv, m); 2822 if (so->so_rcv.sb_mb == NULL) { 2823 so->so_rcv.sb_lastrecord = m0; 2824 so->so_rcv.sb_mbtail = m; 2825 } 2826 m = m0; 2827 nextrecord = m->m_nextpkt = so->so_rcv.sb_mb; 2828 so->so_rcv.sb_mb = m; 2829 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b"); 2830 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b"); 2831 } 2832#endif /* CONFIG_MACF_SOCKET_SUBSET */ 2833 orig_resid = 0; 2834 if (psa != NULL) { 2835 *psa = dup_sockaddr(mtod(m, struct sockaddr *), 2836 mp0 == NULL); 2837 if ((*psa == NULL) && (flags & MSG_NEEDSA)) { 2838 error = EWOULDBLOCK; 2839 goto release; 2840 } 2841 } 2842 if (flags & MSG_PEEK) { 2843 m = m->m_next; 2844 } else { 2845 sbfree(&so->so_rcv, m); 2846 if (m->m_next == NULL && so->so_rcv.sb_cc != 0) { 2847 panic("%s: about to create invalid socketbuf", 2848 __func__); 2849 /* NOTREACHED */ 2850 } 2851 MFREE(m, so->so_rcv.sb_mb); 2852 m = so->so_rcv.sb_mb; 2853 if (m != NULL) { 2854 m->m_nextpkt = nextrecord; 2855 } else { 2856 so->so_rcv.sb_mb = nextrecord; 2857 SB_EMPTY_FIXUP(&so->so_rcv); 2858 } 2859 } 2860 } 2861 2862 /* 2863 * Process one or more MT_CONTROL mbufs present before any data mbufs 2864 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we 2865 * just copy the data; if !MSG_PEEK, we call into the protocol to 2866 * perform externalization. 2867 */ 2868 if (m != NULL && m->m_type == MT_CONTROL) { 2869 struct mbuf *cm = NULL, *cmn; 2870 struct mbuf **cme = &cm; 2871 struct sockbuf *sb_rcv = &so->so_rcv; 2872 struct mbuf **msgpcm = NULL; 2873 2874 /* 2875 * Externalizing the control messages would require us to 2876 * drop the socket's lock below. Once we re-acquire the 2877 * lock, the mbuf chain might change. In order to preserve 2878 * consistency, we unlink all control messages from the 2879 * first mbuf chain in one shot and link them separately 2880 * onto a different chain. 2881 */ 2882 do { 2883 if (flags & MSG_PEEK) { 2884 if (controlp != NULL) { 2885 if (*controlp == NULL) { 2886 msgpcm = controlp; 2887 } 2888 *controlp = m_copy(m, 0, m->m_len); 2889 2890 /* 2891 * If we failed to allocate an mbuf, 2892 * release any previously allocated 2893 * mbufs for control data. Return 2894 * an error. Keep the mbufs in the 2895 * socket as this is using 2896 * MSG_PEEK flag. 
2897 */ 2898 if (*controlp == NULL) { 2899 m_freem(*msgpcm); 2900 error = ENOBUFS; 2901 goto release; 2902 } 2903 controlp = &(*controlp)->m_next; 2904 } 2905 m = m->m_next; 2906 } else { 2907 m->m_nextpkt = NULL; 2908 sbfree(sb_rcv, m); 2909 sb_rcv->sb_mb = m->m_next; 2910 m->m_next = NULL; 2911 *cme = m; 2912 cme = &(*cme)->m_next; 2913 m = sb_rcv->sb_mb; 2914 } 2915 } while (m != NULL && m->m_type == MT_CONTROL); 2916 2917 if (!(flags & MSG_PEEK)) { 2918 if (sb_rcv->sb_mb != NULL) { 2919 sb_rcv->sb_mb->m_nextpkt = nextrecord; 2920 } else { 2921 sb_rcv->sb_mb = nextrecord; 2922 SB_EMPTY_FIXUP(sb_rcv); 2923 } 2924 if (nextrecord == NULL) 2925 sb_rcv->sb_lastrecord = m; 2926 } 2927 2928 SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl"); 2929 SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl"); 2930 2931 while (cm != NULL) { 2932 int cmsg_type; 2933 2934 cmn = cm->m_next; 2935 cm->m_next = NULL; 2936 cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type; 2937 2938 /* 2939 * Call the protocol to externalize SCM_RIGHTS message 2940 * and return the modified message to the caller upon 2941 * success. Otherwise, all other control messages are 2942 * returned unmodified to the caller. Note that we 2943 * only get into this loop if MSG_PEEK is not set. 2944 */ 2945 if (pr->pr_domain->dom_externalize != NULL && 2946 cmsg_type == SCM_RIGHTS) { 2947 /* 2948 * Release socket lock: see 3903171. This 2949 * would also allow more records to be appended 2950 * to the socket buffer. We still have SB_LOCK 2951 * set on it, so we can be sure that the head 2952 * of the mbuf chain won't change. 2953 */ 2954 socket_unlock(so, 0); 2955 error = (*pr->pr_domain->dom_externalize)(cm); 2956 socket_lock(so, 0); 2957 } else { 2958 error = 0; 2959 } 2960 2961 if (controlp != NULL && error == 0) { 2962 *controlp = cm; 2963 controlp = &(*controlp)->m_next; 2964 orig_resid = 0; 2965 } else { 2966 (void) m_free(cm); 2967 } 2968 cm = cmn; 2969 } 2970 /* 2971 * Update the value of nextrecord in case we received new 2972 * records when the socket was unlocked above for 2973 * externalizing SCM_RIGHTS. 2974 */ 2975 if (m != NULL) 2976 nextrecord = sb_rcv->sb_mb->m_nextpkt; 2977 else 2978 nextrecord = sb_rcv->sb_mb; 2979 orig_resid = 0; 2980 } 2981 2982 /* 2983 * If the socket is a TCP socket with message delivery 2984 * enabled, then create a control msg to deliver the 2985 * relative TCP sequence number for this data. Waiting 2986 * until this point will protect against failures to 2987 * allocate an mbuf for control msgs. 2988 */ 2989 if (so->so_type == SOCK_STREAM && SOCK_PROTO(so) == IPPROTO_TCP && 2990 (so->so_flags & SOF_ENABLE_MSGS) && controlp != NULL) { 2991 struct mbuf *seq_cm; 2992 2993 seq_cm = sbcreatecontrol((caddr_t)&m->m_pkthdr.msg_seq, 2994 sizeof (uint32_t), SCM_SEQNUM, SOL_SOCKET); 2995 if (seq_cm == NULL) { 2996 /* unable to allocate a control mbuf */ 2997 error = ENOBUFS; 2998 goto release; 2999 } 3000 *controlp = seq_cm; 3001 controlp = &seq_cm->m_next; 3002 } 3003 3004 if (m != NULL) { 3005 if (!(flags & MSG_PEEK)) { 3006 /* 3007 * We get here because m points to an mbuf following 3008 * any MT_SONAME or MT_CONTROL mbufs which have been 3009 * processed above. In any case, m should be pointing 3010 * to the head of the mbuf chain, and the nextrecord 3011 * should be either NULL or equal to m->m_nextpkt. 3012 * See comments above about SB_LOCK. 
 */
			if (m != so->so_rcv.sb_mb ||
			    m->m_nextpkt != nextrecord) {
				panic("%s: post-control !sync so=%p m=%p "
				    "nextrecord=%p\n", __func__, so, m,
				    nextrecord);
				/* NOTREACHED */
			}
			if (nextrecord == NULL)
				so->so_rcv.sb_lastrecord = m;
		}
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
	} else {
		if (!(flags & MSG_PEEK)) {
			SB_EMPTY_FIXUP(&so->so_rcv);
		}
	}
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");

	moff = 0;
	offset = 0;

	if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy)
		can_delay = 1;
	else
		can_delay = 0;

	need_event = 0;

	while (m != NULL &&
	    (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
				break;
		} else if (type == MT_OOBDATA) {
			break;
		}
		/*
		 * Make sure to always set the MSG_OOB event when getting
		 * out-of-band data inline.
		 */
		if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
		    (so->so_options & SO_OOBINLINE) != 0 &&
		    (so->so_state & SS_RCVATMARK) != 0) {
			flags |= MSG_OOB;
		}
		so->so_state &= ~SS_RCVATMARK;
		len = uio_resid(uio) - delayed_copy_len;
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * Sockbuf must be consistent here (points to current mbuf,
		 * it points to next record) when we drop priority;
		 * we must note any additions to the sockbuf when we
		 * block interrupts again.
		 */
		if (mp == NULL) {
			SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
			if (can_delay && len == m->m_len) {
				/*
				 * Only delay the copy if we're consuming the
				 * mbuf and we're NOT in MSG_PEEK mode
				 * and we have enough data to make it
				 * worthwhile to drop and retake the lock;
				 * can_delay reflects the state of the two
				 * latter constraints.  moff should always be
				 * zero in these cases.
				 */
				delayed_copy_len += len;
			} else {
				if (delayed_copy_len) {
					error = sodelayed_copy(so, uio,
					    &free_list, &delayed_copy_len);

					if (error) {
						goto release;
					}
					/*
					 * We can only get here if MSG_PEEK is
					 * not set; therefore, m should point
					 * at the head of the rcv queue.  If
					 * it doesn't, it means something
					 * drastically changed while we were
					 * out from behind the lock in
					 * sodelayed_copy, perhaps a RST on
					 * the stream.  In any event, the
					 * stream has been interrupted.  It's
					 * probably best just to return
					 * whatever data we've moved and let
					 * the caller sort it out.
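 *
 * (For context: in the common case sodelayed_copy() lets us
 * batch several fully-consumed mbufs into a single unlocked
 * uiomove() pass instead of cycling the socket lock once
 * per mbuf.)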
3110 */ 3111 if (m != so->so_rcv.sb_mb) { 3112 break; 3113 } 3114 } 3115 socket_unlock(so, 0); 3116 error = uiomove(mtod(m, caddr_t) + moff, 3117 (int)len, uio); 3118 socket_lock(so, 0); 3119 3120 if (error) 3121 goto release; 3122 } 3123 } else { 3124 uio_setresid(uio, (uio_resid(uio) - len)); 3125 } 3126 if (len == m->m_len - moff) { 3127 if (m->m_flags & M_EOR) 3128 flags |= MSG_EOR; 3129 if (flags & MSG_PEEK) { 3130 m = m->m_next; 3131 moff = 0; 3132 } else { 3133 nextrecord = m->m_nextpkt; 3134 sbfree(&so->so_rcv, m); 3135 m->m_nextpkt = NULL; 3136 3137 /* 3138 * If this packet is an unordered packet 3139 * (indicated by M_UNORDERED_DATA flag), remove 3140 * the additional bytes added to the 3141 * receive socket buffer size. 3142 */ 3143 if ((so->so_flags & SOF_ENABLE_MSGS) && 3144 m->m_len && 3145 (m->m_flags & M_UNORDERED_DATA) && 3146 sbreserve(&so->so_rcv, 3147 so->so_rcv.sb_hiwat - m->m_len)) { 3148 if (so->so_msg_state->msg_uno_bytes > 3149 m->m_len) { 3150 so->so_msg_state-> 3151 msg_uno_bytes -= m->m_len; 3152 } else { 3153 so->so_msg_state-> 3154 msg_uno_bytes = 0; 3155 } 3156 m->m_flags &= ~M_UNORDERED_DATA; 3157 } 3158 3159 if (mp != NULL) { 3160 *mp = m; 3161 mp = &m->m_next; 3162 so->so_rcv.sb_mb = m = m->m_next; 3163 *mp = NULL; 3164 } else { 3165 if (free_list == NULL) 3166 free_list = m; 3167 else 3168 ml->m_next = m; 3169 ml = m; 3170 so->so_rcv.sb_mb = m = m->m_next; 3171 ml->m_next = NULL; 3172 } 3173 if (m != NULL) { 3174 m->m_nextpkt = nextrecord; 3175 if (nextrecord == NULL) 3176 so->so_rcv.sb_lastrecord = m; 3177 } else { 3178 so->so_rcv.sb_mb = nextrecord; 3179 SB_EMPTY_FIXUP(&so->so_rcv); 3180 } 3181 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3"); 3182 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3"); 3183 } 3184 } else { 3185 if (flags & MSG_PEEK) { 3186 moff += len; 3187 } else { 3188 if (mp != NULL) { 3189 int copy_flag; 3190 3191 if (flags & MSG_DONTWAIT) 3192 copy_flag = M_DONTWAIT; 3193 else 3194 copy_flag = M_WAIT; 3195 *mp = m_copym(m, 0, len, copy_flag); 3196 /* 3197 * Failed to allocate an mbuf? 3198 * Adjust uio_resid back, it was 3199 * adjusted down by len bytes which 3200 * we didn't copy over. 3201 */ 3202 if (*mp == NULL) { 3203 uio_setresid(uio, 3204 (uio_resid(uio) + len)); 3205 break; 3206 } 3207 } 3208 m->m_data += len; 3209 m->m_len -= len; 3210 so->so_rcv.sb_cc -= len; 3211 } 3212 } 3213 if (so->so_oobmark) { 3214 if ((flags & MSG_PEEK) == 0) { 3215 so->so_oobmark -= len; 3216 if (so->so_oobmark == 0) { 3217 so->so_state |= SS_RCVATMARK; 3218 /* 3219 * delay posting the actual event until 3220 * after any delayed copy processing 3221 * has finished 3222 */ 3223 need_event = 1; 3224 break; 3225 } 3226 } else { 3227 offset += len; 3228 if (offset == so->so_oobmark) 3229 break; 3230 } 3231 } 3232 if (flags & MSG_EOR) 3233 break; 3234 /* 3235 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set 3236 * (for non-atomic socket), we must not quit until 3237 * "uio->uio_resid == 0" or an error termination. 3238 * If a signal/timeout occurs, return with a short 3239 * count but without error. Keep sockbuf locked 3240 * against other readers. 3241 */ 3242 while (flags & (MSG_WAITALL|MSG_WAITSTREAM) && m == NULL && 3243 (uio_resid(uio) - delayed_copy_len) > 0 && 3244 !sosendallatonce(so) && !nextrecord) { 3245 if (so->so_error || ((so->so_state & SS_CANTRCVMORE) 3246#if CONTENT_FILTER 3247 && cfil_sock_data_pending(&so->so_rcv) == 0 3248#endif /* CONTENT_FILTER */ 3249 )) 3250 goto release; 3251 3252 /* 3253 * Depending on the protocol (e.g. 
TCP), the following 3254 * might cause the socket lock to be dropped and later 3255 * be reacquired, and more data could have arrived and 3256 * have been appended to the receive socket buffer by 3257 * the time it returns. Therefore, we only sleep in 3258 * sbwait() below if and only if the socket buffer is 3259 * empty, in order to avoid a false sleep. 3260 */ 3261 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb && 3262 (((struct inpcb *)so->so_pcb)->inp_state != 3263 INPCB_STATE_DEAD)) 3264 (*pr->pr_usrreqs->pru_rcvd)(so, flags); 3265 3266 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2"); 3267 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2"); 3268 3269 if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) { 3270 error = 0; 3271 goto release; 3272 } 3273 /* 3274 * have to wait until after we get back from the sbwait 3275 * to do the copy because we will drop the lock if we 3276 * have enough data that has been delayed... by dropping 3277 * the lock we open up a window allowing the netisr 3278 * thread to process the incoming packets and to change 3279 * the state of this socket... we're issuing the sbwait 3280 * because the socket is empty and we're expecting the 3281 * netisr thread to wake us up when more packets arrive; 3282 * if we allow that processing to happen and then sbwait 3283 * we could stall forever with packets sitting in the 3284 * socket if no further packets arrive from the remote 3285 * side. 3286 * 3287 * we want to copy before we've collected all the data 3288 * to satisfy this request to allow the copy to overlap 3289 * the incoming packet processing on an MP system 3290 */ 3291 if (delayed_copy_len > sorecvmincopy && 3292 (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) { 3293 error = sodelayed_copy(so, uio, 3294 &free_list, &delayed_copy_len); 3295 3296 if (error) 3297 goto release; 3298 } 3299 m = so->so_rcv.sb_mb; 3300 if (m != NULL) { 3301 nextrecord = m->m_nextpkt; 3302 } 3303 SB_MB_CHECK(&so->so_rcv); 3304 } 3305 } 3306#ifdef MORE_LOCKING_DEBUG 3307 if (so->so_usecount <= 1) { 3308 panic("%s: after big while so=%p ref=%d on socket\n", 3309 __func__, so, so->so_usecount); 3310 /* NOTREACHED */ 3311 } 3312#endif 3313 3314 if (m != NULL && pr->pr_flags & PR_ATOMIC) { 3315 if (so->so_options & SO_DONTTRUNC) { 3316 flags |= MSG_RCVMORE; 3317 } else { 3318 flags |= MSG_TRUNC; 3319 if ((flags & MSG_PEEK) == 0) 3320 (void) sbdroprecord(&so->so_rcv); 3321 } 3322 } 3323 3324 /* 3325 * pru_rcvd below (for TCP) may cause more data to be received 3326 * if the socket lock is dropped prior to sending the ACK; some 3327 * legacy OpenTransport applications don't handle this well 3328 * (if it receives less data than requested while MSG_HAVEMORE 3329 * is set), and so we set the flag now based on what we know 3330 * prior to calling pru_rcvd. 3331 */ 3332 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) 3333 flags |= MSG_HAVEMORE; 3334 3335 if ((flags & MSG_PEEK) == 0) { 3336 if (m == NULL) { 3337 so->so_rcv.sb_mb = nextrecord; 3338 /* 3339 * First part is an inline SB_EMPTY_FIXUP(). Second 3340 * part makes sure sb_lastrecord is up-to-date if 3341 * there is still data in the socket buffer. 
3342 */ 3343 if (so->so_rcv.sb_mb == NULL) { 3344 so->so_rcv.sb_mbtail = NULL; 3345 so->so_rcv.sb_lastrecord = NULL; 3346 } else if (nextrecord->m_nextpkt == NULL) { 3347 so->so_rcv.sb_lastrecord = nextrecord; 3348 } 3349 SB_MB_CHECK(&so->so_rcv); 3350 } 3351 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4"); 3352 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4"); 3353 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) 3354 (*pr->pr_usrreqs->pru_rcvd)(so, flags); 3355 } 3356 3357 if (delayed_copy_len) { 3358 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len); 3359 if (error) 3360 goto release; 3361 } 3362 if (free_list != NULL) { 3363 m_freem_list(free_list); 3364 free_list = NULL; 3365 } 3366 if (need_event) 3367 postevent(so, 0, EV_OOB); 3368 3369 if (orig_resid == uio_resid(uio) && orig_resid && 3370 (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) { 3371 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */ 3372 goto restart; 3373 } 3374 3375 if (flagsp != NULL) 3376 *flagsp |= flags; 3377release: 3378#ifdef MORE_LOCKING_DEBUG 3379 if (so->so_usecount <= 1) { 3380 panic("%s: release so=%p ref=%d on socket\n", __func__, 3381 so, so->so_usecount); 3382 /* NOTREACHED */ 3383 } 3384#endif 3385 if (delayed_copy_len) 3386 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len); 3387 3388 if (free_list != NULL) 3389 m_freem_list(free_list); 3390 3391 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */ 3392 3393 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio), 3394 so->so_rcv.sb_cc, 0, error); 3395 3396 return (error); 3397} 3398 3399/* 3400 * Returns: 0 Success 3401 * uiomove:EFAULT 3402 */ 3403static int 3404sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list, 3405 user_ssize_t *resid) 3406{ 3407 int error = 0; 3408 struct mbuf *m; 3409 3410 m = *free_list; 3411 3412 socket_unlock(so, 0); 3413 3414 while (m != NULL && error == 0) { 3415 error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio); 3416 m = m->m_next; 3417 } 3418 m_freem_list(*free_list); 3419 3420 *free_list = NULL; 3421 *resid = 0; 3422 3423 socket_lock(so, 0); 3424 3425 return (error); 3426} 3427 3428int 3429soreceive_list(struct socket *so, struct sockaddr **psa, struct uio **uioarray, 3430 u_int uiocnt, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 3431{ 3432 struct mbuf *m, **mp; 3433 struct mbuf *nextrecord; 3434 struct mbuf *ml = NULL, *free_list = NULL; 3435 int flags, error, offset; 3436 user_ssize_t len; 3437 struct protosw *pr = so->so_proto; 3438 user_ssize_t orig_resid, resid; 3439 struct proc *p = current_proc(); 3440 struct uio *auio = NULL; 3441 int i = 0; 3442 int sblocked = 0; 3443 3444 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_START, 3445 so, uiocnt, 3446 so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat); 3447 3448 mp = mp0; 3449 if (psa != NULL) 3450 *psa = NULL; 3451 if (controlp != NULL) 3452 *controlp = NULL; 3453 if (flagsp != NULL) 3454 flags = *flagsp &~ MSG_EOR; 3455 else 3456 flags = 0; 3457 /* 3458 * Disallow functionality not currently supported 3459 */ 3460 if (mp0 != NULL) { 3461 printf("%s mp0 not supported\n", __func__); 3462 error = EOPNOTSUPP; 3463 goto out; 3464 } 3465 if (psa != NULL) { 3466 printf("%s sockaddr not supported\n", __func__); 3467 error = EOPNOTSUPP; 3468 goto out; 3469 } 3470 if (controlp != NULL) { 3471 printf("%s control not supported\n", __func__); 3472 error = EOPNOTSUPP; 3473 goto out; 3474 } 3475 3476 /* 3477 * Sanity checks: 3478 * - Only supports don't wait flags 3479 * - Only support 
datagram sockets (could be extended to raw) 3480 * - Must be atomic 3481 * - Protocol must support packet chains 3482 * - The uio array is NULL (should we panic?) 3483 */ 3484 if (flags & ~(MSG_DONTWAIT | MSG_NBIO)) { 3485 printf("%s flags not supported\n", __func__); 3486 error = EOPNOTSUPP; 3487 goto out; 3488 } 3489 if (so->so_type != SOCK_DGRAM) { 3490 error = EINVAL; 3491 goto out; 3492 } 3493 if (sosendallatonce(so) == 0) { 3494 error = EINVAL; 3495 goto out; 3496 } 3497 if (so->so_proto->pr_usrreqs->pru_send_list == NULL) { 3498 error = EPROTONOSUPPORT; 3499 goto out; 3500 } 3501 if (uioarray == NULL) { 3502 printf("%s uioarray is NULL\n", __func__); 3503 error = EINVAL; 3504 goto out; 3505 } 3506 if (uiocnt == 0) { 3507 printf("%s uiocnt is 0\n", __func__); 3508 error = EINVAL; 3509 goto out; 3510 } 3511 /* 3512 * Sanity check on the length passed by caller as we are making 'int' 3513 * comparisons 3514 */ 3515 resid = orig_resid = uio_array_resid(uioarray, uiocnt); 3516 if (orig_resid < 0 || orig_resid > INT_MAX) { 3517 error = EINVAL; 3518 goto out; 3519 } 3520 3521 socket_lock(so, 1); 3522 so_update_last_owner_locked(so, p); 3523 so_update_policy(so); 3524 3525#if NECP 3526 so_update_necp_policy(so, NULL, NULL); 3527#endif /* NECP */ 3528 3529 /* 3530 * If a recv attempt is made on a previously-accepted socket 3531 * that has been marked as inactive (disconnected), reject 3532 * the request. 3533 */ 3534 if (so->so_flags & SOF_DEFUNCT) { 3535 struct sockbuf *sb = &so->so_rcv; 3536 3537 error = ENOTCONN; 3538 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n", 3539 __func__, proc_pid(p), (uint64_t)VM_KERNEL_ADDRPERM(so), 3540 SOCK_DOM(so), SOCK_TYPE(so), error)); 3541 /* 3542 * This socket should have been disconnected and flushed 3543 * prior to being returned from sodefunct(); there should 3544 * be no data on its receive list, so panic otherwise. 3545 */ 3546 if (so->so_state & SS_DEFUNCT) 3547 sb_empty_assert(sb, __func__); 3548 goto release; 3549 } 3550 if (mp != NULL) 3551 *mp = NULL; 3552restart: 3553 /* 3554 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE) 3555 * and if so just return to the caller. This could happen when 3556 * soreceive() is called by a socket upcall function during the 3557 * time the socket is freed. The socket buffer would have been 3558 * locked across the upcall, therefore we cannot put this thread 3559 * to sleep (else we will deadlock) or return EWOULDBLOCK (else 3560 * we may livelock), because the lock on the socket buffer will 3561 * only be released when the upcall routine returns to its caller. 3562 * Because the socket has been officially closed, there can be 3563 * no further read on it. 
3564 */ 3565 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) == 3566 (SS_NOFDREF | SS_CANTRCVMORE)) { 3567 error = 0; 3568 goto release; 3569 } 3570 3571 error = sblock(&so->so_rcv, SBLOCKWAIT(flags)); 3572 if (error) { 3573 goto release; 3574 } 3575 sblocked = 1; 3576 3577 /* 3578 * Skip empty uio 3579 */ 3580 auio = uioarray[i]; 3581 while (uio_resid(auio) == 0) { 3582 i++; 3583 if (i >= uiocnt) { 3584 error = 0; 3585 goto release; 3586 } 3587 } 3588 3589 m = so->so_rcv.sb_mb; 3590 /* 3591 * Block awaiting more datagram if needed 3592 */ 3593 if (m == NULL) { 3594 /* 3595 * Panic if we notice inconsistencies in the socket's 3596 * receive list; both sb_mb and sb_cc should correctly 3597 * reflect the contents of the list, otherwise we may 3598 * end up with false positives during select() or poll() 3599 * which could put the application in a bad state. 3600 */ 3601 SB_MB_CHECK(&so->so_rcv); 3602 3603 if (so->so_error) { 3604 error = so->so_error; 3605 goto release; 3606 } 3607 if (so->so_state & SS_CANTRCVMORE) { 3608 goto release; 3609 } 3610 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 && 3611 (so->so_proto->pr_flags & PR_CONNREQUIRED)) { 3612 error = ENOTCONN; 3613 goto release; 3614 } 3615 if ((so->so_state & SS_NBIO) || 3616 (flags & (MSG_DONTWAIT|MSG_NBIO))) { 3617 error = EWOULDBLOCK; 3618 goto release; 3619 } 3620 /* 3621 * Do not block if we got some data 3622 * Note: We could use MSG_WAITALL to wait 3623 */ 3624 resid = uio_array_resid(uioarray, uiocnt); 3625 if (resid != orig_resid) { 3626 error = 0; 3627 goto release; 3628 } 3629 3630 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1"); 3631 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1"); 3632 3633 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */ 3634 sblocked = 0; 3635 3636 error = sbwait(&so->so_rcv); 3637 if (error) { 3638 goto release; 3639 } 3640 goto restart; 3641 } 3642 3643 if (m->m_pkthdr.len == 0) { 3644 printf("%s so %llx pkt %llx len is null\n", 3645 __func__, 3646 (uint64_t)VM_KERNEL_ADDRPERM(so), 3647 (uint64_t)VM_KERNEL_ADDRPERM(m)); 3648 goto restart; 3649 } 3650 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv); 3651 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1"); 3652 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1"); 3653 3654 /* 3655 * Consume the current uio index as we have a datagram 3656 */ 3657 i += 1; 3658 nextrecord = m->m_nextpkt; 3659 3660#if SO_RECEIVE_LIST_SOCKADDR_NOT_YET 3661 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) { 3662 /* 3663 * to be adapted from soreceive() 3664 */ 3665 } 3666#endif /* SO_RECEIVE_LIST_SOCKADDR_NOT_YET */ 3667 3668#if SO_RECEIVE_LIST_CONTROL_NOT_YET 3669 /* 3670 * Process one or more MT_CONTROL mbufs present before any data mbufs 3671 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we 3672 * just copy the data; if !MSG_PEEK, we call into the protocol to 3673 * perform externalization. 
3674 */ 3675 if (m != NULL && m->m_type == MT_CONTROL) { 3676 /* 3677 * to be adapted from soreceive() 3678 */ 3679 } 3680#endif /* SO_RECEIVE_LIST_CONTROL_NOT_YET */ 3681 3682 offset = 0; 3683 3684 /* 3685 * Loop to copy out the mbufs of the current record 3686 */ 3687 while (m != NULL && uio_resid(auio) > 0 && error == 0) { 3688 len = uio_resid(auio); 3689 3690 if (m->m_len == 0) 3691 printf("%s: so %llx m %llx m_len is 0\n", 3692 __func__, 3693 (uint64_t)VM_KERNEL_ADDRPERM(so), 3694 (uint64_t)VM_KERNEL_ADDRPERM(m)); 3695 3696 /* 3697 * Clip to the residual length 3698 */ 3699 if (len > m->m_len) 3700 len = m->m_len; 3701 /* 3702 * If mp is set, just pass back the mbufs. 3703 * Otherwise copy them out via the uio, then free. 3704 * Sockbuf must be consistent here (points to current mbuf, 3705 * it points to next record) when we drop priority; 3706 * we must note any additions to the sockbuf when we 3707 * block interrupts again. 3708 */ 3709 if (mp != NULL) { 3710 uio_setresid(auio, (uio_resid(auio) - len)); 3711 } else { 3712 SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove"); 3713 SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove"); 3714 3715 socket_unlock(so, 0); 3716 error = uiomove(mtod(m, caddr_t), (int)len, auio); 3717 socket_lock(so, 0); 3718 3719 if (error) 3720 goto release; 3721 } 3722 if (len == m->m_len) { 3723 /* 3724 * m was entirely copied 3725 */ 3726 nextrecord = m->m_nextpkt; 3727 sbfree(&so->so_rcv, m); 3728 m->m_nextpkt = NULL; 3729 3730 /* 3731 * Move to m_next 3732 */ 3733 if (mp != NULL) { 3734 *mp = m; 3735 mp = &m->m_next; 3736 so->so_rcv.sb_mb = m = m->m_next; 3737 *mp = NULL; 3738 } else { 3739 if (free_list == NULL) 3740 free_list = m; 3741 else 3742 ml->m_next = m; 3743 ml = m; 3744 so->so_rcv.sb_mb = m = m->m_next; 3745 ml->m_next = NULL; 3746 ml->m_nextpkt = NULL; 3747 } 3748 if (m != NULL) { 3749 m->m_nextpkt = nextrecord; 3750 if (nextrecord == NULL) 3751 so->so_rcv.sb_lastrecord = m; 3752 } else { 3753 so->so_rcv.sb_mb = nextrecord; 3754 SB_EMPTY_FIXUP(&so->so_rcv); 3755 } 3756 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3"); 3757 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3"); 3758 } else { 3759 /* 3760 * Stop the loop on partial copy 3761 */ 3762 if (mp != NULL) { 3763 int copy_flag; 3764 3765 if (flags & MSG_DONTWAIT) 3766 copy_flag = M_DONTWAIT; 3767 else 3768 copy_flag = M_WAIT; 3769 *mp = m_copym(m, 0, len, copy_flag); 3770 /* 3771 * Failed to allocate an mbuf? 3772 * Adjust uio_resid back, it was 3773 * adjusted down by len bytes which 3774 * we didn't copy over. 3775 */ 3776 if (*mp == NULL) { 3777 uio_setresid(auio, 3778 (uio_resid(auio) + len)); 3779 error = ENOMEM; 3780 break; 3781 } 3782 } 3783 break; 3784 } 3785 } 3786#ifdef MORE_LOCKING_DEBUG 3787 if (so->so_usecount <= 1) { 3788 panic("%s: after big while so=%llx ref=%d on socket\n", 3789 __func__, 3790 (uint64_t)VM_KERNEL_ADDRPERM(so), so->so_usecount); 3791 /* NOTREACHED */ 3792 } 3793#endif 3794 /* 3795 * Tell the caller we made a partial copy 3796 */ 3797 if (m != NULL) { 3798 if (so->so_options & SO_DONTTRUNC) { 3799 m->m_data += len; 3800 m->m_len -= len; 3801 so->so_rcv.sb_cc -= len; 3802 flags |= MSG_RCVMORE; 3803 } else { 3804 (void) sbdroprecord(&so->so_rcv); 3805 nextrecord = so->so_rcv.sb_mb; 3806 m = NULL; 3807 flags |= MSG_TRUNC; 3808 } 3809 } 3810 3811 if (m == NULL) { 3812 so->so_rcv.sb_mb = nextrecord; 3813 /* 3814 * First part is an inline SB_EMPTY_FIXUP(). Second 3815 * part makes sure sb_lastrecord is up-to-date if 3816 * there is still data in the socket buffer. 
3817 */ 3818 if (so->so_rcv.sb_mb == NULL) { 3819 so->so_rcv.sb_mbtail = NULL; 3820 so->so_rcv.sb_lastrecord = NULL; 3821 } else if (nextrecord->m_nextpkt == NULL) { 3822 so->so_rcv.sb_lastrecord = nextrecord; 3823 } 3824 SB_MB_CHECK(&so->so_rcv); 3825 } 3826 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4"); 3827 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4"); 3828 3829 /* 3830 * We can continue to the next packet as long as: 3831 * - We haven't exhausted the uio array 3832 * - There was no error 3833 * - A packet was not truncated 3834 * - We can still receive more data 3835 */ 3836 if (i < uiocnt && error == 0 && 3837 (flags & (MSG_RCVMORE | MSG_TRUNC)) == 0 3838 && (so->so_state & SS_CANTRCVMORE) == 0) { 3839 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */ 3840 sblocked = 0; 3841 3842 goto restart; 3843 } 3844 3845release: 3846 /* 3847 * pru_rcvd may cause more data to be received if the socket lock 3848 * is dropped so we set MSG_HAVEMORE now based on what we know. 3849 * That way the caller won't be surprised if it receives less data than requested. 3850 */ 3851 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) 3852 flags |= MSG_HAVEMORE; 3853 3854 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) 3855 (*pr->pr_usrreqs->pru_rcvd)(so, flags); 3856 3857 if (flagsp != NULL) 3858 *flagsp |= flags; 3859 if (sblocked) 3860 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */ 3861 else 3862 socket_unlock(so, 1); 3863out: 3864 /* 3865 * Amortize the cost 3866 */ 3867 if (free_list != NULL) 3868 m_freem_list(free_list); 3869 3870 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_END, error, 3871 0, 0, 0, 0); 3872 return (error); 3873} 3874 3875/* 3876 * Returns: 0 Success 3877 * EINVAL 3878 * ENOTCONN 3879 * <pru_shutdown>:EINVAL 3880 * <pru_shutdown>:EADDRNOTAVAIL[TCP] 3881 * <pru_shutdown>:ENOBUFS[TCP] 3882 * <pru_shutdown>:EMSGSIZE[TCP] 3883 * <pru_shutdown>:EHOSTUNREACH[TCP] 3884 * <pru_shutdown>:ENETUNREACH[TCP] 3885 * <pru_shutdown>:ENETDOWN[TCP] 3886 * <pru_shutdown>:ENOMEM[TCP] 3887 * <pru_shutdown>:EACCES[TCP] 3888 * <pru_shutdown>:EMSGSIZE[TCP] 3889 * <pru_shutdown>:ENOBUFS[TCP] 3890 * <pru_shutdown>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL] 3891 * <pru_shutdown>:??? 
[other protocol families] 3892 */ 3893int 3894soshutdown(struct socket *so, int how) 3895{ 3896 int error; 3897 3898 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_START, how, 0, 0, 0, 0); 3899 3900 switch (how) { 3901 case SHUT_RD: 3902 case SHUT_WR: 3903 case SHUT_RDWR: 3904 socket_lock(so, 1); 3905 if ((so->so_state & 3906 (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) == 0) { 3907 error = ENOTCONN; 3908 } else { 3909 error = soshutdownlock(so, how); 3910 } 3911 socket_unlock(so, 1); 3912 break; 3913 default: 3914 error = EINVAL; 3915 break; 3916 } 3917 3918 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, how, error, 0, 0, 0); 3919 3920 return (error); 3921} 3922 3923int 3924soshutdownlock_final(struct socket *so, int how) 3925{ 3926 struct protosw *pr = so->so_proto; 3927 int error = 0; 3928 3929 sflt_notify(so, sock_evt_shutdown, &how); 3930 3931 if (how != SHUT_WR) { 3932 if ((so->so_state & SS_CANTRCVMORE) != 0) { 3933 /* read already shut down */ 3934 error = ENOTCONN; 3935 goto done; 3936 } 3937 sorflush(so); 3938 postevent(so, 0, EV_RCLOSED); 3939 } 3940 if (how != SHUT_RD) { 3941 if ((so->so_state & SS_CANTSENDMORE) != 0) { 3942 /* write already shut down */ 3943 error = ENOTCONN; 3944 goto done; 3945 } 3946 error = (*pr->pr_usrreqs->pru_shutdown)(so); 3947 postevent(so, 0, EV_WCLOSED); 3948 } 3949done: 3950 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN, how, 1, 0, 0, 0); 3951 return (error); 3952} 3953 3954int 3955soshutdownlock(struct socket *so, int how) 3956{ 3957 int error = 0; 3958 3959#if CONTENT_FILTER 3960 /* 3961 * A content filter may delay the actual shutdown until it 3962 * has processed the pending data 3963 */ 3964 if (so->so_flags & SOF_CONTENT_FILTER) { 3965 error = cfil_sock_shutdown(so, &how); 3966 if (error == EJUSTRETURN) { 3967 error = 0; 3968 goto done; 3969 } else if (error != 0) { 3970 goto done; 3971 } 3972 } 3973#endif /* CONTENT_FILTER */ 3974 3975 error = soshutdownlock_final(so, how); 3976 3977done: 3978 return (error); 3979} 3980 3981void 3982sowflush(struct socket *so) 3983{ 3984 struct sockbuf *sb = &so->so_snd; 3985#ifdef notyet 3986 lck_mtx_t *mutex_held; 3987 /* 3988 * XXX: This code is currently commented out, because we may get here 3989 * as part of sofreelastref(), and at that time, pr_getlock() may no 3990 * longer be able to return us the lock; this will be fixed in future. 3991 */ 3992 if (so->so_proto->pr_getlock != NULL) 3993 mutex_held = (*so->so_proto->pr_getlock)(so, 0); 3994 else 3995 mutex_held = so->so_proto->pr_domain->dom_mtx; 3996 3997 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED); 3998#endif /* notyet */ 3999 4000 /* 4001 * Obtain lock on the socket buffer (SB_LOCK). This is required 4002 * to prevent the socket buffer from being unexpectedly altered 4003 * while it is used by another thread in socket send/receive. 4004 * 4005 * sblock() must not fail here, hence the assertion. 
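 * (Here SBL_WAIT asks for a blocking acquisition, SBL_NOINTR for an
 * uninterruptible one, and SBL_IGNDEFUNCT to proceed even if the
 * socket is defunct; together these are why the call below is not
 * expected to fail.)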
4006 */ 4007 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT); 4008 VERIFY(sb->sb_flags & SB_LOCK); 4009 4010 sb->sb_flags &= ~(SB_SEL|SB_UPCALL); 4011 sb->sb_flags |= SB_DROP; 4012 sb->sb_upcall = NULL; 4013 sb->sb_upcallarg = NULL; 4014 4015 sbunlock(sb, TRUE); /* keep socket locked */ 4016 4017 selthreadclear(&sb->sb_sel); 4018 sbrelease(sb); 4019} 4020 4021void 4022sorflush(struct socket *so) 4023{ 4024 struct sockbuf *sb = &so->so_rcv; 4025 struct protosw *pr = so->so_proto; 4026 struct sockbuf asb; 4027#ifdef notyet 4028 lck_mtx_t *mutex_held; 4029 /* 4030 * XXX: This code is currently commented out, because we may get here 4031 * as part of sofreelastref(), and at that time, pr_getlock() may no 4032 * longer be able to return us the lock; this will be fixed in future. 4033 */ 4034 if (so->so_proto->pr_getlock != NULL) 4035 mutex_held = (*so->so_proto->pr_getlock)(so, 0); 4036 else 4037 mutex_held = so->so_proto->pr_domain->dom_mtx; 4038 4039 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED); 4040#endif /* notyet */ 4041 4042 sflt_notify(so, sock_evt_flush_read, NULL); 4043 4044 socantrcvmore(so); 4045 4046 /* 4047 * Obtain lock on the socket buffer (SB_LOCK). This is required 4048 * to prevent the socket buffer from being unexpectedly altered 4049 * while it is used by another thread in socket send/receive. 4050 * 4051 * sblock() must not fail here, hence the assertion. 4052 */ 4053 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT); 4054 VERIFY(sb->sb_flags & SB_LOCK); 4055 4056 /* 4057 * Copy only the relevant fields from "sb" to "asb" which we 4058 * need for sbrelease() to function. In particular, skip 4059 * sb_sel as it contains the wait queue linkage, which would 4060 * wreak havoc if we were to issue selthreadclear() on "asb". 4061 * Make sure to not carry over SB_LOCK in "asb", as we need 4062 * to acquire it later as part of sbrelease(). 4063 */ 4064 bzero(&asb, sizeof (asb)); 4065 asb.sb_cc = sb->sb_cc; 4066 asb.sb_hiwat = sb->sb_hiwat; 4067 asb.sb_mbcnt = sb->sb_mbcnt; 4068 asb.sb_mbmax = sb->sb_mbmax; 4069 asb.sb_ctl = sb->sb_ctl; 4070 asb.sb_lowat = sb->sb_lowat; 4071 asb.sb_mb = sb->sb_mb; 4072 asb.sb_mbtail = sb->sb_mbtail; 4073 asb.sb_lastrecord = sb->sb_lastrecord; 4074 asb.sb_so = sb->sb_so; 4075 asb.sb_flags = sb->sb_flags; 4076 asb.sb_flags &= ~(SB_LOCK|SB_SEL|SB_KNOTE|SB_UPCALL); 4077 asb.sb_flags |= SB_DROP; 4078 4079 /* 4080 * Ideally we'd bzero() these and preserve the ones we need; 4081 * but to do that we'd need to shuffle things around in the 4082 * sockbuf, and we can't do it now because there are KEXTS 4083 * that are directly referring to the socket structure. 4084 * 4085 * Setting SB_DROP acts as a barrier to prevent further appends. 4086 * Clearing SB_SEL is done for selthreadclear() below. 4087 */ 4088 sb->sb_cc = 0; 4089 sb->sb_hiwat = 0; 4090 sb->sb_mbcnt = 0; 4091 sb->sb_mbmax = 0; 4092 sb->sb_ctl = 0; 4093 sb->sb_lowat = 0; 4094 sb->sb_mb = NULL; 4095 sb->sb_mbtail = NULL; 4096 sb->sb_lastrecord = NULL; 4097 sb->sb_timeo.tv_sec = 0; 4098 sb->sb_timeo.tv_usec = 0; 4099 sb->sb_upcall = NULL; 4100 sb->sb_upcallarg = NULL; 4101 sb->sb_flags &= ~(SB_SEL|SB_UPCALL); 4102 sb->sb_flags |= SB_DROP; 4103 4104 sbunlock(sb, TRUE); /* keep socket locked */ 4105 4106 /* 4107 * Note that selthreadclear() is called on the original "sb" and 4108 * not the local "asb" because of the way wait queue linkage is 4109 * implemented. Given that selwakeup() may be triggered, SB_SEL 4110 * should no longer be set (cleared above.) 
4111 */
4112 selthreadclear(&sb->sb_sel);
4113
4114 if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose)
4115 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
4116
4117 sbrelease(&asb);
4118}
4119
4120/*
4121 * Perhaps this routine, and sooptcopyout(), below, ought to come in
4122 * an additional variant to handle the case where the option value needs
4123 * to be some kind of integer, but not a specific size.
4124 * In addition to their use here, these functions are also called by the
4125 * protocol-level pr_ctloutput() routines.
4126 *
4127 * Returns: 0 Success
4128 * EINVAL
4129 * copyin:EFAULT
4130 */
4131int
4132sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
4133{
4134 size_t valsize;
4135
4136 /*
4137 * If the user gives us more than we wanted, we ignore it,
4138 * but if we don't get the minimum length the caller
4139 * wants, we return EINVAL. On success, sopt->sopt_valsize
4140 * is set to however much we actually retrieved.
4141 */
4142 if ((valsize = sopt->sopt_valsize) < minlen)
4143 return (EINVAL);
4144 if (valsize > len)
4145 sopt->sopt_valsize = valsize = len;
4146
4147 if (sopt->sopt_p != kernproc)
4148 return (copyin(sopt->sopt_val, buf, valsize));
4149
4150 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
4151 return (0);
4152}
4153
4154/*
4155 * sooptcopyin_timeval
4156 * Copy in a timeval value into tv_p, and take into account whether
4157 * the calling process is 64-bit or 32-bit. Moved the sanity checking
4158 * code here so that we can verify the 64-bit tv_sec value before we lose
4159 * the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
4160 */
4161static int
4162sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
4163{
4164 int error;
4165
4166 if (proc_is64bit(sopt->sopt_p)) {
4167 struct user64_timeval tv64;
4168
4169 if (sopt->sopt_valsize < sizeof (tv64))
4170 return (EINVAL);
4171
4172 sopt->sopt_valsize = sizeof (tv64);
4173 if (sopt->sopt_p != kernproc) {
4174 error = copyin(sopt->sopt_val, &tv64, sizeof (tv64));
4175 if (error != 0)
4176 return (error);
4177 } else {
4178 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
4179 sizeof (tv64));
4180 }
4181 if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
4182 tv64.tv_usec < 0 || tv64.tv_usec >= 1000000)
4183 return (EDOM);
4184
4185 tv_p->tv_sec = tv64.tv_sec;
4186 tv_p->tv_usec = tv64.tv_usec;
4187 } else {
4188 struct user32_timeval tv32;
4189
4190 if (sopt->sopt_valsize < sizeof (tv32))
4191 return (EINVAL);
4192
4193 sopt->sopt_valsize = sizeof (tv32);
4194 if (sopt->sopt_p != kernproc) {
4195 error = copyin(sopt->sopt_val, &tv32, sizeof (tv32));
4196 if (error != 0) {
4197 return (error);
4198 }
4199 } else {
4200 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
4201 sizeof (tv32));
4202 }
4203#ifndef __LP64__
4204 /*
4205 * K64todo "comparison is always false due to
4206 * limited range of data type"
4207 */
4208 if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
4209 tv32.tv_usec < 0 || tv32.tv_usec >= 1000000)
4210 return (EDOM);
4211#endif
4212 tv_p->tv_sec = tv32.tv_sec;
4213 tv_p->tv_usec = tv32.tv_usec;
4214 }
4215 return (0);
4216}
4217
4218/*
4219 * Returns: 0 Success
4220 * EINVAL
4221 * ENOPROTOOPT
4222 * ENOBUFS
4223 * EDOM
4224 * sooptcopyin:EINVAL
4225 * sooptcopyin:EFAULT
4226 * sooptcopyin_timeval:EINVAL
4227 * sooptcopyin_timeval:EFAULT
4228 * sooptcopyin_timeval:EDOM
4229 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
4230 * <pr_ctloutput>:???
4231 * sflt_attach_private:??? [whatever a filter author chooses]
4232 * <sf_setoption>:???
[whatever a filter author chooses] 4233 * 4234 * Notes: Other <pru_listen> returns depend on the protocol family; all 4235 * <sf_listen> returns depend on what the filter author causes 4236 * their filter to return. 4237 */ 4238int 4239sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock) 4240{ 4241 int error, optval; 4242 struct linger l; 4243 struct timeval tv; 4244#if CONFIG_MACF_SOCKET 4245 struct mac extmac; 4246#endif /* MAC_SOCKET */ 4247 4248 if (sopt->sopt_dir != SOPT_SET) 4249 sopt->sopt_dir = SOPT_SET; 4250 4251 if (dolock) 4252 socket_lock(so, 1); 4253 4254 if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) == 4255 (SS_CANTRCVMORE | SS_CANTSENDMORE) && 4256 (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) { 4257 /* the socket has been shutdown, no more sockopt's */ 4258 error = EINVAL; 4259 goto out; 4260 } 4261 4262 error = sflt_setsockopt(so, sopt); 4263 if (error != 0) { 4264 if (error == EJUSTRETURN) 4265 error = 0; 4266 goto out; 4267 } 4268 4269 if (sopt->sopt_level != SOL_SOCKET) { 4270 if (so->so_proto != NULL && 4271 so->so_proto->pr_ctloutput != NULL) { 4272 error = (*so->so_proto->pr_ctloutput)(so, sopt); 4273 goto out; 4274 } 4275 error = ENOPROTOOPT; 4276 } else { 4277 /* 4278 * Allow socket-level (SOL_SOCKET) options to be filtered by 4279 * the protocol layer, if needed. A zero value returned from 4280 * the handler means use default socket-level processing as 4281 * done by the rest of this routine. Otherwise, any other 4282 * return value indicates that the option is unsupported. 4283 */ 4284 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs-> 4285 pru_socheckopt(so, sopt)) != 0) 4286 goto out; 4287 4288 error = 0; 4289 switch (sopt->sopt_name) { 4290 case SO_LINGER: 4291 case SO_LINGER_SEC: 4292 error = sooptcopyin(sopt, &l, sizeof (l), sizeof (l)); 4293 if (error != 0) 4294 goto out; 4295 4296 so->so_linger = (sopt->sopt_name == SO_LINGER) ? 4297 l.l_linger : l.l_linger * hz; 4298 if (l.l_onoff != 0) 4299 so->so_options |= SO_LINGER; 4300 else 4301 so->so_options &= ~SO_LINGER; 4302 break; 4303 4304 case SO_DEBUG: 4305 case SO_KEEPALIVE: 4306 case SO_DONTROUTE: 4307 case SO_USELOOPBACK: 4308 case SO_BROADCAST: 4309 case SO_REUSEADDR: 4310 case SO_REUSEPORT: 4311 case SO_OOBINLINE: 4312 case SO_TIMESTAMP: 4313 case SO_TIMESTAMP_MONOTONIC: 4314 case SO_DONTTRUNC: 4315 case SO_WANTMORE: 4316 case SO_WANTOOBFLAG: 4317 case SO_NOWAKEFROMSLEEP: 4318 error = sooptcopyin(sopt, &optval, sizeof (optval), 4319 sizeof (optval)); 4320 if (error != 0) 4321 goto out; 4322 if (optval) 4323 so->so_options |= sopt->sopt_name; 4324 else 4325 so->so_options &= ~sopt->sopt_name; 4326 break; 4327 4328 case SO_SNDBUF: 4329 case SO_RCVBUF: 4330 case SO_SNDLOWAT: 4331 case SO_RCVLOWAT: 4332 error = sooptcopyin(sopt, &optval, sizeof (optval), 4333 sizeof (optval)); 4334 if (error != 0) 4335 goto out; 4336 4337 /* 4338 * Values < 1 make no sense for any of these 4339 * options, so disallow them. 4340 */ 4341 if (optval < 1) { 4342 error = EINVAL; 4343 goto out; 4344 } 4345 4346 switch (sopt->sopt_name) { 4347 case SO_SNDBUF: 4348 case SO_RCVBUF: { 4349 struct sockbuf *sb = 4350 (sopt->sopt_name == SO_SNDBUF) ? 4351 &so->so_snd : &so->so_rcv; 4352 if (sbreserve(sb, (u_int32_t)optval) == 0) { 4353 error = ENOBUFS; 4354 goto out; 4355 } 4356 sb->sb_flags |= SB_USRSIZE; 4357 sb->sb_flags &= ~SB_AUTOSIZE; 4358 sb->sb_idealsize = (u_int32_t)optval; 4359 break; 4360 } 4361 /* 4362 * Make sure the low-water is never greater than 4363 * the high-water. 
4364 */ 4365 case SO_SNDLOWAT: { 4366 int space = sbspace(&so->so_snd); 4367 u_int32_t hiwat = so->so_snd.sb_hiwat; 4368 4369 if (so->so_snd.sb_flags & SB_UNIX) { 4370 struct unpcb *unp = 4371 (struct unpcb *)(so->so_pcb); 4372 if (unp != NULL && unp->unp_conn != NULL) { 4373 hiwat += unp->unp_conn->unp_cc; 4374 } 4375 } 4376 4377 so->so_snd.sb_lowat = 4378 (optval > hiwat) ? 4379 hiwat : optval; 4380 4381 if (space >= so->so_snd.sb_lowat) { 4382 sowwakeup(so); 4383 } 4384 break; 4385 } 4386 case SO_RCVLOWAT: { 4387 int64_t data_len; 4388 so->so_rcv.sb_lowat = 4389 (optval > so->so_rcv.sb_hiwat) ? 4390 so->so_rcv.sb_hiwat : optval; 4391 data_len = so->so_rcv.sb_cc 4392 - so->so_rcv.sb_ctl; 4393 if (data_len >= so->so_rcv.sb_lowat) 4394 sorwakeup(so); 4395 break; 4396 } 4397 } 4398 break; 4399 4400 case SO_SNDTIMEO: 4401 case SO_RCVTIMEO: 4402 error = sooptcopyin_timeval(sopt, &tv); 4403 if (error != 0) 4404 goto out; 4405 4406 switch (sopt->sopt_name) { 4407 case SO_SNDTIMEO: 4408 so->so_snd.sb_timeo = tv; 4409 break; 4410 case SO_RCVTIMEO: 4411 so->so_rcv.sb_timeo = tv; 4412 break; 4413 } 4414 break; 4415 4416 case SO_NKE: { 4417 struct so_nke nke; 4418 4419 error = sooptcopyin(sopt, &nke, sizeof (nke), 4420 sizeof (nke)); 4421 if (error != 0) 4422 goto out; 4423 4424 error = sflt_attach_internal(so, nke.nke_handle); 4425 break; 4426 } 4427 4428 case SO_NOSIGPIPE: 4429 error = sooptcopyin(sopt, &optval, sizeof (optval), 4430 sizeof (optval)); 4431 if (error != 0) 4432 goto out; 4433 if (optval != 0) 4434 so->so_flags |= SOF_NOSIGPIPE; 4435 else 4436 so->so_flags &= ~SOF_NOSIGPIPE; 4437 break; 4438 4439 case SO_NOADDRERR: 4440 error = sooptcopyin(sopt, &optval, sizeof (optval), 4441 sizeof (optval)); 4442 if (error != 0) 4443 goto out; 4444 if (optval != 0) 4445 so->so_flags |= SOF_NOADDRAVAIL; 4446 else 4447 so->so_flags &= ~SOF_NOADDRAVAIL; 4448 break; 4449 4450 case SO_REUSESHAREUID: 4451 error = sooptcopyin(sopt, &optval, sizeof (optval), 4452 sizeof (optval)); 4453 if (error != 0) 4454 goto out; 4455 if (optval != 0) 4456 so->so_flags |= SOF_REUSESHAREUID; 4457 else 4458 so->so_flags &= ~SOF_REUSESHAREUID; 4459 break; 4460 4461 case SO_NOTIFYCONFLICT: 4462 if (kauth_cred_issuser(kauth_cred_get()) == 0) { 4463 error = EPERM; 4464 goto out; 4465 } 4466 error = sooptcopyin(sopt, &optval, sizeof (optval), 4467 sizeof (optval)); 4468 if (error != 0) 4469 goto out; 4470 if (optval != 0) 4471 so->so_flags |= SOF_NOTIFYCONFLICT; 4472 else 4473 so->so_flags &= ~SOF_NOTIFYCONFLICT; 4474 break; 4475 4476 case SO_RESTRICTIONS: 4477 error = sooptcopyin(sopt, &optval, sizeof (optval), 4478 sizeof (optval)); 4479 if (error != 0) 4480 goto out; 4481 4482 error = so_set_restrictions(so, optval); 4483 break; 4484 4485 case SO_AWDL_UNRESTRICTED: 4486 if (SOCK_DOM(so) != PF_INET && 4487 SOCK_DOM(so) != PF_INET6) { 4488 error = EOPNOTSUPP; 4489 goto out; 4490 } 4491 error = sooptcopyin(sopt, &optval, sizeof(optval), 4492 sizeof(optval)); 4493 if (error != 0) 4494 goto out; 4495 if (optval != 0) { 4496 kauth_cred_t cred = NULL; 4497 proc_t ep = PROC_NULL; 4498 4499 if (so->so_flags & SOF_DELEGATED) { 4500 ep = proc_find(so->e_pid); 4501 if (ep) 4502 cred = kauth_cred_proc_ref(ep); 4503 } 4504 error = priv_check_cred( 4505 cred ? 
cred : so->so_cred, 4506 PRIV_NET_RESTRICTED_AWDL, 0); 4507 if (error == 0) 4508 inp_set_awdl_unrestricted( 4509 sotoinpcb(so)); 4510 if (cred) 4511 kauth_cred_unref(&cred); 4512 if (ep != PROC_NULL) 4513 proc_rele(ep); 4514 } else 4515 inp_clear_awdl_unrestricted(sotoinpcb(so)); 4516 break; 4517 4518 case SO_LABEL: 4519#if CONFIG_MACF_SOCKET 4520 if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac), 4521 sizeof (extmac))) != 0) 4522 goto out; 4523 4524 error = mac_setsockopt_label(proc_ucred(sopt->sopt_p), 4525 so, &extmac); 4526#else 4527 error = EOPNOTSUPP; 4528#endif /* MAC_SOCKET */ 4529 break; 4530 4531 case SO_UPCALLCLOSEWAIT: 4532 error = sooptcopyin(sopt, &optval, sizeof (optval), 4533 sizeof (optval)); 4534 if (error != 0) 4535 goto out; 4536 if (optval != 0) 4537 so->so_flags |= SOF_UPCALLCLOSEWAIT; 4538 else 4539 so->so_flags &= ~SOF_UPCALLCLOSEWAIT; 4540 break; 4541 4542 case SO_RANDOMPORT: 4543 error = sooptcopyin(sopt, &optval, sizeof (optval), 4544 sizeof (optval)); 4545 if (error != 0) 4546 goto out; 4547 if (optval != 0) 4548 so->so_flags |= SOF_BINDRANDOMPORT; 4549 else 4550 so->so_flags &= ~SOF_BINDRANDOMPORT; 4551 break; 4552 4553 case SO_NP_EXTENSIONS: { 4554 struct so_np_extensions sonpx; 4555 4556 error = sooptcopyin(sopt, &sonpx, sizeof (sonpx), 4557 sizeof (sonpx)); 4558 if (error != 0) 4559 goto out; 4560 if (sonpx.npx_mask & ~SONPX_MASK_VALID) { 4561 error = EINVAL; 4562 goto out; 4563 } 4564 /* 4565 * Only one bit defined for now 4566 */ 4567 if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) { 4568 if ((sonpx.npx_flags & SONPX_SETOPTSHUT)) 4569 so->so_flags |= SOF_NPX_SETOPTSHUT; 4570 else 4571 so->so_flags &= ~SOF_NPX_SETOPTSHUT; 4572 } 4573 break; 4574 } 4575 4576 case SO_TRAFFIC_CLASS: { 4577 error = sooptcopyin(sopt, &optval, sizeof (optval), 4578 sizeof (optval)); 4579 if (error != 0) 4580 goto out; 4581 error = so_set_traffic_class(so, optval); 4582 if (error != 0) 4583 goto out; 4584 break; 4585 } 4586 4587 case SO_RECV_TRAFFIC_CLASS: { 4588 error = sooptcopyin(sopt, &optval, sizeof (optval), 4589 sizeof (optval)); 4590 if (error != 0) 4591 goto out; 4592 if (optval == 0) 4593 so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS; 4594 else 4595 so->so_flags |= SOF_RECV_TRAFFIC_CLASS; 4596 break; 4597 } 4598 4599 case SO_TRAFFIC_CLASS_DBG: { 4600 struct so_tcdbg so_tcdbg; 4601 4602 error = sooptcopyin(sopt, &so_tcdbg, 4603 sizeof (struct so_tcdbg), sizeof (struct so_tcdbg)); 4604 if (error != 0) 4605 goto out; 4606 error = so_set_tcdbg(so, &so_tcdbg); 4607 if (error != 0) 4608 goto out; 4609 break; 4610 } 4611 4612 case SO_PRIVILEGED_TRAFFIC_CLASS: 4613 error = priv_check_cred(kauth_cred_get(), 4614 PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0); 4615 if (error != 0) 4616 goto out; 4617 error = sooptcopyin(sopt, &optval, sizeof (optval), 4618 sizeof (optval)); 4619 if (error != 0) 4620 goto out; 4621 if (optval == 0) 4622 so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS; 4623 else 4624 so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS; 4625 break; 4626 4627 case SO_DEFUNCTOK: 4628 error = sooptcopyin(sopt, &optval, sizeof (optval), 4629 sizeof (optval)); 4630 if (error != 0 || (so->so_flags & SOF_DEFUNCT)) { 4631 if (error == 0) 4632 error = EBADF; 4633 goto out; 4634 } 4635 /* 4636 * Any process can set SO_DEFUNCTOK (clear 4637 * SOF_NODEFUNCT), but only root can clear 4638 * SO_DEFUNCTOK (set SOF_NODEFUNCT). 
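 *
 * An illustrative user-space sketch (not from this file) of opting a
 * socket out of defuncting, which per the check below succeeds only
 * for a superuser credential:
 *
 *	int ok = 0;
 *	if (setsockopt(s, SOL_SOCKET, SO_DEFUNCTOK,
 *	    &ok, sizeof (ok)) == -1)
 *		err(1, "SO_DEFUNCTOK");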
4639 */ 4640 if (optval == 0 && 4641 kauth_cred_issuser(kauth_cred_get()) == 0) { 4642 error = EPERM; 4643 goto out; 4644 } 4645 if (optval) 4646 so->so_flags &= ~SOF_NODEFUNCT; 4647 else 4648 so->so_flags |= SOF_NODEFUNCT; 4649 4650 if (SOCK_DOM(so) == PF_INET || 4651 SOCK_DOM(so) == PF_INET6) { 4652 char s[MAX_IPv6_STR_LEN]; 4653 char d[MAX_IPv6_STR_LEN]; 4654 struct inpcb *inp = sotoinpcb(so); 4655 4656 SODEFUNCTLOG(("%s[%d]: so 0x%llx [%s %s:%d -> " 4657 "%s:%d] is now marked as %seligible for " 4658 "defunct\n", __func__, proc_selfpid(), 4659 (uint64_t)VM_KERNEL_ADDRPERM(so), 4660 (SOCK_TYPE(so) == SOCK_STREAM) ? 4661 "TCP" : "UDP", inet_ntop(SOCK_DOM(so), 4662 ((SOCK_DOM(so) == PF_INET) ? 4663 (void *)&inp->inp_laddr.s_addr : 4664 (void *)&inp->in6p_laddr), s, sizeof (s)), 4665 ntohs(inp->in6p_lport), 4666 inet_ntop(SOCK_DOM(so), 4667 (SOCK_DOM(so) == PF_INET) ? 4668 (void *)&inp->inp_faddr.s_addr : 4669 (void *)&inp->in6p_faddr, d, sizeof (d)), 4670 ntohs(inp->in6p_fport), 4671 (so->so_flags & SOF_NODEFUNCT) ? 4672 "not " : "")); 4673 } else { 4674 SODEFUNCTLOG(("%s[%d]: so 0x%llx [%d,%d] is " 4675 "now marked as %seligible for defunct\n", 4676 __func__, proc_selfpid(), 4677 (uint64_t)VM_KERNEL_ADDRPERM(so), 4678 SOCK_DOM(so), SOCK_TYPE(so), 4679 (so->so_flags & SOF_NODEFUNCT) ? 4680 "not " : "")); 4681 } 4682 break; 4683 4684 case SO_ISDEFUNCT: 4685 /* This option is not settable */ 4686 error = EINVAL; 4687 break; 4688 4689 case SO_OPPORTUNISTIC: 4690 error = sooptcopyin(sopt, &optval, sizeof (optval), 4691 sizeof (optval)); 4692 if (error == 0) 4693 error = so_set_opportunistic(so, optval); 4694 break; 4695 4696 case SO_FLUSH: 4697 /* This option is handled by lower layer(s) */ 4698 error = 0; 4699 break; 4700 4701 case SO_RECV_ANYIF: 4702 error = sooptcopyin(sopt, &optval, sizeof (optval), 4703 sizeof (optval)); 4704 if (error == 0) 4705 error = so_set_recv_anyif(so, optval); 4706 break; 4707 4708 case SO_TRAFFIC_MGT_BACKGROUND: { 4709 /* This option is handled by lower layer(s) */ 4710 error = 0; 4711 break; 4712 } 4713 4714#if FLOW_DIVERT 4715 case SO_FLOW_DIVERT_TOKEN: 4716 error = flow_divert_token_set(so, sopt); 4717 break; 4718#endif /* FLOW_DIVERT */ 4719 4720 4721 case SO_DELEGATED: 4722 if ((error = sooptcopyin(sopt, &optval, sizeof (optval), 4723 sizeof (optval))) != 0) 4724 break; 4725 4726 error = so_set_effective_pid(so, optval, sopt->sopt_p); 4727 break; 4728 4729 case SO_DELEGATED_UUID: { 4730 uuid_t euuid; 4731 4732 if ((error = sooptcopyin(sopt, &euuid, sizeof (euuid), 4733 sizeof (euuid))) != 0) 4734 break; 4735 4736 error = so_set_effective_uuid(so, euuid, sopt->sopt_p); 4737 break; 4738 } 4739 4740#if NECP 4741 case SO_NECP_ATTRIBUTES: 4742 error = necp_set_socket_attributes(so, sopt); 4743 break; 4744#endif /* NECP */ 4745 4746#if MPTCP 4747 case SO_MPTCP_FASTJOIN: 4748 if (!((so->so_flags & SOF_MP_SUBFLOW) || 4749 ((SOCK_CHECK_DOM(so, PF_MULTIPATH)) && 4750 (SOCK_CHECK_PROTO(so, IPPROTO_TCP))))) { 4751 error = ENOPROTOOPT; 4752 break; 4753 } 4754 4755 error = sooptcopyin(sopt, &optval, sizeof (optval), 4756 sizeof (optval)); 4757 if (error != 0) 4758 goto out; 4759 if (optval == 0) 4760 so->so_flags &= ~SOF_MPTCP_FASTJOIN; 4761 else 4762 so->so_flags |= SOF_MPTCP_FASTJOIN; 4763 break; 4764#endif /* MPTCP */ 4765 4766 default: 4767 error = ENOPROTOOPT; 4768 break; 4769 } 4770 if (error == 0 && so->so_proto != NULL && 4771 so->so_proto->pr_ctloutput != NULL) { 4772 (void) so->so_proto->pr_ctloutput(so, sopt); 4773 } 4774 } 4775out: 4776 if (dolock) 4777 
socket_unlock(so, 1);
4778 return (error);
4779}
4780
4781/* Helper routines for getsockopt */
4782int
4783sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
4784{
4785 int error;
4786 size_t valsize;
4787
4788 error = 0;
4789
4790 /*
4791 * Documented get behavior is that we always return a value,
4792 * possibly truncated to fit in the user's buffer.
4793 * Traditional behavior is that we always tell the user
4794 * precisely how much we copied, rather than something useful
4795 * like the total amount we had available for her.
4796 * Note that this interface is not idempotent; the entire answer must
4797 * be generated ahead of time.
4798 */
4799 valsize = min(len, sopt->sopt_valsize);
4800 sopt->sopt_valsize = valsize;
4801 if (sopt->sopt_val != USER_ADDR_NULL) {
4802 if (sopt->sopt_p != kernproc)
4803 error = copyout(buf, sopt->sopt_val, valsize);
4804 else
4805 bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
4806 }
4807 return (error);
4808}
4809
4810static int
4811sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
4812{
4813 int error;
4814 size_t len;
4815 struct user64_timeval tv64;
4816 struct user32_timeval tv32;
4817 const void * val;
4818 size_t valsize;
4819
4820 error = 0;
4821 if (proc_is64bit(sopt->sopt_p)) {
4822 len = sizeof (tv64);
4823 tv64.tv_sec = tv_p->tv_sec;
4824 tv64.tv_usec = tv_p->tv_usec;
4825 val = &tv64;
4826 } else {
4827 len = sizeof (tv32);
4828 tv32.tv_sec = tv_p->tv_sec;
4829 tv32.tv_usec = tv_p->tv_usec;
4830 val = &tv32;
4831 }
4832 valsize = min(len, sopt->sopt_valsize);
4833 sopt->sopt_valsize = valsize;
4834 if (sopt->sopt_val != USER_ADDR_NULL) {
4835 if (sopt->sopt_p != kernproc)
4836 error = copyout(val, sopt->sopt_val, valsize);
4837 else
4838 bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
4839 }
4840 return (error);
4841}
4842
4843/*
4844 * Return: 0 Success
4845 * ENOPROTOOPT
4846 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
4847 * <pr_ctloutput>:???
4848 * <sf_getoption>:???
4849 */
4850int
4851sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
4852{
4853 int error, optval;
4854 struct linger l;
4855 struct timeval tv;
4856#if CONFIG_MACF_SOCKET
4857 struct mac extmac;
4858#endif /* MAC_SOCKET */
4859
4860 if (sopt->sopt_dir != SOPT_GET)
4861 sopt->sopt_dir = SOPT_GET;
4862
4863 if (dolock)
4864 socket_lock(so, 1);
4865
4866 error = sflt_getsockopt(so, sopt);
4867 if (error != 0) {
4868 if (error == EJUSTRETURN)
4869 error = 0;
4870 goto out;
4871 }
4872
4873 if (sopt->sopt_level != SOL_SOCKET) {
4874 if (so->so_proto != NULL &&
4875 so->so_proto->pr_ctloutput != NULL) {
4876 error = (*so->so_proto->pr_ctloutput)(so, sopt);
4877 goto out;
4878 }
4879 error = ENOPROTOOPT;
4880 } else {
4881 /*
4882 * Allow socket-level (SOL_SOCKET) options to be filtered by
4883 * the protocol layer, if needed. A zero value returned from
4884 * the handler means use default socket-level processing as
4885 * done by the rest of this routine. Otherwise, any other
4886 * return value indicates that the option is unsupported.
4887 */
4888 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
4889 pru_socheckopt(so, sopt)) != 0)
4890 goto out;
4891
4892 error = 0;
4893 switch (sopt->sopt_name) {
4894 case SO_LINGER:
4895 case SO_LINGER_SEC:
4896 l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
4897 l.l_linger = (sopt->sopt_name == SO_LINGER) ?
4898 so->so_linger : so->so_linger / hz; 4899 error = sooptcopyout(sopt, &l, sizeof (l)); 4900 break; 4901 4902 case SO_USELOOPBACK: 4903 case SO_DONTROUTE: 4904 case SO_DEBUG: 4905 case SO_KEEPALIVE: 4906 case SO_REUSEADDR: 4907 case SO_REUSEPORT: 4908 case SO_BROADCAST: 4909 case SO_OOBINLINE: 4910 case SO_TIMESTAMP: 4911 case SO_TIMESTAMP_MONOTONIC: 4912 case SO_DONTTRUNC: 4913 case SO_WANTMORE: 4914 case SO_WANTOOBFLAG: 4915 case SO_NOWAKEFROMSLEEP: 4916 optval = so->so_options & sopt->sopt_name; 4917integer: 4918 error = sooptcopyout(sopt, &optval, sizeof (optval)); 4919 break; 4920 4921 case SO_TYPE: 4922 optval = so->so_type; 4923 goto integer; 4924 4925 case SO_NREAD: 4926 if (so->so_proto->pr_flags & PR_ATOMIC) { 4927 int pkt_total; 4928 struct mbuf *m1; 4929 4930 pkt_total = 0; 4931 m1 = so->so_rcv.sb_mb; 4932 while (m1 != NULL) { 4933 if (m1->m_type == MT_DATA || 4934 m1->m_type == MT_HEADER || 4935 m1->m_type == MT_OOBDATA) 4936 pkt_total += m1->m_len; 4937 m1 = m1->m_next; 4938 } 4939 optval = pkt_total; 4940 } else { 4941 optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl; 4942 } 4943 goto integer; 4944 4945 case SO_NUMRCVPKT: 4946 if (so->so_proto->pr_flags & PR_ATOMIC) { 4947 int cnt = 0; 4948 struct mbuf *m1; 4949 4950 m1 = so->so_rcv.sb_mb; 4951 while (m1 != NULL) { 4952 if (m1->m_type == MT_DATA || 4953 m1->m_type == MT_HEADER || 4954 m1->m_type == MT_OOBDATA) 4955 cnt += 1; 4956 m1 = m1->m_nextpkt; 4957 } 4958 optval = cnt; 4959 goto integer; 4960 } else { 4961 error = EINVAL; 4962 break; 4963 } 4964 4965 case SO_NWRITE: 4966 optval = so->so_snd.sb_cc; 4967 goto integer; 4968 4969 case SO_ERROR: 4970 optval = so->so_error; 4971 so->so_error = 0; 4972 goto integer; 4973 4974 case SO_SNDBUF: { 4975 u_int32_t hiwat = so->so_snd.sb_hiwat; 4976 4977 if (so->so_snd.sb_flags & SB_UNIX) { 4978 struct unpcb *unp = 4979 (struct unpcb *)(so->so_pcb); 4980 if (unp != NULL && unp->unp_conn != NULL) { 4981 hiwat += unp->unp_conn->unp_cc; 4982 } 4983 } 4984 4985 optval = hiwat; 4986 goto integer; 4987 } 4988 case SO_RCVBUF: 4989 optval = so->so_rcv.sb_hiwat; 4990 goto integer; 4991 4992 case SO_SNDLOWAT: 4993 optval = so->so_snd.sb_lowat; 4994 goto integer; 4995 4996 case SO_RCVLOWAT: 4997 optval = so->so_rcv.sb_lowat; 4998 goto integer; 4999 5000 case SO_SNDTIMEO: 5001 case SO_RCVTIMEO: 5002 tv = (sopt->sopt_name == SO_SNDTIMEO ? 
5003 so->so_snd.sb_timeo : so->so_rcv.sb_timeo); 5004 5005 error = sooptcopyout_timeval(sopt, &tv); 5006 break; 5007 5008 case SO_NOSIGPIPE: 5009 optval = (so->so_flags & SOF_NOSIGPIPE); 5010 goto integer; 5011 5012 case SO_NOADDRERR: 5013 optval = (so->so_flags & SOF_NOADDRAVAIL); 5014 goto integer; 5015 5016 case SO_REUSESHAREUID: 5017 optval = (so->so_flags & SOF_REUSESHAREUID); 5018 goto integer; 5019 5020 5021 case SO_NOTIFYCONFLICT: 5022 optval = (so->so_flags & SOF_NOTIFYCONFLICT); 5023 goto integer; 5024 5025 case SO_RESTRICTIONS: 5026 optval = so_get_restrictions(so); 5027 goto integer; 5028 5029 case SO_AWDL_UNRESTRICTED: 5030 if (SOCK_DOM(so) == PF_INET || 5031 SOCK_DOM(so) == PF_INET6) { 5032 optval = inp_get_awdl_unrestricted( 5033 sotoinpcb(so)); 5034 goto integer; 5035 } else 5036 error = EOPNOTSUPP; 5037 break; 5038 5039 case SO_LABEL: 5040#if CONFIG_MACF_SOCKET 5041 if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac), 5042 sizeof (extmac))) != 0 || 5043 (error = mac_socket_label_get(proc_ucred( 5044 sopt->sopt_p), so, &extmac)) != 0) 5045 break; 5046 5047 error = sooptcopyout(sopt, &extmac, sizeof (extmac)); 5048#else 5049 error = EOPNOTSUPP; 5050#endif /* MAC_SOCKET */ 5051 break; 5052 5053 case SO_PEERLABEL: 5054#if CONFIG_MACF_SOCKET 5055 if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac), 5056 sizeof (extmac))) != 0 || 5057 (error = mac_socketpeer_label_get(proc_ucred( 5058 sopt->sopt_p), so, &extmac)) != 0) 5059 break; 5060 5061 error = sooptcopyout(sopt, &extmac, sizeof (extmac)); 5062#else 5063 error = EOPNOTSUPP; 5064#endif /* MAC_SOCKET */ 5065 break; 5066 5067#ifdef __APPLE_API_PRIVATE 5068 case SO_UPCALLCLOSEWAIT: 5069 optval = (so->so_flags & SOF_UPCALLCLOSEWAIT); 5070 goto integer; 5071#endif 5072 case SO_RANDOMPORT: 5073 optval = (so->so_flags & SOF_BINDRANDOMPORT); 5074 goto integer; 5075 5076 case SO_NP_EXTENSIONS: { 5077 struct so_np_extensions sonpx; 5078 5079 sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ? 
SONPX_SETOPTSHUT : 0;
5081 sonpx.npx_mask = SONPX_MASK_VALID;
5082
5083 error = sooptcopyout(sopt, &sonpx,
5084 sizeof (struct so_np_extensions));
5085 break;
5086 }
5087
5088 case SO_TRAFFIC_CLASS:
5089 optval = so->so_traffic_class;
5090 goto integer;
5091
5092 case SO_RECV_TRAFFIC_CLASS:
5093 optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
5094 goto integer;
5095
5096 case SO_TRAFFIC_CLASS_STATS:
5097 error = sooptcopyout(sopt, &so->so_tc_stats,
5098 sizeof (so->so_tc_stats));
5099 break;
5100
5101 case SO_TRAFFIC_CLASS_DBG:
5102 error = sogetopt_tcdbg(so, sopt);
5103 break;
5104
5105 case SO_PRIVILEGED_TRAFFIC_CLASS:
5106 optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
5107 goto integer;
5108
5109 case SO_DEFUNCTOK:
5110 optval = !(so->so_flags & SOF_NODEFUNCT);
5111 goto integer;
5112
5113 case SO_ISDEFUNCT:
5114 optval = (so->so_flags & SOF_DEFUNCT);
5115 goto integer;
5116
5117 case SO_OPPORTUNISTIC:
5118 optval = so_get_opportunistic(so);
5119 goto integer;
5120
5121 case SO_FLUSH:
5122 /* This option is not gettable */
5123 error = EINVAL;
5124 break;
5125
5126 case SO_RECV_ANYIF:
5127 optval = so_get_recv_anyif(so);
5128 goto integer;
5129
5130 case SO_TRAFFIC_MGT_BACKGROUND:
5131 /* This option is handled by lower layer(s) */
5132 if (so->so_proto != NULL &&
5133 so->so_proto->pr_ctloutput != NULL) {
5134 (void) so->so_proto->pr_ctloutput(so, sopt);
5135 }
5136 break;
5137
5138#if FLOW_DIVERT
5139 case SO_FLOW_DIVERT_TOKEN:
5140 error = flow_divert_token_get(so, sopt);
5141 break;
5142#endif /* FLOW_DIVERT */
5143
5144#if NECP
5145 case SO_NECP_ATTRIBUTES:
5146 error = necp_get_socket_attributes(so, sopt);
5147 break;
5148#endif /* NECP */
5149
5150#if CONTENT_FILTER
5151 case SO_CFIL_SOCK_ID: {
5152 cfil_sock_id_t sock_id;
5153
5154 sock_id = cfil_sock_id_from_socket(so);
5155
5156 error = sooptcopyout(sopt, &sock_id,
5157 sizeof(cfil_sock_id_t));
5158 break;
5159 }
5160#endif /* CONTENT_FILTER */
5161
5162#if MPTCP
5163 case SO_MPTCP_FASTJOIN:
5164 if (!((so->so_flags & SOF_MP_SUBFLOW) ||
5165 ((SOCK_CHECK_DOM(so, PF_MULTIPATH)) &&
5166 (SOCK_CHECK_PROTO(so, IPPROTO_TCP))))) {
5167 error = ENOPROTOOPT;
5168 break;
5169 }
5170 optval = (so->so_flags & SOF_MPTCP_FASTJOIN);
5171 goto integer;
5172#endif /* MPTCP */
5173
5174 default:
5175 error = ENOPROTOOPT;
5176 break;
5177 }
5178 }
5179out:
5180 if (dolock)
5181 socket_unlock(so, 1);
5182 return (error);
5183}
5184
5185/*
5186 * The size limit on our soopt_getm is different from that on FreeBSD.
5187 * We limit the size of options to MCLBYTES. This will have to change
5188 * if we need to define options that need more space than MCLBYTES.
5189 */
5190int
5191soopt_getm(struct sockopt *sopt, struct mbuf **mp)
5192{
5193 struct mbuf *m, *m_prev;
5194 int sopt_size = sopt->sopt_valsize;
5195 int how;
5196
5197 if (sopt_size <= 0 || sopt_size > MCLBYTES)
5198 return (EMSGSIZE);
5199
5200 how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
5201 MGET(m, how, MT_DATA);
5202 if (m == NULL)
5203 return (ENOBUFS);
5204 if (sopt_size > MLEN) {
5205 MCLGET(m, how);
5206 if ((m->m_flags & M_EXT) == 0) {
5207 m_free(m);
5208 return (ENOBUFS);
5209 }
5210 m->m_len = min(MCLBYTES, sopt_size);
5211 } else {
5212 m->m_len = min(MLEN, sopt_size);
5213 }
5214 sopt_size -= m->m_len;
5215 *mp = m;
5216 m_prev = m;
5217
5218 while (sopt_size > 0) {
5219 MGET(m, how, MT_DATA);
5220 if (m == NULL) {
5221 m_freem(*mp);
5222 return (ENOBUFS);
5223 }
5224 if (sopt_size > MLEN) {
5225 MCLGET(m, how);
5226 if ((m->m_flags & M_EXT) == 0) {
5227 m_freem(*mp);
5228 m_freem(m);
5229 return (ENOBUFS);
5230 }
5231 m->m_len = min(MCLBYTES, sopt_size);
5232 } else {
5233 m->m_len = min(MLEN, sopt_size);
5234 }
5235 sopt_size -= m->m_len;
5236 m_prev->m_next = m;
5237 m_prev = m;
5238 }
5239 return (0);
5240}
5241
5242/* copyin sopt data into mbuf chain */
5243int
5244soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
5245{
5246 struct mbuf *m0 = m;
5247
5248 if (sopt->sopt_val == USER_ADDR_NULL)
5249 return (0);
5250 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
5251 if (sopt->sopt_p != kernproc) {
5252 int error;
5253
5254 error = copyin(sopt->sopt_val, mtod(m, char *),
5255 m->m_len);
5256 if (error != 0) {
5257 m_freem(m0);
5258 return (error);
5259 }
5260 } else {
5261 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
5262 mtod(m, char *), m->m_len);
5263 }
5264 sopt->sopt_valsize -= m->m_len;
5265 sopt->sopt_val += m->m_len;
5266 m = m->m_next;
5267 }
5268 /* enough space should have been allocated by ip6_sooptmcopyin() */
5269 if (m != NULL) {
5270 panic("soopt_mcopyin");
5271 /* NOTREACHED */
5272 }
5273 return (0);
5274}
5275
5276/* copyout mbuf chain data into soopt */
5277int
5278soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
5279{
5280 struct mbuf *m0 = m;
5281 size_t valsize = 0;
5282
5283 if (sopt->sopt_val == USER_ADDR_NULL)
5284 return (0);
5285 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
5286 if (sopt->sopt_p != kernproc) {
5287 int error;
5288
5289 error = copyout(mtod(m, char *), sopt->sopt_val,
5290 m->m_len);
5291 if (error != 0) {
5292 m_freem(m0);
5293 return (error);
5294 }
5295 } else {
5296 bcopy(mtod(m, char *),
5297 CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
5298 }
5299 sopt->sopt_valsize -= m->m_len;
5300 sopt->sopt_val += m->m_len;
5301 valsize += m->m_len;
5302 m = m->m_next;
5303 }
5304 if (m != NULL) {
5305 /* the user-land caller should have supplied a large enough buffer */
5306 m_freem(m0);
5307 return (EINVAL);
5308 }
5309 sopt->sopt_valsize = valsize;
5310 return (0);
5311}
5312
5313void
5314sohasoutofband(struct socket *so)
5315{
5316 if (so->so_pgid < 0)
5317 gsignal(-so->so_pgid, SIGURG);
5318 else if (so->so_pgid > 0)
5319 proc_signal(so->so_pgid, SIGURG);
5320 selwakeup(&so->so_rcv.sb_sel);
5321}
5322
5323int
5324sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql)
5325{
5326#pragma unused(cred)
5327 struct proc *p = current_proc();
5328 int revents = 0;
5329
5330 socket_lock(so, 1);
5331 so_update_last_owner_locked(so, PROC_NULL);
5332 so_update_policy(so);
5333
5334 if (events & (POLLIN | POLLRDNORM))
5335 if (soreadable(so))
5336 revents |= events & (POLLIN | POLLRDNORM);
5337
5338 if (events & (POLLOUT | POLLWRNORM))
5339 if (sowriteable(so))
5340 revents |= events & (POLLOUT | POLLWRNORM);
5341
5342 if (events & (POLLPRI | POLLRDBAND))
5343 if (so->so_oobmark || (so->so_state & SS_RCVATMARK))
5344 revents |= events & (POLLPRI | POLLRDBAND);
5345
5346 if (revents == 0) {
5347 if
(events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) { 5348 /* 5349 * Darwin sets the flag first, 5350 * BSD calls selrecord first 5351 */ 5352 so->so_rcv.sb_flags |= SB_SEL; 5353 selrecord(p, &so->so_rcv.sb_sel, wql); 5354 } 5355 5356 if (events & (POLLOUT | POLLWRNORM)) { 5357 /* 5358 * Darwin sets the flag first, 5359 * BSD calls selrecord first 5360 */ 5361 so->so_snd.sb_flags |= SB_SEL; 5362 selrecord(p, &so->so_snd.sb_sel, wql); 5363 } 5364 } 5365 5366 socket_unlock(so, 1); 5367 return (revents); 5368} 5369 5370int 5371soo_kqfilter(struct fileproc *fp, struct knote *kn, vfs_context_t ctx) 5372{ 5373#pragma unused(fp) 5374#if !CONFIG_MACF_SOCKET 5375#pragma unused(ctx) 5376#endif /* MAC_SOCKET */ 5377 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data; 5378 struct klist *skl; 5379 5380 socket_lock(so, 1); 5381 so_update_last_owner_locked(so, PROC_NULL); 5382 so_update_policy(so); 5383 5384#if CONFIG_MACF_SOCKET 5385 if (mac_socket_check_kqfilter(proc_ucred(vfs_context_proc(ctx)), 5386 kn, so) != 0) { 5387 socket_unlock(so, 1); 5388 return (1); 5389 } 5390#endif /* MAC_SOCKET */ 5391 5392 switch (kn->kn_filter) { 5393 case EVFILT_READ: 5394 kn->kn_fop = &soread_filtops; 5395 skl = &so->so_rcv.sb_sel.si_note; 5396 break; 5397 case EVFILT_WRITE: 5398 kn->kn_fop = &sowrite_filtops; 5399 skl = &so->so_snd.sb_sel.si_note; 5400 break; 5401 case EVFILT_SOCK: 5402 kn->kn_fop = &sock_filtops; 5403 skl = &so->so_klist; 5404 break; 5405 default: 5406 socket_unlock(so, 1); 5407 return (1); 5408 } 5409 5410 if (KNOTE_ATTACH(skl, kn)) { 5411 switch (kn->kn_filter) { 5412 case EVFILT_READ: 5413 so->so_rcv.sb_flags |= SB_KNOTE; 5414 break; 5415 case EVFILT_WRITE: 5416 so->so_snd.sb_flags |= SB_KNOTE; 5417 break; 5418 case EVFILT_SOCK: 5419 so->so_flags |= SOF_KNOTE; 5420 break; 5421 default: 5422 socket_unlock(so, 1); 5423 return (1); 5424 } 5425 } 5426 socket_unlock(so, 1); 5427 return (0); 5428} 5429 5430static void 5431filt_sordetach(struct knote *kn) 5432{ 5433 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data; 5434 5435 socket_lock(so, 1); 5436 if (so->so_rcv.sb_flags & SB_KNOTE) 5437 if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn)) 5438 so->so_rcv.sb_flags &= ~SB_KNOTE; 5439 socket_unlock(so, 1); 5440} 5441 5442/*ARGSUSED*/ 5443static int 5444filt_soread(struct knote *kn, long hint) 5445{ 5446 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data; 5447 5448 if ((hint & SO_FILT_HINT_LOCKED) == 0) 5449 socket_lock(so, 1); 5450 5451 if (so->so_options & SO_ACCEPTCONN) { 5452 int isempty; 5453 5454 /* 5455 * Radar 6615193 handle the listen case dynamically 5456 * for kqueue read filter. This allows to call listen() 5457 * after registering the kqueue EVFILT_READ. 5458 */ 5459 5460 kn->kn_data = so->so_qlen; 5461 isempty = ! 
TAILQ_EMPTY(&so->so_comp); 5462 5463 if ((hint & SO_FILT_HINT_LOCKED) == 0) 5464 socket_unlock(so, 1); 5465 5466 return (isempty); 5467 } 5468 5469 /* socket isn't a listener */ 5470 5471 kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl; 5472 5473 if (so->so_oobmark) { 5474 if (kn->kn_flags & EV_OOBAND) { 5475 kn->kn_data -= so->so_oobmark; 5476 if ((hint & SO_FILT_HINT_LOCKED) == 0) 5477 socket_unlock(so, 1); 5478 return (1); 5479 } 5480 kn->kn_data = so->so_oobmark; 5481 kn->kn_flags |= EV_OOBAND; 5482 } else { 5483 if ((so->so_state & SS_CANTRCVMORE) 5484#if CONTENT_FILTER 5485 && cfil_sock_data_pending(&so->so_rcv) == 0 5486#endif /* CONTENT_FILTER */ 5487 ) { 5488 kn->kn_flags |= EV_EOF; 5489 kn->kn_fflags = so->so_error; 5490 if ((hint & SO_FILT_HINT_LOCKED) == 0) 5491 socket_unlock(so, 1); 5492 return (1); 5493 } 5494 } 5495 5496 if (so->so_state & SS_RCVATMARK) { 5497 if (kn->kn_flags & EV_OOBAND) { 5498 if ((hint & SO_FILT_HINT_LOCKED) == 0) 5499 socket_unlock(so, 1); 5500 return (1); 5501 } 5502 kn->kn_flags |= EV_OOBAND; 5503 } else if (kn->kn_flags & EV_OOBAND) { 5504 kn->kn_data = 0; 5505 if ((hint & SO_FILT_HINT_LOCKED) == 0) 5506 socket_unlock(so, 1); 5507 return (0); 5508 } 5509 5510 if (so->so_error) { /* temporary udp error */ 5511 if ((hint & SO_FILT_HINT_LOCKED) == 0) 5512 socket_unlock(so, 1); 5513 return (1); 5514 } 5515 5516 int64_t lowwat = so->so_rcv.sb_lowat; 5517 if (kn->kn_sfflags & NOTE_LOWAT) { 5518 if (kn->kn_sdata > so->so_rcv.sb_hiwat) 5519 lowwat = so->so_rcv.sb_hiwat; 5520 else if (kn->kn_sdata > lowwat) 5521 lowwat = kn->kn_sdata; 5522 } 5523 5524 if ((hint & SO_FILT_HINT_LOCKED) == 0) 5525 socket_unlock(so, 1); 5526 5527 return ((kn->kn_flags & EV_OOBAND) || kn->kn_data >= lowwat); 5528} 5529 5530static void 5531filt_sowdetach(struct knote *kn) 5532{ 5533 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data; 5534 socket_lock(so, 1); 5535 5536 if (so->so_snd.sb_flags & SB_KNOTE) 5537 if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn)) 5538 so->so_snd.sb_flags &= ~SB_KNOTE; 5539 socket_unlock(so, 1); 5540} 5541 5542int 5543so_wait_for_if_feedback(struct socket *so) 5544{ 5545 if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) && 5546 (so->so_state & SS_ISCONNECTED)) { 5547 struct inpcb *inp = sotoinpcb(so); 5548 if (INP_WAIT_FOR_IF_FEEDBACK(inp)) 5549 return (1); 5550 } 5551 return (0); 5552} 5553 5554/*ARGSUSED*/ 5555static int 5556filt_sowrite(struct knote *kn, long hint) 5557{ 5558 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data; 5559 int ret = 0; 5560 5561 if ((hint & SO_FILT_HINT_LOCKED) == 0) 5562 socket_lock(so, 1); 5563 5564 kn->kn_data = sbspace(&so->so_snd); 5565 if (so->so_state & SS_CANTSENDMORE) { 5566 kn->kn_flags |= EV_EOF; 5567 kn->kn_fflags = so->so_error; 5568 ret = 1; 5569 goto out; 5570 } 5571 if (so->so_error) { /* temporary udp error */ 5572 ret = 1; 5573 goto out; 5574 } 5575 if (((so->so_state & SS_ISCONNECTED) == 0) && 5576 (so->so_proto->pr_flags & PR_CONNREQUIRED)) { 5577 ret = 0; 5578 goto out; 5579 } 5580 int64_t lowwat = so->so_snd.sb_lowat; 5581 if (kn->kn_sfflags & NOTE_LOWAT) { 5582 if (kn->kn_sdata > so->so_snd.sb_hiwat) 5583 lowwat = so->so_snd.sb_hiwat; 5584 else if (kn->kn_sdata > lowwat) 5585 lowwat = kn->kn_sdata; 5586 } 5587 if (kn->kn_data >= lowwat) { 5588 if (so->so_flags & SOF_NOTSENT_LOWAT) { 5589 if ((SOCK_DOM(so) == PF_INET 5590 || SOCK_DOM(so) == PF_INET6) 5591 && so->so_type == SOCK_STREAM) { 5592 ret = tcp_notsent_lowat_check(so); 5593 } 5594#if MPTCP 5595 else if 
((SOCK_DOM(so) == PF_MULTIPATH) &&
5596 (SOCK_PROTO(so) == IPPROTO_TCP)) {
5597 ret = mptcp_notsent_lowat_check(so);
5598 }
5599#endif
5600 else {
5601 ret = 1; goto out;
5602 }
5603 } else {
5604 ret = 1;
5605 }
5606 }
5607 if (so_wait_for_if_feedback(so))
5608 ret = 0;
5609out:
5610 if ((hint & SO_FILT_HINT_LOCKED) == 0)
5611 socket_unlock(so, 1);
5612 return (ret);
5613}
5614
5615static void
5616filt_sockdetach(struct knote *kn)
5617{
5618 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
5619 socket_lock(so, 1);
5620
5621 if ((so->so_flags & SOF_KNOTE) != 0)
5622 if (KNOTE_DETACH(&so->so_klist, kn))
5623 so->so_flags &= ~SOF_KNOTE;
5624 socket_unlock(so, 1);
5625}
5626
5627static int
5628filt_sockev(struct knote *kn, long hint)
5629{
5630 int ret = 0, locked = 0;
5631 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
5632 long ev_hint = (hint & SO_FILT_HINT_EV);
5633
5634 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
5635 socket_lock(so, 1);
5636 locked = 1;
5637 }
5638
5639 if (ev_hint & SO_FILT_HINT_CONNRESET) {
5640 if (kn->kn_sfflags & NOTE_CONNRESET)
5641 kn->kn_fflags |= NOTE_CONNRESET;
5642 }
5643 if (ev_hint & SO_FILT_HINT_TIMEOUT) {
5644 if (kn->kn_sfflags & NOTE_TIMEOUT)
5645 kn->kn_fflags |= NOTE_TIMEOUT;
5646 }
5647 if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
5648 if (kn->kn_sfflags & NOTE_NOSRCADDR)
5649 kn->kn_fflags |= NOTE_NOSRCADDR;
5650 }
5651 if (ev_hint & SO_FILT_HINT_IFDENIED) {
5652 if ((kn->kn_sfflags & NOTE_IFDENIED))
5653 kn->kn_fflags |= NOTE_IFDENIED;
5654 }
5655 if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
5656 if (kn->kn_sfflags & NOTE_KEEPALIVE)
5657 kn->kn_fflags |= NOTE_KEEPALIVE;
5658 }
5659 if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
5660 if (kn->kn_sfflags & NOTE_ADAPTIVE_WTIMO)
5661 kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
5662 }
5663 if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
5664 if (kn->kn_sfflags & NOTE_ADAPTIVE_RTIMO)
5665 kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
5666 }
5667 if (ev_hint & SO_FILT_HINT_CONNECTED) {
5668 if (kn->kn_sfflags & NOTE_CONNECTED)
5669 kn->kn_fflags |= NOTE_CONNECTED;
5670 }
5671 if (ev_hint & SO_FILT_HINT_DISCONNECTED) {
5672 if (kn->kn_sfflags & NOTE_DISCONNECTED)
5673 kn->kn_fflags |= NOTE_DISCONNECTED;
5674 }
5675 if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
5676 if (so->so_proto != NULL &&
5677 (so->so_proto->pr_flags & PR_EVCONNINFO) &&
5678 (kn->kn_sfflags & NOTE_CONNINFO_UPDATED))
5679 kn->kn_fflags |= NOTE_CONNINFO_UPDATED;
5680 }
5681
5682 if ((kn->kn_sfflags & NOTE_READCLOSED) &&
5683 (so->so_state & SS_CANTRCVMORE)
5684#if CONTENT_FILTER
5685 && cfil_sock_data_pending(&so->so_rcv) == 0
5686#endif /* CONTENT_FILTER */
5687 )
5688 kn->kn_fflags |= NOTE_READCLOSED;
5689
5690 if ((kn->kn_sfflags & NOTE_WRITECLOSED) &&
5691 (so->so_state & SS_CANTSENDMORE))
5692 kn->kn_fflags |= NOTE_WRITECLOSED;
5693
5694 if ((kn->kn_sfflags & NOTE_SUSPEND) &&
5695 ((ev_hint & SO_FILT_HINT_SUSPEND) ||
5696 (so->so_flags & SOF_SUSPENDED))) {
5697 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
5698 kn->kn_fflags |= NOTE_SUSPEND;
5699 }
5700
5701 if ((kn->kn_sfflags & NOTE_RESUME) &&
5702 ((ev_hint & SO_FILT_HINT_RESUME) ||
5703 (so->so_flags & SOF_SUSPENDED) == 0)) {
5704 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
5705 kn->kn_fflags |= NOTE_RESUME;
5706 }
5707
5708 if (so->so_error != 0) {
5709 ret = 1;
5710 kn->kn_data = so->so_error;
5711 kn->kn_flags |= EV_EOF;
5712 } else {
5713 get_sockev_state(so, (u_int32_t *)&(kn->kn_data));
5714 }
5715
5716 if (kn->kn_fflags != 0)
5717 ret = 1;
5718
5719 if (locked)
5720
socket_unlock(so, 1); 5721 5722 return (ret); 5723} 5724 5725void 5726get_sockev_state(struct socket *so, u_int32_t *statep) 5727{ 5728 u_int32_t state = *(statep); 5729 5730 if (so->so_state & SS_ISCONNECTED) 5731 state |= SOCKEV_CONNECTED; 5732 else 5733 state &= ~(SOCKEV_CONNECTED); 5734 state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0); 5735 *(statep) = state; 5736} 5737 5738#define SO_LOCK_HISTORY_STR_LEN \ 5739 (2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1) 5740 5741__private_extern__ const char * 5742solockhistory_nr(struct socket *so) 5743{ 5744 size_t n = 0; 5745 int i; 5746 static char lock_history_str[SO_LOCK_HISTORY_STR_LEN]; 5747 5748 bzero(lock_history_str, sizeof (lock_history_str)); 5749 for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) { 5750 n += snprintf(lock_history_str + n, 5751 SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ", 5752 so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX], 5753 so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]); 5754 } 5755 return (lock_history_str); 5756} 5757 5758int 5759socket_lock(struct socket *so, int refcount) 5760{ 5761 int error = 0; 5762 void *lr_saved; 5763 5764 lr_saved = __builtin_return_address(0); 5765 5766 if (so->so_proto->pr_lock) { 5767 error = (*so->so_proto->pr_lock)(so, refcount, lr_saved); 5768 } else { 5769#ifdef MORE_LOCKING_DEBUG 5770 lck_mtx_assert(so->so_proto->pr_domain->dom_mtx, 5771 LCK_MTX_ASSERT_NOTOWNED); 5772#endif 5773 lck_mtx_lock(so->so_proto->pr_domain->dom_mtx); 5774 if (refcount) 5775 so->so_usecount++; 5776 so->lock_lr[so->next_lock_lr] = lr_saved; 5777 so->next_lock_lr = (so->next_lock_lr+1) % SO_LCKDBG_MAX; 5778 } 5779 5780 return (error); 5781} 5782 5783int 5784socket_unlock(struct socket *so, int refcount) 5785{ 5786 int error = 0; 5787 void *lr_saved; 5788 lck_mtx_t *mutex_held; 5789 5790 lr_saved = __builtin_return_address(0); 5791 5792 if (so->so_proto == NULL) { 5793 panic("%s: null so_proto so=%p\n", __func__, so); 5794 /* NOTREACHED */ 5795 } 5796 5797 if (so && so->so_proto->pr_unlock) { 5798 error = (*so->so_proto->pr_unlock)(so, refcount, lr_saved); 5799 } else { 5800 mutex_held = so->so_proto->pr_domain->dom_mtx; 5801#ifdef MORE_LOCKING_DEBUG 5802 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED); 5803#endif 5804 so->unlock_lr[so->next_unlock_lr] = lr_saved; 5805 so->next_unlock_lr = (so->next_unlock_lr+1) % SO_LCKDBG_MAX; 5806 5807 if (refcount) { 5808 if (so->so_usecount <= 0) { 5809 panic("%s: bad refcount=%d so=%p (%d, %d, %d) " 5810 "lrh=%s", __func__, so->so_usecount, so, 5811 SOCK_DOM(so), so->so_type, 5812 SOCK_PROTO(so), solockhistory_nr(so)); 5813 /* NOTREACHED */ 5814 } 5815 5816 so->so_usecount--; 5817 if (so->so_usecount == 0) 5818 sofreelastref(so, 1); 5819 } 5820 lck_mtx_unlock(mutex_held); 5821 } 5822 5823 return (error); 5824} 5825 5826/* Called with socket locked, will unlock socket */ 5827void 5828sofree(struct socket *so) 5829{ 5830 lck_mtx_t *mutex_held; 5831 5832 if (so->so_proto->pr_getlock != NULL) 5833 mutex_held = (*so->so_proto->pr_getlock)(so, 0); 5834 else 5835 mutex_held = so->so_proto->pr_domain->dom_mtx; 5836 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED); 5837 5838 sofreelastref(so, 0); 5839} 5840 5841void 5842soreference(struct socket *so) 5843{ 5844 socket_lock(so, 1); /* locks & take one reference on socket */ 5845 socket_unlock(so, 0); /* unlock only */ 5846} 5847 5848void 5849sodereference(struct socket *so) 5850{ 5851 socket_lock(so, 0); 5852 socket_unlock(so, 1); 5853} 5854 5855/* 5856 * Set or clear SOF_MULTIPAGES on the 
socket to enable or disable the 5857 * possibility of using jumbo clusters. Caller must ensure to hold 5858 * the socket lock. 5859 */ 5860void 5861somultipages(struct socket *so, boolean_t set) 5862{ 5863 if (set) 5864 so->so_flags |= SOF_MULTIPAGES; 5865 else 5866 so->so_flags &= ~SOF_MULTIPAGES; 5867} 5868 5869void 5870soif2kcl(struct socket *so, boolean_t set) 5871{ 5872 if (set) 5873 so->so_flags1 |= SOF1_IF_2KCL; 5874 else 5875 so->so_flags1 &= ~SOF1_IF_2KCL; 5876} 5877 5878int 5879so_isdstlocal(struct socket *so) { 5880 5881 struct inpcb *inp = (struct inpcb *)so->so_pcb; 5882 5883 if (SOCK_DOM(so) == PF_INET) 5884 return (inaddr_local(inp->inp_faddr)); 5885 else if (SOCK_DOM(so) == PF_INET6) 5886 return (in6addr_local(&inp->in6p_faddr)); 5887 5888 return (0); 5889} 5890 5891int 5892sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce) 5893{ 5894 struct sockbuf *rcv, *snd; 5895 int err = 0, defunct; 5896 5897 rcv = &so->so_rcv; 5898 snd = &so->so_snd; 5899 5900 defunct = (so->so_flags & SOF_DEFUNCT); 5901 if (defunct) { 5902 if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) { 5903 panic("%s: SB_DROP not set", __func__); 5904 /* NOTREACHED */ 5905 } 5906 goto done; 5907 } 5908 5909 if (so->so_flags & SOF_NODEFUNCT) { 5910 if (noforce) { 5911 err = EOPNOTSUPP; 5912 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) " 5913 "so 0x%llx [%d,%d] is not eligible for defunct " 5914 "(%d)\n", __func__, proc_selfpid(), proc_pid(p), 5915 level, (uint64_t)VM_KERNEL_ADDRPERM(so), 5916 SOCK_DOM(so), SOCK_TYPE(so), err)); 5917 return (err); 5918 } 5919 so->so_flags &= ~SOF_NODEFUNCT; 5920 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx " 5921 "[%d,%d] defunct by force\n", __func__, proc_selfpid(), 5922 proc_pid(p), level, (uint64_t)VM_KERNEL_ADDRPERM(so), 5923 SOCK_DOM(so), SOCK_TYPE(so))); 5924 } 5925 5926 so->so_flags |= SOF_DEFUNCT; 5927 5928 /* Prevent further data from being appended to the socket buffers */ 5929 snd->sb_flags |= SB_DROP; 5930 rcv->sb_flags |= SB_DROP; 5931 5932 /* Flush any existing data in the socket buffers */ 5933 if (rcv->sb_cc != 0) { 5934 rcv->sb_flags &= ~SB_SEL; 5935 selthreadclear(&rcv->sb_sel); 5936 sbrelease(rcv); 5937 } 5938 if (snd->sb_cc != 0) { 5939 snd->sb_flags &= ~SB_SEL; 5940 selthreadclear(&snd->sb_sel); 5941 sbrelease(snd); 5942 } 5943 5944done: 5945 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx [%d,%d] %s " 5946 "defunct\n", __func__, proc_selfpid(), proc_pid(p), level, 5947 (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so), SOCK_TYPE(so), 5948 defunct ? "is already" : "marked as")); 5949 5950 return (err); 5951} 5952 5953int 5954sodefunct(struct proc *p, struct socket *so, int level) 5955{ 5956 struct sockbuf *rcv, *snd; 5957 5958 if (!(so->so_flags & SOF_DEFUNCT)) { 5959 panic("%s improperly called", __func__); 5960 /* NOTREACHED */ 5961 } 5962 if (so->so_state & SS_DEFUNCT) 5963 goto done; 5964 5965 rcv = &so->so_rcv; 5966 snd = &so->so_snd; 5967 5968 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) { 5969 char s[MAX_IPv6_STR_LEN]; 5970 char d[MAX_IPv6_STR_LEN]; 5971 struct inpcb *inp = sotoinpcb(so); 5972 5973 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx [%s " 5974 "%s:%d -> %s:%d] is now defunct [rcv_si 0x%x, snd_si 0x%x, " 5975 "rcv_fl 0x%x, snd_fl 0x%x]\n", __func__, proc_selfpid(), 5976 proc_pid(p), level, (uint64_t)VM_KERNEL_ADDRPERM(so), 5977 (SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP", 5978 inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ? 
5979 (void *)&inp->inp_laddr.s_addr : (void *)&inp->in6p_laddr),
5980 s, sizeof (s)), ntohs(inp->in6p_lport),
5981 inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
5982 (void *)&inp->inp_faddr.s_addr : (void *)&inp->in6p_faddr,
5983 d, sizeof (d)), ntohs(inp->in6p_fport),
5984 (uint32_t)rcv->sb_sel.si_flags,
5985 (uint32_t)snd->sb_sel.si_flags,
5986 rcv->sb_flags, snd->sb_flags));
5987 } else {
5988 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx "
5989 "[%d,%d] is now defunct [rcv_si 0x%x, snd_si 0x%x, "
5990 "rcv_fl 0x%x, snd_fl 0x%x]\n", __func__, proc_selfpid(),
5991 proc_pid(p), level, (uint64_t)VM_KERNEL_ADDRPERM(so),
5992 SOCK_DOM(so), SOCK_TYPE(so), (uint32_t)rcv->sb_sel.si_flags,
5993 (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
5994 snd->sb_flags));
5995 }
5996
5997 /*
5998 * Unwedge threads blocked on sbwait() and sb_lock().
5999 */
6000 sbwakeup(rcv);
6001 sbwakeup(snd);
6002
6003 so->so_flags1 |= SOF1_DEFUNCTINPROG;
6004 if (rcv->sb_flags & SB_LOCK)
6005 sbunlock(rcv, TRUE); /* keep socket locked */
6006 if (snd->sb_flags & SB_LOCK)
6007 sbunlock(snd, TRUE); /* keep socket locked */
6008
6009 /*
6010 * Flush the buffers and disconnect. We explicitly call shutdown
6011 * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
6012 * states are set for the socket. This would also flush out data
6013 * hanging off the receive list of this socket.
6014 */
6015 (void) soshutdownlock_final(so, SHUT_RD);
6016 (void) soshutdownlock_final(so, SHUT_WR);
6017 (void) sodisconnectlocked(so);
6018
6019 /*
6020 * Explicitly handle connectionless-protocol disconnection
6021 * and release any remaining data in the socket buffers.
6022 */
6023 if (!(so->so_state & SS_ISDISCONNECTED))
6024 (void) soisdisconnected(so);
6025
6026 if (so->so_error == 0)
6027 so->so_error = EBADF;
6028
6029 if (rcv->sb_cc != 0) {
6030 rcv->sb_flags &= ~SB_SEL;
6031 selthreadclear(&rcv->sb_sel);
6032 sbrelease(rcv);
6033 }
6034 if (snd->sb_cc != 0) {
6035 snd->sb_flags &= ~SB_SEL;
6036 selthreadclear(&snd->sb_sel);
6037 sbrelease(snd);
6038 }
6039 so->so_state |= SS_DEFUNCT;
6040
6041done:
6042 return (0);
6043}
6044
6045__private_extern__ int
6046so_set_recv_anyif(struct socket *so, int optval)
6047{
6048 int ret = 0;
6049
6050#if INET6
6051 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6052#else
6053 if (SOCK_DOM(so) == PF_INET) {
6054#endif /* !INET6 */
6055 if (optval)
6056 sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
6057 else
6058 sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
6059 }
6060
6061 return (ret);
6062}
6063
6064__private_extern__ int
6065so_get_recv_anyif(struct socket *so)
6066{
6067 int ret = 0;
6068
6069#if INET6
6070 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6071#else
6072 if (SOCK_DOM(so) == PF_INET) {
6073#endif /* !INET6 */
6074 ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
6075 }
6076
6077 return (ret);
6078}
6079
6080int
6081so_set_restrictions(struct socket *so, uint32_t vals)
6082{
6083 int nocell_old, nocell_new;
6084 int noexpensive_old, noexpensive_new;
6085
6086 /*
6087 * Deny-type restrictions are trapdoors; once set they cannot be
6088 * unset for the lifetime of the socket. This allows them to be
6089 * issued by a framework on behalf of the application without
6090 * having to worry that they can be undone.
6091 *
6092 * Note here that socket-level restrictions override any protocol
6093 * level restrictions. For instance, the SO_RESTRICT_DENY_CELLULAR
6094 * socket restriction issued on the socket has a higher precedence
6095 * than INP_NO_IFT_CELLULAR. The latter is affected by the UUID
6096 * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
6097 * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
6098 */
6099 nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
6100 noexpensive_old = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
6101 so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
6102 SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR |
6103 SO_RESTRICT_DENY_EXPENSIVE));
6104 nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
6105 noexpensive_new = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
6106
6107 /* we can only set, not clear restrictions */
6108 if ((nocell_new - nocell_old) == 0 &&
6109 (noexpensive_new - noexpensive_old) == 0)
6110 return (0);
6111#if INET6
6112 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6113#else
6114 if (SOCK_DOM(so) == PF_INET) {
6115#endif /* !INET6 */
6116 if (nocell_new - nocell_old != 0) {
6117 /* if deny cellular is now set, do what's needed for INPCB */
6118 inp_set_nocellular(sotoinpcb(so));
6119 }
6120 if (noexpensive_new - noexpensive_old != 0) {
6121 inp_set_noexpensive(sotoinpcb(so));
6122 }
6123 }
6124
6125 return (0);
6126}
6127
6128uint32_t
6129so_get_restrictions(struct socket *so)
6130{
6131 return (so->so_restrictions & (SO_RESTRICT_DENY_IN |
6132 SO_RESTRICT_DENY_OUT |
6133 SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE));
6134}
6135
6136struct sockaddr_entry *
6137sockaddrentry_alloc(int how)
6138{
6139 struct sockaddr_entry *se;
6140
6141 se = (how == M_WAITOK) ? zalloc(se_zone) : zalloc_noblock(se_zone);
6142 if (se != NULL)
6143 bzero(se, se_zone_size);
6144
6145 return (se);
6146}
6147
6148void
6149sockaddrentry_free(struct sockaddr_entry *se)
6150{
6151 if (se->se_addr != NULL) {
6152 FREE(se->se_addr, M_SONAME);
6153 se->se_addr = NULL;
6154 }
6155 zfree(se_zone, se);
6156}
6157
6158struct sockaddr_entry *
6159sockaddrentry_dup(const struct sockaddr_entry *src_se, int how)
6160{
6161 struct sockaddr_entry *dst_se;
6162
6163 dst_se = sockaddrentry_alloc(how);
6164 if (dst_se != NULL) {
6165 int len = src_se->se_addr->sa_len;
6166
6167 MALLOC(dst_se->se_addr, struct sockaddr *,
6168 len, M_SONAME, how | M_ZERO);
6169 if (dst_se->se_addr != NULL) {
6170 bcopy(src_se->se_addr, dst_se->se_addr, len);
6171 } else {
6172 sockaddrentry_free(dst_se);
6173 dst_se = NULL;
6174 }
6175 }
6176
6177 return (dst_se);
6178}
6179
6180struct sockaddr_list *
6181sockaddrlist_alloc(int how)
6182{
6183 struct sockaddr_list *sl;
6184
6185 sl = (how == M_WAITOK) ?
zalloc(sl_zone) : zalloc_noblock(sl_zone); 6186 if (sl != NULL) { 6187 bzero(sl, sl_zone_size); 6188 TAILQ_INIT(&sl->sl_head); 6189 } 6190 return (sl); 6191} 6192 6193void 6194sockaddrlist_free(struct sockaddr_list *sl) 6195{ 6196 struct sockaddr_entry *se, *tse; 6197 6198 TAILQ_FOREACH_SAFE(se, &sl->sl_head, se_link, tse) { 6199 sockaddrlist_remove(sl, se); 6200 sockaddrentry_free(se); 6201 } 6202 VERIFY(sl->sl_cnt == 0 && TAILQ_EMPTY(&sl->sl_head)); 6203 zfree(sl_zone, sl); 6204} 6205 6206void 6207sockaddrlist_insert(struct sockaddr_list *sl, struct sockaddr_entry *se) 6208{ 6209 VERIFY(!(se->se_flags & SEF_ATTACHED)); 6210 se->se_flags |= SEF_ATTACHED; 6211 TAILQ_INSERT_TAIL(&sl->sl_head, se, se_link); 6212 sl->sl_cnt++; 6213 VERIFY(sl->sl_cnt != 0); 6214} 6215 6216void 6217sockaddrlist_remove(struct sockaddr_list *sl, struct sockaddr_entry *se) 6218{ 6219 VERIFY(se->se_flags & SEF_ATTACHED); 6220 se->se_flags &= ~SEF_ATTACHED; 6221 VERIFY(sl->sl_cnt != 0); 6222 sl->sl_cnt--; 6223 TAILQ_REMOVE(&sl->sl_head, se, se_link); 6224} 6225 6226struct sockaddr_list * 6227sockaddrlist_dup(const struct sockaddr_list *src_sl, int how) 6228{ 6229 struct sockaddr_entry *src_se, *tse; 6230 struct sockaddr_list *dst_sl; 6231 6232 dst_sl = sockaddrlist_alloc(how); 6233 if (dst_sl == NULL) 6234 return (NULL); 6235 6236 TAILQ_FOREACH_SAFE(src_se, &src_sl->sl_head, se_link, tse) { 6237 struct sockaddr_entry *dst_se; 6238 6239 if (src_se->se_addr == NULL) 6240 continue; 6241 6242 dst_se = sockaddrentry_dup(src_se, how); 6243 if (dst_se == NULL) { 6244 sockaddrlist_free(dst_sl); 6245 return (NULL); 6246 } 6247 6248 sockaddrlist_insert(dst_sl, dst_se); 6249 } 6250 VERIFY(src_sl->sl_cnt == dst_sl->sl_cnt); 6251 6252 return (dst_sl); 6253} 6254 6255int 6256so_set_effective_pid(struct socket *so, int epid, struct proc *p) 6257{ 6258 struct proc *ep = PROC_NULL; 6259 int error = 0; 6260 6261 /* pid 0 is reserved for kernel */ 6262 if (epid == 0) { 6263 error = EINVAL; 6264 goto done; 6265 } 6266 6267 /* 6268 * If this is an in-kernel socket, prevent its delegate 6269 * association from changing unless the socket option is 6270 * coming from within the kernel itself. 6271 */ 6272 if (so->last_pid == 0 && p != kernproc) { 6273 error = EACCES; 6274 goto done; 6275 } 6276 6277 /* 6278 * If this is issued by a process that's recorded as the 6279 * real owner of the socket, or if the pid is the same as 6280 * the process's own pid, then proceed. Otherwise ensure 6281 * that the issuing process has the necessary privileges. 6282 */ 6283 if (epid != so->last_pid || epid != proc_pid(p)) { 6284 if ((error = priv_check_cred(kauth_cred_get(), 6285 PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) { 6286 error = EACCES; 6287 goto done; 6288 } 6289 } 6290 6291 /* Find the process that corresponds to the effective pid */ 6292 if ((ep = proc_find(epid)) == PROC_NULL) { 6293 error = ESRCH; 6294 goto done; 6295 } 6296 6297 /* 6298 * If a process tries to delegate the socket to itself, then 6299 * there's really nothing to do; treat it as a way for the 6300 * delegate association to be cleared. Note that we check 6301 * the passed-in proc rather than calling proc_selfpid(), 6302 * as we need to check the process issuing the socket option 6303 * which could be kernproc. Given that we don't allow 0 for 6304 * effective pid, it means that a delegated in-kernel socket 6305 * stays delegated during its lifetime (which is probably OK.) 
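 *
 * For example (illustrative, not from this file): a process with
 * pid 100 issuing SO_DELEGATED with a value of 100 lands in the
 * first branch below and clears SOF_DELEGATED, while a value naming
 * any other live process records that process as the effective
 * owner in e_pid/e_upid/e_uuid.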
int
so_set_effective_pid(struct socket *so, int epid, struct proc *p)
{
	struct proc *ep = PROC_NULL;
	int error = 0;

	/* pid 0 is reserved for the kernel */
	if (epid == 0) {
		error = EINVAL;
		goto done;
	}

	/*
	 * If this is an in-kernel socket, prevent its delegate
	 * association from changing unless the socket option is
	 * coming from within the kernel itself.
	 */
	if (so->last_pid == 0 && p != kernproc) {
		error = EACCES;
		goto done;
	}

	/*
	 * If this is issued by a process that's recorded as the
	 * real owner of the socket, or if the pid is the same as
	 * the process's own pid, then proceed.  Otherwise ensure
	 * that the issuing process has the necessary privileges.
	 */
	if (epid != so->last_pid || epid != proc_pid(p)) {
		if ((error = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
			error = EACCES;
			goto done;
		}
	}

	/* Find the process that corresponds to the effective pid */
	if ((ep = proc_find(epid)) == PROC_NULL) {
		error = ESRCH;
		goto done;
	}

	/*
	 * If a process tries to delegate the socket to itself, then
	 * there's really nothing to do; treat it as a way for the
	 * delegate association to be cleared.  Note that we check
	 * the passed-in proc rather than calling proc_selfpid(),
	 * as we need to check the process issuing the socket option,
	 * which could be kernproc.  Given that we don't allow 0 for
	 * the effective pid, a delegated in-kernel socket stays
	 * delegated during its lifetime (which is probably OK.)
	 */
	if (epid == proc_pid(p)) {
		so->so_flags &= ~SOF_DELEGATED;
		so->e_upid = 0;
		so->e_pid = 0;
		uuid_clear(so->e_uuid);
	} else {
		so->so_flags |= SOF_DELEGATED;
		so->e_upid = proc_uniqueid(ep);
		so->e_pid = proc_pid(ep);
		proc_getexecutableuuid(ep, so->e_uuid, sizeof (so->e_uuid));
	}
done:
	if (error == 0 && net_io_policy_log) {
		uuid_string_t buf;

		uuid_unparse(so->e_uuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
		    "euuid %s%s\n", __func__, proc_name_address(p),
		    proc_pid(p), (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), so->e_pid, proc_name_address(ep), buf,
		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
	} else if (error != 0 && net_io_policy_log) {
		log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
		    "ERROR (%d)\n", __func__, proc_name_address(p),
		    proc_pid(p), (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), epid, (ep == PROC_NULL) ? "PROC_NULL" :
		    proc_name_address(ep), error);
	}

	/* Update this socket's policy upon success */
	if (error == 0) {
		so->so_policy_gencnt *= -1;
		so_update_policy(so);
#if NECP
		so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */
	}

	if (ep != PROC_NULL)
		proc_rele(ep);

	return (error);
}
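
/*
 * Illustrative sketch (not compiled into this file): these delegate
 * helpers are normally reached from sosetoptlock() when a user process
 * sets the private SO_DELEGATED / SO_DELEGATED_UUID socket options;
 * from userland that looks roughly like:
 *
 *	pid_t epid = ...;	// pid the traffic is attributed to
 *
 *	if (setsockopt(s, SOL_SOCKET, SO_DELEGATED,
 *	    &epid, sizeof (epid)) == -1)
 *		err(1, "SO_DELEGATED");
 *
 * Per the checks in so_set_effective_pid() above, the caller needs the
 * PRIV_NET_PRIVILEGED_SOCKET_DELEGATE privilege unless epid matches
 * both its own pid and the socket's recorded owner pid.
 */
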
int
so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p)
{
	uuid_string_t buf;
	uuid_t uuid;
	int error = 0;

	/* UUID must not be all-zeroes (reserved for the kernel) */
	if (uuid_is_null(euuid)) {
		error = EINVAL;
		goto done;
	}

	/*
	 * If this is an in-kernel socket, prevent its delegate
	 * association from changing unless the socket option is
	 * coming from within the kernel itself.
	 */
	if (so->last_pid == 0 && p != kernproc) {
		error = EACCES;
		goto done;
	}

	/* Get the UUID of the issuing process */
	proc_getexecutableuuid(p, uuid, sizeof (uuid));

	/*
	 * If this is issued by a process that's recorded as the
	 * real owner of the socket, or if the uuid is the same as
	 * the process's own uuid, then proceed.  Otherwise ensure
	 * that the issuing process has the necessary privileges.
	 */
	if (uuid_compare(euuid, so->last_uuid) != 0 ||
	    uuid_compare(euuid, uuid) != 0) {
		if ((error = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
			error = EACCES;
			goto done;
		}
	}

	/*
	 * If a process tries to delegate the socket to itself, then
	 * there's really nothing to do; treat it as a way for the
	 * delegate association to be cleared.  Note that we check
	 * the uuid of the passed-in proc rather than that of the
	 * current process, as we need to check the process issuing
	 * the socket option, which could be kernproc itself.  Given
	 * that we don't allow a null effective uuid, a delegated
	 * in-kernel socket stays delegated during its lifetime
	 * (which is okay.)
	 */
	if (uuid_compare(euuid, uuid) == 0) {
		so->so_flags &= ~SOF_DELEGATED;
		so->e_upid = 0;
		so->e_pid = 0;
		uuid_clear(so->e_uuid);
	} else {
		so->so_flags |= SOF_DELEGATED;
		/*
		 * Unlike so_set_effective_pid(), we only have the UUID
		 * here and the process ID is not known.  Inherit the
		 * real {pid,upid} of the socket.
		 */
		so->e_upid = so->last_upid;
		so->e_pid = so->last_pid;
		uuid_copy(so->e_uuid, euuid);
	}

done:
	if (error == 0 && net_io_policy_log) {
		uuid_unparse(so->e_uuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
		    "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
		    (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), so->e_pid, buf,
		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
	} else if (error != 0 && net_io_policy_log) {
		uuid_unparse(euuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
		    "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
		    (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), buf, error);
	}

	/* Update this socket's policy upon success */
	if (error == 0) {
		so->so_policy_gencnt *= -1;
		so_update_policy(so);
#if NECP
		so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */
	}

	return (error);
}

void
netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
    uint32_t ev_datalen)
{
	struct kev_msg ev_msg;

	/*
	 * A netpolicy event always starts with a netpolicy_event_data
	 * structure, but the caller can provide for a longer event
	 * structure to post, depending on the event code.
	 */
	VERIFY(ev_data != NULL && ev_datalen >= sizeof (*ev_data));

	bzero(&ev_msg, sizeof (ev_msg));
	ev_msg.vendor_code = KEV_VENDOR_APPLE;
	ev_msg.kev_class = KEV_NETWORK_CLASS;
	ev_msg.kev_subclass = KEV_NETPOLICY_SUBCLASS;
	ev_msg.event_code = ev_code;

	ev_msg.dv[0].data_ptr = ev_data;
	ev_msg.dv[0].data_length = ev_datalen;

	kev_post_msg(&ev_msg);
}

void
socket_post_kev_msg(uint32_t ev_code,
    struct kev_socket_event_data *ev_data,
    uint32_t ev_datalen)
{
	struct kev_msg ev_msg;

	bzero(&ev_msg, sizeof (ev_msg));
	ev_msg.vendor_code = KEV_VENDOR_APPLE;
	ev_msg.kev_class = KEV_NETWORK_CLASS;
	ev_msg.kev_subclass = KEV_SOCKET_SUBCLASS;
	ev_msg.event_code = ev_code;

	ev_msg.dv[0].data_ptr = ev_data;
	ev_msg.dv[0].data_length = ev_datalen;

	kev_post_msg(&ev_msg);
}

void
socket_post_kev_msg_closed(struct socket *so)
{
	struct kev_socket_closed ev;
	struct sockaddr *socksa = NULL, *peersa = NULL;
	int err;

	bzero(&ev, sizeof (ev));
	err = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &socksa);
	if (err == 0) {
		err = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so,
		    &peersa);
		if (err == 0) {
			memcpy(&ev.ev_data.kev_sockname, socksa,
			    min(socksa->sa_len,
			    sizeof (ev.ev_data.kev_sockname)));
			memcpy(&ev.ev_data.kev_peername, peersa,
			    min(peersa->sa_len,
			    sizeof (ev.ev_data.kev_peername)));
			socket_post_kev_msg(KEV_SOCKET_CLOSED,
			    &ev.ev_data, sizeof (ev));
		}
	}
	if (socksa != NULL)
		FREE(socksa, M_SONAME);
	if (peersa != NULL)
		FREE(peersa, M_SONAME);
}
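
/*
 * Illustrative sketch (not compiled into this file): a user process
 * can observe the KEV_SOCKET_CLOSED events posted above by opening a
 * kernel event socket and filtering on the socket subclass:
 *
 *	int fd = socket(PF_SYSTEM, SOCK_RAW, SYSPROTO_EVENT);
 *	struct kev_request req = {
 *		.vendor_code = KEV_VENDOR_APPLE,
 *		.kev_class = KEV_NETWORK_CLASS,
 *		.kev_subclass = KEV_SOCKET_SUBCLASS,
 *	};
 *	char buf[1024];
 *	struct kern_event_msg *msg = (struct kern_event_msg *)buf;
 *
 *	if (fd == -1 || ioctl(fd, SIOCSKEVFILT, &req) == -1)
 *		err(1, "kernel event socket");
 *	while (recv(fd, buf, sizeof (buf), 0) > 0) {
 *		if (msg->event_code == KEV_SOCKET_CLOSED) {
 *			// event_data holds kev_socket_event_data,
 *			// i.e. the local/peer sockaddrs copied above
 *		}
 *	}
 *
 * Whether a given socket generates KEV_SOCKET_CLOSED depends on its
 * protocol calling socket_post_kev_msg_closed() at close time.
 */
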