/*
 * Copyright (c) 1998-2013 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 */
/*
 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
 * support for mandatory and extensible security protections.  This notice
 * is included in support of clause 2.2 (b) of the Apple Public License,
 * Version 2.0.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/proc.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/file_internal.h>
#include <sys/fcntl.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/event.h>
#include <sys/poll.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/uio.h>
#include <sys/ev.h>
#include <sys/kdebug.h>
#include <sys/un.h>
#include <sys/user.h>
#include <sys/priv.h>
#include <sys/kern_event.h>
#include <net/route.h>
#include <net/init.h>
#include <net/ntstat.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet/flow_divert.h>
#include <kern/zalloc.h>
#include <kern/locks.h>
#include <machine/limits.h>
#include <libkern/OSAtomic.h>
#include <pexpert/pexpert.h>
#include <kern/assert.h>
#include <kern/task.h>
#include <sys/kpi_mbuf.h>
#include <sys/mcache.h>

#if CONFIG_MACF
#include <security/mac.h>
#include <security/mac_framework.h>
#endif /* MAC */

#if MULTIPATH
#include <netinet/mp_pcb.h>
#endif /* MULTIPATH */

/* TODO: this should be in a header file somewhere */
extern char *proc_name_address(void *p);

static u_int32_t so_cache_hw;		/* High water mark for socache */
static u_int32_t so_cache_timeouts;	/* number of timeouts */
static u_int32_t so_cache_max_freed;	/* max freed per timeout */
static u_int32_t cached_sock_count = 0;
STAILQ_HEAD(, socket) so_cache_head;
int max_cached_sock_count = MAX_CACHED_SOCKETS;
static u_int32_t so_cache_time;
static int socketinit_done;
static struct zone *so_cache_zone;

static lck_grp_t *so_cache_mtx_grp;
static lck_attr_t *so_cache_mtx_attr;
static lck_grp_attr_t *so_cache_mtx_grp_attr;
static lck_mtx_t *so_cache_mtx;

#include <machine/limits.h>

static void filt_sordetach(struct knote *kn);
static int filt_soread(struct knote *kn, long hint);
static void filt_sowdetach(struct knote *kn);
static int filt_sowrite(struct knote *kn, long hint);
static void filt_sockdetach(struct knote *kn);
static int filt_sockev(struct knote *kn, long hint);

static int sooptcopyin_timeval(struct sockopt *, struct timeval *);
static int sooptcopyout_timeval(struct sockopt *, const struct timeval *);

static struct filterops soread_filtops = {
    .f_isfd = 1,
    .f_detach = filt_sordetach,
    .f_event = filt_soread,
};

static struct filterops sowrite_filtops = {
    .f_isfd = 1,
    .f_detach = filt_sowdetach,
    .f_event = filt_sowrite,
};

static struct filterops sock_filtops = {
    .f_isfd = 1,
    .f_detach = filt_sockdetach,
    .f_event = filt_sockev,
};

#define EVEN_MORE_LOCKING_DEBUG	0
int socket_debug = 0;
static int socket_zone = M_SOCKET;
so_gen_t so_gencnt;	/* generation count for sockets */

MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

#define DBG_LAYER_IN_BEG	NETDBG_CODE(DBG_NETSOCK, 0)
#define DBG_LAYER_IN_END	NETDBG_CODE(DBG_NETSOCK, 2)
#define DBG_LAYER_OUT_BEG	NETDBG_CODE(DBG_NETSOCK, 1)
#define DBG_LAYER_OUT_END	NETDBG_CODE(DBG_NETSOCK, 3)
#define DBG_FNC_SOSEND		NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
#define DBG_FNC_SORECEIVE	NETDBG_CODE(DBG_NETSOCK, (8 << 8))
#define DBG_FNC_SOSHUTDOWN	NETDBG_CODE(DBG_NETSOCK, (9 << 8))

#define MAX_SOOPTGETM_SIZE	(128 * MCLBYTES)

SYSCTL_DECL(_kern_ipc);

int somaxconn = SOMAXCONN;
SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
    CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");
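
/*
 * Example (illustrative, not part of this file): somaxconn is exported
 * through sysctl and can be inspected or tuned from user space:
 *
 *	sysctl kern.ipc.somaxconn		# read the current ceiling
 *	sysctl -w kern.ipc.somaxconn=2048	# raise it (requires root)
 *
 * or programmatically via sysctlbyname("kern.ipc.somaxconn", ...).
 * solisten() below clamps each listening socket's backlog to this value.
 */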

/* Should we get a maximum also ??? */
static int sosendmaxchain = 65536;
static int sosendminchain = 16384;
static int sorecvmincopy = 16384;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");

/*
 * Set to enable jumbo clusters (if available) for large writes when
 * the socket is marked with SOF_MULTIPAGES; see below.
 */
int sosendjcl = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");

/*
 * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
 * writes on the socket for all protocols on any network interfaces,
 * depending upon sosendjcl above. Be extra careful when setting this
 * to 1, because sending down packets that cross physical pages down to
 * broken drivers (those that falsely assume that the physical pages
 * are contiguous) might lead to system panics or silent data corruption.
 * When set to 0, the system will respect SOF_MULTIPAGES, which is set
 * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
 * capable. Set this to 1 only for testing/debugging purposes.
 */
int sosendjcl_ignore_capab = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, "");

int sodefunctlog = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sodefunctlog, 0, "");

int sothrottlelog = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sothrottlelog, 0, "");

int sorestrictrecv = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sorestrictrecv, 0, "Enable inbound interface restrictions");

/*
 * Socket operation routines.
 * These routines are called by the routines in
 * sys_socket.c or from a system process, and
 * implement the semantics of socket operations by
 * switching out to the protocol specific routines.
 */

/* sys_generic.c */
extern void postevent(struct socket *, struct sockbuf *, int);
extern void evsofree(struct socket *);
extern int tcp_notsent_lowat_check(struct socket *so);
extern struct inpcbinfo tcbinfo;

/* TODO: these should be in header file */
extern int get_inpcb_str_size(void);
extern int get_tcp_str_size(void);

static unsigned int sl_zone_size;	/* size of sockaddr_list */
static struct zone *sl_zone;		/* zone for sockaddr_list */

static unsigned int se_zone_size;	/* size of sockaddr_entry */
static struct zone *se_zone;		/* zone for sockaddr_entry */

vm_size_t so_cache_zone_element_size;

static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **,
    user_ssize_t *);
static void cached_sock_alloc(struct socket **, int);
static void cached_sock_free(struct socket *);

/*
 * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
 * setting the DSCP code on the packet based on the service class; see
 * <rdar://problem/11277343> for details.
 */
__private_extern__ u_int32_t sotcdb = SOTCDB_NO_DSCP;
SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sotcdb, 0, "");

void
socketinit(void)
{
    if (socketinit_done) {
        printf("socketinit: already called...\n");
        return;
    }
    socketinit_done = 1;

    PE_parse_boot_argn("socket_debug", &socket_debug,
        sizeof (socket_debug));

    /*
     * allocate lock group attribute and group for socket cache mutex
     */
    so_cache_mtx_grp_attr = lck_grp_attr_alloc_init();
    so_cache_mtx_grp = lck_grp_alloc_init("so_cache",
        so_cache_mtx_grp_attr);

    /*
     * allocate the lock attribute for socket cache mutex
     */
    so_cache_mtx_attr = lck_attr_alloc_init();

    /* cached sockets mutex */
    so_cache_mtx = lck_mtx_alloc_init(so_cache_mtx_grp, so_cache_mtx_attr);
    if (so_cache_mtx == NULL) {
        panic("%s: unable to allocate so_cache_mtx\n", __func__);
        /* NOTREACHED */
    }
    STAILQ_INIT(&so_cache_head);

    so_cache_zone_element_size = (vm_size_t)(sizeof (struct socket) + 4
        + get_inpcb_str_size() + 4 + get_tcp_str_size());

    so_cache_zone = zinit(so_cache_zone_element_size,
        (120000 * so_cache_zone_element_size), 8192, "socache zone");
    zone_change(so_cache_zone, Z_CALLERACCT, FALSE);
    zone_change(so_cache_zone, Z_NOENCRYPT, TRUE);

    sl_zone_size = sizeof (struct sockaddr_list);
    if ((sl_zone = zinit(sl_zone_size, 1024 * sl_zone_size, 1024,
        "sockaddr_list")) == NULL) {
        panic("%s: unable to allocate sockaddr_list zone\n", __func__);
        /* NOTREACHED */
    }
    zone_change(sl_zone, Z_CALLERACCT, FALSE);
    zone_change(sl_zone, Z_EXPAND, TRUE);

    se_zone_size = sizeof (struct sockaddr_entry);
    if ((se_zone = zinit(se_zone_size, 1024 * se_zone_size, 1024,
        "sockaddr_entry")) == NULL) {
        panic("%s: unable to allocate sockaddr_entry zone\n", __func__);
        /* NOTREACHED */
    }
    zone_change(se_zone, Z_CALLERACCT, FALSE);
    zone_change(se_zone, Z_EXPAND, TRUE);

    in_pcbinit();
    sflt_init();
    socket_tclass_init();
#if MULTIPATH
    mp_pcbinit();
#endif /* MULTIPATH */
}

static void
cached_sock_alloc(struct socket **so, int waitok)
{
    caddr_t temp;
    uintptr_t offset;

    lck_mtx_lock(so_cache_mtx);

    if (!STAILQ_EMPTY(&so_cache_head)) {
        VERIFY(cached_sock_count > 0);

        *so = STAILQ_FIRST(&so_cache_head);
        STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
        STAILQ_NEXT((*so), so_cache_ent) = NULL;

        cached_sock_count--;
        lck_mtx_unlock(so_cache_mtx);

        temp = (*so)->so_saved_pcb;
        bzero((caddr_t)*so, sizeof (struct socket));

        (*so)->so_saved_pcb = temp;
    } else {
        lck_mtx_unlock(so_cache_mtx);

        if (waitok)
            *so = (struct socket *)zalloc(so_cache_zone);
        else
            *so = (struct socket *)zalloc_noblock(so_cache_zone);

        if (*so == NULL)
            return;

        bzero((caddr_t)*so, sizeof (struct socket));

        /*
         * Define offsets for extra structures into our
         * single block of memory. Align extra structures
         * on longword boundaries.
         */
        offset = (uintptr_t)*so;
        offset += sizeof (struct socket);

        offset = ALIGN(offset);

        (*so)->so_saved_pcb = (caddr_t)offset;
        offset += get_inpcb_str_size();

        offset = ALIGN(offset);

        ((struct inpcb *)(void *)(*so)->so_saved_pcb)->inp_saved_ppcb =
            (caddr_t)offset;
    }

    (*so)->cached_in_sock_layer = true;
}
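
/*
 * Layout of a cached (PF_INET/SOCK_STREAM) socket as carved out of a
 * single so_cache_zone element by cached_sock_alloc() above; a sketch
 * derived from the offset arithmetic, with padding from ALIGN() where
 * needed:
 *
 *	+-----------------+  <- element base
 *	| struct socket   |
 *	+-----------------+  <- ALIGN()ed; so_saved_pcb points here
 *	| inpcb storage   |  (get_inpcb_str_size() bytes)
 *	+-----------------+  <- ALIGN()ed; inp_saved_ppcb points here
 *	| tcpcb storage   |  (get_tcp_str_size() bytes)
 *	+-----------------+
 *
 * The 4-byte slop terms in so_cache_zone_element_size (socketinit())
 * cover this alignment padding.
 */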

static void
cached_sock_free(struct socket *so)
{
    lck_mtx_lock(so_cache_mtx);

    so_cache_time = net_uptime();
    if (++cached_sock_count > max_cached_sock_count) {
        --cached_sock_count;
        lck_mtx_unlock(so_cache_mtx);
        zfree(so_cache_zone, so);
    } else {
        if (so_cache_hw < cached_sock_count)
            so_cache_hw = cached_sock_count;

        STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent);

        so->cache_timestamp = so_cache_time;
        lck_mtx_unlock(so_cache_mtx);
    }
}

void
so_update_last_owner_locked(struct socket *so, proc_t self)
{
    if (so->last_pid != 0) {
        /*
         * last_pid and last_upid should remain zero for sockets
         * created using sock_socket. The check above achieves that.
         */
        if (self == PROC_NULL)
            self = current_proc();

        if (so->last_upid != proc_uniqueid(self) ||
            so->last_pid != proc_pid(self)) {
            so->last_upid = proc_uniqueid(self);
            so->last_pid = proc_pid(self);
            proc_getexecutableuuid(self, so->last_uuid,
                sizeof (so->last_uuid));
        }
    }
}

void
so_update_policy(struct socket *so)
{
    if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6)
        (void) inp_update_policy(sotoinpcb(so));
}

boolean_t
so_cache_timer(void)
{
    struct socket *p;
    int n_freed = 0;
    boolean_t rc = FALSE;

    lck_mtx_lock(so_cache_mtx);
    so_cache_timeouts++;
    so_cache_time = net_uptime();

    while (!STAILQ_EMPTY(&so_cache_head)) {
        VERIFY(cached_sock_count > 0);
        p = STAILQ_FIRST(&so_cache_head);
        if ((so_cache_time - p->cache_timestamp) <
            SO_CACHE_TIME_LIMIT)
            break;

        STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
        --cached_sock_count;

        zfree(so_cache_zone, p);

        if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
            so_cache_max_freed++;
            break;
        }
    }

    /* Schedule again if there is more to cleanup */
    if (!STAILQ_EMPTY(&so_cache_head))
        rc = TRUE;

    lck_mtx_unlock(so_cache_mtx);
    return (rc);
}

/*
 * Get a socket structure from our zone, and initialize it.
 * We don't implement `waitok' yet (see comments in uipc_domain.c).
 * Note that it would probably be better to allocate socket
 * and PCB at the same time, but I'm not convinced that all
 * the protocols can be easily modified to do this.
 */
struct socket *
soalloc(int waitok, int dom, int type)
{
    struct socket *so;

    if ((dom == PF_INET) && (type == SOCK_STREAM)) {
        cached_sock_alloc(&so, waitok);
    } else {
        MALLOC_ZONE(so, struct socket *, sizeof (*so), socket_zone,
            M_WAITOK);
        if (so != NULL)
            bzero(so, sizeof (*so));
    }
    if (so != NULL) {
        so->so_gencnt = ++so_gencnt;
        so->so_zone = socket_zone;
#if CONFIG_MACF_SOCKET
        /* Convert waitok to M_WAITOK/M_NOWAIT for MAC Framework. */
        if (mac_socket_label_init(so, !waitok) != 0) {
            sodealloc(so);
            return (NULL);
        }
#endif /* MAC_SOCKET */
    }

    return (so);
}

int
socreate_internal(int dom, struct socket **aso, int type, int proto,
    struct proc *p, uint32_t flags, struct proc *ep)
{
    struct protosw *prp;
    struct socket *so;
    int error = 0;

#if TCPDEBUG
    extern int tcpconsdebug;
#endif

    VERIFY(aso != NULL);
    *aso = NULL;

    if (proto != 0)
        prp = pffindproto(dom, proto, type);
    else
        prp = pffindtype(dom, type);

    if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) {
        if (pffinddomain(dom) == NULL)
            return (EAFNOSUPPORT);
        if (proto != 0) {
            if (pffindprotonotype(dom, proto) != NULL)
                return (EPROTOTYPE);
        }
        return (EPROTONOSUPPORT);
    }
    if (prp->pr_type != type)
        return (EPROTOTYPE);
    so = soalloc(1, dom, type);
    if (so == NULL)
        return (ENOBUFS);

    if (flags & SOCF_ASYNC)
        so->so_state |= SS_NBIO;
#if MULTIPATH
    if (flags & SOCF_MP_SUBFLOW) {
        /*
         * A multipath subflow socket is used internally in the kernel,
         * therefore it does not have a file descriptor associated by
         * default.
         */
        so->so_state |= SS_NOFDREF;
        so->so_flags |= SOF_MP_SUBFLOW;
    }
#endif /* MULTIPATH */

    TAILQ_INIT(&so->so_incomp);
    TAILQ_INIT(&so->so_comp);
    so->so_type = type;
    so->last_upid = proc_uniqueid(p);
    so->last_pid = proc_pid(p);
    proc_getexecutableuuid(p, so->last_uuid, sizeof (so->last_uuid));

    if (ep != PROC_NULL && ep != p) {
        so->e_upid = proc_uniqueid(ep);
        so->e_pid = proc_pid(ep);
        proc_getexecutableuuid(ep, so->e_uuid, sizeof (so->e_uuid));
        so->so_flags |= SOF_DELEGATED;
    }

    so->so_cred = kauth_cred_proc_ref(p);
    if (!suser(kauth_cred_get(), NULL))
        so->so_state |= SS_PRIV;

    so->so_proto = prp;
    so->so_rcv.sb_flags |= SB_RECV;
    so->so_rcv.sb_so = so->so_snd.sb_so = so;
    so->next_lock_lr = 0;
    so->next_unlock_lr = 0;

#if CONFIG_MACF_SOCKET
    mac_socket_label_associate(kauth_cred_get(), so);
#endif /* MAC_SOCKET */

    /*
     * Attachment will create the per-pcb lock if necessary and
     * increase the refcount for creation; make sure it's done
     * before the socket is inserted in the lists.
     */
    so->so_usecount++;

    error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
    if (error != 0) {
        /*
         * Warning:
         * If so_pcb is not zero, the socket will be leaked,
         * so the protocol attachment handler must be coded
         * carefully.
         */
        so->so_state |= SS_NOFDREF;
        so->so_usecount--;
        sofreelastref(so, 1);	/* will deallocate the socket */
        return (error);
    }

    atomic_add_32(&prp->pr_domain->dom_refs, 1);
    TAILQ_INIT(&so->so_evlist);

    /* Attach socket filters for this protocol */
    sflt_initsock(so);
#if TCPDEBUG
    if (tcpconsdebug == 2)
        so->so_options |= SO_DEBUG;
#endif
    so_set_default_traffic_class(so);

    /*
     * If this thread or task is marked to create backgrounded sockets,
     * mark the socket as background.
     */
    if (proc_get_effective_thread_policy(current_thread(),
        TASK_POLICY_NEW_SOCKETS_BG)) {
        socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
        so->so_background_thread = current_thread();
    }

    switch (dom) {
    /*
     * Don't mark Unix domain, system or multipath sockets as
     * eligible for defunct by default.
     */
    case PF_LOCAL:
    case PF_SYSTEM:
    case PF_MULTIPATH:
        so->so_flags |= SOF_NODEFUNCT;
        break;
    default:
        break;
    }

    *aso = so;

    return (0);
}

/*
 * Returns:	0		Success
 *		EAFNOSUPPORT
 *		EPROTOTYPE
 *		EPROTONOSUPPORT
 *		ENOBUFS
 *	<pru_attach>:ENOBUFS[AF_UNIX]
 *	<pru_attach>:ENOBUFS[TCP]
 *	<pru_attach>:ENOMEM[TCP]
 *	<pru_attach>:???	[other protocol families, IPSEC]
 */
int
socreate(int dom, struct socket **aso, int type, int proto)
{
    return (socreate_internal(dom, aso, type, proto, current_proc(), 0,
        PROC_NULL));
}
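
/*
 * Illustrative in-kernel usage of socreate() (a sketch, not code from
 * this file): create a kernel-owned TCP socket and release it.
 *
 *	struct socket *so = NULL;
 *	int error = socreate(PF_INET, &so, SOCK_STREAM, IPPROTO_TCP);
 *	if (error == 0) {
 *		... use the socket, e.g. soconnectlock() ...
 *		(void) soclose(so);
 *	}
 *
 * External kernel extensions normally go through the sock_socket() KPI
 * instead, which funnels into socreate_internal() as well.
 */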

int
socreate_delegate(int dom, struct socket **aso, int type, int proto,
    pid_t epid)
{
    int error = 0;
    struct proc *ep = PROC_NULL;

    if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) {
        error = ESRCH;
        goto done;
    }

    error = socreate_internal(dom, aso, type, proto, current_proc(), 0, ep);

    /*
     * It might not be wise to hold the proc reference when calling
     * socreate_internal since it calls soalloc with M_WAITOK
     */
done:
    if (ep != PROC_NULL)
        proc_rele(ep);

    return (error);
}

/*
 * Returns:	0			Success
 *	<pru_bind>:EINVAL		Invalid argument [COMMON_START]
 *	<pru_bind>:EAFNOSUPPORT		Address family not supported
 *	<pru_bind>:EADDRNOTAVAIL	Address not available.
 *	<pru_bind>:EINVAL		Invalid argument
 *	<pru_bind>:EAFNOSUPPORT		Address family not supported [notdef]
 *	<pru_bind>:EACCES		Permission denied
 *	<pru_bind>:EADDRINUSE		Address in use
 *	<pru_bind>:EAGAIN		Resource unavailable, try again
 *	<pru_bind>:EPERM		Operation not permitted
 *	<pru_bind>:???
 *	<sf_bind>:???
 *
 * Notes:	It's not possible to fully enumerate the return codes above,
 *		since socket filter authors and protocol family authors may
 *		not choose to limit their error returns to those listed, even
 *		though this may result in some software operating incorrectly.
 *
 *		The error codes which are enumerated above are those known to
 *		be returned by the tcp_usr_bind function supplied.
 */
int
sobindlock(struct socket *so, struct sockaddr *nam, int dolock)
{
    struct proc *p = current_proc();
    int error = 0;

    if (dolock)
        socket_lock(so, 1);
    VERIFY(so->so_usecount > 1);

    so_update_last_owner_locked(so, p);
    so_update_policy(so);

    /*
     * If this is a bind request on a socket that has been marked
     * as inactive, reject it now before we go any further.
     */
    if (so->so_flags & SOF_DEFUNCT) {
        error = EINVAL;
        SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n",
            __func__, proc_pid(p), (uint64_t)VM_KERNEL_ADDRPERM(so),
            SOCK_DOM(so), SOCK_TYPE(so), error));
        goto out;
    }

    /* Socket filter */
    error = sflt_bind(so, nam);

    if (error == 0)
        error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
out:
    if (dolock)
        socket_unlock(so, 1);

    if (error == EJUSTRETURN)
        error = 0;

    return (error);
}

void
sodealloc(struct socket *so)
{
    kauth_cred_unref(&so->so_cred);

    /* Remove any filters */
    sflt_termsock(so);

    /* Delete the state allocated for msg queues on a socket */
    if (so->so_flags & SOF_ENABLE_MSGS) {
        FREE(so->so_msg_state, M_TEMP);
        so->so_msg_state = NULL;
    }
    VERIFY(so->so_msg_state == NULL);

    so->so_gencnt = ++so_gencnt;

#if CONFIG_MACF_SOCKET
    mac_socket_label_destroy(so);
#endif /* MAC_SOCKET */

    if (so->cached_in_sock_layer) {
        cached_sock_free(so);
    } else {
        FREE_ZONE(so, sizeof (*so), so->so_zone);
    }
}

/*
 * Returns:	0			Success
 *		EINVAL
 *		EOPNOTSUPP
 *	<pru_listen>:EINVAL[AF_UNIX]
 *	<pru_listen>:EINVAL[TCP]
 *	<pru_listen>:EADDRNOTAVAIL[TCP]	Address not available.
 *	<pru_listen>:EINVAL[TCP]	Invalid argument
 *	<pru_listen>:EAFNOSUPPORT[TCP]	Address family not supported [notdef]
 *	<pru_listen>:EACCES[TCP]	Permission denied
 *	<pru_listen>:EADDRINUSE[TCP]	Address in use
 *	<pru_listen>:EAGAIN[TCP]	Resource unavailable, try again
 *	<pru_listen>:EPERM[TCP]		Operation not permitted
 *	<sf_listen>:???
 *
 * Notes:	Other <pru_listen> returns depend on the protocol family; all
 *		<sf_listen> returns depend on what the filter author causes
 *		their filter to return.
 */
int
solisten(struct socket *so, int backlog)
{
    struct proc *p = current_proc();
    int error = 0;

    socket_lock(so, 1);

    so_update_last_owner_locked(so, p);
    so_update_policy(so);

    if (so->so_proto == NULL) {
        error = EINVAL;
        goto out;
    }
    if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
        error = EOPNOTSUPP;
        goto out;
    }

    /*
     * If the listen request is made on a socket that is not fully
     * disconnected, or on a socket that has been marked as inactive,
     * reject the request now.
     */
    if ((so->so_state &
        (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) ||
        (so->so_flags & SOF_DEFUNCT)) {
        error = EINVAL;
        if (so->so_flags & SOF_DEFUNCT) {
            SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] "
                "(%d)\n", __func__, proc_pid(p),
                (uint64_t)VM_KERNEL_ADDRPERM(so),
                SOCK_DOM(so), SOCK_TYPE(so), error));
        }
        goto out;
    }

    if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) {
        error = EPERM;
        goto out;
    }

    error = sflt_listen(so);
    if (error == 0)
        error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);

    if (error) {
        if (error == EJUSTRETURN)
            error = 0;
        goto out;
    }

    if (TAILQ_EMPTY(&so->so_comp))
        so->so_options |= SO_ACCEPTCONN;
    /*
     * POSIX: The implementation may have an upper limit on the length of
     * the listen queue-either global or per accepting socket. If backlog
     * exceeds this limit, the length of the listen queue is set to the
     * limit.
     *
     * If listen() is called with a backlog argument value that is less
     * than 0, the function behaves as if it had been called with a backlog
     * argument value of 0.
     *
     * A backlog argument of 0 may allow the socket to accept connections,
     * in which case the length of the listen queue may be set to an
     * implementation-defined minimum value.
     */
    if (backlog <= 0 || backlog > somaxconn)
        backlog = somaxconn;

    so->so_qlimit = backlog;
out:
    socket_unlock(so, 1);
    return (error);
}
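
/*
 * Net effect of the backlog clamp above (illustrative), assuming the
 * default kern.ipc.somaxconn of 128 (SOMAXCONN):
 *
 *	listen(s, 5);		-> so_qlimit = 5
 *	listen(s, 4096);	-> so_qlimit = 128 (clamped to somaxconn)
 *	listen(s, 0);		-> so_qlimit = 128 (<= 0 maps to somaxconn)
 *
 * i.e. unlike the letter of the POSIX text quoted above, a zero or
 * negative backlog is promoted to the system-wide maximum rather than
 * to an implementation-defined minimum.
 */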

void
sofreelastref(struct socket *so, int dealloc)
{
    struct socket *head = so->so_head;

    /* Assume socket is locked */

    if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) {
        selthreadclear(&so->so_snd.sb_sel);
        selthreadclear(&so->so_rcv.sb_sel);
        so->so_rcv.sb_flags &= ~(SB_SEL|SB_UPCALL);
        so->so_snd.sb_flags &= ~(SB_SEL|SB_UPCALL);
        so->so_event = NULL;
        return;
    }
    if (head != NULL) {
        socket_lock(head, 1);
        if (so->so_state & SS_INCOMP) {
            TAILQ_REMOVE(&head->so_incomp, so, so_list);
            head->so_incqlen--;
        } else if (so->so_state & SS_COMP) {
            /*
             * We must not decommission a socket that's
             * on the accept(2) queue. If we do, then
             * accept(2) may hang after select(2) indicated
             * that the listening socket was ready.
             */
            selthreadclear(&so->so_snd.sb_sel);
            selthreadclear(&so->so_rcv.sb_sel);
            so->so_rcv.sb_flags &= ~(SB_SEL|SB_UPCALL);
            so->so_snd.sb_flags &= ~(SB_SEL|SB_UPCALL);
            so->so_event = NULL;
            socket_unlock(head, 1);
            return;
        } else {
            panic("sofree: not queued");
        }
        head->so_qlen--;
        so->so_state &= ~SS_INCOMP;
        so->so_head = NULL;
        socket_unlock(head, 1);
    }
    sowflush(so);
    sorflush(so);

#if FLOW_DIVERT
    if (so->so_flags & SOF_FLOW_DIVERT) {
        flow_divert_detach(so);
    }
#endif /* FLOW_DIVERT */

    /* 3932268: disable upcall */
    so->so_rcv.sb_flags &= ~SB_UPCALL;
    so->so_snd.sb_flags &= ~SB_UPCALL;
    so->so_event = NULL;

    if (dealloc)
        sodealloc(so);
}

void
soclose_wait_locked(struct socket *so)
{
    lck_mtx_t *mutex_held;

    if (so->so_proto->pr_getlock != NULL)
        mutex_held = (*so->so_proto->pr_getlock)(so, 0);
    else
        mutex_held = so->so_proto->pr_domain->dom_mtx;
    lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);

    /*
     * Double check here and return if there's no outstanding upcall;
     * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
     */
    if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT))
        return;
    so->so_rcv.sb_flags &= ~SB_UPCALL;
    so->so_snd.sb_flags &= ~SB_UPCALL;
    so->so_flags |= SOF_CLOSEWAIT;
    (void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - 1),
        "soclose_wait_locked", NULL);
    lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
    so->so_flags &= ~SOF_CLOSEWAIT;
}

/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 */
int
soclose_locked(struct socket *so)
{
    int error = 0;
    lck_mtx_t *mutex_held;
    struct timespec ts;

    if (so->so_usecount == 0) {
        panic("soclose: so=%p refcount=0\n", so);
        /* NOTREACHED */
    }

    sflt_notify(so, sock_evt_closing, NULL);

    if (so->so_upcallusecount)
        soclose_wait_locked(so);

    if ((so->so_options & SO_ACCEPTCONN)) {
        struct socket *sp, *sonext;
        int socklock = 0;

        /*
         * We do not want new connections to be added
         * to the connection queues.
         */
        so->so_options &= ~SO_ACCEPTCONN;

        for (sp = TAILQ_FIRST(&so->so_incomp);
            sp != NULL; sp = sonext) {
            sonext = TAILQ_NEXT(sp, so_list);

            /*
             * Radar 5350314
             * Skip sockets thrown away by tcpdropdropblreq;
             * they will get cleaned up by the garbage
             * collection. Otherwise, remove the incomp socket
             * from the queue and let soabort trigger the
             * appropriate cleanup.
             */
            if (sp->so_flags & SOF_OVERFLOW)
                continue;

            if (so->so_proto->pr_getlock != NULL) {
                /*
                 * Lock ordering for consistency with the
                 * rest of the stack: we lock the socket
                 * first and then grab the head.
                 */
                socket_unlock(so, 0);
                socket_lock(sp, 1);
                socket_lock(so, 0);
                socklock = 1;
            }

            TAILQ_REMOVE(&so->so_incomp, sp, so_list);
            so->so_incqlen--;

            if (sp->so_state & SS_INCOMP) {
                sp->so_state &= ~SS_INCOMP;
                sp->so_head = NULL;

                (void) soabort(sp);
            }

            if (socklock)
                socket_unlock(sp, 1);
        }

        while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
            /* Dequeue from so_comp since sofree() won't do it */
            TAILQ_REMOVE(&so->so_comp, sp, so_list);
            so->so_qlen--;

            if (so->so_proto->pr_getlock != NULL) {
                socket_unlock(so, 0);
                socket_lock(sp, 1);
            }

            if (sp->so_state & SS_COMP) {
                sp->so_state &= ~SS_COMP;
                sp->so_head = NULL;

                (void) soabort(sp);
            }

            if (so->so_proto->pr_getlock != NULL) {
                socket_unlock(sp, 1);
                socket_lock(so, 0);
            }
        }
    }
    if (so->so_pcb == NULL) {
        /* 3915887: mark the socket as ready for dealloc */
        so->so_flags |= SOF_PCBCLEARING;
        goto discard;
    }
    if (so->so_state & SS_ISCONNECTED) {
        if ((so->so_state & SS_ISDISCONNECTING) == 0) {
            error = sodisconnectlocked(so);
            if (error)
                goto drop;
        }
        if (so->so_options & SO_LINGER) {
            if ((so->so_state & SS_ISDISCONNECTING) &&
                (so->so_state & SS_NBIO))
                goto drop;
            if (so->so_proto->pr_getlock != NULL)
                mutex_held = (*so->so_proto->pr_getlock)(so, 0);
            else
                mutex_held = so->so_proto->pr_domain->dom_mtx;
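            /*
             * Note (added for clarity): so_linger is kept in clock
             * ticks (hz, nominally 100/sec), so the conversion below
             * yields whole seconds plus a 10ms-granularity nanosecond
             * remainder for the msleep() deadline.
             */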
            while (so->so_state & SS_ISCONNECTED) {
                ts.tv_sec = (so->so_linger/100);
                ts.tv_nsec = (so->so_linger % 100) *
                    NSEC_PER_USEC * 1000 * 10;
                error = msleep((caddr_t)&so->so_timeo,
                    mutex_held, PSOCK | PCATCH, "soclose", &ts);
                if (error) {
                    /*
                     * It's OK if the timer fires;
                     * don't report an error.
                     */
                    if (error == EWOULDBLOCK)
                        error = 0;
                    break;
                }
            }
        }
    }
drop:
    if (so->so_usecount == 0) {
        panic("soclose: usecount is zero so=%p\n", so);
        /* NOTREACHED */
    }
    if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) {
        /*
         * Let NetworkStatistics know this PCB is going away
         * before we detach it.
         */
        if (nstat_collect &&
            (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6))
            nstat_pcb_detach(so->so_pcb);

        int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
        if (error == 0)
            error = error2;
    }
    if (so->so_usecount <= 0) {
        panic("soclose: usecount is zero so=%p\n", so);
        /* NOTREACHED */
    }
discard:
    if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
        (so->so_state & SS_NOFDREF)) {
        panic("soclose: NOFDREF");
        /* NOTREACHED */
    }
    so->so_state |= SS_NOFDREF;

    if (so->so_flags & SOF_MP_SUBFLOW)
        so->so_flags &= ~SOF_MP_SUBFLOW;

    if ((so->so_flags & SOF_KNOTE) != 0)
        KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);

    atomic_add_32(&so->so_proto->pr_domain->dom_refs, -1);
    evsofree(so);

    so->so_usecount--;
    sofree(so);
    return (error);
}

int
soclose(struct socket *so)
{
    int error = 0;
    socket_lock(so, 1);

    if (so->so_retaincnt == 0) {
        error = soclose_locked(so);
    } else {
        /*
         * if the FD is going away, but socket is
         * retained in kernel remove its reference
         */
        so->so_usecount--;
        if (so->so_usecount < 2)
            panic("soclose: retaincnt non null and so=%p "
                "usecount=%d\n", so, so->so_usecount);
    }
    socket_unlock(so, 1);
    return (error);
}

/*
 * Must be called at splnet...
 */
/* Should already be locked */
int
soabort(struct socket *so)
{
    int error;

#ifdef MORE_LOCKING_DEBUG
    lck_mtx_t *mutex_held;

    if (so->so_proto->pr_getlock != NULL)
        mutex_held = (*so->so_proto->pr_getlock)(so, 0);
    else
        mutex_held = so->so_proto->pr_domain->dom_mtx;
    lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
#endif

    if ((so->so_flags & SOF_ABORTED) == 0) {
        so->so_flags |= SOF_ABORTED;
        error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
        if (error) {
            sofree(so);
            return (error);
        }
    }
    return (0);
}
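
/*
 * Note (added for clarity): the SOF_ABORTED flag makes soabort()
 * idempotent -- once set, the protocol's pru_abort handler will not be
 * issued a second time for the same socket, regardless of which teardown
 * path in soclose_locked() gets here first.
 */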

int
soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
{
    int error;

    if (dolock)
        socket_lock(so, 1);

    so_update_last_owner_locked(so, PROC_NULL);
    so_update_policy(so);

    if ((so->so_state & SS_NOFDREF) == 0)
        panic("soaccept: !NOFDREF");
    so->so_state &= ~SS_NOFDREF;
    error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);

    if (dolock)
        socket_unlock(so, 1);
    return (error);
}

int
soaccept(struct socket *so, struct sockaddr **nam)
{
    return (soacceptlock(so, nam, 1));
}

int
soacceptfilter(struct socket *so)
{
    struct sockaddr *local = NULL, *remote = NULL;
    int error = 0;
    struct socket *head = so->so_head;

    /*
     * Hold the lock even if this socket has not been made visible
     * to the filter(s). For sockets with global locks, this protects
     * against the head or peer going away.
     */
    socket_lock(so, 1);
    if (sogetaddr_locked(so, &remote, 1) != 0 ||
        sogetaddr_locked(so, &local, 0) != 0) {
        so->so_state &= ~(SS_NOFDREF | SS_COMP);
        so->so_head = NULL;
        socket_unlock(so, 1);
        soclose(so);
        /* Out of resources; try it again next time */
        error = ECONNABORTED;
        goto done;
    }

    error = sflt_accept(head, so, local, remote);

    /*
     * If we get EJUSTRETURN from one of the filters, mark this socket
     * as inactive and return it anyway. This newly accepted socket
     * will be disconnected later before we hand it off to the caller.
     */
    if (error == EJUSTRETURN) {
        error = 0;
        (void) sosetdefunct(current_proc(), so,
            SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
    }

    if (error != 0) {
        /*
         * This may seem like a duplication to the above error
         * handling part when we return ECONNABORTED, except
         * the following is done while holding the lock since
         * the socket has been exposed to the filter(s) earlier.
         */
        so->so_state &= ~(SS_NOFDREF | SS_COMP);
        so->so_head = NULL;
        socket_unlock(so, 1);
        soclose(so);
        /* Propagate socket filter's error code to the caller */
    } else {
        socket_unlock(so, 1);
    }
done:
    /* Callee checks for NULL pointer */
    sock_freeaddr(remote);
    sock_freeaddr(local);
    return (error);
}

/*
 * Returns:	0			Success
 *		EOPNOTSUPP		Operation not supported on socket
 *		EISCONN			Socket is connected
 *	<pru_connect>:EADDRNOTAVAIL	Address not available.
 *	<pru_connect>:EINVAL		Invalid argument
 *	<pru_connect>:EAFNOSUPPORT	Address family not supported [notdef]
 *	<pru_connect>:EACCES		Permission denied
 *	<pru_connect>:EADDRINUSE	Address in use
 *	<pru_connect>:EAGAIN		Resource unavailable, try again
 *	<pru_connect>:EPERM		Operation not permitted
 *	<sf_connect_out>:???		[anything a filter writer might set]
 */
int
soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
{
    int error;
    struct proc *p = current_proc();

    if (dolock)
        socket_lock(so, 1);

    so_update_last_owner_locked(so, p);
    so_update_policy(so);

    /*
     * If this is a listening socket or if this is a previously-accepted
     * socket that has been marked as inactive, reject the connect request.
     */
    if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
        error = EOPNOTSUPP;
        if (so->so_flags & SOF_DEFUNCT) {
            SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] "
                "(%d)\n", __func__, proc_pid(p),
                (uint64_t)VM_KERNEL_ADDRPERM(so),
                SOCK_DOM(so), SOCK_TYPE(so), error));
        }
        if (dolock)
            socket_unlock(so, 1);
        return (error);
    }

    if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
        if (dolock)
            socket_unlock(so, 1);
        return (EPERM);
    }

    /*
     * If protocol is connection-based, can only connect once.
     * Otherwise, if connected, try to disconnect first.
     * This allows user to disconnect by connecting to, e.g.,
     * a null address.
     */
    if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
        ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
        (error = sodisconnectlocked(so)))) {
        error = EISCONN;
    } else {
        /*
         * Run connect filter before calling protocol:
         * - non-blocking connect returns before completion;
         */
        error = sflt_connectout(so, nam);
        if (error != 0) {
            if (error == EJUSTRETURN)
                error = 0;
        } else {
            error = (*so->so_proto->pr_usrreqs->pru_connect)
                (so, nam, p);
        }
    }
    if (dolock)
        socket_unlock(so, 1);
    return (error);
}

int
soconnect(struct socket *so, struct sockaddr *nam)
{
    return (soconnectlock(so, nam, 1));
}
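
/*
 * Illustrative user-space counterpart (a sketch, not code from this file)
 * of the "disconnect by connecting to a null address" behavior described
 * in soconnectlock() above, for a connected datagram socket s:
 *
 *	struct sockaddr sa;
 *	bzero(&sa, sizeof (sa));
 *	sa.sa_len = sizeof (sa);
 *	sa.sa_family = AF_UNSPEC;
 *	(void) connect(s, &sa, sizeof (sa));	-- dissolves the association
 */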

/*
 * Returns:	0			Success
 *	<pru_connect2>:EINVAL[AF_UNIX]
 *	<pru_connect2>:EPROTOTYPE[AF_UNIX]
 *	<pru_connect2>:???		[other protocol families]
 *
 * Notes:	<pru_connect2> is not supported by [TCP].
 */
int
soconnect2(struct socket *so1, struct socket *so2)
{
    int error;

    socket_lock(so1, 1);
    if (so2->so_proto->pr_lock)
        socket_lock(so2, 1);

    error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);

    socket_unlock(so1, 1);
    if (so2->so_proto->pr_lock)
        socket_unlock(so2, 1);
    return (error);
}

int
soconnectxlocked(struct socket *so, struct sockaddr_list **src_sl,
    struct sockaddr_list **dst_sl, struct proc *p, uint32_t ifscope,
    associd_t aid, connid_t *pcid, uint32_t flags, void *arg,
    uint32_t arglen)
{
    int error;

    /*
     * If this is a listening socket or if this is a previously-accepted
     * socket that has been marked as inactive, reject the connect request.
     */
    if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
        error = EOPNOTSUPP;
        if (so->so_flags & SOF_DEFUNCT) {
            SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] "
                "(%d)\n", __func__, proc_pid(p),
                (uint64_t)VM_KERNEL_ADDRPERM(so),
                SOCK_DOM(so), SOCK_TYPE(so), error));
        }
        return (error);
    }

    if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0)
        return (EPERM);

    /*
     * If protocol is connection-based, can only connect once
     * unless PR_MULTICONN is set. Otherwise, if connected,
     * try to disconnect first. This allows user to disconnect
     * by connecting to, e.g., a null address.
     */
    if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) &&
        !(so->so_proto->pr_flags & PR_MULTICONN) &&
        ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
        (error = sodisconnectlocked(so)) != 0)) {
        error = EISCONN;
    } else {
        /*
         * Run connect filter before calling protocol:
         * - non-blocking connect returns before completion;
         */
        error = sflt_connectxout(so, dst_sl);
        if (error != 0) {
            if (error == EJUSTRETURN)
                error = 0;
        } else {
            error = (*so->so_proto->pr_usrreqs->pru_connectx)
                (so, src_sl, dst_sl, p, ifscope, aid, pcid,
                flags, arg, arglen);
        }
    }

    return (error);
}

int
sodisconnectlocked(struct socket *so)
{
    int error;

    if ((so->so_state & SS_ISCONNECTED) == 0) {
        error = ENOTCONN;
        goto bad;
    }
    if (so->so_state & SS_ISDISCONNECTING) {
        error = EALREADY;
        goto bad;
    }

    error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
    if (error == 0)
        sflt_notify(so, sock_evt_disconnected, NULL);

bad:
    return (error);
}

/* Locking version */
int
sodisconnect(struct socket *so)
{
    int error;

    socket_lock(so, 1);
    error = sodisconnectlocked(so);
    socket_unlock(so, 1);
    return (error);
}

int
sodisconnectxlocked(struct socket *so, associd_t aid, connid_t cid)
{
    int error;

    /*
     * Call the protocol disconnectx handler; let it handle all
     * matters related to the connection state of this session.
     */
    error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid);
    if (error == 0) {
        /*
         * The event applies only for the session, not for
         * the disconnection of individual subflows.
         */
        if (so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED))
            sflt_notify(so, sock_evt_disconnected, NULL);
    }
    return (error);
}

int
sodisconnectx(struct socket *so, associd_t aid, connid_t cid)
{
    int error;

    socket_lock(so, 1);
    error = sodisconnectxlocked(so, aid, cid);
    socket_unlock(so, 1);
    return (error);
}

int
sopeelofflocked(struct socket *so, associd_t aid, struct socket **psop)
{
    return ((*so->so_proto->pr_usrreqs->pru_peeloff)(so, aid, psop));
}

#define SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)

/*
 * sosendcheck will lock the socket buffer if it isn't locked and
 * verify that there is space for the data being inserted.
 *
 * Returns:	0		Success
 *		EPIPE
 *	sblock:EWOULDBLOCK
 *	sblock:EINTR
 *	sbwait:EBADF
 *	sbwait:EINTR
 *	[so_error]:???
 */
int
sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid,
    int32_t clen, int32_t atomic, int flags, int *sblocked,
    struct mbuf *control)
{
    int error = 0;
    int32_t space;
    int assumelock = 0;

restart:
    if (*sblocked == 0) {
        if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
            so->so_send_filt_thread != 0 &&
            so->so_send_filt_thread == current_thread()) {
            /*
             * We're being called recursively from a filter,
             * allow this to continue. Radar 4150520.
             * Don't set sblocked because we don't want
             * to perform an unlock later.
             */
            assumelock = 1;
        } else {
            error = sblock(&so->so_snd, SBLOCKWAIT(flags));
            if (error) {
                if (so->so_flags & SOF_DEFUNCT)
                    goto defunct;
                return (error);
            }
            *sblocked = 1;
        }
    }

    /*
     * If a send attempt is made on a socket that has been marked
     * as inactive (disconnected), reject the request.
     */
    if (so->so_flags & SOF_DEFUNCT) {
defunct:
        error = EPIPE;
        SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n",
            __func__, proc_selfpid(), (uint64_t)VM_KERNEL_ADDRPERM(so),
            SOCK_DOM(so), SOCK_TYPE(so), error));
        return (error);
    }

    if (so->so_state & SS_CANTSENDMORE)
        return (EPIPE);

    if (so->so_error) {
        error = so->so_error;
        so->so_error = 0;
        return (error);
    }

    if ((so->so_state & SS_ISCONNECTED) == 0) {
        if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
            if ((so->so_state & SS_ISCONFIRMING) == 0 &&
                !(resid == 0 && clen != 0))
                return (ENOTCONN);
        } else if (addr == 0 && !(flags&MSG_HOLD)) {
            return ((so->so_proto->pr_flags & PR_CONNREQUIRED) ?
                ENOTCONN : EDESTADDRREQ);
        }
    }
    if (so->so_flags & SOF_ENABLE_MSGS)
        space = msgq_sbspace(so, control);
    else
        space = sbspace(&so->so_snd);

    if (flags & MSG_OOB)
        space += 1024;
    if ((atomic && resid > so->so_snd.sb_hiwat) ||
        clen > so->so_snd.sb_hiwat)
        return (EMSGSIZE);

    if ((space < resid + clen &&
        (atomic || space < (int32_t)so->so_snd.sb_lowat ||
        space < clen)) ||
        (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
        if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
            assumelock) {
            return (EWOULDBLOCK);
        }
        sbunlock(&so->so_snd, TRUE);	/* keep socket locked */
        *sblocked = 0;
        error = sbwait(&so->so_snd);
        if (error) {
            if (so->so_flags & SOF_DEFUNCT)
                goto defunct;
            return (error);
        }
        goto restart;
    }
    return (0);
}
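
/*
 * Example of the EMSGSIZE check in sosendcheck() (illustrative): on an
 * atomic (datagram) socket, a single send larger than the send buffer
 * high water mark can never be satisfied, so it fails immediately rather
 * than blocking; e.g. a 64KB UDP sendto() against the default 9216-byte
 * so_snd.sb_hiwat returns EMSGSIZE unless SO_SNDBUF is raised first.
 * (The 9216-byte UDP default is an assumption about the wider stack,
 * not something established in this file.)
 */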

/*
 * Send on a socket.
 * If send must go all at once and message is larger than
 * send buffering, then hard error.
 * Lock against other senders.
 * If must go all at once and not enough room now, then
 * inform user that this would block and do nothing.
 * Otherwise, if nonblocking, send as much as possible.
 * The data to be sent is described by "uio" if nonzero,
 * otherwise by the mbuf chain "top" (which must be null
 * if uio is not). Data provided in mbuf chain must be small
 * enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers
 * must check for short counts if EINTR/ERESTART are returned.
 * Data and control buffers are freed on return.
 * Experiment:
 * MSG_HOLD: go thru most of sosend(), but just enqueue the mbuf
 * MSG_SEND: go thru as for MSG_HOLD on current fragment, then
 *	point at the mbuf chain being constructed and go from there.
 *
 * Returns:	0			Success
 *		EOPNOTSUPP
 *		EINVAL
 *		ENOBUFS
 *	uiomove:EFAULT
 *	sosendcheck:EPIPE
 *	sosendcheck:EWOULDBLOCK
 *	sosendcheck:EINTR
 *	sosendcheck:EBADF
 *	sosendcheck:EINTR
 *	sosendcheck:???			[value from so_error]
 *	<pru_send>:ECONNRESET[TCP]
 *	<pru_send>:EINVAL[TCP]
 *	<pru_send>:ENOBUFS[TCP]
 *	<pru_send>:EADDRINUSE[TCP]
 *	<pru_send>:EADDRNOTAVAIL[TCP]
 *	<pru_send>:EAFNOSUPPORT[TCP]
 *	<pru_send>:EACCES[TCP]
 *	<pru_send>:EAGAIN[TCP]
 *	<pru_send>:EPERM[TCP]
 *	<pru_send>:EMSGSIZE[TCP]
 *	<pru_send>:EHOSTUNREACH[TCP]
 *	<pru_send>:ENETUNREACH[TCP]
 *	<pru_send>:ENETDOWN[TCP]
 *	<pru_send>:ENOMEM[TCP]
 *	<pru_send>:ENOBUFS[TCP]
 *	<pru_send>:???[TCP]		[ignorable: mostly IPSEC/firewall/DLIL]
 *	<pru_send>:EINVAL[AF_UNIX]
 *	<pru_send>:EOPNOTSUPP[AF_UNIX]
 *	<pru_send>:EPIPE[AF_UNIX]
 *	<pru_send>:ENOTCONN[AF_UNIX]
 *	<pru_send>:EISCONN[AF_UNIX]
 *	<pru_send>:???[AF_UNIX]		[whatever a filter author chooses]
 *	<sf_data_out>:???		[whatever a filter author chooses]
 *
 * Notes:	Other <pru_send> returns depend on the protocol family; all
 *		<sf_data_out> returns depend on what the filter author causes
 *		their filter to return.
 */
int
sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags)
{
    struct mbuf **mp;
    struct mbuf *m, *freelist = NULL;
    user_ssize_t space, len, resid;
    int clen = 0, error, dontroute, mlen, sendflags;
    int atomic = sosendallatonce(so) || top;
    int sblocked = 0;
    struct proc *p = current_proc();
    struct mbuf *control_copy = NULL;

    if (uio != NULL)
        resid = uio_resid(uio);
    else
        resid = top->m_pkthdr.len;

    KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
        so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);

    socket_lock(so, 1);
    so_update_last_owner_locked(so, p);
    so_update_policy(so);

    if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
        error = EOPNOTSUPP;
        socket_unlock(so, 1);
        goto out;
    }

    /*
     * In theory resid should be unsigned.
     * However, space must be signed, as it might be less than 0
     * if we over-committed, and we must use a signed comparison
     * of space and resid. On the other hand, a negative resid
     * causes us to loop sending 0-length segments to the protocol.
     *
     * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
     * But it will be used by sockets doing message delivery.
     *
     * Note: We limit resid to be a positive 32 bits value as we use
     * imin() to set bytes_to_copy -- radr://14558484
     */
    if ((int32_t)resid < 0 || (so->so_type == SOCK_STREAM &&
        !(so->so_flags & SOF_ENABLE_MSGS) && (flags & MSG_EOR))) {
        error = EINVAL;
        socket_unlock(so, 1);
        goto out;
    }

    dontroute = (flags & MSG_DONTROUTE) &&
        (so->so_options & SO_DONTROUTE) == 0 &&
        (so->so_proto->pr_flags & PR_ATOMIC);
    OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);

    if (control != NULL)
        clen = control->m_len;

    do {
        error = sosendcheck(so, addr, resid, clen, atomic, flags,
            &sblocked, control);
        if (error)
            goto release;

        mp = &top;
        if (so->so_flags & SOF_ENABLE_MSGS)
            space = msgq_sbspace(so, control);
        else
            space = sbspace(&so->so_snd) - clen;
        space += ((flags & MSG_OOB) ? 1024 : 0);
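        /*
         * Note (added for clarity): the extra 1024 bytes of "space"
         * granted to MSG_OOB sends mirrors the identical allowance in
         * sosendcheck() above, keeping the two space computations in
         * agreement.
         */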

        do {
            if (uio == NULL) {
                /*
                 * Data is prepackaged in "top".
                 */
                resid = 0;
                if (flags & MSG_EOR)
                    top->m_flags |= M_EOR;
            } else {
                int chainlength;
                int bytes_to_copy;
                boolean_t jumbocl;

                bytes_to_copy = imin(resid, space);

                if (sosendminchain > 0)
                    chainlength = 0;
                else
                    chainlength = sosendmaxchain;

                /*
                 * Attempt to use larger than system page-size
                 * clusters for large writes only if there is
                 * a jumbo cluster pool and if the socket is
                 * marked accordingly.
                 */
                jumbocl = sosendjcl && njcl > 0 &&
                    ((so->so_flags & SOF_MULTIPAGES) ||
                    sosendjcl_ignore_capab);

                socket_unlock(so, 0);

                do {
                    int num_needed;
                    int hdrs_needed = (top == NULL) ? 1 : 0;

                    /*
                     * Try to maintain a local cache of mbuf
                     * clusters needed to complete this
                     * write.  The list is further limited
                     * to the number that are currently
                     * needed to fill the socket.  This
                     * mechanism allows a large number of
                     * mbufs/clusters to be grabbed under a
                     * single mbuf lock; if we can't get any
                     * clusters, then fall back to trying
                     * for mbufs.  If we fail early (or
                     * miscalculate the number needed), make
                     * sure to release any clusters we
                     * haven't yet consumed.
                     */
                    if (freelist == NULL &&
                        bytes_to_copy > MBIGCLBYTES &&
                        jumbocl) {
                        num_needed =
                            bytes_to_copy / M16KCLBYTES;

                        if ((bytes_to_copy -
                            (num_needed * M16KCLBYTES))
                            >= MINCLSIZE)
                            num_needed++;

                        freelist =
                            m_getpackets_internal(
                            (unsigned int *)&num_needed,
                            hdrs_needed, M_WAIT, 0,
                            M16KCLBYTES);
                        /*
                         * Fall back to 4K cluster size
                         * if allocation failed
                         */
                    }

                    if (freelist == NULL &&
                        bytes_to_copy > MCLBYTES) {
                        num_needed =
                            bytes_to_copy / MBIGCLBYTES;

                        if ((bytes_to_copy -
                            (num_needed * MBIGCLBYTES)) >=
                            MINCLSIZE)
                            num_needed++;

                        freelist =
                            m_getpackets_internal(
                            (unsigned int *)&num_needed,
                            hdrs_needed, M_WAIT, 0,
                            MBIGCLBYTES);
                        /*
                         * Fall back to cluster size
                         * if allocation failed
                         */
                    }

                    if (freelist == NULL &&
                        bytes_to_copy > MINCLSIZE) {
                        num_needed =
                            bytes_to_copy / MCLBYTES;

                        if ((bytes_to_copy -
                            (num_needed * MCLBYTES)) >=
                            MINCLSIZE)
                            num_needed++;

                        freelist =
                            m_getpackets_internal(
                            (unsigned int *)&num_needed,
                            hdrs_needed, M_WAIT, 0,
                            MCLBYTES);
                        /*
                         * Fall back to a single mbuf
                         * if allocation failed
                         */
                    }

                    if (freelist == NULL) {
                        if (top == NULL)
                            MGETHDR(freelist,
                                M_WAIT, MT_DATA);
                        else
                            MGET(freelist,
                                M_WAIT, MT_DATA);

                        if (freelist == NULL) {
                            error = ENOBUFS;
                            socket_lock(so, 0);
                            goto release;
                        }
                        /*
                         * For datagram protocols,
                         * leave room for protocol
                         * headers in first mbuf.
                         */
                        if (atomic && top == NULL &&
                            bytes_to_copy < MHLEN) {
                            MH_ALIGN(freelist,
                                bytes_to_copy);
                        }
                    }
                    m = freelist;
                    freelist = m->m_next;
                    m->m_next = NULL;

                    if ((m->m_flags & M_EXT))
                        mlen = m->m_ext.ext_size;
                    else if ((m->m_flags & M_PKTHDR))
                        mlen =
                            MHLEN - m_leadingspace(m);
                    else
                        mlen = MLEN;
                    len = imin(mlen, bytes_to_copy);

                    chainlength += len;

                    space -= len;

                    error = uiomove(mtod(m, caddr_t),
                        len, uio);

                    resid = uio_resid(uio);

                    m->m_len = len;
                    *mp = m;
                    top->m_pkthdr.len += len;
                    if (error)
                        break;
                    mp = &m->m_next;
                    if (resid <= 0) {
                        if (flags & MSG_EOR)
                            top->m_flags |= M_EOR;
                        break;
                    }
                    bytes_to_copy = min(resid, space);

                } while (space > 0 &&
                    (chainlength < sosendmaxchain || atomic ||
                    resid < MINCLSIZE));

                socket_lock(so, 0);

                if (error)
                    goto release;
            }

            if (flags & (MSG_HOLD|MSG_SEND)) {
                /* Enqueue for later, go away if HOLD */
                struct mbuf *mb1;
                if (so->so_temp && (flags & MSG_FLUSH)) {
                    m_freem(so->so_temp);
                    so->so_temp = NULL;
                }
                if (so->so_temp)
                    so->so_tail->m_next = top;
                else
                    so->so_temp = top;
                mb1 = top;
                while (mb1->m_next)
                    mb1 = mb1->m_next;
                so->so_tail = mb1;
                if (flags & MSG_HOLD) {
                    top = NULL;
                    goto release;
                }
                top = so->so_temp;
            }
            if (dontroute)
                so->so_options |= SO_DONTROUTE;

            /* Compute flags here, for pru_send and NKEs */
            sendflags = (flags & MSG_OOB) ? PRUS_OOB :
                /*
                 * If the user set MSG_EOF, the protocol
                 * understands this flag and nothing left to
                 * send then use PRU_SEND_EOF instead of PRU_SEND.
                 */
                ((flags & MSG_EOF) &&
                (so->so_proto->pr_flags & PR_IMPLOPCL) &&
                (resid <= 0)) ? PRUS_EOF :
                /* If there is more to send set PRUS_MORETOCOME */
                (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;

            /*
             * Socket filter processing
             */
            error = sflt_data_out(so, addr, &top,
                &control, (sendflags & MSG_OOB) ?
                sock_data_filt_flag_oob : 0);
            if (error) {
                if (error == EJUSTRETURN) {
                    error = 0;
                    clen = 0;
                    control = NULL;
                    top = NULL;
                }

                goto release;
            }
            /*
             * End Socket filter processing
             */

            if (so->so_flags & SOF_ENABLE_MSGS) {
                /*
                 * Make a copy of control mbuf,
                 * so that msg priority can be
                 * passed to subsequent mbufs.
                 */
                control_copy = m_dup(control, M_NOWAIT);
            }
            error = (*so->so_proto->pr_usrreqs->pru_send)
                (so, sendflags, top, addr, control, p);

            if (flags & MSG_SEND)
                so->so_temp = NULL;

            if (dontroute)
                so->so_options &= ~SO_DONTROUTE;

            clen = 0;
            control = control_copy;
            control_copy = NULL;
            top = NULL;
            mp = &top;
            if (error)
                goto release;
        } while (resid && space > 0);
    } while (resid);

release:
    if (sblocked)
        sbunlock(&so->so_snd, FALSE);	/* will unlock socket */
    else
        socket_unlock(so, 1);
out:
    if (top != NULL)
        m_freem(top);
    if (control != NULL)
        m_freem(control);
    if (freelist != NULL)
        m_freem_list(freelist);
    if (control_copy != NULL)
        m_freem(control_copy);

    KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid,
        so->so_snd.sb_cc, space, error);

    return (error);
}
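
/*
 * Illustrative in-kernel call into sosend() (a sketch, not code from this
 * file): handing off a pre-built mbuf packet chain, with no uio and no
 * control mbuf, on an already-connected socket:
 *
 *	struct mbuf *top;	-- M_PKTHDR chain with m_pkthdr.len set
 *	...
 *	error = sosend(so, NULL, NULL, top, NULL, 0);
 *
 * Per the rules documented above, the chain is consumed on return
 * (freed on error), so the caller must not touch it afterwards.
 */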
				control_copy = m_dup(control, M_NOWAIT);
			}
			error = (*so->so_proto->pr_usrreqs->pru_send)
			    (so, sendflags, top, addr, control, p);

			if (flags & MSG_SEND)
				so->so_temp = NULL;

			if (dontroute)
				so->so_options &= ~SO_DONTROUTE;

			clen = 0;
			control = control_copy;
			control_copy = NULL;
			top = NULL;
			mp = &top;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	if (sblocked)
		sbunlock(&so->so_snd, FALSE);	/* will unlock socket */
	else
		socket_unlock(so, 1);
out:
	if (top != NULL)
		m_freem(top);
	if (control != NULL)
		m_freem(control);
	if (freelist != NULL)
		m_freem_list(freelist);
	if (control_copy != NULL)
		m_freem(control_copy);

	KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid, so->so_snd.sb_cc,
	    space, error);

	return (error);
}

/*
 * Implement receive operations on a socket.
 * We depend on the way that records are added to the sockbuf
 * by sbappend*.  In particular, each record (mbufs linked through m_next)
 * must begin with an address if the protocol so specifies,
 * followed by an optional mbuf or mbufs containing ancillary data,
 * and then zero or more mbufs of data.
 * In order to avoid blocking network interrupts for the entire time here,
 * we splx() while doing the actual copy to user space.
 * Although the sockbuf is locked, new data may still be appended,
 * and thus we must maintain consistency of the sockbuf during that time.
 *
 * The caller may receive the data as a single mbuf chain by supplying
 * an mbuf **mp0 for use in returning the chain.  The uio is then used
 * only for the count in uio_resid.
 *
 * Returns:	0			Success
 *		ENOBUFS
 *		ENOTCONN
 *		EWOULDBLOCK
 *	uiomove:EFAULT
 *	sblock:EWOULDBLOCK
 *	sblock:EINTR
 *	sbwait:EBADF
 *	sbwait:EINTR
 *	sodelayed_copy:EFAULT
 *	<pru_rcvoob>:EINVAL[TCP]
 *	<pru_rcvoob>:EWOULDBLOCK[TCP]
 *	<pru_rcvoob>:???
 *	<pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
 *	<pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
 *	<pr_domain->dom_externalize>:???
 *
 * Notes:	Additional return values from calls through <pru_rcvoob> and
 *		<pr_domain->dom_externalize> depend on protocols other than
 *		TCP or AF_UNIX, which are documented above.
 */
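/*
 * Illustrative sketch (not part of the kernel sources): this routine is
 * commonly reached from user space via recv(2)/recvmsg(2).  For example,
 * a caller that wants a full buffer before waking up, or a peek at the
 * queued data, might do:
 *
 *	char buf[128];
 *	ssize_t n;
 *
 *	n = recv(s, buf, sizeof (buf), MSG_WAITALL);	// block for all 128
 *	n = recv(s, buf, sizeof (buf), MSG_PEEK);	// copy, don't consume
 *
 * MSG_WAITALL maps onto the "block awaiting more" logic below, and
 * MSG_PEEK onto the paths that walk the receive list without calling
 * sbfree()/MFREE() on it.
 */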
int
soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	struct mbuf *m, **mp, *ml = NULL;
	struct mbuf *nextrecord, *free_list;
	int flags, error, offset;
	user_ssize_t len;
	struct protosw *pr = so->so_proto;
	int moff, type = 0;
	user_ssize_t orig_resid = uio_resid(uio);
	user_ssize_t delayed_copy_len;
	int can_delay;
	int need_event;
	struct proc *p = current_proc();

	KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so, uio_resid(uio),
	    so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat);

	socket_lock(so, 1);
	so_update_last_owner_locked(so, p);
	so_update_policy(so);

#ifdef MORE_LOCKING_DEBUG
	if (so->so_usecount == 1) {
		panic("%s: so=%x no other reference on socket\n", __func__, so);
		/* NOTREACHED */
	}
#endif
	mp = mp0;
	if (psa != NULL)
		*psa = NULL;
	if (controlp != NULL)
		*controlp = NULL;
	if (flagsp != NULL)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;

	/*
	 * If a recv attempt is made on a previously-accepted socket
	 * that has been marked as inactive (disconnected), reject
	 * the request.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
		struct sockbuf *sb = &so->so_rcv;

		error = ENOTCONN;
		SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n",
		    __func__, proc_pid(p), (uint64_t)VM_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so), error));
		/*
		 * This socket should have been disconnected and flushed
		 * prior to being returned from sodefunct(); there should
		 * be no data on its receive list, so panic otherwise.
		 */
		if (so->so_state & SS_DEFUNCT)
			sb_empty_assert(sb, __func__);
		socket_unlock(so, 1);
		return (error);
	}

	/*
	 * When SO_WANTOOBFLAG is set we try to get out-of-band data
	 * regardless of the flags argument.  Here is the case where
	 * out-of-band data is not inline.
	 */
	if ((flags & MSG_OOB) ||
	    ((so->so_options & SO_WANTOOBFLAG) != 0 &&
	    (so->so_options & SO_OOBINLINE) == 0 &&
	    (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
		m = m_get(M_WAIT, MT_DATA);
		if (m == NULL) {
			socket_unlock(so, 1);
			KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
			    ENOBUFS, 0, 0, 0, 0);
			return (ENOBUFS);
		}
		error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
		if (error)
			goto bad;
		socket_unlock(so, 0);
		do {
			error = uiomove(mtod(m, caddr_t),
			    imin(uio_resid(uio), m->m_len), uio);
			m = m_free(m);
		} while (uio_resid(uio) && error == 0 && m != NULL);
		socket_lock(so, 0);
bad:
		if (m != NULL)
			m_freem(m);

		if ((so->so_options & SO_WANTOOBFLAG) != 0) {
			if (error == EWOULDBLOCK || error == EINVAL) {
				/*
				 * Let's try to get normal data:
				 * EWOULDBLOCK: out-of-band data not
				 * received yet.  EINVAL: out-of-band data
				 * already read.
				 */
				error = 0;
				goto nooob;
			} else if (error == 0 && flagsp != NULL) {
				*flagsp |= MSG_OOB;
			}
		}
		socket_unlock(so, 1);
		KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
		    0, 0, 0, 0);

		return (error);
	}
nooob:
	if (mp != NULL)
		*mp = NULL;
	if (so->so_state & SS_ISCONFIRMING && uio_resid(uio))
		(*pr->pr_usrreqs->pru_rcvd)(so, 0);

	free_list = NULL;
	delayed_copy_len = 0;
restart:
#ifdef MORE_LOCKING_DEBUG
	if (so->so_usecount <= 1)
		printf("soreceive: sblock so=%p ref=%d on socket\n",
		    so, so->so_usecount);
#endif
	/*
	 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
	 * and if so just return to the caller.  This could happen when
	 * soreceive() is called by a socket upcall function during the
	 * time the socket is freed.  The socket buffer would have been
	 * locked across the upcall, therefore we cannot put this thread
	 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
	 * we may livelock), because the lock on the socket buffer will
	 * only be released when the upcall routine returns to its caller.
	 * Because the socket has been officially closed, there can be
	 * no further read on it.
	 *
	 * A multipath subflow socket would have its SS_NOFDREF set by
	 * default, so check for SOF_MP_SUBFLOW socket flag; when the
	 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
	 */
	if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
	    (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
		socket_unlock(so, 1);
		return (0);
	}

	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
	if (error) {
		socket_unlock(so, 1);
		KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
		    0, 0, 0, 0);
		return (error);
	}

	m = so->so_rcv.sb_mb;
	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark, or
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat), and
	 *   3. MSG_DONTWAIT is not set.
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
	 */
	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < uio_resid(uio)) &&
	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	    ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat)) &&
	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
		/*
		 * Panic if we notice inconsistencies in the socket's
		 * receive list; both sb_mb and sb_cc should correctly
		 * reflect the contents of the list, otherwise we may
		 * end up with false positives during select() or poll()
		 * which could put the application in a bad state.
		 */
		SB_MB_CHECK(&so->so_rcv);

		if (so->so_error) {
			if (m != NULL)
				goto dontblock;
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
			if (m != NULL)
				goto dontblock;
			else
				goto release;
		}
		for (; m != NULL; m = m->m_next)
			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio_resid(uio) == 0)
			goto release;
		if ((so->so_state & SS_NBIO) ||
		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
			error = EWOULDBLOCK;
			goto release;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
		sbunlock(&so->so_rcv, TRUE);	/* keep socket locked */
#if EVEN_MORE_LOCKING_DEBUG
		if (socket_debug)
			printf("Waiting for socket data\n");
#endif

		error = sbwait(&so->so_rcv);
#if EVEN_MORE_LOCKING_DEBUG
		if (socket_debug)
			printf("SORECEIVE - sbwait returned %d\n", error);
#endif
		if (so->so_usecount < 1) {
			panic("%s: after 2nd sblock so=%p ref=%d on socket\n",
			    __func__, so, so->so_usecount);
			/* NOTREACHED */
		}
		if (error) {
			socket_unlock(so, 1);
			KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
			    0, 0, 0, 0);
			return (error);
		}
		goto restart;
	}
dontblock:
	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
	nextrecord = m->m_nextpkt;
	if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
		KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
#if CONFIG_MACF_SOCKET_SUBSET
		/*
		 * Call the MAC framework for policy checking if we're in
		 * the user process context and the socket isn't connected.
		 */
		if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
			struct mbuf *m0 = m;
			/*
			 * Dequeue this record (temporarily) from the receive
			 * list since we're about to drop the socket's lock
			 * where a new record may arrive and be appended to
			 * the list.  Upon MAC policy failure, the record
			 * will be freed.  Otherwise, we'll add it back to
			 * the head of the list.  We cannot rely on SB_LOCK
			 * because append operation uses the socket's lock.
			 */
			do {
				m->m_nextpkt = NULL;
				sbfree(&so->so_rcv, m);
				m = m->m_next;
			} while (m != NULL);
			m = m0;
			so->so_rcv.sb_mb = nextrecord;
			SB_EMPTY_FIXUP(&so->so_rcv);
			SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
			socket_unlock(so, 0);
			if (mac_socket_check_received(proc_ucred(p), so,
			    mtod(m, struct sockaddr *)) != 0) {
				/*
				 * MAC policy failure; free this record and
				 * process the next record (or block until
				 * one is available).  We have adjusted sb_cc
				 * and sb_mbcnt above so there is no need to
				 * call sbfree() again.
				 */
				do {
					m = m_free(m);
				} while (m != NULL);
				/*
				 * Clear SB_LOCK but don't unlock the socket.
				 * Process the next record or wait for one.
				 */
				socket_lock(so, 0);
				sbunlock(&so->so_rcv, TRUE); /* stay locked */
				goto restart;
			}
			socket_lock(so, 0);
			/*
			 * If the socket has been defunct'd, drop it.
			 */
			if (so->so_flags & SOF_DEFUNCT) {
				m_freem(m);
				error = ENOTCONN;
				goto release;
			}
			/*
			 * Re-adjust the socket receive list and re-enqueue
			 * the record in front of any packets which may have
			 * been appended while we dropped the lock.
			 */
			for (m = m0; m->m_next != NULL; m = m->m_next)
				sballoc(&so->so_rcv, m);
			sballoc(&so->so_rcv, m);
			if (so->so_rcv.sb_mb == NULL) {
				so->so_rcv.sb_lastrecord = m0;
				so->so_rcv.sb_mbtail = m;
			}
			m = m0;
			nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
			so->so_rcv.sb_mb = m;
			SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
		}
#endif /* CONFIG_MACF_SOCKET_SUBSET */
		orig_resid = 0;
		if (psa != NULL) {
			*psa = dup_sockaddr(mtod(m, struct sockaddr *),
			    mp0 == NULL);
			if ((*psa == NULL) && (flags & MSG_NEEDSA)) {
				error = EWOULDBLOCK;
				goto release;
			}
		}
		if (flags & MSG_PEEK) {
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			if (m->m_next == NULL && so->so_rcv.sb_cc != 0) {
				panic("%s: about to create invalid socketbuf",
				    __func__);
				/* NOTREACHED */
			}
			MFREE(m, so->so_rcv.sb_mb);
			m = so->so_rcv.sb_mb;
			if (m != NULL) {
				m->m_nextpkt = nextrecord;
			} else {
				so->so_rcv.sb_mb = nextrecord;
				SB_EMPTY_FIXUP(&so->so_rcv);
			}
		}
	}

	/*
	 * Process one or more MT_CONTROL mbufs present before any data mbufs
	 * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
	 * just copy the data; if !MSG_PEEK, we call into the protocol to
	 * perform externalization.
	 */
	if (m != NULL && m->m_type == MT_CONTROL) {
		struct mbuf *cm = NULL, *cmn;
		struct mbuf **cme = &cm;
		struct sockbuf *sb_rcv = &so->so_rcv;
		struct mbuf **msgpcm = NULL;

		/*
		 * Externalizing the control messages would require us to
		 * drop the socket's lock below.  Once we re-acquire the
		 * lock, the mbuf chain might change.  In order to preserve
		 * consistency, we unlink all control messages from the
		 * first mbuf chain in one shot and link them separately
		 * onto a different chain.
		 */
		do {
			if (flags & MSG_PEEK) {
				if (controlp != NULL) {
					if (*controlp == NULL) {
						msgpcm = controlp;
					}
					*controlp = m_copy(m, 0, m->m_len);

					/*
					 * If we failed to allocate an mbuf,
					 * release any previously allocated
					 * mbufs for control data.  Return
					 * an error.  Keep the mbufs in the
					 * socket, as this is using the
					 * MSG_PEEK flag.
					 */
					if (*controlp == NULL) {
						m_freem(*msgpcm);
						error = ENOBUFS;
						goto release;
					}
					controlp = &(*controlp)->m_next;
				}
				m = m->m_next;
			} else {
				m->m_nextpkt = NULL;
				sbfree(sb_rcv, m);
				sb_rcv->sb_mb = m->m_next;
				m->m_next = NULL;
				*cme = m;
				cme = &(*cme)->m_next;
				m = sb_rcv->sb_mb;
			}
		} while (m != NULL && m->m_type == MT_CONTROL);

		if (!(flags & MSG_PEEK)) {
			if (sb_rcv->sb_mb != NULL) {
				sb_rcv->sb_mb->m_nextpkt = nextrecord;
			} else {
				sb_rcv->sb_mb = nextrecord;
				SB_EMPTY_FIXUP(sb_rcv);
			}
			if (nextrecord == NULL)
				sb_rcv->sb_lastrecord = m;
		}

		SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");

		while (cm != NULL) {
			int cmsg_type;

			cmn = cm->m_next;
			cm->m_next = NULL;
			cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;

			/*
			 * Call the protocol to externalize SCM_RIGHTS message
			 * and return the modified message to the caller upon
			 * success.  Otherwise, all other control messages are
			 * returned unmodified to the caller.  Note that we
			 * only get into this loop if MSG_PEEK is not set.
			 */
			if (pr->pr_domain->dom_externalize != NULL &&
			    cmsg_type == SCM_RIGHTS) {
				/*
				 * Release socket lock: see 3903171.  This
				 * would also allow more records to be appended
				 * to the socket buffer.  We still have SB_LOCK
				 * set on it, so we can be sure that the head
				 * of the mbuf chain won't change.
				 */
				socket_unlock(so, 0);
				error = (*pr->pr_domain->dom_externalize)(cm);
				socket_lock(so, 0);
			} else {
				error = 0;
			}

			if (controlp != NULL && error == 0) {
				*controlp = cm;
				controlp = &(*controlp)->m_next;
				orig_resid = 0;
			} else {
				(void) m_free(cm);
			}
			cm = cmn;
		}
		/*
		 * Update the value of nextrecord in case we received new
		 * records when the socket was unlocked above for
		 * externalizing SCM_RIGHTS.
		 */
		if (m != NULL)
			nextrecord = sb_rcv->sb_mb->m_nextpkt;
		else
			nextrecord = sb_rcv->sb_mb;
		orig_resid = 0;
	}

	/*
	 * If the socket is a TCP socket with message delivery
	 * enabled, then create a control msg to deliver the
	 * relative TCP sequence number for this data.  Waiting
	 * until this point will protect against failures to
	 * allocate an mbuf for control msgs.
	 */
	if (so->so_type == SOCK_STREAM && SOCK_PROTO(so) == IPPROTO_TCP &&
	    (so->so_flags & SOF_ENABLE_MSGS) && controlp != NULL) {
		struct mbuf *seq_cm;

		seq_cm = sbcreatecontrol((caddr_t)&m->m_pkthdr.msg_seq,
		    sizeof (uint32_t), SCM_SEQNUM, SOL_SOCKET);
		if (seq_cm == NULL) {
			/* unable to allocate a control mbuf */
			error = ENOBUFS;
			goto release;
		}
		*controlp = seq_cm;
		controlp = &seq_cm->m_next;
	}

	if (m != NULL) {
		if (!(flags & MSG_PEEK)) {
			/*
			 * We get here because m points to an mbuf following
			 * any MT_SONAME or MT_CONTROL mbufs which have been
			 * processed above.  In any case, m should be pointing
			 * to the head of the mbuf chain, and the nextrecord
			 * should be either NULL or equal to m->m_nextpkt.
			 * See comments above about SB_LOCK.
			 */
			if (m != so->so_rcv.sb_mb ||
			    m->m_nextpkt != nextrecord) {
				panic("%s: post-control !sync so=%p m=%p "
				    "nextrecord=%p\n", __func__, so, m,
				    nextrecord);
				/* NOTREACHED */
			}
			if (nextrecord == NULL)
				so->so_rcv.sb_lastrecord = m;
		}
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
	} else {
		if (!(flags & MSG_PEEK)) {
			SB_EMPTY_FIXUP(&so->so_rcv);
		}
	}
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");

	moff = 0;
	offset = 0;

	if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy)
		can_delay = 1;
	else
		can_delay = 0;

	need_event = 0;

	while (m != NULL &&
	    (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
				break;
		} else if (type == MT_OOBDATA) {
			break;
		}
		/*
		 * Make sure to always set MSG_OOB event when getting
		 * out-of-band data inline.
		 */
		if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
		    (so->so_options & SO_OOBINLINE) != 0 &&
		    (so->so_state & SS_RCVATMARK) != 0) {
			flags |= MSG_OOB;
		}
		so->so_state &= ~SS_RCVATMARK;
		len = uio_resid(uio) - delayed_copy_len;
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * Sockbuf must be consistent here (points to current mbuf,
		 * it points to next record) when we drop priority;
		 * we must note any additions to the sockbuf when we
		 * block interrupts again.
		 */
		if (mp == NULL) {
			SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
			if (can_delay && len == m->m_len) {
				/*
				 * Only delay the copy if we're consuming the
				 * mbuf, we're NOT in MSG_PEEK mode, and we
				 * have enough data to make it worthwhile to
				 * drop and retake the lock...  can_delay
				 * reflects the state of the two latter
				 * constraints; moff should always be zero
				 * in these cases.
				 */
				delayed_copy_len += len;
			} else {
				if (delayed_copy_len) {
					error = sodelayed_copy(so, uio,
					    &free_list, &delayed_copy_len);

					if (error) {
						goto release;
					}
					/*
					 * We can only get here if MSG_PEEK is
					 * not set; therefore, m should point
					 * at the head of the rcv queue.  If
					 * it doesn't, it means something
					 * drastically changed while we were
					 * out from behind the lock in
					 * sodelayed_copy() -- perhaps a RST
					 * on the stream.  In any event, the
					 * stream has been interrupted.  It's
					 * probably best just to return
					 * whatever data we've moved and let
					 * the caller sort it out...
					 */
					if (m != so->so_rcv.sb_mb) {
						break;
					}
				}
				socket_unlock(so, 0);
				error = uiomove(mtod(m, caddr_t) + moff,
				    (int)len, uio);
				socket_lock(so, 0);

				if (error)
					goto release;
			}
		} else {
			uio_setresid(uio, (uio_resid(uio) - len));
		}
		if (len == m->m_len - moff) {
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(&so->so_rcv, m);
				m->m_nextpkt = NULL;

				/*
				 * If this packet is an unordered packet
				 * (indicated by M_UNORDERED_DATA flag), remove
				 * the additional bytes added to the
				 * receive socket buffer size.
				 */
				if ((so->so_flags & SOF_ENABLE_MSGS) &&
				    m->m_len &&
				    (m->m_flags & M_UNORDERED_DATA) &&
				    sbreserve(&so->so_rcv,
				    so->so_rcv.sb_hiwat - m->m_len)) {
					if (so->so_msg_state->msg_uno_bytes >
					    m->m_len) {
						so->so_msg_state->
						    msg_uno_bytes -= m->m_len;
					} else {
						so->so_msg_state->
						    msg_uno_bytes = 0;
					}
					m->m_flags &= ~M_UNORDERED_DATA;
				}

				if (mp != NULL) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = NULL;
				} else {
					if (free_list == NULL)
						free_list = m;
					else
						ml->m_next = m;
					ml = m;
					so->so_rcv.sb_mb = m = m->m_next;
					ml->m_next = NULL;
				}
				if (m != NULL) {
					m->m_nextpkt = nextrecord;
					if (nextrecord == NULL)
						so->so_rcv.sb_lastrecord = m;
				} else {
					so->so_rcv.sb_mb = nextrecord;
					SB_EMPTY_FIXUP(&so->so_rcv);
				}
				SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
				SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
			}
		} else {
			if (flags & MSG_PEEK) {
				moff += len;
			} else {
				if (mp != NULL) {
					int copy_flag;

					if (flags & MSG_DONTWAIT)
						copy_flag = M_DONTWAIT;
					else
						copy_flag = M_WAIT;
					*mp = m_copym(m, 0, len, copy_flag);
					/*
					 * Failed to allocate an mbuf?
					 * Adjust uio_resid back, it was
					 * adjusted down by len bytes which
					 * we didn't copy over.
					 */
					if (*mp == NULL) {
						uio_setresid(uio,
						    (uio_resid(uio) + len));
						break;
					}
				}
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
			}
		}
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_state |= SS_RCVATMARK;
					/*
					 * delay posting the actual event until
					 * after any delayed copy processing
					 * has finished
					 */
					need_event = 1;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
		 * (for non-atomic socket), we must not quit until
		 * "uio->uio_resid == 0" or an error termination.
		 * If a signal/timeout occurs, return with a short
		 * count but without error.  Keep sockbuf locked
		 * against other readers.
		 */
		while (flags & (MSG_WAITALL|MSG_WAITSTREAM) && m == NULL &&
		    (uio_resid(uio) - delayed_copy_len) > 0 &&
		    !sosendallatonce(so) && !nextrecord) {
			if (so->so_error || so->so_state & SS_CANTRCVMORE)
				goto release;

			/*
			 * Depending on the protocol (e.g. TCP), the following
			 * might cause the socket lock to be dropped and later
			 * be reacquired, and more data could have arrived and
			 * have been appended to the receive socket buffer by
			 * the time it returns.  Therefore, we sleep in
			 * sbwait() below if and only if the socket buffer is
			 * empty, in order to avoid a false sleep.
			 */
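			/*
			 * PR_WANTRCVD is set by protocols (TCP, for
			 * instance) that want pru_rcvd() notifications as
			 * the user consumes data, e.g. to send window
			 * updates.
			 */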
			if (pr->pr_flags & PR_WANTRCVD && so->so_pcb &&
			    (((struct inpcb *)so->so_pcb)->inp_state !=
			    INPCB_STATE_DEAD))
				(*pr->pr_usrreqs->pru_rcvd)(so, flags);

			SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");

			if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
				error = 0;
				goto release;
			}
			/*
			 * We have to wait until after we get back from the
			 * sbwait to do the copy, because we will drop the
			 * lock if we have enough data that has been
			 * delayed...  By dropping the lock we open up a
			 * window allowing the netisr thread to process the
			 * incoming packets and to change the state of this
			 * socket.  We're issuing the sbwait because the
			 * socket is empty and we're expecting the netisr
			 * thread to wake us up when more packets arrive;
			 * if we allow that processing to happen and then
			 * sbwait, we could stall forever with packets
			 * sitting in the socket if no further packets
			 * arrive from the remote side.
			 *
			 * We want to copy before we've collected all the
			 * data to satisfy this request, to allow the copy
			 * to overlap the incoming packet processing on an
			 * MP system.
			 */
			if (delayed_copy_len > sorecvmincopy &&
			    (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
				error = sodelayed_copy(so, uio,
				    &free_list, &delayed_copy_len);

				if (error)
					goto release;
			}
			m = so->so_rcv.sb_mb;
			if (m != NULL) {
				nextrecord = m->m_nextpkt;
			}
			SB_MB_CHECK(&so->so_rcv);
		}
	}
#ifdef MORE_LOCKING_DEBUG
	if (so->so_usecount <= 1) {
		panic("%s: after big while so=%p ref=%d on socket\n",
		    __func__, so, so->so_usecount);
		/* NOTREACHED */
	}
#endif

	if (m != NULL && pr->pr_flags & PR_ATOMIC) {
		if (so->so_options & SO_DONTTRUNC) {
			flags |= MSG_RCVMORE;
		} else {
			flags |= MSG_TRUNC;
			if ((flags & MSG_PEEK) == 0)
				(void) sbdroprecord(&so->so_rcv);
		}
	}

	/*
	 * pru_rcvd below (for TCP) may cause more data to be received
	 * if the socket lock is dropped prior to sending the ACK; some
	 * legacy OpenTransport applications don't handle this well
	 * (if it receives less data than requested while MSG_HAVEMORE
	 * is set), and so we set the flag now based on what we know
	 * prior to calling pru_rcvd.
	 */
	if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0)
		flags |= MSG_HAVEMORE;

	if ((flags & MSG_PEEK) == 0) {
		if (m == NULL) {
			so->so_rcv.sb_mb = nextrecord;
			/*
			 * First part is an inline SB_EMPTY_FIXUP().  Second
			 * part makes sure sb_lastrecord is up-to-date if
			 * there is still data in the socket buffer.
			 */
			if (so->so_rcv.sb_mb == NULL) {
				so->so_rcv.sb_mbtail = NULL;
				so->so_rcv.sb_lastrecord = NULL;
			} else if (nextrecord->m_nextpkt == NULL) {
				so->so_rcv.sb_lastrecord = nextrecord;
			}
			SB_MB_CHECK(&so->so_rcv);
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
	}

	if (delayed_copy_len) {
		error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
		if (error)
			goto release;
	}
	if (free_list != NULL) {
		m_freem_list(free_list);
		free_list = NULL;
	}
	if (need_event)
		postevent(so, 0, EV_OOB);

	if (orig_resid == uio_resid(uio) && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
		sbunlock(&so->so_rcv, TRUE);	/* keep socket locked */
		goto restart;
	}

	if (flagsp != NULL)
		*flagsp |= flags;
release:
#ifdef MORE_LOCKING_DEBUG
	if (so->so_usecount <= 1) {
		panic("%s: release so=%p ref=%d on socket\n", __func__,
		    so, so->so_usecount);
		/* NOTREACHED */
	}
#endif
	if (delayed_copy_len)
		error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);

	if (free_list != NULL)
		m_freem_list(free_list);

	sbunlock(&so->so_rcv, FALSE);	/* will unlock socket */

	KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
	    so->so_rcv.sb_cc, 0, error);

	return (error);
}

/*
 * Returns:	0			Success
 *	uiomove:EFAULT
 */
static int
sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
    user_ssize_t *resid)
{
	int error = 0;
	struct mbuf *m;

	m = *free_list;

	socket_unlock(so, 0);

	while (m != NULL && error == 0) {
		error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
		m = m->m_next;
	}
	m_freem_list(*free_list);

	*free_list = NULL;
	*resid = 0;

	socket_lock(so, 0);

	return (error);
}

/*
 * Returns:	0			Success
 *		EINVAL
 *		ENOTCONN
 *	<pru_shutdown>:EINVAL
 *	<pru_shutdown>:EADDRNOTAVAIL[TCP]
 *	<pru_shutdown>:ENOBUFS[TCP]
 *	<pru_shutdown>:EMSGSIZE[TCP]
 *	<pru_shutdown>:EHOSTUNREACH[TCP]
 *	<pru_shutdown>:ENETUNREACH[TCP]
 *	<pru_shutdown>:ENETDOWN[TCP]
 *	<pru_shutdown>:ENOMEM[TCP]
 *	<pru_shutdown>:EACCES[TCP]
 *	<pru_shutdown>:EMSGSIZE[TCP]
 *	<pru_shutdown>:ENOBUFS[TCP]
 *	<pru_shutdown>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
 *	<pru_shutdown>:???	[other protocol families]
 */
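/*
 * Illustrative sketch (not part of the kernel sources): from user space
 * this is the backend of shutdown(2), e.g.:
 *
 *	if (shutdown(s, SHUT_WR) == -1)		// send FIN, keep reading
 *		err(1, "shutdown");
 *
 * A shutdown on a socket that is not connected (or connecting) fails
 * with ENOTCONN, and an invalid "how" fails with EINVAL, matching the
 * checks below.
 */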
int
soshutdown(struct socket *so, int how)
{
	int error;

	switch (how) {
	case SHUT_RD:
	case SHUT_WR:
	case SHUT_RDWR:
		socket_lock(so, 1);
		if ((so->so_state &
		    (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) == 0) {
			error = ENOTCONN;
		} else {
			error = soshutdownlock(so, how);
		}
		socket_unlock(so, 1);
		break;
	default:
		error = EINVAL;
		break;
	}

	return (error);
}

int
soshutdownlock(struct socket *so, int how)
{
	struct protosw *pr = so->so_proto;
	int error = 0;

	sflt_notify(so, sock_evt_shutdown, &how);

	if (how != SHUT_WR) {
		if ((so->so_state & SS_CANTRCVMORE) != 0) {
			/* read already shut down */
			error = ENOTCONN;
			goto done;
		}
		sorflush(so);
		postevent(so, 0, EV_RCLOSED);
	}
	if (how != SHUT_RD) {
		if ((so->so_state & SS_CANTSENDMORE) != 0) {
			/* write already shut down */
			error = ENOTCONN;
			goto done;
		}
		error = (*pr->pr_usrreqs->pru_shutdown)(so);
		postevent(so, 0, EV_WCLOSED);
	}
done:
	KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, 0, 0, 0, 0, 0);
	return (error);
}

void
sowflush(struct socket *so)
{
	struct sockbuf *sb = &so->so_snd;
#ifdef notyet
	lck_mtx_t *mutex_held;
	/*
	 * XXX: This code is currently commented out, because we may get here
	 * as part of sofreelastref(), and at that time, pr_getlock() may no
	 * longer be able to return us the lock; this will be fixed in future.
	 */
	if (so->so_proto->pr_getlock != NULL)
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	else
		mutex_held = so->so_proto->pr_domain->dom_mtx;

	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
#endif /* notyet */

	/*
	 * Obtain lock on the socket buffer (SB_LOCK).  This is required
	 * to prevent the socket buffer from being unexpectedly altered
	 * while it is used by another thread in socket send/receive.
	 *
	 * sblock() must not fail here, hence the assertion.
	 */
	(void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
	VERIFY(sb->sb_flags & SB_LOCK);

	sb->sb_flags &= ~(SB_SEL|SB_UPCALL);
	sb->sb_flags |= SB_DROP;
	sb->sb_upcall = NULL;
	sb->sb_upcallarg = NULL;

	sbunlock(sb, TRUE);	/* keep socket locked */

	selthreadclear(&sb->sb_sel);
	sbrelease(sb);
}

void
sorflush(struct socket *so)
{
	struct sockbuf *sb = &so->so_rcv;
	struct protosw *pr = so->so_proto;
	struct sockbuf asb;
#ifdef notyet
	lck_mtx_t *mutex_held;
	/*
	 * XXX: This code is currently commented out, because we may get here
	 * as part of sofreelastref(), and at that time, pr_getlock() may no
	 * longer be able to return us the lock; this will be fixed in future.
	 */
	if (so->so_proto->pr_getlock != NULL)
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	else
		mutex_held = so->so_proto->pr_domain->dom_mtx;

	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
#endif /* notyet */

	sflt_notify(so, sock_evt_flush_read, NULL);

	socantrcvmore(so);

	/*
	 * Obtain lock on the socket buffer (SB_LOCK).  This is required
	 * to prevent the socket buffer from being unexpectedly altered
	 * while it is used by another thread in socket send/receive.
	 *
	 * sblock() must not fail here, hence the assertion.
	 */
	(void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
	VERIFY(sb->sb_flags & SB_LOCK);

	/*
	 * Copy only the relevant fields from "sb" to "asb" which we
	 * need for sbrelease() to function.  In particular, skip
	 * sb_sel as it contains the wait queue linkage, which would
	 * wreak havoc if we were to issue selthreadclear() on "asb".
	 * Make sure to not carry over SB_LOCK in "asb", as we need
	 * to acquire it later as part of sbrelease().
	 */
	bzero(&asb, sizeof (asb));
	asb.sb_cc = sb->sb_cc;
	asb.sb_hiwat = sb->sb_hiwat;
	asb.sb_mbcnt = sb->sb_mbcnt;
	asb.sb_mbmax = sb->sb_mbmax;
	asb.sb_ctl = sb->sb_ctl;
	asb.sb_lowat = sb->sb_lowat;
	asb.sb_mb = sb->sb_mb;
	asb.sb_mbtail = sb->sb_mbtail;
	asb.sb_lastrecord = sb->sb_lastrecord;
	asb.sb_so = sb->sb_so;
	asb.sb_flags = sb->sb_flags;
	asb.sb_flags &= ~(SB_LOCK|SB_SEL|SB_KNOTE|SB_UPCALL);
	asb.sb_flags |= SB_DROP;

	/*
	 * Ideally we'd bzero() these and preserve the ones we need;
	 * but to do that we'd need to shuffle things around in the
	 * sockbuf, and we can't do it now because there are KEXTS
	 * that are directly referring to the socket structure.
	 *
	 * Setting SB_DROP acts as a barrier to prevent further appends.
	 * Clearing SB_SEL is done for selthreadclear() below.
	 */
	sb->sb_cc = 0;
	sb->sb_hiwat = 0;
	sb->sb_mbcnt = 0;
	sb->sb_mbmax = 0;
	sb->sb_ctl = 0;
	sb->sb_lowat = 0;
	sb->sb_mb = NULL;
	sb->sb_mbtail = NULL;
	sb->sb_lastrecord = NULL;
	sb->sb_timeo.tv_sec = 0;
	sb->sb_timeo.tv_usec = 0;
	sb->sb_upcall = NULL;
	sb->sb_upcallarg = NULL;
	sb->sb_flags &= ~(SB_SEL|SB_UPCALL);
	sb->sb_flags |= SB_DROP;

	sbunlock(sb, TRUE);	/* keep socket locked */

	/*
	 * Note that selthreadclear() is called on the original "sb" and
	 * not the local "asb" because of the way wait queue linkage is
	 * implemented.  Given that selwakeup() may be triggered, SB_SEL
	 * should no longer be set (cleared above.)
	 */
	selthreadclear(&sb->sb_sel);

	if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose)
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);

	sbrelease(&asb);
}

/*
 * Perhaps this routine, and sooptcopyout(), below, ought to come in
 * an additional variant to handle the case where the option value needs
 * to be some kind of integer, but not a specific size.
 * In addition to their use here, these functions are also called by the
 * protocol-level pr_ctloutput() routines.
 *
 * Returns:	0			Success
 *		EINVAL
 *	copyin:EFAULT
 */
int
sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
{
	size_t valsize;

	/*
	 * If the user gives us more than we wanted, we ignore it,
	 * but if we don't get the minimum length the caller
	 * wants, we return EINVAL.  On success, sopt->sopt_valsize
	 * is set to however much we actually retrieved.
	 */
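	/*
	 * Worked example: for SO_LINGER, callers pass buf pointing to a
	 * struct linger with len == minlen == sizeof (struct linger).
	 * A 4-byte user buffer then fails with EINVAL, while a 64-byte
	 * buffer is silently clamped to 8 bytes.
	 */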
	if ((valsize = sopt->sopt_valsize) < minlen)
		return (EINVAL);
	if (valsize > len)
		sopt->sopt_valsize = valsize = len;

	if (sopt->sopt_p != kernproc)
		return (copyin(sopt->sopt_val, buf, valsize));

	bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
	return (0);
}

/*
 * sooptcopyin_timeval
 *	Copy in a timeval value into tv_p, and take into account whether
 *	the calling process is 64-bit or 32-bit.  Moved the sanity checking
 *	code here so that we can verify the 64-bit tv_sec value before we
 *	lose the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
 */
static int
sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
{
	int error;

	if (proc_is64bit(sopt->sopt_p)) {
		struct user64_timeval tv64;

		if (sopt->sopt_valsize < sizeof (tv64))
			return (EINVAL);

		sopt->sopt_valsize = sizeof (tv64);
		if (sopt->sopt_p != kernproc) {
			error = copyin(sopt->sopt_val, &tv64, sizeof (tv64));
			if (error != 0)
				return (error);
		} else {
			bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
			    sizeof (tv64));
		}
		if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
		    tv64.tv_usec < 0 || tv64.tv_usec >= 1000000)
			return (EDOM);

		tv_p->tv_sec = tv64.tv_sec;
		tv_p->tv_usec = tv64.tv_usec;
	} else {
		struct user32_timeval tv32;

		if (sopt->sopt_valsize < sizeof (tv32))
			return (EINVAL);

		sopt->sopt_valsize = sizeof (tv32);
		if (sopt->sopt_p != kernproc) {
			error = copyin(sopt->sopt_val, &tv32, sizeof (tv32));
			if (error != 0) {
				return (error);
			}
		} else {
			bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
			    sizeof (tv32));
		}
#ifndef __LP64__
		/*
		 * K64todo "comparison is always false due to
		 * limited range of data type"
		 */
		if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
		    tv32.tv_usec < 0 || tv32.tv_usec >= 1000000)
			return (EDOM);
#endif
		tv_p->tv_sec = tv32.tv_sec;
		tv_p->tv_usec = tv32.tv_usec;
	}
	return (0);
}

/*
 * Returns:	0			Success
 *		EINVAL
 *		ENOPROTOOPT
 *		ENOBUFS
 *		EDOM
 *	sooptcopyin:EINVAL
 *	sooptcopyin:EFAULT
 *	sooptcopyin_timeval:EINVAL
 *	sooptcopyin_timeval:EFAULT
 *	sooptcopyin_timeval:EDOM
 *	<pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
 *	<pr_ctloutput>:???
 *	sflt_attach_private:???	[whatever a filter author chooses]
 *	<sf_setoption>:???	[whatever a filter author chooses]
 *
 * Notes:	Other <pr_ctloutput> returns depend on the protocol family;
 *		all <sf_setoption> returns depend on what the filter author
 *		causes their filter to return.
 */
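/*
 * Illustrative sketch (not part of the kernel sources): this is the
 * setsockopt(2) backend.  A typical socket-level set handled by the
 * switch below:
 *
 *	int on = 1;
 *	struct timeval tv = { .tv_sec = 5, .tv_usec = 0 };
 *
 *	setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &on, sizeof (on));
 *	setsockopt(s, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof (tv));
 *
 * Non-SOL_SOCKET levels are passed straight to the protocol's
 * pr_ctloutput(), as the code below does.
 */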
int
sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
{
	int error, optval;
	struct linger l;
	struct timeval tv;
#if CONFIG_MACF_SOCKET
	struct mac extmac;
#endif /* MAC_SOCKET */

	if (sopt->sopt_dir != SOPT_SET)
		sopt->sopt_dir = SOPT_SET;

	if (dolock)
		socket_lock(so, 1);

	if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
	    (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
	    (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
		/* the socket has been shutdown, no more sockopt's */
		error = EINVAL;
		goto out;
	}

	error = sflt_setsockopt(so, sopt);
	if (error != 0) {
		if (error == EJUSTRETURN)
			error = 0;
		goto out;
	}

	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto != NULL &&
		    so->so_proto->pr_ctloutput != NULL) {
			error = (*so->so_proto->pr_ctloutput)(so, sopt);
			goto out;
		}
		error = ENOPROTOOPT;
	} else {
		/*
		 * Allow socket-level (SOL_SOCKET) options to be filtered by
		 * the protocol layer, if needed.  A zero value returned from
		 * the handler means use default socket-level processing as
		 * done by the rest of this routine.  Otherwise, any other
		 * return value indicates that the option is unsupported.
		 */
		if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
		    pru_socheckopt(so, sopt)) != 0)
			goto out;

		error = 0;
		switch (sopt->sopt_name) {
		case SO_LINGER:
		case SO_LINGER_SEC:
			error = sooptcopyin(sopt, &l, sizeof (l), sizeof (l));
			if (error != 0)
				goto out;

			so->so_linger = (sopt->sopt_name == SO_LINGER) ?
			    l.l_linger : l.l_linger * hz;
			if (l.l_onoff != 0)
				so->so_options |= SO_LINGER;
			else
				so->so_options &= ~SO_LINGER;
			break;

		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_DONTROUTE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_TIMESTAMP_MONOTONIC:
		case SO_DONTTRUNC:
		case SO_WANTMORE:
		case SO_WANTOOBFLAG:
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error != 0)
				goto out;
			if (optval)
				so->so_options |= sopt->sopt_name;
			else
				so->so_options &= ~sopt->sopt_name;
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error != 0)
				goto out;

			/*
			 * Values < 1 make no sense for any of these
			 * options, so disallow them.
			 */
			if (optval < 1) {
				error = EINVAL;
				goto out;
			}

			switch (sopt->sopt_name) {
			case SO_SNDBUF:
			case SO_RCVBUF: {
				struct sockbuf *sb =
				    (sopt->sopt_name == SO_SNDBUF) ?
				    &so->so_snd : &so->so_rcv;
				if (sbreserve(sb, (u_int32_t)optval) == 0) {
					error = ENOBUFS;
					goto out;
				}
				sb->sb_flags |= SB_USRSIZE;
				sb->sb_flags &= ~SB_AUTOSIZE;
				sb->sb_idealsize = (u_int32_t)optval;
				break;
			}
			/*
			 * Make sure the low-water is never greater than
			 * the high-water.
			 */
			case SO_SNDLOWAT:
				so->so_snd.sb_lowat =
				    (optval > so->so_snd.sb_hiwat) ?
				    so->so_snd.sb_hiwat : optval;
				break;
			case SO_RCVLOWAT:
				so->so_rcv.sb_lowat =
				    (optval > so->so_rcv.sb_hiwat) ?
				    so->so_rcv.sb_hiwat : optval;
				break;
			}
			break;
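		/*
		 * The timeouts set below are stored in the socket buffer
		 * and picked up by sbwait(), so they bound how long a
		 * blocked send or receive will sleep.
		 */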
		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			error = sooptcopyin_timeval(sopt, &tv);
			if (error != 0)
				goto out;

			switch (sopt->sopt_name) {
			case SO_SNDTIMEO:
				so->so_snd.sb_timeo = tv;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.sb_timeo = tv;
				break;
			}
			break;

		case SO_NKE: {
			struct so_nke nke;

			error = sooptcopyin(sopt, &nke, sizeof (nke),
			    sizeof (nke));
			if (error != 0)
				goto out;

			error = sflt_attach_internal(so, nke.nke_handle);
			break;
		}

		case SO_NOSIGPIPE:
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error != 0)
				goto out;
			if (optval != 0)
				so->so_flags |= SOF_NOSIGPIPE;
			else
				so->so_flags &= ~SOF_NOSIGPIPE;
			break;

		case SO_NOADDRERR:
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error != 0)
				goto out;
			if (optval != 0)
				so->so_flags |= SOF_NOADDRAVAIL;
			else
				so->so_flags &= ~SOF_NOADDRAVAIL;
			break;

		case SO_REUSESHAREUID:
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error != 0)
				goto out;
			if (optval != 0)
				so->so_flags |= SOF_REUSESHAREUID;
			else
				so->so_flags &= ~SOF_REUSESHAREUID;
			break;

		case SO_NOTIFYCONFLICT:
			if (kauth_cred_issuser(kauth_cred_get()) == 0) {
				error = EPERM;
				goto out;
			}
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error != 0)
				goto out;
			if (optval != 0)
				so->so_flags |= SOF_NOTIFYCONFLICT;
			else
				so->so_flags &= ~SOF_NOTIFYCONFLICT;
			break;

		case SO_RESTRICTIONS:
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error != 0)
				goto out;

			error = so_set_restrictions(so, optval);
			break;

		case SO_LABEL:
#if CONFIG_MACF_SOCKET
			if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
			    sizeof (extmac))) != 0)
				goto out;

			error = mac_setsockopt_label(proc_ucred(sopt->sopt_p),
			    so, &extmac);
#else
			error = EOPNOTSUPP;
#endif /* MAC_SOCKET */
			break;

		case SO_UPCALLCLOSEWAIT:
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error != 0)
				goto out;
			if (optval != 0)
				so->so_flags |= SOF_UPCALLCLOSEWAIT;
			else
				so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
			break;

		case SO_RANDOMPORT:
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error != 0)
				goto out;
			if (optval != 0)
				so->so_flags |= SOF_BINDRANDOMPORT;
			else
				so->so_flags &= ~SOF_BINDRANDOMPORT;
			break;

		case SO_NP_EXTENSIONS: {
			struct so_np_extensions sonpx;

			error = sooptcopyin(sopt, &sonpx, sizeof (sonpx),
			    sizeof (sonpx));
			if (error != 0)
				goto out;
			if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
				error = EINVAL;
				goto out;
			}
			/*
			 * Only one bit defined for now
			 */
			if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
				if ((sonpx.npx_flags & SONPX_SETOPTSHUT))
					so->so_flags |= SOF_NPX_SETOPTSHUT;
				else
					so->so_flags &= ~SOF_NPX_SETOPTSHUT;
			}
			break;
		}

		case SO_TRAFFIC_CLASS: {
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error != 0)
				goto out;
			error = so_set_traffic_class(so, optval);
			if (error != 0)
				goto out;
			break;
		}

		case SO_RECV_TRAFFIC_CLASS: {
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error != 0)
				goto out;
			if (optval == 0)
				so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
			else
				so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
			break;
		}

		case SO_TRAFFIC_CLASS_DBG: {
			struct so_tcdbg so_tcdbg;

			error = sooptcopyin(sopt, &so_tcdbg,
			    sizeof (struct so_tcdbg), sizeof (struct so_tcdbg));
			if (error != 0)
				goto out;
			error = so_set_tcdbg(so, &so_tcdbg);
			if (error != 0)
				goto out;
			break;
		}

		case SO_PRIVILEGED_TRAFFIC_CLASS:
			error = priv_check_cred(kauth_cred_get(),
			    PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
			if (error != 0)
				goto out;
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error != 0)
				goto out;
			if (optval == 0)
				so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
			else
				so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
			break;

		case SO_DEFUNCTOK:
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
				if (error == 0)
					error = EBADF;
				goto out;
			}
			/*
			 * Any process can set SO_DEFUNCTOK (clear
			 * SOF_NODEFUNCT), but only root can clear
			 * SO_DEFUNCTOK (set SOF_NODEFUNCT).
			 */
			if (optval == 0 &&
			    kauth_cred_issuser(kauth_cred_get()) == 0) {
				error = EPERM;
				goto out;
			}
			if (optval)
				so->so_flags &= ~SOF_NODEFUNCT;
			else
				so->so_flags |= SOF_NODEFUNCT;

			if (SOCK_DOM(so) == PF_INET ||
			    SOCK_DOM(so) == PF_INET6) {
				char s[MAX_IPv6_STR_LEN];
				char d[MAX_IPv6_STR_LEN];
				struct inpcb *inp = sotoinpcb(so);

				SODEFUNCTLOG(("%s[%d]: so 0x%llx [%s %s:%d -> "
				    "%s:%d] is now marked as %seligible for "
				    "defunct\n", __func__, proc_selfpid(),
				    (uint64_t)VM_KERNEL_ADDRPERM(so),
				    (SOCK_TYPE(so) == SOCK_STREAM) ?
				    "TCP" : "UDP", inet_ntop(SOCK_DOM(so),
				    ((SOCK_DOM(so) == PF_INET) ?
				    (void *)&inp->inp_laddr.s_addr :
				    (void *)&inp->in6p_laddr), s, sizeof (s)),
				    ntohs(inp->in6p_lport),
				    inet_ntop(SOCK_DOM(so),
				    (SOCK_DOM(so) == PF_INET) ?
				    (void *)&inp->inp_faddr.s_addr :
				    (void *)&inp->in6p_faddr, d, sizeof (d)),
				    ntohs(inp->in6p_fport),
				    (so->so_flags & SOF_NODEFUNCT) ?
				    "not " : ""));
			} else {
				SODEFUNCTLOG(("%s[%d]: so 0x%llx [%d,%d] is "
				    "now marked as %seligible for defunct\n",
				    __func__, proc_selfpid(),
				    (uint64_t)VM_KERNEL_ADDRPERM(so),
				    SOCK_DOM(so), SOCK_TYPE(so),
				    (so->so_flags & SOF_NODEFUNCT) ?
				    "not " : ""));
			}
			break;
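		/*
		 * A socket that has actually been defunct'd has had its
		 * buffers flushed and further I/O on it fails (see the
		 * SOF_DEFUNCT check in soreceive() above); SO_DEFUNCTOK
		 * above only controls eligibility, while SO_ISDEFUNCT
		 * below merely reports the current state.
		 */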
3726 "not " : "")); 3727 } 3728 break; 3729 3730 case SO_ISDEFUNCT: 3731 /* This option is not settable */ 3732 error = EINVAL; 3733 break; 3734 3735 case SO_OPPORTUNISTIC: 3736 error = sooptcopyin(sopt, &optval, sizeof (optval), 3737 sizeof (optval)); 3738 if (error == 0) 3739 error = so_set_opportunistic(so, optval); 3740 break; 3741 3742 case SO_FLUSH: 3743 /* This option is handled by lower layer(s) */ 3744 error = 0; 3745 break; 3746 3747 case SO_RECV_ANYIF: 3748 error = sooptcopyin(sopt, &optval, sizeof (optval), 3749 sizeof (optval)); 3750 if (error == 0) 3751 error = so_set_recv_anyif(so, optval); 3752 break; 3753 3754 case SO_TRAFFIC_MGT_BACKGROUND: { 3755 /* This option is handled by lower layer(s) */ 3756 error = 0; 3757 break; 3758 } 3759 3760#if FLOW_DIVERT 3761 case SO_FLOW_DIVERT_TOKEN: 3762 error = flow_divert_token_set(so, sopt); 3763 break; 3764#endif /* FLOW_DIVERT */ 3765 3766 3767 case SO_DELEGATED: 3768 if ((error = sooptcopyin(sopt, &optval, sizeof (optval), 3769 sizeof (optval))) != 0) 3770 break; 3771 3772 error = so_set_effective_pid(so, optval, sopt->sopt_p); 3773 break; 3774 3775 case SO_DELEGATED_UUID: { 3776 uuid_t euuid; 3777 3778 if ((error = sooptcopyin(sopt, &euuid, sizeof (euuid), 3779 sizeof (euuid))) != 0) 3780 break; 3781 3782 error = so_set_effective_uuid(so, euuid, sopt->sopt_p); 3783 break; 3784 } 3785 3786 default: 3787 error = ENOPROTOOPT; 3788 break; 3789 } 3790 if (error == 0 && so->so_proto != NULL && 3791 so->so_proto->pr_ctloutput != NULL) { 3792 (void) so->so_proto->pr_ctloutput(so, sopt); 3793 } 3794 } 3795out: 3796 if (dolock) 3797 socket_unlock(so, 1); 3798 return (error); 3799} 3800 3801/* Helper routines for getsockopt */ 3802int 3803sooptcopyout(struct sockopt *sopt, void *buf, size_t len) 3804{ 3805 int error; 3806 size_t valsize; 3807 3808 error = 0; 3809 3810 /* 3811 * Documented get behavior is that we always return a value, 3812 * possibly truncated to fit in the user's buffer. 3813 * Traditional behavior is that we always tell the user 3814 * precisely how much we copied, rather than something useful 3815 * like the total amount we had available for her. 3816 * Note that this interface is not idempotent; the entire answer must 3817 * generated ahead of time. 
	valsize = min(len, sopt->sopt_valsize);
	sopt->sopt_valsize = valsize;
	if (sopt->sopt_val != USER_ADDR_NULL) {
		if (sopt->sopt_p != kernproc)
			error = copyout(buf, sopt->sopt_val, valsize);
		else
			bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
	}
	return (error);
}

static int
sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
{
	int error;
	size_t len;
	struct user64_timeval tv64;
	struct user32_timeval tv32;
	const void *val;
	size_t valsize;

	error = 0;
	if (proc_is64bit(sopt->sopt_p)) {
		len = sizeof (tv64);
		tv64.tv_sec = tv_p->tv_sec;
		tv64.tv_usec = tv_p->tv_usec;
		val = &tv64;
	} else {
		len = sizeof (tv32);
		tv32.tv_sec = tv_p->tv_sec;
		tv32.tv_usec = tv_p->tv_usec;
		val = &tv32;
	}
	valsize = min(len, sopt->sopt_valsize);
	sopt->sopt_valsize = valsize;
	if (sopt->sopt_val != USER_ADDR_NULL) {
		if (sopt->sopt_p != kernproc)
			error = copyout(val, sopt->sopt_val, valsize);
		else
			bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
	}
	return (error);
}

/*
 * Return:	0			Success
 *		ENOPROTOOPT
 *	<pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
 *	<pr_ctloutput>:???
 *	<sf_getoption>:???
 */
int
sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
{
	int error, optval;
	struct linger l;
	struct timeval tv;
#if CONFIG_MACF_SOCKET
	struct mac extmac;
#endif /* MAC_SOCKET */

	if (sopt->sopt_dir != SOPT_GET)
		sopt->sopt_dir = SOPT_GET;

	if (dolock)
		socket_lock(so, 1);

	error = sflt_getsockopt(so, sopt);
	if (error != 0) {
		if (error == EJUSTRETURN)
			error = 0;
		goto out;
	}

	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto != NULL &&
		    so->so_proto->pr_ctloutput != NULL) {
			error = (*so->so_proto->pr_ctloutput)(so, sopt);
			goto out;
		}
		error = ENOPROTOOPT;
	} else {
		/*
		 * Allow socket-level (SOL_SOCKET) options to be filtered by
		 * the protocol layer, if needed.  A zero value returned from
		 * the handler means use default socket-level processing as
		 * done by the rest of this routine.  Otherwise, any other
		 * return value indicates that the option is unsupported.
		 */
		if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
		    pru_socheckopt(so, sopt)) != 0)
			goto out;

		error = 0;
		switch (sopt->sopt_name) {
		case SO_LINGER:
		case SO_LINGER_SEC:
			l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
			l.l_linger = (sopt->sopt_name == SO_LINGER) ?
			    so->so_linger : so->so_linger / hz;
			error = sooptcopyout(sopt, &l, sizeof (l));
			break;

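		/*
		 * Note that so_linger holds the raw value SO_LINGER
		 * traffics in, while SO_LINGER_SEC scales seconds by hz
		 * on set and divides by hz on get, mirroring the set path
		 * in sosetoptlock() above.
		 */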
		case SO_USELOOPBACK:
		case SO_DONTROUTE:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_TIMESTAMP_MONOTONIC:
		case SO_DONTTRUNC:
		case SO_WANTMORE:
		case SO_WANTOOBFLAG:
			optval = so->so_options & sopt->sopt_name;
integer:
			error = sooptcopyout(sopt, &optval, sizeof (optval));
			break;

		case SO_TYPE:
			optval = so->so_type;
			goto integer;

		case SO_NREAD:
			if (so->so_proto->pr_flags & PR_ATOMIC) {
				int pkt_total;
				struct mbuf *m1;

				pkt_total = 0;
				m1 = so->so_rcv.sb_mb;
				while (m1 != NULL) {
					if (m1->m_type == MT_DATA ||
					    m1->m_type == MT_HEADER ||
					    m1->m_type == MT_OOBDATA)
						pkt_total += m1->m_len;
					m1 = m1->m_next;
				}
				optval = pkt_total;
			} else {
				optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
			}
			goto integer;

		case SO_NWRITE:
			optval = so->so_snd.sb_cc;
			goto integer;

		case SO_ERROR:
			optval = so->so_error;
			so->so_error = 0;
			goto integer;

		case SO_SNDBUF:
			optval = so->so_snd.sb_hiwat;
			goto integer;

		case SO_RCVBUF:
			optval = so->so_rcv.sb_hiwat;
			goto integer;

		case SO_SNDLOWAT:
			optval = so->so_snd.sb_lowat;
			goto integer;

		case SO_RCVLOWAT:
			optval = so->so_rcv.sb_lowat;
			goto integer;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			tv = (sopt->sopt_name == SO_SNDTIMEO ?
			    so->so_snd.sb_timeo : so->so_rcv.sb_timeo);

			error = sooptcopyout_timeval(sopt, &tv);
			break;

		case SO_NOSIGPIPE:
			optval = (so->so_flags & SOF_NOSIGPIPE);
			goto integer;

		case SO_NOADDRERR:
			optval = (so->so_flags & SOF_NOADDRAVAIL);
			goto integer;

		case SO_REUSESHAREUID:
			optval = (so->so_flags & SOF_REUSESHAREUID);
			goto integer;

		case SO_NOTIFYCONFLICT:
			optval = (so->so_flags & SOF_NOTIFYCONFLICT);
			goto integer;

		case SO_RESTRICTIONS:
			optval = so_get_restrictions(so);
			goto integer;

		case SO_LABEL:
#if CONFIG_MACF_SOCKET
			if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
			    sizeof (extmac))) != 0 ||
			    (error = mac_socket_label_get(proc_ucred(
			    sopt->sopt_p), so, &extmac)) != 0)
				break;

			error = sooptcopyout(sopt, &extmac, sizeof (extmac));
#else
			error = EOPNOTSUPP;
#endif /* MAC_SOCKET */
			break;

		case SO_PEERLABEL:
#if CONFIG_MACF_SOCKET
			if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
			    sizeof (extmac))) != 0 ||
			    (error = mac_socketpeer_label_get(proc_ucred(
			    sopt->sopt_p), so, &extmac)) != 0)
				break;

			error = sooptcopyout(sopt, &extmac, sizeof (extmac));
#else
			error = EOPNOTSUPP;
#endif /* MAC_SOCKET */
			break;

#ifdef __APPLE_API_PRIVATE
		case SO_UPCALLCLOSEWAIT:
			optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
			goto integer;
#endif
		case SO_RANDOMPORT:
			optval = (so->so_flags & SOF_BINDRANDOMPORT);
			goto integer;

		case SO_NP_EXTENSIONS: {
			struct so_np_extensions sonpx;

			sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
SONPX_SETOPTSHUT : 0;
			sonpx.npx_mask = SONPX_MASK_VALID;

			error = sooptcopyout(sopt, &sonpx,
			    sizeof (struct so_np_extensions));
			break;
		}

		case SO_TRAFFIC_CLASS:
			optval = so->so_traffic_class;
			goto integer;

		case SO_RECV_TRAFFIC_CLASS:
			optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
			goto integer;

		case SO_TRAFFIC_CLASS_STATS:
			error = sooptcopyout(sopt, &so->so_tc_stats,
			    sizeof (so->so_tc_stats));
			break;

		case SO_TRAFFIC_CLASS_DBG:
			error = sogetopt_tcdbg(so, sopt);
			break;

		case SO_PRIVILEGED_TRAFFIC_CLASS:
			optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
			goto integer;

		case SO_DEFUNCTOK:
			optval = !(so->so_flags & SOF_NODEFUNCT);
			goto integer;

		case SO_ISDEFUNCT:
			optval = (so->so_flags & SOF_DEFUNCT);
			goto integer;

		case SO_OPPORTUNISTIC:
			optval = so_get_opportunistic(so);
			goto integer;

		case SO_FLUSH:
			/* This option is not gettable */
			error = EINVAL;
			break;

		case SO_RECV_ANYIF:
			optval = so_get_recv_anyif(so);
			goto integer;

		case SO_TRAFFIC_MGT_BACKGROUND:
			/* This option is handled by lower layer(s) */
			if (so->so_proto != NULL &&
			    so->so_proto->pr_ctloutput != NULL) {
				(void) so->so_proto->pr_ctloutput(so, sopt);
			}
			break;

#if FLOW_DIVERT
		case SO_FLOW_DIVERT_TOKEN:
			error = flow_divert_token_get(so, sopt);
			break;
#endif /* FLOW_DIVERT */

		default:
			error = ENOPROTOOPT;
			break;
		}
	}
out:
	if (dolock)
		socket_unlock(so, 1);
	return (error);
}
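/*
 * Illustrative user-level sketch (not compiled here) of exercising the
 * SOL_SOCKET get path above.  SO_NREAD reports the bytes immediately
 * readable, as computed in sogetoptlock(); the descriptor name sock_fd
 * is hypothetical.
 *
 *	#include <stdio.h>
 *	#include <sys/socket.h>
 *
 *	int nread = 0;
 *	socklen_t len = sizeof (nread);
 *
 *	if (getsockopt(sock_fd, SOL_SOCKET, SO_NREAD, &nread, &len) == 0)
 *		printf("%d bytes ready to read\n", nread);
 */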
/*
 * The size limits on our soopt_getm are different from those on FreeBSD.
 * We limit the size of options to MCLBYTES.  This will have to change
 * if we need to define options that need more space than MCLBYTES.
 */
int
soopt_getm(struct sockopt *sopt, struct mbuf **mp)
{
	struct mbuf *m, *m_prev;
	int sopt_size = sopt->sopt_valsize;
	int how;

	if (sopt_size <= 0 || sopt_size > MCLBYTES)
		return (EMSGSIZE);

	how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
	MGET(m, how, MT_DATA);
	if (m == NULL)
		return (ENOBUFS);
	if (sopt_size > MLEN) {
		MCLGET(m, how);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return (ENOBUFS);
		}
		m->m_len = min(MCLBYTES, sopt_size);
	} else {
		m->m_len = min(MLEN, sopt_size);
	}
	sopt_size -= m->m_len;
	*mp = m;
	m_prev = m;

	while (sopt_size > 0) {
		MGET(m, how, MT_DATA);
		if (m == NULL) {
			m_freem(*mp);
			return (ENOBUFS);
		}
		if (sopt_size > MLEN) {
			MCLGET(m, how);
			if ((m->m_flags & M_EXT) == 0) {
				m_freem(*mp);
				m_freem(m);
				return (ENOBUFS);
			}
			m->m_len = min(MCLBYTES, sopt_size);
		} else {
			m->m_len = min(MLEN, sopt_size);
		}
		sopt_size -= m->m_len;
		m_prev->m_next = m;
		m_prev = m;
	}
	return (0);
}

/* copyin sopt data into mbuf chain */
int
soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
{
	struct mbuf *m0 = m;

	if (sopt->sopt_val == USER_ADDR_NULL)
		return (0);
	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
		if (sopt->sopt_p != kernproc) {
			int error;

			error = copyin(sopt->sopt_val, mtod(m, char *),
			    m->m_len);
			if (error != 0) {
				m_freem(m0);
				return (error);
			}
		} else {
			bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
			    mtod(m, char *), m->m_len);
		}
		sopt->sopt_valsize -= m->m_len;
		sopt->sopt_val += m->m_len;
		m = m->m_next;
	}
	/* enough space should have been allocated by ip6_sooptmcopyin() */
	if (m != NULL) {
		panic("soopt_mcopyin");
		/* NOTREACHED */
	}
	return (0);
}

/* copyout mbuf chain data into soopt */
int
soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
{
	struct mbuf *m0 = m;
	size_t valsize = 0;

	if (sopt->sopt_val == USER_ADDR_NULL)
		return (0);
	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
		if (sopt->sopt_p != kernproc) {
			int error;

			error = copyout(mtod(m, char *), sopt->sopt_val,
			    m->m_len);
			if (error != 0) {
				m_freem(m0);
				return (error);
			}
		} else {
			bcopy(mtod(m, char *),
			    CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
		}
		sopt->sopt_valsize -= m->m_len;
		sopt->sopt_val += m->m_len;
		valsize += m->m_len;
		m = m->m_next;
	}
	if (m != NULL) {
		/* enough soopt buffer should be given from user-land */
		m_freem(m0);
		return (EINVAL);
	}
	sopt->sopt_valsize = valsize;
	return (0);
}
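/*
 * Illustrative sketch (not compiled here): a protocol ctloutput handler
 * would typically pair the three helpers above as follows.  The handler
 * name my_pr_ctloutput() and the processing step are hypothetical; only
 * the soopt_* calls reflect this file.
 *
 *	static int
 *	my_pr_ctloutput(struct socket *so, struct sockopt *sopt)
 *	{
 *		struct mbuf *m = NULL;
 *		int error;
 *
 *		error = soopt_getm(sopt, &m);	// bounds-check, alloc chain
 *		if (error != 0)
 *			return (error);
 *		error = soopt_mcopyin(sopt, m);	// user buffer -> mbuf chain
 *		if (error != 0)
 *			return (error);		// chain already freed
 *		// ... interpret or rewrite the option data in 'm' ...
 *		error = soopt_mcopyout(sopt, m); // frees chain on failure
 *		if (error == 0)
 *			m_freem(m);
 *		return (error);
 *	}
 */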
void
sohasoutofband(struct socket *so)
{
	if (so->so_pgid < 0)
		gsignal(-so->so_pgid, SIGURG);
	else if (so->so_pgid > 0)
		proc_signal(so->so_pgid, SIGURG);
	selwakeup(&so->so_rcv.sb_sel);
}

int
sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql)
{
#pragma unused(cred)
	struct proc *p = current_proc();
	int revents = 0;

	socket_lock(so, 1);
	so_update_last_owner_locked(so, PROC_NULL);
	so_update_policy(so);

	if (events & (POLLIN | POLLRDNORM))
		if (soreadable(so))
			revents |= events & (POLLIN | POLLRDNORM);

	if (events & (POLLOUT | POLLWRNORM))
		if (sowriteable(so))
			revents |= events & (POLLOUT | POLLWRNORM);

	if (events & (POLLPRI | POLLRDBAND))
		if (so->so_oobmark || (so->so_state & SS_RCVATMARK))
			revents |= events & (POLLPRI | POLLRDBAND);

	if (revents == 0) {
		if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
			/*
			 * Darwin sets the flag first,
			 * BSD calls selrecord first
			 */
			so->so_rcv.sb_flags |= SB_SEL;
			selrecord(p, &so->so_rcv.sb_sel, wql);
		}

		if (events & (POLLOUT | POLLWRNORM)) {
			/*
			 * Darwin sets the flag first,
			 * BSD calls selrecord first
			 */
			so->so_snd.sb_flags |= SB_SEL;
			selrecord(p, &so->so_snd.sb_sel, wql);
		}
	}

	socket_unlock(so, 1);
	return (revents);
}

int
soo_kqfilter(struct fileproc *fp, struct knote *kn, vfs_context_t ctx)
{
#pragma unused(fp)
#if !CONFIG_MACF_SOCKET
#pragma unused(ctx)
#endif /* MAC_SOCKET */
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
	struct klist *skl;

	socket_lock(so, 1);
	so_update_last_owner_locked(so, PROC_NULL);
	so_update_policy(so);

#if CONFIG_MACF_SOCKET
	if (mac_socket_check_kqfilter(proc_ucred(vfs_context_proc(ctx)),
	    kn, so) != 0) {
		socket_unlock(so, 1);
		return (1);
	}
#endif /* MAC_SOCKET */

	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_fop = &soread_filtops;
		skl = &so->so_rcv.sb_sel.si_note;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &sowrite_filtops;
		skl = &so->so_snd.sb_sel.si_note;
		break;
	case EVFILT_SOCK:
		kn->kn_fop = &sock_filtops;
		skl = &so->so_klist;
		break;
	default:
		socket_unlock(so, 1);
		return (1);
	}

	if (KNOTE_ATTACH(skl, kn)) {
		switch (kn->kn_filter) {
		case EVFILT_READ:
			so->so_rcv.sb_flags |= SB_KNOTE;
			break;
		case EVFILT_WRITE:
			so->so_snd.sb_flags |= SB_KNOTE;
			break;
		case EVFILT_SOCK:
			so->so_flags |= SOF_KNOTE;
			break;
		default:
			socket_unlock(so, 1);
			return (1);
		}
	}
	socket_unlock(so, 1);
	return (0);
}

static void
filt_sordetach(struct knote *kn)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;

	socket_lock(so, 1);
	if (so->so_rcv.sb_flags & SB_KNOTE)
		if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn))
			so->so_rcv.sb_flags &= ~SB_KNOTE;
	socket_unlock(so, 1);
}

/*ARGSUSED*/
static int
filt_soread(struct knote *kn, long hint)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;

	if ((hint & SO_FILT_HINT_LOCKED) == 0)
		socket_lock(so, 1);

	if (so->so_options & SO_ACCEPTCONN) {
		int is_not_empty;

		/*
		 * Radar 6615193 handle the listen case dynamically
		 * for kqueue read filter.  This allows listen() to be
		 * called after registering the kqueue EVFILT_READ.
		 */

		kn->kn_data = so->so_qlen;
		is_not_empty = !TAILQ_EMPTY(&so->so_comp);

		if ((hint & SO_FILT_HINT_LOCKED) == 0)
			socket_unlock(so, 1);

		return (is_not_empty);
	}

	/* socket isn't a listener */

	kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;

	if (so->so_oobmark) {
		if (kn->kn_flags & EV_OOBAND) {
			kn->kn_data -= so->so_oobmark;
			if ((hint & SO_FILT_HINT_LOCKED) == 0)
				socket_unlock(so, 1);
			return (1);
		}
		kn->kn_data = so->so_oobmark;
		kn->kn_flags |= EV_OOBAND;
	} else {
		if (so->so_state & SS_CANTRCVMORE) {
			kn->kn_flags |= EV_EOF;
			kn->kn_fflags = so->so_error;
			if ((hint & SO_FILT_HINT_LOCKED) == 0)
				socket_unlock(so, 1);
			return (1);
		}
	}

	if (so->so_state & SS_RCVATMARK) {
		if (kn->kn_flags & EV_OOBAND) {
			if ((hint & SO_FILT_HINT_LOCKED) == 0)
				socket_unlock(so, 1);
			return (1);
		}
		kn->kn_flags |= EV_OOBAND;
	} else if (kn->kn_flags & EV_OOBAND) {
		kn->kn_data = 0;
		if ((hint & SO_FILT_HINT_LOCKED) == 0)
			socket_unlock(so, 1);
		return (0);
	}

	if (so->so_error) {	/* temporary udp error */
		if ((hint & SO_FILT_HINT_LOCKED) == 0)
			socket_unlock(so, 1);
		return (1);
	}

	int64_t lowwat = so->so_rcv.sb_lowat;
	if (kn->kn_sfflags & NOTE_LOWAT) {
		if (kn->kn_sdata > so->so_rcv.sb_hiwat)
			lowwat = so->so_rcv.sb_hiwat;
		else if (kn->kn_sdata > lowwat)
			lowwat = kn->kn_sdata;
	}

	if ((hint & SO_FILT_HINT_LOCKED) == 0)
		socket_unlock(so, 1);

	return ((kn->kn_flags & EV_OOBAND) || kn->kn_data >= lowwat);
}
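/*
 * Illustrative user-level sketch (not compiled here) of what the read
 * filter above services: registering EVFILT_READ with NOTE_LOWAT raises
 * the effective low-water mark that filt_soread() compares kn_data
 * against.  The descriptor names are hypothetical.
 *
 *	#include <sys/event.h>
 *
 *	struct kevent kev;
 *	int kq = kqueue();
 *
 *	// fire only once at least 4KB is ready to read on sock_fd
 *	EV_SET(&kev, sock_fd, EVFILT_READ, EV_ADD, NOTE_LOWAT, 4096, NULL);
 *	(void) kevent(kq, &kev, 1, NULL, 0, NULL);
 */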
static void
filt_sowdetach(struct knote *kn)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
	socket_lock(so, 1);

	if (so->so_snd.sb_flags & SB_KNOTE)
		if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn))
			so->so_snd.sb_flags &= ~SB_KNOTE;
	socket_unlock(so, 1);
}

int
so_wait_for_if_feedback(struct socket *so)
{
	if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
	    (so->so_state & SS_ISCONNECTED)) {
		struct inpcb *inp = sotoinpcb(so);
		if (INP_WAIT_FOR_IF_FEEDBACK(inp))
			return (1);
	}
	return (0);
}

/*ARGSUSED*/
static int
filt_sowrite(struct knote *kn, long hint)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
	int ret = 0;

	if ((hint & SO_FILT_HINT_LOCKED) == 0)
		socket_lock(so, 1);

	kn->kn_data = sbspace(&so->so_snd);
	if (so->so_state & SS_CANTSENDMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		ret = 1;
		goto out;
	}
	if (so->so_error) {	/* temporary udp error */
		ret = 1;
		goto out;
	}
	if (((so->so_state & SS_ISCONNECTED) == 0) &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
		ret = 0;
		goto out;
	}
	int64_t lowwat = so->so_snd.sb_lowat;
	if (kn->kn_sfflags & NOTE_LOWAT) {
		if (kn->kn_sdata > so->so_snd.sb_hiwat)
			lowwat = so->so_snd.sb_hiwat;
		else if (kn->kn_sdata > lowwat)
			lowwat = kn->kn_sdata;
	}
	if (kn->kn_data >= lowwat) {
		if ((so->so_flags & SOF_NOTSENT_LOWAT) != 0) {
			ret = tcp_notsent_lowat_check(so);
		} else {
			ret = 1;
		}
	}
	if (so_wait_for_if_feedback(so))
		ret = 0;
out:
	if ((hint & SO_FILT_HINT_LOCKED) == 0)
		socket_unlock(so, 1);
	return (ret);
}
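/*
 * Illustrative user-level sketch (not compiled here): the SOF_NOTSENT_LOWAT
 * branch in filt_sowrite() above defers to tcp_notsent_lowat_check()
 * rather than raw send-buffer space.  On Darwin that behavior is turned on
 * with the TCP_NOTSENT_LOWAT option; the threshold value and descriptor
 * name are hypothetical.
 *
 *	#include <netinet/tcp.h>
 *	#include <sys/socket.h>
 *
 *	int lowat = 8192;	// unsent-data threshold, in bytes
 *	(void) setsockopt(tcp_fd, IPPROTO_TCP, TCP_NOTSENT_LOWAT,
 *	    &lowat, sizeof (lowat));
 */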
static void
filt_sockdetach(struct knote *kn)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
	socket_lock(so, 1);

	if ((so->so_flags & SOF_KNOTE) != 0)
		if (KNOTE_DETACH(&so->so_klist, kn))
			so->so_flags &= ~SOF_KNOTE;
	socket_unlock(so, 1);
}

static int
filt_sockev(struct knote *kn, long hint)
{
	int ret = 0, locked = 0;
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
	long ev_hint = (hint & SO_FILT_HINT_EV);

	if ((hint & SO_FILT_HINT_LOCKED) == 0) {
		socket_lock(so, 1);
		locked = 1;
	}

	if (ev_hint & SO_FILT_HINT_CONNRESET) {
		if (kn->kn_sfflags & NOTE_CONNRESET)
			kn->kn_fflags |= NOTE_CONNRESET;
	}
	if (ev_hint & SO_FILT_HINT_TIMEOUT) {
		if (kn->kn_sfflags & NOTE_TIMEOUT)
			kn->kn_fflags |= NOTE_TIMEOUT;
	}
	if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
		if (kn->kn_sfflags & NOTE_NOSRCADDR)
			kn->kn_fflags |= NOTE_NOSRCADDR;
	}
	if (ev_hint & SO_FILT_HINT_IFDENIED) {
		if ((kn->kn_sfflags & NOTE_IFDENIED))
			kn->kn_fflags |= NOTE_IFDENIED;
	}
	if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
		if (kn->kn_sfflags & NOTE_KEEPALIVE)
			kn->kn_fflags |= NOTE_KEEPALIVE;
	}
	if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
		if (kn->kn_sfflags & NOTE_ADAPTIVE_WTIMO)
			kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
	}
	if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
		if (kn->kn_sfflags & NOTE_ADAPTIVE_RTIMO)
			kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
	}
	if (ev_hint & SO_FILT_HINT_CONNECTED) {
		if (kn->kn_sfflags & NOTE_CONNECTED)
			kn->kn_fflags |= NOTE_CONNECTED;
	}
	if (ev_hint & SO_FILT_HINT_DISCONNECTED) {
		if (kn->kn_sfflags & NOTE_DISCONNECTED)
			kn->kn_fflags |= NOTE_DISCONNECTED;
	}
	if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
		if (so->so_proto != NULL &&
		    (so->so_proto->pr_flags & PR_EVCONNINFO) &&
		    (kn->kn_sfflags & NOTE_CONNINFO_UPDATED))
			kn->kn_fflags |= NOTE_CONNINFO_UPDATED;
	}

	if ((kn->kn_sfflags & NOTE_READCLOSED) &&
	    (so->so_state & SS_CANTRCVMORE))
		kn->kn_fflags |= NOTE_READCLOSED;

	if ((kn->kn_sfflags & NOTE_WRITECLOSED) &&
	    (so->so_state & SS_CANTSENDMORE))
		kn->kn_fflags |= NOTE_WRITECLOSED;

	if ((kn->kn_sfflags & NOTE_SUSPEND) &&
	    ((ev_hint & SO_FILT_HINT_SUSPEND) ||
	    (so->so_flags & SOF_SUSPENDED))) {
		kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
		kn->kn_fflags |= NOTE_SUSPEND;
	}

	if ((kn->kn_sfflags & NOTE_RESUME) &&
	    ((ev_hint & SO_FILT_HINT_RESUME) ||
	    (so->so_flags & SOF_SUSPENDED) == 0)) {
		kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
		kn->kn_fflags |= NOTE_RESUME;
	}

	if (so->so_error != 0) {
		ret = 1;
		kn->kn_data = so->so_error;
		kn->kn_flags |= EV_EOF;
	} else {
		get_sockev_state(so, (u_int32_t *)&(kn->kn_data));
	}

	if (kn->kn_fflags != 0)
		ret = 1;

	if (locked)
		socket_unlock(so, 1);

	return (ret);
}
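/*
 * Illustrative sketch (not compiled here): EVFILT_SOCK is a private
 * filter, so the constants below come from the kernel's sys/event.h
 * rather than the public SDK.  A watcher interested in the connection
 * lifecycle events serviced by filt_sockev() might register:
 *
 *	struct kevent kev;
 *
 *	EV_SET(&kev, sock_fd, EVFILT_SOCK, EV_ADD,
 *	    NOTE_CONNECTED | NOTE_DISCONNECTED | NOTE_CONNRESET, 0, NULL);
 *	(void) kevent(kq, &kev, 1, NULL, 0, NULL);
 */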
void
get_sockev_state(struct socket *so, u_int32_t *statep)
{
	u_int32_t state = *(statep);

	if (so->so_state & SS_ISCONNECTED)
		state |= SOCKEV_CONNECTED;
	else
		state &= ~(SOCKEV_CONNECTED);
	state |= ((so->so_state & SS_ISDISCONNECTED) ?
	    SOCKEV_DISCONNECTED : 0);
	*(statep) = state;
}

#define	SO_LOCK_HISTORY_STR_LEN \
	(2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)

__private_extern__ const char *
solockhistory_nr(struct socket *so)
{
	size_t n = 0;
	int i;
	static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];

	bzero(lock_history_str, sizeof (lock_history_str));
	for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
		n += snprintf(lock_history_str + n,
		    SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
		    so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
		    so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
	}
	return (lock_history_str);
}

int
socket_lock(struct socket *so, int refcount)
{
	int error = 0;
	void *lr_saved;

	lr_saved = __builtin_return_address(0);

	if (so->so_proto->pr_lock) {
		error = (*so->so_proto->pr_lock)(so, refcount, lr_saved);
	} else {
#ifdef MORE_LOCKING_DEBUG
		lck_mtx_assert(so->so_proto->pr_domain->dom_mtx,
		    LCK_MTX_ASSERT_NOTOWNED);
#endif
		lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
		if (refcount)
			so->so_usecount++;
		so->lock_lr[so->next_lock_lr] = lr_saved;
		so->next_lock_lr = (so->next_lock_lr+1) % SO_LCKDBG_MAX;
	}

	return (error);
}

int
socket_unlock(struct socket *so, int refcount)
{
	int error = 0;
	void *lr_saved;
	lck_mtx_t *mutex_held;

	lr_saved = __builtin_return_address(0);

	if (so->so_proto == NULL) {
		panic("%s: null so_proto so=%p\n", __func__, so);
		/* NOTREACHED */
	}

	if (so->so_proto->pr_unlock) {
		error = (*so->so_proto->pr_unlock)(so, refcount, lr_saved);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
#ifdef MORE_LOCKING_DEBUG
		lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
#endif
		so->unlock_lr[so->next_unlock_lr] = lr_saved;
		so->next_unlock_lr = (so->next_unlock_lr+1) % SO_LCKDBG_MAX;

		if (refcount) {
			if (so->so_usecount <= 0) {
				panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
				    "lrh=%s", __func__, so->so_usecount, so,
				    SOCK_DOM(so), so->so_type,
				    SOCK_PROTO(so), solockhistory_nr(so));
				/* NOTREACHED */
			}

			so->so_usecount--;
			if (so->so_usecount == 0)
				sofreelastref(so, 1);
		}
		lck_mtx_unlock(mutex_held);
	}

	return (error);
}

/* Called with socket locked, will unlock socket */
void
sofree(struct socket *so)
{
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL)
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	else
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);

	sofreelastref(so, 0);
}

void
soreference(struct socket *so)
{
	socket_lock(so, 1);	/* locks & take one reference on socket */
	socket_unlock(so, 0);	/* unlock only */
}

void
sodereference(struct socket *so)
{
	socket_lock(so, 0);
	socket_unlock(so, 1);
}

/*
 * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
 * possibility of using jumbo clusters.  The caller must hold the
 * socket lock.
4787 */ 4788void 4789somultipages(struct socket *so, boolean_t set) 4790{ 4791 if (set) 4792 so->so_flags |= SOF_MULTIPAGES; 4793 else 4794 so->so_flags &= ~SOF_MULTIPAGES; 4795} 4796 4797int 4798so_isdstlocal(struct socket *so) { 4799 4800 struct inpcb *inp = (struct inpcb *)so->so_pcb; 4801 4802 if (SOCK_DOM(so) == PF_INET) 4803 return (inaddr_local(inp->inp_faddr)); 4804 else if (SOCK_DOM(so) == PF_INET6) 4805 return (in6addr_local(&inp->in6p_faddr)); 4806 4807 return (0); 4808} 4809 4810int 4811sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce) 4812{ 4813 struct sockbuf *rcv, *snd; 4814 int err = 0, defunct; 4815 4816 rcv = &so->so_rcv; 4817 snd = &so->so_snd; 4818 4819 defunct = (so->so_flags & SOF_DEFUNCT); 4820 if (defunct) { 4821 if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) { 4822 panic("%s: SB_DROP not set", __func__); 4823 /* NOTREACHED */ 4824 } 4825 goto done; 4826 } 4827 4828 if (so->so_flags & SOF_NODEFUNCT) { 4829 if (noforce) { 4830 err = EOPNOTSUPP; 4831 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) " 4832 "so 0x%llx [%d,%d] is not eligible for defunct " 4833 "(%d)\n", __func__, proc_selfpid(), proc_pid(p), 4834 level, (uint64_t)VM_KERNEL_ADDRPERM(so), 4835 SOCK_DOM(so), SOCK_TYPE(so), err)); 4836 return (err); 4837 } 4838 so->so_flags &= ~SOF_NODEFUNCT; 4839 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx " 4840 "[%d,%d] defunct by force\n", __func__, proc_selfpid(), 4841 proc_pid(p), level, (uint64_t)VM_KERNEL_ADDRPERM(so), 4842 SOCK_DOM(so), SOCK_TYPE(so))); 4843 } 4844 4845 so->so_flags |= SOF_DEFUNCT; 4846 4847 /* Prevent further data from being appended to the socket buffers */ 4848 snd->sb_flags |= SB_DROP; 4849 rcv->sb_flags |= SB_DROP; 4850 4851 /* Flush any existing data in the socket buffers */ 4852 if (rcv->sb_cc != 0) { 4853 rcv->sb_flags &= ~SB_SEL; 4854 selthreadclear(&rcv->sb_sel); 4855 sbrelease(rcv); 4856 } 4857 if (snd->sb_cc != 0) { 4858 snd->sb_flags &= ~SB_SEL; 4859 selthreadclear(&snd->sb_sel); 4860 sbrelease(snd); 4861 } 4862 4863done: 4864 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx [%d,%d] %s " 4865 "defunct\n", __func__, proc_selfpid(), proc_pid(p), level, 4866 (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so), SOCK_TYPE(so), 4867 defunct ? "is already" : "marked as")); 4868 4869 return (err); 4870} 4871 4872int 4873sodefunct(struct proc *p, struct socket *so, int level) 4874{ 4875 struct sockbuf *rcv, *snd; 4876 4877 if (!(so->so_flags & SOF_DEFUNCT)) { 4878 panic("%s improperly called", __func__); 4879 /* NOTREACHED */ 4880 } 4881 if (so->so_state & SS_DEFUNCT) 4882 goto done; 4883 4884 rcv = &so->so_rcv; 4885 snd = &so->so_snd; 4886 4887 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) { 4888 char s[MAX_IPv6_STR_LEN]; 4889 char d[MAX_IPv6_STR_LEN]; 4890 struct inpcb *inp = sotoinpcb(so); 4891 4892 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx [%s " 4893 "%s:%d -> %s:%d] is now defunct [rcv_si 0x%x, snd_si 0x%x, " 4894 "rcv_fl 0x%x, snd_fl 0x%x]\n", __func__, proc_selfpid(), 4895 proc_pid(p), level, (uint64_t)VM_KERNEL_ADDRPERM(so), 4896 (SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP", 4897 inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ? 4898 (void *)&inp->inp_laddr.s_addr : (void *)&inp->in6p_laddr), 4899 s, sizeof (s)), ntohs(inp->in6p_lport), 4900 inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ? 
(void *)&inp->inp_faddr.s_addr : (void *)&inp->in6p_faddr,
		    d, sizeof (d)), ntohs(inp->in6p_fport),
		    (uint32_t)rcv->sb_sel.si_flags,
		    (uint32_t)snd->sb_sel.si_flags,
		    rcv->sb_flags, snd->sb_flags));
	} else {
		SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx "
		    "[%d,%d] is now defunct [rcv_si 0x%x, snd_si 0x%x, "
		    "rcv_fl 0x%x, snd_fl 0x%x]\n", __func__, proc_selfpid(),
		    proc_pid(p), level, (uint64_t)VM_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so), (uint32_t)rcv->sb_sel.si_flags,
		    (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
		    snd->sb_flags));
	}

	/*
	 * Unwedge threads blocked on sbwait() and sb_lock().
	 */
	sbwakeup(rcv);
	sbwakeup(snd);

	if (rcv->sb_flags & SB_LOCK)
		sbunlock(rcv, TRUE);	/* keep socket locked */
	if (snd->sb_flags & SB_LOCK)
		sbunlock(snd, TRUE);	/* keep socket locked */

	/*
	 * Flush the buffers and disconnect.  We explicitly call shutdown
	 * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
	 * states are set for the socket.  This would also flush out data
	 * hanging off the receive list of this socket.
	 */
	(void) soshutdownlock(so, SHUT_RD);
	(void) soshutdownlock(so, SHUT_WR);
	(void) sodisconnectlocked(so);

	/*
	 * Explicitly handle connectionless-protocol disconnection
	 * and release any remaining data in the socket buffers.
	 */
	if (!(so->so_state & SS_ISDISCONNECTED))
		(void) soisdisconnected(so);

	if (so->so_error == 0)
		so->so_error = EBADF;

	if (rcv->sb_cc != 0) {
		rcv->sb_flags &= ~SB_SEL;
		selthreadclear(&rcv->sb_sel);
		sbrelease(rcv);
	}
	if (snd->sb_cc != 0) {
		snd->sb_flags &= ~SB_SEL;
		selthreadclear(&snd->sb_sel);
		sbrelease(snd);
	}
	so->so_state |= SS_DEFUNCT;

done:
	return (0);
}

__private_extern__ int
so_set_recv_anyif(struct socket *so, int optval)
{
	int ret = 0;

#if INET6
	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
#else
	if (SOCK_DOM(so) == PF_INET) {
#endif /* !INET6 */
		if (optval)
			sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
		else
			sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
	}

	return (ret);
}

__private_extern__ int
so_get_recv_anyif(struct socket *so)
{
	int ret = 0;

#if INET6
	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
#else
	if (SOCK_DOM(so) == PF_INET) {
#endif /* !INET6 */
		ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
	}

	return (ret);
}

int
so_set_restrictions(struct socket *so, uint32_t vals)
{
	int nocell_old, nocell_new;
	int ret = 0;

	/*
	 * Deny-type restrictions are trapdoors; once set they cannot be
	 * unset for the lifetime of the socket.  This allows them to be
	 * issued by a framework on behalf of the application without
	 * having to worry that they can be undone.
	 *
	 * Note here that socket-level restrictions override any protocol
	 * level restrictions.  For instance, the SO_RESTRICT_DENY_CELLULAR
	 * socket restriction issued on the socket has a higher precedence
	 * than INP_NO_IFT_CELLULAR.  The latter is affected by the UUID
	 * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
	 * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
	 */
	nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
	so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
	    SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR));
	nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);

	/* other than deny cellular, there's nothing more to do */
	if ((nocell_new - nocell_old) == 0)
		return (ret);

	/* we can only set, not clear restrictions */
	VERIFY((nocell_new - nocell_old) > 0);

#if INET6
	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
#else
	if (SOCK_DOM(so) == PF_INET) {
#endif /* !INET6 */
		/* if deny cellular is now set, do what's needed for INPCB */
		inp_set_nocellular(sotoinpcb(so));
	}

	return (ret);
}

uint32_t
so_get_restrictions(struct socket *so)
{
	return (so->so_restrictions & (SO_RESTRICT_DENY_IN |
	    SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR));
}
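/*
 * Illustrative sketch (not compiled here): the trapdoor semantics above
 * mean a framework can clamp a socket once and rely on the restriction
 * persisting.  SO_RESTRICTIONS is a private option, and the descriptor
 * name below is hypothetical.
 *
 *	uint32_t restrict_flags = SO_RESTRICT_DENY_CELLULAR;
 *
 *	// one-way: after this, the restriction cannot be cleared for
 *	// the lifetime of sock_fd
 *	(void) setsockopt(sock_fd, SOL_SOCKET, SO_RESTRICTIONS,
 *	    &restrict_flags, sizeof (restrict_flags));
 */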
struct sockaddr_entry *
sockaddrentry_alloc(int how)
{
	struct sockaddr_entry *se;

	se = (how == M_WAITOK) ? zalloc(se_zone) : zalloc_noblock(se_zone);
	if (se != NULL)
		bzero(se, se_zone_size);

	return (se);
}

void
sockaddrentry_free(struct sockaddr_entry *se)
{
	if (se->se_addr != NULL) {
		FREE(se->se_addr, M_SONAME);
		se->se_addr = NULL;
	}
	zfree(se_zone, se);
}

struct sockaddr_entry *
sockaddrentry_dup(const struct sockaddr_entry *src_se, int how)
{
	struct sockaddr_entry *dst_se;

	dst_se = sockaddrentry_alloc(how);
	if (dst_se != NULL) {
		int len = src_se->se_addr->sa_len;

		MALLOC(dst_se->se_addr, struct sockaddr *,
		    len, M_SONAME, how | M_ZERO);
		if (dst_se->se_addr != NULL) {
			bcopy(src_se->se_addr, dst_se->se_addr, len);
		} else {
			sockaddrentry_free(dst_se);
			dst_se = NULL;
		}
	}

	return (dst_se);
}

struct sockaddr_list *
sockaddrlist_alloc(int how)
{
	struct sockaddr_list *sl;

	sl = (how == M_WAITOK) ?
zalloc(sl_zone) : zalloc_noblock(sl_zone); 5098 if (sl != NULL) { 5099 bzero(sl, sl_zone_size); 5100 TAILQ_INIT(&sl->sl_head); 5101 } 5102 return (sl); 5103} 5104 5105void 5106sockaddrlist_free(struct sockaddr_list *sl) 5107{ 5108 struct sockaddr_entry *se, *tse; 5109 5110 TAILQ_FOREACH_SAFE(se, &sl->sl_head, se_link, tse) { 5111 sockaddrlist_remove(sl, se); 5112 sockaddrentry_free(se); 5113 } 5114 VERIFY(sl->sl_cnt == 0 && TAILQ_EMPTY(&sl->sl_head)); 5115 zfree(sl_zone, sl); 5116} 5117 5118void 5119sockaddrlist_insert(struct sockaddr_list *sl, struct sockaddr_entry *se) 5120{ 5121 VERIFY(!(se->se_flags & SEF_ATTACHED)); 5122 se->se_flags |= SEF_ATTACHED; 5123 TAILQ_INSERT_TAIL(&sl->sl_head, se, se_link); 5124 sl->sl_cnt++; 5125 VERIFY(sl->sl_cnt != 0); 5126} 5127 5128void 5129sockaddrlist_remove(struct sockaddr_list *sl, struct sockaddr_entry *se) 5130{ 5131 VERIFY(se->se_flags & SEF_ATTACHED); 5132 se->se_flags &= ~SEF_ATTACHED; 5133 VERIFY(sl->sl_cnt != 0); 5134 sl->sl_cnt--; 5135 TAILQ_REMOVE(&sl->sl_head, se, se_link); 5136} 5137 5138struct sockaddr_list * 5139sockaddrlist_dup(const struct sockaddr_list *src_sl, int how) 5140{ 5141 struct sockaddr_entry *src_se, *tse; 5142 struct sockaddr_list *dst_sl; 5143 5144 dst_sl = sockaddrlist_alloc(how); 5145 if (dst_sl == NULL) 5146 return (NULL); 5147 5148 TAILQ_FOREACH_SAFE(src_se, &src_sl->sl_head, se_link, tse) { 5149 struct sockaddr_entry *dst_se; 5150 5151 if (src_se->se_addr == NULL) 5152 continue; 5153 5154 dst_se = sockaddrentry_dup(src_se, how); 5155 if (dst_se == NULL) { 5156 sockaddrlist_free(dst_sl); 5157 return (NULL); 5158 } 5159 5160 sockaddrlist_insert(dst_sl, dst_se); 5161 } 5162 VERIFY(src_sl->sl_cnt == dst_sl->sl_cnt); 5163 5164 return (dst_sl); 5165} 5166 5167int 5168so_set_effective_pid(struct socket *so, int epid, struct proc *p) 5169{ 5170 struct proc *ep = PROC_NULL; 5171 int error = 0; 5172 5173 /* pid 0 is reserved for kernel */ 5174 if (epid == 0) { 5175 error = EINVAL; 5176 goto done; 5177 } 5178 5179 /* 5180 * If this is an in-kernel socket, prevent its delegate 5181 * association from changing unless the socket option is 5182 * coming from within the kernel itself. 5183 */ 5184 if (so->last_pid == 0 && p != kernproc) { 5185 error = EACCES; 5186 goto done; 5187 } 5188 5189 /* 5190 * If this is issued by a process that's recorded as the 5191 * real owner of the socket, or if the pid is the same as 5192 * the process's own pid, then proceed. Otherwise ensure 5193 * that the issuing process has the necessary privileges. 5194 */ 5195 if (epid != so->last_pid || epid != proc_pid(p)) { 5196 if ((error = priv_check_cred(kauth_cred_get(), 5197 PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) { 5198 error = EACCES; 5199 goto done; 5200 } 5201 } 5202 5203 /* Find the process that corresponds to the effective pid */ 5204 if ((ep = proc_find(epid)) == PROC_NULL) { 5205 error = ESRCH; 5206 goto done; 5207 } 5208 5209 /* 5210 * If a process tries to delegate the socket to itself, then 5211 * there's really nothing to do; treat it as a way for the 5212 * delegate association to be cleared. Note that we check 5213 * the passed-in proc rather than calling proc_selfpid(), 5214 * as we need to check the process issuing the socket option 5215 * which could be kernproc. Given that we don't allow 0 for 5216 * effective pid, it means that a delegated in-kernel socket 5217 * stays delegated during its lifetime (which is probably OK.) 
 */
	if (epid == proc_pid(p)) {
		so->so_flags &= ~SOF_DELEGATED;
		so->e_upid = 0;
		so->e_pid = 0;
		uuid_clear(so->e_uuid);
	} else {
		so->so_flags |= SOF_DELEGATED;
		so->e_upid = proc_uniqueid(ep);
		so->e_pid = proc_pid(ep);
		proc_getexecutableuuid(ep, so->e_uuid, sizeof (so->e_uuid));
	}

done:
	if (error == 0 && net_io_policy_log) {
		uuid_string_t buf;

		uuid_unparse(so->e_uuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
		    "euuid %s%s\n", __func__, proc_name_address(p),
		    proc_pid(p), (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), so->e_pid, proc_name_address(ep), buf,
		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
	} else if (error != 0 && net_io_policy_log) {
		log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
		    "ERROR (%d)\n", __func__, proc_name_address(p),
		    proc_pid(p), (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), epid, (ep == PROC_NULL) ? "PROC_NULL" :
		    proc_name_address(ep), error);
	}

	if (ep != PROC_NULL)
		proc_rele(ep);

	return (error);
}

int
so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p)
{
	uuid_string_t buf;
	uuid_t uuid;
	int error = 0;

	/* UUID must not be all-zeroes (reserved for kernel) */
	if (uuid_is_null(euuid)) {
		error = EINVAL;
		goto done;
	}

	/*
	 * If this is an in-kernel socket, prevent its delegate
	 * association from changing unless the socket option is
	 * coming from within the kernel itself.
	 */
	if (so->last_pid == 0 && p != kernproc) {
		error = EACCES;
		goto done;
	}

	/* Get the UUID of the issuing process */
	proc_getexecutableuuid(p, uuid, sizeof (uuid));

	/*
	 * If this is issued by a process that's recorded as the
	 * real owner of the socket, or if the uuid is the same as
	 * the process's own uuid, then proceed.  Otherwise ensure
	 * that the issuing process has the necessary privileges.
	 */
	if (uuid_compare(euuid, so->last_uuid) != 0 ||
	    uuid_compare(euuid, uuid) != 0) {
		if ((error = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
			error = EACCES;
			goto done;
		}
	}

	/*
	 * If a process tries to delegate the socket to itself, then
	 * there's really nothing to do; treat it as a way for the
	 * delegate association to be cleared.  Note that we check
	 * the uuid of the passed-in proc rather than that of the
	 * current process, as we need to check the process issuing
	 * the socket option which could be kernproc itself.  Given
	 * that we don't allow 0 for effective uuid, it means that
	 * a delegated in-kernel socket stays delegated during its
	 * lifetime (which is okay.)
	 */
	if (uuid_compare(euuid, uuid) == 0) {
		so->so_flags &= ~SOF_DELEGATED;
		so->e_upid = 0;
		so->e_pid = 0;
		uuid_clear(so->e_uuid);
	} else {
		so->so_flags |= SOF_DELEGATED;
		/*
		 * Unlike so_set_effective_pid(), we only have the UUID
		 * here and the process ID is not known.  Inherit the
		 * real {pid,upid} of the socket.
		 */
		so->e_upid = so->last_upid;
		so->e_pid = so->last_pid;
		uuid_copy(so->e_uuid, euuid);
	}

done:
	if (error == 0 && net_io_policy_log) {
		uuid_unparse(so->e_uuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
		    "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
		    (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), so->e_pid, buf,
		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
	} else if (error != 0 && net_io_policy_log) {
		uuid_unparse(euuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
		    "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
		    (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), buf, error);
	}

	return (error);
}
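/*
 * Illustrative sketch (not compiled here): the two helpers above back the
 * private SO_DELEGATED and SO_DELEGATED_UUID socket options.  A network
 * proxy opening sockets on behalf of another process (the pid shown is
 * hypothetical) would delegate as follows; without the
 * PRIV_NET_PRIVILEGED_SOCKET_DELEGATE privilege the call fails with
 * EACCES.
 *
 *	pid_t epid = 1234;	// the app actually responsible for traffic
 *
 *	(void) setsockopt(sock_fd, SOL_SOCKET, SO_DELEGATED,
 *	    &epid, sizeof (epid));
 */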
void
netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
    uint32_t ev_datalen)
{
	struct kev_msg ev_msg;

	/*
	 * A netpolicy event always starts with a netpolicy_event_data
	 * structure, but the caller can provide for a longer event
	 * structure to post, depending on the event code.
	 */
	VERIFY(ev_data != NULL && ev_datalen >= sizeof (*ev_data));

	bzero(&ev_msg, sizeof (ev_msg));
	ev_msg.vendor_code = KEV_VENDOR_APPLE;
	ev_msg.kev_class = KEV_NETWORK_CLASS;
	ev_msg.kev_subclass = KEV_NETPOLICY_SUBCLASS;
	ev_msg.event_code = ev_code;

	ev_msg.dv[0].data_ptr = ev_data;
	ev_msg.dv[0].data_length = ev_datalen;

	kev_post_msg(&ev_msg);
}
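/*
 * Illustrative sketch (not compiled here): a caller posts a netpolicy
 * event by handing netpolicy_post_msg() at least the common
 * netpolicy_event_data header.  The struct and event-code names below
 * are assumed to match the kernel's sys/kern_event.h; treat them as an
 * assumption rather than a guaranteed interface.
 *
 *	struct kev_netpolicy_ifdenied ev_ifdenied;
 *
 *	bzero(&ev_ifdenied, sizeof (ev_ifdenied));
 *	// ... fill in ev_ifdenied.ev_data (the common header) ...
 *	netpolicy_post_msg(KEV_NETPOLICY_IFDENIED, &ev_ifdenied.ev_data,
 *	    sizeof (ev_ifdenied));
 */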