1/* $OpenBSD: uipc_socket.c,v 1.335 2024/05/17 19:11:14 mvs Exp $ */ 2/* $NetBSD: uipc_socket.c,v 1.21 1996/02/04 02:17:52 christos Exp $ */ 3 4/* 5 * Copyright (c) 1982, 1986, 1988, 1990, 1993 6 * The Regents of the University of California. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/event.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/unpcb.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/pool.h>
#include <sys/atomic.h>
#include <sys/rwlock.h>
#include <sys/time.h>
#include <sys/refcnt.h>

#ifdef DDB
#include <machine/db_machdep.h>
#endif

void	sbsync(struct sockbuf *, struct mbuf *);

int	sosplice(struct socket *, int, off_t, struct timeval *);
void	sounsplice(struct socket *, struct socket *, int);
void	soidle(void *);
void	sotask(void *);
void	soreaper(void *);
void	soput(void *);
int	somove(struct socket *, int);
void	sorflush(struct socket *);

void	filt_sordetach(struct knote *kn);
int	filt_soread(struct knote *kn, long hint);
void	filt_sowdetach(struct knote *kn);
int	filt_sowrite(struct knote *kn, long hint);
int	filt_soexcept(struct knote *kn, long hint);

int	filt_sowmodify(struct kevent *kev, struct knote *kn);
int	filt_sowprocess(struct knote *kn, struct kevent *kev);

int	filt_sormodify(struct kevent *kev, struct knote *kn);
int	filt_sorprocess(struct knote *kn, struct kevent *kev);

const struct filterops soread_filtops = {
	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach	= NULL,
	.f_detach	= filt_sordetach,
	.f_event	= filt_soread,
	.f_modify	= filt_sormodify,
	.f_process	= filt_sorprocess,
};

const struct filterops sowrite_filtops = {
	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach	= NULL,
	.f_detach	= filt_sowdetach,
	.f_event	= filt_sowrite,
	.f_modify	= filt_sowmodify,
	.f_process	= filt_sowprocess,
};

const struct filterops soexcept_filtops = {
	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach	= NULL,
	.f_detach	= filt_sordetach,
	.f_event	= filt_soexcept,
	.f_modify	= filt_sormodify,
	.f_process	= filt_sorprocess,
};

#ifndef SOMINCONN
#define SOMINCONN	80
#endif /* SOMINCONN */

/* Upper and lower clamps applied to the listen(2) backlog. */
int	somaxconn = SOMAXCONN;
int	sominconn = SOMINCONN;

struct pool socket_pool;
#ifdef SOCKET_SPLICE
struct pool sosplice_pool;
struct taskq *sosplice_taskq;
struct rwlock sosplice_lock = RWLOCK_INITIALIZER("sosplicelk");
#endif

/*
 * Initialize the pools backing socket (and, if enabled, splice)
 * allocations.  Called once during kernel startup.
 */
void
soinit(void)
{
	pool_init(&socket_pool, sizeof(struct socket), 0, IPL_SOFTNET, 0,
	    "sockpl", NULL);
#ifdef SOCKET_SPLICE
	pool_init(&sosplice_pool, sizeof(struct sosplice), 0, IPL_SOFTNET, 0,
	    "sosppl", NULL);
#endif
}

/*
 * Allocate a zeroed socket and initialize its locks, klists, sigio
 * state and accept queues.  `wait' is M_WAIT or M_NOWAIT; returns
 * NULL only in the M_NOWAIT case when the pool is exhausted.
 */
struct socket *
soalloc(const struct protosw *prp, int wait)
{
	const struct domain *dp = prp->pr_domain;
	struct socket *so;

	so = pool_get(&socket_pool, (wait == M_WAIT ? PR_WAITOK : PR_NOWAIT) |
	    PR_ZERO);
	if (so == NULL)
		return (NULL);
	rw_init_flags(&so->so_lock, dp->dom_name, RWL_DUPOK);
	refcnt_init(&so->so_refcnt);
	rw_init(&so->so_rcv.sb_lock, "sbufrcv");
	rw_init(&so->so_snd.sb_lock, "sbufsnd");
	mtx_init_flags(&so->so_rcv.sb_mtx, IPL_MPFLOOR, "sbrcv", 0);
	mtx_init_flags(&so->so_snd.sb_mtx, IPL_MPFLOOR, "sbsnd", 0);
	klist_init_mutex(&so->so_rcv.sb_klist, &so->so_rcv.sb_mtx);
	klist_init_mutex(&so->so_snd.sb_klist, &so->so_snd.sb_mtx);
	sigio_init(&so->so_sigio);
	TAILQ_INIT(&so->so_q0);
	TAILQ_INIT(&so->so_q);

	/*
	 * Per-protocol opt-in: SB_MTXLOCK marks a socket buffer as
	 * protected by its own sb_mtx rather than by the socket lock
	 * alone.
	 */
	switch (dp->dom_family) {
	case AF_INET:
	case AF_INET6:
		switch (prp->pr_type) {
		case SOCK_RAW:
			so->so_snd.sb_flags |= SB_MTXLOCK;
			/* FALLTHROUGH */
		case SOCK_DGRAM:
			so->so_rcv.sb_flags |= SB_MTXLOCK;
			break;
		}
		break;
	case AF_KEY:
	case AF_UNIX:
		so->so_snd.sb_flags |= SB_MTXLOCK;
		so->so_rcv.sb_flags |= SB_MTXLOCK;
		break;
	}

	return (so);
}

/*
 * Socket operation routines.
 * These routines are called by the routines in
 * sys_socket.c or from a system process, and
 * implement the semantics of socket operations by
 * switching out to the protocol specific routines.
 */
int
socreate(int dom, struct socket **aso, int type, int proto)
{
	struct proc *p = curproc;		/* XXX */
	const struct protosw *prp;
	struct socket *so;
	int error;

	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);
	if (prp == NULL || prp->pr_usrreqs == NULL)
		return (EPROTONOSUPPORT);
	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = soalloc(prp, M_WAIT);
	so->so_type = type;
	if (suser(p) == 0)
		so->so_state = SS_PRIV;
	/* Record creator credentials and pid for later inspection. */
	so->so_ruid = p->p_ucred->cr_ruid;
	so->so_euid = p->p_ucred->cr_uid;
	so->so_rgid = p->p_ucred->cr_rgid;
	so->so_egid = p->p_ucred->cr_gid;
	so->so_cpid = p->p_p->ps_pid;
	so->so_proto = prp;
	so->so_snd.sb_timeo_nsecs = INFSLP;
	so->so_rcv.sb_timeo_nsecs = INFSLP;

	solock(so);
	error = pru_attach(so, proto, M_WAIT);
	if (error) {
		so->so_state |= SS_NOFDREF;
		/* sofree() calls sounlock(). */
		sofree(so, 0);
		return (error);
	}
	sounlock(so);
	*aso = so;
	return (0);
}

/*
 * Bind a name to the socket.  Caller must hold the socket lock
 * (asserted); the work is delegated to the protocol.
 */
int
sobind(struct socket *so, struct mbuf *nam, struct proc *p)
{
	soassertlocked(so);
	return pru_bind(so, nam, p);
}

/*
 * Mark the socket as accepting connections and set the backlog,
 * clamped into [sominconn, somaxconn].
 */
int
solisten(struct socket *so, int backlog)
{
	int somaxconn_local = READ_ONCE(somaxconn);
	int sominconn_local = READ_ONCE(sominconn);
	int error;

	switch (so->so_type) {
	case SOCK_STREAM:
	case SOCK_SEQPACKET:
		break;
	default:
		return (EOPNOTSUPP);
	}

	soassertlocked(so);

	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING))
		return (EINVAL);
#ifdef SOCKET_SPLICE
	if (isspliced(so) || issplicedback(so))
		return (EOPNOTSUPP);
#endif /* SOCKET_SPLICE */
	error = pru_listen(so);
	if (error)
		return (error);
	if (TAILQ_FIRST(&so->so_q) == NULL)
		so->so_options |= SO_ACCEPTCONN;
	if (backlog < 0 || backlog > somaxconn_local)
		backlog = somaxconn_local;
	if (backlog < sominconn_local)
		backlog = sominconn_local;
	so->so_qlimit = backlog;
	return (0);
}

#define SOSP_FREEING_READ	1
#define SOSP_FREEING_WRITE	2
/*
 * Tear down and release a socket that has no file descriptor
 * reference and no PCB.  On entry the socket lock is held; it is
 * dropped on return unless `keep_lock' is set.  Sockets still on a
 * listener's accept queue (so_q) are left alone.
 */
void
sofree(struct socket *so, int keep_lock)
{
	int persocket = solock_persocket(so);

	soassertlocked(so);

	if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) {
		if (!keep_lock)
			sounlock(so);
		return;
	}
	if (so->so_head) {
		struct socket *head = so->so_head;

		/*
		 * We must not decommission a socket that's on the accept(2)
		 * queue.  If we do, then accept(2) may hang after select(2)
		 * indicated that the listening socket was ready.
		 */
		if (so->so_onq == &head->so_q) {
			if (!keep_lock)
				sounlock(so);
			return;
		}

		if (persocket) {
			/*
			 * Concurrent close of `head' could
			 * abort `so' due to re-lock.
			 */
			soref(so);
			soref(head);
			sounlock(so);
			solock(head);
			solock(so);

			/* Re-check after the lock dance. */
			if (so->so_onq != &head->so_q0) {
				sounlock(head);
				sounlock(so);
				sorele(head);
				sorele(so);
				return;
			}

			sorele(head);
			sorele(so);
		}

		soqremque(so, 0);

		if (persocket)
			sounlock(head);
	}

	if (persocket) {
		/* Wait for all other references to drain. */
		sounlock(so);
		refcnt_finalize(&so->so_refcnt, "sofinal");
		solock(so);
	}

	sigio_free(&so->so_sigio);
	klist_free(&so->so_rcv.sb_klist);
	klist_free(&so->so_snd.sb_klist);
#ifdef SOCKET_SPLICE
	if (issplicedback(so)) {
		int freeing = SOSP_FREEING_WRITE;

		if (so->so_sp->ssp_soback == so)
			freeing |= SOSP_FREEING_READ;
		sounsplice(so->so_sp->ssp_soback, so, freeing);
	}
	if (isspliced(so)) {
		int freeing = SOSP_FREEING_READ;

		if (so == so->so_sp->ssp_socket)
			freeing |= SOSP_FREEING_WRITE;
		sounsplice(so, so->so_sp->ssp_socket, freeing);
	}
#endif /* SOCKET_SPLICE */

	mtx_enter(&so->so_snd.sb_mtx);
	sbrelease(so, &so->so_snd);
	mtx_leave(&so->so_snd.sb_mtx);

	/*
	 * Unlocked dispose and cleanup is safe.  Socket is unlinked
	 * from everywhere.  Even concurrent sotask() thread will not
	 * call somove().
	 */
	if (so->so_proto->pr_flags & PR_RIGHTS &&
	    so->so_proto->pr_domain->dom_dispose)
		(*so->so_proto->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
	m_purge(so->so_rcv.sb_mb);

	if (!keep_lock)
		sounlock(so);

#ifdef SOCKET_SPLICE
	if (so->so_sp) {
		/* Reuse splice idle, sounsplice() has been called before. */
		timeout_set_proc(&so->so_sp->ssp_idleto, soreaper, so);
		timeout_add(&so->so_sp->ssp_idleto, 0);
	} else
#endif /* SOCKET_SPLICE */
	{
		pool_put(&socket_pool, so);
	}
}

/* Convert the SO_LINGER interval to nanoseconds; 0 means wait forever. */
static inline uint64_t
solinger_nsec(struct socket *so)
{
	if (so->so_linger == 0)
		return INFSLP;

	return SEC_TO_NSEC(so->so_linger);
}

/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 */
int
soclose(struct socket *so, int flags)
{
	struct socket *so2;
	int error = 0;

	solock(so);
	/* Revoke async IO early. There is a final revocation in sofree(). */
	sigio_free(&so->so_sigio);
	if (so->so_state & SS_ISCONNECTED) {
		if (so->so_pcb == NULL)
			goto discard;
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (flags & MSG_DONTWAIT))
				goto drop;
			while (so->so_state & SS_ISCONNECTED) {
				error = sosleep_nsec(so, &so->so_timeo,
				    PSOCK | PCATCH, "netcls",
				    solinger_nsec(so));
				if (error)
					break;
			}
		}
	}
drop:
	if (so->so_pcb) {
		int error2;
		error2 = pru_detach(so);
		if (error == 0)
			error = error2;
	}
	if (so->so_options & SO_ACCEPTCONN) {
		/* Abort every connection still queued on this listener. */
		int persocket = solock_persocket(so);

		while ((so2 = TAILQ_FIRST(&so->so_q0)) != NULL) {
			if (persocket)
				solock(so2);
			(void) soqremque(so2, 0);
			if (persocket)
				sounlock(so);
			soabort(so2);
			if (persocket)
				solock(so);
		}
		while ((so2 = TAILQ_FIRST(&so->so_q)) != NULL) {
			if (persocket)
				solock(so2);
			(void) soqremque(so2, 1);
			if (persocket)
				sounlock(so);
			soabort(so2);
			if (persocket)
				solock(so);
		}
	}
discard:
	if (so->so_state & SS_NOFDREF)
		panic("soclose NOFDREF: so %p, so_type %d", so, so->so_type);
	so->so_state |= SS_NOFDREF;
	/* sofree() calls sounlock(). */
	sofree(so, 0);
	return (error);
}

/* Forcibly abort a connection; caller holds the socket lock. */
void
soabort(struct socket *so)
{
	soassertlocked(so);
	pru_abort(so);
}

/*
 * Accept a queued connection, returning the peer name in `nam'.
 * The socket must carry SS_NOFDREF on entry (asserted by panic).
 */
int
soaccept(struct socket *so, struct mbuf *nam)
{
	int error = 0;

	soassertlocked(so);

	if ((so->so_state & SS_NOFDREF) == 0)
		panic("soaccept !NOFDREF: so %p, so_type %d", so, so->so_type);
	so->so_state &= ~SS_NOFDREF;
	if ((so->so_state & SS_ISDISCONNECTED) == 0 ||
	    (so->so_proto->pr_flags & PR_ABRTACPTDIS) == 0)
		error = pru_accept(so, nam);
	else
		error = ECONNABORTED;
	return (error);
}

int
soconnect(struct socket *so, struct mbuf *nam)
{
	int error;

	soassertlocked(so);

	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnect(so))))
		error = EISCONN;
	else
		error = pru_connect(so, nam);
	return (error);
}

/*
 * Connect two sockets to each other (socketpair(2) style).  Takes
 * both locks when per-socket locking is in effect.
 */
int
soconnect2(struct socket *so1, struct socket *so2)
{
	int persocket, error;

	if ((persocket = solock_persocket(so1)))
		solock_pair(so1, so2);
	else
		solock(so1);

	error = pru_connect2(so1, so2);

	if (persocket)
		sounlock(so2);
	sounlock(so1);
	return (error);
}

int
sodisconnect(struct socket *so)
{
	int error;

	soassertlocked(so);

	if ((so->so_state & SS_ISCONNECTED) == 0)
		return (ENOTCONN);
	if (so->so_state & SS_ISDISCONNECTING)
		return (EALREADY);
	error = pru_disconnect(so);
	return (error);
}

int m_getuio(struct mbuf **, int, long, struct uio *);

#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
/*
 * Send on a socket.
 * If send must go all at once and message is larger than
 * send buffering, then hard error.
 * Lock against other senders.
 * If must go all at once and not enough room now, then
 * inform user that this would block and do nothing.
 * Otherwise, if nonblocking, send as much as possible.
 * The data to be sent is described by "uio" if nonzero,
 * otherwise by the mbuf chain "top" (which must be null
 * if uio is not). Data provided in mbuf chain must be small
 * enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers
 * must check for short counts if EINTR/ERESTART are returned.
 * Data and control buffers are freed on return.
 */
int
sosend(struct socket *so, struct mbuf *addr, struct uio *uio, struct mbuf *top,
    struct mbuf *control, int flags)
{
	long space, clen = 0;
	size_t resid;
	int error;
	int atomic = sosendallatonce(so) || top;
	/* Only take the socket lock when sb_mtx does not protect sb_snd. */
	int dosolock = ((so->so_snd.sb_flags & SB_MTXLOCK) == 0);

	if (uio)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/* MSG_EOR on a SOCK_STREAM socket is invalid. */
	if (so->so_type == SOCK_STREAM && (flags & MSG_EOR)) {
		m_freem(top);
		m_freem(control);
		return (EINVAL);
	}
	if (uio && uio->uio_procp)
		uio->uio_procp->p_ru.ru_msgsnd++;
	if (control) {
		/*
		 * In theory clen should be unsigned (since control->m_len is).
		 * However, space must be signed, as it might be less than 0
		 * if we over-committed, and we must use a signed comparison
		 * of space and clen.
		 */
		clen = control->m_len;
		/* reserve extra space for AF_UNIX's internalize */
		if (so->so_proto->pr_domain->dom_family == AF_UNIX &&
		    clen >= CMSG_ALIGN(sizeof(struct cmsghdr)) &&
		    mtod(control, struct cmsghdr *)->cmsg_type == SCM_RIGHTS)
			clen = CMSG_SPACE(
			    (clen - CMSG_ALIGN(sizeof(struct cmsghdr))) *
			    (sizeof(struct fdpass) / sizeof(int)));
	}

#define	snderr(errno)	{ error = errno; goto release; }

restart:
	if ((error = sblock(&so->so_snd, SBLOCKWAIT(flags))) != 0)
		goto out;
	if (dosolock)
		solock_shared(so);
	sb_mtx_lock(&so->so_snd);
	so->so_snd.sb_state |= SS_ISSENDING;
	do {
		if (so->so_snd.sb_state & SS_CANTSENDMORE)
			snderr(EPIPE);
		if ((error = READ_ONCE(so->so_error))) {
			so->so_error = 0;
			snderr(error);
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
				if (!(resid == 0 && clen != 0))
					snderr(ENOTCONN);
			} else if (addr == NULL)
				snderr(EDESTADDRREQ);
		}
		space = sbspace(so, &so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if (so->so_proto->pr_domain->dom_family == AF_UNIX) {
			if (atomic && resid > so->so_snd.sb_hiwat)
				snderr(EMSGSIZE);
		} else {
			if (clen > so->so_snd.sb_hiwat ||
			    (atomic && resid > so->so_snd.sb_hiwat - clen))
				snderr(EMSGSIZE);
		}
		/* Not enough room: either fail (nonblocking) or sleep. */
		if (space < clen ||
		    (space - clen < resid &&
		    (atomic || space < so->so_snd.sb_lowat))) {
			if (flags & MSG_DONTWAIT)
				snderr(EWOULDBLOCK);
			sbunlock(&so->so_snd);
			error = sbwait(so, &so->so_snd);
			so->so_snd.sb_state &= ~SS_ISSENDING;
			sb_mtx_unlock(&so->so_snd);
			if (dosolock)
				sounlock_shared(so);
			if (error)
				goto out;
			goto restart;
		}
		space -= clen;
		do {
			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else {
				/* Drop locks while copying from userland. */
				sb_mtx_unlock(&so->so_snd);
				if (dosolock)
					sounlock_shared(so);
				error = m_getuio(&top, atomic, space, uio);
				if (dosolock)
					solock_shared(so);
				sb_mtx_lock(&so->so_snd);
				if (error)
					goto release;
				space -= top->m_pkthdr.len;
				resid = uio->uio_resid;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			}
			if (resid == 0)
				so->so_snd.sb_state &= ~SS_ISSENDING;
			if (top && so->so_options & SO_ZEROIZE)
				top->m_flags |= M_ZEROIZE;
			sb_mtx_unlock(&so->so_snd);
			if (!dosolock)
				solock_shared(so);
			if (flags & MSG_OOB)
				error = pru_sendoob(so, top, addr, control);
			else
				error = pru_send(so, top, addr, control);
			if (!dosolock)
				sounlock_shared(so);
			sb_mtx_lock(&so->so_snd);
			/* The protocol consumed top and control. */
			clen = 0;
			control = NULL;
			top = NULL;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	so->so_snd.sb_state &= ~SS_ISSENDING;
	sb_mtx_unlock(&so->so_snd);
	if (dosolock)
		sounlock_shared(so);
	sbunlock(&so->so_snd);
out:
	m_freem(top);
	m_freem(control);
	return (error);
}

/*
 * Build an mbuf chain in *mp from up to `space' bytes of the uio.
 * With `atomic' set, room is left in the first mbuf for protocol
 * headers.  Returns 0 or the uiomove(9) error (chain freed on error).
 */
int
m_getuio(struct mbuf **mp, int atomic, long space, struct uio *uio)
{
	struct mbuf *m, *top = NULL;
	struct mbuf **nextp = &top;
	u_long len, mlen;
	size_t resid = uio->uio_resid;
	int error;

	do {
		if (top == NULL) {
			MGETHDR(m, M_WAIT, MT_DATA);
			mlen = MHLEN;
			m->m_pkthdr.len = 0;
			m->m_pkthdr.ph_ifidx = 0;
		} else {
			MGET(m, M_WAIT, MT_DATA);
			mlen = MLEN;
		}
		/* chain mbuf together */
		*nextp = m;
		nextp = &m->m_next;

		resid = ulmin(resid, space);
		if (resid >= MINCLSIZE) {
			/* Try a large cluster first, then a regular one. */
			MCLGETL(m, M_NOWAIT, ulmin(resid, MAXMCLBYTES));
			if ((m->m_flags & M_EXT) == 0)
				MCLGETL(m, M_NOWAIT, MCLBYTES);
			if ((m->m_flags & M_EXT) == 0)
				goto nopages;
			mlen = m->m_ext.ext_size;
			len = ulmin(mlen, resid);
			/*
			 * For datagram protocols, leave room
			 * for protocol headers in first mbuf.
			 */
			if (atomic && m == top && len < mlen - max_hdr)
				m->m_data += max_hdr;
		} else {
nopages:
			len = ulmin(mlen, resid);
			/*
			 * For datagram protocols, leave room
			 * for protocol headers in first mbuf.
			 */
			if (atomic && m == top && len < mlen - max_hdr)
				m_align(m, len);
		}

		error = uiomove(mtod(m, caddr_t), len, uio);
		if (error) {
			m_freem(top);
			return (error);
		}

		/* adjust counters */
		resid = uio->uio_resid;
		space -= len;
		m->m_len = len;
		top->m_pkthdr.len += len;

		/* Is there more space and more data? */
	} while (space > 0 && resid > 0);

	*mp = top;
	return 0;
}

/*
 * Following replacement or removal of the first mbuf on the first
 * mbuf chain of a socket buffer, push necessary state changes back
 * into the socket buffer so that other consumers see the values
 * consistently.  'nextrecord' is the callers locally stored value of
 * the original value of sb->sb_mb->m_nextpkt which must be restored
 * when the lead mbuf changes.  NOTE: 'nextrecord' may be NULL.
 */
void
sbsync(struct sockbuf *sb, struct mbuf *nextrecord)
{

	/*
	 * First, update for the new value of nextrecord.  If necessary,
	 * make it the first record.
	 */
	if (sb->sb_mb != NULL)
		sb->sb_mb->m_nextpkt = nextrecord;
	else
		sb->sb_mb = nextrecord;

	/*
	 * Now update any dependent socket buffer fields to reflect
	 * the new state.  This is an inline of SB_EMPTY_FIXUP, with
	 * the addition of a second clause that takes care of the
	 * case where sb_mb has been updated, but remains the last
	 * record.
	 */
	if (sb->sb_mb == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (sb->sb_mb->m_nextpkt == NULL)
		sb->sb_lastrecord = sb->sb_mb;
}

/*
 * Implement receive operations on a socket.
 * We depend on the way that records are added to the sockbuf
 * by sbappend*.  In particular, each record (mbufs linked through m_next)
 * must begin with an address if the protocol so specifies,
 * followed by an optional mbuf or mbufs containing ancillary data,
 * and then zero or more mbufs of data.
 * In order to avoid blocking network for the entire time here, we release
 * the solock() while doing the actual copy to user space.
 * Although the sockbuf is locked, new data may still be appended,
 * and thus we must maintain consistency of the sockbuf during that time.
 *
 * The caller may receive the data as a single mbuf chain by supplying
 * an mbuf **mp0 for use in returning the chain.  The uio is then used
 * only for the count in uio_resid.
 */
int
soreceive(struct socket *so, struct mbuf **paddr, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp,
    socklen_t controllen)
{
	struct mbuf *m, **mp;
	struct mbuf *cm;
	u_long len, offset, moff;
	int flags, error, error2, type, uio_error = 0;
	const struct protosw *pr = so->so_proto;
	struct mbuf *nextrecord;
	size_t resid, orig_resid = uio->uio_resid;
	/* Only take the socket lock when sb_mtx does not protect sb_rcv. */
	int dosolock = ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0);

	mp = mp0;
	if (paddr)
		*paddr = NULL;
	if (controlp)
		*controlp = NULL;
	if (flagsp)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB) {
		/* Out-of-band data is fetched separately via the protocol. */
		m = m_get(M_WAIT, MT_DATA);
		solock(so);
		error = pru_rcvoob(so, m, flags & MSG_PEEK);
		sounlock(so);
		if (error)
			goto bad;
		do {
			error = uiomove(mtod(m, caddr_t),
			    ulmin(uio->uio_resid, m->m_len), uio);
			m = m_free(m);
		} while (uio->uio_resid && error == 0 && m);
bad:
		m_freem(m);
		return (error);
	}
	if (mp)
		*mp = NULL;

restart:
	if ((error = sblock(&so->so_rcv, SBLOCKWAIT(flags))) != 0)
		return (error);
	if (dosolock)
		solock_shared(so);
	sb_mtx_lock(&so->so_rcv);

	m = so->so_rcv.sb_mb;
#ifdef SOCKET_SPLICE
	if (isspliced(so))
		m = NULL;
#endif /* SOCKET_SPLICE */
	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark,
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat), or
	 *   3. MSG_DONTWAIT is not set.
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
	 */
	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < uio->uio_resid) &&
	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
#ifdef DIAGNOSTIC
		if (m == NULL && so->so_rcv.sb_cc)
#ifdef SOCKET_SPLICE
		    if (!isspliced(so))
#endif /* SOCKET_SPLICE */
			panic("receive 1: so %p, so_type %d, sb_cc %lu",
			    so, so->so_type, so->so_rcv.sb_cc);
#endif
		if ((error2 = READ_ONCE(so->so_error))) {
			if (m)
				goto dontblock;
			error = error2;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		if (so->so_rcv.sb_state & SS_CANTRCVMORE) {
			if (m)
				goto dontblock;
			else if (so->so_rcv.sb_cc == 0)
				goto release;
		}
		/* Pending OOB data or a record end means we cannot block. */
		for (; m; m = m->m_next)
			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio->uio_resid == 0 && controlp == NULL)
			goto release;
		if (flags & MSG_DONTWAIT) {
			error = EWOULDBLOCK;
			goto release;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");

		sbunlock(&so->so_rcv);
		error = sbwait(so, &so->so_rcv);
		sb_mtx_unlock(&so->so_rcv);
		if (dosolock)
			sounlock_shared(so);
		if (error)
			return (error);
		goto restart;
	}
dontblock:
	/*
	 * On entry here, m points to the first record of the socket buffer.
	 * From this point onward, we maintain 'nextrecord' as a cache of the
	 * pointer to the next record in the socket buffer.  We must keep the
	 * various socket buffer pointers and local stack versions of the
	 * pointers in sync, pushing out modifications before operations that
	 * may sleep, and re-reading them afterwards.
	 *
	 * Otherwise, we will race with the network stack appending new data
	 * or records onto the socket buffer by using inconsistent/stale
	 * versions of the field, possibly resulting in socket buffer
	 * corruption.
	 */
	if (uio->uio_procp)
		uio->uio_procp->p_ru.ru_msgrcv++;
	KASSERT(m == so->so_rcv.sb_mb);
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
	nextrecord = m->m_nextpkt;
	if (pr->pr_flags & PR_ADDR) {
		/* The record starts with the sender's address. */
#ifdef DIAGNOSTIC
		if (m->m_type != MT_SONAME)
			panic("receive 1a: so %p, so_type %d, m %p, m_type %d",
			    so, so->so_type, m, m->m_type);
#endif
		orig_resid = 0;
		if (flags & MSG_PEEK) {
			if (paddr)
				*paddr = m_copym(m, 0, m->m_len, M_NOWAIT);
			m = m->m_next;
		} else {
			sbfree(so, &so->so_rcv, m);
			if (paddr) {
				*paddr = m;
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = NULL;
				m = so->so_rcv.sb_mb;
			} else {
				so->so_rcv.sb_mb = m_free(m);
				m = so->so_rcv.sb_mb;
			}
			sbsync(&so->so_rcv, nextrecord);
		}
	}
	/* Then any ancillary data (control) mbufs. */
	while (m && m->m_type == MT_CONTROL && error == 0) {
		int skip = 0;
		if (flags & MSG_PEEK) {
			if (mtod(m, struct cmsghdr *)->cmsg_type ==
			    SCM_RIGHTS) {
				/* don't leak internalized SCM_RIGHTS msgs */
				skip = 1;
			} else if (controlp)
				*controlp = m_copym(m, 0, m->m_len, M_NOWAIT);
			m = m->m_next;
		} else {
			sbfree(so, &so->so_rcv, m);
			so->so_rcv.sb_mb = m->m_next;
			m->m_nextpkt = m->m_next = NULL;
			cm = m;
			m = so->so_rcv.sb_mb;
			sbsync(&so->so_rcv, nextrecord);
			if (controlp) {
				if (pr->pr_domain->dom_externalize) {
					sb_mtx_unlock(&so->so_rcv);
					if (dosolock)
						sounlock_shared(so);
					error =
					    (*pr->pr_domain->dom_externalize)
					    (cm, controllen, flags);
					if (dosolock)
						solock_shared(so);
					sb_mtx_lock(&so->so_rcv);
				}
				*controlp = cm;
			} else {
				/*
				 * Dispose of any SCM_RIGHTS message that went
				 * through the read path rather than recv.
				 */
				if (pr->pr_domain->dom_dispose) {
					sb_mtx_unlock(&so->so_rcv);
					pr->pr_domain->dom_dispose(cm);
					sb_mtx_lock(&so->so_rcv);
				}
				m_free(cm);
			}
		}
		if (m != NULL)
			nextrecord = so->so_rcv.sb_mb->m_nextpkt;
		else
			nextrecord = so->so_rcv.sb_mb;
		if (controlp && !skip)
			controlp = &(*controlp)->m_next;
		orig_resid = 0;
	}

	/* If m is non-NULL, we have some data to read. */
	if (m) {
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
		if (m->m_flags & M_BCAST)
			flags |= MSG_BCAST;
		if (m->m_flags & M_MCAST)
			flags |= MSG_MCAST;
	}
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");

	moff = 0;
	offset = 0;
	while (m && uio->uio_resid > 0 && error == 0) {
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
				break;
		} else if (type == MT_OOBDATA) {
			break;
		} else if (m->m_type == MT_CONTROL) {
			/*
			 * If there is more than one control message in the
			 * stream, we do a short read.  Next can be received
			 * or disposed by another system call.
			 */
			break;
#ifdef DIAGNOSTIC
		} else if (m->m_type != MT_DATA && m->m_type != MT_HEADER) {
			panic("receive 3: so %p, so_type %d, m %p, m_type %d",
			    so, so->so_type, m, m->m_type);
#endif
		}
		so->so_rcv.sb_state &= ~SS_RCVATMARK;
		len = uio->uio_resid;
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * Sockbuf must be consistent here (points to current mbuf,
		 * it points to next record) when we drop priority;
		 * we must note any additions to the sockbuf when we
		 * block interrupts again.
		 */
		if (mp == NULL && uio_error == 0) {
			SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
			resid = uio->uio_resid;
			sb_mtx_unlock(&so->so_rcv);
			if (dosolock)
				sounlock_shared(so);
			uio_error = uiomove(mtod(m, caddr_t) + moff, len, uio);
			if (dosolock)
				solock_shared(so);
			sb_mtx_lock(&so->so_rcv);
			if (uio_error)
				uio->uio_resid = resid - len;
		} else
			uio->uio_resid -= len;
		if (len == m->m_len - moff) {
			/* Whole mbuf consumed. */
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
				orig_resid = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(so, &so->so_rcv, m);
				if (mp) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = NULL;
				} else {
					so->so_rcv.sb_mb = m_free(m);
					m = so->so_rcv.sb_mb;
				}
				/*
				 * If m != NULL, we also know that
				 * so->so_rcv.sb_mb != NULL.
				 */
				KASSERT(so->so_rcv.sb_mb == m);
				if (m) {
					m->m_nextpkt = nextrecord;
					if (nextrecord == NULL)
						so->so_rcv.sb_lastrecord = m;
				} else {
					so->so_rcv.sb_mb = nextrecord;
					SB_EMPTY_FIXUP(&so->so_rcv);
				}
				SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
				SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
			}
		} else {
			/* Partial mbuf consumed. */
			if (flags & MSG_PEEK) {
				moff += len;
				orig_resid = 0;
			} else {
				if (mp)
					*mp = m_copym(m, 0, len, M_WAIT);
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
				so->so_rcv.sb_datacc -= len;
			}
		}
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_rcv.sb_state |= SS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket),
		 * we must not quit until "uio->uio_resid == 0" or an error
		 * termination.  If a signal/timeout occurs, return
		 * with a short count but without error.
		 * Keep sockbuf locked against other readers.
		 */
		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
		    !sosendallatonce(so) && !nextrecord) {
			if (so->so_rcv.sb_state & SS_CANTRCVMORE ||
			    so->so_error)
				break;
			SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
			if (sbwait(so, &so->so_rcv)) {
				sb_mtx_unlock(&so->so_rcv);
				if (dosolock)
					sounlock_shared(so);
				sbunlock(&so->so_rcv);
				return (0);
			}
			if ((m = so->so_rcv.sb_mb) != NULL)
				nextrecord = m->m_nextpkt;
		}
	}

	if (m && pr->pr_flags & PR_ATOMIC) {
		/* Atomic protocol: discard the unread remainder. */
		flags |= MSG_TRUNC;
		if ((flags & MSG_PEEK) == 0)
			(void) sbdroprecord(so, &so->so_rcv);
	}
	if ((flags & MSG_PEEK) == 0) {
		if (m == NULL) {
			/*
			 * First part is an inline SB_EMPTY_FIXUP().  Second
			 * part makes sure sb_lastrecord is up-to-date if
			 * there is still data in the socket buffer.
			 */
			so->so_rcv.sb_mb = nextrecord;
			if (so->so_rcv.sb_mb == NULL) {
				so->so_rcv.sb_mbtail = NULL;
				so->so_rcv.sb_lastrecord = NULL;
			} else if (nextrecord->m_nextpkt == NULL)
				so->so_rcv.sb_lastrecord = nextrecord;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
		if (pr->pr_flags & PR_WANTRCVD) {
			sb_mtx_unlock(&so->so_rcv);
			if (!dosolock)
				solock_shared(so);
			pru_rcvd(so);
			if (!dosolock)
				sounlock_shared(so);
			sb_mtx_lock(&so->so_rcv);
		}
	}
	/* Nothing transferred and more may come: try again. */
	if (orig_resid == uio->uio_resid && orig_resid &&
	    (flags & MSG_EOR) == 0 &&
	    (so->so_rcv.sb_state & SS_CANTRCVMORE) == 0) {
		sb_mtx_unlock(&so->so_rcv);
		sbunlock(&so->so_rcv);
		goto restart;
	}

	if (uio_error)
		error = uio_error;

	if (flagsp)
		*flagsp |= flags;
release:
	sb_mtx_unlock(&so->so_rcv);
	if (dosolock)
		sounlock_shared(so);
	sbunlock(&so->so_rcv);
	return (error);
}

/*
 * Shut down part of a full-duplex connection.  SHUT_RD flushes and
 * disables the receive side locally, SHUT_WR asks the protocol to stop
 * sending, SHUT_RDWR does both.  Returns 0 or an errno.
 */
int
soshutdown(struct socket *so, int how)
{
	int error = 0;

	switch (how) {
	case SHUT_RD:
		sorflush(so);
		break;
	case SHUT_RDWR:
		sorflush(so);
		/* FALLTHROUGH */
	case SHUT_WR:
		solock(so);
		error = pru_shutdown(so);
		sounlock(so);
		break;
	default:
		error = EINVAL;
		break;
	}

	return (error);
}

/*
 * Flush and disable the receive side of a socket.  The buffered mbuf
 * chain is detached under the sockbuf mutex; mbufs possibly carrying
 * rights are handed to the domain's dispose routine before the chain
 * is purged.
 */
void
sorflush(struct socket *so)
{
	struct sockbuf *sb = &so->so_rcv;
	struct mbuf *m;
	const struct protosw *pr = so->so_proto;
	int error;

	error = sblock(sb, SBL_WAIT | SBL_NOINTR);
	/* with SBL_WAIT and SBL_NOINTR sblock() must not fail */
	KASSERT(error == 0);

	solock_shared(so);
	socantrcvmore(so);
	mtx_enter(&sb->sb_mtx);
	m = sb->sb_mb;
	/* Reset all sockbuf fields between sb_startzero and sb_endzero. */
	memset(&sb->sb_startzero, 0,
	    (caddr_t)&sb->sb_endzero - (caddr_t)&sb->sb_startzero);
	sb->sb_timeo_nsecs = INFSLP;
	mtx_leave(&sb->sb_mtx);
	sounlock_shared(so);
	sbunlock(sb);

	/* Dispose of rights outside of all locks, then free the chain. */
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
		(*pr->pr_domain->dom_dispose)(m);
	m_purge(m);
}

#ifdef SOCKET_SPLICE

#define so_splicelen	so_sp->ssp_len
#define so_splicemax	so_sp->ssp_max
#define so_idletv	so_sp->ssp_idletv
#define so_idleto	so_sp->ssp_idleto
#define so_splicetask	so_sp->ssp_task

/*
 * Splice the receive side of `so' onto the socket open on `fd' so that
 * arriving data is forwarded in the kernel.  A negative fd dissolves an
 * existing splice.  A non-zero `max' bounds the number of bytes to move;
 * a set `tv' is used as idle timeout.  Returns 0 or an errno.
 */
int
sosplice(struct socket *so, int fd, off_t max, struct timeval *tv)
{
	struct file *fp;
	struct socket *sosp;
	struct taskq *tq;
	int error = 0;

	if ((so->so_proto->pr_flags & PR_SPLICE) == 0)
		return (EPROTONOSUPPORT);
	if (max && max < 0)
		return (EINVAL);
	if (tv && (tv->tv_sec < 0 || !timerisvalid(tv)))
		return (EINVAL);

	/* If no fd is given, unsplice by removing existing link. */
	if (fd < 0) {
		if ((error = sblock(&so->so_rcv, SBL_WAIT)) != 0)
			return (error);
		solock(so);
		if (so->so_options & SO_ACCEPTCONN) {
			error = EOPNOTSUPP;
			goto out;
		}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto out;
		}

		if (so->so_sp && so->so_sp->ssp_socket)
			sounsplice(so, so->so_sp->ssp_socket, 0);
 out:
		sounlock(so);
		sbunlock(&so->so_rcv);
		return (error);
	}

	/* Create the shared splice taskq once, double-checked. */
	if (sosplice_taskq == NULL) {
		rw_enter_write(&sosplice_lock);
		if (sosplice_taskq == NULL) {
			tq = taskq_create("sosplice", 1, IPL_SOFTNET,
			    TASKQ_MPSAFE);
			if (tq == NULL) {
				rw_exit_write(&sosplice_lock);
				return (ENOMEM);
			}
			/* Ensure the taskq is fully visible to other CPUs. */
			membar_producer();
			sosplice_taskq = tq;
		}
		rw_exit_write(&sosplice_lock);
	} else {
		/* Ensure the taskq is fully visible on this CPU. */
		membar_consumer();
	}

	/* Find sosp, the drain socket where data will be spliced into. */
	if ((error = getsock(curproc, fd, &fp)) != 0)
		return (error);
	sosp = fp->f_data;

	/* Both sockets must use the same pru_send, i.e. the same protocol. */
	if (sosp->so_proto->pr_usrreqs->pru_send !=
	    so->so_proto->pr_usrreqs->pru_send) {
		error = EPROTONOSUPPORT;
		goto frele;
	}

	/* Lock source receive buffer first, then drain send buffer. */
	if ((error = sblock(&so->so_rcv, SBL_WAIT)) != 0)
		goto frele;
	if ((error = sblock(&sosp->so_snd, SBL_WAIT)) != 0) {
		sbunlock(&so->so_rcv);
		goto frele;
	}
	solock(so);

	if ((so->so_options & SO_ACCEPTCONN) ||
	    (sosp->so_options & SO_ACCEPTCONN)) {
		error = EOPNOTSUPP;
		goto release;
	}
	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
		error = ENOTCONN;
		goto release;
	}
	if ((sosp->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0) {
		error = ENOTCONN;
		goto release;
	}
	if (so->so_sp == NULL)
		so->so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
	if (sosp->so_sp == NULL)
		sosp->so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
	/* A socket may be spliced into at most one other socket. */
	if (so->so_sp->ssp_socket || sosp->so_sp->ssp_soback) {
		error = EBUSY;
		goto release;
	}

	/* Splice so and sosp together. */
	mtx_enter(&so->so_rcv.sb_mtx);
	so->so_sp->ssp_socket = sosp;
	sosp->so_sp->ssp_soback = so;
	mtx_leave(&so->so_rcv.sb_mtx);
	so->so_splicelen = 0;
	so->so_splicemax = max;
	if (tv)
		so->so_idletv = *tv;
	else
		timerclear(&so->so_idletv);
	timeout_set_proc(&so->so_idleto, soidle, so);
	task_set(&so->so_splicetask, sotask, so);

	/*
	 * To prevent softnet interrupt from calling somove() while
	 * we sleep, the socket buffers are not marked as spliced yet.
	 */
	if (somove(so, M_WAIT)) {
		mtx_enter(&so->so_rcv.sb_mtx);
		so->so_rcv.sb_flags |= SB_SPLICE;
		mtx_leave(&so->so_rcv.sb_mtx);
		sosp->so_snd.sb_flags |= SB_SPLICE;
	}

 release:
	sounlock(so);
	sbunlock(&sosp->so_snd);
	sbunlock(&so->so_rcv);
 frele:
	FRELE(fp, curproc);

	return (error);
}

/*
 * Dissolve the splice between source `so' and drain `sosp'.  `freeing'
 * indicates which side is about to be freed so that side is not woken
 * up.  Called with the socket lock of `so' held.
 */
void
sounsplice(struct socket *so, struct socket *sosp, int freeing)
{
	soassertlocked(so);

	task_del(sosplice_taskq, &so->so_splicetask);
	timeout_del(&so->so_idleto);
	sosp->so_snd.sb_flags &= ~SB_SPLICE;

	mtx_enter(&so->so_rcv.sb_mtx);
	so->so_rcv.sb_flags &= ~SB_SPLICE;
	so->so_sp->ssp_socket = sosp->so_sp->ssp_soback = NULL;
	mtx_leave(&so->so_rcv.sb_mtx);

	/* Do not wakeup a socket that is about to be freed. */
	if ((freeing & SOSP_FREEING_READ) == 0 && soreadable(so))
		sorwakeup(so);
	if ((freeing & SOSP_FREEING_WRITE) == 0 && sowriteable(sosp))
		sowwakeup(sosp);
}

/* Splice idle timeout handler: dissolve the splice with ETIMEDOUT. */
void
soidle(void *arg)
{
	struct socket *so = arg;

	solock(so);
	if (so->so_rcv.sb_flags & SB_SPLICE) {
		so->so_error = ETIMEDOUT;
		sounsplice(so, so->so_sp->ssp_socket, 0);
	}
	sounlock(so);
}

/* Splice task handler: move spliced data in the sosplice taskq thread. */
void
sotask(void *arg)
{
	struct socket *so = arg;

	solock(so);
	if (so->so_rcv.sb_flags & SB_SPLICE) {
		/*
		 * We may not sleep here as sofree() and unsplice() may be
		 * called from softnet interrupt context.  This would remove
		 * the socket during somove().
		 */
		somove(so, M_DONTWAIT);
	}
	sounlock(so);

	/* Avoid user land starvation. */
	yield();
}

/*
 * The socket splicing task or idle timeout may sleep while grabbing the net
 * lock.  As sofree() can be called anytime, sotask() or soidle() could access
 * the socket memory of a freed socket after wakeup.
So delay the pool_put() 1507 * after all pending socket splicing tasks or timeouts have finished. Do this 1508 * by scheduling it on the same threads. 1509 */ 1510void 1511soreaper(void *arg) 1512{ 1513 struct socket *so = arg; 1514 1515 /* Reuse splice task, sounsplice() has been called before. */ 1516 task_set(&so->so_sp->ssp_task, soput, so); 1517 task_add(sosplice_taskq, &so->so_sp->ssp_task); 1518} 1519 1520void 1521soput(void *arg) 1522{ 1523 struct socket *so = arg; 1524 1525 pool_put(&sosplice_pool, so->so_sp); 1526 pool_put(&socket_pool, so); 1527} 1528 1529/* 1530 * Move data from receive buffer of spliced source socket to send 1531 * buffer of drain socket. Try to move as much as possible in one 1532 * big chunk. It is a TCP only implementation. 1533 * Return value 0 means splicing has been finished, 1 continue. 1534 */ 1535int 1536somove(struct socket *so, int wait) 1537{ 1538 struct socket *sosp = so->so_sp->ssp_socket; 1539 struct mbuf *m, **mp, *nextrecord; 1540 u_long len, off, oobmark; 1541 long space; 1542 int error = 0, maxreached = 0; 1543 unsigned int rcvstate; 1544 1545 soassertlocked(so); 1546 1547 nextpkt: 1548 if (so->so_error) { 1549 error = so->so_error; 1550 goto release; 1551 } 1552 if (sosp->so_snd.sb_state & SS_CANTSENDMORE) { 1553 error = EPIPE; 1554 goto release; 1555 } 1556 if (sosp->so_error && sosp->so_error != ETIMEDOUT && 1557 sosp->so_error != EFBIG && sosp->so_error != ELOOP) { 1558 error = sosp->so_error; 1559 goto release; 1560 } 1561 if ((sosp->so_state & SS_ISCONNECTED) == 0) 1562 goto release; 1563 1564 /* Calculate how many bytes can be copied now. 
*/ 1565 len = so->so_rcv.sb_datacc; 1566 if (so->so_splicemax) { 1567 KASSERT(so->so_splicelen < so->so_splicemax); 1568 if (so->so_splicemax <= so->so_splicelen + len) { 1569 len = so->so_splicemax - so->so_splicelen; 1570 maxreached = 1; 1571 } 1572 } 1573 space = sbspace(sosp, &sosp->so_snd); 1574 if (so->so_oobmark && so->so_oobmark < len && 1575 so->so_oobmark < space + 1024) 1576 space += 1024; 1577 if (space <= 0) { 1578 maxreached = 0; 1579 goto release; 1580 } 1581 if (space < len) { 1582 maxreached = 0; 1583 if (space < sosp->so_snd.sb_lowat) 1584 goto release; 1585 len = space; 1586 } 1587 sosp->so_snd.sb_state |= SS_ISSENDING; 1588 1589 SBLASTRECORDCHK(&so->so_rcv, "somove 1"); 1590 SBLASTMBUFCHK(&so->so_rcv, "somove 1"); 1591 m = so->so_rcv.sb_mb; 1592 if (m == NULL) 1593 goto release; 1594 nextrecord = m->m_nextpkt; 1595 1596 /* Drop address and control information not used with splicing. */ 1597 if (so->so_proto->pr_flags & PR_ADDR) { 1598#ifdef DIAGNOSTIC 1599 if (m->m_type != MT_SONAME) 1600 panic("somove soname: so %p, so_type %d, m %p, " 1601 "m_type %d", so, so->so_type, m, m->m_type); 1602#endif 1603 m = m->m_next; 1604 } 1605 while (m && m->m_type == MT_CONTROL) 1606 m = m->m_next; 1607 if (m == NULL) { 1608 sbdroprecord(so, &so->so_rcv); 1609 if (so->so_proto->pr_flags & PR_WANTRCVD) 1610 pru_rcvd(so); 1611 goto nextpkt; 1612 } 1613 1614 /* 1615 * By splicing sockets connected to localhost, userland might create a 1616 * loop. Dissolve splicing with error if loop is detected by counter. 1617 * 1618 * If we deal with looped broadcast/multicast packet we bail out with 1619 * no error to suppress splice termination. 
1620 */ 1621 if ((m->m_flags & M_PKTHDR) && 1622 ((m->m_pkthdr.ph_loopcnt++ >= M_MAXLOOP) || 1623 ((m->m_flags & M_LOOP) && (m->m_flags & (M_BCAST|M_MCAST))))) { 1624 error = ELOOP; 1625 goto release; 1626 } 1627 1628 if (so->so_proto->pr_flags & PR_ATOMIC) { 1629 if ((m->m_flags & M_PKTHDR) == 0) 1630 panic("somove !PKTHDR: so %p, so_type %d, m %p, " 1631 "m_type %d", so, so->so_type, m, m->m_type); 1632 if (sosp->so_snd.sb_hiwat < m->m_pkthdr.len) { 1633 error = EMSGSIZE; 1634 goto release; 1635 } 1636 if (len < m->m_pkthdr.len) 1637 goto release; 1638 if (m->m_pkthdr.len < len) { 1639 maxreached = 0; 1640 len = m->m_pkthdr.len; 1641 } 1642 /* 1643 * Throw away the name mbuf after it has been assured 1644 * that the whole first record can be processed. 1645 */ 1646 m = so->so_rcv.sb_mb; 1647 sbfree(so, &so->so_rcv, m); 1648 so->so_rcv.sb_mb = m_free(m); 1649 sbsync(&so->so_rcv, nextrecord); 1650 } 1651 /* 1652 * Throw away the control mbufs after it has been assured 1653 * that the whole first record can be processed. 1654 */ 1655 m = so->so_rcv.sb_mb; 1656 while (m && m->m_type == MT_CONTROL) { 1657 sbfree(so, &so->so_rcv, m); 1658 so->so_rcv.sb_mb = m_free(m); 1659 m = so->so_rcv.sb_mb; 1660 sbsync(&so->so_rcv, nextrecord); 1661 } 1662 1663 SBLASTRECORDCHK(&so->so_rcv, "somove 2"); 1664 SBLASTMBUFCHK(&so->so_rcv, "somove 2"); 1665 1666 /* Take at most len mbufs out of receive buffer. */ 1667 for (off = 0, mp = &m; off <= len && *mp; 1668 off += (*mp)->m_len, mp = &(*mp)->m_next) { 1669 u_long size = len - off; 1670 1671#ifdef DIAGNOSTIC 1672 if ((*mp)->m_type != MT_DATA && (*mp)->m_type != MT_HEADER) 1673 panic("somove type: so %p, so_type %d, m %p, " 1674 "m_type %d", so, so->so_type, *mp, (*mp)->m_type); 1675#endif 1676 if ((*mp)->m_len > size) { 1677 /* 1678 * Move only a partial mbuf at maximum splice length or 1679 * if the drain buffer is too small for this large mbuf. 
1680 */ 1681 if (!maxreached && so->so_snd.sb_datacc > 0) { 1682 len -= size; 1683 break; 1684 } 1685 *mp = m_copym(so->so_rcv.sb_mb, 0, size, wait); 1686 if (*mp == NULL) { 1687 len -= size; 1688 break; 1689 } 1690 so->so_rcv.sb_mb->m_data += size; 1691 so->so_rcv.sb_mb->m_len -= size; 1692 so->so_rcv.sb_cc -= size; 1693 so->so_rcv.sb_datacc -= size; 1694 } else { 1695 *mp = so->so_rcv.sb_mb; 1696 sbfree(so, &so->so_rcv, *mp); 1697 so->so_rcv.sb_mb = (*mp)->m_next; 1698 sbsync(&so->so_rcv, nextrecord); 1699 } 1700 } 1701 *mp = NULL; 1702 1703 SBLASTRECORDCHK(&so->so_rcv, "somove 3"); 1704 SBLASTMBUFCHK(&so->so_rcv, "somove 3"); 1705 SBCHECK(so, &so->so_rcv); 1706 if (m == NULL) 1707 goto release; 1708 m->m_nextpkt = NULL; 1709 if (m->m_flags & M_PKTHDR) { 1710 m_resethdr(m); 1711 m->m_pkthdr.len = len; 1712 } 1713 1714 /* Send window update to source peer as receive buffer has changed. */ 1715 if (so->so_proto->pr_flags & PR_WANTRCVD) 1716 pru_rcvd(so); 1717 1718 /* Receive buffer did shrink by len bytes, adjust oob. */ 1719 mtx_enter(&so->so_rcv.sb_mtx); 1720 rcvstate = so->so_rcv.sb_state; 1721 so->so_rcv.sb_state &= ~SS_RCVATMARK; 1722 oobmark = so->so_oobmark; 1723 so->so_oobmark = oobmark > len ? oobmark - len : 0; 1724 if (oobmark) { 1725 if (oobmark == len) 1726 so->so_rcv.sb_state |= SS_RCVATMARK; 1727 if (oobmark >= len) 1728 oobmark = 0; 1729 } 1730 mtx_leave(&so->so_rcv.sb_mtx); 1731 1732 /* 1733 * Handle oob data. If any malloc fails, ignore error. 1734 * TCP urgent data is not very reliable anyway. 
1735 */ 1736 while (((rcvstate & SS_RCVATMARK) || oobmark) && 1737 (so->so_options & SO_OOBINLINE)) { 1738 struct mbuf *o = NULL; 1739 1740 if (rcvstate & SS_RCVATMARK) { 1741 o = m_get(wait, MT_DATA); 1742 rcvstate &= ~SS_RCVATMARK; 1743 } else if (oobmark) { 1744 o = m_split(m, oobmark, wait); 1745 if (o) { 1746 error = pru_send(sosp, m, NULL, NULL); 1747 if (error) { 1748 if (sosp->so_snd.sb_state & 1749 SS_CANTSENDMORE) 1750 error = EPIPE; 1751 m_freem(o); 1752 goto release; 1753 } 1754 len -= oobmark; 1755 so->so_splicelen += oobmark; 1756 m = o; 1757 o = m_get(wait, MT_DATA); 1758 } 1759 oobmark = 0; 1760 } 1761 if (o) { 1762 o->m_len = 1; 1763 *mtod(o, caddr_t) = *mtod(m, caddr_t); 1764 error = pru_sendoob(sosp, o, NULL, NULL); 1765 if (error) { 1766 if (sosp->so_snd.sb_state & SS_CANTSENDMORE) 1767 error = EPIPE; 1768 m_freem(m); 1769 goto release; 1770 } 1771 len -= 1; 1772 so->so_splicelen += 1; 1773 if (oobmark) { 1774 oobmark -= 1; 1775 if (oobmark == 0) 1776 rcvstate |= SS_RCVATMARK; 1777 } 1778 m_adj(m, 1); 1779 } 1780 } 1781 1782 /* Append all remaining data to drain socket. */ 1783 if (so->so_rcv.sb_cc == 0 || maxreached) 1784 sosp->so_snd.sb_state &= ~SS_ISSENDING; 1785 error = pru_send(sosp, m, NULL, NULL); 1786 if (error) { 1787 if (sosp->so_snd.sb_state & SS_CANTSENDMORE) 1788 error = EPIPE; 1789 goto release; 1790 } 1791 so->so_splicelen += len; 1792 1793 /* Move several packets if possible. 
*/ 1794 if (!maxreached && nextrecord) 1795 goto nextpkt; 1796 1797 release: 1798 sosp->so_snd.sb_state &= ~SS_ISSENDING; 1799 if (!error && maxreached && so->so_splicemax == so->so_splicelen) 1800 error = EFBIG; 1801 if (error) 1802 so->so_error = error; 1803 if (((so->so_rcv.sb_state & SS_CANTRCVMORE) && 1804 so->so_rcv.sb_cc == 0) || 1805 (sosp->so_snd.sb_state & SS_CANTSENDMORE) || 1806 maxreached || error) { 1807 sounsplice(so, sosp, 0); 1808 return (0); 1809 } 1810 if (timerisset(&so->so_idletv)) 1811 timeout_add_tv(&so->so_idleto, &so->so_idletv); 1812 return (1); 1813} 1814 1815#endif /* SOCKET_SPLICE */ 1816 1817void 1818sorwakeup(struct socket *so) 1819{ 1820 if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0) 1821 soassertlocked_readonly(so); 1822 1823#ifdef SOCKET_SPLICE 1824 if (so->so_rcv.sb_flags & SB_SPLICE) { 1825 /* 1826 * TCP has a sendbuffer that can handle multiple packets 1827 * at once. So queue the stream a bit to accumulate data. 1828 * The sosplice thread will call somove() later and send 1829 * the packets calling tcp_output() only once. 1830 * In the UDP case, send out the packets immediately. 1831 * Using a thread would make things slower. 
		 */
		if (so->so_proto->pr_flags & PR_WANTRCVD)
			task_add(sosplice_taskq, &so->so_splicetask);
		else
			somove(so, M_DONTWAIT);
	}
	if (isspliced(so))
		return;
#endif
	sowakeup(so, &so->so_rcv);
	if (so->so_upcall)
		(*(so->so_upcall))(so, so->so_upcallarg, M_DONTWAIT);
}

/*
 * Wake up write-side waiters and event filters of `so'.  A socket that
 * is spliced into defers the work to the source socket's splice task
 * instead of waking user land.
 */
void
sowwakeup(struct socket *so)
{
	if ((so->so_snd.sb_flags & SB_MTXLOCK) == 0)
		soassertlocked_readonly(so);

#ifdef SOCKET_SPLICE
	if (so->so_snd.sb_flags & SB_SPLICE)
		task_add(sosplice_taskq, &so->so_sp->ssp_soback->so_splicetask);
	if (issplicedback(so))
		return;
#endif
	sowakeup(so, &so->so_snd);
}

/*
 * Set a socket option.  Options at a level other than SOL_SOCKET are
 * handed to the protocol's ctloutput routine; SOL_SOCKET options are
 * handled here with the value taken from mbuf `m'.
 */
int
sosetopt(struct socket *so, int level, int optname, struct mbuf *m)
{
	int error = 0;

	if (level != SOL_SOCKET) {
		if (so->so_proto->pr_ctloutput) {
			solock(so);
			error = (*so->so_proto->pr_ctloutput)(PRCO_SETOPT, so,
			    level, optname, m);
			sounlock(so);
			return (error);
		}
		error = ENOPROTOOPT;
	} else {
		switch (optname) {

		case SO_LINGER:
			if (m == NULL || m->m_len != sizeof (struct linger) ||
			    mtod(m, struct linger *)->l_linger < 0 ||
			    mtod(m, struct linger *)->l_linger > SHRT_MAX)
				return (EINVAL);

			solock(so);
			so->so_linger = mtod(m, struct linger *)->l_linger;
			if (*mtod(m, int *))
				so->so_options |= optname;
			else
				so->so_options &= ~optname;
			sounlock(so);

			break;
		case SO_BINDANY:
			if ((error = suser(curproc)) != 0)	/* XXX */
				return (error);
			/* FALLTHROUGH */

		/* Simple boolean options stored in so_options. */
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_ZEROIZE:
			if (m == NULL || m->m_len < sizeof (int))
				return (EINVAL);

			solock(so);
			if (*mtod(m, int *))
				so->so_options |= optname;
			else
				so->so_options &= ~optname;
			sounlock(so);

			break;
		case SO_DONTROUTE:
			if (m == NULL || m->m_len < sizeof (int))
				return (EINVAL);
			if (*mtod(m, int *))
				error = EOPNOTSUPP;
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
		    {
			struct sockbuf *sb = (optname == SO_SNDBUF ||
			    optname == SO_SNDLOWAT ?
			    &so->so_snd : &so->so_rcv);
			u_long cnt;

			if (m == NULL || m->m_len < sizeof (int))
				return (EINVAL);
			cnt = *mtod(m, int *);
			if ((long)cnt <= 0)
				cnt = 1;

			/*
			 * Updates happen under the sockbuf mutex; take the
			 * socket lock in addition only while the buffer is
			 * not yet protected by its own mutex.
			 */
			if (((sb->sb_flags & SB_MTXLOCK) == 0))
				solock(so);
			mtx_enter(&sb->sb_mtx);

			switch (optname) {
			case SO_SNDBUF:
			case SO_RCVBUF:
				if (sb->sb_state &
				    (SS_CANTSENDMORE | SS_CANTRCVMORE)) {
					error = EINVAL;
					break;
				}
				if (sbcheckreserve(cnt, sb->sb_wat) ||
				    sbreserve(so, sb, cnt)) {
					error = ENOBUFS;
					break;
				}
				sb->sb_wat = cnt;
				break;
			case SO_SNDLOWAT:
			case SO_RCVLOWAT:
				/* Clamp the low-water mark to sb_hiwat. */
				sb->sb_lowat = (cnt > sb->sb_hiwat) ?
				    sb->sb_hiwat : cnt;
				break;
			}

			mtx_leave(&sb->sb_mtx);
			if (((sb->sb_flags & SB_MTXLOCK) == 0))
				sounlock(so);

			break;
		    }

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			struct sockbuf *sb = (optname == SO_SNDTIMEO ?
			    &so->so_snd : &so->so_rcv);
			struct timeval tv;
			uint64_t nsecs;

			if (m == NULL || m->m_len < sizeof (tv))
				return (EINVAL);
			memcpy(&tv, mtod(m, struct timeval *), sizeof tv);
			if (!timerisvalid(&tv))
				return (EINVAL);
			nsecs = TIMEVAL_TO_NSEC(&tv);
			if (nsecs == UINT64_MAX)
				return (EDOM);
			/* A zero timeout means wait forever. */
			if (nsecs == 0)
				nsecs = INFSLP;

			mtx_enter(&sb->sb_mtx);
			sb->sb_timeo_nsecs = nsecs;
			mtx_leave(&sb->sb_mtx);
			break;
		    }

		case SO_RTABLE:
			/* Routing table selection is protocol specific. */
			if (so->so_proto->pr_domain &&
			    so->so_proto->pr_domain->dom_protosw &&
			    so->so_proto->pr_ctloutput) {
				const struct domain *dom =
				    so->so_proto->pr_domain;

				level = dom->dom_protosw->pr_protocol;
				solock(so);
				error = (*so->so_proto->pr_ctloutput)
				    (PRCO_SETOPT, so, level, optname, m);
				sounlock(so);
			} else
				error = ENOPROTOOPT;
			break;
#ifdef SOCKET_SPLICE
		case SO_SPLICE:
			/* NULL unsplices, an int fd splices without limits. */
			if (m == NULL) {
				error = sosplice(so, -1, 0, NULL);
			} else if (m->m_len < sizeof(int)) {
				error = EINVAL;
			} else if (m->m_len < sizeof(struct splice)) {
				error = sosplice(so, *mtod(m, int *), 0, NULL);
			} else {
				error = sosplice(so,
				    mtod(m, struct splice *)->sp_fd,
				    mtod(m, struct splice *)->sp_max,
				    &mtod(m, struct splice *)->sp_idle);
			}
			break;
#endif /* SOCKET_SPLICE */

		default:
			error = ENOPROTOOPT;
			break;
		}
	}

	return (error);
}

/*
 * Get a socket option.  Options at a level other than SOL_SOCKET are
 * forwarded to the protocol's ctloutput routine; SOL_SOCKET options are
 * answered here, with the result placed in mbuf `m'.
 */
int
sogetopt(struct socket *so, int level, int optname, struct mbuf *m)
{
	int error = 0;

	if (level != SOL_SOCKET) {
		if (so->so_proto->pr_ctloutput) {
			m->m_len = 0;

			solock(so);
			error = (*so->so_proto->pr_ctloutput)(PRCO_GETOPT, so,
			    level, optname, m);
			sounlock(so);
			return (error);
		} else
			return (ENOPROTOOPT);
	} else {
		m->m_len = sizeof (int);

		switch (optname) {

		case SO_LINGER:
			m->m_len = sizeof (struct linger);
			solock_shared(so);
			mtod(m, struct linger *)->l_onoff =
			    so->so_options & SO_LINGER;
			mtod(m, struct linger *)->l_linger = so->so_linger;
			sounlock_shared(so);
			break;

		/* Boolean options mirrored from so_options. */
		case SO_BINDANY:
		case SO_USELOOPBACK:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_ACCEPTCONN:
		case SO_TIMESTAMP:
		case SO_ZEROIZE:
			*mtod(m, int *) = so->so_options & optname;
			break;

		case SO_DONTROUTE:
			*mtod(m, int *) = 0;
			break;

		case SO_TYPE:
			*mtod(m, int *) = so->so_type;
			break;

		case SO_ERROR:
			/* Reading the pending error clears it. */
			solock(so);
			*mtod(m, int *) = so->so_error;
			so->so_error = 0;
			sounlock(so);

			break;

		case SO_DOMAIN:
			*mtod(m, int *) = so->so_proto->pr_domain->dom_family;
			break;

		case SO_PROTOCOL:
			*mtod(m, int *) = so->so_proto->pr_protocol;
			break;

		case SO_SNDBUF:
			*mtod(m, int *) = so->so_snd.sb_hiwat;
			break;

		case SO_RCVBUF:
			*mtod(m, int *) = so->so_rcv.sb_hiwat;
			break;

		case SO_SNDLOWAT:
			*mtod(m, int *) = so->so_snd.sb_lowat;
			break;

		case SO_RCVLOWAT:
			*mtod(m, int *) = so->so_rcv.sb_lowat;
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			struct sockbuf *sb = (optname == SO_SNDTIMEO ?
			    &so->so_snd : &so->so_rcv);
			struct timeval tv;
			uint64_t nsecs;

			mtx_enter(&sb->sb_mtx);
			nsecs = sb->sb_timeo_nsecs;
			mtx_leave(&sb->sb_mtx);

			m->m_len = sizeof(struct timeval);
			memset(&tv, 0, sizeof(tv));
			if (nsecs != INFSLP)
				NSEC_TO_TIMEVAL(nsecs, &tv);
			memcpy(mtod(m, struct timeval *), &tv, sizeof tv);
			break;
		    }

		case SO_RTABLE:
			if (so->so_proto->pr_domain &&
			    so->so_proto->pr_domain->dom_protosw &&
			    so->so_proto->pr_ctloutput) {
				const struct domain *dom =
				    so->so_proto->pr_domain;

				level = dom->dom_protosw->pr_protocol;
				solock(so);
				error = (*so->so_proto->pr_ctloutput)
				    (PRCO_GETOPT, so, level, optname, m);
				sounlock(so);
				if (error)
					return (error);
				break;
			}
			return (ENOPROTOOPT);

#ifdef SOCKET_SPLICE
		case SO_SPLICE:
		    {
			off_t len;

			/* Report the number of bytes spliced so far. */
			m->m_len = sizeof(off_t);
			solock_shared(so);
			len = so->so_sp ?
so->so_sp->ssp_len : 0;
			sounlock_shared(so);
			memcpy(mtod(m, off_t *), &len, sizeof(off_t));
			break;
		    }
#endif /* SOCKET_SPLICE */

		case SO_PEERCRED:
			/* Peer identity, if the unix(4) layer recorded it. */
			if (so->so_proto->pr_protocol == AF_UNIX) {
				struct unpcb *unp = sotounpcb(so);

				solock(so);
				if (unp->unp_flags & UNP_FEIDS) {
					m->m_len = sizeof(unp->unp_connid);
					memcpy(mtod(m, caddr_t),
					    &(unp->unp_connid), m->m_len);
					sounlock(so);
					break;
				}
				sounlock(so);

				return (ENOTCONN);
			}
			return (EOPNOTSUPP);

		default:
			return (ENOPROTOOPT);
		}
		return (0);
	}
}

/* Urgent data has arrived: signal SIGURG and notify read-side knotes. */
void
sohasoutofband(struct socket *so)
{
	pgsigio(&so->so_sigio, SIGURG, 0);
	knote(&so->so_rcv.sb_klist, 0);
}

/*
 * Lock order for socket event filters: first the socket lock (the
 * shared netlock for inet/inet6 sockets, the per-socket rwlock for
 * all others), then the sockbuf mutex.
 */
void
sofilt_lock(struct socket *so, struct sockbuf *sb)
{
	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		NET_LOCK_SHARED();
		break;
	default:
		rw_enter_write(&so->so_lock);
		break;
	}

	mtx_enter(&sb->sb_mtx);
}

/* Release the locks in reverse order of sofilt_lock(). */
void
sofilt_unlock(struct socket *so, struct sockbuf *sb)
{
	mtx_leave(&sb->sb_mtx);

	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		NET_UNLOCK_SHARED();
		break;
	default:
		rw_exit_write(&so->so_lock);
		break;
	}
}

/*
 * kqueue attach: pick the filter ops and socket buffer matching the
 * requested filter type and insert the knote into its klist.
 */
int
soo_kqfilter(struct file *fp, struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	struct sockbuf *sb;

	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_fop = &soread_filtops;
		sb = &so->so_rcv;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &sowrite_filtops;
		sb = &so->so_snd;
		break;
	case EVFILT_EXCEPT:
		kn->kn_fop = &soexcept_filtops;
		sb = &so->so_rcv;
		break;
	default:
		return (EINVAL);
	}

	klist_insert(&sb->sb_klist, kn);

	return (0);
}

/* Detach a read knote from the receive buffer's klist. */
void
filt_sordetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;

	klist_remove(&so->so_rcv.sb_klist, kn);
}

/*
 * Read event filter: returns non-zero when the socket is readable.
 * For listening sockets readability means pending connections; for
 * others it depends on buffered data, EOF and pending errors.
 */
int
filt_soread(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv = 0;

	MUTEX_ASSERT_LOCKED(&so->so_rcv.sb_mtx);
	if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0)
		soassertlocked_readonly(so);

	if (so->so_options & SO_ACCEPTCONN) {
		if (so->so_rcv.sb_flags & SB_MTXLOCK)
			soassertlocked_readonly(so);

		kn->kn_data = so->so_qlen;
		rv = (kn->kn_data != 0);

		if (kn->kn_flags & (__EV_POLL | __EV_SELECT)) {
			if (so->so_state & SS_ISDISCONNECTED) {
				kn->kn_flags |= __EV_HUP;
				rv = 1;
			} else {
				rv = soreadable(so);
			}
		}

		return rv;
	}

	kn->kn_data = so->so_rcv.sb_cc;
#ifdef SOCKET_SPLICE
	/* A spliced socket never reports readable to user land. */
	if (isspliced(so)) {
		rv = 0;
	} else
#endif /* SOCKET_SPLICE */
	if (so->so_rcv.sb_state & SS_CANTRCVMORE) {
		kn->kn_flags |= EV_EOF;
		if (kn->kn_flags & __EV_POLL) {
			if (so->so_state & SS_ISDISCONNECTED)
				kn->kn_flags |= __EV_HUP;
		}
		kn->kn_fflags = so->so_error;
		rv = 1;
	} else if (so->so_error) {
		rv = 1;
	} else if (kn->kn_sfflags & NOTE_LOWAT) {
		rv = (kn->kn_data >= kn->kn_sdata);
	} else {
		rv = (kn->kn_data >= so->so_rcv.sb_lowat);
	}

	return rv;
}

/* Detach a write knote from the send buffer's klist. */
void
filt_sowdetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;

	klist_remove(&so->so_snd.sb_klist, kn);
}

/*
 * Write event filter: returns non-zero when the socket is writable,
 * i.e. enough send buffer space is available, the connection is up
 * where required, or an error/EOF is pending.
 */
int
filt_sowrite(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv;

	MUTEX_ASSERT_LOCKED(&so->so_snd.sb_mtx);
	if ((so->so_snd.sb_flags & SB_MTXLOCK) == 0)
		soassertlocked_readonly(so);

	kn->kn_data = sbspace(so, &so->so_snd);
	if (so->so_snd.sb_state & SS_CANTSENDMORE) {
		kn->kn_flags |= EV_EOF;
		if (kn->kn_flags & __EV_POLL) {
			if (so->so_state & SS_ISDISCONNECTED)
				kn->kn_flags |= __EV_HUP;
		}
		kn->kn_fflags = so->so_error;
		rv = 1;
	} else if (so->so_error) {
		rv = 1;
	} else if (((so->so_state & SS_ISCONNECTED) == 0) &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
		rv = 0;
	} else if (kn->kn_sfflags & NOTE_LOWAT) {
		rv = (kn->kn_data >= kn->kn_sdata);
	} else {
		rv = (kn->kn_data >= so->so_snd.sb_lowat);
	}

	return (rv);
}

/*
 * Exceptional-condition filter: reports out-of-band data (NOTE_OOB)
 * and, for poll(2) compatibility, disconnection (__EV_HUP).
 */
int
filt_soexcept(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv = 0;

	MUTEX_ASSERT_LOCKED(&so->so_rcv.sb_mtx);
	if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0)
		soassertlocked_readonly(so);

#ifdef SOCKET_SPLICE
	if (isspliced(so)) {
		rv = 0;
	} else
#endif /* SOCKET_SPLICE */
	if (kn->kn_sfflags & NOTE_OOB) {
		if (so->so_oobmark || (so->so_rcv.sb_state & SS_RCVATMARK)) {
			kn->kn_fflags |= NOTE_OOB;
			kn->kn_data -= so->so_oobmark;
			rv = 1;
		}
	}

	if (kn->kn_flags & __EV_POLL) {
		if (so->so_state & SS_ISDISCONNECTED) {
			kn->kn_flags |= __EV_HUP;
			rv = 1;
		}
	}

	return rv;
}

/* Modify a write knote with the send-side filter locks held. */
int
filt_sowmodify(struct kevent *kev, struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv;

	sofilt_lock(so, &so->so_snd);
	rv = knote_modify(kev, kn);
	sofilt_unlock(so, &so->so_snd);

	return (rv);
}

/* Process a write knote with the send-side filter locks held. */
int
filt_sowprocess(struct knote *kn, struct kevent *kev)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv;

	sofilt_lock(so, &so->so_snd);
	rv = knote_process(kn, kev);
	sofilt_unlock(so, &so->so_snd);

	return (rv);
}

/* Modify a read knote with the receive-side filter locks held. */
int
filt_sormodify(struct kevent *kev, struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv;

	sofilt_lock(so, &so->so_rcv);
	rv = knote_modify(kev, kn);
	sofilt_unlock(so, &so->so_rcv);

	return (rv);
}

/* Process a read knote with the receive-side filter locks held. */
int
filt_sorprocess(struct knote *kn, struct kevent *kev)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv;

	sofilt_lock(so, &so->so_rcv);
	rv = knote_process(kn, kev);
	sofilt_unlock(so, &so->so_rcv);

	return (rv);
}

#ifdef DDB
void
sobuf_print(struct sockbuf *,
    int (*)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))));

/* ddb helper: dump the fields of one socket buffer. */
void
sobuf_print(struct sockbuf *sb,
    int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
{
	(*pr)("\tsb_cc: %lu\n", sb->sb_cc);
	(*pr)("\tsb_datacc: %lu\n", sb->sb_datacc);
	(*pr)("\tsb_hiwat: %lu\n", sb->sb_hiwat);
	(*pr)("\tsb_wat: %lu\n", sb->sb_wat);
	(*pr)("\tsb_mbcnt: %lu\n", sb->sb_mbcnt);
	(*pr)("\tsb_mbmax: %lu\n", sb->sb_mbmax);
	(*pr)("\tsb_lowat: %ld\n", sb->sb_lowat);
	(*pr)("\tsb_mb: %p\n", sb->sb_mb);
	(*pr)("\tsb_mbtail: %p\n", sb->sb_mbtail);
	(*pr)("\tsb_lastrecord: %p\n", sb->sb_lastrecord);
	(*pr)("\tsb_sel: ...\n");
	(*pr)("\tsb_flags: %04x\n", sb->sb_flags);
	(*pr)("\tsb_state: %04x\n", sb->sb_state);
	(*pr)("\tsb_timeo_nsecs: %llu\n", sb->sb_timeo_nsecs);
}

/* ddb helper: dump a struct socket including both socket buffers. */
void
so_print(void *v,
    int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
{
	struct socket *so = v;

	(*pr)("socket %p\n", so);
	(*pr)("so_type: %i\n", so->so_type);
	(*pr)("so_options: 0x%04x\n", so->so_options); /* %b */
	(*pr)("so_linger: %i\n", so->so_linger);
	(*pr)("so_state: 0x%04x\n", so->so_state);
	(*pr)("so_pcb: %p\n", so->so_pcb);
	(*pr)("so_proto: %p\n", so->so_proto);
	(*pr)("so_sigio: %p\n", so->so_sigio.sir_sigio);

	(*pr)("so_head: %p\n", so->so_head);
	(*pr)("so_onq: %p\n", so->so_onq);
	(*pr)("so_q0: @%p first: %p\n", &so->so_q0, TAILQ_FIRST(&so->so_q0));
	(*pr)("so_q: @%p first: %p\n", &so->so_q, TAILQ_FIRST(&so->so_q));
	(*pr)("so_eq: next: %p\n", TAILQ_NEXT(so, so_qe));
	(*pr)("so_q0len: %i\n", so->so_q0len);
	(*pr)("so_qlen: %i\n", so->so_qlen);
	(*pr)("so_qlimit: %i\n", so->so_qlimit);
	(*pr)("so_timeo: %i\n", so->so_timeo);
	(*pr)("so_obmark: %lu\n", so->so_oobmark);

	(*pr)("so_sp: %p\n", so->so_sp);
	if (so->so_sp != NULL) {
		(*pr)("\tssp_socket: %p\n", so->so_sp->ssp_socket);
		(*pr)("\tssp_soback: %p\n", so->so_sp->ssp_soback);
		(*pr)("\tssp_len: %lld\n",
		    (unsigned long long)so->so_sp->ssp_len);
		(*pr)("\tssp_max: %lld\n",
		    (unsigned long long)so->so_sp->ssp_max);
		(*pr)("\tssp_idletv: %lld %ld\n", so->so_sp->ssp_idletv.tv_sec,
		    so->so_sp->ssp_idletv.tv_usec);
		(*pr)("\tssp_idleto: %spending (@%i)\n",
		    timeout_pending(&so->so_sp->ssp_idleto) ? "" : "not ",
		    so->so_sp->ssp_idleto.to_time);
	}

	(*pr)("so_rcv:\n");
	sobuf_print(&so->so_rcv, pr);
	(*pr)("so_snd:\n");
	sobuf_print(&so->so_snd, pr);

	(*pr)("so_upcall: %p so_upcallarg: %p\n",
	    so->so_upcall, so->so_upcallarg);

	(*pr)("so_euid: %d so_ruid: %d\n", so->so_euid, so->so_ruid);
	(*pr)("so_egid: %d so_rgid: %d\n", so->so_egid, so->so_rgid);
	(*pr)("so_cpid: %d\n", so->so_cpid);
}
#endif