uipc_socket.c revision 131932
/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/uipc_socket.c 131932 2004-07-10 21:43:35Z rwatson $");

#include "opt_inet.h"
#include "opt_mac.h"
#include "opt_zero.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mac.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/domain.h>
#include <sys/file.h>			/* for struct knote */
#include <sys/kernel.h>
#include <sys/event.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <sys/jail.h>

#include <vm/uma.h>


/* Accept-filter option handler; sodealloc() uses it to remove a filter. */
#ifdef INET
static int	do_setopt_accept_filter(struct socket *so, struct sockopt *sopt);
#endif

/* Event-filter callbacks referenced by the filterops tables below. */
static void	filt_sordetach(struct knote *kn);
static int	filt_soread(struct knote *kn, long hint);
static void	filt_sowdetach(struct knote *kn);
static int	filt_sowrite(struct knote *kn, long hint);
static int	filt_solisten(struct knote *kn, long hint);

/*
 * Filter operation tables.  The listen and read tables share the same
 * detach routine (filt_sordetach); the write table uses filt_sowdetach.
 */
static struct filterops solisten_filtops =
	{ 1, NULL, filt_sordetach, filt_solisten };
static struct filterops soread_filtops =
	{ 1, NULL, filt_sordetach, filt_soread };
static struct filterops sowrite_filtops =
	{ 1, NULL, filt_sowdetach, filt_sowrite };

uma_zone_t socket_zone;		/* zone soalloc() allocates sockets from */
so_gen_t	so_gencnt;	/* generation count for sockets */

MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

SYSCTL_DECL(_kern_ipc);

/* Upper bound applied to the backlog argument in solisten(). */
static int somaxconn = SOMAXCONN;
SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW,
    &somaxconn, 0, "Maximum pending socket connection queue size");
/* Incremented in soalloc(), decremented in sodealloc(). */
static int numopensockets;
SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
    &numopensockets, 0, "Number of open sockets");
#ifdef ZERO_COPY_SOCKETS
/* These aren't static because they're used in other files. */
int so_zero_copy_send = 1;
int so_zero_copy_receive = 1;
SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0,
    "Zero copy controls");
SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW,
    &so_zero_copy_receive, 0, "Enable zero copy receive");
SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW,
    &so_zero_copy_send, 0, "Enable zero copy send");
#endif /* ZERO_COPY_SOCKETS */

/*
 * accept_mtx locks down per-socket fields relating to accept queues.  See
 * socketvar.h for an annotation of the protected fields of struct socket.
 */
struct mtx accept_mtx;
MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);

/*
 * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
 * so_gencnt field.
 *
 * XXXRW: These variables might be better manipulated using atomic operations
 * for improved efficiency.
 */
static struct mtx so_global_mtx;
MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF);

/*
 * Socket operation routines.
 * These routines are called by the routines in
 * sys_socket.c or from a system process, and
 * implement the semantics of socket operations by
 * switching out to the protocol specific routines.
 */

/*
 * Get a socket structure from our zone, and initialize it.
 * Note that it would probably be better to allocate socket
 * and PCB at the same time, but I'm not convinced that all
 * the protocols can be easily modified to do this.
 *
 * soalloc() returns a socket with a ref count of 0.
141 */ 142struct socket * 143soalloc(int mflags) 144{ 145 struct socket *so; 146#ifdef MAC 147 int error; 148#endif 149 150 so = uma_zalloc(socket_zone, mflags | M_ZERO); 151 if (so != NULL) { 152#ifdef MAC 153 error = mac_init_socket(so, mflags); 154 if (error != 0) { 155 uma_zfree(socket_zone, so); 156 so = NULL; 157 return so; 158 } 159#endif 160 SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd"); 161 SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv"); 162 /* sx_init(&so->so_sxlock, "socket sxlock"); */ 163 TAILQ_INIT(&so->so_aiojobq); 164 mtx_lock(&so_global_mtx); 165 so->so_gencnt = ++so_gencnt; 166 ++numopensockets; 167 mtx_unlock(&so_global_mtx); 168 } 169 return so; 170} 171 172/* 173 * socreate returns a socket with a ref count of 1. The socket should be 174 * closed with soclose(). 175 */ 176int 177socreate(dom, aso, type, proto, cred, td) 178 int dom; 179 struct socket **aso; 180 int type; 181 int proto; 182 struct ucred *cred; 183 struct thread *td; 184{ 185 struct protosw *prp; 186 struct socket *so; 187 int error; 188 189 if (proto) 190 prp = pffindproto(dom, proto, type); 191 else 192 prp = pffindtype(dom, type); 193 194 if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) 195 return (EPROTONOSUPPORT); 196 197 if (jailed(cred) && jail_socket_unixiproute_only && 198 prp->pr_domain->dom_family != PF_LOCAL && 199 prp->pr_domain->dom_family != PF_INET && 200 prp->pr_domain->dom_family != PF_ROUTE) { 201 return (EPROTONOSUPPORT); 202 } 203 204 if (prp->pr_type != type) 205 return (EPROTOTYPE); 206 so = soalloc(M_WAITOK); 207 if (so == NULL) 208 return (ENOBUFS); 209 210 TAILQ_INIT(&so->so_incomp); 211 TAILQ_INIT(&so->so_comp); 212 so->so_type = type; 213 so->so_cred = crhold(cred); 214 so->so_proto = prp; 215#ifdef MAC 216 mac_create_socket(cred, so); 217#endif 218 SOCK_LOCK(so); 219 soref(so); 220 SOCK_UNLOCK(so); 221 error = (*prp->pr_usrreqs->pru_attach)(so, proto, td); 222 if (error) { 223 SOCK_LOCK(so); 224 so->so_state |= SS_NOFDREF; 225 sorele(so); 226 return 
(error); 227 } 228 *aso = so; 229 return (0); 230} 231 232int 233sobind(so, nam, td) 234 struct socket *so; 235 struct sockaddr *nam; 236 struct thread *td; 237{ 238 239 return ((*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td)); 240} 241 242void 243sodealloc(struct socket *so) 244{ 245 246 KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count)); 247 mtx_lock(&so_global_mtx); 248 so->so_gencnt = ++so_gencnt; 249 mtx_unlock(&so_global_mtx); 250 if (so->so_rcv.sb_hiwat) 251 (void)chgsbsize(so->so_cred->cr_uidinfo, 252 &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY); 253 if (so->so_snd.sb_hiwat) 254 (void)chgsbsize(so->so_cred->cr_uidinfo, 255 &so->so_snd.sb_hiwat, 0, RLIM_INFINITY); 256#ifdef INET 257 /* remove acccept filter if one is present. */ 258 if (so->so_accf != NULL) 259 do_setopt_accept_filter(so, NULL); 260#endif 261#ifdef MAC 262 mac_destroy_socket(so); 263#endif 264 crfree(so->so_cred); 265 SOCKBUF_LOCK_DESTROY(&so->so_snd); 266 SOCKBUF_LOCK_DESTROY(&so->so_rcv); 267 /* sx_destroy(&so->so_sxlock); */ 268 uma_zfree(socket_zone, so); 269 /* 270 * XXXRW: Seems like a shame to grab the mutex again down here, but 271 * we don't want to decrement the socket count until after we free 272 * the socket, and we can't increment the gencnt on the socket after 273 * we free, it so... 274 */ 275 mtx_lock(&so_global_mtx); 276 --numopensockets; 277 mtx_unlock(&so_global_mtx); 278} 279 280int 281solisten(so, backlog, td) 282 struct socket *so; 283 int backlog; 284 struct thread *td; 285{ 286 int error; 287 288 /* 289 * XXXRW: Ordering issue here -- perhaps we need to set 290 * SO_ACCEPTCONN before the call to pru_listen()? 291 * XXXRW: General atomic test-and-set concerns here also. 
292 */ 293 if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | 294 SS_ISDISCONNECTING)) 295 return (EINVAL); 296 error = (*so->so_proto->pr_usrreqs->pru_listen)(so, td); 297 if (error) 298 return (error); 299 ACCEPT_LOCK(); 300 if (TAILQ_EMPTY(&so->so_comp)) { 301 SOCK_LOCK(so); 302 so->so_options |= SO_ACCEPTCONN; 303 SOCK_UNLOCK(so); 304 } 305 if (backlog < 0 || backlog > somaxconn) 306 backlog = somaxconn; 307 so->so_qlimit = backlog; 308 ACCEPT_UNLOCK(); 309 return (0); 310} 311 312void 313sofree(so) 314 struct socket *so; 315{ 316 struct socket *head; 317 318 KASSERT(so->so_count == 0, ("socket %p so_count not 0", so)); 319 SOCK_LOCK_ASSERT(so); 320 321 if (so->so_pcb != NULL || (so->so_state & SS_NOFDREF) == 0) { 322 SOCK_UNLOCK(so); 323 return; 324 } 325 326 SOCK_UNLOCK(so); 327 ACCEPT_LOCK(); 328 head = so->so_head; 329 if (head != NULL) { 330 KASSERT((so->so_qstate & SQ_COMP) != 0 || 331 (so->so_qstate & SQ_INCOMP) != 0, 332 ("sofree: so_head != NULL, but neither SQ_COMP nor " 333 "SQ_INCOMP")); 334 KASSERT((so->so_qstate & SQ_COMP) == 0 || 335 (so->so_qstate & SQ_INCOMP) == 0, 336 ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP")); 337 /* 338 * accept(2) is responsible draining the completed 339 * connection queue and freeing those sockets, so 340 * we just return here if this socket is currently 341 * on the completed connection queue. Otherwise, 342 * accept(2) may hang after select(2) has indicating 343 * that a listening socket was ready. If it's an 344 * incomplete connection, we remove it from the queue 345 * and free it; otherwise, it won't be released until 346 * the listening socket is closed. 
347 */ 348 if ((so->so_qstate & SQ_COMP) != 0) { 349 ACCEPT_UNLOCK(); 350 return; 351 } 352 TAILQ_REMOVE(&head->so_incomp, so, so_list); 353 head->so_incqlen--; 354 so->so_qstate &= ~SQ_INCOMP; 355 so->so_head = NULL; 356 } 357 KASSERT((so->so_qstate & SQ_COMP) == 0 && 358 (so->so_qstate & SQ_INCOMP) == 0, 359 ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)", 360 so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP)); 361 ACCEPT_UNLOCK(); 362 SOCKBUF_LOCK(&so->so_snd); 363 so->so_snd.sb_flags |= SB_NOINTR; 364 (void)sblock(&so->so_snd, M_WAITOK); 365 /* 366 * socantsendmore_locked() drops the socket buffer mutex so that it 367 * can safely perform wakeups. Re-acquire the mutex before 368 * continuing. 369 */ 370 socantsendmore_locked(so); 371 SOCKBUF_LOCK(&so->so_snd); 372 sbunlock(&so->so_snd); 373 sbrelease_locked(&so->so_snd, so); 374 SOCKBUF_UNLOCK(&so->so_snd); 375 sorflush(so); 376 sodealloc(so); 377} 378 379/* 380 * Close a socket on last file table reference removal. 381 * Initiate disconnect if connected. 382 * Free socket when disconnect complete. 383 * 384 * This function will sorele() the socket. Note that soclose() may be 385 * called prior to the ref count reaching zero. The actual socket 386 * structure will not be freed until the ref count reaches zero. 
387 */ 388int 389soclose(so) 390 struct socket *so; 391{ 392 int error = 0; 393 394 funsetown(&so->so_sigio); 395 if (so->so_options & SO_ACCEPTCONN) { 396 struct socket *sp; 397 ACCEPT_LOCK(); 398 while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) { 399 TAILQ_REMOVE(&so->so_incomp, sp, so_list); 400 so->so_incqlen--; 401 sp->so_qstate &= ~SQ_INCOMP; 402 sp->so_head = NULL; 403 ACCEPT_UNLOCK(); 404 (void) soabort(sp); 405 ACCEPT_LOCK(); 406 } 407 while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) { 408 TAILQ_REMOVE(&so->so_comp, sp, so_list); 409 so->so_qlen--; 410 sp->so_qstate &= ~SQ_COMP; 411 sp->so_head = NULL; 412 ACCEPT_UNLOCK(); 413 (void) soabort(sp); 414 ACCEPT_LOCK(); 415 } 416 ACCEPT_UNLOCK(); 417 } 418 if (so->so_pcb == NULL) 419 goto discard; 420 if (so->so_state & SS_ISCONNECTED) { 421 if ((so->so_state & SS_ISDISCONNECTING) == 0) { 422 error = sodisconnect(so); 423 if (error) 424 goto drop; 425 } 426 if (so->so_options & SO_LINGER) { 427 if ((so->so_state & SS_ISDISCONNECTING) && 428 (so->so_state & SS_NBIO)) 429 goto drop; 430 while (so->so_state & SS_ISCONNECTED) { 431 error = tsleep(&so->so_timeo, 432 PSOCK | PCATCH, "soclos", so->so_linger * hz); 433 if (error) 434 break; 435 } 436 } 437 } 438drop: 439 if (so->so_pcb != NULL) { 440 int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so); 441 if (error == 0) 442 error = error2; 443 } 444discard: 445 SOCK_LOCK(so); 446 KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF")); 447 so->so_state |= SS_NOFDREF; 448 sorele(so); 449 return (error); 450} 451 452/* 453 * soabort() must not be called with any socket locks held, as it calls 454 * into the protocol, which will call back into the socket code causing 455 * it to acquire additional socket locks that may cause recursion or lock 456 * order reversals. 
457 */ 458int 459soabort(so) 460 struct socket *so; 461{ 462 int error; 463 464 error = (*so->so_proto->pr_usrreqs->pru_abort)(so); 465 if (error) { 466 SOCK_LOCK(so); 467 sotryfree(so); /* note: does not decrement the ref count */ 468 return error; 469 } 470 return (0); 471} 472 473int 474soaccept(so, nam) 475 struct socket *so; 476 struct sockaddr **nam; 477{ 478 int error; 479 480 SOCK_LOCK(so); 481 KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF")); 482 so->so_state &= ~SS_NOFDREF; 483 SOCK_UNLOCK(so); 484 error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam); 485 return (error); 486} 487 488int 489soconnect(so, nam, td) 490 struct socket *so; 491 struct sockaddr *nam; 492 struct thread *td; 493{ 494 int error; 495 496 if (so->so_options & SO_ACCEPTCONN) 497 return (EOPNOTSUPP); 498 /* 499 * If protocol is connection-based, can only connect once. 500 * Otherwise, if connected, try to disconnect first. 501 * This allows user to disconnect by connecting to, e.g., 502 * a null address. 503 */ 504 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) && 505 ((so->so_proto->pr_flags & PR_CONNREQUIRED) || 506 (error = sodisconnect(so)))) 507 error = EISCONN; 508 else 509 error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td); 510 return (error); 511} 512 513int 514soconnect2(so1, so2) 515 struct socket *so1; 516 struct socket *so2; 517{ 518 519 return ((*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2)); 520} 521 522int 523sodisconnect(so) 524 struct socket *so; 525{ 526 int error; 527 528 if ((so->so_state & SS_ISCONNECTED) == 0) 529 return (ENOTCONN); 530 if (so->so_state & SS_ISDISCONNECTING) 531 return (EALREADY); 532 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so); 533 return (error); 534} 535 536#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK) 537/* 538 * Send on a socket. 539 * If send must go all at once and message is larger than 540 * send buffering, then hard error. 541 * Lock against other senders. 
542 * If must go all at once and not enough room now, then 543 * inform user that this would block and do nothing. 544 * Otherwise, if nonblocking, send as much as possible. 545 * The data to be sent is described by "uio" if nonzero, 546 * otherwise by the mbuf chain "top" (which must be null 547 * if uio is not). Data provided in mbuf chain must be small 548 * enough to send all at once. 549 * 550 * Returns nonzero on error, timeout or signal; callers 551 * must check for short counts if EINTR/ERESTART are returned. 552 * Data and control buffers are freed on return. 553 */ 554 555#ifdef ZERO_COPY_SOCKETS 556struct so_zerocopy_stats{ 557 int size_ok; 558 int align_ok; 559 int found_ifp; 560}; 561struct so_zerocopy_stats so_zerocp_stats = {0,0,0}; 562#include <netinet/in.h> 563#include <net/route.h> 564#include <netinet/in_pcb.h> 565#include <vm/vm.h> 566#include <vm/vm_page.h> 567#include <vm/vm_object.h> 568#endif /*ZERO_COPY_SOCKETS*/ 569 570int 571sosend(so, addr, uio, top, control, flags, td) 572 struct socket *so; 573 struct sockaddr *addr; 574 struct uio *uio; 575 struct mbuf *top; 576 struct mbuf *control; 577 int flags; 578 struct thread *td; 579{ 580 struct mbuf **mp; 581 struct mbuf *m; 582 long space, len = 0, resid; 583 int clen = 0, error, dontroute; 584 int atomic = sosendallatonce(so) || top; 585#ifdef ZERO_COPY_SOCKETS 586 int cow_send; 587#endif /* ZERO_COPY_SOCKETS */ 588 589 if (uio != NULL) 590 resid = uio->uio_resid; 591 else 592 resid = top->m_pkthdr.len; 593 /* 594 * In theory resid should be unsigned. 595 * However, space must be signed, as it might be less than 0 596 * if we over-committed, and we must use a signed comparison 597 * of space and resid. On the other hand, a negative resid 598 * causes us to loop sending 0-length segments to the protocol. 599 * 600 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM 601 * type sockets since that's an error. 
602 */ 603 if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) { 604 error = EINVAL; 605 goto out; 606 } 607 608 dontroute = 609 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 && 610 (so->so_proto->pr_flags & PR_ATOMIC); 611 if (td != NULL) 612 td->td_proc->p_stats->p_ru.ru_msgsnd++; 613 if (control != NULL) 614 clen = control->m_len; 615#define snderr(errno) { error = (errno); goto release; } 616 617 SOCKBUF_LOCK(&so->so_snd); 618restart: 619 SOCKBUF_LOCK_ASSERT(&so->so_snd); 620 error = sblock(&so->so_snd, SBLOCKWAIT(flags)); 621 if (error) 622 goto out_locked; 623 do { 624 SOCKBUF_LOCK_ASSERT(&so->so_snd); 625 if (so->so_snd.sb_state & SBS_CANTSENDMORE) 626 snderr(EPIPE); 627 if (so->so_error) { 628 error = so->so_error; 629 so->so_error = 0; 630 goto release; 631 } 632 if ((so->so_state & SS_ISCONNECTED) == 0) { 633 /* 634 * `sendto' and `sendmsg' is allowed on a connection- 635 * based socket if it supports implied connect. 636 * Return ENOTCONN if not connected and no address is 637 * supplied. 638 */ 639 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && 640 (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { 641 if ((so->so_state & SS_ISCONFIRMING) == 0 && 642 !(resid == 0 && clen != 0)) 643 snderr(ENOTCONN); 644 } else if (addr == NULL) 645 snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ? 646 ENOTCONN : EDESTADDRREQ); 647 } 648 space = sbspace(&so->so_snd); 649 if (flags & MSG_OOB) 650 space += 1024; 651 if ((atomic && resid > so->so_snd.sb_hiwat) || 652 clen > so->so_snd.sb_hiwat) 653 snderr(EMSGSIZE); 654 if (space < resid + clen && 655 (atomic || space < so->so_snd.sb_lowat || space < clen)) { 656 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) 657 snderr(EWOULDBLOCK); 658 sbunlock(&so->so_snd); 659 error = sbwait(&so->so_snd); 660 if (error) 661 goto out_locked; 662 goto restart; 663 } 664 SOCKBUF_UNLOCK(&so->so_snd); 665 mp = ⊤ 666 space -= clen; 667 do { 668 if (uio == NULL) { 669 /* 670 * Data is prepackaged in "top". 
671 */ 672 resid = 0; 673 if (flags & MSG_EOR) 674 top->m_flags |= M_EOR; 675 } else do { 676#ifdef ZERO_COPY_SOCKETS 677 cow_send = 0; 678#endif /* ZERO_COPY_SOCKETS */ 679 if (resid >= MINCLSIZE) { 680#ifdef ZERO_COPY_SOCKETS 681 if (top == NULL) { 682 MGETHDR(m, M_TRYWAIT, MT_DATA); 683 if (m == NULL) { 684 error = ENOBUFS; 685 SOCKBUF_LOCK(&so->so_snd); 686 goto release; 687 } 688 m->m_pkthdr.len = 0; 689 m->m_pkthdr.rcvif = (struct ifnet *)0; 690 } else { 691 MGET(m, M_TRYWAIT, MT_DATA); 692 if (m == NULL) { 693 error = ENOBUFS; 694 SOCKBUF_LOCK(&so->so_snd); 695 goto release; 696 } 697 } 698 if (so_zero_copy_send && 699 resid>=PAGE_SIZE && 700 space>=PAGE_SIZE && 701 uio->uio_iov->iov_len>=PAGE_SIZE) { 702 so_zerocp_stats.size_ok++; 703 if (!((vm_offset_t) 704 uio->uio_iov->iov_base & PAGE_MASK)){ 705 so_zerocp_stats.align_ok++; 706 cow_send = socow_setup(m, uio); 707 } 708 } 709 if (!cow_send) { 710 MCLGET(m, M_TRYWAIT); 711 if ((m->m_flags & M_EXT) == 0) { 712 m_free(m); 713 m = NULL; 714 } else { 715 len = min(min(MCLBYTES, resid), space); 716 } 717 } else 718 len = PAGE_SIZE; 719#else /* ZERO_COPY_SOCKETS */ 720 if (top == NULL) { 721 m = m_getcl(M_TRYWAIT, MT_DATA, M_PKTHDR); 722 m->m_pkthdr.len = 0; 723 m->m_pkthdr.rcvif = (struct ifnet *)0; 724 } else 725 m = m_getcl(M_TRYWAIT, MT_DATA, 0); 726 len = min(min(MCLBYTES, resid), space); 727#endif /* ZERO_COPY_SOCKETS */ 728 } else { 729 if (top == NULL) { 730 m = m_gethdr(M_TRYWAIT, MT_DATA); 731 m->m_pkthdr.len = 0; 732 m->m_pkthdr.rcvif = (struct ifnet *)0; 733 734 len = min(min(MHLEN, resid), space); 735 /* 736 * For datagram protocols, leave room 737 * for protocol headers in first mbuf. 
738 */ 739 if (atomic && m && len < MHLEN) 740 MH_ALIGN(m, len); 741 } else { 742 m = m_get(M_TRYWAIT, MT_DATA); 743 len = min(min(MLEN, resid), space); 744 } 745 } 746 if (m == NULL) { 747 error = ENOBUFS; 748 SOCKBUF_LOCK(&so->so_snd); 749 goto release; 750 } 751 752 space -= len; 753#ifdef ZERO_COPY_SOCKETS 754 if (cow_send) 755 error = 0; 756 else 757#endif /* ZERO_COPY_SOCKETS */ 758 error = uiomove(mtod(m, void *), (int)len, uio); 759 resid = uio->uio_resid; 760 m->m_len = len; 761 *mp = m; 762 top->m_pkthdr.len += len; 763 if (error) { 764 SOCKBUF_LOCK(&so->so_snd); 765 goto release; 766 } 767 mp = &m->m_next; 768 if (resid <= 0) { 769 if (flags & MSG_EOR) 770 top->m_flags |= M_EOR; 771 break; 772 } 773 } while (space > 0 && atomic); 774 if (dontroute) { 775 SOCK_LOCK(so); 776 so->so_options |= SO_DONTROUTE; 777 SOCK_UNLOCK(so); 778 } 779 /* 780 * XXX all the SBS_CANTSENDMORE checks previously 781 * done could be out of date. We could have recieved 782 * a reset packet in an interrupt or maybe we slept 783 * while doing page faults in uiomove() etc. We could 784 * probably recheck again inside the splnet() protection 785 * here, but there are probably other places that this 786 * also happens. We must rethink this. 787 */ 788 error = (*so->so_proto->pr_usrreqs->pru_send)(so, 789 (flags & MSG_OOB) ? PRUS_OOB : 790 /* 791 * If the user set MSG_EOF, the protocol 792 * understands this flag and nothing left to 793 * send then use PRU_SEND_EOF instead of PRU_SEND. 794 */ 795 ((flags & MSG_EOF) && 796 (so->so_proto->pr_flags & PR_IMPLOPCL) && 797 (resid <= 0)) ? 798 PRUS_EOF : 799 /* If there is more to send set PRUS_MORETOCOME */ 800 (resid > 0 && space > 0) ? 
PRUS_MORETOCOME : 0, 801 top, addr, control, td); 802 if (dontroute) { 803 SOCK_LOCK(so); 804 so->so_options &= ~SO_DONTROUTE; 805 SOCK_UNLOCK(so); 806 } 807 clen = 0; 808 control = NULL; 809 top = NULL; 810 mp = ⊤ 811 if (error) { 812 SOCKBUF_LOCK(&so->so_snd); 813 goto release; 814 } 815 } while (resid && space > 0); 816 SOCKBUF_LOCK(&so->so_snd); 817 } while (resid); 818 819release: 820 SOCKBUF_LOCK_ASSERT(&so->so_snd); 821 sbunlock(&so->so_snd); 822out_locked: 823 SOCKBUF_LOCK_ASSERT(&so->so_snd); 824 SOCKBUF_UNLOCK(&so->so_snd); 825out: 826 if (top != NULL) 827 m_freem(top); 828 if (control != NULL) 829 m_freem(control); 830 return (error); 831} 832 833/* 834 * Implement receive operations on a socket. 835 * We depend on the way that records are added to the sockbuf 836 * by sbappend*. In particular, each record (mbufs linked through m_next) 837 * must begin with an address if the protocol so specifies, 838 * followed by an optional mbuf or mbufs containing ancillary data, 839 * and then zero or more mbufs of data. 840 * In order to avoid blocking network interrupts for the entire time here, 841 * we splx() while doing the actual copy to user space. 842 * Although the sockbuf is locked, new data may still be appended, 843 * and thus we must maintain consistency of the sockbuf during that time. 844 * 845 * The caller may receive the data as a single mbuf chain by supplying 846 * an mbuf **mp0 for use in returning the chain. The uio is then used 847 * only for the count in uio_resid. 
848 */ 849int 850soreceive(so, psa, uio, mp0, controlp, flagsp) 851 struct socket *so; 852 struct sockaddr **psa; 853 struct uio *uio; 854 struct mbuf **mp0; 855 struct mbuf **controlp; 856 int *flagsp; 857{ 858 struct mbuf *m, **mp; 859 int flags, len, error, offset; 860 struct protosw *pr = so->so_proto; 861 struct mbuf *nextrecord; 862 int moff, type = 0; 863 int orig_resid = uio->uio_resid; 864 865 mp = mp0; 866 if (psa != NULL) 867 *psa = 0; 868 if (controlp != NULL) 869 *controlp = 0; 870 if (flagsp != NULL) 871 flags = *flagsp &~ MSG_EOR; 872 else 873 flags = 0; 874 if (flags & MSG_OOB) { 875 m = m_get(M_TRYWAIT, MT_DATA); 876 if (m == NULL) 877 return (ENOBUFS); 878 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK); 879 if (error) 880 goto bad; 881 do { 882#ifdef ZERO_COPY_SOCKETS 883 if (so_zero_copy_receive) { 884 vm_page_t pg; 885 int disposable; 886 887 if ((m->m_flags & M_EXT) 888 && (m->m_ext.ext_type == EXT_DISPOSABLE)) 889 disposable = 1; 890 else 891 disposable = 0; 892 893 pg = PHYS_TO_VM_PAGE(vtophys(mtod(m, caddr_t))); 894 if (uio->uio_offset == -1) 895 uio->uio_offset =IDX_TO_OFF(pg->pindex); 896 897 error = uiomoveco(mtod(m, void *), 898 min(uio->uio_resid, m->m_len), 899 uio, pg->object, 900 disposable); 901 } else 902#endif /* ZERO_COPY_SOCKETS */ 903 error = uiomove(mtod(m, void *), 904 (int) min(uio->uio_resid, m->m_len), uio); 905 m = m_free(m); 906 } while (uio->uio_resid && error == 0 && m); 907bad: 908 if (m != NULL) 909 m_freem(m); 910 return (error); 911 } 912 if (mp != NULL) 913 *mp = NULL; 914 if (so->so_state & SS_ISCONFIRMING && uio->uio_resid) 915 (*pr->pr_usrreqs->pru_rcvd)(so, 0); 916 917 SOCKBUF_LOCK(&so->so_rcv); 918restart: 919 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 920 error = sblock(&so->so_rcv, SBLOCKWAIT(flags)); 921 if (error) 922 goto out; 923 924 m = so->so_rcv.sb_mb; 925 /* 926 * If we have less data than requested, block awaiting more 927 * (subject to any timeout) if: 928 * 1. 
the current count is less than the low water mark, or 929 * 2. MSG_WAITALL is set, and it is possible to do the entire 930 * receive operation at once if we block (resid <= hiwat). 931 * 3. MSG_DONTWAIT is not set 932 * If MSG_WAITALL is set but resid is larger than the receive buffer, 933 * we have to do the receive in sections, and thus risk returning 934 * a short count if a timeout or signal occurs after we start. 935 */ 936 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 && 937 so->so_rcv.sb_cc < uio->uio_resid) && 938 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat || 939 ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) && 940 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) { 941 KASSERT(m != NULL || !so->so_rcv.sb_cc, 942 ("receive: m == %p so->so_rcv.sb_cc == %u", 943 m, so->so_rcv.sb_cc)); 944 if (so->so_error) { 945 if (m != NULL) 946 goto dontblock; 947 error = so->so_error; 948 if ((flags & MSG_PEEK) == 0) 949 so->so_error = 0; 950 goto release; 951 } 952 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 953 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 954 if (m) 955 goto dontblock; 956 else 957 goto release; 958 } 959 for (; m != NULL; m = m->m_next) 960 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { 961 m = so->so_rcv.sb_mb; 962 goto dontblock; 963 } 964 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 && 965 (so->so_proto->pr_flags & PR_CONNREQUIRED)) { 966 error = ENOTCONN; 967 goto release; 968 } 969 if (uio->uio_resid == 0) 970 goto release; 971 if ((so->so_state & SS_NBIO) || 972 (flags & (MSG_DONTWAIT|MSG_NBIO))) { 973 error = EWOULDBLOCK; 974 goto release; 975 } 976 SBLASTRECORDCHK(&so->so_rcv); 977 SBLASTMBUFCHK(&so->so_rcv); 978 sbunlock(&so->so_rcv); 979 error = sbwait(&so->so_rcv); 980 if (error) 981 goto out; 982 goto restart; 983 } 984dontblock: 985 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 986 if (uio->uio_td) 987 uio->uio_td->td_proc->p_stats->p_ru.ru_msgrcv++; 988 SBLASTRECORDCHK(&so->so_rcv); 989 
SBLASTMBUFCHK(&so->so_rcv); 990 nextrecord = m->m_nextpkt; 991 if (pr->pr_flags & PR_ADDR) { 992 KASSERT(m->m_type == MT_SONAME, 993 ("m->m_type == %d", m->m_type)); 994 orig_resid = 0; 995 if (psa != NULL) 996 *psa = sodupsockaddr(mtod(m, struct sockaddr *), 997 M_NOWAIT); 998 if (flags & MSG_PEEK) { 999 m = m->m_next; 1000 } else { 1001 sbfree(&so->so_rcv, m); 1002 so->so_rcv.sb_mb = m_free(m); 1003 m = so->so_rcv.sb_mb; 1004 if (m != NULL) 1005 m->m_nextpkt = nextrecord; 1006 } 1007 } 1008 while (m != NULL && m->m_type == MT_CONTROL && error == 0) { 1009 if (flags & MSG_PEEK) { 1010 if (controlp != NULL) 1011 *controlp = m_copy(m, 0, m->m_len); 1012 m = m->m_next; 1013 } else { 1014 sbfree(&so->so_rcv, m); 1015 so->so_rcv.sb_mb = m->m_next; 1016 m->m_next = NULL; 1017 if (pr->pr_domain->dom_externalize) { 1018 SOCKBUF_UNLOCK(&so->so_rcv); 1019 error = (*pr->pr_domain->dom_externalize) 1020 (m, controlp); 1021 SOCKBUF_LOCK(&so->so_rcv); 1022 } else if (controlp != NULL) 1023 *controlp = m; 1024 else 1025 m_freem(m); 1026 m = so->so_rcv.sb_mb; 1027 } 1028 if (controlp != NULL) { 1029 orig_resid = 0; 1030 while (*controlp != NULL) 1031 controlp = &(*controlp)->m_next; 1032 } 1033 } 1034 if (m != NULL) { 1035 if ((flags & MSG_PEEK) == 0) { 1036 m->m_nextpkt = nextrecord; 1037 /* 1038 * If nextrecord == NULL (this is a single chain), 1039 * then sb_lastrecord may not be valid here if m 1040 * was changed earlier. 
1041 */ 1042 if (nextrecord == NULL) { 1043 KASSERT(so->so_rcv.sb_mb == m, 1044 ("receive tailq 1")); 1045 so->so_rcv.sb_lastrecord = m; 1046 } 1047 } 1048 type = m->m_type; 1049 if (type == MT_OOBDATA) 1050 flags |= MSG_OOB; 1051 } else { 1052 if ((flags & MSG_PEEK) == 0) { 1053 KASSERT(so->so_rcv.sb_mb == m,("receive tailq 2")); 1054 so->so_rcv.sb_mb = nextrecord; 1055 SB_EMPTY_FIXUP(&so->so_rcv); 1056 } 1057 } 1058 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1059 SBLASTRECORDCHK(&so->so_rcv); 1060 SBLASTMBUFCHK(&so->so_rcv); 1061 1062 moff = 0; 1063 offset = 0; 1064 while (m != NULL && uio->uio_resid > 0 && error == 0) { 1065 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1066 if (m->m_type == MT_OOBDATA) { 1067 if (type != MT_OOBDATA) 1068 break; 1069 } else if (type == MT_OOBDATA) 1070 break; 1071 else 1072 KASSERT(m->m_type == MT_DATA || m->m_type == MT_HEADER, 1073 ("m->m_type == %d", m->m_type)); 1074 so->so_rcv.sb_state &= ~SBS_RCVATMARK; 1075 len = uio->uio_resid; 1076 if (so->so_oobmark && len > so->so_oobmark - offset) 1077 len = so->so_oobmark - offset; 1078 if (len > m->m_len - moff) 1079 len = m->m_len - moff; 1080 /* 1081 * If mp is set, just pass back the mbufs. 1082 * Otherwise copy them out via the uio, then free. 1083 * Sockbuf must be consistent here (points to current mbuf, 1084 * it points to next record) when we drop priority; 1085 * we must note any additions to the sockbuf when we 1086 * block interrupts again. 
1087 */ 1088 if (mp == NULL) { 1089 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1090 SBLASTRECORDCHK(&so->so_rcv); 1091 SBLASTMBUFCHK(&so->so_rcv); 1092 SOCKBUF_UNLOCK(&so->so_rcv); 1093#ifdef ZERO_COPY_SOCKETS 1094 if (so_zero_copy_receive) { 1095 vm_page_t pg; 1096 int disposable; 1097 1098 if ((m->m_flags & M_EXT) 1099 && (m->m_ext.ext_type == EXT_DISPOSABLE)) 1100 disposable = 1; 1101 else 1102 disposable = 0; 1103 1104 pg = PHYS_TO_VM_PAGE(vtophys(mtod(m, caddr_t) + 1105 moff)); 1106 1107 if (uio->uio_offset == -1) 1108 uio->uio_offset =IDX_TO_OFF(pg->pindex); 1109 1110 error = uiomoveco(mtod(m, char *) + moff, 1111 (int)len, uio,pg->object, 1112 disposable); 1113 } else 1114#endif /* ZERO_COPY_SOCKETS */ 1115 error = uiomove(mtod(m, char *) + moff, (int)len, uio); 1116 SOCKBUF_LOCK(&so->so_rcv); 1117 if (error) 1118 goto release; 1119 } else 1120 uio->uio_resid -= len; 1121 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1122 if (len == m->m_len - moff) { 1123 if (m->m_flags & M_EOR) 1124 flags |= MSG_EOR; 1125 if (flags & MSG_PEEK) { 1126 m = m->m_next; 1127 moff = 0; 1128 } else { 1129 nextrecord = m->m_nextpkt; 1130 sbfree(&so->so_rcv, m); 1131 if (mp != NULL) { 1132 *mp = m; 1133 mp = &m->m_next; 1134 so->so_rcv.sb_mb = m = m->m_next; 1135 *mp = NULL; 1136 } else { 1137 so->so_rcv.sb_mb = m_free(m); 1138 m = so->so_rcv.sb_mb; 1139 } 1140 if (m != NULL) { 1141 m->m_nextpkt = nextrecord; 1142 if (nextrecord == NULL) 1143 so->so_rcv.sb_lastrecord = m; 1144 } else { 1145 so->so_rcv.sb_mb = nextrecord; 1146 SB_EMPTY_FIXUP(&so->so_rcv); 1147 } 1148 SBLASTRECORDCHK(&so->so_rcv); 1149 SBLASTMBUFCHK(&so->so_rcv); 1150 } 1151 } else { 1152 if (flags & MSG_PEEK) 1153 moff += len; 1154 else { 1155 if (mp != NULL) { 1156 SOCKBUF_UNLOCK(&so->so_rcv); 1157 *mp = m_copym(m, 0, len, M_TRYWAIT); 1158 SOCKBUF_LOCK(&so->so_rcv); 1159 } 1160 m->m_data += len; 1161 m->m_len -= len; 1162 so->so_rcv.sb_cc -= len; 1163 } 1164 } 1165 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1166 if (so->so_oobmark) { 1167 if 
((flags & MSG_PEEK) == 0) { 1168 so->so_oobmark -= len; 1169 if (so->so_oobmark == 0) { 1170 so->so_rcv.sb_state |= SBS_RCVATMARK; 1171 break; 1172 } 1173 } else { 1174 offset += len; 1175 if (offset == so->so_oobmark) 1176 break; 1177 } 1178 } 1179 if (flags & MSG_EOR) 1180 break; 1181 /* 1182 * If the MSG_WAITALL flag is set (for non-atomic socket), 1183 * we must not quit until "uio->uio_resid == 0" or an error 1184 * termination. If a signal/timeout occurs, return 1185 * with a short count but without error. 1186 * Keep sockbuf locked against other readers. 1187 */ 1188 while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 && 1189 !sosendallatonce(so) && nextrecord == NULL) { 1190 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1191 if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE) 1192 break; 1193 /* 1194 * Notify the protocol that some data has been 1195 * drained before blocking. 1196 */ 1197 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb != NULL) { 1198 SOCKBUF_UNLOCK(&so->so_rcv); 1199 (*pr->pr_usrreqs->pru_rcvd)(so, flags); 1200 SOCKBUF_LOCK(&so->so_rcv); 1201 } 1202 SBLASTRECORDCHK(&so->so_rcv); 1203 SBLASTMBUFCHK(&so->so_rcv); 1204 error = sbwait(&so->so_rcv); 1205 if (error) 1206 goto release; 1207 m = so->so_rcv.sb_mb; 1208 if (m != NULL) 1209 nextrecord = m->m_nextpkt; 1210 } 1211 } 1212 1213 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1214 if (m != NULL && pr->pr_flags & PR_ATOMIC) { 1215 flags |= MSG_TRUNC; 1216 if ((flags & MSG_PEEK) == 0) 1217 (void) sbdroprecord_locked(&so->so_rcv); 1218 } 1219 if ((flags & MSG_PEEK) == 0) { 1220 if (m == NULL) { 1221 /* 1222 * First part is an inline SB_EMPTY_FIXUP(). Second 1223 * part makes sure sb_lastrecord is up-to-date if 1224 * there is still data in the socket buffer. 
*/
			so->so_rcv.sb_mb = nextrecord;
			if (so->so_rcv.sb_mb == NULL) {
				so->so_rcv.sb_mbtail = NULL;
				so->so_rcv.sb_lastrecord = NULL;
			} else if (nextrecord->m_nextpkt == NULL)
				so->so_rcv.sb_lastrecord = nextrecord;
		}
		SBLASTRECORDCHK(&so->so_rcv);
		SBLASTMBUFCHK(&so->so_rcv);
		/*
		 * The protocol callback is made with the socket buffer
		 * mutex dropped; it is re-acquired right afterwards.
		 */
		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
			SOCKBUF_UNLOCK(&so->so_rcv);
			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
			SOCKBUF_LOCK(&so->so_rcv);
		}
	}
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	/*
	 * Nothing was transferred although the caller asked for data, no
	 * end-of-record was seen and the receive side is still open:
	 * release the sblock and start over.
	 */
	if (orig_resid == uio->uio_resid && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
		sbunlock(&so->so_rcv);
		goto restart;
	}

	if (flagsp != NULL)
		*flagsp |= flags;
release:
	/* Reached with both the sblock and the socket buffer mutex held. */
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	sbunlock(&so->so_rcv);
out:
	/* Reached with only the socket buffer mutex held. */
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	SOCKBUF_UNLOCK(&so->so_rcv);
	return (error);
}

/*
 * Shut down the receive side, the send side, or both sides of a socket.
 *
 * "how" must be SHUT_RD, SHUT_WR or SHUT_RDWR; any other value yields
 * EINVAL.  Unless only the send side is being shut down, the receive
 * buffer is flushed with sorflush().  Unless only the receive side is
 * being shut down, the protocol's pru_shutdown() is called and its result
 * returned.  A plain SHUT_RD returns 0.
 */
int
soshutdown(so, how)
	struct socket *so;
	int how;
{
	struct protosw *pr = so->so_proto;

	if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
		return (EINVAL);

	if (how != SHUT_WR)
		sorflush(so);
	if (how != SHUT_RD)
		return ((*pr->pr_usrreqs->pru_shutdown)(so));
	return (0);
}

/*
 * Flush the receive buffer of a socket: mark the socket as unable to
 * receive more, move the buffer contents into a private copy, clear the
 * original, and release the copy (after handing its mbufs to the domain's
 * dom_dispose() hook when the protocol has PR_RIGHTS set and the hook
 * exists).
 */
void
sorflush(so)
	struct socket *so;
{
	struct sockbuf *sb = &so->so_rcv;
	struct protosw *pr = so->so_proto;
	struct sockbuf asb;

	/*
	 * XXXRW: This is quite ugly.  The existing code made a copy of the
	 * socket buffer, then zero'd the original to clear the buffer
	 * fields.  However, with mutexes in the socket buffer, this causes
	 * problems.  We only clear the zeroable bits of the original;
	 * however, we have to initialize and destroy the mutex in the copy
	 * so that dom_dispose() and sbrelease() can lock it as needed.
	 */
	SOCKBUF_LOCK(sb);
	sb->sb_flags |= SB_NOINTR;
	(void) sblock(sb, M_WAITOK);
	/*
	 * socantrcvmore_locked() drops the socket buffer mutex so that it
	 * can safely perform wakeups.  Re-acquire the mutex before
	 * continuing.
	 */
	socantrcvmore_locked(so);
	SOCKBUF_LOCK(sb);
	sbunlock(sb);
	/*
	 * Invalidate/clear most of the sockbuf structure, but leave
	 * selinfo and mutex data unchanged.  Everything from sb_startzero
	 * to the end of the structure is moved into the local copy; the
	 * part of the copy before sb_startzero is zeroed.
	 */
	bzero(&asb, offsetof(struct sockbuf, sb_startzero));
	bcopy(&sb->sb_startzero, &asb.sb_startzero,
	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
	bzero(&sb->sb_startzero,
	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
	SOCKBUF_UNLOCK(sb);

	/* From here on only the private copy is touched. */
	SOCKBUF_LOCK_INIT(&asb, "so_rcv");
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
	sbrelease(&asb, so);
	SOCKBUF_LOCK_DESTROY(&asb);
}

#ifdef INET
/*
 * Install or remove the accept filter of a listening socket.
 *
 * A NULL "sopt" requests removal of the current filter (if any) and
 * always succeeds on a listening socket.  Otherwise the
 * struct accept_filter_arg is copied in from "sopt", the filter is looked
 * up by name and attached.
 *
 * Returns 0 on success, or:
 *	EINVAL	the socket is not listening, a filter is already
 *		installed, or the filter's accf_create() returned NULL;
 *	ENOENT	accept_filt_get() found no filter with the given name;
 *	the error returned by sooptcopyin().
 */
static int
do_setopt_accept_filter(so, sopt)
	struct	socket *so;
	struct	sockopt *sopt;
{
	struct accept_filter_arg	*afap = NULL;
	struct accept_filter	*afp;
	struct so_accf	*af = so->so_accf;
	int	error = 0;

	/* do not set/remove accept filters on non listen sockets */
	if ((so->so_options & SO_ACCEPTCONN) == 0) {
		error = EINVAL;
		goto out;
	}

	/* removing the filter */
	if (sopt == NULL) {
		if (af != NULL) {
			/* Let the filter tear down its state first. */
			if (af->so_accept_filter != NULL &&
				af->so_accept_filter->accf_destroy != NULL) {
				af->so_accept_filter->accf_destroy(so);
			}
			if (af->so_accept_filter_str != NULL) {
				FREE(af->so_accept_filter_str, M_ACCF);
			}
			FREE(af, M_ACCF);
			so->so_accf = NULL;
		}
		so->so_options &= ~SO_ACCEPTFILTER;
		return (0);
	}
	/* adding a filter */
	/* must remove previous filter first */
	if (af != NULL) {
		error = EINVAL;
		goto out;
	}
	/* don't put large objects on the kernel stack */
	MALLOC(afap, struct accept_filter_arg *, sizeof(*afap), M_TEMP, M_WAITOK);
	error = sooptcopyin(sopt, afap, sizeof *afap, sizeof *afap);
	/* Force NUL termination of both strings, whatever was copied in. */
	afap->af_name[sizeof(afap->af_name)-1] = '\0';
	afap->af_arg[sizeof(afap->af_arg)-1] = '\0';
	if (error)
		goto out;
	afp = accept_filt_get(afap->af_name);
	if (afp == NULL) {
		error = ENOENT;
		goto out;
	}
	MALLOC(af, struct so_accf *, sizeof(*af), M_ACCF, M_WAITOK | M_ZERO);
	if (afp->accf_create != NULL) {
		/* Remember the filter name in a private copy. */
		if (afap->af_name[0] != '\0') {
			int len = strlen(afap->af_name) + 1;

			MALLOC(af->so_accept_filter_str, char *, len, M_ACCF, M_WAITOK);
			strcpy(af->so_accept_filter_str, afap->af_name);
		}
		af->so_accept_filter_arg = afp->accf_create(so, afap->af_arg);
		if (af->so_accept_filter_arg == NULL) {
			/*
			 * so_accept_filter_str may still be NULL here
			 * ("af" was allocated with M_ZERO).
			 */
			FREE(af->so_accept_filter_str, M_ACCF);
			FREE(af, M_ACCF);
			so->so_accf = NULL;
			error = EINVAL;
			goto out;
		}
	}
	af->so_accept_filter = afp;
	so->so_accf = af;
	so->so_options |= SO_ACCEPTFILTER;
out:
	/* Common exit: release the temporary argument buffer, if any. */
	if (afap != NULL)
		FREE(afap, M_TEMP);
	return (error);
}
#endif /* INET */

/*
 * Perhaps this routine, and sooptcopyout(), below, ought to come in
 * an additional variant to handle the case where the option value needs
 * to be some kind of integer, but not a specific size.
 * In addition to their use here, these functions are also called by the
 * protocol-level pr_ctloutput() routines.
 *
 * Copy an option value of at least "minlen" and at most "len" bytes from
 * "sopt" into "buf".  copyin() is used when sopt->sopt_td is set and
 * bcopy() otherwise.  Returns 0, EINVAL (value shorter than "minlen") or
 * the copyin() error.
 */
int
sooptcopyin(sopt, buf, len, minlen)
	struct	sockopt *sopt;
	void	*buf;
	size_t	len;
	size_t	minlen;
{
	size_t	valsize;

	/*
	 * If the user gives us more than we wanted, we ignore it,
	 * but if we don't get the minimum length the caller
	 * wants, we return EINVAL.  On success, sopt->sopt_valsize
	 * is set to however much we actually retrieved.
	 */
	if ((valsize = sopt->sopt_valsize) < minlen)
		return EINVAL;
	if (valsize > len)
		sopt->sopt_valsize = valsize = len;

	if (sopt->sopt_td != NULL)
		return (copyin(sopt->sopt_val, buf, valsize));

	bcopy(sopt->sopt_val, buf, valsize);
	return 0;
}

/*
 * Set a socket option.
 *
 * Options whose level is not SOL_SOCKET are passed straight to the
 * protocol's pr_ctloutput() (ENOPROTOOPT if there is none).  SOL_SOCKET
 * options are handled here; after one has been applied successfully the
 * protocol's pr_ctloutput(), if present, is also called and its return
 * value is ignored.
 */
int
sosetopt(so, sopt)
	struct socket *so;
	struct sockopt *sopt;
{
	int	error, optval;
	struct	linger l;
	struct	timeval tv;
	u_long  val;
#ifdef MAC
	struct mac extmac;
#endif

	error = 0;
	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput)
			return ((*so->so_proto->pr_ctloutput)
				  (so, sopt));
		error = ENOPROTOOPT;
	} else {
		switch (sopt->sopt_name) {
#ifdef INET
		case SO_ACCEPTFILTER:
			error = do_setopt_accept_filter(so, sopt);
			if (error)
				goto bad;
			break;
#endif
		case SO_LINGER:
			error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
			if (error)
				goto bad;

			/* Update linger time and flag under the socket lock. */
			SOCK_LOCK(so);
			so->so_linger = l.l_linger;
			if (l.l_onoff)
				so->so_options |= SO_LINGER;
			else
				so->so_options &= ~SO_LINGER;
			SOCK_UNLOCK(so);
			break;

		/*
		 * Boolean options: the option name itself is used as the
		 * bit to set or clear in so_options.
		 */
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_DONTROUTE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_BINTIME:
		case SO_NOSIGPIPE:
			error = sooptcopyin(sopt, &optval, sizeof optval,
					    sizeof optval);
			if (error)
				goto bad;
			SOCK_LOCK(so);
			if (optval)
				so->so_options |= sopt->sopt_name;
			else
				so->so_options &= ~sopt->sopt_name;
			SOCK_UNLOCK(so);
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
			error = sooptcopyin(sopt, &optval, sizeof optval,
					    sizeof optval);
			if (error)
				goto bad;

			/*
			 * Values < 1 make no sense for any of these
			 * options, so disallow them.
			 */
			if (optval < 1) {
				error = EINVAL;
				goto bad;
			}

			switch (sopt->sopt_name) {
			case SO_SNDBUF:
			case SO_RCVBUF:
				/* A zero return from sbreserve() is a failure. */
				if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
				    &so->so_snd : &so->so_rcv, (u_long)optval,
				    so, curthread) == 0) {
					error = ENOBUFS;
					goto bad;
				}
				break;

			/*
			 * Make sure the low-water is never greater than
			 * the high-water.
			 */
			case SO_SNDLOWAT:
				SOCKBUF_LOCK(&so->so_snd);
				so->so_snd.sb_lowat =
				    (optval > so->so_snd.sb_hiwat) ?
				    so->so_snd.sb_hiwat : optval;
				SOCKBUF_UNLOCK(&so->so_snd);
				break;
			case SO_RCVLOWAT:
				SOCKBUF_LOCK(&so->so_rcv);
				so->so_rcv.sb_lowat =
				    (optval > so->so_rcv.sb_hiwat) ?
				    so->so_rcv.sb_hiwat : optval;
				SOCKBUF_UNLOCK(&so->so_rcv);
				break;
			}
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			error = sooptcopyin(sopt, &tv, sizeof tv,
					    sizeof tv);
			if (error)
				goto bad;

			/* assert(hz > 0); */
			if (tv.tv_sec < 0 || tv.tv_sec > SHRT_MAX / hz ||
			    tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
				error = EDOM;
				goto bad;
			}
			/* assert(tick > 0); */
			/* assert(ULONG_MAX - SHRT_MAX >= 1000000); */
			/* Convert the timeval into clock ticks. */
			val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick;
			if (val > SHRT_MAX) {
				error = EDOM;
				goto bad;
			}
			/*
			 * A non-zero sub-tick timeout is rounded up to one
			 * tick rather than down to zero.
			 */
			if (val == 0 && tv.tv_usec != 0)
				val = 1;

			switch (sopt->sopt_name) {
			case SO_SNDTIMEO:
				so->so_snd.sb_timeo = val;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.sb_timeo = val;
				break;
			}
			break;
		case SO_LABEL:
#ifdef MAC
			error = sooptcopyin(sopt, &extmac, sizeof extmac,
			    sizeof extmac);
			if (error)
				goto bad;
			/*
			 * NOTE(review): sopt_td is dereferenced without a
			 * NULL check here -- confirm that this option is
			 * never set with sopt_td == NULL.
			 */
			error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
			    so, &extmac);
#else
			error = EOPNOTSUPP;
#endif
			break;
		default:
			error = ENOPROTOOPT;
			break;
		}
		/*
		 * Give the protocol a chance to see the option as well;
		 * its return value is deliberately discarded.
		 */
		if (error == 0 && so->so_proto != NULL &&
		    so->so_proto->pr_ctloutput != NULL) {
			(void) ((*so->so_proto->pr_ctloutput)
				  (so, sopt));
		}
	}
bad:
	return (error);
}

/*
 * Helper routine for getsockopt.
 *
 * Copies at most min(len, sopt->sopt_valsize) bytes from "buf" to the
 * option buffer and stores that count in sopt->sopt_valsize.  The count
 * is updated even when sopt->sopt_val is NULL, in which case nothing is
 * copied.  copyout() is used when sopt->sopt_td is set and bcopy()
 * otherwise.  Returns 0 or the copyout() error.
 */
int
sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
{
	int	error;
	size_t	valsize;

	error = 0;

	/*
	 * Documented get behavior is that we always return a value,
	 * possibly truncated to fit in the user's buffer.
	 * Traditional behavior is that we always tell the user
	 * precisely how much we copied, rather than something useful
	 * like the total amount we had available for her.
	 * Note that this interface is not idempotent; the entire answer must
	 * be generated ahead of time.
	 */
	valsize = min(len, sopt->sopt_valsize);
	sopt->sopt_valsize = valsize;
	if (sopt->sopt_val != NULL) {
		if (sopt->sopt_td != NULL)
			error = copyout(buf, sopt->sopt_val, valsize);
		else
			bcopy(buf, sopt->sopt_val, valsize);
	}
	return error;
}

/*
 * Get a socket option.
 *
 * Options whose level is not SOL_SOCKET are passed to the protocol's
 * pr_ctloutput() (ENOPROTOOPT if there is none).  SOL_SOCKET options are
 * answered here through sooptcopyout().
 */
int
sogetopt(so, sopt)
	struct socket *so;
	struct sockopt *sopt;
{
	int	error, optval;
	struct	linger l;
	struct	timeval tv;
#ifdef INET
	struct accept_filter_arg *afap;
#endif
#ifdef MAC
	struct mac extmac;
#endif

	error = 0;
	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput) {
			return ((*so->so_proto->pr_ctloutput)
				  (so, sopt));
		} else
			return (ENOPROTOOPT);
	} else {
		switch (sopt->sopt_name) {
#ifdef INET
		case SO_ACCEPTFILTER:
			if ((so->so_options & SO_ACCEPTCONN) == 0)
				return (EINVAL);
			/*
			 * The reply is built in a zeroed temporary buffer,
			 * so it stays all-zero when no filter is installed.
			 */
			MALLOC(afap, struct accept_filter_arg *, sizeof(*afap),
				M_TEMP, M_WAITOK | M_ZERO);
			if ((so->so_options & SO_ACCEPTFILTER) != 0) {
				strcpy(afap->af_name, so->so_accf->so_accept_filter->accf_name);
				/*
				 * NOTE(review): af_arg is filled from
				 * so_accept_filter_str, which
				 * do_setopt_accept_filter() sets to a copy
				 * of the filter name -- confirm this is the
				 * intended value.
				 */
				if (so->so_accf->so_accept_filter_str != NULL)
					strcpy(afap->af_arg, so->so_accf->so_accept_filter_str);
			}
			error = sooptcopyout(sopt, afap, sizeof(*afap));
			FREE(afap, M_TEMP);
			break;
#endif

		case SO_LINGER:
			/*
			 * XXXRW: We grab the lock here to get a consistent
			 * snapshot of both fields.  This may not really
			 * be necessary.
			 */
			SOCK_LOCK(so);
			l.l_onoff = so->so_options & SO_LINGER;
			l.l_linger = so->so_linger;
			SOCK_UNLOCK(so);
			error = sooptcopyout(sopt, &l, sizeof l);
			break;

		/*
		 * Boolean options: the result is the masked option bit
		 * itself (zero or the option's bit value).
		 */
		case SO_USELOOPBACK:
		case SO_DONTROUTE:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_BINTIME:
		case SO_NOSIGPIPE:
			optval = so->so_options & sopt->sopt_name;
integer:
			/* Shared tail for every option returned as an int. */
			error = sooptcopyout(sopt, &optval, sizeof optval);
			break;

		case SO_TYPE:
			optval = so->so_type;
			goto integer;

		case SO_ERROR:
			/* Reading the pending error also clears it. */
			optval = so->so_error;
			so->so_error = 0;
			goto integer;

		case SO_SNDBUF:
			optval = so->so_snd.sb_hiwat;
			goto integer;

		case SO_RCVBUF:
			optval = so->so_rcv.sb_hiwat;
			goto integer;

		case SO_SNDLOWAT:
			optval = so->so_snd.sb_lowat;
			goto integer;

		case SO_RCVLOWAT:
			optval = so->so_rcv.sb_lowat;
			goto integer;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			optval = (sopt->sopt_name == SO_SNDTIMEO ?
				  so->so_snd.sb_timeo : so->so_rcv.sb_timeo);

			/* Convert the stored tick count back to a timeval. */
			tv.tv_sec = optval / hz;
			tv.tv_usec = (optval % hz) * tick;
			error = sooptcopyout(sopt, &tv, sizeof tv);
			break;
		case SO_LABEL:
#ifdef MAC
			/*
			 * The caller's struct mac is copied in first, handed
			 * to the MAC framework and then copied back out.
			 */
			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
			    sizeof(extmac));
			if (error)
				return (error);
			error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
			    so, &extmac);
			if (error)
				return (error);
			error = sooptcopyout(sopt, &extmac, sizeof extmac);
#else
			error = EOPNOTSUPP;
#endif
			break;
		case SO_PEERLABEL:
#ifdef MAC
			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
			    sizeof(extmac));
			if (error)
				return (error);
			error = mac_getsockopt_peerlabel(
			    sopt->sopt_td->td_ucred, so, &extmac);
			if (error)
				return (error);
			error = sooptcopyout(sopt, &extmac, sizeof extmac);
#else
			error = EOPNOTSUPP;
#endif
			break;
		default:
			error = ENOPROTOOPT;
			break;
		}
		return (error);
	}
}

/*
 * XXX; prepare mbuf for (__FreeBSD__ < 3) routines.
 *
 * Allocate an mbuf chain whose m_len fields add up to
 * sopt->sopt_valsize and return its head through "mp".  A cluster is
 * attached to an mbuf whenever more than MLEN bytes remain.  Allocations
 * use M_TRYWAIT when sopt->sopt_td is set and M_DONTWAIT otherwise.
 * Returns 0, or ENOBUFS after freeing whatever had been allocated.
 *
 * NOTE(review): sopt_valsize (a size_t elsewhere in this file) is
 * narrowed to int here; a value above INT_MAX would make sopt_size
 * negative -- confirm that callers bound the size beforehand.
 */
int
soopt_getm(struct sockopt *sopt, struct mbuf **mp)
{
	struct mbuf *m, *m_prev;
	int sopt_size = sopt->sopt_valsize;

	/* The first mbuf is allocated even when the size is zero. */
	MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
	if (m == NULL)
		return ENOBUFS;
	if (sopt_size > MLEN) {
		MCLGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT);
		/* No M_EXT after MCLGET() means no cluster was attached. */
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return ENOBUFS;
		}
		m->m_len = min(MCLBYTES, sopt_size);
	} else {
		m->m_len = min(MLEN, sopt_size);
	}
	sopt_size -= m->m_len;
	*mp = m;
	m_prev = m;

	/* Append further mbufs until the whole size is covered. */
	while (sopt_size) {
		MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
		if (m == NULL) {
			m_freem(*mp);
			return ENOBUFS;
		}
		if (sopt_size > MLEN) {
			MCLGET(m, sopt->sopt_td != NULL ? M_TRYWAIT :
			    M_DONTWAIT);
			if ((m->m_flags & M_EXT) == 0) {
				/* "m" is not linked into the chain yet. */
				m_freem(m);
				m_freem(*mp);
				return ENOBUFS;
			}
			m->m_len = min(MCLBYTES, sopt_size);
		} else {
			m->m_len = min(MLEN, sopt_size);
		}
		sopt_size -= m->m_len;
		m_prev->m_next = m;
		m_prev = m;
	}
	return 0;
}

/*
 * XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines.
 *
 * Fill each mbuf of chain "m" with m_len bytes taken from the option
 * buffer, advancing sopt->sopt_val and decrementing sopt->sopt_valsize
 * along the way.  Does nothing when sopt->sopt_val is NULL.  On a
 * copyin() failure the whole chain is freed and the error returned.
 * Panics if the option data runs out before the end of the chain.
 */
int
soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
{
	struct mbuf *m0 = m;

	if (sopt->sopt_val == NULL)
		return 0;
	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
		if (sopt->sopt_td != NULL) {
			int error;

			error = copyin(sopt->sopt_val, mtod(m, char *),
				       m->m_len);
			if (error != 0) {
				m_freem(m0);
				return(error);
			}
		} else
			bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
		sopt->sopt_valsize -= m->m_len;
		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
		m = m->m_next;
	}
	if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */
		panic("ip6_sooptmcopyin");
	return 0;
}

/* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines.
*/ 1855int 1856soopt_mcopyout(struct sockopt *sopt, struct mbuf *m) 1857{ 1858 struct mbuf *m0 = m; 1859 size_t valsize = 0; 1860 1861 if (sopt->sopt_val == NULL) 1862 return 0; 1863 while (m != NULL && sopt->sopt_valsize >= m->m_len) { 1864 if (sopt->sopt_td != NULL) { 1865 int error; 1866 1867 error = copyout(mtod(m, char *), sopt->sopt_val, 1868 m->m_len); 1869 if (error != 0) { 1870 m_freem(m0); 1871 return(error); 1872 } 1873 } else 1874 bcopy(mtod(m, char *), sopt->sopt_val, m->m_len); 1875 sopt->sopt_valsize -= m->m_len; 1876 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; 1877 valsize += m->m_len; 1878 m = m->m_next; 1879 } 1880 if (m != NULL) { 1881 /* enough soopt buffer should be given from user-land */ 1882 m_freem(m0); 1883 return(EINVAL); 1884 } 1885 sopt->sopt_valsize = valsize; 1886 return 0; 1887} 1888 1889void 1890sohasoutofband(so) 1891 struct socket *so; 1892{ 1893 if (so->so_sigio != NULL) 1894 pgsigio(&so->so_sigio, SIGURG, 0); 1895 selwakeuppri(&so->so_rcv.sb_sel, PSOCK); 1896} 1897 1898int 1899sopoll(struct socket *so, int events, struct ucred *active_cred, 1900 struct thread *td) 1901{ 1902 int revents = 0; 1903 1904 if (events & (POLLIN | POLLRDNORM)) 1905 if (soreadable(so)) 1906 revents |= events & (POLLIN | POLLRDNORM); 1907 1908 if (events & POLLINIGNEOF) 1909 if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat || 1910 !TAILQ_EMPTY(&so->so_comp) || so->so_error) 1911 revents |= POLLINIGNEOF; 1912 1913 if (events & (POLLOUT | POLLWRNORM)) 1914 if (sowriteable(so)) 1915 revents |= events & (POLLOUT | POLLWRNORM); 1916 1917 if (events & (POLLPRI | POLLRDBAND)) 1918 if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK)) 1919 revents |= events & (POLLPRI | POLLRDBAND); 1920 1921 if (revents == 0) { 1922 if (events & 1923 (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM | 1924 POLLRDBAND)) { 1925 SOCKBUF_LOCK(&so->so_rcv); 1926 selrecord(td, &so->so_rcv.sb_sel); 1927 so->so_rcv.sb_flags |= SB_SEL; 1928 SOCKBUF_UNLOCK(&so->so_rcv); 1929 } 1930 
1931 if (events & (POLLOUT | POLLWRNORM)) { 1932 SOCKBUF_LOCK(&so->so_snd); 1933 selrecord(td, &so->so_snd.sb_sel); 1934 so->so_snd.sb_flags |= SB_SEL; 1935 SOCKBUF_UNLOCK(&so->so_snd); 1936 } 1937 } 1938 1939 return (revents); 1940} 1941 1942int 1943soo_kqfilter(struct file *fp, struct knote *kn) 1944{ 1945 struct socket *so = kn->kn_fp->f_data; 1946 struct sockbuf *sb; 1947 1948 switch (kn->kn_filter) { 1949 case EVFILT_READ: 1950 if (so->so_options & SO_ACCEPTCONN) 1951 kn->kn_fop = &solisten_filtops; 1952 else 1953 kn->kn_fop = &soread_filtops; 1954 sb = &so->so_rcv; 1955 break; 1956 case EVFILT_WRITE: 1957 kn->kn_fop = &sowrite_filtops; 1958 sb = &so->so_snd; 1959 break; 1960 default: 1961 return (1); 1962 } 1963 1964 SOCKBUF_LOCK(sb); 1965 SLIST_INSERT_HEAD(&sb->sb_sel.si_note, kn, kn_selnext); 1966 sb->sb_flags |= SB_KNOTE; 1967 SOCKBUF_UNLOCK(sb); 1968 return (0); 1969} 1970 1971static void 1972filt_sordetach(struct knote *kn) 1973{ 1974 struct socket *so = kn->kn_fp->f_data; 1975 1976 SOCKBUF_LOCK(&so->so_rcv); 1977 SLIST_REMOVE(&so->so_rcv.sb_sel.si_note, kn, knote, kn_selnext); 1978 if (SLIST_EMPTY(&so->so_rcv.sb_sel.si_note)) 1979 so->so_rcv.sb_flags &= ~SB_KNOTE; 1980 SOCKBUF_UNLOCK(&so->so_rcv); 1981} 1982 1983/*ARGSUSED*/ 1984static int 1985filt_soread(struct knote *kn, long hint) 1986{ 1987 struct socket *so = kn->kn_fp->f_data; 1988 int need_lock, result; 1989 1990 /* 1991 * XXXRW: Conditional locking because filt_soread() can be called 1992 * either from KNOTE() in the socket context where the socket buffer 1993 * lock is already held, or from kqueue() itself. 
1994 */ 1995 need_lock = !SOCKBUF_OWNED(&so->so_rcv); 1996 if (need_lock) 1997 SOCKBUF_LOCK(&so->so_rcv); 1998 kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl; 1999 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 2000 kn->kn_flags |= EV_EOF; 2001 kn->kn_fflags = so->so_error; 2002 result = 1; 2003 } else if (so->so_error) /* temporary udp error */ 2004 result = 1; 2005 else if (kn->kn_sfflags & NOTE_LOWAT) 2006 result = (kn->kn_data >= kn->kn_sdata); 2007 else 2008 result = (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat); 2009 if (need_lock) 2010 SOCKBUF_UNLOCK(&so->so_rcv); 2011 return (result); 2012} 2013 2014static void 2015filt_sowdetach(struct knote *kn) 2016{ 2017 struct socket *so = kn->kn_fp->f_data; 2018 2019 SOCKBUF_LOCK(&so->so_snd); 2020 SLIST_REMOVE(&so->so_snd.sb_sel.si_note, kn, knote, kn_selnext); 2021 if (SLIST_EMPTY(&so->so_snd.sb_sel.si_note)) 2022 so->so_snd.sb_flags &= ~SB_KNOTE; 2023 SOCKBUF_UNLOCK(&so->so_snd); 2024} 2025 2026/*ARGSUSED*/ 2027static int 2028filt_sowrite(struct knote *kn, long hint) 2029{ 2030 struct socket *so = kn->kn_fp->f_data; 2031 int need_lock, result; 2032 2033 /* 2034 * XXXRW: Conditional locking because filt_soread() can be called 2035 * either from KNOTE() in the socket context where the socket buffer 2036 * lock is already held, or from kqueue() itself. 
2037 */ 2038 need_lock = !SOCKBUF_OWNED(&so->so_snd); 2039 if (need_lock) 2040 SOCKBUF_LOCK(&so->so_snd); 2041 kn->kn_data = sbspace(&so->so_snd); 2042 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 2043 kn->kn_flags |= EV_EOF; 2044 kn->kn_fflags = so->so_error; 2045 result = 1; 2046 } else if (so->so_error) /* temporary udp error */ 2047 result = 1; 2048 else if (((so->so_state & SS_ISCONNECTED) == 0) && 2049 (so->so_proto->pr_flags & PR_CONNREQUIRED)) 2050 result = 0; 2051 else if (kn->kn_sfflags & NOTE_LOWAT) 2052 result = (kn->kn_data >= kn->kn_sdata); 2053 else 2054 result = (kn->kn_data >= so->so_snd.sb_lowat); 2055 if (need_lock) 2056 SOCKBUF_UNLOCK(&so->so_snd); 2057 return (result); 2058} 2059 2060/*ARGSUSED*/ 2061static int 2062filt_solisten(struct knote *kn, long hint) 2063{ 2064 struct socket *so = kn->kn_fp->f_data; 2065 2066 kn->kn_data = so->so_qlen; 2067 return (! TAILQ_EMPTY(&so->so_comp)); 2068} 2069 2070int 2071socheckuid(struct socket *so, uid_t uid) 2072{ 2073 2074 if (so == NULL) 2075 return (EPERM); 2076 if (so->so_cred->cr_uid == uid) 2077 return (0); 2078 return (EPERM); 2079} 2080