uipc_socket.c revision 131889
/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/uipc_socket.c 131889 2004-07-10 03:47:15Z rwatson $");

#include "opt_inet.h"
#include "opt_mac.h"
#include "opt_zero.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mac.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/domain.h>
#include <sys/file.h>			/* for struct knote */
#include <sys/kernel.h>
#include <sys/event.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <sys/jail.h>

#include <vm/uma.h>


#ifdef INET
static int	do_setopt_accept_filter(struct socket *so, struct sockopt *sopt);
#endif

static void	filt_sordetach(struct knote *kn);
static int	filt_soread(struct knote *kn, long hint);
static void	filt_sowdetach(struct knote *kn);
static int	filt_sowrite(struct knote *kn, long hint);
static int	filt_solisten(struct knote *kn, long hint);

static struct filterops solisten_filtops =
	{ 1, NULL, filt_sordetach, filt_solisten };
static struct filterops soread_filtops =
	{ 1, NULL, filt_sordetach, filt_soread };
static struct filterops sowrite_filtops =
	{ 1, NULL, filt_sowdetach, filt_sowrite };

uma_zone_t socket_zone;
so_gen_t	so_gencnt;	/* generation count for sockets */

MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

SYSCTL_DECL(_kern_ipc);

static int somaxconn = SOMAXCONN;
SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW,
    &somaxconn, 0, "Maximum pending socket connection queue size");
static int numopensockets;
SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
    &numopensockets, 0, "Number of open sockets");
#ifdef ZERO_COPY_SOCKETS
/* These aren't static because they're used in other files. */
int so_zero_copy_send = 1;
int so_zero_copy_receive = 1;
SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0,
    "Zero copy controls");
SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW,
    &so_zero_copy_receive, 0, "Enable zero copy receive");
SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW,
    &so_zero_copy_send, 0, "Enable zero copy send");
#endif /* ZERO_COPY_SOCKETS */

/*
 * accept_mtx locks down per-socket fields relating to accept queues.  See
 * socketvar.h for an annotation of the protected fields of struct socket.
 */
struct mtx accept_mtx;
MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);

/*
 * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
 * so_gencnt field.
 *
 * XXXRW: These variables might be better manipulated using atomic operations
 * for improved efficiency.
 */
static struct mtx so_global_mtx;
MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF);

/*
 * Socket operation routines.
 * These routines are called by the routines in
 * sys_socket.c or from a system process, and
 * implement the semantics of socket operations by
 * switching out to the protocol specific routines.
 */

/*
 * Get a socket structure from our zone, and initialize it.
 * Note that it would probably be better to allocate socket
 * and PCB at the same time, but I'm not convinced that all
 * the protocols can be easily modified to do this.
 *
 * soalloc() returns a socket with a ref count of 0.
 */
struct socket *
soalloc(int mflags)
{
	struct socket *so;
#ifdef MAC
	int error;
#endif

	so = uma_zalloc(socket_zone, mflags | M_ZERO);
	if (so != NULL) {
#ifdef MAC
		error = mac_init_socket(so, mflags);
		if (error != 0) {
			uma_zfree(socket_zone, so);
			so = NULL;
			return so;
		}
#endif
		SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
		SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
		/* sx_init(&so->so_sxlock, "socket sxlock"); */
		TAILQ_INIT(&so->so_aiojobq);
		mtx_lock(&so_global_mtx);
		so->so_gencnt = ++so_gencnt;
		++numopensockets;
		mtx_unlock(&so_global_mtx);
	}
	return so;
}

/*
 * socreate returns a socket with a ref count of 1.  The socket should be
 * closed with soclose().
175 */ 176int 177socreate(dom, aso, type, proto, cred, td) 178 int dom; 179 struct socket **aso; 180 int type; 181 int proto; 182 struct ucred *cred; 183 struct thread *td; 184{ 185 struct protosw *prp; 186 struct socket *so; 187 int error; 188 189 if (proto) 190 prp = pffindproto(dom, proto, type); 191 else 192 prp = pffindtype(dom, type); 193 194 if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) 195 return (EPROTONOSUPPORT); 196 197 if (jailed(cred) && jail_socket_unixiproute_only && 198 prp->pr_domain->dom_family != PF_LOCAL && 199 prp->pr_domain->dom_family != PF_INET && 200 prp->pr_domain->dom_family != PF_ROUTE) { 201 return (EPROTONOSUPPORT); 202 } 203 204 if (prp->pr_type != type) 205 return (EPROTOTYPE); 206 so = soalloc(M_WAITOK); 207 if (so == NULL) 208 return (ENOBUFS); 209 210 TAILQ_INIT(&so->so_incomp); 211 TAILQ_INIT(&so->so_comp); 212 so->so_type = type; 213 so->so_cred = crhold(cred); 214 so->so_proto = prp; 215#ifdef MAC 216 mac_create_socket(cred, so); 217#endif 218 SOCK_LOCK(so); 219 soref(so); 220 SOCK_UNLOCK(so); 221 error = (*prp->pr_usrreqs->pru_attach)(so, proto, td); 222 if (error) { 223 SOCK_LOCK(so); 224 so->so_state |= SS_NOFDREF; 225 sorele(so); 226 return (error); 227 } 228 *aso = so; 229 return (0); 230} 231 232int 233sobind(so, nam, td) 234 struct socket *so; 235 struct sockaddr *nam; 236 struct thread *td; 237{ 238 239 return ((*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td)); 240} 241 242void 243sodealloc(struct socket *so) 244{ 245 246 KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count)); 247 mtx_lock(&so_global_mtx); 248 so->so_gencnt = ++so_gencnt; 249 mtx_unlock(&so_global_mtx); 250 if (so->so_rcv.sb_hiwat) 251 (void)chgsbsize(so->so_cred->cr_uidinfo, 252 &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY); 253 if (so->so_snd.sb_hiwat) 254 (void)chgsbsize(so->so_cred->cr_uidinfo, 255 &so->so_snd.sb_hiwat, 0, RLIM_INFINITY); 256#ifdef INET 257 /* remove acccept filter if one is present. */ 258 if (so->so_accf != NULL) 259 do_setopt_accept_filter(so, NULL); 260#endif 261#ifdef MAC 262 mac_destroy_socket(so); 263#endif 264 crfree(so->so_cred); 265 SOCKBUF_LOCK_DESTROY(&so->so_snd); 266 SOCKBUF_LOCK_DESTROY(&so->so_rcv); 267 /* sx_destroy(&so->so_sxlock); */ 268 uma_zfree(socket_zone, so); 269 /* 270 * XXXRW: Seems like a shame to grab the mutex again down here, but 271 * we don't want to decrement the socket count until after we free 272 * the socket, and we can't increment the gencnt on the socket after 273 * we free, it so... 274 */ 275 mtx_lock(&so_global_mtx); 276 --numopensockets; 277 mtx_unlock(&so_global_mtx); 278} 279 280int 281solisten(so, backlog, td) 282 struct socket *so; 283 int backlog; 284 struct thread *td; 285{ 286 int error; 287 288 /* 289 * XXXRW: Ordering issue here -- perhaps we need to set 290 * SO_ACCEPTCONN before the call to pru_listen()? 291 * XXXRW: General atomic test-and-set concerns here also. 
292 */ 293 if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | 294 SS_ISDISCONNECTING)) 295 return (EINVAL); 296 error = (*so->so_proto->pr_usrreqs->pru_listen)(so, td); 297 if (error) 298 return (error); 299 ACCEPT_LOCK(); 300 if (TAILQ_EMPTY(&so->so_comp)) { 301 SOCK_LOCK(so); 302 so->so_options |= SO_ACCEPTCONN; 303 SOCK_UNLOCK(so); 304 } 305 if (backlog < 0 || backlog > somaxconn) 306 backlog = somaxconn; 307 so->so_qlimit = backlog; 308 ACCEPT_UNLOCK(); 309 return (0); 310} 311 312void 313sofree(so) 314 struct socket *so; 315{ 316 struct socket *head; 317 318 KASSERT(so->so_count == 0, ("socket %p so_count not 0", so)); 319 SOCK_LOCK_ASSERT(so); 320 321 if (so->so_pcb != NULL || (so->so_state & SS_NOFDREF) == 0) { 322 SOCK_UNLOCK(so); 323 return; 324 } 325 326 SOCK_UNLOCK(so); 327 ACCEPT_LOCK(); 328 head = so->so_head; 329 if (head != NULL) { 330 KASSERT((so->so_qstate & SQ_COMP) != 0 || 331 (so->so_qstate & SQ_INCOMP) != 0, 332 ("sofree: so_head != NULL, but neither SQ_COMP nor " 333 "SQ_INCOMP")); 334 KASSERT((so->so_qstate & SQ_COMP) == 0 || 335 (so->so_qstate & SQ_INCOMP) == 0, 336 ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP")); 337 /* 338 * accept(2) is responsible draining the completed 339 * connection queue and freeing those sockets, so 340 * we just return here if this socket is currently 341 * on the completed connection queue. Otherwise, 342 * accept(2) may hang after select(2) has indicating 343 * that a listening socket was ready. If it's an 344 * incomplete connection, we remove it from the queue 345 * and free it; otherwise, it won't be released until 346 * the listening socket is closed. 347 */ 348 if ((so->so_qstate & SQ_COMP) != 0) { 349 ACCEPT_UNLOCK(); 350 return; 351 } 352 TAILQ_REMOVE(&head->so_incomp, so, so_list); 353 head->so_incqlen--; 354 so->so_qstate &= ~SQ_INCOMP; 355 so->so_head = NULL; 356 } 357 KASSERT((so->so_qstate & SQ_COMP) == 0 && 358 (so->so_qstate & SQ_INCOMP) == 0, 359 ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)", 360 so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP)); 361 ACCEPT_UNLOCK(); 362 SOCKBUF_LOCK(&so->so_snd); 363 so->so_snd.sb_flags |= SB_NOINTR; 364 (void)sblock(&so->so_snd, M_WAITOK); 365 /* 366 * socantsendmore_locked() drops the socket buffer mutex so that it 367 * can safely perform wakeups. Re-acquire the mutex before 368 * continuing. 369 */ 370 socantsendmore_locked(so); 371 SOCKBUF_LOCK(&so->so_snd); 372 sbunlock(&so->so_snd); 373 sbrelease_locked(&so->so_snd, so); 374 SOCKBUF_UNLOCK(&so->so_snd); 375 sorflush(so); 376 sodealloc(so); 377} 378 379/* 380 * Close a socket on last file table reference removal. 381 * Initiate disconnect if connected. 382 * Free socket when disconnect complete. 383 * 384 * This function will sorele() the socket. Note that soclose() may be 385 * called prior to the ref count reaching zero. The actual socket 386 * structure will not be freed until the ref count reaches zero. 
387 */ 388int 389soclose(so) 390 struct socket *so; 391{ 392 int error = 0; 393 394 funsetown(&so->so_sigio); 395 if (so->so_options & SO_ACCEPTCONN) { 396 struct socket *sp; 397 ACCEPT_LOCK(); 398 while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) { 399 TAILQ_REMOVE(&so->so_incomp, sp, so_list); 400 so->so_incqlen--; 401 sp->so_qstate &= ~SQ_INCOMP; 402 sp->so_head = NULL; 403 ACCEPT_UNLOCK(); 404 (void) soabort(sp); 405 ACCEPT_LOCK(); 406 } 407 while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) { 408 TAILQ_REMOVE(&so->so_comp, sp, so_list); 409 so->so_qlen--; 410 sp->so_qstate &= ~SQ_COMP; 411 sp->so_head = NULL; 412 ACCEPT_UNLOCK(); 413 (void) soabort(sp); 414 ACCEPT_LOCK(); 415 } 416 ACCEPT_UNLOCK(); 417 } 418 if (so->so_pcb == NULL) 419 goto discard; 420 if (so->so_state & SS_ISCONNECTED) { 421 if ((so->so_state & SS_ISDISCONNECTING) == 0) { 422 error = sodisconnect(so); 423 if (error) 424 goto drop; 425 } 426 if (so->so_options & SO_LINGER) { 427 if ((so->so_state & SS_ISDISCONNECTING) && 428 (so->so_state & SS_NBIO)) 429 goto drop; 430 while (so->so_state & SS_ISCONNECTED) { 431 error = tsleep(&so->so_timeo, 432 PSOCK | PCATCH, "soclos", so->so_linger * hz); 433 if (error) 434 break; 435 } 436 } 437 } 438drop: 439 if (so->so_pcb != NULL) { 440 int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so); 441 if (error == 0) 442 error = error2; 443 } 444discard: 445 SOCK_LOCK(so); 446 KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF")); 447 so->so_state |= SS_NOFDREF; 448 sorele(so); 449 return (error); 450} 451 452/* 453 * soabort() must not be called with any socket locks held, as it calls 454 * into the protocol, which will call back into the socket code causing 455 * it to acquire additional socket locks that may cause recursion or lock 456 * order reversals. 457 */ 458int 459soabort(so) 460 struct socket *so; 461{ 462 int error; 463 464 error = (*so->so_proto->pr_usrreqs->pru_abort)(so); 465 if (error) { 466 SOCK_LOCK(so); 467 sotryfree(so); /* note: does not decrement the ref count */ 468 return error; 469 } 470 return (0); 471} 472 473int 474soaccept(so, nam) 475 struct socket *so; 476 struct sockaddr **nam; 477{ 478 int error; 479 480 SOCK_LOCK(so); 481 KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF")); 482 so->so_state &= ~SS_NOFDREF; 483 SOCK_UNLOCK(so); 484 error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam); 485 return (error); 486} 487 488int 489soconnect(so, nam, td) 490 struct socket *so; 491 struct sockaddr *nam; 492 struct thread *td; 493{ 494 int error; 495 496 if (so->so_options & SO_ACCEPTCONN) 497 return (EOPNOTSUPP); 498 /* 499 * If protocol is connection-based, can only connect once. 500 * Otherwise, if connected, try to disconnect first. 501 * This allows user to disconnect by connecting to, e.g., 502 * a null address. 
503 */ 504 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) && 505 ((so->so_proto->pr_flags & PR_CONNREQUIRED) || 506 (error = sodisconnect(so)))) 507 error = EISCONN; 508 else 509 error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td); 510 return (error); 511} 512 513int 514soconnect2(so1, so2) 515 struct socket *so1; 516 struct socket *so2; 517{ 518 519 return ((*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2)); 520} 521 522int 523sodisconnect(so) 524 struct socket *so; 525{ 526 int error; 527 528 if ((so->so_state & SS_ISCONNECTED) == 0) 529 return (ENOTCONN); 530 if (so->so_state & SS_ISDISCONNECTING) 531 return (EALREADY); 532 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so); 533 return (error); 534} 535 536#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK) 537/* 538 * Send on a socket. 539 * If send must go all at once and message is larger than 540 * send buffering, then hard error. 541 * Lock against other senders. 542 * If must go all at once and not enough room now, then 543 * inform user that this would block and do nothing. 544 * Otherwise, if nonblocking, send as much as possible. 545 * The data to be sent is described by "uio" if nonzero, 546 * otherwise by the mbuf chain "top" (which must be null 547 * if uio is not). Data provided in mbuf chain must be small 548 * enough to send all at once. 549 * 550 * Returns nonzero on error, timeout or signal; callers 551 * must check for short counts if EINTR/ERESTART are returned. 552 * Data and control buffers are freed on return. 553 */ 554 555#ifdef ZERO_COPY_SOCKETS 556struct so_zerocopy_stats{ 557 int size_ok; 558 int align_ok; 559 int found_ifp; 560}; 561struct so_zerocopy_stats so_zerocp_stats = {0,0,0}; 562#include <netinet/in.h> 563#include <net/route.h> 564#include <netinet/in_pcb.h> 565#include <vm/vm.h> 566#include <vm/vm_page.h> 567#include <vm/vm_object.h> 568#endif /*ZERO_COPY_SOCKETS*/ 569 570int 571sosend(so, addr, uio, top, control, flags, td) 572 struct socket *so; 573 struct sockaddr *addr; 574 struct uio *uio; 575 struct mbuf *top; 576 struct mbuf *control; 577 int flags; 578 struct thread *td; 579{ 580 struct mbuf **mp; 581 struct mbuf *m; 582 long space, len = 0, resid; 583 int clen = 0, error, dontroute; 584 int atomic = sosendallatonce(so) || top; 585#ifdef ZERO_COPY_SOCKETS 586 int cow_send; 587#endif /* ZERO_COPY_SOCKETS */ 588 589 if (uio != NULL) 590 resid = uio->uio_resid; 591 else 592 resid = top->m_pkthdr.len; 593 /* 594 * In theory resid should be unsigned. 595 * However, space must be signed, as it might be less than 0 596 * if we over-committed, and we must use a signed comparison 597 * of space and resid. On the other hand, a negative resid 598 * causes us to loop sending 0-length segments to the protocol. 599 * 600 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM 601 * type sockets since that's an error. 
602 */ 603 if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) { 604 error = EINVAL; 605 goto out; 606 } 607 608 dontroute = 609 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 && 610 (so->so_proto->pr_flags & PR_ATOMIC); 611 if (td != NULL) 612 td->td_proc->p_stats->p_ru.ru_msgsnd++; 613 if (control != NULL) 614 clen = control->m_len; 615#define snderr(errno) { error = (errno); goto release; } 616 617 SOCKBUF_LOCK(&so->so_snd); 618restart: 619 SOCKBUF_LOCK_ASSERT(&so->so_snd); 620 error = sblock(&so->so_snd, SBLOCKWAIT(flags)); 621 if (error) 622 goto out_locked; 623 do { 624 SOCKBUF_LOCK_ASSERT(&so->so_snd); 625 if (so->so_snd.sb_state & SBS_CANTSENDMORE) 626 snderr(EPIPE); 627 if (so->so_error) { 628 error = so->so_error; 629 so->so_error = 0; 630 goto release; 631 } 632 if ((so->so_state & SS_ISCONNECTED) == 0) { 633 /* 634 * `sendto' and `sendmsg' is allowed on a connection- 635 * based socket if it supports implied connect. 636 * Return ENOTCONN if not connected and no address is 637 * supplied. 638 */ 639 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && 640 (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { 641 if ((so->so_state & SS_ISCONFIRMING) == 0 && 642 !(resid == 0 && clen != 0)) 643 snderr(ENOTCONN); 644 } else if (addr == NULL) 645 snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ? 646 ENOTCONN : EDESTADDRREQ); 647 } 648 space = sbspace(&so->so_snd); 649 if (flags & MSG_OOB) 650 space += 1024; 651 if ((atomic && resid > so->so_snd.sb_hiwat) || 652 clen > so->so_snd.sb_hiwat) 653 snderr(EMSGSIZE); 654 if (space < resid + clen && 655 (atomic || space < so->so_snd.sb_lowat || space < clen)) { 656 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) 657 snderr(EWOULDBLOCK); 658 sbunlock(&so->so_snd); 659 error = sbwait(&so->so_snd); 660 if (error) 661 goto out_locked; 662 goto restart; 663 } 664 SOCKBUF_UNLOCK(&so->so_snd); 665 mp = ⊤ 666 space -= clen; 667 do { 668 if (uio == NULL) { 669 /* 670 * Data is prepackaged in "top". 
671 */ 672 resid = 0; 673 if (flags & MSG_EOR) 674 top->m_flags |= M_EOR; 675 } else do { 676#ifdef ZERO_COPY_SOCKETS 677 cow_send = 0; 678#endif /* ZERO_COPY_SOCKETS */ 679 if (resid >= MINCLSIZE) { 680#ifdef ZERO_COPY_SOCKETS 681 if (top == NULL) { 682 MGETHDR(m, M_TRYWAIT, MT_DATA); 683 if (m == NULL) { 684 error = ENOBUFS; 685 SOCKBUF_LOCK(&so->so_snd); 686 goto release; 687 } 688 m->m_pkthdr.len = 0; 689 m->m_pkthdr.rcvif = (struct ifnet *)0; 690 } else { 691 MGET(m, M_TRYWAIT, MT_DATA); 692 if (m == NULL) { 693 error = ENOBUFS; 694 SOCKBUF_LOCK(&so->so_snd); 695 goto release; 696 } 697 } 698 if (so_zero_copy_send && 699 resid>=PAGE_SIZE && 700 space>=PAGE_SIZE && 701 uio->uio_iov->iov_len>=PAGE_SIZE) { 702 so_zerocp_stats.size_ok++; 703 if (!((vm_offset_t) 704 uio->uio_iov->iov_base & PAGE_MASK)){ 705 so_zerocp_stats.align_ok++; 706 cow_send = socow_setup(m, uio); 707 } 708 } 709 if (!cow_send) { 710 MCLGET(m, M_TRYWAIT); 711 if ((m->m_flags & M_EXT) == 0) { 712 m_free(m); 713 m = NULL; 714 } else { 715 len = min(min(MCLBYTES, resid), space); 716 } 717 } else 718 len = PAGE_SIZE; 719#else /* ZERO_COPY_SOCKETS */ 720 if (top == NULL) { 721 m = m_getcl(M_TRYWAIT, MT_DATA, M_PKTHDR); 722 m->m_pkthdr.len = 0; 723 m->m_pkthdr.rcvif = (struct ifnet *)0; 724 } else 725 m = m_getcl(M_TRYWAIT, MT_DATA, 0); 726 len = min(min(MCLBYTES, resid), space); 727#endif /* ZERO_COPY_SOCKETS */ 728 } else { 729 if (top == NULL) { 730 m = m_gethdr(M_TRYWAIT, MT_DATA); 731 m->m_pkthdr.len = 0; 732 m->m_pkthdr.rcvif = (struct ifnet *)0; 733 734 len = min(min(MHLEN, resid), space); 735 /* 736 * For datagram protocols, leave room 737 * for protocol headers in first mbuf. 738 */ 739 if (atomic && m && len < MHLEN) 740 MH_ALIGN(m, len); 741 } else { 742 m = m_get(M_TRYWAIT, MT_DATA); 743 len = min(min(MLEN, resid), space); 744 } 745 } 746 if (m == NULL) { 747 error = ENOBUFS; 748 SOCKBUF_LOCK(&so->so_snd); 749 goto release; 750 } 751 752 space -= len; 753#ifdef ZERO_COPY_SOCKETS 754 if (cow_send) 755 error = 0; 756 else 757#endif /* ZERO_COPY_SOCKETS */ 758 error = uiomove(mtod(m, void *), (int)len, uio); 759 resid = uio->uio_resid; 760 m->m_len = len; 761 *mp = m; 762 top->m_pkthdr.len += len; 763 if (error) { 764 SOCKBUF_LOCK(&so->so_snd); 765 goto release; 766 } 767 mp = &m->m_next; 768 if (resid <= 0) { 769 if (flags & MSG_EOR) 770 top->m_flags |= M_EOR; 771 break; 772 } 773 } while (space > 0 && atomic); 774 if (dontroute) { 775 SOCK_LOCK(so); 776 so->so_options |= SO_DONTROUTE; 777 SOCK_UNLOCK(so); 778 } 779 /* 780 * XXX all the SBS_CANTSENDMORE checks previously 781 * done could be out of date. We could have recieved 782 * a reset packet in an interrupt or maybe we slept 783 * while doing page faults in uiomove() etc. We could 784 * probably recheck again inside the splnet() protection 785 * here, but there are probably other places that this 786 * also happens. We must rethink this. 787 */ 788 error = (*so->so_proto->pr_usrreqs->pru_send)(so, 789 (flags & MSG_OOB) ? PRUS_OOB : 790 /* 791 * If the user set MSG_EOF, the protocol 792 * understands this flag and nothing left to 793 * send then use PRU_SEND_EOF instead of PRU_SEND. 794 */ 795 ((flags & MSG_EOF) && 796 (so->so_proto->pr_flags & PR_IMPLOPCL) && 797 (resid <= 0)) ? 798 PRUS_EOF : 799 /* If there is more to send set PRUS_MORETOCOME */ 800 (resid > 0 && space > 0) ? 
PRUS_MORETOCOME : 0, 801 top, addr, control, td); 802 if (dontroute) { 803 SOCK_LOCK(so); 804 so->so_options &= ~SO_DONTROUTE; 805 SOCK_UNLOCK(so); 806 } 807 clen = 0; 808 control = NULL; 809 top = NULL; 810 mp = ⊤ 811 if (error) { 812 SOCKBUF_LOCK(&so->so_snd); 813 goto release; 814 } 815 } while (resid && space > 0); 816 SOCKBUF_LOCK(&so->so_snd); 817 } while (resid); 818 819release: 820 SOCKBUF_LOCK_ASSERT(&so->so_snd); 821 sbunlock(&so->so_snd); 822out_locked: 823 SOCKBUF_LOCK_ASSERT(&so->so_snd); 824 SOCKBUF_UNLOCK(&so->so_snd); 825out: 826 if (top != NULL) 827 m_freem(top); 828 if (control != NULL) 829 m_freem(control); 830 return (error); 831} 832 833/* 834 * Implement receive operations on a socket. 835 * We depend on the way that records are added to the sockbuf 836 * by sbappend*. In particular, each record (mbufs linked through m_next) 837 * must begin with an address if the protocol so specifies, 838 * followed by an optional mbuf or mbufs containing ancillary data, 839 * and then zero or more mbufs of data. 840 * In order to avoid blocking network interrupts for the entire time here, 841 * we splx() while doing the actual copy to user space. 842 * Although the sockbuf is locked, new data may still be appended, 843 * and thus we must maintain consistency of the sockbuf during that time. 844 * 845 * The caller may receive the data as a single mbuf chain by supplying 846 * an mbuf **mp0 for use in returning the chain. The uio is then used 847 * only for the count in uio_resid. 848 */ 849int 850soreceive(so, psa, uio, mp0, controlp, flagsp) 851 struct socket *so; 852 struct sockaddr **psa; 853 struct uio *uio; 854 struct mbuf **mp0; 855 struct mbuf **controlp; 856 int *flagsp; 857{ 858 struct mbuf *m, **mp; 859 int flags, len, error, offset; 860 struct protosw *pr = so->so_proto; 861 struct mbuf *nextrecord; 862 int moff, type = 0; 863 int orig_resid = uio->uio_resid; 864 865 mp = mp0; 866 if (psa != NULL) 867 *psa = 0; 868 if (controlp != NULL) 869 *controlp = 0; 870 if (flagsp != NULL) 871 flags = *flagsp &~ MSG_EOR; 872 else 873 flags = 0; 874 if (flags & MSG_OOB) { 875 m = m_get(M_TRYWAIT, MT_DATA); 876 if (m == NULL) 877 return (ENOBUFS); 878 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK); 879 if (error) 880 goto bad; 881 do { 882#ifdef ZERO_COPY_SOCKETS 883 if (so_zero_copy_receive) { 884 vm_page_t pg; 885 int disposable; 886 887 if ((m->m_flags & M_EXT) 888 && (m->m_ext.ext_type == EXT_DISPOSABLE)) 889 disposable = 1; 890 else 891 disposable = 0; 892 893 pg = PHYS_TO_VM_PAGE(vtophys(mtod(m, caddr_t))); 894 if (uio->uio_offset == -1) 895 uio->uio_offset =IDX_TO_OFF(pg->pindex); 896 897 error = uiomoveco(mtod(m, void *), 898 min(uio->uio_resid, m->m_len), 899 uio, pg->object, 900 disposable); 901 } else 902#endif /* ZERO_COPY_SOCKETS */ 903 error = uiomove(mtod(m, void *), 904 (int) min(uio->uio_resid, m->m_len), uio); 905 m = m_free(m); 906 } while (uio->uio_resid && error == 0 && m); 907bad: 908 if (m != NULL) 909 m_freem(m); 910 return (error); 911 } 912 if (mp != NULL) 913 *mp = NULL; 914 if (so->so_state & SS_ISCONFIRMING && uio->uio_resid) 915 (*pr->pr_usrreqs->pru_rcvd)(so, 0); 916 917 SOCKBUF_LOCK(&so->so_rcv); 918restart: 919 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 920 error = sblock(&so->so_rcv, SBLOCKWAIT(flags)); 921 if (error) 922 goto out; 923 924 m = so->so_rcv.sb_mb; 925 /* 926 * If we have less data than requested, block awaiting more 927 * (subject to any timeout) if: 928 * 1. the current count is less than the low water mark, or 929 * 2. 
MSG_WAITALL is set, and it is possible to do the entire 930 * receive operation at once if we block (resid <= hiwat). 931 * 3. MSG_DONTWAIT is not set 932 * If MSG_WAITALL is set but resid is larger than the receive buffer, 933 * we have to do the receive in sections, and thus risk returning 934 * a short count if a timeout or signal occurs after we start. 935 */ 936 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 && 937 so->so_rcv.sb_cc < uio->uio_resid) && 938 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat || 939 ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) && 940 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) { 941 KASSERT(m != NULL || !so->so_rcv.sb_cc, 942 ("receive: m == %p so->so_rcv.sb_cc == %u", 943 m, so->so_rcv.sb_cc)); 944 if (so->so_error) { 945 if (m != NULL) 946 goto dontblock; 947 error = so->so_error; 948 if ((flags & MSG_PEEK) == 0) 949 so->so_error = 0; 950 goto release; 951 } 952 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 953 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 954 if (m) 955 goto dontblock; 956 else 957 goto release; 958 } 959 for (; m != NULL; m = m->m_next) 960 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { 961 m = so->so_rcv.sb_mb; 962 goto dontblock; 963 } 964 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 && 965 (so->so_proto->pr_flags & PR_CONNREQUIRED)) { 966 error = ENOTCONN; 967 goto release; 968 } 969 if (uio->uio_resid == 0) 970 goto release; 971 if ((so->so_state & SS_NBIO) || 972 (flags & (MSG_DONTWAIT|MSG_NBIO))) { 973 error = EWOULDBLOCK; 974 goto release; 975 } 976 SBLASTRECORDCHK(&so->so_rcv); 977 SBLASTMBUFCHK(&so->so_rcv); 978 sbunlock(&so->so_rcv); 979 error = sbwait(&so->so_rcv); 980 if (error) 981 goto out; 982 goto restart; 983 } 984dontblock: 985 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 986 if (uio->uio_td) 987 uio->uio_td->td_proc->p_stats->p_ru.ru_msgrcv++; 988 SBLASTRECORDCHK(&so->so_rcv); 989 SBLASTMBUFCHK(&so->so_rcv); 990 nextrecord = m->m_nextpkt; 991 if (pr->pr_flags & PR_ADDR) { 992 KASSERT(m->m_type == MT_SONAME, 993 ("m->m_type == %d", m->m_type)); 994 orig_resid = 0; 995 if (psa != NULL) 996 *psa = sodupsockaddr(mtod(m, struct sockaddr *), 997 M_NOWAIT); 998 if (flags & MSG_PEEK) { 999 m = m->m_next; 1000 } else { 1001 sbfree(&so->so_rcv, m); 1002 so->so_rcv.sb_mb = m_free(m); 1003 m = so->so_rcv.sb_mb; 1004 } 1005 } 1006 while (m != NULL && m->m_type == MT_CONTROL && error == 0) { 1007 if (flags & MSG_PEEK) { 1008 if (controlp != NULL) 1009 *controlp = m_copy(m, 0, m->m_len); 1010 m = m->m_next; 1011 } else { 1012 sbfree(&so->so_rcv, m); 1013 so->so_rcv.sb_mb = m->m_next; 1014 m->m_next = NULL; 1015 if (pr->pr_domain->dom_externalize) { 1016 SOCKBUF_UNLOCK(&so->so_rcv); 1017 error = (*pr->pr_domain->dom_externalize) 1018 (m, controlp); 1019 SOCKBUF_LOCK(&so->so_rcv); 1020 } else if (controlp != NULL) 1021 *controlp = m; 1022 else 1023 m_freem(m); 1024 m = so->so_rcv.sb_mb; 1025 } 1026 if (controlp != NULL) { 1027 orig_resid = 0; 1028 while (*controlp != NULL) 1029 controlp = &(*controlp)->m_next; 1030 } 1031 } 1032 if (m != NULL) { 1033 if ((flags & MSG_PEEK) == 0) { 1034 m->m_nextpkt = nextrecord; 1035 /* 1036 * If nextrecord == NULL (this is a single chain), 1037 * then sb_lastrecord may not be valid here if m 1038 * was changed earlier. 
1039 */ 1040 if (nextrecord == NULL) { 1041 KASSERT(so->so_rcv.sb_mb == m, 1042 ("receive tailq 1")); 1043 so->so_rcv.sb_lastrecord = m; 1044 } 1045 } 1046 type = m->m_type; 1047 if (type == MT_OOBDATA) 1048 flags |= MSG_OOB; 1049 } else { 1050 if ((flags & MSG_PEEK) == 0) { 1051 KASSERT(so->so_rcv.sb_mb == m,("receive tailq 2")); 1052 so->so_rcv.sb_mb = nextrecord; 1053 SB_EMPTY_FIXUP(&so->so_rcv); 1054 } 1055 } 1056 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1057 SBLASTRECORDCHK(&so->so_rcv); 1058 SBLASTMBUFCHK(&so->so_rcv); 1059 1060 moff = 0; 1061 offset = 0; 1062 while (m != NULL && uio->uio_resid > 0 && error == 0) { 1063 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1064 if (m->m_type == MT_OOBDATA) { 1065 if (type != MT_OOBDATA) 1066 break; 1067 } else if (type == MT_OOBDATA) 1068 break; 1069 else 1070 KASSERT(m->m_type == MT_DATA || m->m_type == MT_HEADER, 1071 ("m->m_type == %d", m->m_type)); 1072 so->so_rcv.sb_state &= ~SBS_RCVATMARK; 1073 len = uio->uio_resid; 1074 if (so->so_oobmark && len > so->so_oobmark - offset) 1075 len = so->so_oobmark - offset; 1076 if (len > m->m_len - moff) 1077 len = m->m_len - moff; 1078 /* 1079 * If mp is set, just pass back the mbufs. 1080 * Otherwise copy them out via the uio, then free. 1081 * Sockbuf must be consistent here (points to current mbuf, 1082 * it points to next record) when we drop priority; 1083 * we must note any additions to the sockbuf when we 1084 * block interrupts again. 1085 */ 1086 if (mp == NULL) { 1087 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1088 SBLASTRECORDCHK(&so->so_rcv); 1089 SBLASTMBUFCHK(&so->so_rcv); 1090 SOCKBUF_UNLOCK(&so->so_rcv); 1091#ifdef ZERO_COPY_SOCKETS 1092 if (so_zero_copy_receive) { 1093 vm_page_t pg; 1094 int disposable; 1095 1096 if ((m->m_flags & M_EXT) 1097 && (m->m_ext.ext_type == EXT_DISPOSABLE)) 1098 disposable = 1; 1099 else 1100 disposable = 0; 1101 1102 pg = PHYS_TO_VM_PAGE(vtophys(mtod(m, caddr_t) + 1103 moff)); 1104 1105 if (uio->uio_offset == -1) 1106 uio->uio_offset =IDX_TO_OFF(pg->pindex); 1107 1108 error = uiomoveco(mtod(m, char *) + moff, 1109 (int)len, uio,pg->object, 1110 disposable); 1111 } else 1112#endif /* ZERO_COPY_SOCKETS */ 1113 error = uiomove(mtod(m, char *) + moff, (int)len, uio); 1114 SOCKBUF_LOCK(&so->so_rcv); 1115 if (error) 1116 goto release; 1117 } else 1118 uio->uio_resid -= len; 1119 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1120 if (len == m->m_len - moff) { 1121 if (m->m_flags & M_EOR) 1122 flags |= MSG_EOR; 1123 if (flags & MSG_PEEK) { 1124 m = m->m_next; 1125 moff = 0; 1126 } else { 1127 nextrecord = m->m_nextpkt; 1128 sbfree(&so->so_rcv, m); 1129 if (mp != NULL) { 1130 *mp = m; 1131 mp = &m->m_next; 1132 so->so_rcv.sb_mb = m = m->m_next; 1133 *mp = NULL; 1134 } else { 1135 so->so_rcv.sb_mb = m_free(m); 1136 m = so->so_rcv.sb_mb; 1137 } 1138 if (m != NULL) { 1139 m->m_nextpkt = nextrecord; 1140 if (nextrecord == NULL) 1141 so->so_rcv.sb_lastrecord = m; 1142 } else { 1143 so->so_rcv.sb_mb = nextrecord; 1144 SB_EMPTY_FIXUP(&so->so_rcv); 1145 } 1146 SBLASTRECORDCHK(&so->so_rcv); 1147 SBLASTMBUFCHK(&so->so_rcv); 1148 } 1149 } else { 1150 if (flags & MSG_PEEK) 1151 moff += len; 1152 else { 1153 if (mp != NULL) { 1154 SOCKBUF_UNLOCK(&so->so_rcv); 1155 *mp = m_copym(m, 0, len, M_TRYWAIT); 1156 SOCKBUF_LOCK(&so->so_rcv); 1157 } 1158 m->m_data += len; 1159 m->m_len -= len; 1160 so->so_rcv.sb_cc -= len; 1161 } 1162 } 1163 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1164 if (so->so_oobmark) { 1165 if ((flags & MSG_PEEK) == 0) { 1166 so->so_oobmark -= len; 1167 if (so->so_oobmark == 0) { 1168 
SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1169 so->so_rcv.sb_state |= SBS_RCVATMARK; 1170 break; 1171 } 1172 } else { 1173 offset += len; 1174 if (offset == so->so_oobmark) 1175 break; 1176 } 1177 } 1178 if (flags & MSG_EOR) 1179 break; 1180 /* 1181 * If the MSG_WAITALL flag is set (for non-atomic socket), 1182 * we must not quit until "uio->uio_resid == 0" or an error 1183 * termination. If a signal/timeout occurs, return 1184 * with a short count but without error. 1185 * Keep sockbuf locked against other readers. 1186 */ 1187 while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 && 1188 !sosendallatonce(so) && nextrecord == NULL) { 1189 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1190 if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE) 1191 break; 1192 /* 1193 * Notify the protocol that some data has been 1194 * drained before blocking. 1195 */ 1196 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb != NULL) { 1197 SOCKBUF_UNLOCK(&so->so_rcv); 1198 (*pr->pr_usrreqs->pru_rcvd)(so, flags); 1199 SOCKBUF_LOCK(&so->so_rcv); 1200 } 1201 SBLASTRECORDCHK(&so->so_rcv); 1202 SBLASTMBUFCHK(&so->so_rcv); 1203 error = sbwait(&so->so_rcv); 1204 if (error) 1205 goto release; 1206 m = so->so_rcv.sb_mb; 1207 if (m != NULL) 1208 nextrecord = m->m_nextpkt; 1209 } 1210 } 1211 1212 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1213 if (m != NULL && pr->pr_flags & PR_ATOMIC) { 1214 flags |= MSG_TRUNC; 1215 if ((flags & MSG_PEEK) == 0) { 1216 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1217 (void) sbdroprecord_locked(&so->so_rcv); 1218 } 1219 } 1220 if ((flags & MSG_PEEK) == 0) { 1221 if (m == NULL) { 1222 /* 1223 * First part is an inline SB_EMPTY_FIXUP(). Second 1224 * part makes sure sb_lastrecord is up-to-date if 1225 * there is still data in the socket buffer. 1226 */ 1227 so->so_rcv.sb_mb = nextrecord; 1228 if (so->so_rcv.sb_mb == NULL) { 1229 so->so_rcv.sb_mbtail = NULL; 1230 so->so_rcv.sb_lastrecord = NULL; 1231 } else if (nextrecord->m_nextpkt == NULL) 1232 so->so_rcv.sb_lastrecord = nextrecord; 1233 } 1234 SBLASTRECORDCHK(&so->so_rcv); 1235 SBLASTMBUFCHK(&so->so_rcv); 1236 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) { 1237 SOCKBUF_UNLOCK(&so->so_rcv); 1238 (*pr->pr_usrreqs->pru_rcvd)(so, flags); 1239 SOCKBUF_LOCK(&so->so_rcv); 1240 } 1241 } 1242 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1243 if (orig_resid == uio->uio_resid && orig_resid && 1244 (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) { 1245 sbunlock(&so->so_rcv); 1246 goto restart; 1247 } 1248 1249 if (flagsp != NULL) 1250 *flagsp |= flags; 1251release: 1252 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1253 sbunlock(&so->so_rcv); 1254out: 1255 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1256 SOCKBUF_UNLOCK(&so->so_rcv); 1257 return (error); 1258} 1259 1260int 1261soshutdown(so, how) 1262 struct socket *so; 1263 int how; 1264{ 1265 struct protosw *pr = so->so_proto; 1266 1267 if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR)) 1268 return (EINVAL); 1269 1270 if (how != SHUT_WR) 1271 sorflush(so); 1272 if (how != SHUT_RD) 1273 return ((*pr->pr_usrreqs->pru_shutdown)(so)); 1274 return (0); 1275} 1276 1277void 1278sorflush(so) 1279 struct socket *so; 1280{ 1281 struct sockbuf *sb = &so->so_rcv; 1282 struct protosw *pr = so->so_proto; 1283 struct sockbuf asb; 1284 1285 /* 1286 * XXXRW: This is quite ugly. The existing code made a copy of the 1287 * socket buffer, then zero'd the original to clear the buffer 1288 * fields. However, with mutexes in the socket buffer, this causes 1289 * problems. 
We only clear the zeroable bits of the original; 1290 * however, we have to initialize and destroy the mutex in the copy 1291 * so that dom_dispose() and sbrelease() can lock t as needed. 1292 */ 1293 SOCKBUF_LOCK(sb); 1294 sb->sb_flags |= SB_NOINTR; 1295 (void) sblock(sb, M_WAITOK); 1296 /* 1297 * socantrcvmore_locked() drops the socket buffer mutex so that it 1298 * can safely perform wakeups. Re-acquire the mutex before 1299 * continuing. 1300 */ 1301 socantrcvmore_locked(so); 1302 SOCKBUF_LOCK(sb); 1303 sbunlock(sb); 1304 /* 1305 * Invalidate/clear most of the sockbuf structure, but leave 1306 * selinfo and mutex data unchanged. 1307 */ 1308 bzero(&asb, offsetof(struct sockbuf, sb_startzero)); 1309 bcopy(&sb->sb_startzero, &asb.sb_startzero, 1310 sizeof(*sb) - offsetof(struct sockbuf, sb_startzero)); 1311 bzero(&sb->sb_startzero, 1312 sizeof(*sb) - offsetof(struct sockbuf, sb_startzero)); 1313 SOCKBUF_UNLOCK(sb); 1314 1315 SOCKBUF_LOCK_INIT(&asb, "so_rcv"); 1316 if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL) 1317 (*pr->pr_domain->dom_dispose)(asb.sb_mb); 1318 sbrelease(&asb, so); 1319 SOCKBUF_LOCK_DESTROY(&asb); 1320} 1321 1322#ifdef INET 1323static int 1324do_setopt_accept_filter(so, sopt) 1325 struct socket *so; 1326 struct sockopt *sopt; 1327{ 1328 struct accept_filter_arg *afap = NULL; 1329 struct accept_filter *afp; 1330 struct so_accf *af = so->so_accf; 1331 int error = 0; 1332 1333 /* do not set/remove accept filters on non listen sockets */ 1334 if ((so->so_options & SO_ACCEPTCONN) == 0) { 1335 error = EINVAL; 1336 goto out; 1337 } 1338 1339 /* removing the filter */ 1340 if (sopt == NULL) { 1341 if (af != NULL) { 1342 if (af->so_accept_filter != NULL && 1343 af->so_accept_filter->accf_destroy != NULL) { 1344 af->so_accept_filter->accf_destroy(so); 1345 } 1346 if (af->so_accept_filter_str != NULL) { 1347 FREE(af->so_accept_filter_str, M_ACCF); 1348 } 1349 FREE(af, M_ACCF); 1350 so->so_accf = NULL; 1351 } 1352 so->so_options &= ~SO_ACCEPTFILTER; 1353 return (0); 1354 } 1355 /* adding a filter */ 1356 /* must remove previous filter first */ 1357 if (af != NULL) { 1358 error = EINVAL; 1359 goto out; 1360 } 1361 /* don't put large objects on the kernel stack */ 1362 MALLOC(afap, struct accept_filter_arg *, sizeof(*afap), M_TEMP, M_WAITOK); 1363 error = sooptcopyin(sopt, afap, sizeof *afap, sizeof *afap); 1364 afap->af_name[sizeof(afap->af_name)-1] = '\0'; 1365 afap->af_arg[sizeof(afap->af_arg)-1] = '\0'; 1366 if (error) 1367 goto out; 1368 afp = accept_filt_get(afap->af_name); 1369 if (afp == NULL) { 1370 error = ENOENT; 1371 goto out; 1372 } 1373 MALLOC(af, struct so_accf *, sizeof(*af), M_ACCF, M_WAITOK | M_ZERO); 1374 if (afp->accf_create != NULL) { 1375 if (afap->af_name[0] != '\0') { 1376 int len = strlen(afap->af_name) + 1; 1377 1378 MALLOC(af->so_accept_filter_str, char *, len, M_ACCF, M_WAITOK); 1379 strcpy(af->so_accept_filter_str, afap->af_name); 1380 } 1381 af->so_accept_filter_arg = afp->accf_create(so, afap->af_arg); 1382 if (af->so_accept_filter_arg == NULL) { 1383 FREE(af->so_accept_filter_str, M_ACCF); 1384 FREE(af, M_ACCF); 1385 so->so_accf = NULL; 1386 error = EINVAL; 1387 goto out; 1388 } 1389 } 1390 af->so_accept_filter = afp; 1391 so->so_accf = af; 1392 so->so_options |= SO_ACCEPTFILTER; 1393out: 1394 if (afap != NULL) 1395 FREE(afap, M_TEMP); 1396 return (error); 1397} 1398#endif /* INET */ 1399 1400/* 1401 * Perhaps this routine, and sooptcopyout(), below, ought to come in 1402 * an additional variant to handle the case where the option 
value needs 1403 * to be some kind of integer, but not a specific size. 1404 * In addition to their use here, these functions are also called by the 1405 * protocol-level pr_ctloutput() routines. 1406 */ 1407int 1408sooptcopyin(sopt, buf, len, minlen) 1409 struct sockopt *sopt; 1410 void *buf; 1411 size_t len; 1412 size_t minlen; 1413{ 1414 size_t valsize; 1415 1416 /* 1417 * If the user gives us more than we wanted, we ignore it, 1418 * but if we don't get the minimum length the caller 1419 * wants, we return EINVAL. On success, sopt->sopt_valsize 1420 * is set to however much we actually retrieved. 1421 */ 1422 if ((valsize = sopt->sopt_valsize) < minlen) 1423 return EINVAL; 1424 if (valsize > len) 1425 sopt->sopt_valsize = valsize = len; 1426 1427 if (sopt->sopt_td != NULL) 1428 return (copyin(sopt->sopt_val, buf, valsize)); 1429 1430 bcopy(sopt->sopt_val, buf, valsize); 1431 return 0; 1432} 1433 1434int 1435sosetopt(so, sopt) 1436 struct socket *so; 1437 struct sockopt *sopt; 1438{ 1439 int error, optval; 1440 struct linger l; 1441 struct timeval tv; 1442 u_long val; 1443#ifdef MAC 1444 struct mac extmac; 1445#endif 1446 1447 error = 0; 1448 if (sopt->sopt_level != SOL_SOCKET) { 1449 if (so->so_proto && so->so_proto->pr_ctloutput) 1450 return ((*so->so_proto->pr_ctloutput) 1451 (so, sopt)); 1452 error = ENOPROTOOPT; 1453 } else { 1454 switch (sopt->sopt_name) { 1455#ifdef INET 1456 case SO_ACCEPTFILTER: 1457 error = do_setopt_accept_filter(so, sopt); 1458 if (error) 1459 goto bad; 1460 break; 1461#endif 1462 case SO_LINGER: 1463 error = sooptcopyin(sopt, &l, sizeof l, sizeof l); 1464 if (error) 1465 goto bad; 1466 1467 SOCK_LOCK(so); 1468 so->so_linger = l.l_linger; 1469 if (l.l_onoff) 1470 so->so_options |= SO_LINGER; 1471 else 1472 so->so_options &= ~SO_LINGER; 1473 SOCK_UNLOCK(so); 1474 break; 1475 1476 case SO_DEBUG: 1477 case SO_KEEPALIVE: 1478 case SO_DONTROUTE: 1479 case SO_USELOOPBACK: 1480 case SO_BROADCAST: 1481 case SO_REUSEADDR: 1482 case SO_REUSEPORT: 1483 case SO_OOBINLINE: 1484 case SO_TIMESTAMP: 1485 case SO_BINTIME: 1486 case SO_NOSIGPIPE: 1487 error = sooptcopyin(sopt, &optval, sizeof optval, 1488 sizeof optval); 1489 if (error) 1490 goto bad; 1491 SOCK_LOCK(so); 1492 if (optval) 1493 so->so_options |= sopt->sopt_name; 1494 else 1495 so->so_options &= ~sopt->sopt_name; 1496 SOCK_UNLOCK(so); 1497 break; 1498 1499 case SO_SNDBUF: 1500 case SO_RCVBUF: 1501 case SO_SNDLOWAT: 1502 case SO_RCVLOWAT: 1503 error = sooptcopyin(sopt, &optval, sizeof optval, 1504 sizeof optval); 1505 if (error) 1506 goto bad; 1507 1508 /* 1509 * Values < 1 make no sense for any of these 1510 * options, so disallow them. 1511 */ 1512 if (optval < 1) { 1513 error = EINVAL; 1514 goto bad; 1515 } 1516 1517 switch (sopt->sopt_name) { 1518 case SO_SNDBUF: 1519 case SO_RCVBUF: 1520 if (sbreserve(sopt->sopt_name == SO_SNDBUF ? 1521 &so->so_snd : &so->so_rcv, (u_long)optval, 1522 so, curthread) == 0) { 1523 error = ENOBUFS; 1524 goto bad; 1525 } 1526 break; 1527 1528 /* 1529 * Make sure the low-water is never greater than 1530 * the high-water. 1531 */ 1532 case SO_SNDLOWAT: 1533 SOCKBUF_LOCK(&so->so_snd); 1534 so->so_snd.sb_lowat = 1535 (optval > so->so_snd.sb_hiwat) ? 1536 so->so_snd.sb_hiwat : optval; 1537 SOCKBUF_UNLOCK(&so->so_snd); 1538 break; 1539 case SO_RCVLOWAT: 1540 SOCKBUF_LOCK(&so->so_rcv); 1541 so->so_rcv.sb_lowat = 1542 (optval > so->so_rcv.sb_hiwat) ? 
1543 so->so_rcv.sb_hiwat : optval; 1544 SOCKBUF_UNLOCK(&so->so_rcv); 1545 break; 1546 } 1547 break; 1548 1549 case SO_SNDTIMEO: 1550 case SO_RCVTIMEO: 1551 error = sooptcopyin(sopt, &tv, sizeof tv, 1552 sizeof tv); 1553 if (error) 1554 goto bad; 1555 1556 /* assert(hz > 0); */ 1557 if (tv.tv_sec < 0 || tv.tv_sec > SHRT_MAX / hz || 1558 tv.tv_usec < 0 || tv.tv_usec >= 1000000) { 1559 error = EDOM; 1560 goto bad; 1561 } 1562 /* assert(tick > 0); */ 1563 /* assert(ULONG_MAX - SHRT_MAX >= 1000000); */ 1564 val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick; 1565 if (val > SHRT_MAX) { 1566 error = EDOM; 1567 goto bad; 1568 } 1569 if (val == 0 && tv.tv_usec != 0) 1570 val = 1; 1571 1572 switch (sopt->sopt_name) { 1573 case SO_SNDTIMEO: 1574 so->so_snd.sb_timeo = val; 1575 break; 1576 case SO_RCVTIMEO: 1577 so->so_rcv.sb_timeo = val; 1578 break; 1579 } 1580 break; 1581 case SO_LABEL: 1582#ifdef MAC 1583 error = sooptcopyin(sopt, &extmac, sizeof extmac, 1584 sizeof extmac); 1585 if (error) 1586 goto bad; 1587 error = mac_setsockopt_label(sopt->sopt_td->td_ucred, 1588 so, &extmac); 1589#else 1590 error = EOPNOTSUPP; 1591#endif 1592 break; 1593 default: 1594 error = ENOPROTOOPT; 1595 break; 1596 } 1597 if (error == 0 && so->so_proto != NULL && 1598 so->so_proto->pr_ctloutput != NULL) { 1599 (void) ((*so->so_proto->pr_ctloutput) 1600 (so, sopt)); 1601 } 1602 } 1603bad: 1604 return (error); 1605} 1606 1607/* Helper routine for getsockopt */ 1608int 1609sooptcopyout(struct sockopt *sopt, const void *buf, size_t len) 1610{ 1611 int error; 1612 size_t valsize; 1613 1614 error = 0; 1615 1616 /* 1617 * Documented get behavior is that we always return a value, 1618 * possibly truncated to fit in the user's buffer. 1619 * Traditional behavior is that we always tell the user 1620 * precisely how much we copied, rather than something useful 1621 * like the total amount we had available for her. 1622 * Note that this interface is not idempotent; the entire answer must 1623 * generated ahead of time. 1624 */ 1625 valsize = min(len, sopt->sopt_valsize); 1626 sopt->sopt_valsize = valsize; 1627 if (sopt->sopt_val != NULL) { 1628 if (sopt->sopt_td != NULL) 1629 error = copyout(buf, sopt->sopt_val, valsize); 1630 else 1631 bcopy(buf, sopt->sopt_val, valsize); 1632 } 1633 return error; 1634} 1635 1636int 1637sogetopt(so, sopt) 1638 struct socket *so; 1639 struct sockopt *sopt; 1640{ 1641 int error, optval; 1642 struct linger l; 1643 struct timeval tv; 1644#ifdef INET 1645 struct accept_filter_arg *afap; 1646#endif 1647#ifdef MAC 1648 struct mac extmac; 1649#endif 1650 1651 error = 0; 1652 if (sopt->sopt_level != SOL_SOCKET) { 1653 if (so->so_proto && so->so_proto->pr_ctloutput) { 1654 return ((*so->so_proto->pr_ctloutput) 1655 (so, sopt)); 1656 } else 1657 return (ENOPROTOOPT); 1658 } else { 1659 switch (sopt->sopt_name) { 1660#ifdef INET 1661 case SO_ACCEPTFILTER: 1662 if ((so->so_options & SO_ACCEPTCONN) == 0) 1663 return (EINVAL); 1664 MALLOC(afap, struct accept_filter_arg *, sizeof(*afap), 1665 M_TEMP, M_WAITOK | M_ZERO); 1666 if ((so->so_options & SO_ACCEPTFILTER) != 0) { 1667 strcpy(afap->af_name, so->so_accf->so_accept_filter->accf_name); 1668 if (so->so_accf->so_accept_filter_str != NULL) 1669 strcpy(afap->af_arg, so->so_accf->so_accept_filter_str); 1670 } 1671 error = sooptcopyout(sopt, afap, sizeof(*afap)); 1672 FREE(afap, M_TEMP); 1673 break; 1674#endif 1675 1676 case SO_LINGER: 1677 /* 1678 * XXXRW: We grab the lock here to get a consistent 1679 * snapshot of both fields. 
This may not really 1680 * be necessary. 1681 */ 1682 SOCK_LOCK(so); 1683 l.l_onoff = so->so_options & SO_LINGER; 1684 l.l_linger = so->so_linger; 1685 SOCK_UNLOCK(so); 1686 error = sooptcopyout(sopt, &l, sizeof l); 1687 break; 1688 1689 case SO_USELOOPBACK: 1690 case SO_DONTROUTE: 1691 case SO_DEBUG: 1692 case SO_KEEPALIVE: 1693 case SO_REUSEADDR: 1694 case SO_REUSEPORT: 1695 case SO_BROADCAST: 1696 case SO_OOBINLINE: 1697 case SO_TIMESTAMP: 1698 case SO_BINTIME: 1699 case SO_NOSIGPIPE: 1700 optval = so->so_options & sopt->sopt_name; 1701integer: 1702 error = sooptcopyout(sopt, &optval, sizeof optval); 1703 break; 1704 1705 case SO_TYPE: 1706 optval = so->so_type; 1707 goto integer; 1708 1709 case SO_ERROR: 1710 optval = so->so_error; 1711 so->so_error = 0; 1712 goto integer; 1713 1714 case SO_SNDBUF: 1715 optval = so->so_snd.sb_hiwat; 1716 goto integer; 1717 1718 case SO_RCVBUF: 1719 optval = so->so_rcv.sb_hiwat; 1720 goto integer; 1721 1722 case SO_SNDLOWAT: 1723 optval = so->so_snd.sb_lowat; 1724 goto integer; 1725 1726 case SO_RCVLOWAT: 1727 optval = so->so_rcv.sb_lowat; 1728 goto integer; 1729 1730 case SO_SNDTIMEO: 1731 case SO_RCVTIMEO: 1732 optval = (sopt->sopt_name == SO_SNDTIMEO ? 1733 so->so_snd.sb_timeo : so->so_rcv.sb_timeo); 1734 1735 tv.tv_sec = optval / hz; 1736 tv.tv_usec = (optval % hz) * tick; 1737 error = sooptcopyout(sopt, &tv, sizeof tv); 1738 break; 1739 case SO_LABEL: 1740#ifdef MAC 1741 error = sooptcopyin(sopt, &extmac, sizeof(extmac), 1742 sizeof(extmac)); 1743 if (error) 1744 return (error); 1745 error = mac_getsockopt_label(sopt->sopt_td->td_ucred, 1746 so, &extmac); 1747 if (error) 1748 return (error); 1749 error = sooptcopyout(sopt, &extmac, sizeof extmac); 1750#else 1751 error = EOPNOTSUPP; 1752#endif 1753 break; 1754 case SO_PEERLABEL: 1755#ifdef MAC 1756 error = sooptcopyin(sopt, &extmac, sizeof(extmac), 1757 sizeof(extmac)); 1758 if (error) 1759 return (error); 1760 error = mac_getsockopt_peerlabel( 1761 sopt->sopt_td->td_ucred, so, &extmac); 1762 if (error) 1763 return (error); 1764 error = sooptcopyout(sopt, &extmac, sizeof extmac); 1765#else 1766 error = EOPNOTSUPP; 1767#endif 1768 break; 1769 default: 1770 error = ENOPROTOOPT; 1771 break; 1772 } 1773 return (error); 1774 } 1775} 1776 1777/* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */ 1778int 1779soopt_getm(struct sockopt *sopt, struct mbuf **mp) 1780{ 1781 struct mbuf *m, *m_prev; 1782 int sopt_size = sopt->sopt_valsize; 1783 1784 MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA); 1785 if (m == NULL) 1786 return ENOBUFS; 1787 if (sopt_size > MLEN) { 1788 MCLGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT); 1789 if ((m->m_flags & M_EXT) == 0) { 1790 m_free(m); 1791 return ENOBUFS; 1792 } 1793 m->m_len = min(MCLBYTES, sopt_size); 1794 } else { 1795 m->m_len = min(MLEN, sopt_size); 1796 } 1797 sopt_size -= m->m_len; 1798 *mp = m; 1799 m_prev = m; 1800 1801 while (sopt_size) { 1802 MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA); 1803 if (m == NULL) { 1804 m_freem(*mp); 1805 return ENOBUFS; 1806 } 1807 if (sopt_size > MLEN) { 1808 MCLGET(m, sopt->sopt_td != NULL ? M_TRYWAIT : 1809 M_DONTWAIT); 1810 if ((m->m_flags & M_EXT) == 0) { 1811 m_freem(m); 1812 m_freem(*mp); 1813 return ENOBUFS; 1814 } 1815 m->m_len = min(MCLBYTES, sopt_size); 1816 } else { 1817 m->m_len = min(MLEN, sopt_size); 1818 } 1819 sopt_size -= m->m_len; 1820 m_prev->m_next = m; 1821 m_prev = m; 1822 } 1823 return 0; 1824} 1825 1826/* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. 
*/ 1827int 1828soopt_mcopyin(struct sockopt *sopt, struct mbuf *m) 1829{ 1830 struct mbuf *m0 = m; 1831 1832 if (sopt->sopt_val == NULL) 1833 return 0; 1834 while (m != NULL && sopt->sopt_valsize >= m->m_len) { 1835 if (sopt->sopt_td != NULL) { 1836 int error; 1837 1838 error = copyin(sopt->sopt_val, mtod(m, char *), 1839 m->m_len); 1840 if (error != 0) { 1841 m_freem(m0); 1842 return(error); 1843 } 1844 } else 1845 bcopy(sopt->sopt_val, mtod(m, char *), m->m_len); 1846 sopt->sopt_valsize -= m->m_len; 1847 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; 1848 m = m->m_next; 1849 } 1850 if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */ 1851 panic("ip6_sooptmcopyin"); 1852 return 0; 1853} 1854 1855/* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */ 1856int 1857soopt_mcopyout(struct sockopt *sopt, struct mbuf *m) 1858{ 1859 struct mbuf *m0 = m; 1860 size_t valsize = 0; 1861 1862 if (sopt->sopt_val == NULL) 1863 return 0; 1864 while (m != NULL && sopt->sopt_valsize >= m->m_len) { 1865 if (sopt->sopt_td != NULL) { 1866 int error; 1867 1868 error = copyout(mtod(m, char *), sopt->sopt_val, 1869 m->m_len); 1870 if (error != 0) { 1871 m_freem(m0); 1872 return(error); 1873 } 1874 } else 1875 bcopy(mtod(m, char *), sopt->sopt_val, m->m_len); 1876 sopt->sopt_valsize -= m->m_len; 1877 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; 1878 valsize += m->m_len; 1879 m = m->m_next; 1880 } 1881 if (m != NULL) { 1882 /* enough soopt buffer should be given from user-land */ 1883 m_freem(m0); 1884 return(EINVAL); 1885 } 1886 sopt->sopt_valsize = valsize; 1887 return 0; 1888} 1889 1890void 1891sohasoutofband(so) 1892 struct socket *so; 1893{ 1894 if (so->so_sigio != NULL) 1895 pgsigio(&so->so_sigio, SIGURG, 0); 1896 selwakeuppri(&so->so_rcv.sb_sel, PSOCK); 1897} 1898 1899int 1900sopoll(struct socket *so, int events, struct ucred *active_cred, 1901 struct thread *td) 1902{ 1903 int revents = 0; 1904 1905 if (events & (POLLIN | POLLRDNORM)) 1906 if (soreadable(so)) 1907 revents |= events & (POLLIN | POLLRDNORM); 1908 1909 if (events & POLLINIGNEOF) 1910 if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat || 1911 !TAILQ_EMPTY(&so->so_comp) || so->so_error) 1912 revents |= POLLINIGNEOF; 1913 1914 if (events & (POLLOUT | POLLWRNORM)) 1915 if (sowriteable(so)) 1916 revents |= events & (POLLOUT | POLLWRNORM); 1917 1918 if (events & (POLLPRI | POLLRDBAND)) 1919 if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK)) 1920 revents |= events & (POLLPRI | POLLRDBAND); 1921 1922 if (revents == 0) { 1923 if (events & 1924 (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM | 1925 POLLRDBAND)) { 1926 SOCKBUF_LOCK(&so->so_rcv); 1927 selrecord(td, &so->so_rcv.sb_sel); 1928 so->so_rcv.sb_flags |= SB_SEL; 1929 SOCKBUF_UNLOCK(&so->so_rcv); 1930 } 1931 1932 if (events & (POLLOUT | POLLWRNORM)) { 1933 SOCKBUF_LOCK(&so->so_snd); 1934 selrecord(td, &so->so_snd.sb_sel); 1935 so->so_snd.sb_flags |= SB_SEL; 1936 SOCKBUF_UNLOCK(&so->so_snd); 1937 } 1938 } 1939 1940 return (revents); 1941} 1942 1943int 1944soo_kqfilter(struct file *fp, struct knote *kn) 1945{ 1946 struct socket *so = kn->kn_fp->f_data; 1947 struct sockbuf *sb; 1948 1949 switch (kn->kn_filter) { 1950 case EVFILT_READ: 1951 if (so->so_options & SO_ACCEPTCONN) 1952 kn->kn_fop = &solisten_filtops; 1953 else 1954 kn->kn_fop = &soread_filtops; 1955 sb = &so->so_rcv; 1956 break; 1957 case EVFILT_WRITE: 1958 kn->kn_fop = &sowrite_filtops; 1959 sb = &so->so_snd; 1960 break; 1961 default: 1962 return (1); 1963 } 1964 1965 
	SOCKBUF_LOCK(sb);
	SLIST_INSERT_HEAD(&sb->sb_sel.si_note, kn, kn_selnext);
	sb->sb_flags |= SB_KNOTE;
	SOCKBUF_UNLOCK(sb);
	return (0);
}

static void
filt_sordetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;

	SOCKBUF_LOCK(&so->so_rcv);
	SLIST_REMOVE(&so->so_rcv.sb_sel.si_note, kn, knote, kn_selnext);
	if (SLIST_EMPTY(&so->so_rcv.sb_sel.si_note))
		so->so_rcv.sb_flags &= ~SB_KNOTE;
	SOCKBUF_UNLOCK(&so->so_rcv);
}

/*ARGSUSED*/
static int
filt_soread(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	int need_lock, result;

	/*
	 * XXXRW: Conditional locking because filt_soread() can be called
	 * either from KNOTE() in the socket context where the socket buffer
	 * lock is already held, or from kqueue() itself.
	 */
	need_lock = !SOCKBUF_OWNED(&so->so_rcv);
	if (need_lock)
		SOCKBUF_LOCK(&so->so_rcv);
	kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
	if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		result = 1;
	} else if (so->so_error)	/* temporary udp error */
		result = 1;
	else if (kn->kn_sfflags & NOTE_LOWAT)
		result = (kn->kn_data >= kn->kn_sdata);
	else
		result = (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat);
	if (need_lock)
		SOCKBUF_UNLOCK(&so->so_rcv);
	return (result);
}

static void
filt_sowdetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;

	SOCKBUF_LOCK(&so->so_snd);
	SLIST_REMOVE(&so->so_snd.sb_sel.si_note, kn, knote, kn_selnext);
	if (SLIST_EMPTY(&so->so_snd.sb_sel.si_note))
		so->so_snd.sb_flags &= ~SB_KNOTE;
	SOCKBUF_UNLOCK(&so->so_snd);
}

/*ARGSUSED*/
static int
filt_sowrite(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	int need_lock, result;

	/*
	 * XXXRW: Conditional locking because filt_sowrite() can be called
	 * either from KNOTE() in the socket context where the socket buffer
	 * lock is already held, or from kqueue() itself.
	 */
	need_lock = !SOCKBUF_OWNED(&so->so_snd);
	if (need_lock)
		SOCKBUF_LOCK(&so->so_snd);
	kn->kn_data = sbspace(&so->so_snd);
	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		result = 1;
	} else if (so->so_error)	/* temporary udp error */
		result = 1;
	else if (((so->so_state & SS_ISCONNECTED) == 0) &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
		result = 0;
	else if (kn->kn_sfflags & NOTE_LOWAT)
		result = (kn->kn_data >= kn->kn_sdata);
	else
		result = (kn->kn_data >= so->so_snd.sb_lowat);
	if (need_lock)
		SOCKBUF_UNLOCK(&so->so_snd);
	return (result);
}

/*ARGSUSED*/
static int
filt_solisten(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;

	kn->kn_data = so->so_qlen;
	return (! TAILQ_EMPTY(&so->so_comp));
}

int
socheckuid(struct socket *so, uid_t uid)
{

	if (so == NULL)
		return (EPERM);
	if (so->so_cred->cr_uid == uid)
		return (0);
	return (EPERM);
}
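
/*
 * Illustrative userspace sketch (not kernel code): solisten() above clamps
 * a negative or over-large listen(2) backlog to the kern.ipc.somaxconn
 * sysctl (the somaxconn variable exported near the top of this file), so a
 * caller that cares about accept queue depth has to read the limit itself.
 * The helper name below is hypothetical.
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>

static int
effective_backlog(int requested)
{
	int maxconn;
	size_t len = sizeof(maxconn);

	/* Read the same limit solisten() clamps against. */
	if (sysctlbyname("kern.ipc.somaxconn", &maxconn, &len, NULL, 0) == -1)
		return (requested);
	if (requested < 0 || requested > maxconn)
		return (maxconn);
	return (requested);
}
#endif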
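
/*
 * Illustrative userspace sketch (not kernel code): for atomic protocols
 * sosend() fails with EMSGSIZE when the message is larger than the send
 * buffer high-water mark, so a datagram sender may need to raise SO_SNDBUF
 * (serviced by sbreserve() from sosetopt()) before sending.  The helper
 * name is hypothetical and assumes the requested size stays within the
 * global socket buffer limit.
 */
#if 0
#include <sys/types.h>
#include <sys/socket.h>
#include <err.h>

static void
send_large_datagram(int s, const void *msg, size_t len)
{
	int sndbuf = (int)len;

	if (setsockopt(s, SOL_SOCKET, SO_SNDBUF, &sndbuf, sizeof(sndbuf)) == -1)
		err(1, "setsockopt(SO_SNDBUF)");	/* ENOBUFS if sbreserve() fails */
	if (send(s, msg, len, 0) == -1)
		err(1, "send");			/* EMSGSIZE if still above sb_hiwat */
}
#endif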
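
/*
 * Illustrative userspace sketch (not kernel code): sosetopt() copies in a
 * struct linger for SO_LINGER and a struct timeval for SO_SNDTIMEO /
 * SO_RCVTIMEO, converting the latter to scheduler ticks and rejecting
 * values that do not fit with EDOM.  The helper name is hypothetical.
 */
#if 0
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <err.h>

static void
set_close_and_read_timeouts(int s)
{
	struct linger l;
	struct timeval tv;

	l.l_onoff = 1;			/* sets SO_LINGER in so_options */
	l.l_linger = 5;			/* seconds, stored in so_linger */
	if (setsockopt(s, SOL_SOCKET, SO_LINGER, &l, sizeof(l)) == -1)
		err(1, "setsockopt(SO_LINGER)");

	tv.tv_sec = 2;			/* becomes so_rcv.sb_timeo in ticks */
	tv.tv_usec = 0;
	if (setsockopt(s, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) == -1)
		err(1, "setsockopt(SO_RCVTIMEO)");
}
#endif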
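
/*
 * Illustrative userspace sketch (not kernel code): do_setopt_accept_filter()
 * above installs an accept filter on a listening socket from a struct
 * accept_filter_arg, where af_name selects the filter and af_arg is an
 * optional argument string.  This sketch assumes the accf_data(9)
 * "dataready" filter is available and that listen(2) has already been
 * called; otherwise the call fails (EINVAL on a non-listening socket,
 * ENOENT for an unknown filter name).
 */
#if 0
#include <sys/types.h>
#include <sys/socket.h>
#include <string.h>
#include <err.h>

static void
install_dataready_filter(int listen_fd)
{
	struct accept_filter_arg afa;

	memset(&afa, 0, sizeof(afa));
	strlcpy(afa.af_name, "dataready", sizeof(afa.af_name));
	/* afa.af_arg is not used by accf_data and is left empty. */
	if (setsockopt(listen_fd, SOL_SOCKET, SO_ACCEPTFILTER,
	    &afa, sizeof(afa)) == -1)
		err(1, "setsockopt(SO_ACCEPTFILTER)");
}
#endif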
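
/*
 * Illustrative userspace sketch (not kernel code): soo_kqfilter() attaches
 * solisten_filtops to listening sockets and soread_filtops otherwise.
 * filt_soread() honours NOTE_LOWAT by comparing the buffered byte count
 * (kn_data) against the caller-supplied low-water mark, while
 * filt_solisten() reports the completed-connection queue length in
 * kn_data.  The helper name is hypothetical.
 */
#if 0
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <err.h>
#include <unistd.h>

static intptr_t
wait_for_bytes(int fd, int lowat)
{
	struct kevent change, event;
	intptr_t nbytes;
	int kq;

	if ((kq = kqueue()) == -1)
		err(1, "kqueue");
	EV_SET(&change, fd, EVFILT_READ, EV_ADD, NOTE_LOWAT, lowat, NULL);
	if (kevent(kq, &change, 1, &event, 1, NULL) == -1)	/* blocks */
		err(1, "kevent");
	nbytes = event.data;	/* bytes readable, or listen backlog length */
	close(kq);
	return (nbytes);
}
#endif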