/* uipc_socket.c revision 47720 */
1/* 2 * Copyright (c) 1982, 1986, 1988, 1990, 1993 3 * The Regents of the University of California. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 3. All advertising materials mentioning features or use of this software 14 * must display the following acknowledgement: 15 * This product includes software developed by the University of 16 * California, Berkeley and its contributors. 17 * 4. Neither the name of the University nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 
32 * 33 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94 34 * $Id: uipc_socket.c,v 1.58 1999/05/21 15:54:40 ache Exp $ 35 */ 36 37#include <sys/param.h> 38#include <sys/systm.h> 39#include <sys/proc.h> 40#include <sys/fcntl.h> 41#include <sys/malloc.h> 42#include <sys/mbuf.h> 43#include <sys/domain.h> 44#include <sys/kernel.h> 45#include <sys/poll.h> 46#include <sys/protosw.h> 47#include <sys/socket.h> 48#include <sys/socketvar.h> 49#include <sys/resourcevar.h> 50#include <sys/signalvar.h> 51#include <sys/sysctl.h> 52#include <sys/uio.h> 53#include <vm/vm_zone.h> 54 55#include <machine/limits.h> 56 57struct vm_zone *socket_zone; 58so_gen_t so_gencnt; /* generation count for sockets */ 59 60MALLOC_DEFINE(M_SONAME, "soname", "socket name"); 61MALLOC_DEFINE(M_PCB, "pcb", "protocol control block"); 62 63SYSCTL_DECL(_kern_ipc); 64 65static int somaxconn = SOMAXCONN; 66SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW, 67 &somaxconn, 0, "Maximum pending socket connection queue size"); 68 69/* 70 * Socket operation routines. 71 * These routines are called by the routines in 72 * sys_socket.c or from a system process, and 73 * implement the semantics of socket operations by 74 * switching out to the protocol specific routines. 75 */ 76 77/* 78 * Get a socket structure from our zone, and initialize it. 79 * We don't implement `waitok' yet (see comments in uipc_domain.c). 80 * Note that it would probably be better to allocate socket 81 * and PCB at the same time, but I'm not convinced that all 82 * the protocols can be easily modified to do this. 
83 */ 84struct socket * 85soalloc(waitok) 86 int waitok; 87{ 88 struct socket *so; 89 90 so = zalloci(socket_zone); 91 if (so) { 92 /* XXX race condition for reentrant kernel */ 93 bzero(so, sizeof *so); 94 so->so_gencnt = ++so_gencnt; 95 so->so_zone = socket_zone; 96 } 97 return so; 98} 99 100int 101socreate(dom, aso, type, proto, p) 102 int dom; 103 struct socket **aso; 104 register int type; 105 int proto; 106 struct proc *p; 107{ 108 register struct protosw *prp; 109 register struct socket *so; 110 register int error; 111 112 if (proto) 113 prp = pffindproto(dom, proto, type); 114 else 115 prp = pffindtype(dom, type); 116 if (prp == 0 || prp->pr_usrreqs->pru_attach == 0) 117 return (EPROTONOSUPPORT); 118 if (prp->pr_type != type) 119 return (EPROTOTYPE); 120 so = soalloc(p != 0); 121 if (so == 0) 122 return (ENOBUFS); 123 124 TAILQ_INIT(&so->so_incomp); 125 TAILQ_INIT(&so->so_comp); 126 so->so_type = type; 127 if (p != 0) 128 so->so_uid = p->p_ucred->cr_uid; 129 so->so_proto = prp; 130 error = (*prp->pr_usrreqs->pru_attach)(so, proto, p); 131 if (error) { 132 so->so_state |= SS_NOFDREF; 133 sofree(so); 134 return (error); 135 } 136 *aso = so; 137 return (0); 138} 139 140int 141sobind(so, nam, p) 142 struct socket *so; 143 struct sockaddr *nam; 144 struct proc *p; 145{ 146 int s = splnet(); 147 int error; 148 149 error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p); 150 splx(s); 151 return (error); 152} 153 154void 155sodealloc(so) 156 struct socket *so; 157{ 158 so->so_gencnt = ++so_gencnt; 159 zfreei(so->so_zone, so); 160} 161 162int 163solisten(so, backlog, p) 164 register struct socket *so; 165 int backlog; 166 struct proc *p; 167{ 168 int s, error; 169 170 s = splnet(); 171 error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p); 172 if (error) { 173 splx(s); 174 return (error); 175 } 176 if (so->so_comp.tqh_first == NULL) 177 so->so_options |= SO_ACCEPTCONN; 178 if (backlog < 0 || backlog > somaxconn) 179 backlog = somaxconn; 180 so->so_qlimit = 
backlog; 181 splx(s); 182 return (0); 183} 184 185void 186sofree(so) 187 register struct socket *so; 188{ 189 struct socket *head = so->so_head; 190 191 if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) 192 return; 193 if (head != NULL) { 194 if (so->so_state & SS_INCOMP) { 195 TAILQ_REMOVE(&head->so_incomp, so, so_list); 196 head->so_incqlen--; 197 } else if (so->so_state & SS_COMP) { 198 /* 199 * We must not decommission a socket that's 200 * on the accept(2) queue. If we do, then 201 * accept(2) may hang after select(2) indicated 202 * that the listening socket was ready. 203 */ 204 return; 205 } else { 206 panic("sofree: not queued"); 207 } 208 head->so_qlen--; 209 so->so_state &= ~SS_INCOMP; 210 so->so_head = NULL; 211 } 212 sbrelease(&so->so_snd); 213 sorflush(so); 214 sodealloc(so); 215} 216 217/* 218 * Close a socket on last file table reference removal. 219 * Initiate disconnect if connected. 220 * Free socket when disconnect complete. 221 */ 222int 223soclose(so) 224 register struct socket *so; 225{ 226 int s = splnet(); /* conservative */ 227 int error = 0; 228 229 funsetown(so->so_sigio); 230 if (so->so_options & SO_ACCEPTCONN) { 231 struct socket *sp, *sonext; 232 233 for (sp = so->so_incomp.tqh_first; sp != NULL; sp = sonext) { 234 sonext = sp->so_list.tqe_next; 235 (void) soabort(sp); 236 } 237 for (sp = so->so_comp.tqh_first; sp != NULL; sp = sonext) { 238 sonext = sp->so_list.tqe_next; 239 /* Dequeue from so_comp since sofree() won't do it */ 240 TAILQ_REMOVE(&so->so_comp, sp, so_list); 241 so->so_qlen--; 242 sp->so_state &= ~SS_COMP; 243 sp->so_head = NULL; 244 (void) soabort(sp); 245 } 246 } 247 if (so->so_pcb == 0) 248 goto discard; 249 if (so->so_state & SS_ISCONNECTED) { 250 if ((so->so_state & SS_ISDISCONNECTING) == 0) { 251 error = sodisconnect(so); 252 if (error) 253 goto drop; 254 } 255 if (so->so_options & SO_LINGER) { 256 if ((so->so_state & SS_ISDISCONNECTING) && 257 (so->so_state & SS_NBIO)) 258 goto drop; 259 while (so->so_state & 
SS_ISCONNECTED) { 260 error = tsleep((caddr_t)&so->so_timeo, 261 PSOCK | PCATCH, "soclos", so->so_linger * hz); 262 if (error) 263 break; 264 } 265 } 266 } 267drop: 268 if (so->so_pcb) { 269 int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so); 270 if (error == 0) 271 error = error2; 272 } 273discard: 274 if (so->so_state & SS_NOFDREF) 275 panic("soclose: NOFDREF"); 276 so->so_state |= SS_NOFDREF; 277 sofree(so); 278 splx(s); 279 return (error); 280} 281 282/* 283 * Must be called at splnet... 284 */ 285int 286soabort(so) 287 struct socket *so; 288{ 289 290 return (*so->so_proto->pr_usrreqs->pru_abort)(so); 291} 292 293int 294soaccept(so, nam) 295 register struct socket *so; 296 struct sockaddr **nam; 297{ 298 int s = splnet(); 299 int error; 300 301 if ((so->so_state & SS_NOFDREF) == 0) 302 panic("soaccept: !NOFDREF"); 303 so->so_state &= ~SS_NOFDREF; 304 if ((so->so_state & SS_ISDISCONNECTED) == 0) 305 error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam); 306 else { 307 if (nam) 308 *nam = 0; 309 error = 0; 310 } 311 splx(s); 312 return (error); 313} 314 315int 316soconnect(so, nam, p) 317 register struct socket *so; 318 struct sockaddr *nam; 319 struct proc *p; 320{ 321 int s; 322 int error; 323 324 if (so->so_options & SO_ACCEPTCONN) 325 return (EOPNOTSUPP); 326 s = splnet(); 327 /* 328 * If protocol is connection-based, can only connect once. 329 * Otherwise, if connected, try to disconnect first. 330 * This allows user to disconnect by connecting to, e.g., 331 * a null address. 
332 */ 333 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) && 334 ((so->so_proto->pr_flags & PR_CONNREQUIRED) || 335 (error = sodisconnect(so)))) 336 error = EISCONN; 337 else 338 error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, p); 339 splx(s); 340 return (error); 341} 342 343int 344soconnect2(so1, so2) 345 register struct socket *so1; 346 struct socket *so2; 347{ 348 int s = splnet(); 349 int error; 350 351 error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2); 352 splx(s); 353 return (error); 354} 355 356int 357sodisconnect(so) 358 register struct socket *so; 359{ 360 int s = splnet(); 361 int error; 362 363 if ((so->so_state & SS_ISCONNECTED) == 0) { 364 error = ENOTCONN; 365 goto bad; 366 } 367 if (so->so_state & SS_ISDISCONNECTING) { 368 error = EALREADY; 369 goto bad; 370 } 371 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so); 372bad: 373 splx(s); 374 return (error); 375} 376 377#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK) 378/* 379 * Send on a socket. 380 * If send must go all at once and message is larger than 381 * send buffering, then hard error. 382 * Lock against other senders. 383 * If must go all at once and not enough room now, then 384 * inform user that this would block and do nothing. 385 * Otherwise, if nonblocking, send as much as possible. 386 * The data to be sent is described by "uio" if nonzero, 387 * otherwise by the mbuf chain "top" (which must be null 388 * if uio is not). Data provided in mbuf chain must be small 389 * enough to send all at once. 390 * 391 * Returns nonzero on error, timeout or signal; callers 392 * must check for short counts if EINTR/ERESTART are returned. 393 * Data and control buffers are freed on return. 
394 */ 395int 396sosend(so, addr, uio, top, control, flags, p) 397 register struct socket *so; 398 struct sockaddr *addr; 399 struct uio *uio; 400 struct mbuf *top; 401 struct mbuf *control; 402 int flags; 403 struct proc *p; 404{ 405 struct mbuf **mp; 406 register struct mbuf *m; 407 register long space, len, resid; 408 int clen = 0, error, s, dontroute, mlen; 409 int atomic = sosendallatonce(so) || top; 410 411 if (uio) 412 resid = uio->uio_resid; 413 else 414 resid = top->m_pkthdr.len; 415 /* 416 * In theory resid should be unsigned. 417 * However, space must be signed, as it might be less than 0 418 * if we over-committed, and we must use a signed comparison 419 * of space and resid. On the other hand, a negative resid 420 * causes us to loop sending 0-length segments to the protocol. 421 * 422 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM 423 * type sockets since that's an error. 424 */ 425 if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) { 426 error = EINVAL; 427 goto out; 428 } 429 430 dontroute = 431 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 && 432 (so->so_proto->pr_flags & PR_ATOMIC); 433 if (p) 434 p->p_stats->p_ru.ru_msgsnd++; 435 if (control) 436 clen = control->m_len; 437#define snderr(errno) { error = errno; splx(s); goto release; } 438 439restart: 440 error = sblock(&so->so_snd, SBLOCKWAIT(flags)); 441 if (error) 442 goto out; 443 do { 444 s = splnet(); 445 if (so->so_state & SS_CANTSENDMORE) 446 snderr(EPIPE); 447 if (so->so_error) { 448 error = so->so_error; 449 so->so_error = 0; 450 splx(s); 451 goto release; 452 } 453 if ((so->so_state & SS_ISCONNECTED) == 0) { 454 /* 455 * `sendto' and `sendmsg' is allowed on a connection- 456 * based socket if it supports implied connect. 457 * Return ENOTCONN if not connected and no address is 458 * supplied. 
459 */ 460 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && 461 (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { 462 if ((so->so_state & SS_ISCONFIRMING) == 0 && 463 !(resid == 0 && clen != 0)) 464 snderr(ENOTCONN); 465 } else if (addr == 0) 466 snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ? 467 ENOTCONN : EDESTADDRREQ); 468 } 469 space = sbspace(&so->so_snd); 470 if (flags & MSG_OOB) 471 space += 1024; 472 if ((atomic && resid > so->so_snd.sb_hiwat) || 473 clen > so->so_snd.sb_hiwat) 474 snderr(EMSGSIZE); 475 if (space < resid + clen && uio && 476 (atomic || space < so->so_snd.sb_lowat || space < clen)) { 477 if (so->so_state & SS_NBIO) 478 snderr(EWOULDBLOCK); 479 sbunlock(&so->so_snd); 480 error = sbwait(&so->so_snd); 481 splx(s); 482 if (error) 483 goto out; 484 goto restart; 485 } 486 splx(s); 487 mp = ⊤ 488 space -= clen; 489 do { 490 if (uio == NULL) { 491 /* 492 * Data is prepackaged in "top". 493 */ 494 resid = 0; 495 if (flags & MSG_EOR) 496 top->m_flags |= M_EOR; 497 } else do { 498 if (top == 0) { 499 MGETHDR(m, M_WAIT, MT_DATA); 500 mlen = MHLEN; 501 m->m_pkthdr.len = 0; 502 m->m_pkthdr.rcvif = (struct ifnet *)0; 503 } else { 504 MGET(m, M_WAIT, MT_DATA); 505 mlen = MLEN; 506 } 507 if (resid >= MINCLSIZE) { 508 MCLGET(m, M_WAIT); 509 if ((m->m_flags & M_EXT) == 0) 510 goto nopages; 511 mlen = MCLBYTES; 512 len = min(min(mlen, resid), space); 513 } else { 514nopages: 515 len = min(min(mlen, resid), space); 516 /* 517 * For datagram protocols, leave room 518 * for protocol headers in first mbuf. 
519 */ 520 if (atomic && top == 0 && len < mlen) 521 MH_ALIGN(m, len); 522 } 523 space -= len; 524 error = uiomove(mtod(m, caddr_t), (int)len, uio); 525 resid = uio->uio_resid; 526 m->m_len = len; 527 *mp = m; 528 top->m_pkthdr.len += len; 529 if (error) 530 goto release; 531 mp = &m->m_next; 532 if (resid <= 0) { 533 if (flags & MSG_EOR) 534 top->m_flags |= M_EOR; 535 break; 536 } 537 } while (space > 0 && atomic); 538 if (dontroute) 539 so->so_options |= SO_DONTROUTE; 540 s = splnet(); /* XXX */ 541 /* 542 * XXX all the SS_CANTSENDMORE checks previously 543 * done could be out of date. We could have recieved 544 * a reset packet in an interrupt or maybe we slept 545 * while doing page faults in uiomove() etc. We could 546 * probably recheck again inside the splnet() protection 547 * here, but there are probably other places that this 548 * also happens. We must rethink this. 549 */ 550 error = (*so->so_proto->pr_usrreqs->pru_send)(so, 551 (flags & MSG_OOB) ? PRUS_OOB : 552 /* 553 * If the user set MSG_EOF, the protocol 554 * understands this flag and nothing left to 555 * send then use PRU_SEND_EOF instead of PRU_SEND. 556 */ 557 ((flags & MSG_EOF) && 558 (so->so_proto->pr_flags & PR_IMPLOPCL) && 559 (resid <= 0)) ? 560 PRUS_EOF : 561 /* If there is more to send set PRUS_MORETOCOME */ 562 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0, 563 top, addr, control, p); 564 splx(s); 565 if (dontroute) 566 so->so_options &= ~SO_DONTROUTE; 567 clen = 0; 568 control = 0; 569 top = 0; 570 mp = ⊤ 571 if (error) 572 goto release; 573 } while (resid && space > 0); 574 } while (resid); 575 576release: 577 sbunlock(&so->so_snd); 578out: 579 if (top) 580 m_freem(top); 581 if (control) 582 m_freem(control); 583 return (error); 584} 585 586/* 587 * Implement receive operations on a socket. 588 * We depend on the way that records are added to the sockbuf 589 * by sbappend*. 
In particular, each record (mbufs linked through m_next) 590 * must begin with an address if the protocol so specifies, 591 * followed by an optional mbuf or mbufs containing ancillary data, 592 * and then zero or more mbufs of data. 593 * In order to avoid blocking network interrupts for the entire time here, 594 * we splx() while doing the actual copy to user space. 595 * Although the sockbuf is locked, new data may still be appended, 596 * and thus we must maintain consistency of the sockbuf during that time. 597 * 598 * The caller may receive the data as a single mbuf chain by supplying 599 * an mbuf **mp0 for use in returning the chain. The uio is then used 600 * only for the count in uio_resid. 601 */ 602int 603soreceive(so, psa, uio, mp0, controlp, flagsp) 604 register struct socket *so; 605 struct sockaddr **psa; 606 struct uio *uio; 607 struct mbuf **mp0; 608 struct mbuf **controlp; 609 int *flagsp; 610{ 611 register struct mbuf *m, **mp; 612 register int flags, len, error, s, offset; 613 struct protosw *pr = so->so_proto; 614 struct mbuf *nextrecord; 615 int moff, type = 0; 616 int orig_resid = uio->uio_resid; 617 618 mp = mp0; 619 if (psa) 620 *psa = 0; 621 if (controlp) 622 *controlp = 0; 623 if (flagsp) 624 flags = *flagsp &~ MSG_EOR; 625 else 626 flags = 0; 627 if (flags & MSG_OOB) { 628 m = m_get(M_WAIT, MT_DATA); 629 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK); 630 if (error) 631 goto bad; 632 do { 633 error = uiomove(mtod(m, caddr_t), 634 (int) min(uio->uio_resid, m->m_len), uio); 635 m = m_free(m); 636 } while (uio->uio_resid && error == 0 && m); 637bad: 638 if (m) 639 m_freem(m); 640 return (error); 641 } 642 if (mp) 643 *mp = (struct mbuf *)0; 644 if (so->so_state & SS_ISCONFIRMING && uio->uio_resid) 645 (*pr->pr_usrreqs->pru_rcvd)(so, 0); 646 647restart: 648 error = sblock(&so->so_rcv, SBLOCKWAIT(flags)); 649 if (error) 650 return (error); 651 s = splnet(); 652 653 m = so->so_rcv.sb_mb; 654 /* 655 * If we have less data than 
requested, block awaiting more 656 * (subject to any timeout) if: 657 * 1. the current count is less than the low water mark, or 658 * 2. MSG_WAITALL is set, and it is possible to do the entire 659 * receive operation at once if we block (resid <= hiwat). 660 * 3. MSG_DONTWAIT is not set 661 * If MSG_WAITALL is set but resid is larger than the receive buffer, 662 * we have to do the receive in sections, and thus risk returning 663 * a short count if a timeout or signal occurs after we start. 664 */ 665 if (m == 0 || (((flags & MSG_DONTWAIT) == 0 && 666 so->so_rcv.sb_cc < uio->uio_resid) && 667 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat || 668 ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) && 669 m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) { 670 KASSERT(m != 0 || !so->so_rcv.sb_cc, ("receive 1")); 671 if (so->so_error) { 672 if (m) 673 goto dontblock; 674 error = so->so_error; 675 if ((flags & MSG_PEEK) == 0) 676 so->so_error = 0; 677 goto release; 678 } 679 if (so->so_state & SS_CANTRCVMORE) { 680 if (m) 681 goto dontblock; 682 else 683 goto release; 684 } 685 for (; m; m = m->m_next) 686 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { 687 m = so->so_rcv.sb_mb; 688 goto dontblock; 689 } 690 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 && 691 (so->so_proto->pr_flags & PR_CONNREQUIRED)) { 692 error = ENOTCONN; 693 goto release; 694 } 695 if (uio->uio_resid == 0) 696 goto release; 697 if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT)) { 698 error = EWOULDBLOCK; 699 goto release; 700 } 701 sbunlock(&so->so_rcv); 702 error = sbwait(&so->so_rcv); 703 splx(s); 704 if (error) 705 return (error); 706 goto restart; 707 } 708dontblock: 709 if (uio->uio_procp) 710 uio->uio_procp->p_stats->p_ru.ru_msgrcv++; 711 nextrecord = m->m_nextpkt; 712 if (pr->pr_flags & PR_ADDR) { 713 KASSERT(m->m_type == MT_SONAME, ("receive 1a")); 714 orig_resid = 0; 715 if (psa) 716 *psa = dup_sockaddr(mtod(m, struct sockaddr *), 717 mp0 == 0); 718 
if (flags & MSG_PEEK) { 719 m = m->m_next; 720 } else { 721 sbfree(&so->so_rcv, m); 722 MFREE(m, so->so_rcv.sb_mb); 723 m = so->so_rcv.sb_mb; 724 } 725 } 726 while (m && m->m_type == MT_CONTROL && error == 0) { 727 if (flags & MSG_PEEK) { 728 if (controlp) 729 *controlp = m_copy(m, 0, m->m_len); 730 m = m->m_next; 731 } else { 732 sbfree(&so->so_rcv, m); 733 if (controlp) { 734 if (pr->pr_domain->dom_externalize && 735 mtod(m, struct cmsghdr *)->cmsg_type == 736 SCM_RIGHTS) 737 error = (*pr->pr_domain->dom_externalize)(m); 738 *controlp = m; 739 so->so_rcv.sb_mb = m->m_next; 740 m->m_next = 0; 741 m = so->so_rcv.sb_mb; 742 } else { 743 MFREE(m, so->so_rcv.sb_mb); 744 m = so->so_rcv.sb_mb; 745 } 746 } 747 if (controlp) { 748 orig_resid = 0; 749 controlp = &(*controlp)->m_next; 750 } 751 } 752 if (m) { 753 if ((flags & MSG_PEEK) == 0) 754 m->m_nextpkt = nextrecord; 755 type = m->m_type; 756 if (type == MT_OOBDATA) 757 flags |= MSG_OOB; 758 } 759 moff = 0; 760 offset = 0; 761 while (m && uio->uio_resid > 0 && error == 0) { 762 if (m->m_type == MT_OOBDATA) { 763 if (type != MT_OOBDATA) 764 break; 765 } else if (type == MT_OOBDATA) 766 break; 767 else 768 KASSERT(m->m_type == MT_DATA || m->m_type == MT_HEADER, 769 ("receive 3")); 770 so->so_state &= ~SS_RCVATMARK; 771 len = uio->uio_resid; 772 if (so->so_oobmark && len > so->so_oobmark - offset) 773 len = so->so_oobmark - offset; 774 if (len > m->m_len - moff) 775 len = m->m_len - moff; 776 /* 777 * If mp is set, just pass back the mbufs. 778 * Otherwise copy them out via the uio, then free. 779 * Sockbuf must be consistent here (points to current mbuf, 780 * it points to next record) when we drop priority; 781 * we must note any additions to the sockbuf when we 782 * block interrupts again. 
783 */ 784 if (mp == 0) { 785 splx(s); 786 error = uiomove(mtod(m, caddr_t) + moff, (int)len, uio); 787 s = splnet(); 788 if (error) 789 goto release; 790 } else 791 uio->uio_resid -= len; 792 if (len == m->m_len - moff) { 793 if (m->m_flags & M_EOR) 794 flags |= MSG_EOR; 795 if (flags & MSG_PEEK) { 796 m = m->m_next; 797 moff = 0; 798 } else { 799 nextrecord = m->m_nextpkt; 800 sbfree(&so->so_rcv, m); 801 if (mp) { 802 *mp = m; 803 mp = &m->m_next; 804 so->so_rcv.sb_mb = m = m->m_next; 805 *mp = (struct mbuf *)0; 806 } else { 807 MFREE(m, so->so_rcv.sb_mb); 808 m = so->so_rcv.sb_mb; 809 } 810 if (m) 811 m->m_nextpkt = nextrecord; 812 } 813 } else { 814 if (flags & MSG_PEEK) 815 moff += len; 816 else { 817 if (mp) 818 *mp = m_copym(m, 0, len, M_WAIT); 819 m->m_data += len; 820 m->m_len -= len; 821 so->so_rcv.sb_cc -= len; 822 } 823 } 824 if (so->so_oobmark) { 825 if ((flags & MSG_PEEK) == 0) { 826 so->so_oobmark -= len; 827 if (so->so_oobmark == 0) { 828 so->so_state |= SS_RCVATMARK; 829 break; 830 } 831 } else { 832 offset += len; 833 if (offset == so->so_oobmark) 834 break; 835 } 836 } 837 if (flags & MSG_EOR) 838 break; 839 /* 840 * If the MSG_WAITALL flag is set (for non-atomic socket), 841 * we must not quit until "uio->uio_resid == 0" or an error 842 * termination. If a signal/timeout occurs, return 843 * with a short count but without error. 844 * Keep sockbuf locked against other readers. 
845 */ 846 while (flags & MSG_WAITALL && m == 0 && uio->uio_resid > 0 && 847 !sosendallatonce(so) && !nextrecord) { 848 if (so->so_error || so->so_state & SS_CANTRCVMORE) 849 break; 850 error = sbwait(&so->so_rcv); 851 if (error) { 852 sbunlock(&so->so_rcv); 853 splx(s); 854 return (0); 855 } 856 m = so->so_rcv.sb_mb; 857 if (m) 858 nextrecord = m->m_nextpkt; 859 } 860 } 861 862 if (m && pr->pr_flags & PR_ATOMIC) { 863 flags |= MSG_TRUNC; 864 if ((flags & MSG_PEEK) == 0) 865 (void) sbdroprecord(&so->so_rcv); 866 } 867 if ((flags & MSG_PEEK) == 0) { 868 if (m == 0) 869 so->so_rcv.sb_mb = nextrecord; 870 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) 871 (*pr->pr_usrreqs->pru_rcvd)(so, flags); 872 } 873 if (orig_resid == uio->uio_resid && orig_resid && 874 (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) { 875 sbunlock(&so->so_rcv); 876 splx(s); 877 goto restart; 878 } 879 880 if (flagsp) 881 *flagsp |= flags; 882release: 883 sbunlock(&so->so_rcv); 884 splx(s); 885 return (error); 886} 887 888int 889soshutdown(so, how) 890 register struct socket *so; 891 register int how; 892{ 893 register struct protosw *pr = so->so_proto; 894 895 how++; 896 if (how & FREAD) 897 sorflush(so); 898 if (how & FWRITE) 899 return ((*pr->pr_usrreqs->pru_shutdown)(so)); 900 return (0); 901} 902 903void 904sorflush(so) 905 register struct socket *so; 906{ 907 register struct sockbuf *sb = &so->so_rcv; 908 register struct protosw *pr = so->so_proto; 909 register int s; 910 struct sockbuf asb; 911 912 sb->sb_flags |= SB_NOINTR; 913 (void) sblock(sb, M_WAITOK); 914 s = splimp(); 915 socantrcvmore(so); 916 sbunlock(sb); 917 asb = *sb; 918 bzero((caddr_t)sb, sizeof (*sb)); 919 splx(s); 920 if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose) 921 (*pr->pr_domain->dom_dispose)(asb.sb_mb); 922 sbrelease(&asb); 923} 924 925/* 926 * Perhaps this routine, and sooptcopyout(), below, ought to come in 927 * an additional variant to handle the case where the option value needs 928 * 
to be some kind of integer, but not a specific size. 929 * In addition to their use here, these functions are also called by the 930 * protocol-level pr_ctloutput() routines. 931 */ 932int 933sooptcopyin(sopt, buf, len, minlen) 934 struct sockopt *sopt; 935 void *buf; 936 size_t len; 937 size_t minlen; 938{ 939 size_t valsize; 940 941 /* 942 * If the user gives us more than we wanted, we ignore it, 943 * but if we don't get the minimum length the caller 944 * wants, we return EINVAL. On success, sopt->sopt_valsize 945 * is set to however much we actually retrieved. 946 */ 947 if ((valsize = sopt->sopt_valsize) < minlen) 948 return EINVAL; 949 if (valsize > len) 950 sopt->sopt_valsize = valsize = len; 951 952 if (sopt->sopt_p != 0) 953 return (copyin(sopt->sopt_val, buf, valsize)); 954 955 bcopy(sopt->sopt_val, buf, valsize); 956 return 0; 957} 958 959int 960sosetopt(so, sopt) 961 struct socket *so; 962 struct sockopt *sopt; 963{ 964 int error, optval; 965 struct linger l; 966 struct timeval tv; 967 u_long val; 968 969 error = 0; 970 if (sopt->sopt_level != SOL_SOCKET) { 971 if (so->so_proto && so->so_proto->pr_ctloutput) 972 return ((*so->so_proto->pr_ctloutput) 973 (so, sopt)); 974 error = ENOPROTOOPT; 975 } else { 976 switch (sopt->sopt_name) { 977 case SO_LINGER: 978 error = sooptcopyin(sopt, &l, sizeof l, sizeof l); 979 if (error) 980 goto bad; 981 982 so->so_linger = l.l_linger; 983 if (l.l_onoff) 984 so->so_options |= SO_LINGER; 985 else 986 so->so_options &= ~SO_LINGER; 987 break; 988 989 case SO_DEBUG: 990 case SO_KEEPALIVE: 991 case SO_DONTROUTE: 992 case SO_USELOOPBACK: 993 case SO_BROADCAST: 994 case SO_REUSEADDR: 995 case SO_REUSEPORT: 996 case SO_OOBINLINE: 997 case SO_TIMESTAMP: 998 error = sooptcopyin(sopt, &optval, sizeof optval, 999 sizeof optval); 1000 if (error) 1001 goto bad; 1002 if (optval) 1003 so->so_options |= sopt->sopt_name; 1004 else 1005 so->so_options &= ~sopt->sopt_name; 1006 break; 1007 1008 case SO_SNDBUF: 1009 case SO_RCVBUF: 1010 
case SO_SNDLOWAT: 1011 case SO_RCVLOWAT: 1012 error = sooptcopyin(sopt, &optval, sizeof optval, 1013 sizeof optval); 1014 if (error) 1015 goto bad; 1016 1017 /* 1018 * Values < 1 make no sense for any of these 1019 * options, so disallow them. 1020 */ 1021 if (optval < 1) { 1022 error = EINVAL; 1023 goto bad; 1024 } 1025 1026 switch (sopt->sopt_name) { 1027 case SO_SNDBUF: 1028 case SO_RCVBUF: 1029 if (sbreserve(sopt->sopt_name == SO_SNDBUF ? 1030 &so->so_snd : &so->so_rcv, 1031 (u_long) optval) == 0) { 1032 error = ENOBUFS; 1033 goto bad; 1034 } 1035 break; 1036 1037 /* 1038 * Make sure the low-water is never greater than 1039 * the high-water. 1040 */ 1041 case SO_SNDLOWAT: 1042 so->so_snd.sb_lowat = 1043 (optval > so->so_snd.sb_hiwat) ? 1044 so->so_snd.sb_hiwat : optval; 1045 break; 1046 case SO_RCVLOWAT: 1047 so->so_rcv.sb_lowat = 1048 (optval > so->so_rcv.sb_hiwat) ? 1049 so->so_rcv.sb_hiwat : optval; 1050 break; 1051 } 1052 break; 1053 1054 case SO_SNDTIMEO: 1055 case SO_RCVTIMEO: 1056 error = sooptcopyin(sopt, &tv, sizeof tv, 1057 sizeof tv); 1058 if (error) 1059 goto bad; 1060 1061 /* assert(hz > 0); */ 1062 if (tv.tv_sec < 0 || tv.tv_sec > SHRT_MAX / hz || 1063 tv.tv_usec < 0 || tv.tv_usec >= 1000000) { 1064 error = EDOM; 1065 goto bad; 1066 } 1067 /* assert(tick > 0); */ 1068 /* assert(ULONG_MAX - SHRT_MAX >= 1000000); */ 1069 val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick; 1070 if (val > SHRT_MAX) { 1071 error = EDOM; 1072 goto bad; 1073 } 1074 1075 switch (sopt->sopt_name) { 1076 case SO_SNDTIMEO: 1077 so->so_snd.sb_timeo = val; 1078 break; 1079 case SO_RCVTIMEO: 1080 so->so_rcv.sb_timeo = val; 1081 break; 1082 } 1083 break; 1084 1085 default: 1086 error = ENOPROTOOPT; 1087 break; 1088 } 1089 if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) { 1090 (void) ((*so->so_proto->pr_ctloutput) 1091 (so, sopt)); 1092 } 1093 } 1094bad: 1095 return (error); 1096} 1097 1098/* Helper routine for getsockopt */ 1099int 1100sooptcopyout(sopt, buf, len) 
1101 struct sockopt *sopt; 1102 void *buf; 1103 size_t len; 1104{ 1105 int error; 1106 size_t valsize; 1107 1108 error = 0; 1109 1110 /* 1111 * Documented get behavior is that we always return a value, 1112 * possibly truncated to fit in the user's buffer. 1113 * Traditional behavior is that we always tell the user 1114 * precisely how much we copied, rather than something useful 1115 * like the total amount we had available for her. 1116 * Note that this interface is not idempotent; the entire answer must 1117 * generated ahead of time. 1118 */ 1119 valsize = min(len, sopt->sopt_valsize); 1120 sopt->sopt_valsize = valsize; 1121 if (sopt->sopt_val != 0) { 1122 if (sopt->sopt_p != 0) 1123 error = copyout(buf, sopt->sopt_val, valsize); 1124 else 1125 bcopy(buf, sopt->sopt_val, valsize); 1126 } 1127 return error; 1128} 1129 1130int 1131sogetopt(so, sopt) 1132 struct socket *so; 1133 struct sockopt *sopt; 1134{ 1135 int error, optval; 1136 struct linger l; 1137 struct timeval tv; 1138 1139 error = 0; 1140 if (sopt->sopt_level != SOL_SOCKET) { 1141 if (so->so_proto && so->so_proto->pr_ctloutput) { 1142 return ((*so->so_proto->pr_ctloutput) 1143 (so, sopt)); 1144 } else 1145 return (ENOPROTOOPT); 1146 } else { 1147 switch (sopt->sopt_name) { 1148 case SO_LINGER: 1149 l.l_onoff = so->so_options & SO_LINGER; 1150 l.l_linger = so->so_linger; 1151 error = sooptcopyout(sopt, &l, sizeof l); 1152 break; 1153 1154 case SO_USELOOPBACK: 1155 case SO_DONTROUTE: 1156 case SO_DEBUG: 1157 case SO_KEEPALIVE: 1158 case SO_REUSEADDR: 1159 case SO_REUSEPORT: 1160 case SO_BROADCAST: 1161 case SO_OOBINLINE: 1162 case SO_TIMESTAMP: 1163 optval = so->so_options & sopt->sopt_name; 1164integer: 1165 error = sooptcopyout(sopt, &optval, sizeof optval); 1166 break; 1167 1168 case SO_TYPE: 1169 optval = so->so_type; 1170 goto integer; 1171 1172 case SO_ERROR: 1173 optval = so->so_error; 1174 so->so_error = 0; 1175 goto integer; 1176 1177 case SO_SNDBUF: 1178 optval = so->so_snd.sb_hiwat; 1179 goto 
integer; 1180 1181 case SO_RCVBUF: 1182 optval = so->so_rcv.sb_hiwat; 1183 goto integer; 1184 1185 case SO_SNDLOWAT: 1186 optval = so->so_snd.sb_lowat; 1187 goto integer; 1188 1189 case SO_RCVLOWAT: 1190 optval = so->so_rcv.sb_lowat; 1191 goto integer; 1192 1193 case SO_SNDTIMEO: 1194 case SO_RCVTIMEO: 1195 optval = (sopt->sopt_name == SO_SNDTIMEO ? 1196 so->so_snd.sb_timeo : so->so_rcv.sb_timeo); 1197 1198 tv.tv_sec = optval / hz; 1199 tv.tv_usec = (optval % hz) * tick; 1200 error = sooptcopyout(sopt, &tv, sizeof tv); 1201 break; 1202 1203 default: 1204 error = ENOPROTOOPT; 1205 break; 1206 } 1207 return (error); 1208 } 1209} 1210 1211void 1212sohasoutofband(so) 1213 register struct socket *so; 1214{ 1215 if (so->so_sigio != NULL) 1216 pgsigio(so->so_sigio, SIGURG, 0); 1217 selwakeup(&so->so_rcv.sb_sel); 1218} 1219 1220int 1221sopoll(struct socket *so, int events, struct ucred *cred, struct proc *p) 1222{ 1223 int revents = 0; 1224 int s = splnet(); 1225 1226 if (events & (POLLIN | POLLRDNORM)) 1227 if (soreadable(so)) 1228 revents |= events & (POLLIN | POLLRDNORM); 1229 1230 if (events & (POLLOUT | POLLWRNORM)) 1231 if (sowriteable(so)) 1232 revents |= events & (POLLOUT | POLLWRNORM); 1233 1234 if (events & (POLLPRI | POLLRDBAND)) 1235 if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) 1236 revents |= events & (POLLPRI | POLLRDBAND); 1237 1238 if (revents == 0) { 1239 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) { 1240 selrecord(p, &so->so_rcv.sb_sel); 1241 so->so_rcv.sb_flags |= SB_SEL; 1242 } 1243 1244 if (events & (POLLOUT | POLLWRNORM)) { 1245 selrecord(p, &so->so_snd.sb_sel); 1246 so->so_snd.sb_flags |= SB_SEL; 1247 } 1248 } 1249 1250 splx(s); 1251 return (revents); 1252} 1253