/* uipc_socket.c revision 43301 */
1/* 2 * Copyright (c) 1982, 1986, 1988, 1990, 1993 3 * The Regents of the University of California. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 3. All advertising materials mentioning features or use of this software 14 * must display the following acknowledgement: 15 * This product includes software developed by the University of 16 * California, Berkeley and its contributors. 17 * 4. Neither the name of the University nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 
32 * 33 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94 34 * $Id: uipc_socket.c,v 1.52 1999/01/25 16:58:52 fenner Exp $ 35 */ 36 37#include <sys/param.h> 38#include <sys/systm.h> 39#include <sys/proc.h> 40#include <sys/fcntl.h> 41#include <sys/malloc.h> 42#include <sys/mbuf.h> 43#include <sys/domain.h> 44#include <sys/kernel.h> 45#include <sys/poll.h> 46#include <sys/protosw.h> 47#include <sys/socket.h> 48#include <sys/socketvar.h> 49#include <sys/resourcevar.h> 50#include <sys/signalvar.h> 51#include <sys/sysctl.h> 52#include <sys/uio.h> 53#include <vm/vm_zone.h> 54 55#include <machine/limits.h> 56 57struct vm_zone *socket_zone; 58so_gen_t so_gencnt; /* generation count for sockets */ 59 60MALLOC_DEFINE(M_SONAME, "soname", "socket name"); 61MALLOC_DEFINE(M_PCB, "pcb", "protocol control block"); 62 63static int somaxconn = SOMAXCONN; 64SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW, &somaxconn, 65 0, ""); 66 67/* 68 * Socket operation routines. 69 * These routines are called by the routines in 70 * sys_socket.c or from a system process, and 71 * implement the semantics of socket operations by 72 * switching out to the protocol specific routines. 73 */ 74 75/* 76 * Get a socket structure from our zone, and initialize it. 77 * We don't implement `waitok' yet (see comments in uipc_domain.c). 78 * Note that it would probably be better to allocate socket 79 * and PCB at the same time, but I'm not convinced that all 80 * the protocols can be easily modified to do this. 
81 */ 82struct socket * 83soalloc(waitok) 84 int waitok; 85{ 86 struct socket *so; 87 88 so = zalloci(socket_zone); 89 if (so) { 90 /* XXX race condition for reentrant kernel */ 91 bzero(so, sizeof *so); 92 so->so_gencnt = ++so_gencnt; 93 so->so_zone = socket_zone; 94 } 95 return so; 96} 97 98int 99socreate(dom, aso, type, proto, p) 100 int dom; 101 struct socket **aso; 102 register int type; 103 int proto; 104 struct proc *p; 105{ 106 register struct protosw *prp; 107 register struct socket *so; 108 register int error; 109 110 if (proto) 111 prp = pffindproto(dom, proto, type); 112 else 113 prp = pffindtype(dom, type); 114 if (prp == 0 || prp->pr_usrreqs->pru_attach == 0) 115 return (EPROTONOSUPPORT); 116 if (prp->pr_type != type) 117 return (EPROTOTYPE); 118 so = soalloc(p != 0); 119 if (so == 0) 120 return (ENOBUFS); 121 122 TAILQ_INIT(&so->so_incomp); 123 TAILQ_INIT(&so->so_comp); 124 so->so_type = type; 125 if (p != 0) 126 so->so_uid = p->p_ucred->cr_uid; 127 so->so_proto = prp; 128 error = (*prp->pr_usrreqs->pru_attach)(so, proto, p); 129 if (error) { 130 so->so_state |= SS_NOFDREF; 131 sofree(so); 132 return (error); 133 } 134 *aso = so; 135 return (0); 136} 137 138int 139sobind(so, nam, p) 140 struct socket *so; 141 struct sockaddr *nam; 142 struct proc *p; 143{ 144 int s = splnet(); 145 int error; 146 147 error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p); 148 splx(s); 149 return (error); 150} 151 152void 153sodealloc(so) 154 struct socket *so; 155{ 156 so->so_gencnt = ++so_gencnt; 157 zfreei(so->so_zone, so); 158} 159 160int 161solisten(so, backlog, p) 162 register struct socket *so; 163 int backlog; 164 struct proc *p; 165{ 166 int s, error; 167 168 s = splnet(); 169 error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p); 170 if (error) { 171 splx(s); 172 return (error); 173 } 174 if (so->so_comp.tqh_first == NULL) 175 so->so_options |= SO_ACCEPTCONN; 176 if (backlog < 0 || backlog > somaxconn) 177 backlog = somaxconn; 178 so->so_qlimit = backlog; 
179 splx(s); 180 return (0); 181} 182 183void 184sofree(so) 185 register struct socket *so; 186{ 187 struct socket *head = so->so_head; 188 189 if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) 190 return; 191 if (head != NULL) { 192 if (so->so_state & SS_INCOMP) { 193 TAILQ_REMOVE(&head->so_incomp, so, so_list); 194 head->so_incqlen--; 195 } else if (so->so_state & SS_COMP) { 196 /* 197 * We must not decommission a socket that's 198 * on the accept(2) queue. If we do, then 199 * accept(2) may hang after select(2) indicated 200 * that the listening socket was ready. 201 */ 202 } else { 203 panic("sofree: not queued"); 204 } 205 head->so_qlen--; 206 so->so_state &= ~(SS_INCOMP|SS_COMP); 207 so->so_head = NULL; 208 } 209 sbrelease(&so->so_snd); 210 sorflush(so); 211 sodealloc(so); 212} 213 214/* 215 * Close a socket on last file table reference removal. 216 * Initiate disconnect if connected. 217 * Free socket when disconnect complete. 218 */ 219int 220soclose(so) 221 register struct socket *so; 222{ 223 int s = splnet(); /* conservative */ 224 int error = 0; 225 226 funsetown(so->so_sigio); 227 if (so->so_options & SO_ACCEPTCONN) { 228 struct socket *sp, *sonext; 229 230 for (sp = so->so_incomp.tqh_first; sp != NULL; sp = sonext) { 231 sonext = sp->so_list.tqe_next; 232 (void) soabort(sp); 233 } 234 for (sp = so->so_comp.tqh_first; sp != NULL; sp = sonext) { 235 sonext = sp->so_list.tqe_next; 236 TAILQ_REMOVE(&so->so_comp, sp, so_list); 237 (void) soabort(sp); 238 } 239 } 240 if (so->so_pcb == 0) 241 goto discard; 242 if (so->so_state & SS_ISCONNECTED) { 243 if ((so->so_state & SS_ISDISCONNECTING) == 0) { 244 error = sodisconnect(so); 245 if (error) 246 goto drop; 247 } 248 if (so->so_options & SO_LINGER) { 249 if ((so->so_state & SS_ISDISCONNECTING) && 250 (so->so_state & SS_NBIO)) 251 goto drop; 252 while (so->so_state & SS_ISCONNECTED) { 253 error = tsleep((caddr_t)&so->so_timeo, 254 PSOCK | PCATCH, "soclos", so->so_linger); 255 if (error) 256 break; 257 } 258 
} 259 } 260drop: 261 if (so->so_pcb) { 262 int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so); 263 if (error == 0) 264 error = error2; 265 } 266discard: 267 if (so->so_state & SS_NOFDREF) 268 panic("soclose: NOFDREF"); 269 so->so_state |= SS_NOFDREF; 270 sofree(so); 271 splx(s); 272 return (error); 273} 274 275/* 276 * Must be called at splnet... 277 */ 278int 279soabort(so) 280 struct socket *so; 281{ 282 283 return (*so->so_proto->pr_usrreqs->pru_abort)(so); 284} 285 286int 287soaccept(so, nam) 288 register struct socket *so; 289 struct sockaddr **nam; 290{ 291 int s = splnet(); 292 int error; 293 294 if ((so->so_state & SS_NOFDREF) == 0) 295 panic("soaccept: !NOFDREF"); 296 so->so_state &= ~SS_NOFDREF; 297 if ((so->so_state & SS_ISDISCONNECTED) == 0) 298 error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam); 299 else { 300 if (nam) 301 *nam = 0; 302 error = 0; 303 } 304 splx(s); 305 return (error); 306} 307 308int 309soconnect(so, nam, p) 310 register struct socket *so; 311 struct sockaddr *nam; 312 struct proc *p; 313{ 314 int s; 315 int error; 316 317 if (so->so_options & SO_ACCEPTCONN) 318 return (EOPNOTSUPP); 319 s = splnet(); 320 /* 321 * If protocol is connection-based, can only connect once. 322 * Otherwise, if connected, try to disconnect first. 323 * This allows user to disconnect by connecting to, e.g., 324 * a null address. 
325 */ 326 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) && 327 ((so->so_proto->pr_flags & PR_CONNREQUIRED) || 328 (error = sodisconnect(so)))) 329 error = EISCONN; 330 else 331 error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, p); 332 splx(s); 333 return (error); 334} 335 336int 337soconnect2(so1, so2) 338 register struct socket *so1; 339 struct socket *so2; 340{ 341 int s = splnet(); 342 int error; 343 344 error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2); 345 splx(s); 346 return (error); 347} 348 349int 350sodisconnect(so) 351 register struct socket *so; 352{ 353 int s = splnet(); 354 int error; 355 356 if ((so->so_state & SS_ISCONNECTED) == 0) { 357 error = ENOTCONN; 358 goto bad; 359 } 360 if (so->so_state & SS_ISDISCONNECTING) { 361 error = EALREADY; 362 goto bad; 363 } 364 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so); 365bad: 366 splx(s); 367 return (error); 368} 369 370#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK) 371/* 372 * Send on a socket. 373 * If send must go all at once and message is larger than 374 * send buffering, then hard error. 375 * Lock against other senders. 376 * If must go all at once and not enough room now, then 377 * inform user that this would block and do nothing. 378 * Otherwise, if nonblocking, send as much as possible. 379 * The data to be sent is described by "uio" if nonzero, 380 * otherwise by the mbuf chain "top" (which must be null 381 * if uio is not). Data provided in mbuf chain must be small 382 * enough to send all at once. 383 * 384 * Returns nonzero on error, timeout or signal; callers 385 * must check for short counts if EINTR/ERESTART are returned. 386 * Data and control buffers are freed on return. 
387 */ 388int 389sosend(so, addr, uio, top, control, flags, p) 390 register struct socket *so; 391 struct sockaddr *addr; 392 struct uio *uio; 393 struct mbuf *top; 394 struct mbuf *control; 395 int flags; 396 struct proc *p; 397{ 398 struct mbuf **mp; 399 register struct mbuf *m; 400 register long space, len, resid; 401 int clen = 0, error, s, dontroute, mlen; 402 int atomic = sosendallatonce(so) || top; 403 404 if (uio) 405 resid = uio->uio_resid; 406 else 407 resid = top->m_pkthdr.len; 408 /* 409 * In theory resid should be unsigned. 410 * However, space must be signed, as it might be less than 0 411 * if we over-committed, and we must use a signed comparison 412 * of space and resid. On the other hand, a negative resid 413 * causes us to loop sending 0-length segments to the protocol. 414 * 415 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM 416 * type sockets since that's an error. 417 */ 418 if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) { 419 error = EINVAL; 420 goto out; 421 } 422 423 dontroute = 424 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 && 425 (so->so_proto->pr_flags & PR_ATOMIC); 426 if (p) 427 p->p_stats->p_ru.ru_msgsnd++; 428 if (control) 429 clen = control->m_len; 430#define snderr(errno) { error = errno; splx(s); goto release; } 431 432restart: 433 error = sblock(&so->so_snd, SBLOCKWAIT(flags)); 434 if (error) 435 goto out; 436 do { 437 s = splnet(); 438 if (so->so_state & SS_CANTSENDMORE) 439 snderr(EPIPE); 440 if (so->so_error) { 441 error = so->so_error; 442 so->so_error = 0; 443 splx(s); 444 goto release; 445 } 446 if ((so->so_state & SS_ISCONNECTED) == 0) { 447 /* 448 * `sendto' and `sendmsg' is allowed on a connection- 449 * based socket if it supports implied connect. 450 * Return ENOTCONN if not connected and no address is 451 * supplied. 
452 */ 453 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && 454 (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { 455 if ((so->so_state & SS_ISCONFIRMING) == 0 && 456 !(resid == 0 && clen != 0)) 457 snderr(ENOTCONN); 458 } else if (addr == 0) 459 snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ? 460 ENOTCONN : EDESTADDRREQ); 461 } 462 space = sbspace(&so->so_snd); 463 if (flags & MSG_OOB) 464 space += 1024; 465 if ((atomic && resid > so->so_snd.sb_hiwat) || 466 clen > so->so_snd.sb_hiwat) 467 snderr(EMSGSIZE); 468 if (space < resid + clen && uio && 469 (atomic || space < so->so_snd.sb_lowat || space < clen)) { 470 if (so->so_state & SS_NBIO) 471 snderr(EWOULDBLOCK); 472 sbunlock(&so->so_snd); 473 error = sbwait(&so->so_snd); 474 splx(s); 475 if (error) 476 goto out; 477 goto restart; 478 } 479 splx(s); 480 mp = ⊤ 481 space -= clen; 482 do { 483 if (uio == NULL) { 484 /* 485 * Data is prepackaged in "top". 486 */ 487 resid = 0; 488 if (flags & MSG_EOR) 489 top->m_flags |= M_EOR; 490 } else do { 491 if (top == 0) { 492 MGETHDR(m, M_WAIT, MT_DATA); 493 mlen = MHLEN; 494 m->m_pkthdr.len = 0; 495 m->m_pkthdr.rcvif = (struct ifnet *)0; 496 } else { 497 MGET(m, M_WAIT, MT_DATA); 498 mlen = MLEN; 499 } 500 if (resid >= MINCLSIZE) { 501 MCLGET(m, M_WAIT); 502 if ((m->m_flags & M_EXT) == 0) 503 goto nopages; 504 mlen = MCLBYTES; 505 len = min(min(mlen, resid), space); 506 } else { 507nopages: 508 len = min(min(mlen, resid), space); 509 /* 510 * For datagram protocols, leave room 511 * for protocol headers in first mbuf. 
512 */ 513 if (atomic && top == 0 && len < mlen) 514 MH_ALIGN(m, len); 515 } 516 space -= len; 517 error = uiomove(mtod(m, caddr_t), (int)len, uio); 518 resid = uio->uio_resid; 519 m->m_len = len; 520 *mp = m; 521 top->m_pkthdr.len += len; 522 if (error) 523 goto release; 524 mp = &m->m_next; 525 if (resid <= 0) { 526 if (flags & MSG_EOR) 527 top->m_flags |= M_EOR; 528 break; 529 } 530 } while (space > 0 && atomic); 531 if (dontroute) 532 so->so_options |= SO_DONTROUTE; 533 s = splnet(); /* XXX */ 534 error = (*so->so_proto->pr_usrreqs->pru_send)(so, 535 (flags & MSG_OOB) ? PRUS_OOB : 536 /* 537 * If the user set MSG_EOF, the protocol 538 * understands this flag and nothing left to 539 * send then use PRU_SEND_EOF instead of PRU_SEND. 540 */ 541 ((flags & MSG_EOF) && 542 (so->so_proto->pr_flags & PR_IMPLOPCL) && 543 (resid <= 0)) ? 544 PRUS_EOF : 545 /* If there is more to send set PRUS_MORETOCOME */ 546 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0, 547 top, addr, control, p); 548 splx(s); 549 if (dontroute) 550 so->so_options &= ~SO_DONTROUTE; 551 clen = 0; 552 control = 0; 553 top = 0; 554 mp = ⊤ 555 if (error) 556 goto release; 557 } while (resid && space > 0); 558 } while (resid); 559 560release: 561 sbunlock(&so->so_snd); 562out: 563 if (top) 564 m_freem(top); 565 if (control) 566 m_freem(control); 567 return (error); 568} 569 570/* 571 * Implement receive operations on a socket. 572 * We depend on the way that records are added to the sockbuf 573 * by sbappend*. In particular, each record (mbufs linked through m_next) 574 * must begin with an address if the protocol so specifies, 575 * followed by an optional mbuf or mbufs containing ancillary data, 576 * and then zero or more mbufs of data. 577 * In order to avoid blocking network interrupts for the entire time here, 578 * we splx() while doing the actual copy to user space. 
579 * Although the sockbuf is locked, new data may still be appended, 580 * and thus we must maintain consistency of the sockbuf during that time. 581 * 582 * The caller may receive the data as a single mbuf chain by supplying 583 * an mbuf **mp0 for use in returning the chain. The uio is then used 584 * only for the count in uio_resid. 585 */ 586int 587soreceive(so, psa, uio, mp0, controlp, flagsp) 588 register struct socket *so; 589 struct sockaddr **psa; 590 struct uio *uio; 591 struct mbuf **mp0; 592 struct mbuf **controlp; 593 int *flagsp; 594{ 595 register struct mbuf *m, **mp; 596 register int flags, len, error, s, offset; 597 struct protosw *pr = so->so_proto; 598 struct mbuf *nextrecord; 599 int moff, type = 0; 600 int orig_resid = uio->uio_resid; 601 602 mp = mp0; 603 if (psa) 604 *psa = 0; 605 if (controlp) 606 *controlp = 0; 607 if (flagsp) 608 flags = *flagsp &~ MSG_EOR; 609 else 610 flags = 0; 611 if (flags & MSG_OOB) { 612 m = m_get(M_WAIT, MT_DATA); 613 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK); 614 if (error) 615 goto bad; 616 do { 617 error = uiomove(mtod(m, caddr_t), 618 (int) min(uio->uio_resid, m->m_len), uio); 619 m = m_free(m); 620 } while (uio->uio_resid && error == 0 && m); 621bad: 622 if (m) 623 m_freem(m); 624 return (error); 625 } 626 if (mp) 627 *mp = (struct mbuf *)0; 628 if (so->so_state & SS_ISCONFIRMING && uio->uio_resid) 629 (*pr->pr_usrreqs->pru_rcvd)(so, 0); 630 631restart: 632 error = sblock(&so->so_rcv, SBLOCKWAIT(flags)); 633 if (error) 634 return (error); 635 s = splnet(); 636 637 m = so->so_rcv.sb_mb; 638 /* 639 * If we have less data than requested, block awaiting more 640 * (subject to any timeout) if: 641 * 1. the current count is less than the low water mark, or 642 * 2. MSG_WAITALL is set, and it is possible to do the entire 643 * receive operation at once if we block (resid <= hiwat). 644 * 3. 
MSG_DONTWAIT is not set 645 * If MSG_WAITALL is set but resid is larger than the receive buffer, 646 * we have to do the receive in sections, and thus risk returning 647 * a short count if a timeout or signal occurs after we start. 648 */ 649 if (m == 0 || (((flags & MSG_DONTWAIT) == 0 && 650 so->so_rcv.sb_cc < uio->uio_resid) && 651 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat || 652 ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) && 653 m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) { 654 KASSERT(m != 0 || !so->so_rcv.sb_cc, ("receive 1")); 655 if (so->so_error) { 656 if (m) 657 goto dontblock; 658 error = so->so_error; 659 if ((flags & MSG_PEEK) == 0) 660 so->so_error = 0; 661 goto release; 662 } 663 if (so->so_state & SS_CANTRCVMORE) { 664 if (m) 665 goto dontblock; 666 else 667 goto release; 668 } 669 for (; m; m = m->m_next) 670 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { 671 m = so->so_rcv.sb_mb; 672 goto dontblock; 673 } 674 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 && 675 (so->so_proto->pr_flags & PR_CONNREQUIRED)) { 676 error = ENOTCONN; 677 goto release; 678 } 679 if (uio->uio_resid == 0) 680 goto release; 681 if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT)) { 682 error = EWOULDBLOCK; 683 goto release; 684 } 685 sbunlock(&so->so_rcv); 686 error = sbwait(&so->so_rcv); 687 splx(s); 688 if (error) 689 return (error); 690 goto restart; 691 } 692dontblock: 693 if (uio->uio_procp) 694 uio->uio_procp->p_stats->p_ru.ru_msgrcv++; 695 nextrecord = m->m_nextpkt; 696 if (pr->pr_flags & PR_ADDR) { 697 KASSERT(m->m_type == MT_SONAME, ("receive 1a")); 698 orig_resid = 0; 699 if (psa) 700 *psa = dup_sockaddr(mtod(m, struct sockaddr *), 701 mp0 == 0); 702 if (flags & MSG_PEEK) { 703 m = m->m_next; 704 } else { 705 sbfree(&so->so_rcv, m); 706 MFREE(m, so->so_rcv.sb_mb); 707 m = so->so_rcv.sb_mb; 708 } 709 } 710 while (m && m->m_type == MT_CONTROL && error == 0) { 711 if (flags & MSG_PEEK) { 712 if (controlp) 713 
*controlp = m_copy(m, 0, m->m_len); 714 m = m->m_next; 715 } else { 716 sbfree(&so->so_rcv, m); 717 if (controlp) { 718 if (pr->pr_domain->dom_externalize && 719 mtod(m, struct cmsghdr *)->cmsg_type == 720 SCM_RIGHTS) 721 error = (*pr->pr_domain->dom_externalize)(m); 722 *controlp = m; 723 so->so_rcv.sb_mb = m->m_next; 724 m->m_next = 0; 725 m = so->so_rcv.sb_mb; 726 } else { 727 MFREE(m, so->so_rcv.sb_mb); 728 m = so->so_rcv.sb_mb; 729 } 730 } 731 if (controlp) { 732 orig_resid = 0; 733 controlp = &(*controlp)->m_next; 734 } 735 } 736 if (m) { 737 if ((flags & MSG_PEEK) == 0) 738 m->m_nextpkt = nextrecord; 739 type = m->m_type; 740 if (type == MT_OOBDATA) 741 flags |= MSG_OOB; 742 } 743 moff = 0; 744 offset = 0; 745 while (m && uio->uio_resid > 0 && error == 0) { 746 if (m->m_type == MT_OOBDATA) { 747 if (type != MT_OOBDATA) 748 break; 749 } else if (type == MT_OOBDATA) 750 break; 751 else 752 KASSERT(m->m_type == MT_DATA || m->m_type == MT_HEADER, 753 ("receive 3")); 754 so->so_state &= ~SS_RCVATMARK; 755 len = uio->uio_resid; 756 if (so->so_oobmark && len > so->so_oobmark - offset) 757 len = so->so_oobmark - offset; 758 if (len > m->m_len - moff) 759 len = m->m_len - moff; 760 /* 761 * If mp is set, just pass back the mbufs. 762 * Otherwise copy them out via the uio, then free. 763 * Sockbuf must be consistent here (points to current mbuf, 764 * it points to next record) when we drop priority; 765 * we must note any additions to the sockbuf when we 766 * block interrupts again. 
767 */ 768 if (mp == 0) { 769 splx(s); 770 error = uiomove(mtod(m, caddr_t) + moff, (int)len, uio); 771 s = splnet(); 772 if (error) 773 goto release; 774 } else 775 uio->uio_resid -= len; 776 if (len == m->m_len - moff) { 777 if (m->m_flags & M_EOR) 778 flags |= MSG_EOR; 779 if (flags & MSG_PEEK) { 780 m = m->m_next; 781 moff = 0; 782 } else { 783 nextrecord = m->m_nextpkt; 784 sbfree(&so->so_rcv, m); 785 if (mp) { 786 *mp = m; 787 mp = &m->m_next; 788 so->so_rcv.sb_mb = m = m->m_next; 789 *mp = (struct mbuf *)0; 790 } else { 791 MFREE(m, so->so_rcv.sb_mb); 792 m = so->so_rcv.sb_mb; 793 } 794 if (m) 795 m->m_nextpkt = nextrecord; 796 } 797 } else { 798 if (flags & MSG_PEEK) 799 moff += len; 800 else { 801 if (mp) 802 *mp = m_copym(m, 0, len, M_WAIT); 803 m->m_data += len; 804 m->m_len -= len; 805 so->so_rcv.sb_cc -= len; 806 } 807 } 808 if (so->so_oobmark) { 809 if ((flags & MSG_PEEK) == 0) { 810 so->so_oobmark -= len; 811 if (so->so_oobmark == 0) { 812 so->so_state |= SS_RCVATMARK; 813 break; 814 } 815 } else { 816 offset += len; 817 if (offset == so->so_oobmark) 818 break; 819 } 820 } 821 if (flags & MSG_EOR) 822 break; 823 /* 824 * If the MSG_WAITALL flag is set (for non-atomic socket), 825 * we must not quit until "uio->uio_resid == 0" or an error 826 * termination. If a signal/timeout occurs, return 827 * with a short count but without error. 828 * Keep sockbuf locked against other readers. 
829 */ 830 while (flags & MSG_WAITALL && m == 0 && uio->uio_resid > 0 && 831 !sosendallatonce(so) && !nextrecord) { 832 if (so->so_error || so->so_state & SS_CANTRCVMORE) 833 break; 834 error = sbwait(&so->so_rcv); 835 if (error) { 836 sbunlock(&so->so_rcv); 837 splx(s); 838 return (0); 839 } 840 m = so->so_rcv.sb_mb; 841 if (m) 842 nextrecord = m->m_nextpkt; 843 } 844 } 845 846 if (m && pr->pr_flags & PR_ATOMIC) { 847 flags |= MSG_TRUNC; 848 if ((flags & MSG_PEEK) == 0) 849 (void) sbdroprecord(&so->so_rcv); 850 } 851 if ((flags & MSG_PEEK) == 0) { 852 if (m == 0) 853 so->so_rcv.sb_mb = nextrecord; 854 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) 855 (*pr->pr_usrreqs->pru_rcvd)(so, flags); 856 } 857 if (orig_resid == uio->uio_resid && orig_resid && 858 (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) { 859 sbunlock(&so->so_rcv); 860 splx(s); 861 goto restart; 862 } 863 864 if (flagsp) 865 *flagsp |= flags; 866release: 867 sbunlock(&so->so_rcv); 868 splx(s); 869 return (error); 870} 871 872int 873soshutdown(so, how) 874 register struct socket *so; 875 register int how; 876{ 877 register struct protosw *pr = so->so_proto; 878 879 how++; 880 if (how & FREAD) 881 sorflush(so); 882 if (how & FWRITE) 883 return ((*pr->pr_usrreqs->pru_shutdown)(so)); 884 return (0); 885} 886 887void 888sorflush(so) 889 register struct socket *so; 890{ 891 register struct sockbuf *sb = &so->so_rcv; 892 register struct protosw *pr = so->so_proto; 893 register int s; 894 struct sockbuf asb; 895 896 sb->sb_flags |= SB_NOINTR; 897 (void) sblock(sb, M_WAITOK); 898 s = splimp(); 899 socantrcvmore(so); 900 sbunlock(sb); 901 asb = *sb; 902 bzero((caddr_t)sb, sizeof (*sb)); 903 splx(s); 904 if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose) 905 (*pr->pr_domain->dom_dispose)(asb.sb_mb); 906 sbrelease(&asb); 907} 908 909/* 910 * Perhaps this routine, and sooptcopyout(), below, ought to come in 911 * an additional variant to handle the case where the option value needs 912 * 
to be some kind of integer, but not a specific size. 913 * In addition to their use here, these functions are also called by the 914 * protocol-level pr_ctloutput() routines. 915 */ 916int 917sooptcopyin(sopt, buf, len, minlen) 918 struct sockopt *sopt; 919 void *buf; 920 size_t len; 921 size_t minlen; 922{ 923 size_t valsize; 924 925 /* 926 * If the user gives us more than we wanted, we ignore it, 927 * but if we don't get the minimum length the caller 928 * wants, we return EINVAL. On success, sopt->sopt_valsize 929 * is set to however much we actually retrieved. 930 */ 931 if ((valsize = sopt->sopt_valsize) < minlen) 932 return EINVAL; 933 if (valsize > len) 934 sopt->sopt_valsize = valsize = len; 935 936 if (sopt->sopt_p != 0) 937 return (copyin(sopt->sopt_val, buf, valsize)); 938 939 bcopy(sopt->sopt_val, buf, valsize); 940 return 0; 941} 942 943int 944sosetopt(so, sopt) 945 struct socket *so; 946 struct sockopt *sopt; 947{ 948 int error, optval; 949 struct linger l; 950 struct timeval tv; 951 short val; 952 953 error = 0; 954 if (sopt->sopt_level != SOL_SOCKET) { 955 if (so->so_proto && so->so_proto->pr_ctloutput) 956 return ((*so->so_proto->pr_ctloutput) 957 (so, sopt)); 958 error = ENOPROTOOPT; 959 } else { 960 switch (sopt->sopt_name) { 961 case SO_LINGER: 962 error = sooptcopyin(sopt, &l, sizeof l, sizeof l); 963 if (error) 964 goto bad; 965 966 so->so_linger = l.l_linger; 967 if (l.l_onoff) 968 so->so_options |= SO_LINGER; 969 else 970 so->so_options &= ~SO_LINGER; 971 break; 972 973 case SO_DEBUG: 974 case SO_KEEPALIVE: 975 case SO_DONTROUTE: 976 case SO_USELOOPBACK: 977 case SO_BROADCAST: 978 case SO_REUSEADDR: 979 case SO_REUSEPORT: 980 case SO_OOBINLINE: 981 case SO_TIMESTAMP: 982 error = sooptcopyin(sopt, &optval, sizeof optval, 983 sizeof optval); 984 if (error) 985 goto bad; 986 if (optval) 987 so->so_options |= sopt->sopt_name; 988 else 989 so->so_options &= ~sopt->sopt_name; 990 break; 991 992 case SO_SNDBUF: 993 case SO_RCVBUF: 994 case 
SO_SNDLOWAT: 995 case SO_RCVLOWAT: 996 error = sooptcopyin(sopt, &optval, sizeof optval, 997 sizeof optval); 998 if (error) 999 goto bad; 1000 1001 /* 1002 * Values < 1 make no sense for any of these 1003 * options, so disallow them. 1004 */ 1005 if (optval < 1) { 1006 error = EINVAL; 1007 goto bad; 1008 } 1009 1010 switch (sopt->sopt_name) { 1011 case SO_SNDBUF: 1012 case SO_RCVBUF: 1013 if (sbreserve(sopt->sopt_name == SO_SNDBUF ? 1014 &so->so_snd : &so->so_rcv, 1015 (u_long) optval) == 0) { 1016 error = ENOBUFS; 1017 goto bad; 1018 } 1019 break; 1020 1021 /* 1022 * Make sure the low-water is never greater than 1023 * the high-water. 1024 */ 1025 case SO_SNDLOWAT: 1026 so->so_snd.sb_lowat = 1027 (optval > so->so_snd.sb_hiwat) ? 1028 so->so_snd.sb_hiwat : optval; 1029 break; 1030 case SO_RCVLOWAT: 1031 so->so_rcv.sb_lowat = 1032 (optval > so->so_rcv.sb_hiwat) ? 1033 so->so_rcv.sb_hiwat : optval; 1034 break; 1035 } 1036 break; 1037 1038 case SO_SNDTIMEO: 1039 case SO_RCVTIMEO: 1040 error = sooptcopyin(sopt, &tv, sizeof tv, 1041 sizeof tv); 1042 if (error) 1043 goto bad; 1044 1045 if (tv.tv_sec > SHRT_MAX / hz - hz) { 1046 error = EDOM; 1047 goto bad; 1048 } 1049 val = tv.tv_sec * hz + tv.tv_usec / tick; 1050 1051 switch (sopt->sopt_name) { 1052 case SO_SNDTIMEO: 1053 so->so_snd.sb_timeo = val; 1054 break; 1055 case SO_RCVTIMEO: 1056 so->so_rcv.sb_timeo = val; 1057 break; 1058 } 1059 break; 1060 1061 default: 1062 error = ENOPROTOOPT; 1063 break; 1064 } 1065 if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) { 1066 (void) ((*so->so_proto->pr_ctloutput) 1067 (so, sopt)); 1068 } 1069 } 1070bad: 1071 return (error); 1072} 1073 1074/* Helper routine for getsockopt */ 1075int 1076sooptcopyout(sopt, buf, len) 1077 struct sockopt *sopt; 1078 void *buf; 1079 size_t len; 1080{ 1081 int error; 1082 size_t valsize; 1083 1084 error = 0; 1085 1086 /* 1087 * Documented get behavior is that we always return a value, 1088 * possibly truncated to fit in the user's buffer. 
1089 * Traditional behavior is that we always tell the user 1090 * precisely how much we copied, rather than something useful 1091 * like the total amount we had available for her. 1092 * Note that this interface is not idempotent; the entire answer must 1093 * generated ahead of time. 1094 */ 1095 valsize = min(len, sopt->sopt_valsize); 1096 sopt->sopt_valsize = valsize; 1097 if (sopt->sopt_val != 0) { 1098 if (sopt->sopt_p != 0) 1099 error = copyout(buf, sopt->sopt_val, valsize); 1100 else 1101 bcopy(buf, sopt->sopt_val, valsize); 1102 } 1103 return error; 1104} 1105 1106int 1107sogetopt(so, sopt) 1108 struct socket *so; 1109 struct sockopt *sopt; 1110{ 1111 int error, optval; 1112 struct linger l; 1113 struct timeval tv; 1114 1115 error = 0; 1116 if (sopt->sopt_level != SOL_SOCKET) { 1117 if (so->so_proto && so->so_proto->pr_ctloutput) { 1118 return ((*so->so_proto->pr_ctloutput) 1119 (so, sopt)); 1120 } else 1121 return (ENOPROTOOPT); 1122 } else { 1123 switch (sopt->sopt_name) { 1124 case SO_LINGER: 1125 l.l_onoff = so->so_options & SO_LINGER; 1126 l.l_linger = so->so_linger; 1127 error = sooptcopyout(sopt, &l, sizeof l); 1128 break; 1129 1130 case SO_USELOOPBACK: 1131 case SO_DONTROUTE: 1132 case SO_DEBUG: 1133 case SO_KEEPALIVE: 1134 case SO_REUSEADDR: 1135 case SO_REUSEPORT: 1136 case SO_BROADCAST: 1137 case SO_OOBINLINE: 1138 case SO_TIMESTAMP: 1139 optval = so->so_options & sopt->sopt_name; 1140integer: 1141 error = sooptcopyout(sopt, &optval, sizeof optval); 1142 break; 1143 1144 case SO_TYPE: 1145 optval = so->so_type; 1146 goto integer; 1147 1148 case SO_ERROR: 1149 optval = so->so_error; 1150 so->so_error = 0; 1151 goto integer; 1152 1153 case SO_SNDBUF: 1154 optval = so->so_snd.sb_hiwat; 1155 goto integer; 1156 1157 case SO_RCVBUF: 1158 optval = so->so_rcv.sb_hiwat; 1159 goto integer; 1160 1161 case SO_SNDLOWAT: 1162 optval = so->so_snd.sb_lowat; 1163 goto integer; 1164 1165 case SO_RCVLOWAT: 1166 optval = so->so_rcv.sb_lowat; 1167 goto integer; 1168 
1169 case SO_SNDTIMEO: 1170 case SO_RCVTIMEO: 1171 optval = (sopt->sopt_name == SO_SNDTIMEO ? 1172 so->so_snd.sb_timeo : so->so_rcv.sb_timeo); 1173 1174 tv.tv_sec = optval / hz; 1175 tv.tv_usec = (optval % hz) * tick; 1176 error = sooptcopyout(sopt, &tv, sizeof tv); 1177 break; 1178 1179 default: 1180 error = ENOPROTOOPT; 1181 break; 1182 } 1183 return (error); 1184 } 1185} 1186 1187void 1188sohasoutofband(so) 1189 register struct socket *so; 1190{ 1191 if (so->so_sigio != NULL) 1192 pgsigio(so->so_sigio, SIGURG, 0); 1193 selwakeup(&so->so_rcv.sb_sel); 1194} 1195 1196int 1197sopoll(struct socket *so, int events, struct ucred *cred, struct proc *p) 1198{ 1199 int revents = 0; 1200 int s = splnet(); 1201 1202 if (events & (POLLIN | POLLRDNORM)) 1203 if (soreadable(so)) 1204 revents |= events & (POLLIN | POLLRDNORM); 1205 1206 if (events & (POLLOUT | POLLWRNORM)) 1207 if (sowriteable(so)) 1208 revents |= events & (POLLOUT | POLLWRNORM); 1209 1210 if (events & (POLLPRI | POLLRDBAND)) 1211 if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) 1212 revents |= events & (POLLPRI | POLLRDBAND); 1213 1214 if (revents == 0) { 1215 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) { 1216 selrecord(p, &so->so_rcv.sb_sel); 1217 so->so_rcv.sb_flags |= SB_SEL; 1218 } 1219 1220 if (events & (POLLOUT | POLLWRNORM)) { 1221 selrecord(p, &so->so_snd.sb_sel); 1222 so->so_snd.sb_flags |= SB_SEL; 1223 } 1224 } 1225 1226 splx(s); 1227 return (revents); 1228} 1229