uipc_socket2.c revision 1.77
1/* $NetBSD: uipc_socket2.c,v 1.77 2006/08/16 18:31:54 plunky Exp $ */ 2 3/* 4 * Copyright (c) 1982, 1986, 1988, 1990, 1993 5 * The Regents of the University of California. All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 3. Neither the name of the University nor the names of its contributors 16 * may be used to endorse or promote products derived from this software 17 * without specific prior written permission. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 
30 * 31 * @(#)uipc_socket2.c 8.2 (Berkeley) 2/14/95 32 */ 33 34#include <sys/cdefs.h> 35__KERNEL_RCSID(0, "$NetBSD: uipc_socket2.c,v 1.77 2006/08/16 18:31:54 plunky Exp $"); 36 37#include "opt_mbuftrace.h" 38#include "opt_sb_max.h" 39 40#include <sys/param.h> 41#include <sys/systm.h> 42#include <sys/proc.h> 43#include <sys/file.h> 44#include <sys/buf.h> 45#include <sys/malloc.h> 46#include <sys/mbuf.h> 47#include <sys/protosw.h> 48#include <sys/poll.h> 49#include <sys/socket.h> 50#include <sys/socketvar.h> 51#include <sys/signalvar.h> 52#include <sys/kauth.h> 53 54/* 55 * Primitive routines for operating on sockets and socket buffers 56 */ 57 58/* strings for sleep message: */ 59const char netcon[] = "netcon"; 60const char netcls[] = "netcls"; 61const char netio[] = "netio"; 62const char netlck[] = "netlck"; 63 64u_long sb_max = SB_MAX; /* maximum socket buffer size */ 65static u_long sb_max_adj; /* adjusted sb_max */ 66 67/* 68 * Procedures to manipulate state flags of socket 69 * and do appropriate wakeups. Normal sequence from the 70 * active (originating) side is that soisconnecting() is 71 * called during processing of connect() call, 72 * resulting in an eventual call to soisconnected() if/when the 73 * connection is established. When the connection is torn down 74 * soisdisconnecting() is called during processing of disconnect() call, 75 * and soisdisconnected() is called when the connection to the peer 76 * is totally severed. The semantics of these routines are such that 77 * connectionless protocols can call soisconnected() and soisdisconnected() 78 * only, bypassing the in-progress calls when setting up a ``connection'' 79 * takes no time. 80 * 81 * From the passive side, a socket is created with 82 * two queues of sockets: so_q0 for connections in progress 83 * and so_q for connections already made and awaiting user acceptance. 
84 * As a protocol is preparing incoming connections, it creates a socket 85 * structure queued on so_q0 by calling sonewconn(). When the connection 86 * is established, soisconnected() is called, and transfers the 87 * socket structure to so_q, making it available to accept(). 88 * 89 * If a socket is closed with sockets on either 90 * so_q0 or so_q, these sockets are dropped. 91 * 92 * If higher level protocols are implemented in 93 * the kernel, the wakeups done here will sometimes 94 * cause software-interrupt process scheduling. 95 */ 96 97void 98soisconnecting(struct socket *so) 99{ 100 101 so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING); 102 so->so_state |= SS_ISCONNECTING; 103} 104 105void 106soisconnected(struct socket *so) 107{ 108 struct socket *head; 109 110 head = so->so_head; 111 so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING); 112 so->so_state |= SS_ISCONNECTED; 113 if (head && soqremque(so, 0)) { 114 soqinsque(head, so, 1); 115 sorwakeup(head); 116 wakeup((caddr_t)&head->so_timeo); 117 } else { 118 wakeup((caddr_t)&so->so_timeo); 119 sorwakeup(so); 120 sowwakeup(so); 121 } 122} 123 124void 125soisdisconnecting(struct socket *so) 126{ 127 128 so->so_state &= ~SS_ISCONNECTING; 129 so->so_state |= (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE); 130 wakeup((caddr_t)&so->so_timeo); 131 sowwakeup(so); 132 sorwakeup(so); 133} 134 135void 136soisdisconnected(struct socket *so) 137{ 138 139 so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); 140 so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_ISDISCONNECTED); 141 wakeup((caddr_t)&so->so_timeo); 142 sowwakeup(so); 143 sorwakeup(so); 144} 145 146/* 147 * When an attempt at a new connection is noted on a socket 148 * which accepts connections, sonewconn is called. If the 149 * connection is possible (subject to space constraints, etc.) 
 * then we allocate a new structure, properly linked into the
 * data structure of the original socket, and return this.
 * Connstatus may be 0, SS_ISCONFIRMING, or SS_ISCONNECTED.
 */
struct socket *
sonewconn(struct socket *head, int connstatus)
{
	struct socket *so;
	int soqueue;

	/* A non-zero connstatus means the connection is already complete. */
	soqueue = connstatus ? 1 : 0;
	/*
	 * Enforce the listen backlog: embryonic connections on so_q0
	 * also count, against a 3/2 * so_qlimit fudge factor.
	 */
	if (head->so_qlen + head->so_q0len > 3 * head->so_qlimit / 2)
		return ((struct socket *)0);
	so = pool_get(&socket_pool, PR_NOWAIT);
	if (so == NULL)
		return (NULL);
	memset((caddr_t)so, 0, sizeof(*so));
	/* Inherit the listening socket's properties. */
	so->so_type = head->so_type;
	so->so_options = head->so_options &~ SO_ACCEPTCONN;
	so->so_linger = head->so_linger;
	so->so_state = head->so_state | SS_NOFDREF;
	so->so_proto = head->so_proto;
	so->so_timeo = head->so_timeo;
	so->so_pgid = head->so_pgid;
	so->so_send = head->so_send;
	so->so_receive = head->so_receive;
	so->so_uidinfo = head->so_uidinfo;
#ifdef MBUFTRACE
	so->so_mowner = head->so_mowner;
	so->so_rcv.sb_mowner = head->so_rcv.sb_mowner;
	so->so_snd.sb_mowner = head->so_snd.sb_mowner;
#endif
	(void) soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat);
	soqinsque(head, so, soqueue);
	/* Let the protocol attach; undo the queue insert on failure. */
	if ((*so->so_proto->pr_usrreq)(so, PRU_ATTACH,
	    (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0,
	    (struct lwp *)0)) {
		(void) soqremque(so, soqueue);
		pool_put(&socket_pool, so);
		return (NULL);
	}
	if (connstatus) {
		/* Already connected: notify accept()ers on the listener. */
		sorwakeup(head);
		wakeup((caddr_t)&head->so_timeo);
		so->so_state |= connstatus;
	}
	return (so);
}

/*
 * Insert socket "so" on listener "head"'s incomplete (q == 0, so_q0)
 * or completed (q != 0, so_q) connection queue.
 */
void
soqinsque(struct socket *head, struct socket *so, int q)
{

#ifdef DIAGNOSTIC
	if (so->so_onq != NULL)
		panic("soqinsque");
#endif

	so->so_head = head;
	if (q == 0) {
		head->so_q0len++;
		so->so_onq = &head->so_q0;
	} else {
		head->so_qlen++;
		so->so_onq = &head->so_q;
	}
	TAILQ_INSERT_TAIL(so->so_onq, so, so_qe);
}

int
soqremque(struct socket *so, int q)
{
	struct socket *head;

	head = so->so_head;
	/* Refuse if the socket is not on the queue the caller named. */
	if (q == 0) {
		if (so->so_onq != &head->so_q0)
			return (0);
		head->so_q0len--;
	} else {
		if (so->so_onq != &head->so_q)
			return (0);
		head->so_qlen--;
	}
	TAILQ_REMOVE(so->so_onq, so, so_qe);
	so->so_onq = NULL;
	so->so_head = NULL;
	return (1);
}

/*
 * Socantsendmore indicates that no more data will be sent on the
 * socket; it would normally be applied to a socket when the user
 * informs the system that no more data is to be sent, by the protocol
 * code (in case PRU_SHUTDOWN).  Socantrcvmore indicates that no more data
 * will be received, and will normally be applied to the socket by a
 * protocol when it detects that the peer will send no more data.
 * Data queued for reading in the socket may yet be read.
 */

void
socantsendmore(struct socket *so)
{

	so->so_state |= SS_CANTSENDMORE;
	/* Wake writers so they observe the shutdown. */
	sowwakeup(so);
}

void
socantrcvmore(struct socket *so)
{

	so->so_state |= SS_CANTRCVMORE;
	/* Wake readers; already-queued data may still be consumed. */
	sorwakeup(so);
}

/*
 * Wait for data to arrive at/drain from a socket buffer.
 * Returns the tsleep() result; the sleep is signal-interruptible
 * unless SB_NOINTR is set, and bounded by the buffer's sb_timeo.
 */
int
sbwait(struct sockbuf *sb)
{

	sb->sb_flags |= SB_WAIT;
	return (tsleep((caddr_t)&sb->sb_cc,
	    (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, netio,
	    sb->sb_timeo));
}

/*
 * Lock a sockbuf already known to be locked;
 * return any error returned from sleep (EINTR).
 */
int
sb_lock(struct sockbuf *sb)
{
	int error;

	/* Sleep until the current holder drops SB_LOCK. */
	while (sb->sb_flags & SB_LOCK) {
		sb->sb_flags |= SB_WANT;
		error = tsleep((caddr_t)&sb->sb_flags,
		    (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK|PCATCH,
		    netlck, 0);
		if (error)
			return (error);
	}
	sb->sb_flags |= SB_LOCK;
	return (0);
}

/*
 * Wakeup processes waiting on a socket buffer.
 * Do asynchronous notification via SIGIO
 * if the socket buffer has the SB_ASYNC flag set.
 */
void
sowakeup(struct socket *so, struct sockbuf *sb, int code)
{
	/* Notify poll/select waiters first. */
	selnotify(&sb->sb_sel, 0);
	sb->sb_flags &= ~SB_SEL;
	if (sb->sb_flags & SB_WAIT) {
		sb->sb_flags &= ~SB_WAIT;
		wakeup((caddr_t)&sb->sb_cc);
	}
	if (sb->sb_flags & SB_ASYNC) {
		int band;
		/* code is POLL_IN for a readable event, POLL_OUT otherwise. */
		if (code == POLL_IN)
			band = POLLIN|POLLRDNORM;
		else
			band = POLLOUT|POLLWRNORM;
		fownsignal(so->so_pgid, SIGIO, code, band, so);
	}
	/* In-kernel consumers may have registered an upcall. */
	if (sb->sb_flags & SB_UPCALL)
		(*so->so_upcall)(so, so->so_upcallarg, M_DONTWAIT);
}

/*
 * Socket buffer (struct sockbuf) utility routines.
 *
 * Each socket contains two socket buffers: one for sending data and
 * one for receiving data.  Each buffer contains a queue of mbufs,
 * information about the number of mbufs and amount of data in the
 * queue, and other fields allowing poll() statements and notification
 * on data availability to be implemented.
 *
 * Data stored in a socket buffer is maintained as a list of records.
 * Each record is a list of mbufs chained together with the m_next
 * field.  Records are chained together with the m_nextpkt field. The upper
 * level routine soreceive() expects the following conventions to be
 * observed when placing information in the receive buffer:
 *
 * 1. If the protocol requires each message be preceded by the sender's
 *    name, then a record containing that name must be present before
 *    any associated data (mbuf's must be of type MT_SONAME).
 * 2. If the protocol supports the exchange of ``access rights'' (really
 *    just additional data associated with the message), and there are
 *    ``rights'' to be received, then a record containing this data
 *    should be present (mbuf's must be of type MT_CONTROL).
 * 3.
If a name or rights record exists, then it must be followed by 349 * a data record, perhaps of zero length. 350 * 351 * Before using a new socket structure it is first necessary to reserve 352 * buffer space to the socket, by calling sbreserve(). This should commit 353 * some of the available buffer space in the system buffer pool for the 354 * socket (currently, it does nothing but enforce limits). The space 355 * should be released by calling sbrelease() when the socket is destroyed. 356 */ 357 358int 359sb_max_set(u_long new_sbmax) 360{ 361 int s; 362 363 if (new_sbmax < (16 * 1024)) 364 return (EINVAL); 365 366 s = splsoftnet(); 367 sb_max = new_sbmax; 368 sb_max_adj = (u_quad_t)new_sbmax * MCLBYTES / (MSIZE + MCLBYTES); 369 splx(s); 370 371 return (0); 372} 373 374int 375soreserve(struct socket *so, u_long sndcc, u_long rcvcc) 376{ 377 /* 378 * there's at least one application (a configure script of screen) 379 * which expects a fifo is writable even if it has "some" bytes 380 * in its buffer. 381 * so we want to make sure (hiwat - lowat) >= (some bytes). 382 * 383 * PIPE_BUF here is an arbitrary value chosen as (some bytes) above. 384 * we expect it's large enough for such applications. 385 */ 386 u_long lowat = MAX(sock_loan_thresh, MCLBYTES); 387 u_long hiwat = lowat + PIPE_BUF; 388 389 if (sndcc < hiwat) 390 sndcc = hiwat; 391 if (sbreserve(&so->so_snd, sndcc, so) == 0) 392 goto bad; 393 if (sbreserve(&so->so_rcv, rcvcc, so) == 0) 394 goto bad2; 395 if (so->so_rcv.sb_lowat == 0) 396 so->so_rcv.sb_lowat = 1; 397 if (so->so_snd.sb_lowat == 0) 398 so->so_snd.sb_lowat = lowat; 399 if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat) 400 so->so_snd.sb_lowat = so->so_snd.sb_hiwat; 401 return (0); 402 bad2: 403 sbrelease(&so->so_snd, so); 404 bad: 405 return (ENOBUFS); 406} 407 408/* 409 * Allot mbufs to a sockbuf. 410 * Attempt to scale mbmax so that mbcnt doesn't become limiting 411 * if buffering efficiency is near the normal case. 
 */
int
sbreserve(struct sockbuf *sb, u_long cc, struct socket *so)
{
	struct lwp *l = curlwp; /* XXX */
	rlim_t maxcc;
	struct uidinfo *uidinfo;

	KDASSERT(sb_max_adj != 0);
	/* Reject zero and anything beyond the adjusted global limit. */
	if (cc == 0 || cc > sb_max_adj)
		return (0);
	if (so) {
		/*
		 * Charge the reservation against the owner's
		 * RLIMIT_SBSIZE, but only when the current lwp runs
		 * with the socket owner's euid.
		 */
		if (l && kauth_cred_geteuid(l->l_cred) == so->so_uidinfo->ui_uid)
			maxcc = l->l_proc->p_rlimit[RLIMIT_SBSIZE].rlim_cur;
		else
			maxcc = RLIM_INFINITY;
		uidinfo = so->so_uidinfo;
	} else {
		uidinfo = uid_find(0);	/* XXX: nothing better */
		maxcc = RLIM_INFINITY;
	}
	if (!chgsbsize(uidinfo, &sb->sb_hiwat, cc, maxcc))
		return 0;
	sb->sb_mbmax = min(cc * 2, sb_max);
	if (sb->sb_lowat > sb->sb_hiwat)
		sb->sb_lowat = sb->sb_hiwat;
	return (1);
}

/*
 * Free mbufs held by a socket, and reserved mbuf space.
 */
void
sbrelease(struct sockbuf *sb, struct socket *so)
{

	sbflush(sb);
	/* Return the whole hiwat reservation to the owner's accounting. */
	(void)chgsbsize(so->so_uidinfo, &sb->sb_hiwat, 0,
	    RLIM_INFINITY);
	sb->sb_mbmax = 0;
}

/*
 * Routines to add and remove
 * data from an mbuf queue.
 *
 * The routines sbappend() or sbappendrecord() are normally called to
 * append new mbufs to a socket buffer, after checking that adequate
 * space is available, comparing the function sbspace() with the amount
 * of data to be added.  sbappendrecord() differs from sbappend() in
 * that data supplied is treated as the beginning of a new record.
 * To place a sender's address, optional access rights, and data in a
 * socket receive buffer, sbappendaddr() should be used.  To place
 * access rights and data in a socket receive buffer, sbappendrights()
 * should be used.  In either case, the new data begins a new record.
 * Note that unlike sbappend() and sbappendrecord(), these routines check
 * for the caller that there will be enough space to store the data.
 * Each fails if there is not enough space, or if it cannot find mbufs
 * to store additional information in.
 *
 * Reliable protocols may use the socket send buffer to hold data
 * awaiting acknowledgement.  Data is normally copied from a socket
 * send buffer in a protocol with m_copy for output to a peer,
 * and then removing the data from the socket buffer with sbdrop()
 * or sbdroprecord() when the data is acknowledged by the peer.
 */

#ifdef SOCKBUF_DEBUG
/*
 * Debug check: verify that sb_lastrecord points at the last record
 * (last mbuf on the m_nextpkt chain); panic with a dump of the
 * packet chain otherwise.
 */
void
sblastrecordchk(struct sockbuf *sb, const char *where)
{
	struct mbuf *m = sb->sb_mb;

	while (m && m->m_nextpkt)
		m = m->m_nextpkt;

	if (m != sb->sb_lastrecord) {
		printf("sblastrecordchk: sb_mb %p sb_lastrecord %p last %p\n",
		    sb->sb_mb, sb->sb_lastrecord, m);
		printf("packet chain:\n");
		for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt)
			printf("\t%p\n", m);
		panic("sblastrecordchk from %s", where);
	}
}

/*
 * Debug check: verify that sb_mbtail points at the last mbuf of the
 * last record; panic with a dump of the whole mbuf tree otherwise.
 */
void
sblastmbufchk(struct sockbuf *sb, const char *where)
{
	struct mbuf *m = sb->sb_mb;
	struct mbuf *n;

	while (m && m->m_nextpkt)
		m = m->m_nextpkt;

	while (m && m->m_next)
		m = m->m_next;

	if (m != sb->sb_mbtail) {
		printf("sblastmbufchk: sb_mb %p sb_mbtail %p last %p\n",
		    sb->sb_mb, sb->sb_mbtail, m);
		printf("packet tree:\n");
		for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) {
			printf("\t");
			for (n = m; n != NULL; n = n->m_next)
				printf("%p ", n);
			printf("\n");
		}
		panic("sblastmbufchk from %s", where);
	}
}
#endif /* SOCKBUF_DEBUG */

/*
 * Link a chain of records onto a socket buffer
 * (m0 is the first record of the chain, mlast the last).
 */
#define	SBLINKRECORDCHAIN(sb, m0, mlast)				\
do {									\
	if ((sb)->sb_lastrecord != NULL)				\
		(sb)->sb_lastrecord->m_nextpkt = (m0);			\
	else								\
		(sb)->sb_mb = (m0);					\
	(sb)->sb_lastrecord = (mlast);					\
} while (/*CONSTCOND*/0)


#define	SBLINKRECORD(sb, m0)						\
	SBLINKRECORDCHAIN(sb, m0, m0)

/*
 * Append mbuf chain m to the last record in the
 * socket buffer sb.  The additional space associated
 * the mbuf chain is recorded in sb.  Empty mbufs are
 * discarded and mbufs are compacted where possible.
 */
void
sbappend(struct sockbuf *sb, struct mbuf *m)
{
	struct mbuf *n;

	if (m == 0)
		return;

#ifdef MBUFTRACE
	m_claimm(m, sb->sb_mowner);
#endif

	SBLASTRECORDCHK(sb, "sbappend 1");

	if ((n = sb->sb_lastrecord) != NULL) {
		/*
		 * XXX Would like to simply use sb_mbtail here, but
		 * XXX I need to verify that I won't miss an EOR that
		 * XXX way.
		 */
		do {
			if (n->m_flags & M_EOR) {
				/* Last record is terminated: start a new one. */
				sbappendrecord(sb, m); /* XXXXXX!!!! */
				return;
			}
		} while (n->m_next && (n = n->m_next));
	} else {
		/*
		 * If this is the first record in the socket buffer, it's
		 * also the last record.
		 */
		sb->sb_lastrecord = m;
	}
	sbcompress(sb, m, n);
	SBLASTRECORDCHK(sb, "sbappend 2");
}

/*
 * This version of sbappend() should only be used when the caller
 * absolutely knows that there will never be more than one record
 * in the socket buffer, that is, a stream protocol (such as TCP).
 */
void
sbappendstream(struct sockbuf *sb, struct mbuf *m)
{

	KDASSERT(m->m_nextpkt == NULL);
	KASSERT(sb->sb_mb == sb->sb_lastrecord);

	SBLASTMBUFCHK(sb, __func__);

#ifdef MBUFTRACE
	m_claimm(m, sb->sb_mowner);
#endif

	/* Append directly after the last mbuf; no record-boundary logic. */
	sbcompress(sb, m, sb->sb_mbtail);

	sb->sb_lastrecord = sb->sb_mb;
	SBLASTRECORDCHK(sb, __func__);
}

#ifdef SOCKBUF_DEBUG
/*
 * Debug check: recompute the byte and mbuf-storage counts of the
 * (single-record) buffer and panic if they disagree with
 * sb_cc/sb_mbcnt.
 */
void
sbcheck(struct sockbuf *sb)
{
	struct mbuf *m;
	u_long len, mbcnt;

	len = 0;
	mbcnt = 0;
	for (m = sb->sb_mb; m; m = m->m_next) {
		len += m->m_len;
		mbcnt += MSIZE;
		if (m->m_flags & M_EXT)
			mbcnt += m->m_ext.ext_size;
		if (m->m_nextpkt)
			panic("sbcheck nextpkt");
	}
	if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
		printf("cc %lu != %lu || mbcnt %lu != %lu\n", len, sb->sb_cc,
		    mbcnt, sb->sb_mbcnt);
		panic("sbcheck");
	}
}
#endif

/*
 * As above, except the mbuf chain
 * begins a new record.
 */
void
sbappendrecord(struct sockbuf *sb, struct mbuf *m0)
{
	struct mbuf *m;

	if (m0 == 0)
		return;

#ifdef MBUFTRACE
	m_claimm(m0, sb->sb_mowner);
#endif
	/*
	 * Put the first mbuf on the queue.
	 * Note this permits zero length records.
	 */
	sballoc(sb, m0);
	SBLASTRECORDCHK(sb, "sbappendrecord 1");
	SBLINKRECORD(sb, m0);
	m = m0->m_next;
	m0->m_next = 0;
	/* If m0 carried M_EOR, move the mark to the rest of the chain. */
	if (m && (m0->m_flags & M_EOR)) {
		m0->m_flags &= ~M_EOR;
		m->m_flags |= M_EOR;
	}
	sbcompress(sb, m, m0);
	SBLASTRECORDCHK(sb, "sbappendrecord 2");
}

/*
 * As above except that OOB data
 * is inserted at the beginning of the sockbuf,
 * but after any other OOB data.
 */
void
sbinsertoob(struct sockbuf *sb, struct mbuf *m0)
{
	struct mbuf *m, **mp;

	if (m0 == 0)
		return;

	SBLASTRECORDCHK(sb, "sbinsertoob 1");

	/*
	 * Skip over any leading OOB records (and any control mbufs
	 * heading them) to find the insertion point.
	 */
	for (mp = &sb->sb_mb; (m = *mp) != NULL; mp = &((*mp)->m_nextpkt)) {
	    again:
		switch (m->m_type) {

		case MT_OOBDATA:
			continue;		/* WANT next train */

		case MT_CONTROL:
			if ((m = m->m_next) != NULL)
				goto again;	/* inspect THIS train further */
		}
		break;
	}
	/*
	 * Put the first mbuf on the queue.
	 * Note this permits zero length records.
	 */
	sballoc(sb, m0);
	m0->m_nextpkt = *mp;
	if (*mp == NULL) {
		/* m0 is actually the new tail */
		sb->sb_lastrecord = m0;
	}
	*mp = m0;
	m = m0->m_next;
	m0->m_next = 0;
	/* Move a record mark on m0 onto the remainder of the chain. */
	if (m && (m0->m_flags & M_EOR)) {
		m0->m_flags &= ~M_EOR;
		m->m_flags |= M_EOR;
	}
	sbcompress(sb, m, m0);
	SBLASTRECORDCHK(sb, "sbinsertoob 2");
}

/*
 * Append address and data, and optionally, control (ancillary) data
 * to the receive queue of a socket.  If present,
 * m0 must include a packet header with total length.
 * Returns 0 if no space in sockbuf or insufficient mbufs.
 */
int
sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0,
	struct mbuf *control)
{
	struct mbuf *m, *n, *nlast;
	int space, len;

	space = asa->sa_len;

	if (m0 != NULL) {
		if ((m0->m_flags & M_PKTHDR) == 0)
			panic("sbappendaddr");
		space += m0->m_pkthdr.len;
#ifdef MBUFTRACE
		m_claimm(m0, sb->sb_mowner);
#endif
	}
	for (n = control; n; n = n->m_next) {
		space += n->m_len;
		MCLAIM(n, sb->sb_mowner);
		if (n->m_next == 0)	/* keep pointer to last control buf */
			break;
	}
	if (space > sbspace(sb))
		return (0);
	MGET(m, M_DONTWAIT, MT_SONAME);
	if (m == 0)
		return (0);
	MCLAIM(m, sb->sb_mowner);
	/*
	 * XXX avoid 'comparison always true' warning which isn't easily
	 * avoided.
	 */
	len = asa->sa_len;
	if (len > MLEN) {
		/* Address doesn't fit a plain mbuf: attach ext storage. */
		MEXTMALLOC(m, asa->sa_len, M_NOWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return (0);
		}
	}
	m->m_len = asa->sa_len;
	memcpy(mtod(m, caddr_t), asa, asa->sa_len);
	if (n)
		n->m_next = m0;		/* concatenate data to control */
	else
		control = m0;
	m->m_next = control;

	SBLASTRECORDCHK(sb, "sbappendaddr 1");

	/* Account for every mbuf in the new (name, control, data) record. */
	for (n = m; n->m_next != NULL; n = n->m_next)
		sballoc(sb, n);
	sballoc(sb, n);
	nlast = n;
	SBLINKRECORD(sb, m);

	sb->sb_mbtail = nlast;
	SBLASTMBUFCHK(sb, "sbappendaddr");

	SBLASTRECORDCHK(sb, "sbappendaddr 2");

	return (1);
}

/*
 * Helper for sbappendchainaddr: prepend a struct sockaddr* to
 * an mbuf chain.
 */
static inline struct mbuf *
m_prepend_sockaddr(struct sockbuf *sb, struct mbuf *m0,
	const struct sockaddr *asa)
{
	struct mbuf *m;
	const int salen = asa->sa_len;

	/* only the first in each chain need be a pkthdr */
	MGETHDR(m, M_DONTWAIT, MT_SONAME);
	if (m == 0)
		return (0);
	MCLAIM(m, sb->sb_mowner);
#ifdef notyet
	if (salen > MHLEN) {
		MEXTMALLOC(m, salen, M_NOWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return (0);
		}
	}
#else
	KASSERT(salen <= MHLEN);
#endif
	m->m_len = salen;
	memcpy(mtod(m, caddr_t), asa, salen);
	m->m_next = m0;
	m->m_pkthdr.len = salen + m0->m_pkthdr.len;

	return m;
}

/*
 * Append a chain of records (packet-header mbuf chains linked by
 * m_nextpkt in m0), prefixing each record with a copy of "asa", to
 * the socket buffer.  Returns 1 on success, 0 on failure; on failure
 * the caller still owns (and must free) the input chain m0.
 */
int
sbappendaddrchain(struct sockbuf *sb, const struct sockaddr *asa,
		  struct mbuf *m0, int sbprio)
{
	int space;
	struct mbuf *m, *n, *n0, *nlast;
	int error;

	/*
	 * XXX sbprio reserved for encoding priority of this request:
	 *  SB_PRIO_NONE --> honour normal sb limits
	 *  SB_PRIO_ONESHOT_OVERFLOW --> if socket has any space,
	 *	take whole chain. Intended for large requests
	 *	that should be delivered atomically (all, or none).
	 *  SB_PRIO_OVERDRAFT -- allow a small (2*MLEN) overflow
	 *	over normal socket limits, for messages indicating
	 *	buffer overflow in earlier normal/lower-priority messages
	 *  SB_PRIO_BESTEFFORT --> ignore limits entirely.
	 *	Intended for kernel-generated messages only.
	 *	Up to generator to avoid total mbuf resource exhaustion.
	 */
	(void)sbprio;

	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
		panic("sbappendaddrchain");

	space = sbspace(sb);

#ifdef notyet
	/*
	 * Enforce SB_PRIO_* limits as described above.
	 */
#endif

	n0 = NULL;
	nlast = NULL;
	for (m = m0; m; m = m->m_nextpkt) {
		struct mbuf *np;

#ifdef MBUFTRACE
		m_claimm(m, sb->sb_mowner);
#endif

		/* Prepend sockaddr to this record (m) of input chain m0 */
		n = m_prepend_sockaddr(sb, m, asa);
		if (n == NULL) {
			error = ENOBUFS;
			goto bad;
		}

		/* Append record (asa+m) to end of new chain n0 */
		if (n0 == NULL) {
			n0 = n;
		} else {
			nlast->m_nextpkt = n;
		}
		/* Keep track of last record on new chain */
		nlast = n;

		for (np = n; np; np = np->m_next)
			sballoc(sb, np);
	}

	SBLASTRECORDCHK(sb, "sbappendaddrchain 1");

	/* Drop the entire chain of (asa+m) records onto the socket */
	SBLINKRECORDCHAIN(sb, n0, nlast);

	SBLASTRECORDCHK(sb, "sbappendaddrchain 2");

	/* Walk to the last mbuf of the last record to fix sb_mbtail. */
	for (m = nlast; m->m_next; m = m->m_next)
		;
	sb->sb_mbtail = m;
	SBLASTMBUFCHK(sb, "sbappendaddrchain");

	return (1);

bad:
	/*
	 * On error, free the prepended addresses. For consistency
	 * with sbappendaddr(), leave it to our caller to free
	 * the input record chain passed to us as m0.
	 */
	while ((n = n0) != NULL) {
		struct mbuf *np;

		/* Undo the sballoc() of this record */
		for (np = n; np; np = np->m_next)
			sbfree(sb, np);

		n0 = n->m_nextpkt;	/* iterate at next prepended address */
		MFREE(n, np);		/* free prepended address (not data) */
	}
	return 0;
}


/*
 * Append control data (which must be non-NULL), plus optional data
 * m0, as a single new record.  Returns 0 if there is no room,
 * 1 on success.
 */
int
sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control)
{
	struct mbuf *m, *mlast, *n;
	int space;

	space = 0;
	if (control == 0)
		panic("sbappendcontrol");
	/* Total the control chain and remember its last mbuf. */
	for (m = control; ; m = m->m_next) {
		space += m->m_len;
		MCLAIM(m, sb->sb_mowner);
		if (m->m_next == 0)
			break;
	}
	n = m;			/* save pointer to last control buffer */
	for (m = m0; m; m = m->m_next) {
		MCLAIM(m, sb->sb_mowner);
		space += m->m_len;
	}
	if (space > sbspace(sb))
		return (0);
	n->m_next = m0;			/* concatenate data to control */

	SBLASTRECORDCHK(sb, "sbappendcontrol 1");

	/* Account for every mbuf of the combined record. */
	for (m = control; m->m_next != NULL; m = m->m_next)
		sballoc(sb, m);
	sballoc(sb, m);
	mlast = m;
	SBLINKRECORD(sb, control);

	sb->sb_mbtail = mlast;
	SBLASTMBUFCHK(sb, "sbappendcontrol");

	SBLASTRECORDCHK(sb, "sbappendcontrol 2");

	return (1);
}

/*
 * Compress mbuf chain m into the socket
 * buffer sb following mbuf n.  If n
 * is null, the buffer is presumed empty.
 */
void
sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n)
{
	int eor;
	struct mbuf *o;

	eor = 0;
	while (m) {
		eor |= m->m_flags & M_EOR;
		/*
		 * Free an empty mbuf, unless it carries the only copy
		 * of an EOR mark that has no same-type successor to
		 * move onto.
		 */
		if (m->m_len == 0 &&
		    (eor == 0 ||
		     (((o = m->m_next) || (o = n)) &&
		      o->m_type == m->m_type))) {
			if (sb->sb_lastrecord == m)
				sb->sb_lastrecord = m->m_next;
			m = m_free(m);
			continue;
		}
		if (n && (n->m_flags & M_EOR) == 0 &&
		    /* M_TRAILINGSPACE() checks buffer writeability */
		    m->m_len <= MCLBYTES / 4 && /* XXX Don't copy too much */
		    m->m_len <= M_TRAILINGSPACE(n) &&
		    n->m_type == m->m_type) {
			/* Coalesce: copy m's data into n's trailing space. */
			memcpy(mtod(n, caddr_t) + n->m_len, mtod(m, caddr_t),
			    (unsigned)m->m_len);
			n->m_len += m->m_len;
			sb->sb_cc += m->m_len;
			m = m_free(m);
			continue;
		}
		/* Keep m as-is: link it in after n (or as the new head). */
		if (n)
			n->m_next = m;
		else
			sb->sb_mb = m;
		sb->sb_mbtail = m;
		sballoc(sb, m);
		n = m;
		m->m_flags &= ~M_EOR;
		m = m->m_next;
		n->m_next = 0;
	}
	if (eor) {
		/* Re-apply a pending EOR mark to the last mbuf kept. */
		if (n)
			n->m_flags |= eor;
		else
			printf("semi-panic: sbcompress\n");
	}
	SBLASTMBUFCHK(sb, __func__);
}

/*
 * Free all mbufs in a sockbuf.
 * Check that all resources are reclaimed.
 */
void
sbflush(struct sockbuf *sb)
{

	KASSERT((sb->sb_flags & SB_LOCK) == 0);

	while (sb->sb_mbcnt)
		sbdrop(sb, (int)sb->sb_cc);

	KASSERT(sb->sb_cc == 0);
	KASSERT(sb->sb_mb == NULL);
	KASSERT(sb->sb_mbtail == NULL);
	KASSERT(sb->sb_lastrecord == NULL);
}

/*
 * Drop data from (the front of) a sockbuf.
 */
void
sbdrop(struct sockbuf *sb, int len)
{
	struct mbuf *m, *mn, *next;

	next = (m = sb->sb_mb) ?
m->m_nextpkt : 0;
	while (len > 0) {
		if (m == 0) {
			/* Ran off the end of a record: go to the next one. */
			if (next == 0)
				panic("sbdrop");
			m = next;
			next = m->m_nextpkt;
			continue;
		}
		if (m->m_len > len) {
			/* Partial mbuf: trim from the front and stop. */
			m->m_len -= len;
			m->m_data += len;
			sb->sb_cc -= len;
			break;
		}
		len -= m->m_len;
		sbfree(sb, m);
		MFREE(m, mn);
		m = mn;
	}
	/* Also discard any now-leading empty mbufs. */
	while (m && m->m_len == 0) {
		sbfree(sb, m);
		MFREE(m, mn);
		m = mn;
	}
	if (m) {
		sb->sb_mb = m;
		m->m_nextpkt = next;
	} else
		sb->sb_mb = next;
	/*
	 * First part is an inline SB_EMPTY_FIXUP().  Second part
	 * makes sure sb_lastrecord is up-to-date if we dropped
	 * part of the last record.
	 */
	m = sb->sb_mb;
	if (m == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (m->m_nextpkt == NULL)
		sb->sb_lastrecord = m;
}

/*
 * Drop a record off the front of a sockbuf
 * and move the next record to the front.
 */
void
sbdroprecord(struct sockbuf *sb)
{
	struct mbuf *m, *mn;

	m = sb->sb_mb;
	if (m) {
		sb->sb_mb = m->m_nextpkt;
		/* Free every mbuf of the leading record. */
		do {
			sbfree(sb, m);
			MFREE(m, mn);
		} while ((m = mn) != NULL);
	}
	SB_EMPTY_FIXUP(sb);
}

/*
 * Create a "control" mbuf containing the specified data
 * with the specified type for presentation on a socket buffer.
 */
struct mbuf *
sbcreatecontrol(caddr_t p, int size, int type, int level)
{
	struct cmsghdr *cp;
	struct mbuf *m;

	/* A control message must fit within a single mbuf cluster. */
	if (CMSG_SPACE(size) > MCLBYTES) {
		printf("sbcreatecontrol: message too large %d\n", size);
		return NULL;
	}

	if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL)
		return ((struct mbuf *) NULL);
	if (CMSG_SPACE(size) > MLEN) {
		/* Doesn't fit in a plain mbuf: attach a cluster. */
		MCLGET(m, M_DONTWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return NULL;
		}
	}
	cp = mtod(m, struct cmsghdr *);
	memcpy(CMSG_DATA(cp), p, size);
	m->m_len = CMSG_SPACE(size);
	cp->cmsg_len = CMSG_LEN(size);
	cp->cmsg_level = level;
	cp->cmsg_type = type;
	return (m);
}