/* uipc_socket2.c revision 1.75 */
1/* $NetBSD: uipc_socket2.c,v 1.75 2006/07/23 22:06:11 ad Exp $ */ 2 3/* 4 * Copyright (c) 1982, 1986, 1988, 1990, 1993 5 * The Regents of the University of California. All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 3. Neither the name of the University nor the names of its contributors 16 * may be used to endorse or promote products derived from this software 17 * without specific prior written permission. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 
 *
 *	@(#)uipc_socket2.c	8.2 (Berkeley) 2/14/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uipc_socket2.c,v 1.75 2006/07/23 22:06:11 ad Exp $");

#include "opt_mbuftrace.h"
#include "opt_sb_max.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/buf.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/poll.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/kauth.h>

/*
 * Primitive routines for operating on sockets and socket buffers
 */

/* strings for sleep message: */
const char	netcon[] = "netcon";
const char	netcls[] = "netcls";
const char	netio[] = "netio";
const char	netlck[] = "netlck";

u_long	sb_max = SB_MAX;	/* maximum socket buffer size */
/* sb_max scaled by MCLBYTES/(MSIZE+MCLBYTES); recomputed in sb_max_set(). */
static u_long	sb_max_adj;	/* adjusted sb_max */

/*
 * Procedures to manipulate state flags of socket
 * and do appropriate wakeups.  Normal sequence from the
 * active (originating) side is that soisconnecting() is
 * called during processing of connect() call,
 * resulting in an eventual call to soisconnected() if/when the
 * connection is established.  When the connection is torn down
 * soisdisconnecting() is called during processing of disconnect() call,
 * and soisdisconnected() is called when the connection to the peer
 * is totally severed.  The semantics of these routines are such that
 * connectionless protocols can call soisconnected() and soisdisconnected()
 * only, bypassing the in-progress calls when setting up a ``connection''
 * takes no time.
 *
 * From the passive side, a socket is created with
 * two queues of sockets: so_q0 for connections in progress
 * and so_q for connections already made and awaiting user acceptance.
 * As a protocol is preparing incoming connections, it creates a socket
 * structure queued on so_q0 by calling sonewconn().
 * When the connection
 * is established, soisconnected() is called, and transfers the
 * socket structure to so_q, making it available to accept().
 *
 * If a socket is closed with sockets on either
 * so_q0 or so_q, these sockets are dropped.
 *
 * If higher level protocols are implemented in
 * the kernel, the wakeups done here will sometimes
 * cause software-interrupt process scheduling.
 */

/* Mark socket `so' as having a connect() in progress. */
void
soisconnecting(struct socket *so)
{

	so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= SS_ISCONNECTING;
}

/*
 * Mark socket `so' as connected.  If it is an incoming connection
 * still on its head's so_q0 (partial) queue, move it to so_q and wake
 * the accepting socket; otherwise wake anybody sleeping on the socket
 * itself.
 */
void
soisconnected(struct socket *so)
{
	struct socket	*head;

	head = so->so_head;
	so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
	so->so_state |= SS_ISCONNECTED;
	if (head && soqremque(so, 0)) {
		/* Move from the partial queue to the accept queue. */
		soqinsque(head, so, 1);
		sorwakeup(head);
		wakeup((caddr_t)&head->so_timeo);
	} else {
		wakeup((caddr_t)&so->so_timeo);
		sorwakeup(so);
		sowwakeup(so);
	}
}

/* Note disconnect in progress; no further I/O possible in either direction. */
void
soisdisconnecting(struct socket *so)
{

	so->so_state &= ~SS_ISCONNECTING;
	so->so_state |= (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE);
	wakeup((caddr_t)&so->so_timeo);
	sowwakeup(so);
	sorwakeup(so);
}

/* Mark socket fully disconnected and wake all waiters. */
void
soisdisconnected(struct socket *so)
{

	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_ISDISCONNECTED);
	wakeup((caddr_t)&so->so_timeo);
	sowwakeup(so);
	sorwakeup(so);
}

/*
 * When an attempt at a new connection is noted on a socket
 * which accepts connections, sonewconn is called.  If the
 * connection is possible (subject to space constraints, etc.)
 * then we allocate a new structure, properly linked into the
 * data structure of the original socket, and return this.
 * Connstatus may be 0, or SO_ISCONFIRMING, or SO_ISCONNECTED.
 *
 * Currently, sonewconn() is defined as sonewconn1() in socketvar.h
 * to catch calls that are missing the (new) second parameter.
 */
struct socket *
sonewconn1(struct socket *head, int connstatus)
{
	struct socket	*so;
	int		soqueue;

	/* A nonzero connstatus places the new socket directly on so_q. */
	soqueue = connstatus ? 1 : 0;
	/* Refuse when the listen backlog (with 3/2 slop) is exhausted. */
	if (head->so_qlen + head->so_q0len > 3 * head->so_qlimit / 2)
		return ((struct socket *)0);
	so = pool_get(&socket_pool, PR_NOWAIT);
	if (so == NULL)
		return (NULL);
	memset((caddr_t)so, 0, sizeof(*so));
	/* Inherit properties from the listening socket. */
	so->so_type = head->so_type;
	so->so_options = head->so_options &~ SO_ACCEPTCONN;
	so->so_linger = head->so_linger;
	so->so_state = head->so_state | SS_NOFDREF;
	so->so_proto = head->so_proto;
	so->so_timeo = head->so_timeo;
	so->so_pgid = head->so_pgid;
	so->so_send = head->so_send;
	so->so_receive = head->so_receive;
	so->so_uidinfo = head->so_uidinfo;
#ifdef MBUFTRACE
	so->so_mowner = head->so_mowner;
	so->so_rcv.sb_mowner = head->so_rcv.sb_mowner;
	so->so_snd.sb_mowner = head->so_snd.sb_mowner;
#endif
	(void) soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat);
	soqinsque(head, so, soqueue);
	if ((*so->so_proto->pr_usrreq)(so, PRU_ATTACH,
	    (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0,
	    (struct lwp *)0)) {
		/* Protocol attach failed; undo the queue insert and free. */
		(void) soqremque(so, soqueue);
		pool_put(&socket_pool, so);
		return (NULL);
	}
	if (connstatus) {
		sorwakeup(head);
		wakeup((caddr_t)&head->so_timeo);
		so->so_state |= connstatus;
	}
	return (so);
}

/*
 * Insert socket `so' on the connection queue of listening socket
 * `head': so_q0 (partial connections) when q == 0, else so_q
 * (connections ready to accept).
 */
void
soqinsque(struct socket *head, struct socket *so, int q)
{

#ifdef DIAGNOSTIC
	if (so->so_onq != NULL)
		panic("soqinsque");
#endif

	so->so_head = head;
	if (q == 0) {
		head->so_q0len++;
		so->so_onq = &head->so_q0;
	} else {
		head->so_qlen++;
		so->so_onq = &head->so_q;
	}
	TAILQ_INSERT_TAIL(so->so_onq, so, so_qe);
}

/*
 * Remove socket `so' from the queue selected by `q' (0 for so_q0,
 * nonzero for so_q) of its head socket.  Returns 0 if the socket is
 * not on the expected queue, 1 on success.
 */
int
soqremque(struct socket *so, int q)
{
	struct socket	*head;

	head = so->so_head;
	if (q == 0) {
		if (so->so_onq != &head->so_q0)
			return (0);
		head->so_q0len--;
	} else {
		if (so->so_onq != &head->so_q)
			return (0);
		head->so_qlen--;
	}
	TAILQ_REMOVE(so->so_onq, so, so_qe);
	so->so_onq = NULL;
	so->so_head = NULL;
	return (1);
}

/*
 * Socantsendmore indicates that no more data will be sent on the
 * socket; it would normally be applied to a socket when the user
 * informs the system that no more data is to be sent, by the protocol
 * code (in case PRU_SHUTDOWN).  Socantrcvmore indicates that no more data
 * will be received, and will normally be applied to the socket by a
 * protocol when it detects that the peer will send no more data.
 * Data queued for reading in the socket may yet be read.
 */

void
socantsendmore(struct socket *so)
{

	so->so_state |= SS_CANTSENDMORE;
	sowwakeup(so);
}

void
socantrcvmore(struct socket *so)
{

	so->so_state |= SS_CANTRCVMORE;
	sorwakeup(so);
}

/*
 * Wait for data to arrive at/drain from a socket buffer.
 */
int
sbwait(struct sockbuf *sb)
{

	sb->sb_flags |= SB_WAIT;
	/* SB_NOINTR suppresses PCATCH, making the sleep uninterruptible. */
	return (tsleep((caddr_t)&sb->sb_cc,
	    (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, netio,
	    sb->sb_timeo));
}

/*
 * Lock a sockbuf already known to be locked;
 * return any error returned from sleep (EINTR).
 */
int
sb_lock(struct sockbuf *sb)
{
	int	error;

	while (sb->sb_flags & SB_LOCK) {
		sb->sb_flags |= SB_WANT;
		error = tsleep((caddr_t)&sb->sb_flags,
		    (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK|PCATCH,
		    netlck, 0);
		if (error)
			return (error);
	}
	sb->sb_flags |= SB_LOCK;
	return (0);
}

/*
 * Wakeup processes waiting on a socket buffer.
 * Do asynchronous notification via SIGIO
 * if the socket buffer has the SB_ASYNC flag set.
 */
void
sowakeup(struct socket *so, struct sockbuf *sb, int code)
{
	/* Notify poll/select waiters registered in sb_sel. */
	selnotify(&sb->sb_sel, 0);
	sb->sb_flags &= ~SB_SEL;
	if (sb->sb_flags & SB_WAIT) {
		sb->sb_flags &= ~SB_WAIT;
		wakeup((caddr_t)&sb->sb_cc);
	}
	if (sb->sb_flags & SB_ASYNC) {
		int band;
		/* `code' is POLL_IN for read events; otherwise write. */
		if (code == POLL_IN)
			band = POLLIN|POLLRDNORM;
		else
			band = POLLOUT|POLLWRNORM;
		fownsignal(so->so_pgid, SIGIO, code, band, so);
	}
	if (sb->sb_flags & SB_UPCALL)
		(*so->so_upcall)(so, so->so_upcallarg, M_DONTWAIT);
}

/*
 * Socket buffer (struct sockbuf) utility routines.
 *
 * Each socket contains two socket buffers: one for sending data and
 * one for receiving data.  Each buffer contains a queue of mbufs,
 * information about the number of mbufs and amount of data in the
 * queue, and other fields allowing poll() statements and notification
 * on data availability to be implemented.
 *
 * Data stored in a socket buffer is maintained as a list of records.
 * Each record is a list of mbufs chained together with the m_next
 * field.  Records are chained together with the m_nextpkt field. The upper
 * level routine soreceive() expects the following conventions to be
 * observed when placing information in the receive buffer:
 *
 * 1. If the protocol requires each message be preceded by the sender's
 *    name, then a record containing that name must be present before
 *    any associated data (mbuf's must be of type MT_SONAME).
 * 2. If the protocol supports the exchange of ``access rights'' (really
 *    just additional data associated with the message), and there are
 *    ``rights'' to be received, then a record containing this data
 *    should be present (mbuf's must be of type MT_CONTROL).
 * 3. If a name or rights record exists, then it must be followed by
 *    a data record, perhaps of zero length.
 *
 * Before using a new socket structure it is first necessary to reserve
 * buffer space to the socket, by calling sbreserve().  This should commit
 * some of the available buffer space in the system buffer pool for the
 * socket (currently, it does nothing but enforce limits).  The space
 * should be released by calling sbrelease() when the socket is destroyed.
 */

/*
 * Set the global socket buffer size limit sb_max and recompute the
 * cached sb_max_adj.  Returns EINVAL for values below 16KB, else 0.
 */
int
sb_max_set(u_long new_sbmax)
{
	int	s;

	if (new_sbmax < (16 * 1024))
		return (EINVAL);

	/* Update both globals atomically w.r.t. the softnet interrupt. */
	s = splsoftnet();
	sb_max = new_sbmax;
	sb_max_adj = (u_quad_t)new_sbmax * MCLBYTES / (MSIZE + MCLBYTES);
	splx(s);

	return (0);
}

/*
 * Reserve send (sndcc) and receive (rcvcc) buffer space for socket
 * `so' and establish default low-water marks.  Returns 0 on success,
 * ENOBUFS if either reservation fails.
 */
int
soreserve(struct socket *so, u_long sndcc, u_long rcvcc)
{
	/*
	 * there's at least one application (a configure script of screen)
	 * which expects a fifo is writable even if it has "some" bytes
	 * in its buffer.
	 * so we want to make sure (hiwat - lowat) >= (some bytes).
	 *
	 * PIPE_BUF here is an arbitrary value chosen as (some bytes) above.
	 * we expect it's large enough for such applications.
	 */
	u_long	lowat = MAX(sock_loan_thresh, MCLBYTES);
	u_long	hiwat = lowat + PIPE_BUF;

	if (sndcc < hiwat)
		sndcc = hiwat;
	if (sbreserve(&so->so_snd, sndcc, so) == 0)
		goto bad;
	if (sbreserve(&so->so_rcv, rcvcc, so) == 0)
		goto bad2;
	if (so->so_rcv.sb_lowat == 0)
		so->so_rcv.sb_lowat = 1;
	if (so->so_snd.sb_lowat == 0)
		so->so_snd.sb_lowat = lowat;
	if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
		so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
	return (0);
 bad2:
	/* Undo the send-side reservation before failing. */
	sbrelease(&so->so_snd, so);
 bad:
	return (ENOBUFS);
}

/*
 * Allot mbufs to a sockbuf.
 * Attempt to scale mbmax so that mbcnt doesn't become limiting
 * if buffering efficiency is near the normal case.
 */
int
sbreserve(struct sockbuf *sb, u_long cc, struct socket *so)
{
	struct lwp *l = curlwp; /* XXX */
	rlim_t maxcc;
	struct uidinfo *uidinfo;

	KDASSERT(sb_max_adj != 0);
	if (cc == 0 || cc > sb_max_adj)
		return (0);
	if (so) {
		/* Apply RLIMIT_SBSIZE only when the caller owns the socket. */
		if (l && kauth_cred_geteuid(l->l_cred) == so->so_uidinfo->ui_uid)
			maxcc = l->l_proc->p_rlimit[RLIMIT_SBSIZE].rlim_cur;
		else
			maxcc = RLIM_INFINITY;
		uidinfo = so->so_uidinfo;
	} else {
		uidinfo = uid_find(0);	/* XXX: nothing better */
		maxcc = RLIM_INFINITY;
	}
	/* Charge the per-uid sbsize accounting; fails if over maxcc. */
	if (!chgsbsize(uidinfo, &sb->sb_hiwat, cc, maxcc))
		return 0;
	sb->sb_mbmax = min(cc * 2, sb_max);
	if (sb->sb_lowat > sb->sb_hiwat)
		sb->sb_lowat = sb->sb_hiwat;
	return (1);
}

/*
 * Free mbufs held by a socket, and reserved mbuf space.
 */
void
sbrelease(struct sockbuf *sb, struct socket *so)
{

	sbflush(sb);
	/* Return the reserved space to the owner's sbsize accounting. */
	(void)chgsbsize(so->so_uidinfo, &sb->sb_hiwat, 0,
	    RLIM_INFINITY);
	sb->sb_mbmax = 0;
}

/*
 * Routines to add and remove
 * data from an mbuf queue.
 *
 * The routines sbappend() or sbappendrecord() are normally called to
 * append new mbufs to a socket buffer, after checking that adequate
 * space is available, comparing the function sbspace() with the amount
 * of data to be added.  sbappendrecord() differs from sbappend() in
 * that data supplied is treated as the beginning of a new record.
 * To place a sender's address, optional access rights, and data in a
 * socket receive buffer, sbappendaddr() should be used.  To place
 * access rights and data in a socket receive buffer, sbappendrights()
 * should be used.  In either case, the new data begins a new record.
 * Note that unlike sbappend() and sbappendrecord(), these routines check
 * for the caller that there will be enough space to store the data.
 * Each fails if there is not enough space, or if it cannot find mbufs
 * to store additional information in.
 *
 * Reliable protocols may use the socket send buffer to hold data
 * awaiting acknowledgement.  Data is normally copied from a socket
 * send buffer in a protocol with m_copy for output to a peer,
 * and then removing the data from the socket buffer with sbdrop()
 * or sbdroprecord() when the data is acknowledged by the peer.
 */

#ifdef SOCKBUF_DEBUG
/*
 * Debug check: verify that sb_lastrecord points at the last record on
 * the sb_mb packet (m_nextpkt) chain; dump the chain and panic if not.
 */
void
sblastrecordchk(struct sockbuf *sb, const char *where)
{
	struct mbuf *m = sb->sb_mb;

	while (m && m->m_nextpkt)
		m = m->m_nextpkt;

	if (m != sb->sb_lastrecord) {
		printf("sblastrecordchk: sb_mb %p sb_lastrecord %p last %p\n",
		    sb->sb_mb, sb->sb_lastrecord, m);
		printf("packet chain:\n");
		for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt)
			printf("\t%p\n", m);
		panic("sblastrecordchk from %s", where);
	}
}

/*
 * Debug check: verify that sb_mbtail points at the last mbuf of the
 * last record; dump the whole buffer and panic if not.
 */
void
sblastmbufchk(struct sockbuf *sb, const char *where)
{
	struct mbuf *m = sb->sb_mb;
	struct mbuf *n;

	while (m && m->m_nextpkt)
		m = m->m_nextpkt;

	while (m && m->m_next)
		m = m->m_next;

	if (m != sb->sb_mbtail) {
		printf("sblastmbufchk: sb_mb %p sb_mbtail %p last %p\n",
		    sb->sb_mb, sb->sb_mbtail, m);
		printf("packet tree:\n");
		for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) {
			printf("\t");
			for (n = m; n != NULL; n = n->m_next)
				printf("%p ", n);
			printf("\n");
		}
		panic("sblastmbufchk from %s", where);
	}
}
#endif /* SOCKBUF_DEBUG */

/*
 * Link a chain of records onto a socket buffer
 */
#define	SBLINKRECORDCHAIN(sb, m0, mlast)				\
do {									\
	if ((sb)->sb_lastrecord != NULL)				\
		(sb)->sb_lastrecord->m_nextpkt = (m0);			\
	else								\
		(sb)->sb_mb = (m0);					\
	(sb)->sb_lastrecord = (mlast);					\
} while (/*CONSTCOND*/0)


#define	SBLINKRECORD(sb, m0)						\
    SBLINKRECORDCHAIN(sb, m0, m0)

/*
 * Append mbuf chain m to the last record in the
 * socket buffer sb.  The additional space associated
 * the mbuf chain is recorded in sb.  Empty mbufs are
 * discarded and mbufs are compacted where possible.
 */
void
sbappend(struct sockbuf *sb, struct mbuf *m)
{
	struct mbuf	*n;

	if (m == 0)
		return;

#ifdef MBUFTRACE
	m_claimm(m, sb->sb_mowner);
#endif

	SBLASTRECORDCHK(sb, "sbappend 1");

	if ((n = sb->sb_lastrecord) != NULL) {
		/*
		 * XXX Would like to simply use sb_mbtail here, but
		 * XXX I need to verify that I won't miss an EOR that
		 * XXX way.
		 */
		do {
			/*
			 * A marked end-of-record means the data must
			 * start a fresh record instead of extending
			 * this one.
			 */
			if (n->m_flags & M_EOR) {
				sbappendrecord(sb, m); /* XXXXXX!!!! */
				return;
			}
		} while (n->m_next && (n = n->m_next));
	} else {
		/*
		 * If this is the first record in the socket buffer, it's
		 * also the last record.
		 */
		sb->sb_lastrecord = m;
	}
	sbcompress(sb, m, n);
	SBLASTRECORDCHK(sb, "sbappend 2");
}

/*
 * This version of sbappend() should only be used when the caller
 * absolutely knows that there will never be more than one record
 * in the socket buffer, that is, a stream protocol (such as TCP).
 */
void
sbappendstream(struct sockbuf *sb, struct mbuf *m)
{

	KDASSERT(m->m_nextpkt == NULL);
	KASSERT(sb->sb_mb == sb->sb_lastrecord);

	SBLASTMBUFCHK(sb, __func__);

#ifdef MBUFTRACE
	m_claimm(m, sb->sb_mowner);
#endif

	sbcompress(sb, m, sb->sb_mbtail);

	sb->sb_lastrecord = sb->sb_mb;
	SBLASTRECORDCHK(sb, __func__);
}

#ifdef SOCKBUF_DEBUG
/*
 * Debug check: recompute the byte count and mbuf storage count for
 * the buffer and panic if they disagree with sb_cc/sb_mbcnt, or if
 * any mbuf on the data chain has a packet link.
 */
void
sbcheck(struct sockbuf *sb)
{
	struct mbuf	*m;
	u_long		len, mbcnt;

	len = 0;
	mbcnt = 0;
	for (m = sb->sb_mb; m; m = m->m_next) {
		len += m->m_len;
		mbcnt += MSIZE;
		if (m->m_flags & M_EXT)
			mbcnt += m->m_ext.ext_size;
		if (m->m_nextpkt)
			panic("sbcheck nextpkt");
	}
	if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
		printf("cc %lu != %lu || mbcnt %lu != %lu\n", len, sb->sb_cc,
		    mbcnt, sb->sb_mbcnt);
		panic("sbcheck");
	}
}
#endif

/*
 * As above, except the mbuf chain
 * begins a new record.
 */
void
sbappendrecord(struct sockbuf *sb, struct mbuf *m0)
{
	struct mbuf	*m;

	if (m0 == 0)
		return;

#ifdef MBUFTRACE
	m_claimm(m0, sb->sb_mowner);
#endif
	/*
	 * Put the first mbuf on the queue.
	 * Note this permits zero length records.
	 */
	sballoc(sb, m0);
	SBLASTRECORDCHK(sb, "sbappendrecord 1");
	SBLINKRECORD(sb, m0);
	m = m0->m_next;
	m0->m_next = 0;
	if (m && (m0->m_flags & M_EOR)) {
		/* Move the end-of-record mark onto the rest of the chain. */
		m0->m_flags &= ~M_EOR;
		m->m_flags |= M_EOR;
	}
	sbcompress(sb, m, m0);
	SBLASTRECORDCHK(sb, "sbappendrecord 2");
}

/*
 * As above except that OOB data
 * is inserted at the beginning of the sockbuf,
 * but after any other OOB data.
 */
void
sbinsertoob(struct sockbuf *sb, struct mbuf *m0)
{
	struct mbuf	*m, **mp;

	if (m0 == 0)
		return;

	SBLASTRECORDCHK(sb, "sbinsertoob 1");

	/*
	 * Find the insertion point: skip over existing OOB records and
	 * any record that begins with control mbufs.
	 */
	for (mp = &sb->sb_mb; (m = *mp) != NULL; mp = &((*mp)->m_nextpkt)) {
	    again:
		switch (m->m_type) {

		case MT_OOBDATA:
			continue;		/* WANT next train */

		case MT_CONTROL:
			if ((m = m->m_next) != NULL)
				goto again;	/* inspect THIS train further */
		}
		break;
	}
	/*
	 * Put the first mbuf on the queue.
	 * Note this permits zero length records.
	 */
	sballoc(sb, m0);
	m0->m_nextpkt = *mp;
	if (*mp == NULL) {
		/* m0 is actually the new tail */
		sb->sb_lastrecord = m0;
	}
	*mp = m0;
	m = m0->m_next;
	m0->m_next = 0;
	if (m && (m0->m_flags & M_EOR)) {
		/* Move the end-of-record mark onto the rest of the chain. */
		m0->m_flags &= ~M_EOR;
		m->m_flags |= M_EOR;
	}
	sbcompress(sb, m, m0);
	SBLASTRECORDCHK(sb, "sbinsertoob 2");
}

/*
 * Append address and data, and optionally, control (ancillary) data
 * to the receive queue of a socket.  If present,
 * m0 must include a packet header with total length.
 * Returns 0 if no space in sockbuf or insufficient mbufs.
 */
int
sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0,
	struct mbuf *control)
{
	struct mbuf	*m, *n, *nlast;
	int		space, len;

	space = asa->sa_len;

	if (m0 != NULL) {
		if ((m0->m_flags & M_PKTHDR) == 0)
			panic("sbappendaddr");
		space += m0->m_pkthdr.len;
#ifdef MBUFTRACE
		m_claimm(m0, sb->sb_mowner);
#endif
	}
	for (n = control; n; n = n->m_next) {
		space += n->m_len;
		MCLAIM(n, sb->sb_mowner);
		if (n->m_next == 0)	/* keep pointer to last control buf */
			break;
	}
	if (space > sbspace(sb))
		return (0);
	MGET(m, M_DONTWAIT, MT_SONAME);
	if (m == 0)
		return (0);
	MCLAIM(m, sb->sb_mowner);
	/*
	 * XXX avoid 'comparison always true' warning which isn't easily
	 * avoided.
	 */
	len = asa->sa_len;
	if (len > MLEN) {
		/* Address won't fit in a plain mbuf; attach ext storage. */
		MEXTMALLOC(m, asa->sa_len, M_NOWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return (0);
		}
	}
	m->m_len = asa->sa_len;
	memcpy(mtod(m, caddr_t), asa, asa->sa_len);
	if (n)
		n->m_next = m0;		/* concatenate data to control */
	else
		control = m0;
	m->m_next = control;

	SBLASTRECORDCHK(sb, "sbappendaddr 1");

	/* Account for every mbuf of the new record: name+control+data. */
	for (n = m; n->m_next != NULL; n = n->m_next)
		sballoc(sb, n);
	sballoc(sb, n);
	nlast = n;
	SBLINKRECORD(sb, m);

	sb->sb_mbtail = nlast;
	SBLASTMBUFCHK(sb, "sbappendaddr");

	SBLASTRECORDCHK(sb, "sbappendaddr 2");

	return (1);
}

/*
 * Helper for sbappendchainaddr: prepend a struct sockaddr* to
 * an mbuf chain.
 */
static inline struct mbuf *
m_prepend_sockaddr(struct sockbuf *sb, struct mbuf *m0,
		   const struct sockaddr *asa)
{
	struct mbuf *m;
	const int salen = asa->sa_len;

	/* only the first in each chain need be a pkthdr */
	MGETHDR(m, M_DONTWAIT, MT_SONAME);
	if (m == 0)
		return (0);
	MCLAIM(m, sb->sb_mowner);
#ifdef notyet
	if (salen > MHLEN) {
		MEXTMALLOC(m, salen, M_NOWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return (0);
		}
	}
#else
	KASSERT(salen <= MHLEN);
#endif
	m->m_len = salen;
	memcpy(mtod(m, caddr_t), asa, salen);
	m->m_next = m0;
	m->m_pkthdr.len = salen + m0->m_pkthdr.len;

	return m;
}

/*
 * Append a chain of records (m0, linked by m_nextpkt), each with a
 * prepended copy of address `asa', to socket buffer `sb'.  Returns 1
 * on success; on failure (out of mbufs) undoes its own accounting,
 * frees the prepended address mbufs and returns 0, leaving the input
 * chain m0 for the caller to free.
 */
int
sbappendaddrchain(struct sockbuf *sb, const struct sockaddr *asa,
		  struct mbuf *m0, int sbprio)
{
	int space;
	struct mbuf *m, *n, *n0, *nlast;
	int error;

	/*
	 * XXX sbprio reserved for encoding priority of this* request:
	 *  SB_PRIO_NONE --> honour normal sb limits
	 *  SB_PRIO_ONESHOT_OVERFLOW --> if socket has any space,
	 *	take whole chain. Intended for large requests
	 *      that should be delivered atomically (all, or none).
	 * SB_PRIO_OVERDRAFT -- allow a small (2*MLEN) overflow
	 *       over normal socket limits, for messages indicating
	 *       buffer overflow in earlier normal/lower-priority messages
	 * SB_PRIO_BESTEFFORT -->  ignore limits entirely.
	 *       Intended for kernel-generated messages only.
	 *        Up to generator to avoid total mbuf resource exhaustion.
	 */
	(void)sbprio;

	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
		panic("sbappendaddrchain");

	space = sbspace(sb);

#ifdef notyet
	/*
	 * Enforce SB_PRIO_* limits as described above.
	 */
#endif

	n0 = NULL;
	nlast = NULL;
	for (m = m0; m; m = m->m_nextpkt) {
		struct mbuf *np;

#ifdef MBUFTRACE
		m_claimm(m, sb->sb_mowner);
#endif

		/* Prepend sockaddr to this record (m) of input chain m0 */
		n = m_prepend_sockaddr(sb, m, asa);
		if (n == NULL) {
			error = ENOBUFS;
			goto bad;
		}

		/* Append record (asa+m) to end of new chain n0 */
		if (n0 == NULL) {
			n0 = n;
		} else {
			nlast->m_nextpkt = n;
		}
		/* Keep track of last record on new chain */
		nlast = n;

		for (np = n; np; np = np->m_next)
			sballoc(sb, np);
	}

	SBLASTRECORDCHK(sb, "sbappendaddrchain 1");

	/* Drop the entire chain of (asa+m) records onto the socket */
	SBLINKRECORDCHAIN(sb, n0, nlast);

	SBLASTRECORDCHK(sb, "sbappendaddrchain 2");

	for (m = nlast; m->m_next; m = m->m_next)
		;
	sb->sb_mbtail = m;
	SBLASTMBUFCHK(sb, "sbappendaddrchain");

	return (1);

bad:
	/*
	 * On error, free the prepended addresses.  For consistency
	 * with sbappendaddr(), leave it to our caller to free
	 * the input record chain passed to us as m0.
	 */
	while ((n = n0) != NULL) {
		struct mbuf *np;

		/* Undo the sballoc() of this record */
		for (np = n; np; np = np->m_next)
			sbfree(sb, np);

		n0 = n->m_nextpkt;	/* iterate at next prepended address */
		MFREE(n, np);		/* free prepended address (not data) */
	}
	return 0;
}

/*
 * Append control data `control' and data chain `m0' as a single new
 * record on socket buffer `sb'.  Returns 0 if there is no space,
 * 1 on success.
 */
int
sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control)
{
	struct mbuf	*m, *mlast, *n;
	int		space;

	space = 0;
	if (control == 0)
		panic("sbappendcontrol");
	for (m = control; ; m = m->m_next) {
		space += m->m_len;
		MCLAIM(m, sb->sb_mowner);
		if (m->m_next == 0)
			break;
	}
	n = m;			/* save pointer to last control buffer */
	for (m = m0; m; m = m->m_next) {
		MCLAIM(m, sb->sb_mowner);
		space += m->m_len;
	}
	if (space > sbspace(sb))
		return (0);
	n->m_next = m0;			/* concatenate data to control */

	SBLASTRECORDCHK(sb, "sbappendcontrol 1");

	/* Account for every mbuf of the new record. */
	for (m = control; m->m_next != NULL; m = m->m_next)
		sballoc(sb, m);
	sballoc(sb, m);
	mlast = m;
	SBLINKRECORD(sb, control);

	sb->sb_mbtail = mlast;
	SBLASTMBUFCHK(sb, "sbappendcontrol");

	SBLASTRECORDCHK(sb, "sbappendcontrol 2");

	return (1);
}

/*
 * Compress mbuf chain m into the socket
 * buffer sb following mbuf n.  If n
 * is null, the buffer is presumed empty.
 */
void
sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n)
{
	int		eor;
	struct mbuf	*o;

	eor = 0;
	while (m) {
		eor |= m->m_flags & M_EOR;
		/*
		 * Discard an empty mbuf unless it carries an EOR that
		 * cannot be transferred to a same-type successor.
		 */
		if (m->m_len == 0 &&
		    (eor == 0 ||
		     (((o = m->m_next) || (o = n)) &&
		      o->m_type == m->m_type))) {
			if (sb->sb_lastrecord == m)
				sb->sb_lastrecord = m->m_next;
			m = m_free(m);
			continue;
		}
		/* Coalesce a small mbuf into the tail of the previous one. */
		if (n && (n->m_flags & M_EOR) == 0 &&
		    /* M_TRAILINGSPACE() checks buffer writeability */
		    m->m_len <= MCLBYTES / 4 && /* XXX Don't copy too much */
		    m->m_len <= M_TRAILINGSPACE(n) &&
		    n->m_type == m->m_type) {
			memcpy(mtod(n, caddr_t) + n->m_len, mtod(m, caddr_t),
			    (unsigned)m->m_len);
			n->m_len += m->m_len;
			sb->sb_cc += m->m_len;
			m = m_free(m);
			continue;
		}
		/* Keep the mbuf: link it in and account for its storage. */
		if (n)
			n->m_next = m;
		else
			sb->sb_mb = m;
		sb->sb_mbtail = m;
		sballoc(sb, m);
		n = m;
		m->m_flags &= ~M_EOR;
		m = m->m_next;
		n->m_next = 0;
	}
	if (eor) {
		/* Re-apply the accumulated EOR mark to the last mbuf kept. */
		if (n)
			n->m_flags |= eor;
		else
			printf("semi-panic: sbcompress\n");
	}
	SBLASTMBUFCHK(sb, __func__);
}

/*
 * Free all mbufs in a sockbuf.
 * Check that all resources are reclaimed.
 */
void
sbflush(struct sockbuf *sb)
{

	KASSERT((sb->sb_flags & SB_LOCK) == 0);

	while (sb->sb_mbcnt)
		sbdrop(sb, (int)sb->sb_cc);

	KASSERT(sb->sb_cc == 0);
	KASSERT(sb->sb_mb == NULL);
	KASSERT(sb->sb_mbtail == NULL);
	KASSERT(sb->sb_lastrecord == NULL);
}

/*
 * Drop data from (the front of) a sockbuf.
 */
void
sbdrop(struct sockbuf *sb, int len)
{
	struct mbuf	*m, *mn, *next;

	next = (m = sb->sb_mb) ? m->m_nextpkt : 0;
	while (len > 0) {
		if (m == 0) {
			if (next == 0)
				panic("sbdrop");
			/* Current record exhausted; move to the next one. */
			m = next;
			next = m->m_nextpkt;
			continue;
		}
		if (m->m_len > len) {
			/* Partial drop: trim the front of this mbuf. */
			m->m_len -= len;
			m->m_data += len;
			sb->sb_cc -= len;
			break;
		}
		len -= m->m_len;
		sbfree(sb, m);
		MFREE(m, mn);
		m = mn;
	}
	/* Also discard any now-empty mbufs at the front. */
	while (m && m->m_len == 0) {
		sbfree(sb, m);
		MFREE(m, mn);
		m = mn;
	}
	if (m) {
		sb->sb_mb = m;
		m->m_nextpkt = next;
	} else
		sb->sb_mb = next;
	/*
	 * First part is an inline SB_EMPTY_FIXUP().  Second part
	 * makes sure sb_lastrecord is up-to-date if we dropped
	 * part of the last record.
	 */
	m = sb->sb_mb;
	if (m == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (m->m_nextpkt == NULL)
		sb->sb_lastrecord = m;
}

/*
 * Drop a record off the front of a sockbuf
 * and move the next record to the front.
 */
void
sbdroprecord(struct sockbuf *sb)
{
	struct mbuf	*m, *mn;

	m = sb->sb_mb;
	if (m) {
		sb->sb_mb = m->m_nextpkt;
		do {
			sbfree(sb, m);
			MFREE(m, mn);
		} while ((m = mn) != NULL);
	}
	SB_EMPTY_FIXUP(sb);
}

/*
 * Create a "control" mbuf containing the specified data
 * with the specified type for presentation on a socket buffer.
 */
struct mbuf *
sbcreatecontrol(caddr_t p, int size, int type, int level)
{
	struct cmsghdr	*cp;
	struct mbuf	*m;

	/* Refuse anything that cannot fit in a single cluster mbuf. */
	if (CMSG_SPACE(size) > MCLBYTES) {
		printf("sbcreatecontrol: message too large %d\n", size);
		return NULL;
	}

	if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL)
		return ((struct mbuf *) NULL);
	if (CMSG_SPACE(size) > MLEN) {
		/* Too big for a plain mbuf; attach a cluster. */
		MCLGET(m, M_DONTWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return NULL;
		}
	}
	cp = mtod(m, struct cmsghdr *);
	memcpy(CMSG_DATA(cp), p, size);
	m->m_len = CMSG_SPACE(size);
	cp->cmsg_len = CMSG_LEN(size);
	cp->cmsg_level = level;
	cp->cmsg_type = type;
	return (m);
}