uipc_socket2.c revision 1.85
1/* $NetBSD: uipc_socket2.c,v 1.85 2007/08/02 02:42:40 rmind Exp $ */ 2 3/* 4 * Copyright (c) 1982, 1986, 1988, 1990, 1993 5 * The Regents of the University of California. All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 3. Neither the name of the University nor the names of its contributors 16 * may be used to endorse or promote products derived from this software 17 * without specific prior written permission. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 30 * 31 * @(#)uipc_socket2.c 8.2 (Berkeley) 2/14/95 32 */ 33 34#include <sys/cdefs.h> 35__KERNEL_RCSID(0, "$NetBSD: uipc_socket2.c,v 1.85 2007/08/02 02:42:40 rmind Exp $"); 36 37#include "opt_mbuftrace.h" 38#include "opt_sb_max.h" 39 40#include <sys/param.h> 41#include <sys/systm.h> 42#include <sys/proc.h> 43#include <sys/file.h> 44#include <sys/buf.h> 45#include <sys/malloc.h> 46#include <sys/mbuf.h> 47#include <sys/protosw.h> 48#include <sys/poll.h> 49#include <sys/socket.h> 50#include <sys/socketvar.h> 51#include <sys/signalvar.h> 52#include <sys/kauth.h> 53 54/* 55 * Primitive routines for operating on sockets and socket buffers 56 */ 57 58/* strings for sleep message: */ 59const char netcon[] = "netcon"; 60const char netcls[] = "netcls"; 61const char netio[] = "netio"; 62const char netlck[] = "netlck"; 63 64u_long sb_max = SB_MAX; /* maximum socket buffer size */ 65static u_long sb_max_adj; /* adjusted sb_max */ 66 67/* 68 * Procedures to manipulate state flags of socket 69 * and do appropriate wakeups. Normal sequence from the 70 * active (originating) side is that soisconnecting() is 71 * called during processing of connect() call, 72 * resulting in an eventual call to soisconnected() if/when the 73 * connection is established. When the connection is torn down 74 * soisdisconnecting() is called during processing of disconnect() call, 75 * and soisdisconnected() is called when the connection to the peer 76 * is totally severed. The semantics of these routines are such that 77 * connectionless protocols can call soisconnected() and soisdisconnected() 78 * only, bypassing the in-progress calls when setting up a ``connection'' 79 * takes no time. 80 * 81 * From the passive side, a socket is created with 82 * two queues of sockets: so_q0 for connections in progress 83 * and so_q for connections already made and awaiting user acceptance. 84 * As a protocol is preparing incoming connections, it creates a socket 85 * structure queued on so_q0 by calling sonewconn(). When the connection 86 * is established, soisconnected() is called, and transfers the 87 * socket structure to so_q, making it available to accept(). 88 * 89 * If a socket is closed with sockets on either 90 * so_q0 or so_q, these sockets are dropped. 91 * 92 * If higher level protocols are implemented in 93 * the kernel, the wakeups done here will sometimes 94 * cause software-interrupt process scheduling. 95 */ 96 97void 98soisconnecting(struct socket *so) 99{ 100 101 so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING); 102 so->so_state |= SS_ISCONNECTING; 103} 104 105void 106soisconnected(struct socket *so) 107{ 108 struct socket *head; 109 110 head = so->so_head; 111 so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING); 112 so->so_state |= SS_ISCONNECTED; 113 if (head && soqremque(so, 0)) { 114 soqinsque(head, so, 1); 115 sorwakeup(head); 116 wakeup((void *)&head->so_timeo); 117 } else { 118 wakeup((void *)&so->so_timeo); 119 sorwakeup(so); 120 sowwakeup(so); 121 } 122} 123 124void 125soisdisconnecting(struct socket *so) 126{ 127 128 so->so_state &= ~SS_ISCONNECTING; 129 so->so_state |= (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE); 130 wakeup((void *)&so->so_timeo); 131 sowwakeup(so); 132 sorwakeup(so); 133} 134 135void 136soisdisconnected(struct socket *so) 137{ 138 139 so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); 140 so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_ISDISCONNECTED); 141 wakeup((void *)&so->so_timeo); 142 sowwakeup(so); 143 sorwakeup(so); 144} 145 146/* 147 * When an attempt at a new connection is noted on a socket 148 * which accepts connections, sonewconn is called. If the 149 * connection is possible (subject to space constraints, etc.) 150 * then we allocate a new structure, propoerly linked into the 151 * data structure of the original socket, and return this. 152 * Connstatus may be 0, SS_ISCONFIRMING, or SS_ISCONNECTED. 153 */ 154struct socket * 155sonewconn(struct socket *head, int connstatus) 156{ 157 struct socket *so; 158 int soqueue; 159 160 soqueue = connstatus ? 1 : 0; 161 if (head->so_qlen + head->so_q0len > 3 * head->so_qlimit / 2) 162 return ((struct socket *)0); 163 so = pool_get(&socket_pool, PR_NOWAIT); 164 if (so == NULL) 165 return (NULL); 166 memset((void *)so, 0, sizeof(*so)); 167 so->so_type = head->so_type; 168 so->so_options = head->so_options &~ SO_ACCEPTCONN; 169 so->so_linger = head->so_linger; 170 so->so_state = head->so_state | SS_NOFDREF; 171 so->so_proto = head->so_proto; 172 so->so_timeo = head->so_timeo; 173 so->so_pgid = head->so_pgid; 174 so->so_send = head->so_send; 175 so->so_receive = head->so_receive; 176 so->so_uidinfo = head->so_uidinfo; 177#ifdef MBUFTRACE 178 so->so_mowner = head->so_mowner; 179 so->so_rcv.sb_mowner = head->so_rcv.sb_mowner; 180 so->so_snd.sb_mowner = head->so_snd.sb_mowner; 181#endif 182 (void) soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat); 183 so->so_snd.sb_lowat = head->so_snd.sb_lowat; 184 so->so_rcv.sb_lowat = head->so_rcv.sb_lowat; 185 so->so_rcv.sb_timeo = head->so_rcv.sb_timeo; 186 so->so_snd.sb_timeo = head->so_snd.sb_timeo; 187 so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE; 188 so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE; 189 soqinsque(head, so, soqueue); 190 if ((*so->so_proto->pr_usrreq)(so, PRU_ATTACH, 191 (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0, 192 (struct lwp *)0)) { 193 (void) soqremque(so, soqueue); 194 pool_put(&socket_pool, so); 195 return (NULL); 196 } 197 if (connstatus) { 198 sorwakeup(head); 199 wakeup((void *)&head->so_timeo); 200 so->so_state |= connstatus; 201 } 202 return (so); 203} 204 205void 206soqinsque(struct socket *head, struct socket *so, int q) 207{ 208 209#ifdef DIAGNOSTIC 210 if (so->so_onq != NULL) 211 panic("soqinsque"); 212#endif 213 214 so->so_head = head; 215 if (q == 0) { 216 head->so_q0len++; 217 so->so_onq = &head->so_q0; 218 } else { 219 head->so_qlen++; 220 so->so_onq = &head->so_q; 221 } 222 TAILQ_INSERT_TAIL(so->so_onq, so, so_qe); 223} 224 225int 226soqremque(struct socket *so, int q) 227{ 228 struct socket *head; 229 230 head = so->so_head; 231 if (q == 0) { 232 if (so->so_onq != &head->so_q0) 233 return (0); 234 head->so_q0len--; 235 } else { 236 if (so->so_onq != &head->so_q) 237 return (0); 238 head->so_qlen--; 239 } 240 TAILQ_REMOVE(so->so_onq, so, so_qe); 241 so->so_onq = NULL; 242 so->so_head = NULL; 243 return (1); 244} 245 246/* 247 * Socantsendmore indicates that no more data will be sent on the 248 * socket; it would normally be applied to a socket when the user 249 * informs the system that no more data is to be sent, by the protocol 250 * code (in case PRU_SHUTDOWN). Socantrcvmore indicates that no more data 251 * will be received, and will normally be applied to the socket by a 252 * protocol when it detects that the peer will send no more data. 253 * Data queued for reading in the socket may yet be read. 254 */ 255 256void 257socantsendmore(struct socket *so) 258{ 259 260 so->so_state |= SS_CANTSENDMORE; 261 sowwakeup(so); 262} 263 264void 265socantrcvmore(struct socket *so) 266{ 267 268 so->so_state |= SS_CANTRCVMORE; 269 sorwakeup(so); 270} 271 272/* 273 * Wait for data to arrive at/drain from a socket buffer. 274 */ 275int 276sbwait(struct sockbuf *sb) 277{ 278 279 sb->sb_flags |= SB_WAIT; 280 return (tsleep((void *)&sb->sb_cc, 281 (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, netio, 282 sb->sb_timeo)); 283} 284 285/* 286 * Lock a sockbuf already known to be locked; 287 * return any error returned from sleep (EINTR). 288 */ 289int 290sb_lock(struct sockbuf *sb) 291{ 292 int error; 293 294 while (sb->sb_flags & SB_LOCK) { 295 sb->sb_flags |= SB_WANT; 296 error = tsleep((void *)&sb->sb_flags, 297 (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK|PCATCH, 298 netlck, 0); 299 if (error) 300 return (error); 301 } 302 sb->sb_flags |= SB_LOCK; 303 return (0); 304} 305 306/* 307 * Wakeup processes waiting on a socket buffer. 308 * Do asynchronous notification via SIGIO 309 * if the socket buffer has the SB_ASYNC flag set. 310 */ 311void 312sowakeup(struct socket *so, struct sockbuf *sb, int code) 313{ 314 selnotify(&sb->sb_sel, 0); 315 sb->sb_flags &= ~SB_SEL; 316 if (sb->sb_flags & SB_WAIT) { 317 sb->sb_flags &= ~SB_WAIT; 318 wakeup((void *)&sb->sb_cc); 319 } 320 if (sb->sb_flags & SB_ASYNC) { 321 int band; 322 if (code == POLL_IN) 323 band = POLLIN|POLLRDNORM; 324 else 325 band = POLLOUT|POLLWRNORM; 326 fownsignal(so->so_pgid, SIGIO, code, band, so); 327 } 328 if (sb->sb_flags & SB_UPCALL) 329 (*so->so_upcall)(so, so->so_upcallarg, M_DONTWAIT); 330} 331 332/* 333 * Socket buffer (struct sockbuf) utility routines. 334 * 335 * Each socket contains two socket buffers: one for sending data and 336 * one for receiving data. Each buffer contains a queue of mbufs, 337 * information about the number of mbufs and amount of data in the 338 * queue, and other fields allowing poll() statements and notification 339 * on data availability to be implemented. 340 * 341 * Data stored in a socket buffer is maintained as a list of records. 342 * Each record is a list of mbufs chained together with the m_next 343 * field. Records are chained together with the m_nextpkt field. The upper 344 * level routine soreceive() expects the following conventions to be 345 * observed when placing information in the receive buffer: 346 * 347 * 1. If the protocol requires each message be preceded by the sender's 348 * name, then a record containing that name must be present before 349 * any associated data (mbuf's must be of type MT_SONAME). 350 * 2. If the protocol supports the exchange of ``access rights'' (really 351 * just additional data associated with the message), and there are 352 * ``rights'' to be received, then a record containing this data 353 * should be present (mbuf's must be of type MT_CONTROL). 354 * 3. If a name or rights record exists, then it must be followed by 355 * a data record, perhaps of zero length. 356 * 357 * Before using a new socket structure it is first necessary to reserve 358 * buffer space to the socket, by calling sbreserve(). This should commit 359 * some of the available buffer space in the system buffer pool for the 360 * socket (currently, it does nothing but enforce limits). The space 361 * should be released by calling sbrelease() when the socket is destroyed. 362 */ 363 364int 365sb_max_set(u_long new_sbmax) 366{ 367 int s; 368 369 if (new_sbmax < (16 * 1024)) 370 return (EINVAL); 371 372 s = splsoftnet(); 373 sb_max = new_sbmax; 374 sb_max_adj = (u_quad_t)new_sbmax * MCLBYTES / (MSIZE + MCLBYTES); 375 splx(s); 376 377 return (0); 378} 379 380int 381soreserve(struct socket *so, u_long sndcc, u_long rcvcc) 382{ 383 /* 384 * there's at least one application (a configure script of screen) 385 * which expects a fifo is writable even if it has "some" bytes 386 * in its buffer. 387 * so we want to make sure (hiwat - lowat) >= (some bytes). 388 * 389 * PIPE_BUF here is an arbitrary value chosen as (some bytes) above. 390 * we expect it's large enough for such applications. 391 */ 392 u_long lowat = MAX(sock_loan_thresh, MCLBYTES); 393 u_long hiwat = lowat + PIPE_BUF; 394 395 if (sndcc < hiwat) 396 sndcc = hiwat; 397 if (sbreserve(&so->so_snd, sndcc, so) == 0) 398 goto bad; 399 if (sbreserve(&so->so_rcv, rcvcc, so) == 0) 400 goto bad2; 401 if (so->so_rcv.sb_lowat == 0) 402 so->so_rcv.sb_lowat = 1; 403 if (so->so_snd.sb_lowat == 0) 404 so->so_snd.sb_lowat = lowat; 405 if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat) 406 so->so_snd.sb_lowat = so->so_snd.sb_hiwat; 407 return (0); 408 bad2: 409 sbrelease(&so->so_snd, so); 410 bad: 411 return (ENOBUFS); 412} 413 414/* 415 * Allot mbufs to a sockbuf. 416 * Attempt to scale mbmax so that mbcnt doesn't become limiting 417 * if buffering efficiency is near the normal case. 418 */ 419int 420sbreserve(struct sockbuf *sb, u_long cc, struct socket *so) 421{ 422 struct lwp *l = curlwp; /* XXX */ 423 rlim_t maxcc; 424 struct uidinfo *uidinfo; 425 426 KDASSERT(sb_max_adj != 0); 427 if (cc == 0 || cc > sb_max_adj) 428 return (0); 429 if (so) { 430 if (l && kauth_cred_geteuid(l->l_cred) == so->so_uidinfo->ui_uid) 431 maxcc = l->l_proc->p_rlimit[RLIMIT_SBSIZE].rlim_cur; 432 else 433 maxcc = RLIM_INFINITY; 434 uidinfo = so->so_uidinfo; 435 } else { 436 uidinfo = uid_find(0); /* XXX: nothing better */ 437 maxcc = RLIM_INFINITY; 438 } 439 if (!chgsbsize(uidinfo, &sb->sb_hiwat, cc, maxcc)) 440 return 0; 441 sb->sb_mbmax = min(cc * 2, sb_max); 442 if (sb->sb_lowat > sb->sb_hiwat) 443 sb->sb_lowat = sb->sb_hiwat; 444 return (1); 445} 446 447/* 448 * Free mbufs held by a socket, and reserved mbuf space. 449 */ 450void 451sbrelease(struct sockbuf *sb, struct socket *so) 452{ 453 454 sbflush(sb); 455 (void)chgsbsize(so->so_uidinfo, &sb->sb_hiwat, 0, 456 RLIM_INFINITY); 457 sb->sb_mbmax = 0; 458} 459 460/* 461 * Routines to add and remove 462 * data from an mbuf queue. 463 * 464 * The routines sbappend() or sbappendrecord() are normally called to 465 * append new mbufs to a socket buffer, after checking that adequate 466 * space is available, comparing the function sbspace() with the amount 467 * of data to be added. sbappendrecord() differs from sbappend() in 468 * that data supplied is treated as the beginning of a new record. 469 * To place a sender's address, optional access rights, and data in a 470 * socket receive buffer, sbappendaddr() should be used. To place 471 * access rights and data in a socket receive buffer, sbappendrights() 472 * should be used. In either case, the new data begins a new record. 473 * Note that unlike sbappend() and sbappendrecord(), these routines check 474 * for the caller that there will be enough space to store the data. 475 * Each fails if there is not enough space, or if it cannot find mbufs 476 * to store additional information in. 477 * 478 * Reliable protocols may use the socket send buffer to hold data 479 * awaiting acknowledgement. Data is normally copied from a socket 480 * send buffer in a protocol with m_copy for output to a peer, 481 * and then removing the data from the socket buffer with sbdrop() 482 * or sbdroprecord() when the data is acknowledged by the peer. 483 */ 484 485#ifdef SOCKBUF_DEBUG 486void 487sblastrecordchk(struct sockbuf *sb, const char *where) 488{ 489 struct mbuf *m = sb->sb_mb; 490 491 while (m && m->m_nextpkt) 492 m = m->m_nextpkt; 493 494 if (m != sb->sb_lastrecord) { 495 printf("sblastrecordchk: sb_mb %p sb_lastrecord %p last %p\n", 496 sb->sb_mb, sb->sb_lastrecord, m); 497 printf("packet chain:\n"); 498 for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) 499 printf("\t%p\n", m); 500 panic("sblastrecordchk from %s", where); 501 } 502} 503 504void 505sblastmbufchk(struct sockbuf *sb, const char *where) 506{ 507 struct mbuf *m = sb->sb_mb; 508 struct mbuf *n; 509 510 while (m && m->m_nextpkt) 511 m = m->m_nextpkt; 512 513 while (m && m->m_next) 514 m = m->m_next; 515 516 if (m != sb->sb_mbtail) { 517 printf("sblastmbufchk: sb_mb %p sb_mbtail %p last %p\n", 518 sb->sb_mb, sb->sb_mbtail, m); 519 printf("packet tree:\n"); 520 for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) { 521 printf("\t"); 522 for (n = m; n != NULL; n = n->m_next) 523 printf("%p ", n); 524 printf("\n"); 525 } 526 panic("sblastmbufchk from %s", where); 527 } 528} 529#endif /* SOCKBUF_DEBUG */ 530 531/* 532 * Link a chain of records onto a socket buffer 533 */ 534#define SBLINKRECORDCHAIN(sb, m0, mlast) \ 535do { \ 536 if ((sb)->sb_lastrecord != NULL) \ 537 (sb)->sb_lastrecord->m_nextpkt = (m0); \ 538 else \ 539 (sb)->sb_mb = (m0); \ 540 (sb)->sb_lastrecord = (mlast); \ 541} while (/*CONSTCOND*/0) 542 543 544#define SBLINKRECORD(sb, m0) \ 545 SBLINKRECORDCHAIN(sb, m0, m0) 546 547/* 548 * Append mbuf chain m to the last record in the 549 * socket buffer sb. The additional space associated 550 * the mbuf chain is recorded in sb. Empty mbufs are 551 * discarded and mbufs are compacted where possible. 552 */ 553void 554sbappend(struct sockbuf *sb, struct mbuf *m) 555{ 556 struct mbuf *n; 557 558 if (m == 0) 559 return; 560 561#ifdef MBUFTRACE 562 m_claimm(m, sb->sb_mowner); 563#endif 564 565 SBLASTRECORDCHK(sb, "sbappend 1"); 566 567 if ((n = sb->sb_lastrecord) != NULL) { 568 /* 569 * XXX Would like to simply use sb_mbtail here, but 570 * XXX I need to verify that I won't miss an EOR that 571 * XXX way. 572 */ 573 do { 574 if (n->m_flags & M_EOR) { 575 sbappendrecord(sb, m); /* XXXXXX!!!! */ 576 return; 577 } 578 } while (n->m_next && (n = n->m_next)); 579 } else { 580 /* 581 * If this is the first record in the socket buffer, it's 582 * also the last record. 583 */ 584 sb->sb_lastrecord = m; 585 } 586 sbcompress(sb, m, n); 587 SBLASTRECORDCHK(sb, "sbappend 2"); 588} 589 590/* 591 * This version of sbappend() should only be used when the caller 592 * absolutely knows that there will never be more than one record 593 * in the socket buffer, that is, a stream protocol (such as TCP). 594 */ 595void 596sbappendstream(struct sockbuf *sb, struct mbuf *m) 597{ 598 599 KDASSERT(m->m_nextpkt == NULL); 600 KASSERT(sb->sb_mb == sb->sb_lastrecord); 601 602 SBLASTMBUFCHK(sb, __func__); 603 604#ifdef MBUFTRACE 605 m_claimm(m, sb->sb_mowner); 606#endif 607 608 sbcompress(sb, m, sb->sb_mbtail); 609 610 sb->sb_lastrecord = sb->sb_mb; 611 SBLASTRECORDCHK(sb, __func__); 612} 613 614#ifdef SOCKBUF_DEBUG 615void 616sbcheck(struct sockbuf *sb) 617{ 618 struct mbuf *m; 619 u_long len, mbcnt; 620 621 len = 0; 622 mbcnt = 0; 623 for (m = sb->sb_mb; m; m = m->m_next) { 624 len += m->m_len; 625 mbcnt += MSIZE; 626 if (m->m_flags & M_EXT) 627 mbcnt += m->m_ext.ext_size; 628 if (m->m_nextpkt) 629 panic("sbcheck nextpkt"); 630 } 631 if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) { 632 printf("cc %lu != %lu || mbcnt %lu != %lu\n", len, sb->sb_cc, 633 mbcnt, sb->sb_mbcnt); 634 panic("sbcheck"); 635 } 636} 637#endif 638 639/* 640 * As above, except the mbuf chain 641 * begins a new record. 642 */ 643void 644sbappendrecord(struct sockbuf *sb, struct mbuf *m0) 645{ 646 struct mbuf *m; 647 648 if (m0 == 0) 649 return; 650 651#ifdef MBUFTRACE 652 m_claimm(m0, sb->sb_mowner); 653#endif 654 /* 655 * Put the first mbuf on the queue. 656 * Note this permits zero length records. 657 */ 658 sballoc(sb, m0); 659 SBLASTRECORDCHK(sb, "sbappendrecord 1"); 660 SBLINKRECORD(sb, m0); 661 m = m0->m_next; 662 m0->m_next = 0; 663 if (m && (m0->m_flags & M_EOR)) { 664 m0->m_flags &= ~M_EOR; 665 m->m_flags |= M_EOR; 666 } 667 sbcompress(sb, m, m0); 668 SBLASTRECORDCHK(sb, "sbappendrecord 2"); 669} 670 671/* 672 * As above except that OOB data 673 * is inserted at the beginning of the sockbuf, 674 * but after any other OOB data. 675 */ 676void 677sbinsertoob(struct sockbuf *sb, struct mbuf *m0) 678{ 679 struct mbuf *m, **mp; 680 681 if (m0 == 0) 682 return; 683 684 SBLASTRECORDCHK(sb, "sbinsertoob 1"); 685 686 for (mp = &sb->sb_mb; (m = *mp) != NULL; mp = &((*mp)->m_nextpkt)) { 687 again: 688 switch (m->m_type) { 689 690 case MT_OOBDATA: 691 continue; /* WANT next train */ 692 693 case MT_CONTROL: 694 if ((m = m->m_next) != NULL) 695 goto again; /* inspect THIS train further */ 696 } 697 break; 698 } 699 /* 700 * Put the first mbuf on the queue. 701 * Note this permits zero length records. 702 */ 703 sballoc(sb, m0); 704 m0->m_nextpkt = *mp; 705 if (*mp == NULL) { 706 /* m0 is actually the new tail */ 707 sb->sb_lastrecord = m0; 708 } 709 *mp = m0; 710 m = m0->m_next; 711 m0->m_next = 0; 712 if (m && (m0->m_flags & M_EOR)) { 713 m0->m_flags &= ~M_EOR; 714 m->m_flags |= M_EOR; 715 } 716 sbcompress(sb, m, m0); 717 SBLASTRECORDCHK(sb, "sbinsertoob 2"); 718} 719 720/* 721 * Append address and data, and optionally, control (ancillary) data 722 * to the receive queue of a socket. If present, 723 * m0 must include a packet header with total length. 724 * Returns 0 if no space in sockbuf or insufficient mbufs. 725 */ 726int 727sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0, 728 struct mbuf *control) 729{ 730 struct mbuf *m, *n, *nlast; 731 int space, len; 732 733 space = asa->sa_len; 734 735 if (m0 != NULL) { 736 if ((m0->m_flags & M_PKTHDR) == 0) 737 panic("sbappendaddr"); 738 space += m0->m_pkthdr.len; 739#ifdef MBUFTRACE 740 m_claimm(m0, sb->sb_mowner); 741#endif 742 } 743 for (n = control; n; n = n->m_next) { 744 space += n->m_len; 745 MCLAIM(n, sb->sb_mowner); 746 if (n->m_next == 0) /* keep pointer to last control buf */ 747 break; 748 } 749 if (space > sbspace(sb)) 750 return (0); 751 MGET(m, M_DONTWAIT, MT_SONAME); 752 if (m == 0) 753 return (0); 754 MCLAIM(m, sb->sb_mowner); 755 /* 756 * XXX avoid 'comparison always true' warning which isn't easily 757 * avoided. 758 */ 759 len = asa->sa_len; 760 if (len > MLEN) { 761 MEXTMALLOC(m, asa->sa_len, M_NOWAIT); 762 if ((m->m_flags & M_EXT) == 0) { 763 m_free(m); 764 return (0); 765 } 766 } 767 m->m_len = asa->sa_len; 768 memcpy(mtod(m, void *), asa, asa->sa_len); 769 if (n) 770 n->m_next = m0; /* concatenate data to control */ 771 else 772 control = m0; 773 m->m_next = control; 774 775 SBLASTRECORDCHK(sb, "sbappendaddr 1"); 776 777 for (n = m; n->m_next != NULL; n = n->m_next) 778 sballoc(sb, n); 779 sballoc(sb, n); 780 nlast = n; 781 SBLINKRECORD(sb, m); 782 783 sb->sb_mbtail = nlast; 784 SBLASTMBUFCHK(sb, "sbappendaddr"); 785 786 SBLASTRECORDCHK(sb, "sbappendaddr 2"); 787 788 return (1); 789} 790 791/* 792 * Helper for sbappendchainaddr: prepend a struct sockaddr* to 793 * an mbuf chain. 794 */ 795static inline struct mbuf * 796m_prepend_sockaddr(struct sockbuf *sb, struct mbuf *m0, 797 const struct sockaddr *asa) 798{ 799 struct mbuf *m; 800 const int salen = asa->sa_len; 801 802 /* only the first in each chain need be a pkthdr */ 803 MGETHDR(m, M_DONTWAIT, MT_SONAME); 804 if (m == 0) 805 return (0); 806 MCLAIM(m, sb->sb_mowner); 807#ifdef notyet 808 if (salen > MHLEN) { 809 MEXTMALLOC(m, salen, M_NOWAIT); 810 if ((m->m_flags & M_EXT) == 0) { 811 m_free(m); 812 return (0); 813 } 814 } 815#else 816 KASSERT(salen <= MHLEN); 817#endif 818 m->m_len = salen; 819 memcpy(mtod(m, void *), asa, salen); 820 m->m_next = m0; 821 m->m_pkthdr.len = salen + m0->m_pkthdr.len; 822 823 return m; 824} 825 826int 827sbappendaddrchain(struct sockbuf *sb, const struct sockaddr *asa, 828 struct mbuf *m0, int sbprio) 829{ 830 int space; 831 struct mbuf *m, *n, *n0, *nlast; 832 int error; 833 834 /* 835 * XXX sbprio reserved for encoding priority of this* request: 836 * SB_PRIO_NONE --> honour normal sb limits 837 * SB_PRIO_ONESHOT_OVERFLOW --> if socket has any space, 838 * take whole chain. Intended for large requests 839 * that should be delivered atomically (all, or none). 840 * SB_PRIO_OVERDRAFT -- allow a small (2*MLEN) overflow 841 * over normal socket limits, for messages indicating 842 * buffer overflow in earlier normal/lower-priority messages 843 * SB_PRIO_BESTEFFORT --> ignore limits entirely. 844 * Intended for kernel-generated messages only. 845 * Up to generator to avoid total mbuf resource exhaustion. 846 */ 847 (void)sbprio; 848 849 if (m0 && (m0->m_flags & M_PKTHDR) == 0) 850 panic("sbappendaddrchain"); 851 852 space = sbspace(sb); 853 854#ifdef notyet 855 /* 856 * Enforce SB_PRIO_* limits as described above. 857 */ 858#endif 859 860 n0 = NULL; 861 nlast = NULL; 862 for (m = m0; m; m = m->m_nextpkt) { 863 struct mbuf *np; 864 865#ifdef MBUFTRACE 866 m_claimm(m, sb->sb_mowner); 867#endif 868 869 /* Prepend sockaddr to this record (m) of input chain m0 */ 870 n = m_prepend_sockaddr(sb, m, asa); 871 if (n == NULL) { 872 error = ENOBUFS; 873 goto bad; 874 } 875 876 /* Append record (asa+m) to end of new chain n0 */ 877 if (n0 == NULL) { 878 n0 = n; 879 } else { 880 nlast->m_nextpkt = n; 881 } 882 /* Keep track of last record on new chain */ 883 nlast = n; 884 885 for (np = n; np; np = np->m_next) 886 sballoc(sb, np); 887 } 888 889 SBLASTRECORDCHK(sb, "sbappendaddrchain 1"); 890 891 /* Drop the entire chain of (asa+m) records onto the socket */ 892 SBLINKRECORDCHAIN(sb, n0, nlast); 893 894 SBLASTRECORDCHK(sb, "sbappendaddrchain 2"); 895 896 for (m = nlast; m->m_next; m = m->m_next) 897 ; 898 sb->sb_mbtail = m; 899 SBLASTMBUFCHK(sb, "sbappendaddrchain"); 900 901 return (1); 902 903bad: 904 /* 905 * On error, free the prepended addreseses. For consistency 906 * with sbappendaddr(), leave it to our caller to free 907 * the input record chain passed to us as m0. 908 */ 909 while ((n = n0) != NULL) { 910 struct mbuf *np; 911 912 /* Undo the sballoc() of this record */ 913 for (np = n; np; np = np->m_next) 914 sbfree(sb, np); 915 916 n0 = n->m_nextpkt; /* iterate at next prepended address */ 917 MFREE(n, np); /* free prepended address (not data) */ 918 } 919 return 0; 920} 921 922 923int 924sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control) 925{ 926 struct mbuf *m, *mlast, *n; 927 int space; 928 929 space = 0; 930 if (control == 0) 931 panic("sbappendcontrol"); 932 for (m = control; ; m = m->m_next) { 933 space += m->m_len; 934 MCLAIM(m, sb->sb_mowner); 935 if (m->m_next == 0) 936 break; 937 } 938 n = m; /* save pointer to last control buffer */ 939 for (m = m0; m; m = m->m_next) { 940 MCLAIM(m, sb->sb_mowner); 941 space += m->m_len; 942 } 943 if (space > sbspace(sb)) 944 return (0); 945 n->m_next = m0; /* concatenate data to control */ 946 947 SBLASTRECORDCHK(sb, "sbappendcontrol 1"); 948 949 for (m = control; m->m_next != NULL; m = m->m_next) 950 sballoc(sb, m); 951 sballoc(sb, m); 952 mlast = m; 953 SBLINKRECORD(sb, control); 954 955 sb->sb_mbtail = mlast; 956 SBLASTMBUFCHK(sb, "sbappendcontrol"); 957 958 SBLASTRECORDCHK(sb, "sbappendcontrol 2"); 959 960 return (1); 961} 962 963/* 964 * Compress mbuf chain m into the socket 965 * buffer sb following mbuf n. If n 966 * is null, the buffer is presumed empty. 967 */ 968void 969sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n) 970{ 971 int eor; 972 struct mbuf *o; 973 974 eor = 0; 975 while (m) { 976 eor |= m->m_flags & M_EOR; 977 if (m->m_len == 0 && 978 (eor == 0 || 979 (((o = m->m_next) || (o = n)) && 980 o->m_type == m->m_type))) { 981 if (sb->sb_lastrecord == m) 982 sb->sb_lastrecord = m->m_next; 983 m = m_free(m); 984 continue; 985 } 986 if (n && (n->m_flags & M_EOR) == 0 && 987 /* M_TRAILINGSPACE() checks buffer writeability */ 988 m->m_len <= MCLBYTES / 4 && /* XXX Don't copy too much */ 989 m->m_len <= M_TRAILINGSPACE(n) && 990 n->m_type == m->m_type) { 991 memcpy(mtod(n, char *) + n->m_len, mtod(m, void *), 992 (unsigned)m->m_len); 993 n->m_len += m->m_len; 994 sb->sb_cc += m->m_len; 995 m = m_free(m); 996 continue; 997 } 998 if (n) 999 n->m_next = m; 1000 else 1001 sb->sb_mb = m; 1002 sb->sb_mbtail = m; 1003 sballoc(sb, m); 1004 n = m; 1005 m->m_flags &= ~M_EOR; 1006 m = m->m_next; 1007 n->m_next = 0; 1008 } 1009 if (eor) { 1010 if (n) 1011 n->m_flags |= eor; 1012 else 1013 printf("semi-panic: sbcompress\n"); 1014 } 1015 SBLASTMBUFCHK(sb, __func__); 1016} 1017 1018/* 1019 * Free all mbufs in a sockbuf. 1020 * Check that all resources are reclaimed. 1021 */ 1022void 1023sbflush(struct sockbuf *sb) 1024{ 1025 1026 KASSERT((sb->sb_flags & SB_LOCK) == 0); 1027 1028 while (sb->sb_mbcnt) 1029 sbdrop(sb, (int)sb->sb_cc); 1030 1031 KASSERT(sb->sb_cc == 0); 1032 KASSERT(sb->sb_mb == NULL); 1033 KASSERT(sb->sb_mbtail == NULL); 1034 KASSERT(sb->sb_lastrecord == NULL); 1035} 1036 1037/* 1038 * Drop data from (the front of) a sockbuf. 1039 */ 1040void 1041sbdrop(struct sockbuf *sb, int len) 1042{ 1043 struct mbuf *m, *mn, *next; 1044 1045 next = (m = sb->sb_mb) ? m->m_nextpkt : 0; 1046 while (len > 0) { 1047 if (m == 0) { 1048 if (next == 0) 1049 panic("sbdrop"); 1050 m = next; 1051 next = m->m_nextpkt; 1052 continue; 1053 } 1054 if (m->m_len > len) { 1055 m->m_len -= len; 1056 m->m_data += len; 1057 sb->sb_cc -= len; 1058 break; 1059 } 1060 len -= m->m_len; 1061 sbfree(sb, m); 1062 MFREE(m, mn); 1063 m = mn; 1064 } 1065 while (m && m->m_len == 0) { 1066 sbfree(sb, m); 1067 MFREE(m, mn); 1068 m = mn; 1069 } 1070 if (m) { 1071 sb->sb_mb = m; 1072 m->m_nextpkt = next; 1073 } else 1074 sb->sb_mb = next; 1075 /* 1076 * First part is an inline SB_EMPTY_FIXUP(). Second part 1077 * makes sure sb_lastrecord is up-to-date if we dropped 1078 * part of the last record. 1079 */ 1080 m = sb->sb_mb; 1081 if (m == NULL) { 1082 sb->sb_mbtail = NULL; 1083 sb->sb_lastrecord = NULL; 1084 } else if (m->m_nextpkt == NULL) 1085 sb->sb_lastrecord = m; 1086} 1087 1088/* 1089 * Drop a record off the front of a sockbuf 1090 * and move the next record to the front. 1091 */ 1092void 1093sbdroprecord(struct sockbuf *sb) 1094{ 1095 struct mbuf *m, *mn; 1096 1097 m = sb->sb_mb; 1098 if (m) { 1099 sb->sb_mb = m->m_nextpkt; 1100 do { 1101 sbfree(sb, m); 1102 MFREE(m, mn); 1103 } while ((m = mn) != NULL); 1104 } 1105 SB_EMPTY_FIXUP(sb); 1106} 1107 1108/* 1109 * Create a "control" mbuf containing the specified data 1110 * with the specified type for presentation on a socket buffer. 1111 */ 1112struct mbuf * 1113sbcreatecontrol(void *p, int size, int type, int level) 1114{ 1115 struct cmsghdr *cp; 1116 struct mbuf *m; 1117 1118 if (CMSG_SPACE(size) > MCLBYTES) { 1119 printf("sbcreatecontrol: message too large %d\n", size); 1120 return NULL; 1121 } 1122 1123 if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL) 1124 return ((struct mbuf *) NULL); 1125 if (CMSG_SPACE(size) > MLEN) { 1126 MCLGET(m, M_DONTWAIT); 1127 if ((m->m_flags & M_EXT) == 0) { 1128 m_free(m); 1129 return NULL; 1130 } 1131 } 1132 cp = mtod(m, struct cmsghdr *); 1133 memcpy(CMSG_DATA(cp), p, size); 1134 m->m_len = CMSG_SPACE(size); 1135 cp->cmsg_len = CMSG_LEN(size); 1136 cp->cmsg_level = level; 1137 cp->cmsg_type = type; 1138 return (m); 1139} 1140