/* uipc_socket2.c revision 1.70 */
1/* $NetBSD: uipc_socket2.c,v 1.70 2005/12/24 19:12:23 perry Exp $ */ 2 3/* 4 * Copyright (c) 1982, 1986, 1988, 1990, 1993 5 * The Regents of the University of California. All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 3. Neither the name of the University nor the names of its contributors 16 * may be used to endorse or promote products derived from this software 17 * without specific prior written permission. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 
30 * 31 * @(#)uipc_socket2.c 8.2 (Berkeley) 2/14/95 32 */ 33 34#include <sys/cdefs.h> 35__KERNEL_RCSID(0, "$NetBSD: uipc_socket2.c,v 1.70 2005/12/24 19:12:23 perry Exp $"); 36 37#include "opt_mbuftrace.h" 38#include "opt_sb_max.h" 39 40#include <sys/param.h> 41#include <sys/systm.h> 42#include <sys/proc.h> 43#include <sys/file.h> 44#include <sys/buf.h> 45#include <sys/malloc.h> 46#include <sys/mbuf.h> 47#include <sys/protosw.h> 48#include <sys/poll.h> 49#include <sys/socket.h> 50#include <sys/socketvar.h> 51#include <sys/signalvar.h> 52 53/* 54 * Primitive routines for operating on sockets and socket buffers 55 */ 56 57/* strings for sleep message: */ 58const char netcon[] = "netcon"; 59const char netcls[] = "netcls"; 60const char netio[] = "netio"; 61const char netlck[] = "netlck"; 62 63u_long sb_max = SB_MAX; /* maximum socket buffer size */ 64static u_long sb_max_adj; /* adjusted sb_max */ 65 66/* 67 * Procedures to manipulate state flags of socket 68 * and do appropriate wakeups. Normal sequence from the 69 * active (originating) side is that soisconnecting() is 70 * called during processing of connect() call, 71 * resulting in an eventual call to soisconnected() if/when the 72 * connection is established. When the connection is torn down 73 * soisdisconnecting() is called during processing of disconnect() call, 74 * and soisdisconnected() is called when the connection to the peer 75 * is totally severed. The semantics of these routines are such that 76 * connectionless protocols can call soisconnected() and soisdisconnected() 77 * only, bypassing the in-progress calls when setting up a ``connection'' 78 * takes no time. 79 * 80 * From the passive side, a socket is created with 81 * two queues of sockets: so_q0 for connections in progress 82 * and so_q for connections already made and awaiting user acceptance. 83 * As a protocol is preparing incoming connections, it creates a socket 84 * structure queued on so_q0 by calling sonewconn(). 
When the connection 85 * is established, soisconnected() is called, and transfers the 86 * socket structure to so_q, making it available to accept(). 87 * 88 * If a socket is closed with sockets on either 89 * so_q0 or so_q, these sockets are dropped. 90 * 91 * If higher level protocols are implemented in 92 * the kernel, the wakeups done here will sometimes 93 * cause software-interrupt process scheduling. 94 */ 95 96void 97soisconnecting(struct socket *so) 98{ 99 100 so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING); 101 so->so_state |= SS_ISCONNECTING; 102} 103 104void 105soisconnected(struct socket *so) 106{ 107 struct socket *head; 108 109 head = so->so_head; 110 so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING); 111 so->so_state |= SS_ISCONNECTED; 112 if (head && soqremque(so, 0)) { 113 soqinsque(head, so, 1); 114 sorwakeup(head); 115 wakeup((caddr_t)&head->so_timeo); 116 } else { 117 wakeup((caddr_t)&so->so_timeo); 118 sorwakeup(so); 119 sowwakeup(so); 120 } 121} 122 123void 124soisdisconnecting(struct socket *so) 125{ 126 127 so->so_state &= ~SS_ISCONNECTING; 128 so->so_state |= (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE); 129 wakeup((caddr_t)&so->so_timeo); 130 sowwakeup(so); 131 sorwakeup(so); 132} 133 134void 135soisdisconnected(struct socket *so) 136{ 137 138 so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); 139 so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_ISDISCONNECTED); 140 wakeup((caddr_t)&so->so_timeo); 141 sowwakeup(so); 142 sorwakeup(so); 143} 144 145/* 146 * When an attempt at a new connection is noted on a socket 147 * which accepts connections, sonewconn is called. If the 148 * connection is possible (subject to space constraints, etc.) 149 * then we allocate a new structure, propoerly linked into the 150 * data structure of the original socket, and return this. 151 * Connstatus may be 0, or SO_ISCONFIRMING, or SO_ISCONNECTED. 
152 * 153 * Currently, sonewconn() is defined as sonewconn1() in socketvar.h 154 * to catch calls that are missing the (new) second parameter. 155 */ 156struct socket * 157sonewconn1(struct socket *head, int connstatus) 158{ 159 struct socket *so; 160 int soqueue; 161 162 soqueue = connstatus ? 1 : 0; 163 if (head->so_qlen + head->so_q0len > 3 * head->so_qlimit / 2) 164 return ((struct socket *)0); 165 so = pool_get(&socket_pool, PR_NOWAIT); 166 if (so == NULL) 167 return (NULL); 168 memset((caddr_t)so, 0, sizeof(*so)); 169 so->so_type = head->so_type; 170 so->so_options = head->so_options &~ SO_ACCEPTCONN; 171 so->so_linger = head->so_linger; 172 so->so_state = head->so_state | SS_NOFDREF; 173 so->so_proto = head->so_proto; 174 so->so_timeo = head->so_timeo; 175 so->so_pgid = head->so_pgid; 176 so->so_send = head->so_send; 177 so->so_receive = head->so_receive; 178 so->so_uidinfo = head->so_uidinfo; 179#ifdef MBUFTRACE 180 so->so_mowner = head->so_mowner; 181 so->so_rcv.sb_mowner = head->so_rcv.sb_mowner; 182 so->so_snd.sb_mowner = head->so_snd.sb_mowner; 183#endif 184 (void) soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat); 185 soqinsque(head, so, soqueue); 186 if ((*so->so_proto->pr_usrreq)(so, PRU_ATTACH, 187 (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0, 188 (struct lwp *)0)) { 189 (void) soqremque(so, soqueue); 190 pool_put(&socket_pool, so); 191 return (NULL); 192 } 193 if (connstatus) { 194 sorwakeup(head); 195 wakeup((caddr_t)&head->so_timeo); 196 so->so_state |= connstatus; 197 } 198 return (so); 199} 200 201void 202soqinsque(struct socket *head, struct socket *so, int q) 203{ 204 205#ifdef DIAGNOSTIC 206 if (so->so_onq != NULL) 207 panic("soqinsque"); 208#endif 209 210 so->so_head = head; 211 if (q == 0) { 212 head->so_q0len++; 213 so->so_onq = &head->so_q0; 214 } else { 215 head->so_qlen++; 216 so->so_onq = &head->so_q; 217 } 218 TAILQ_INSERT_TAIL(so->so_onq, so, so_qe); 219} 220 221int 222soqremque(struct socket *so, int q) 223{ 224 
struct socket *head; 225 226 head = so->so_head; 227 if (q == 0) { 228 if (so->so_onq != &head->so_q0) 229 return (0); 230 head->so_q0len--; 231 } else { 232 if (so->so_onq != &head->so_q) 233 return (0); 234 head->so_qlen--; 235 } 236 TAILQ_REMOVE(so->so_onq, so, so_qe); 237 so->so_onq = NULL; 238 so->so_head = NULL; 239 return (1); 240} 241 242/* 243 * Socantsendmore indicates that no more data will be sent on the 244 * socket; it would normally be applied to a socket when the user 245 * informs the system that no more data is to be sent, by the protocol 246 * code (in case PRU_SHUTDOWN). Socantrcvmore indicates that no more data 247 * will be received, and will normally be applied to the socket by a 248 * protocol when it detects that the peer will send no more data. 249 * Data queued for reading in the socket may yet be read. 250 */ 251 252void 253socantsendmore(struct socket *so) 254{ 255 256 so->so_state |= SS_CANTSENDMORE; 257 sowwakeup(so); 258} 259 260void 261socantrcvmore(struct socket *so) 262{ 263 264 so->so_state |= SS_CANTRCVMORE; 265 sorwakeup(so); 266} 267 268/* 269 * Wait for data to arrive at/drain from a socket buffer. 270 */ 271int 272sbwait(struct sockbuf *sb) 273{ 274 275 sb->sb_flags |= SB_WAIT; 276 return (tsleep((caddr_t)&sb->sb_cc, 277 (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, netio, 278 sb->sb_timeo)); 279} 280 281/* 282 * Lock a sockbuf already known to be locked; 283 * return any error returned from sleep (EINTR). 284 */ 285int 286sb_lock(struct sockbuf *sb) 287{ 288 int error; 289 290 while (sb->sb_flags & SB_LOCK) { 291 sb->sb_flags |= SB_WANT; 292 error = tsleep((caddr_t)&sb->sb_flags, 293 (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK|PCATCH, 294 netlck, 0); 295 if (error) 296 return (error); 297 } 298 sb->sb_flags |= SB_LOCK; 299 return (0); 300} 301 302/* 303 * Wakeup processes waiting on a socket buffer. 304 * Do asynchronous notification via SIGIO 305 * if the socket buffer has the SB_ASYNC flag set. 
306 */ 307void 308sowakeup(struct socket *so, struct sockbuf *sb, int code) 309{ 310 selnotify(&sb->sb_sel, 0); 311 sb->sb_flags &= ~SB_SEL; 312 if (sb->sb_flags & SB_WAIT) { 313 sb->sb_flags &= ~SB_WAIT; 314 wakeup((caddr_t)&sb->sb_cc); 315 } 316 if (sb->sb_flags & SB_ASYNC) { 317 int band; 318 if (code == POLL_IN) 319 band = POLLIN|POLLRDNORM; 320 else 321 band = POLLOUT|POLLWRNORM; 322 fownsignal(so->so_pgid, SIGIO, code, band, so); 323 } 324 if (sb->sb_flags & SB_UPCALL) 325 (*so->so_upcall)(so, so->so_upcallarg, M_DONTWAIT); 326} 327 328/* 329 * Socket buffer (struct sockbuf) utility routines. 330 * 331 * Each socket contains two socket buffers: one for sending data and 332 * one for receiving data. Each buffer contains a queue of mbufs, 333 * information about the number of mbufs and amount of data in the 334 * queue, and other fields allowing poll() statements and notification 335 * on data availability to be implemented. 336 * 337 * Data stored in a socket buffer is maintained as a list of records. 338 * Each record is a list of mbufs chained together with the m_next 339 * field. Records are chained together with the m_nextpkt field. The upper 340 * level routine soreceive() expects the following conventions to be 341 * observed when placing information in the receive buffer: 342 * 343 * 1. If the protocol requires each message be preceded by the sender's 344 * name, then a record containing that name must be present before 345 * any associated data (mbuf's must be of type MT_SONAME). 346 * 2. If the protocol supports the exchange of ``access rights'' (really 347 * just additional data associated with the message), and there are 348 * ``rights'' to be received, then a record containing this data 349 * should be present (mbuf's must be of type MT_CONTROL). 350 * 3. If a name or rights record exists, then it must be followed by 351 * a data record, perhaps of zero length. 
352 * 353 * Before using a new socket structure it is first necessary to reserve 354 * buffer space to the socket, by calling sbreserve(). This should commit 355 * some of the available buffer space in the system buffer pool for the 356 * socket (currently, it does nothing but enforce limits). The space 357 * should be released by calling sbrelease() when the socket is destroyed. 358 */ 359 360int 361sb_max_set(u_long new_sbmax) 362{ 363 int s; 364 365 if (new_sbmax < (16 * 1024)) 366 return (EINVAL); 367 368 s = splsoftnet(); 369 sb_max = new_sbmax; 370 sb_max_adj = (u_quad_t)new_sbmax * MCLBYTES / (MSIZE + MCLBYTES); 371 splx(s); 372 373 return (0); 374} 375 376int 377soreserve(struct socket *so, u_long sndcc, u_long rcvcc) 378{ 379 380 if (sbreserve(&so->so_snd, sndcc, so) == 0) 381 goto bad; 382 if (sbreserve(&so->so_rcv, rcvcc, so) == 0) 383 goto bad2; 384 if (so->so_rcv.sb_lowat == 0) 385 so->so_rcv.sb_lowat = 1; 386 if (so->so_snd.sb_lowat == 0) 387 so->so_snd.sb_lowat = MCLBYTES; 388 if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat) 389 so->so_snd.sb_lowat = so->so_snd.sb_hiwat; 390 return (0); 391 bad2: 392 sbrelease(&so->so_snd, so); 393 bad: 394 return (ENOBUFS); 395} 396 397/* 398 * Allot mbufs to a sockbuf. 399 * Attempt to scale mbmax so that mbcnt doesn't become limiting 400 * if buffering efficiency is near the normal case. 
401 */ 402int 403sbreserve(struct sockbuf *sb, u_long cc, struct socket *so) 404{ 405 struct proc *p = curproc; /* XXX */ 406 rlim_t maxcc; 407 struct uidinfo *uidinfo; 408 409 KDASSERT(sb_max_adj != 0); 410 if (cc == 0 || cc > sb_max_adj) 411 return (0); 412 if (so) { 413 if (p && p->p_ucred->cr_uid == so->so_uidinfo->ui_uid) 414 maxcc = p->p_rlimit[RLIMIT_SBSIZE].rlim_cur; 415 else 416 maxcc = RLIM_INFINITY; 417 uidinfo = so->so_uidinfo; 418 } else { 419 uidinfo = uid_find(0); /* XXX: nothing better */ 420 maxcc = RLIM_INFINITY; 421 } 422 if (!chgsbsize(uidinfo, &sb->sb_hiwat, cc, maxcc)) 423 return 0; 424 sb->sb_mbmax = min(cc * 2, sb_max); 425 if (sb->sb_lowat > sb->sb_hiwat) 426 sb->sb_lowat = sb->sb_hiwat; 427 return (1); 428} 429 430/* 431 * Free mbufs held by a socket, and reserved mbuf space. 432 */ 433void 434sbrelease(struct sockbuf *sb, struct socket *so) 435{ 436 437 sbflush(sb); 438 (void)chgsbsize(so->so_uidinfo, &sb->sb_hiwat, 0, 439 RLIM_INFINITY); 440 sb->sb_mbmax = 0; 441} 442 443/* 444 * Routines to add and remove 445 * data from an mbuf queue. 446 * 447 * The routines sbappend() or sbappendrecord() are normally called to 448 * append new mbufs to a socket buffer, after checking that adequate 449 * space is available, comparing the function sbspace() with the amount 450 * of data to be added. sbappendrecord() differs from sbappend() in 451 * that data supplied is treated as the beginning of a new record. 452 * To place a sender's address, optional access rights, and data in a 453 * socket receive buffer, sbappendaddr() should be used. To place 454 * access rights and data in a socket receive buffer, sbappendrights() 455 * should be used. In either case, the new data begins a new record. 456 * Note that unlike sbappend() and sbappendrecord(), these routines check 457 * for the caller that there will be enough space to store the data. 
458 * Each fails if there is not enough space, or if it cannot find mbufs 459 * to store additional information in. 460 * 461 * Reliable protocols may use the socket send buffer to hold data 462 * awaiting acknowledgement. Data is normally copied from a socket 463 * send buffer in a protocol with m_copy for output to a peer, 464 * and then removing the data from the socket buffer with sbdrop() 465 * or sbdroprecord() when the data is acknowledged by the peer. 466 */ 467 468#ifdef SOCKBUF_DEBUG 469void 470sblastrecordchk(struct sockbuf *sb, const char *where) 471{ 472 struct mbuf *m = sb->sb_mb; 473 474 while (m && m->m_nextpkt) 475 m = m->m_nextpkt; 476 477 if (m != sb->sb_lastrecord) { 478 printf("sblastrecordchk: sb_mb %p sb_lastrecord %p last %p\n", 479 sb->sb_mb, sb->sb_lastrecord, m); 480 printf("packet chain:\n"); 481 for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) 482 printf("\t%p\n", m); 483 panic("sblastrecordchk from %s", where); 484 } 485} 486 487void 488sblastmbufchk(struct sockbuf *sb, const char *where) 489{ 490 struct mbuf *m = sb->sb_mb; 491 struct mbuf *n; 492 493 while (m && m->m_nextpkt) 494 m = m->m_nextpkt; 495 496 while (m && m->m_next) 497 m = m->m_next; 498 499 if (m != sb->sb_mbtail) { 500 printf("sblastmbufchk: sb_mb %p sb_mbtail %p last %p\n", 501 sb->sb_mb, sb->sb_mbtail, m); 502 printf("packet tree:\n"); 503 for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) { 504 printf("\t"); 505 for (n = m; n != NULL; n = n->m_next) 506 printf("%p ", n); 507 printf("\n"); 508 } 509 panic("sblastmbufchk from %s", where); 510 } 511} 512#endif /* SOCKBUF_DEBUG */ 513 514/* 515 * Link a chain of records onto a socket buffer 516 */ 517#define SBLINKRECORDCHAIN(sb, m0, mlast) \ 518do { \ 519 if ((sb)->sb_lastrecord != NULL) \ 520 (sb)->sb_lastrecord->m_nextpkt = (m0); \ 521 else \ 522 (sb)->sb_mb = (m0); \ 523 (sb)->sb_lastrecord = (mlast); \ 524} while (/*CONSTCOND*/0) 525 526 527#define SBLINKRECORD(sb, m0) \ 528 SBLINKRECORDCHAIN(sb, m0, m0) 529 530/* 
531 * Append mbuf chain m to the last record in the 532 * socket buffer sb. The additional space associated 533 * the mbuf chain is recorded in sb. Empty mbufs are 534 * discarded and mbufs are compacted where possible. 535 */ 536void 537sbappend(struct sockbuf *sb, struct mbuf *m) 538{ 539 struct mbuf *n; 540 541 if (m == 0) 542 return; 543 544#ifdef MBUFTRACE 545 m_claimm(m, sb->sb_mowner); 546#endif 547 548 SBLASTRECORDCHK(sb, "sbappend 1"); 549 550 if ((n = sb->sb_lastrecord) != NULL) { 551 /* 552 * XXX Would like to simply use sb_mbtail here, but 553 * XXX I need to verify that I won't miss an EOR that 554 * XXX way. 555 */ 556 do { 557 if (n->m_flags & M_EOR) { 558 sbappendrecord(sb, m); /* XXXXXX!!!! */ 559 return; 560 } 561 } while (n->m_next && (n = n->m_next)); 562 } else { 563 /* 564 * If this is the first record in the socket buffer, it's 565 * also the last record. 566 */ 567 sb->sb_lastrecord = m; 568 } 569 sbcompress(sb, m, n); 570 SBLASTRECORDCHK(sb, "sbappend 2"); 571} 572 573/* 574 * This version of sbappend() should only be used when the caller 575 * absolutely knows that there will never be more than one record 576 * in the socket buffer, that is, a stream protocol (such as TCP). 
577 */ 578void 579sbappendstream(struct sockbuf *sb, struct mbuf *m) 580{ 581 582 KDASSERT(m->m_nextpkt == NULL); 583 KASSERT(sb->sb_mb == sb->sb_lastrecord); 584 585 SBLASTMBUFCHK(sb, __func__); 586 587#ifdef MBUFTRACE 588 m_claimm(m, sb->sb_mowner); 589#endif 590 591 sbcompress(sb, m, sb->sb_mbtail); 592 593 sb->sb_lastrecord = sb->sb_mb; 594 SBLASTRECORDCHK(sb, __func__); 595} 596 597#ifdef SOCKBUF_DEBUG 598void 599sbcheck(struct sockbuf *sb) 600{ 601 struct mbuf *m; 602 u_long len, mbcnt; 603 604 len = 0; 605 mbcnt = 0; 606 for (m = sb->sb_mb; m; m = m->m_next) { 607 len += m->m_len; 608 mbcnt += MSIZE; 609 if (m->m_flags & M_EXT) 610 mbcnt += m->m_ext.ext_size; 611 if (m->m_nextpkt) 612 panic("sbcheck nextpkt"); 613 } 614 if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) { 615 printf("cc %lu != %lu || mbcnt %lu != %lu\n", len, sb->sb_cc, 616 mbcnt, sb->sb_mbcnt); 617 panic("sbcheck"); 618 } 619} 620#endif 621 622/* 623 * As above, except the mbuf chain 624 * begins a new record. 625 */ 626void 627sbappendrecord(struct sockbuf *sb, struct mbuf *m0) 628{ 629 struct mbuf *m; 630 631 if (m0 == 0) 632 return; 633 634#ifdef MBUFTRACE 635 m_claimm(m0, sb->sb_mowner); 636#endif 637 /* 638 * Put the first mbuf on the queue. 639 * Note this permits zero length records. 640 */ 641 sballoc(sb, m0); 642 SBLASTRECORDCHK(sb, "sbappendrecord 1"); 643 SBLINKRECORD(sb, m0); 644 m = m0->m_next; 645 m0->m_next = 0; 646 if (m && (m0->m_flags & M_EOR)) { 647 m0->m_flags &= ~M_EOR; 648 m->m_flags |= M_EOR; 649 } 650 sbcompress(sb, m, m0); 651 SBLASTRECORDCHK(sb, "sbappendrecord 2"); 652} 653 654/* 655 * As above except that OOB data 656 * is inserted at the beginning of the sockbuf, 657 * but after any other OOB data. 
658 */ 659void 660sbinsertoob(struct sockbuf *sb, struct mbuf *m0) 661{ 662 struct mbuf *m, **mp; 663 664 if (m0 == 0) 665 return; 666 667 SBLASTRECORDCHK(sb, "sbinsertoob 1"); 668 669 for (mp = &sb->sb_mb; (m = *mp) != NULL; mp = &((*mp)->m_nextpkt)) { 670 again: 671 switch (m->m_type) { 672 673 case MT_OOBDATA: 674 continue; /* WANT next train */ 675 676 case MT_CONTROL: 677 if ((m = m->m_next) != NULL) 678 goto again; /* inspect THIS train further */ 679 } 680 break; 681 } 682 /* 683 * Put the first mbuf on the queue. 684 * Note this permits zero length records. 685 */ 686 sballoc(sb, m0); 687 m0->m_nextpkt = *mp; 688 if (*mp == NULL) { 689 /* m0 is actually the new tail */ 690 sb->sb_lastrecord = m0; 691 } 692 *mp = m0; 693 m = m0->m_next; 694 m0->m_next = 0; 695 if (m && (m0->m_flags & M_EOR)) { 696 m0->m_flags &= ~M_EOR; 697 m->m_flags |= M_EOR; 698 } 699 sbcompress(sb, m, m0); 700 SBLASTRECORDCHK(sb, "sbinsertoob 2"); 701} 702 703/* 704 * Append address and data, and optionally, control (ancillary) data 705 * to the receive queue of a socket. If present, 706 * m0 must include a packet header with total length. 707 * Returns 0 if no space in sockbuf or insufficient mbufs. 
708 */ 709int 710sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0, 711 struct mbuf *control) 712{ 713 struct mbuf *m, *n, *nlast; 714 int space, len; 715 716 space = asa->sa_len; 717 718 if (m0 != NULL) { 719 if ((m0->m_flags & M_PKTHDR) == 0) 720 panic("sbappendaddr"); 721 space += m0->m_pkthdr.len; 722#ifdef MBUFTRACE 723 m_claimm(m0, sb->sb_mowner); 724#endif 725 } 726 for (n = control; n; n = n->m_next) { 727 space += n->m_len; 728 MCLAIM(n, sb->sb_mowner); 729 if (n->m_next == 0) /* keep pointer to last control buf */ 730 break; 731 } 732 if (space > sbspace(sb)) 733 return (0); 734 MGET(m, M_DONTWAIT, MT_SONAME); 735 if (m == 0) 736 return (0); 737 MCLAIM(m, sb->sb_mowner); 738 /* 739 * XXX avoid 'comparison always true' warning which isn't easily 740 * avoided. 741 */ 742 len = asa->sa_len; 743 if (len > MLEN) { 744 MEXTMALLOC(m, asa->sa_len, M_NOWAIT); 745 if ((m->m_flags & M_EXT) == 0) { 746 m_free(m); 747 return (0); 748 } 749 } 750 m->m_len = asa->sa_len; 751 memcpy(mtod(m, caddr_t), asa, asa->sa_len); 752 if (n) 753 n->m_next = m0; /* concatenate data to control */ 754 else 755 control = m0; 756 m->m_next = control; 757 758 SBLASTRECORDCHK(sb, "sbappendaddr 1"); 759 760 for (n = m; n->m_next != NULL; n = n->m_next) 761 sballoc(sb, n); 762 sballoc(sb, n); 763 nlast = n; 764 SBLINKRECORD(sb, m); 765 766 sb->sb_mbtail = nlast; 767 SBLASTMBUFCHK(sb, "sbappendaddr"); 768 769 SBLASTRECORDCHK(sb, "sbappendaddr 2"); 770 771 return (1); 772} 773 774/* 775 * Helper for sbappendchainaddr: prepend a struct sockaddr* to 776 * an mbuf chain. 
777 */ 778static inline struct mbuf * 779m_prepend_sockaddr(struct sockbuf *sb, struct mbuf *m0, 780 const struct sockaddr *asa) 781{ 782 struct mbuf *m; 783 const int salen = asa->sa_len; 784 785 /* only the first in each chain need be a pkthdr */ 786 MGETHDR(m, M_DONTWAIT, MT_SONAME); 787 if (m == 0) 788 return (0); 789 MCLAIM(m, sb->sb_mowner); 790#ifdef notyet 791 if (salen > MHLEN) { 792 MEXTMALLOC(m, salen, M_NOWAIT); 793 if ((m->m_flags & M_EXT) == 0) { 794 m_free(m); 795 return (0); 796 } 797 } 798#else 799 KASSERT(salen <= MHLEN); 800#endif 801 m->m_len = salen; 802 memcpy(mtod(m, caddr_t), asa, salen); 803 m->m_next = m0; 804 m->m_pkthdr.len = salen + m0->m_pkthdr.len; 805 806 return m; 807} 808 809int 810sbappendaddrchain(struct sockbuf *sb, const struct sockaddr *asa, 811 struct mbuf *m0, int sbprio) 812{ 813 int space; 814 struct mbuf *m, *n, *n0, *nlast; 815 int error; 816 817 /* 818 * XXX sbprio reserved for encoding priority of this* request: 819 * SB_PRIO_NONE --> honour normal sb limits 820 * SB_PRIO_ONESHOT_OVERFLOW --> if socket has any space, 821 * take whole chain. Intended for large requests 822 * that should be delivered atomically (all, or none). 823 * SB_PRIO_OVERDRAFT -- allow a small (2*MLEN) overflow 824 * over normal socket limits, for messages indicating 825 * buffer overflow in earlier normal/lower-priority messages 826 * SB_PRIO_BESTEFFORT --> ignore limits entirely. 827 * Intended for kernel-generated messages only. 828 * Up to generator to avoid total mbuf resource exhaustion. 829 */ 830 (void)sbprio; 831 832 if (m0 && (m0->m_flags & M_PKTHDR) == 0) 833 panic("sbappendaddrchain"); 834 835 space = sbspace(sb); 836 837#ifdef notyet 838 /* 839 * Enforce SB_PRIO_* limits as described above. 
840 */ 841#endif 842 843 n0 = NULL; 844 nlast = NULL; 845 for (m = m0; m; m = m->m_nextpkt) { 846 struct mbuf *np; 847 848#ifdef MBUFTRACE 849 m_claimm(m, sb->sb_mowner); 850#endif 851 852 /* Prepend sockaddr to this record (m) of input chain m0 */ 853 n = m_prepend_sockaddr(sb, m, asa); 854 if (n == NULL) { 855 error = ENOBUFS; 856 goto bad; 857 } 858 859 /* Append record (asa+m) to end of new chain n0 */ 860 if (n0 == NULL) { 861 n0 = n; 862 } else { 863 nlast->m_nextpkt = n; 864 } 865 /* Keep track of last record on new chain */ 866 nlast = n; 867 868 for (np = n; np; np = np->m_next) 869 sballoc(sb, np); 870 } 871 872 SBLASTRECORDCHK(sb, "sbappendaddrchain 1"); 873 874 /* Drop the entire chain of (asa+m) records onto the socket */ 875 SBLINKRECORDCHAIN(sb, n0, nlast); 876 877 SBLASTRECORDCHK(sb, "sbappendaddrchain 2"); 878 879 for (m = nlast; m->m_next; m = m->m_next) 880 ; 881 sb->sb_mbtail = m; 882 SBLASTMBUFCHK(sb, "sbappendaddrchain"); 883 884 return (1); 885 886bad: 887 /* 888 * On error, free the prepended addreseses. For consistency 889 * with sbappendaddr(), leave it to our caller to free 890 * the input record chain passed to us as m0. 
891 */ 892 while ((n = n0) != NULL) { 893 struct mbuf *np; 894 895 /* Undo the sballoc() of this record */ 896 for (np = n; np; np = np->m_next) 897 sbfree(sb, np); 898 899 n0 = n->m_nextpkt; /* iterate at next prepended address */ 900 MFREE(n, np); /* free prepended address (not data) */ 901 } 902 return 0; 903} 904 905 906int 907sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control) 908{ 909 struct mbuf *m, *mlast, *n; 910 int space; 911 912 space = 0; 913 if (control == 0) 914 panic("sbappendcontrol"); 915 for (m = control; ; m = m->m_next) { 916 space += m->m_len; 917 MCLAIM(m, sb->sb_mowner); 918 if (m->m_next == 0) 919 break; 920 } 921 n = m; /* save pointer to last control buffer */ 922 for (m = m0; m; m = m->m_next) { 923 MCLAIM(m, sb->sb_mowner); 924 space += m->m_len; 925 } 926 if (space > sbspace(sb)) 927 return (0); 928 n->m_next = m0; /* concatenate data to control */ 929 930 SBLASTRECORDCHK(sb, "sbappendcontrol 1"); 931 932 for (m = control; m->m_next != NULL; m = m->m_next) 933 sballoc(sb, m); 934 sballoc(sb, m); 935 mlast = m; 936 SBLINKRECORD(sb, control); 937 938 sb->sb_mbtail = mlast; 939 SBLASTMBUFCHK(sb, "sbappendcontrol"); 940 941 SBLASTRECORDCHK(sb, "sbappendcontrol 2"); 942 943 return (1); 944} 945 946/* 947 * Compress mbuf chain m into the socket 948 * buffer sb following mbuf n. If n 949 * is null, the buffer is presumed empty. 
950 */ 951void 952sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n) 953{ 954 int eor; 955 struct mbuf *o; 956 957 eor = 0; 958 while (m) { 959 eor |= m->m_flags & M_EOR; 960 if (m->m_len == 0 && 961 (eor == 0 || 962 (((o = m->m_next) || (o = n)) && 963 o->m_type == m->m_type))) { 964 if (sb->sb_lastrecord == m) 965 sb->sb_lastrecord = m->m_next; 966 m = m_free(m); 967 continue; 968 } 969 if (n && (n->m_flags & M_EOR) == 0 && 970 /* M_TRAILINGSPACE() checks buffer writeability */ 971 m->m_len <= MCLBYTES / 4 && /* XXX Don't copy too much */ 972 m->m_len <= M_TRAILINGSPACE(n) && 973 n->m_type == m->m_type) { 974 memcpy(mtod(n, caddr_t) + n->m_len, mtod(m, caddr_t), 975 (unsigned)m->m_len); 976 n->m_len += m->m_len; 977 sb->sb_cc += m->m_len; 978 m = m_free(m); 979 continue; 980 } 981 if (n) 982 n->m_next = m; 983 else 984 sb->sb_mb = m; 985 sb->sb_mbtail = m; 986 sballoc(sb, m); 987 n = m; 988 m->m_flags &= ~M_EOR; 989 m = m->m_next; 990 n->m_next = 0; 991 } 992 if (eor) { 993 if (n) 994 n->m_flags |= eor; 995 else 996 printf("semi-panic: sbcompress\n"); 997 } 998 SBLASTMBUFCHK(sb, __func__); 999} 1000 1001/* 1002 * Free all mbufs in a sockbuf. 1003 * Check that all resources are reclaimed. 1004 */ 1005void 1006sbflush(struct sockbuf *sb) 1007{ 1008 1009 KASSERT((sb->sb_flags & SB_LOCK) == 0); 1010 1011 while (sb->sb_mbcnt) 1012 sbdrop(sb, (int)sb->sb_cc); 1013 1014 KASSERT(sb->sb_cc == 0); 1015 KASSERT(sb->sb_mb == NULL); 1016 KASSERT(sb->sb_mbtail == NULL); 1017 KASSERT(sb->sb_lastrecord == NULL); 1018} 1019 1020/* 1021 * Drop data from (the front of) a sockbuf. 1022 */ 1023void 1024sbdrop(struct sockbuf *sb, int len) 1025{ 1026 struct mbuf *m, *mn, *next; 1027 1028 next = (m = sb->sb_mb) ? 
m->m_nextpkt : 0; 1029 while (len > 0) { 1030 if (m == 0) { 1031 if (next == 0) 1032 panic("sbdrop"); 1033 m = next; 1034 next = m->m_nextpkt; 1035 continue; 1036 } 1037 if (m->m_len > len) { 1038 m->m_len -= len; 1039 m->m_data += len; 1040 sb->sb_cc -= len; 1041 break; 1042 } 1043 len -= m->m_len; 1044 sbfree(sb, m); 1045 MFREE(m, mn); 1046 m = mn; 1047 } 1048 while (m && m->m_len == 0) { 1049 sbfree(sb, m); 1050 MFREE(m, mn); 1051 m = mn; 1052 } 1053 if (m) { 1054 sb->sb_mb = m; 1055 m->m_nextpkt = next; 1056 } else 1057 sb->sb_mb = next; 1058 /* 1059 * First part is an inline SB_EMPTY_FIXUP(). Second part 1060 * makes sure sb_lastrecord is up-to-date if we dropped 1061 * part of the last record. 1062 */ 1063 m = sb->sb_mb; 1064 if (m == NULL) { 1065 sb->sb_mbtail = NULL; 1066 sb->sb_lastrecord = NULL; 1067 } else if (m->m_nextpkt == NULL) 1068 sb->sb_lastrecord = m; 1069} 1070 1071/* 1072 * Drop a record off the front of a sockbuf 1073 * and move the next record to the front. 1074 */ 1075void 1076sbdroprecord(struct sockbuf *sb) 1077{ 1078 struct mbuf *m, *mn; 1079 1080 m = sb->sb_mb; 1081 if (m) { 1082 sb->sb_mb = m->m_nextpkt; 1083 do { 1084 sbfree(sb, m); 1085 MFREE(m, mn); 1086 } while ((m = mn) != NULL); 1087 } 1088 SB_EMPTY_FIXUP(sb); 1089} 1090 1091/* 1092 * Create a "control" mbuf containing the specified data 1093 * with the specified type for presentation on a socket buffer. 
1094 */ 1095struct mbuf * 1096sbcreatecontrol(caddr_t p, int size, int type, int level) 1097{ 1098 struct cmsghdr *cp; 1099 struct mbuf *m; 1100 1101 if (CMSG_SPACE(size) > MCLBYTES) { 1102 printf("sbcreatecontrol: message too large %d\n", size); 1103 return NULL; 1104 } 1105 1106 if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL) 1107 return ((struct mbuf *) NULL); 1108 if (CMSG_SPACE(size) > MLEN) { 1109 MCLGET(m, M_DONTWAIT); 1110 if ((m->m_flags & M_EXT) == 0) { 1111 m_free(m); 1112 return NULL; 1113 } 1114 } 1115 cp = mtod(m, struct cmsghdr *); 1116 memcpy(CMSG_DATA(cp), p, size); 1117 m->m_len = CMSG_SPACE(size); 1118 cp->cmsg_len = CMSG_LEN(size); 1119 cp->cmsg_level = level; 1120 cp->cmsg_type = type; 1121 return (m); 1122} 1123