uipc_socket2.c revision 1.132
1/* $NetBSD: uipc_socket2.c,v 1.132 2018/09/03 16:29:35 riastradh Exp $ */ 2 3/*- 4 * Copyright (c) 2008 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 26 * POSSIBILITY OF SUCH DAMAGE. 27 */ 28 29/* 30 * Copyright (c) 1982, 1986, 1988, 1990, 1993 31 * The Regents of the University of California. All rights reserved. 32 * 33 * Redistribution and use in source and binary forms, with or without 34 * modification, are permitted provided that the following conditions 35 * are met: 36 * 1. Redistributions of source code must retain the above copyright 37 * notice, this list of conditions and the following disclaimer. 38 * 2. 
Redistributions in binary form must reproduce the above copyright 39 * notice, this list of conditions and the following disclaimer in the 40 * documentation and/or other materials provided with the distribution. 41 * 3. Neither the name of the University nor the names of its contributors 42 * may be used to endorse or promote products derived from this software 43 * without specific prior written permission. 44 * 45 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 46 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 47 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 48 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 49 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 50 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 51 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 52 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 53 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 54 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 55 * SUCH DAMAGE. 
56 * 57 * @(#)uipc_socket2.c 8.2 (Berkeley) 2/14/95 58 */ 59 60#include <sys/cdefs.h> 61__KERNEL_RCSID(0, "$NetBSD: uipc_socket2.c,v 1.132 2018/09/03 16:29:35 riastradh Exp $"); 62 63#ifdef _KERNEL_OPT 64#include "opt_ddb.h" 65#include "opt_mbuftrace.h" 66#include "opt_sb_max.h" 67#endif 68 69#include <sys/param.h> 70#include <sys/systm.h> 71#include <sys/proc.h> 72#include <sys/file.h> 73#include <sys/buf.h> 74#include <sys/mbuf.h> 75#include <sys/protosw.h> 76#include <sys/domain.h> 77#include <sys/poll.h> 78#include <sys/socket.h> 79#include <sys/socketvar.h> 80#include <sys/signalvar.h> 81#include <sys/kauth.h> 82#include <sys/pool.h> 83#include <sys/uidinfo.h> 84 85#ifdef DDB 86#include <sys/filedesc.h> 87#endif 88 89/* 90 * Primitive routines for operating on sockets and socket buffers. 91 * 92 * Connection life-cycle: 93 * 94 * Normal sequence from the active (originating) side: 95 * 96 * - soisconnecting() is called during processing of connect() call, 97 * - resulting in an eventual call to soisconnected() if/when the 98 * connection is established. 99 * 100 * When the connection is torn down during processing of disconnect(): 101 * 102 * - soisdisconnecting() is called and, 103 * - soisdisconnected() is called when the connection to the peer 104 * is totally severed. 105 * 106 * The semantics of these routines are such that connectionless protocols 107 * can call soisconnected() and soisdisconnected() only, bypassing the 108 * in-progress calls when setting up a ``connection'' takes no time. 109 * 110 * From the passive side, a socket is created with two queues of sockets: 111 * 112 * - so_q0 (0) for partial connections (i.e. connections in progress) 113 * - so_q (1) for connections already made and awaiting user acceptance. 114 * 115 * As a protocol is preparing incoming connections, it creates a socket 116 * structure queued on so_q0 by calling sonewconn(). 
When the connection 117 * is established, soisconnected() is called, and transfers the 118 * socket structure to so_q, making it available to accept(). 119 * 120 * If a socket is closed with sockets on either so_q0 or so_q, these 121 * sockets are dropped. 122 * 123 * Locking rules and assumptions: 124 * 125 * o socket::so_lock can change on the fly. The low level routines used 126 * to lock sockets are aware of this. When so_lock is acquired, the 127 * routine locking must check to see if so_lock still points to the 128 * lock that was acquired. If so_lock has changed in the meantime, the 129 * now irrelevant lock that was acquired must be dropped and the lock 130 * operation retried. Although not proven here, this is completely safe 131 * on a multiprocessor system, even with relaxed memory ordering, given 132 * the next two rules: 133 * 134 * o In order to mutate so_lock, the lock pointed to by the current value 135 * of so_lock must be held: i.e., the socket must be held locked by the 136 * changing thread. The thread must issue membar_exit() to prevent 137 * memory accesses being reordered, and can set so_lock to the desired 138 * value. If the lock pointed to by the new value of so_lock is not 139 * held by the changing thread, the socket must then be considered 140 * unlocked. 141 * 142 * o If so_lock is mutated, and the previous lock referred to by so_lock 143 * could still be visible to other threads in the system (e.g. via file 144 * descriptor or protocol-internal reference), then the old lock must 145 * remain valid until the socket and/or protocol control block has been 146 * torn down. 147 * 148 * o If a socket has a non-NULL so_head value (i.e. is in the process of 149 * connecting), then locking the socket must also lock the socket pointed 150 * to by so_head: their lock pointers must match. 151 * 152 * o If a socket has connections in progress (so_q, so_q0 not empty) then 153 * locking the socket must also lock the sockets attached to both queues. 
 *    Again, their lock pointers must match.
 *
 * o Beyond the initial lock assignment in socreate(), assigning locks to
 *   sockets is the responsibility of the individual protocols / protocol
 *   domains.
 */

/* Cache from which all socket structures are allocated (see soget/soput). */
static pool_cache_t	socket_cache;

u_long	sb_max = SB_MAX;	/* maximum socket buffer size */
static u_long	sb_max_adj;	/* adjusted sb_max */

/*
 * soisconnecting: note that a connection attempt has begun on this socket.
 *
 * => Socket must be locked.
 */
void
soisconnecting(struct socket *so)
{

	KASSERT(solocked(so));

	so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= SS_ISCONNECTING;
}

/*
 * soisconnected: note that the connection is now established.
 *
 * If the socket sits on a listening socket's partial-connection queue
 * (so_q0), move it to the completed queue (so_q) and wake up accept()ers,
 * unless an accept filter is active, in which case the filter's upcall is
 * armed and invoked instead.
 *
 * => Socket (and its head, if any) must be locked.
 */
void
soisconnected(struct socket *so)
{
	struct socket *head;

	head = so->so_head;

	KASSERT(solocked(so));
	KASSERT(head == NULL || solocked2(so, head));

	so->so_state &= ~(SS_ISCONNECTING | SS_ISDISCONNECTING);
	so->so_state |= SS_ISCONNECTED;
	if (head && so->so_onq == &head->so_q0) {
		if ((so->so_options & SO_ACCEPTFILTER) == 0) {
			/*
			 * Re-enqueue and wake up any waiters, e.g.
			 * processes blocking on accept().
			 */
			soqremque(so, 0);
			soqinsque(head, so, 1);
			sorwakeup(head);
			cv_broadcast(&head->so_cv);
		} else {
			/*
			 * Accept filter active: keep the socket on so_q0
			 * and let the filter decide when it becomes ready.
			 */
			so->so_upcall =
			    head->so_accf->so_accept_filter->accf_callback;
			so->so_upcallarg = head->so_accf->so_accept_filter_arg;
			so->so_rcv.sb_flags |= SB_UPCALL;
			so->so_options &= ~SO_ACCEPTFILTER;
			(*so->so_upcall)(so, so->so_upcallarg,
			    POLLIN|POLLRDNORM, M_DONTWAIT);
		}
	} else {
		cv_broadcast(&so->so_cv);
		sorwakeup(so);
		sowwakeup(so);
	}
}

/*
 * soisdisconnecting: note that a disconnect has begun; no further data
 * can be sent or received.
 *
 * => Socket must be locked.
 */
void
soisdisconnecting(struct socket *so)
{

	KASSERT(solocked(so));

	so->so_state &= ~SS_ISCONNECTING;
	so->so_state |= (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE);
	cv_broadcast(&so->so_cv);
	sowwakeup(so);
	sorwakeup(so);
}

/*
 * soisdisconnected: note that the connection to the peer is fully severed.
 *
 * => Socket must be locked.
 */
void
soisdisconnected(struct socket *so)
{

	KASSERT(solocked(so));

	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_ISDISCONNECTED);
	cv_broadcast(&so->so_cv);
	sowwakeup(so);
	sorwakeup(so);
}

/*
 * soinit2: second-stage initialization; create the socket pool cache.
 */
void
soinit2(void)
{

	socket_cache = pool_cache_init(sizeof(struct socket), 0, 0, 0,
	    "socket", NULL, IPL_SOFTNET, NULL, NULL, NULL);
}

/*
 * sonewconn: accept a new connection.
 *
 * When an attempt at a new connection is noted on a socket which accepts
 * connections, sonewconn(9) is called.  If the connection is possible
 * (subject to space constraints, etc) then we allocate a new structure,
 * properly linked into the data structure of the original socket.
 *
 * => If 'soready' is true, then socket will become ready for accept() i.e.
 *    inserted into the so_q queue, SS_ISCONNECTED set and waiters awoken.
 * => May be called from soft-interrupt context.
 * => Listening socket should be locked.
 * => Returns the new socket locked.
 */
struct socket *
sonewconn(struct socket *head, bool soready)
{
	struct socket *so;
	int soqueue, error;

	KASSERT(solocked(head));

	if (head->so_qlen + head->so_q0len > 3 * head->so_qlimit / 2) {
		/*
		 * Listen queue overflow.  If there is an accept filter
		 * active, pass through the oldest cxn it's handling.
		 */
		if (head->so_accf == NULL) {
			return NULL;
		} else {
			struct socket *so2, *next;

			/* Pass the oldest connection waiting in the
			   accept filter */
			for (so2 = TAILQ_FIRST(&head->so_q0);
			     so2 != NULL; so2 = next) {
				next = TAILQ_NEXT(so2, so_qe);
				if (so2->so_upcall == NULL) {
					continue;
				}
				so2->so_upcall = NULL;
				so2->so_upcallarg = NULL;
				so2->so_options &= ~SO_ACCEPTFILTER;
				so2->so_rcv.sb_flags &= ~SB_UPCALL;
				soisconnected(so2);
				break;
			}

			/* If nothing was nudged out of the accept filter,
			 * bail out; otherwise proceed allocating the socket. */
			if (so2 == NULL) {
				return NULL;
			}
		}
	}
	if ((head->so_options & SO_ACCEPTFILTER) != 0) {
		/* Filtered sockets only become ready once the filter fires. */
		soready = false;
	}
	soqueue = soready ? 1 : 0;

	if ((so = soget(false)) == NULL) {
		return NULL;
	}

	/* Inherit the listening socket's settings. */
	so->so_type = head->so_type;
	so->so_options = head->so_options & ~SO_ACCEPTCONN;
	so->so_linger = head->so_linger;
	so->so_state = head->so_state | SS_NOFDREF;
	so->so_proto = head->so_proto;
	so->so_timeo = head->so_timeo;
	so->so_pgid = head->so_pgid;
	so->so_send = head->so_send;
	so->so_receive = head->so_receive;
	so->so_uidinfo = head->so_uidinfo;
	so->so_cpid = head->so_cpid;

	/*
	 * Share the lock with the listening-socket, it may get unshared
	 * once the connection is complete.
	 */
	mutex_obj_hold(head->so_lock);
	so->so_lock = head->so_lock;

	/*
	 * Reserve the space for socket buffers.
	 */
#ifdef MBUFTRACE
	so->so_mowner = head->so_mowner;
	so->so_rcv.sb_mowner = head->so_rcv.sb_mowner;
	so->so_snd.sb_mowner = head->so_snd.sb_mowner;
#endif
	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
		goto out;
	}
	so->so_snd.sb_lowat = head->so_snd.sb_lowat;
	so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
	so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
	so->so_snd.sb_timeo = head->so_snd.sb_timeo;
	so->so_rcv.sb_flags |= head->so_rcv.sb_flags & (SB_AUTOSIZE | SB_ASYNC);
	so->so_snd.sb_flags |= head->so_snd.sb_flags & (SB_AUTOSIZE | SB_ASYNC);

	/*
	 * Finally, perform the protocol attach.  Note: a new socket
	 * lock may be assigned at this point (if so, it will be held).
	 */
	error = (*so->so_proto->pr_usrreqs->pr_attach)(so, 0);
	if (error) {
out:
		KASSERT(solocked(so));
		KASSERT(so->so_accf == NULL);
		soput(so);

		/* Note: the listening socket shall stay locked. */
		KASSERT(solocked(head));
		return NULL;
	}
	KASSERT(solocked2(head, so));

	/*
	 * Insert into the queue.  If ready, update the connection status
	 * and wake up any waiters, e.g. processes blocking on accept().
	 */
	soqinsque(head, so, soqueue);
	if (soready) {
		so->so_state |= SS_ISCONNECTED;
		sorwakeup(head);
		cv_broadcast(&head->so_cv);
	}
	return so;
}

/*
 * soget: allocate and zero a socket structure from the pool cache and
 * initialize its queues, condition variables and selinfo records.
 *
 * => Returns NULL if the allocation fails (only possible when !waitok).
 */
struct socket *
soget(bool waitok)
{
	struct socket *so;

	so = pool_cache_get(socket_cache, (waitok ? PR_WAITOK : PR_NOWAIT));
	if (__predict_false(so == NULL))
		return (NULL);
	memset(so, 0, sizeof(*so));
	TAILQ_INIT(&so->so_q0);
	TAILQ_INIT(&so->so_q);
	cv_init(&so->so_cv, "socket");
	cv_init(&so->so_rcv.sb_cv, "netio");
	cv_init(&so->so_snd.sb_cv, "netio");
	selinit(&so->so_rcv.sb_sel);
	selinit(&so->so_snd.sb_sel);
	so->so_rcv.sb_so = so;
	so->so_snd.sb_so = so;
	return so;
}

/*
 * soput: release a socket structure back to the pool cache, destroying
 * its condition variables, selinfo records and lock reference.
 *
 * => No threads may still be waiting on the socket's CVs.
 */
void
soput(struct socket *so)
{

	KASSERT(!cv_has_waiters(&so->so_cv));
	KASSERT(!cv_has_waiters(&so->so_rcv.sb_cv));
	KASSERT(!cv_has_waiters(&so->so_snd.sb_cv));
	seldestroy(&so->so_rcv.sb_sel);
	seldestroy(&so->so_snd.sb_sel);
	mutex_obj_free(so->so_lock);
	cv_destroy(&so->so_cv);
	cv_destroy(&so->so_rcv.sb_cv);
	cv_destroy(&so->so_snd.sb_cv);
	pool_cache_put(socket_cache, so);
}

/*
 * soqinsque: insert socket of a new connection into the specified
 * accept queue of the listening socket (head).
 *
 * q = 0: queue of partial connections
 * q = 1: queue of incoming connections
 */
void
soqinsque(struct socket *head, struct socket *so, int q)
{
	KASSERT(q == 0 || q == 1);
	KASSERT(solocked2(head, so));
	KASSERT(so->so_onq == NULL);
	KASSERT(so->so_head == NULL);

	so->so_head = head;
	if (q == 0) {
		head->so_q0len++;
		so->so_onq = &head->so_q0;
	} else {
		head->so_qlen++;
		so->so_onq = &head->so_q;
	}
	TAILQ_INSERT_TAIL(so->so_onq, so, so_qe);
}

/*
 * soqremque: remove socket from the specified queue.
 *
 * => Returns true if socket was removed from the specified queue.
 * => False if socket was not removed (because it was in other queue).
 */
bool
soqremque(struct socket *so, int q)
{
	struct socket *head = so->so_head;

	KASSERT(q == 0 || q == 1);
	KASSERT(solocked(so));
	KASSERT(so->so_onq != NULL);
	KASSERT(head != NULL);

	if (q == 0) {
		if (so->so_onq != &head->so_q0)
			return false;
		head->so_q0len--;
	} else {
		if (so->so_onq != &head->so_q)
			return false;
		head->so_qlen--;
	}
	KASSERT(solocked2(so, head));
	TAILQ_REMOVE(so->so_onq, so, so_qe);
	so->so_onq = NULL;
	so->so_head = NULL;
	return true;
}

/*
 * socantsendmore: indicates that no more data will be sent on the
 * socket; it would normally be applied to a socket when the user
 * informs the system that no more data is to be sent, by the protocol
 * code (in case pr_shutdown()).
 */
void
socantsendmore(struct socket *so)
{
	KASSERT(solocked(so));

	so->so_state |= SS_CANTSENDMORE;
	sowwakeup(so);
}

/*
 * socantrcvmore(): indicates that no more data will be received and
 * will normally be applied to the socket by a protocol when it detects
 * that the peer will send no more data.  Data queued for reading in
 * the socket may yet be read.
 */
void
socantrcvmore(struct socket *so)
{
	KASSERT(solocked(so));

	so->so_state |= SS_CANTRCVMORE;
	sorwakeup(so);
}

/*
 * soroverflow(): indicates that data was attempted to be sent
 * but the receiving buffer overflowed.  Records the overflow and
 * flags ENOBUFS for the next receive.
 */
void
soroverflow(struct socket *so)
{
	KASSERT(solocked(so));

	so->so_rcv.sb_overflowed++;
	so->so_rerror = ENOBUFS;
	sorwakeup(so);
}

/*
 * Wait for data to arrive at/drain from a socket buffer.
 *
 * => Socket must be locked; the wait drops and re-takes so_lock, and the
 *    lock pointer is re-checked afterwards in case it changed meanwhile.
 * => Returns 0, EWOULDBLOCK on timeout, or EINTR/ERESTART when the wait
 *    is interruptible (SB_NOINTR clear) and a signal arrives.
 */
int
sbwait(struct sockbuf *sb)
{
	struct socket *so;
	kmutex_t *lock;
	int error;

	so = sb->sb_so;

	KASSERT(solocked(so));

	sb->sb_flags |= SB_NOTIFY;
	lock = so->so_lock;
	if ((sb->sb_flags & SB_NOINTR) != 0)
		error = cv_timedwait(&sb->sb_cv, lock, sb->sb_timeo);
	else
		error = cv_timedwait_sig(&sb->sb_cv, lock, sb->sb_timeo);
	if (__predict_false(lock != so->so_lock))
		solockretry(so, lock);
	return error;
}

/*
 * Wakeup processes waiting on a socket buffer.
 * Do asynchronous notification via SIGIO
 * if the socket buffer has the SB_ASYNC flag set.
 */
void
sowakeup(struct socket *so, struct sockbuf *sb, int code)
{
	int band;

	KASSERT(solocked(so));
	KASSERT(sb->sb_so == so);

	if (code == POLL_IN)
		band = POLLIN|POLLRDNORM;
	else
		band = POLLOUT|POLLWRNORM;
	sb->sb_flags &= ~SB_NOTIFY;
	selnotify(&sb->sb_sel, band, NOTE_SUBMIT);
	cv_broadcast(&sb->sb_cv);
	if (sb->sb_flags & SB_ASYNC)
		fownsignal(so->so_pgid, SIGIO, code, band, so);
	if (sb->sb_flags & SB_UPCALL)
		(*so->so_upcall)(so, so->so_upcallarg, band, M_DONTWAIT);
}

/*
 * Reset a socket's lock pointer.  Wake all threads waiting on the
 * socket's condition variables so that they can restart their waits
 * using the new lock.  The existing lock must be held.
 */
void
solockreset(struct socket *so, kmutex_t *lock)
{

	KASSERT(solocked(so));

	so->so_lock = lock;
	cv_broadcast(&so->so_snd.sb_cv);
	cv_broadcast(&so->so_rcv.sb_cv);
	cv_broadcast(&so->so_cv);
}

/*
 * Socket buffer (struct sockbuf) utility routines.
 *
 * Each socket contains two socket buffers: one for sending data and
 * one for receiving data.
 Each buffer contains a queue of mbufs,
 * information about the number of mbufs and amount of data in the
 * queue, and other fields allowing poll() statements and notification
 * on data availability to be implemented.
 *
 * Data stored in a socket buffer is maintained as a list of records.
 * Each record is a list of mbufs chained together with the m_next
 * field.  Records are chained together with the m_nextpkt field.  The upper
 * level routine soreceive() expects the following conventions to be
 * observed when placing information in the receive buffer:
 *
 * 1. If the protocol requires each message be preceded by the sender's
 *    name, then a record containing that name must be present before
 *    any associated data (mbuf's must be of type MT_SONAME).
 * 2. If the protocol supports the exchange of ``access rights'' (really
 *    just additional data associated with the message), and there are
 *    ``rights'' to be received, then a record containing this data
 *    should be present (mbuf's must be of type MT_CONTROL).
 * 3. If a name or rights record exists, then it must be followed by
 *    a data record, perhaps of zero length.
 *
 * Before using a new socket structure it is first necessary to reserve
 * buffer space to the socket, by calling sbreserve().  This should commit
 * some of the available buffer space in the system buffer pool for the
 * socket (currently, it does nothing but enforce limits).  The space
 * should be released by calling sbrelease() when the socket is destroyed.
 */

/*
 * sb_max_set: update the global socket buffer size limit (sb_max) and
 * the derived mbuf-overhead-adjusted limit (sb_max_adj).
 *
 * => Rejects values below 16 KB with EINVAL.
 */
int
sb_max_set(u_long new_sbmax)
{
	int s;

	if (new_sbmax < (16 * 1024))
		return (EINVAL);

	s = splsoftnet();
	sb_max = new_sbmax;
	sb_max_adj = (u_quad_t)new_sbmax * MCLBYTES / (MSIZE + MCLBYTES);
	splx(s);

	return (0);
}

/*
 * soreserve: reserve send and receive buffer space for a socket and
 * establish sane low-water marks.
 *
 * => Returns 0 on success, ENOBUFS if either reservation fails.
 */
int
soreserve(struct socket *so, u_long sndcc, u_long rcvcc)
{
	KASSERT(so->so_pcb == NULL || solocked(so));

	/*
	 * there's at least one application (a configure script of screen)
	 * which expects a fifo is writable even if it has "some" bytes
	 * in its buffer.
	 * so we want to make sure (hiwat - lowat) >= (some bytes).
	 *
	 * PIPE_BUF here is an arbitrary value chosen as (some bytes) above.
	 * we expect it's large enough for such applications.
	 */
	u_long  lowat = MAX(sock_loan_thresh, MCLBYTES);
	u_long  hiwat = lowat + PIPE_BUF;

	if (sndcc < hiwat)
		sndcc = hiwat;
	if (sbreserve(&so->so_snd, sndcc, so) == 0)
		goto bad;
	if (sbreserve(&so->so_rcv, rcvcc, so) == 0)
		goto bad2;
	if (so->so_rcv.sb_lowat == 0)
		so->so_rcv.sb_lowat = 1;
	if (so->so_snd.sb_lowat == 0)
		so->so_snd.sb_lowat = lowat;
	if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
		so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
	return (0);
 bad2:
	sbrelease(&so->so_snd, so);
 bad:
	return (ENOBUFS);
}

/*
 * Allot mbufs to a sockbuf.
 * Attempt to scale mbmax so that mbcnt doesn't become limiting
 * if buffering efficiency is near the normal case.
 *
 * => Returns 1 on success, 0 if the request is zero, exceeds sb_max_adj,
 *    or the per-user sbsize limit (RLIMIT_SBSIZE) would be exceeded.
 */
int
sbreserve(struct sockbuf *sb, u_long cc, struct socket *so)
{
	struct lwp *l = curlwp; /* XXX */
	rlim_t maxcc;
	struct uidinfo *uidinfo;

	KASSERT(so->so_pcb == NULL || solocked(so));
	KASSERT(sb->sb_so == so);
	KASSERT(sb_max_adj != 0);

	if (cc == 0 || cc > sb_max_adj)
		return (0);

	maxcc = l->l_proc->p_rlimit[RLIMIT_SBSIZE].rlim_cur;

	uidinfo = so->so_uidinfo;
	if (!chgsbsize(uidinfo, &sb->sb_hiwat, cc, maxcc))
		return 0;
	sb->sb_mbmax = uimin(cc * 2, sb_max);
	if (sb->sb_lowat > sb->sb_hiwat)
		sb->sb_lowat = sb->sb_hiwat;

	return (1);
}

/*
 * Free mbufs held by a socket, and reserved mbuf space.  We do not assert
 * that the socket is held locked here: see sorflush().
 */
void
sbrelease(struct sockbuf *sb, struct socket *so)
{

	KASSERT(sb->sb_so == so);

	sbflush(sb);
	(void)chgsbsize(so->so_uidinfo, &sb->sb_hiwat, 0, RLIM_INFINITY);
	sb->sb_mbmax = 0;
}

/*
 * Routines to add and remove
 * data from an mbuf queue.
 *
 * The routines sbappend() or sbappendrecord() are normally called to
 * append new mbufs to a socket buffer, after checking that adequate
 * space is available, comparing the function sbspace() with the amount
 * of data to be added.  sbappendrecord() differs from sbappend() in
 * that data supplied is treated as the beginning of a new record.
 * To place a sender's address, optional access rights, and data in a
 * socket receive buffer, sbappendaddr() should be used.  To place
 * access rights and data in a socket receive buffer, sbappendrights()
 * should be used.  In either case, the new data begins a new record.
 * Note that unlike sbappend() and sbappendrecord(), these routines check
 * for the caller that there will be enough space to store the data.
 * Each fails if there is not enough space, or if it cannot find mbufs
 * to store additional information in.
 *
 * Reliable protocols may use the socket send buffer to hold data
 * awaiting acknowledgement.  Data is normally copied from a socket
 * send buffer in a protocol with m_copym for output to a peer,
 * and then removing the data from the socket buffer with sbdrop()
 * or sbdroprecord() when the data is acknowledged by the peer.
 */

#ifdef SOCKBUF_DEBUG
/*
 * sblastrecordchk: verify that sb_lastrecord really is the last record
 * on the sockbuf's m_nextpkt chain; panic with diagnostics otherwise.
 */
void
sblastrecordchk(struct sockbuf *sb, const char *where)
{
	struct mbuf *m = sb->sb_mb;

	KASSERT(solocked(sb->sb_so));

	while (m && m->m_nextpkt)
		m = m->m_nextpkt;

	if (m != sb->sb_lastrecord) {
		printf("sblastrecordchk: sb_mb %p sb_lastrecord %p last %p\n",
		    sb->sb_mb, sb->sb_lastrecord, m);
		printf("packet chain:\n");
		for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt)
			printf("\t%p\n", m);
		panic("sblastrecordchk from %s", where);
	}
}

/*
 * sblastmbufchk: verify that sb_mbtail really is the last mbuf of the
 * last record; panic with diagnostics otherwise.
 */
void
sblastmbufchk(struct sockbuf *sb, const char *where)
{
	struct mbuf *m = sb->sb_mb;
	struct mbuf *n;

	KASSERT(solocked(sb->sb_so));

	while (m && m->m_nextpkt)
		m = m->m_nextpkt;

	while (m && m->m_next)
		m = m->m_next;

	if (m != sb->sb_mbtail) {
		printf("sblastmbufchk: sb_mb %p sb_mbtail %p last %p\n",
		    sb->sb_mb, sb->sb_mbtail, m);
		printf("packet tree:\n");
		for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) {
			printf("\t");
			for (n = m; n != NULL; n = n->m_next)
				printf("%p ", n);
			printf("\n");
		}
		panic("sblastmbufchk from %s", where);
	}
}
#endif /* SOCKBUF_DEBUG */

/*
 * Link a chain of records onto a socket buffer
 */
#define	SBLINKRECORDCHAIN(sb, m0, mlast)				\
do {									\
	if ((sb)->sb_lastrecord != NULL)				\
		(sb)->sb_lastrecord->m_nextpkt = (m0);			\
	else								\
		(sb)->sb_mb = (m0);					\
	(sb)->sb_lastrecord = (mlast);					\
} while (/*CONSTCOND*/0)

/* Link a single record onto a socket buffer. */
#define	SBLINKRECORD(sb, m0)						\
	SBLINKRECORDCHAIN(sb, m0, m0)

/*
 * Append mbuf chain m to the last record in the
 * socket buffer sb.  The additional space associated
 * the mbuf chain is recorded in sb.  Empty mbufs are
 * discarded and mbufs are compacted where possible.
 */
void
sbappend(struct sockbuf *sb, struct mbuf *m)
{
	struct mbuf *n;

	KASSERT(solocked(sb->sb_so));

	if (m == NULL)
		return;

#ifdef MBUFTRACE
	m_claimm(m, sb->sb_mowner);
#endif

	SBLASTRECORDCHK(sb, "sbappend 1");

	if ((n = sb->sb_lastrecord) != NULL) {
		/*
		 * XXX Would like to simply use sb_mbtail here, but
		 * XXX I need to verify that I won't miss an EOR that
		 * XXX way.
		 */
		do {
			if (n->m_flags & M_EOR) {
				/* End-of-record: start a new record. */
				sbappendrecord(sb, m); /* XXXXXX!!!! */
				return;
			}
		} while (n->m_next && (n = n->m_next));
	} else {
		/*
		 * If this is the first record in the socket buffer, it's
		 * also the last record.
		 */
		sb->sb_lastrecord = m;
	}
	sbcompress(sb, m, n);
	SBLASTRECORDCHK(sb, "sbappend 2");
}

/*
 * This version of sbappend() should only be used when the caller
 * absolutely knows that there will never be more than one record
 * in the socket buffer, that is, a stream protocol (such as TCP).
 */
void
sbappendstream(struct sockbuf *sb, struct mbuf *m)
{

	KASSERT(solocked(sb->sb_so));
	KDASSERT(m->m_nextpkt == NULL);
	KASSERT(sb->sb_mb == sb->sb_lastrecord);

	SBLASTMBUFCHK(sb, __func__);

#ifdef MBUFTRACE
	m_claimm(m, sb->sb_mowner);
#endif

	sbcompress(sb, m, sb->sb_mbtail);

	sb->sb_lastrecord = sb->sb_mb;
	SBLASTRECORDCHK(sb, __func__);
}

#ifdef SOCKBUF_DEBUG
/*
 * sbcheck: verify that the sockbuf's byte and mbuf accounting (sb_cc,
 * sb_mbcnt) matches a walk of the actual mbuf chains; panic otherwise.
 */
void
sbcheck(struct sockbuf *sb)
{
	struct mbuf *m, *m2;
	u_long len, mbcnt;

	KASSERT(solocked(sb->sb_so));

	len = 0;
	mbcnt = 0;
	for (m = sb->sb_mb; m; m = m->m_nextpkt) {
		for (m2 = m; m2 != NULL; m2 = m2->m_next) {
			len += m2->m_len;
			mbcnt += MSIZE;
			if (m2->m_flags & M_EXT)
				mbcnt += m2->m_ext.ext_size;
			if (m2->m_nextpkt != NULL)
				panic("sbcheck nextpkt");
		}
	}
	if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
		printf("cc %lu != %lu || mbcnt %lu != %lu\n", len, sb->sb_cc,
		    mbcnt, sb->sb_mbcnt);
		panic("sbcheck");
	}
}
#endif

/*
 * As above, except the mbuf chain
 * begins a new record.
 */
void
sbappendrecord(struct sockbuf *sb, struct mbuf *m0)
{
	struct mbuf *m;

	KASSERT(solocked(sb->sb_so));

	if (m0 == NULL)
		return;

#ifdef MBUFTRACE
	m_claimm(m0, sb->sb_mowner);
#endif
	/*
	 * Put the first mbuf on the queue.
	 * Note this permits zero length records.
	 */
	sballoc(sb, m0);
	SBLASTRECORDCHK(sb, "sbappendrecord 1");
	SBLINKRECORD(sb, m0);
	m = m0->m_next;
	m0->m_next = 0;
	if (m && (m0->m_flags & M_EOR)) {
		/* Move the end-of-record mark to the true last mbuf. */
		m0->m_flags &= ~M_EOR;
		m->m_flags |= M_EOR;
	}
	sbcompress(sb, m, m0);
	SBLASTRECORDCHK(sb, "sbappendrecord 2");
}

/*
 * As above except that OOB data
 * is inserted at the beginning of the sockbuf,
 * but after any other OOB data.
 */
void
sbinsertoob(struct sockbuf *sb, struct mbuf *m0)
{
	struct mbuf *m, **mp;

	KASSERT(solocked(sb->sb_so));

	if (m0 == NULL)
		return;

	SBLASTRECORDCHK(sb, "sbinsertoob 1");

	/* Skip past existing OOB-data records to find the insertion point. */
	for (mp = &sb->sb_mb; (m = *mp) != NULL; mp = &((*mp)->m_nextpkt)) {
	    again:
		switch (m->m_type) {

		case MT_OOBDATA:
			continue;		/* WANT next train */

		case MT_CONTROL:
			if ((m = m->m_next) != NULL)
				goto again;	/* inspect THIS train further */
		}
		break;
	}
	/*
	 * Put the first mbuf on the queue.
	 * Note this permits zero length records.
	 */
	sballoc(sb, m0);
	m0->m_nextpkt = *mp;
	if (*mp == NULL) {
		/* m0 is actually the new tail */
		sb->sb_lastrecord = m0;
	}
	*mp = m0;
	m = m0->m_next;
	m0->m_next = 0;
	if (m && (m0->m_flags & M_EOR)) {
		m0->m_flags &= ~M_EOR;
		m->m_flags |= M_EOR;
	}
	sbcompress(sb, m, m0);
	SBLASTRECORDCHK(sb, "sbinsertoob 2");
}

/*
 * Append address and data, and optionally, control (ancillary) data
 * to the receive queue of a socket.  If present,
 * m0 must include a packet header with total length.
 * Returns 0 if no space in sockbuf or insufficient mbufs.
 */
int
sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0,
	struct mbuf *control)
{
	struct mbuf *m, *n, *nlast;
	int space, len;

	KASSERT(solocked(sb->sb_so));

	space = asa->sa_len;

	if (m0 != NULL) {
		if ((m0->m_flags & M_PKTHDR) == 0)
			panic("sbappendaddr");
		space += m0->m_pkthdr.len;
#ifdef MBUFTRACE
		m_claimm(m0, sb->sb_mowner);
#endif
	}
	for (n = control; n; n = n->m_next) {
		space += n->m_len;
		MCLAIM(n, sb->sb_mowner);
		if (n->m_next == NULL)	/* keep pointer to last control buf */
			break;
	}
	if (space > sbspace(sb))
		return (0);
	m = m_get(M_DONTWAIT, MT_SONAME);
	if (m == NULL)
		return (0);
	MCLAIM(m, sb->sb_mowner);
	/*
	 * XXX avoid 'comparison always true' warning which isn't easily
	 * avoided.
	 */
	len = asa->sa_len;
	if (len > MLEN) {
		MEXTMALLOC(m, asa->sa_len, M_NOWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return (0);
		}
	}
	m->m_len = asa->sa_len;
	memcpy(mtod(m, void *), asa, asa->sa_len);
	if (n)
		n->m_next = m0;		/* concatenate data to control */
	else
		control = m0;
	m->m_next = control;

	SBLASTRECORDCHK(sb, "sbappendaddr 1");

	/* Account for every mbuf in the new record (address+control+data). */
	for (n = m; n->m_next != NULL; n = n->m_next)
		sballoc(sb, n);
	sballoc(sb, n);
	nlast = n;
	SBLINKRECORD(sb, m);

	sb->sb_mbtail = nlast;
	SBLASTMBUFCHK(sb, "sbappendaddr");
	SBLASTRECORDCHK(sb, "sbappendaddr 2");

	return (1);
}

/*
 * Helper for sbappendchainaddr: prepend a struct sockaddr* to
 * an mbuf chain.
 *
 * => Returns the new chain head, or NULL if no mbuf could be allocated.
 */
static inline struct mbuf *
m_prepend_sockaddr(struct sockbuf *sb, struct mbuf *m0,
		   const struct sockaddr *asa)
{
	struct mbuf *m;
	const int salen = asa->sa_len;

	KASSERT(solocked(sb->sb_so));

	/* only the first in each chain need be a pkthdr */
	m = m_gethdr(M_DONTWAIT, MT_SONAME);
	if (m == NULL)
		return NULL;
	MCLAIM(m, sb->sb_mowner);
#ifdef notyet
	if (salen > MHLEN) {
		MEXTMALLOC(m, salen, M_NOWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return NULL;
		}
	}
#else
	KASSERT(salen <= MHLEN);
#endif
	m->m_len = salen;
	memcpy(mtod(m, void *), asa, salen);
	m->m_next = m0;
	m->m_pkthdr.len = salen + m0->m_pkthdr.len;

	return m;
}

/*
 * sbappendaddrchain: prepend 'asa' to each record of the input chain m0
 * and link the resulting records onto the sockbuf.
 *
 * => Returns 1 on success; on allocation failure, undoes its own
 *    accounting, frees the prepended addresses and returns ENOBUFS.
 */
int
sbappendaddrchain(struct sockbuf *sb, const struct sockaddr *asa,
		  struct mbuf *m0, int sbprio)
{
	struct mbuf *m, *n, *n0, *nlast;
	int error;

	KASSERT(solocked(sb->sb_so));

	/*
	 * XXX sbprio reserved for encoding priority of this* request:
	 *  SB_PRIO_NONE --> honour normal sb limits
	 *  SB_PRIO_ONESHOT_OVERFLOW --> if socket has any space,
	 *	take whole chain. Intended for large requests
	 *      that should be delivered atomically (all, or none).
	 * SB_PRIO_OVERDRAFT -- allow a small (2*MLEN) overflow
	 *       over normal socket limits, for messages indicating
	 *       buffer overflow in earlier normal/lower-priority messages
	 * SB_PRIO_BESTEFFORT -->  ignore limits entirely.
	 *       Intended for  kernel-generated messages only.
	 *        Up to generator to avoid total mbuf resource exhaustion.
	 */
	(void)sbprio;

	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
		panic("sbappendaddrchain");

#ifdef notyet
	space = sbspace(sb);

	/*
	 * Enforce SB_PRIO_* limits as described above.
	 */
#endif

	n0 = NULL;
	nlast = NULL;
	for (m = m0; m; m = m->m_nextpkt) {
		struct mbuf *np;

#ifdef MBUFTRACE
		m_claimm(m, sb->sb_mowner);
#endif

		/* Prepend sockaddr to this record (m) of input chain m0 */
		n = m_prepend_sockaddr(sb, m, asa);
		if (n == NULL) {
			error = ENOBUFS;
			goto bad;
		}

		/* Append record (asa+m) to end of new chain n0 */
		if (n0 == NULL) {
			n0 = n;
		} else {
			nlast->m_nextpkt = n;
		}
		/* Keep track of last record on new chain */
		nlast = n;

		for (np = n; np; np = np->m_next)
			sballoc(sb, np);
	}

	SBLASTRECORDCHK(sb, "sbappendaddrchain 1");

	/* Drop the entire chain of (asa+m) records onto the socket */
	SBLINKRECORDCHAIN(sb, n0, nlast);

	SBLASTRECORDCHK(sb, "sbappendaddrchain 2");

	for (m = nlast; m->m_next; m = m->m_next)
		;
	sb->sb_mbtail = m;
	SBLASTMBUFCHK(sb, "sbappendaddrchain");

	return (1);

bad:
	/*
	 * On error, free the prepended addresses. For consistency
	 * with sbappendaddr(), leave it to our caller to free
	 * the input record chain passed to us as m0.
	 */
	while ((n = n0) != NULL) {
		struct mbuf *np;

		/* Undo the sballoc() of this record */
		for (np = n; np; np = np->m_next)
			sbfree(sb, np);

		n0 = n->m_nextpkt;	/* iterate at next prepended address */
		np = m_free(n);		/* free prepended address (not data) */
	}
	return error;
}

/*
 * sbappendcontrol: append control (ancillary) data and optional data m0
 * as a new record on the sockbuf.
 *
 * => Returns 0 if there is no space in the sockbuf, 1 on success.
 */
int
sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control)
{
	struct mbuf *m, *mlast, *n;
	int space;

	KASSERT(solocked(sb->sb_so));

	space = 0;
	if (control == NULL)
		panic("sbappendcontrol");
	for (m = control; ; m = m->m_next) {
		space += m->m_len;
		MCLAIM(m, sb->sb_mowner);
		if (m->m_next == NULL)
			break;
	}
	n = m;			/* save pointer to last control buffer */
	for (m = m0; m; m = m->m_next) {
		MCLAIM(m, sb->sb_mowner);
		space += m->m_len;
	}
	if (space > sbspace(sb))
		return (0);
	n->m_next = m0;			/* concatenate data to control */

	SBLASTRECORDCHK(sb, "sbappendcontrol 1");

	for (m = control; m->m_next != NULL; m = m->m_next)
		sballoc(sb, m);
	sballoc(sb, m);
	mlast = m;
	SBLINKRECORD(sb, control);

	sb->sb_mbtail = mlast;
	SBLASTMBUFCHK(sb, "sbappendcontrol");
	SBLASTRECORDCHK(sb, "sbappendcontrol 2");

	return (1);
}

/*
 * Compress mbuf chain m into the socket
 * buffer sb following mbuf n.  If n
 * is null, the buffer is presumed empty.
1241 */ 1242void 1243sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n) 1244{ 1245 int eor; 1246 struct mbuf *o; 1247 1248 KASSERT(solocked(sb->sb_so)); 1249 1250 eor = 0; 1251 while (m) { 1252 eor |= m->m_flags & M_EOR; 1253 if (m->m_len == 0 && 1254 (eor == 0 || 1255 (((o = m->m_next) || (o = n)) && 1256 o->m_type == m->m_type))) { 1257 if (sb->sb_lastrecord == m) 1258 sb->sb_lastrecord = m->m_next; 1259 m = m_free(m); 1260 continue; 1261 } 1262 if (n && (n->m_flags & M_EOR) == 0 && 1263 /* M_TRAILINGSPACE() checks buffer writeability */ 1264 m->m_len <= MCLBYTES / 4 && /* XXX Don't copy too much */ 1265 m->m_len <= M_TRAILINGSPACE(n) && 1266 n->m_type == m->m_type) { 1267 memcpy(mtod(n, char *) + n->m_len, mtod(m, void *), 1268 (unsigned)m->m_len); 1269 n->m_len += m->m_len; 1270 sb->sb_cc += m->m_len; 1271 m = m_free(m); 1272 continue; 1273 } 1274 if (n) 1275 n->m_next = m; 1276 else 1277 sb->sb_mb = m; 1278 sb->sb_mbtail = m; 1279 sballoc(sb, m); 1280 n = m; 1281 m->m_flags &= ~M_EOR; 1282 m = m->m_next; 1283 n->m_next = 0; 1284 } 1285 if (eor) { 1286 if (n) 1287 n->m_flags |= eor; 1288 else 1289 printf("semi-panic: sbcompress\n"); 1290 } 1291 SBLASTMBUFCHK(sb, __func__); 1292} 1293 1294/* 1295 * Free all mbufs in a sockbuf. 1296 * Check that all resources are reclaimed. 1297 */ 1298void 1299sbflush(struct sockbuf *sb) 1300{ 1301 1302 KASSERT(solocked(sb->sb_so)); 1303 KASSERT((sb->sb_flags & SB_LOCK) == 0); 1304 1305 while (sb->sb_mbcnt) 1306 sbdrop(sb, (int)sb->sb_cc); 1307 1308 KASSERT(sb->sb_cc == 0); 1309 KASSERT(sb->sb_mb == NULL); 1310 KASSERT(sb->sb_mbtail == NULL); 1311 KASSERT(sb->sb_lastrecord == NULL); 1312} 1313 1314/* 1315 * Drop data from (the front of) a sockbuf. 1316 */ 1317void 1318sbdrop(struct sockbuf *sb, int len) 1319{ 1320 struct mbuf *m, *next; 1321 1322 KASSERT(solocked(sb->sb_so)); 1323 1324 next = (m = sb->sb_mb) ? 
m->m_nextpkt : NULL; 1325 while (len > 0) { 1326 if (m == NULL) { 1327 if (next == NULL) 1328 panic("sbdrop(%p,%d): cc=%lu", 1329 sb, len, sb->sb_cc); 1330 m = next; 1331 next = m->m_nextpkt; 1332 continue; 1333 } 1334 if (m->m_len > len) { 1335 m->m_len -= len; 1336 m->m_data += len; 1337 sb->sb_cc -= len; 1338 break; 1339 } 1340 len -= m->m_len; 1341 sbfree(sb, m); 1342 m = m_free(m); 1343 } 1344 while (m && m->m_len == 0) { 1345 sbfree(sb, m); 1346 m = m_free(m); 1347 } 1348 if (m) { 1349 sb->sb_mb = m; 1350 m->m_nextpkt = next; 1351 } else 1352 sb->sb_mb = next; 1353 /* 1354 * First part is an inline SB_EMPTY_FIXUP(). Second part 1355 * makes sure sb_lastrecord is up-to-date if we dropped 1356 * part of the last record. 1357 */ 1358 m = sb->sb_mb; 1359 if (m == NULL) { 1360 sb->sb_mbtail = NULL; 1361 sb->sb_lastrecord = NULL; 1362 } else if (m->m_nextpkt == NULL) 1363 sb->sb_lastrecord = m; 1364} 1365 1366/* 1367 * Drop a record off the front of a sockbuf 1368 * and move the next record to the front. 1369 */ 1370void 1371sbdroprecord(struct sockbuf *sb) 1372{ 1373 struct mbuf *m, *mn; 1374 1375 KASSERT(solocked(sb->sb_so)); 1376 1377 m = sb->sb_mb; 1378 if (m) { 1379 sb->sb_mb = m->m_nextpkt; 1380 do { 1381 sbfree(sb, m); 1382 mn = m_free(m); 1383 } while ((m = mn) != NULL); 1384 } 1385 SB_EMPTY_FIXUP(sb); 1386} 1387 1388/* 1389 * Create a "control" mbuf containing the specified data 1390 * with the specified type for presentation on a socket buffer. 
1391 */ 1392struct mbuf * 1393sbcreatecontrol1(void **p, int size, int type, int level, int flags) 1394{ 1395 struct cmsghdr *cp; 1396 struct mbuf *m; 1397 int space = CMSG_SPACE(size); 1398 1399 if ((flags & M_DONTWAIT) && space > MCLBYTES) { 1400 printf("%s: message too large %d\n", __func__, space); 1401 return NULL; 1402 } 1403 1404 if ((m = m_get(flags, MT_CONTROL)) == NULL) 1405 return NULL; 1406 if (space > MLEN) { 1407 if (space > MCLBYTES) 1408 MEXTMALLOC(m, space, M_WAITOK); 1409 else 1410 MCLGET(m, flags); 1411 if ((m->m_flags & M_EXT) == 0) { 1412 m_free(m); 1413 return NULL; 1414 } 1415 } 1416 cp = mtod(m, struct cmsghdr *); 1417 *p = CMSG_DATA(cp); 1418 m->m_len = space; 1419 cp->cmsg_len = CMSG_LEN(size); 1420 cp->cmsg_level = level; 1421 cp->cmsg_type = type; 1422 return m; 1423} 1424 1425struct mbuf * 1426sbcreatecontrol(void *p, int size, int type, int level) 1427{ 1428 struct mbuf *m; 1429 void *v; 1430 1431 m = sbcreatecontrol1(&v, size, type, level, M_DONTWAIT); 1432 if (m == NULL) 1433 return NULL; 1434 memcpy(v, p, size); 1435 return m; 1436} 1437 1438void 1439solockretry(struct socket *so, kmutex_t *lock) 1440{ 1441 1442 while (lock != so->so_lock) { 1443 mutex_exit(lock); 1444 lock = so->so_lock; 1445 mutex_enter(lock); 1446 } 1447} 1448 1449bool 1450solocked(const struct socket *so) 1451{ 1452 1453 return mutex_owned(so->so_lock); 1454} 1455 1456bool 1457solocked2(const struct socket *so1, const struct socket *so2) 1458{ 1459 const kmutex_t *lock; 1460 1461 lock = so1->so_lock; 1462 if (lock != so2->so_lock) 1463 return false; 1464 return mutex_owned(lock); 1465} 1466 1467/* 1468 * sosetlock: assign a default lock to a new socket. 
1469 */ 1470void 1471sosetlock(struct socket *so) 1472{ 1473 if (so->so_lock == NULL) { 1474 kmutex_t *lock = softnet_lock; 1475 1476 so->so_lock = lock; 1477 mutex_obj_hold(lock); 1478 mutex_enter(lock); 1479 } 1480 KASSERT(solocked(so)); 1481} 1482 1483/* 1484 * Set lock on sockbuf sb; sleep if lock is already held. 1485 * Unless SB_NOINTR is set on sockbuf, sleep is interruptible. 1486 * Returns error without lock if sleep is interrupted. 1487 */ 1488int 1489sblock(struct sockbuf *sb, int wf) 1490{ 1491 struct socket *so; 1492 kmutex_t *lock; 1493 int error; 1494 1495 KASSERT(solocked(sb->sb_so)); 1496 1497 for (;;) { 1498 if (__predict_true((sb->sb_flags & SB_LOCK) == 0)) { 1499 sb->sb_flags |= SB_LOCK; 1500 return 0; 1501 } 1502 if (wf != M_WAITOK) 1503 return EWOULDBLOCK; 1504 so = sb->sb_so; 1505 lock = so->so_lock; 1506 if ((sb->sb_flags & SB_NOINTR) != 0) { 1507 cv_wait(&so->so_cv, lock); 1508 error = 0; 1509 } else 1510 error = cv_wait_sig(&so->so_cv, lock); 1511 if (__predict_false(lock != so->so_lock)) 1512 solockretry(so, lock); 1513 if (error != 0) 1514 return error; 1515 } 1516} 1517 1518void 1519sbunlock(struct sockbuf *sb) 1520{ 1521 struct socket *so; 1522 1523 so = sb->sb_so; 1524 1525 KASSERT(solocked(so)); 1526 KASSERT((sb->sb_flags & SB_LOCK) != 0); 1527 1528 sb->sb_flags &= ~SB_LOCK; 1529 cv_broadcast(&so->so_cv); 1530} 1531 1532int 1533sowait(struct socket *so, bool catch_p, int timo) 1534{ 1535 kmutex_t *lock; 1536 int error; 1537 1538 KASSERT(solocked(so)); 1539 KASSERT(catch_p || timo != 0); 1540 1541 lock = so->so_lock; 1542 if (catch_p) 1543 error = cv_timedwait_sig(&so->so_cv, lock, timo); 1544 else 1545 error = cv_timedwait(&so->so_cv, lock, timo); 1546 if (__predict_false(lock != so->so_lock)) 1547 solockretry(so, lock); 1548 return error; 1549} 1550 1551#ifdef DDB 1552 1553/* 1554 * Currently, sofindproc() is used only from DDB. 
It could be used from others 1555 * by using db_mutex_enter() 1556 */ 1557 1558static inline int 1559db_mutex_enter(kmutex_t *mtx) 1560{ 1561 extern int db_active; 1562 int rv; 1563 1564 if (!db_active) { 1565 mutex_enter(mtx); 1566 rv = 1; 1567 } else 1568 rv = mutex_tryenter(mtx); 1569 1570 return rv; 1571} 1572 1573int 1574sofindproc(struct socket *so, int all, void (*pr)(const char *, ...)) 1575{ 1576 proc_t *p; 1577 filedesc_t *fdp; 1578 fdtab_t *dt; 1579 fdfile_t *ff; 1580 file_t *fp = NULL; 1581 int found = 0; 1582 int i, t; 1583 1584 if (so == NULL) 1585 return 0; 1586 1587 t = db_mutex_enter(proc_lock); 1588 if (!t) { 1589 pr("could not acquire proc_lock mutex\n"); 1590 return 0; 1591 } 1592 PROCLIST_FOREACH(p, &allproc) { 1593 if (p->p_stat == SIDL) 1594 continue; 1595 fdp = p->p_fd; 1596 t = db_mutex_enter(&fdp->fd_lock); 1597 if (!t) { 1598 pr("could not acquire fd_lock mutex\n"); 1599 continue; 1600 } 1601 dt = fdp->fd_dt; 1602 for (i = 0; i < dt->dt_nfiles; i++) { 1603 ff = dt->dt_ff[i]; 1604 if (ff == NULL) 1605 continue; 1606 1607 fp = ff->ff_file; 1608 if (fp == NULL) 1609 continue; 1610 1611 t = db_mutex_enter(&fp->f_lock); 1612 if (!t) { 1613 pr("could not acquire f_lock mutex\n"); 1614 continue; 1615 } 1616 if ((struct socket *)fp->f_data != so) { 1617 mutex_exit(&fp->f_lock); 1618 continue; 1619 } 1620 found++; 1621 if (pr) 1622 pr("socket %p: owner %s(pid=%d)\n", 1623 so, p->p_comm, p->p_pid); 1624 mutex_exit(&fp->f_lock); 1625 if (all == 0) 1626 break; 1627 } 1628 mutex_exit(&fdp->fd_lock); 1629 if (all == 0 && found != 0) 1630 break; 1631 } 1632 mutex_exit(proc_lock); 1633 1634 return found; 1635} 1636 1637void 1638socket_print(const char *modif, void (*pr)(const char *, ...)) 1639{ 1640 file_t *fp; 1641 struct socket *so; 1642 struct sockbuf *sb_snd, *sb_rcv; 1643 struct mbuf *m_rec, *m; 1644 bool opt_v = false; 1645 bool opt_m = false; 1646 bool opt_a = false; 1647 bool opt_p = false; 1648 int nrecs, nmbufs; 1649 char ch; 1650 const char 
*family; 1651 1652 while ( (ch = *(modif++)) != '\0') { 1653 switch (ch) { 1654 case 'v': 1655 opt_v = true; 1656 break; 1657 case 'm': 1658 opt_m = true; 1659 break; 1660 case 'a': 1661 opt_a = true; 1662 break; 1663 case 'p': 1664 opt_p = true; 1665 break; 1666 } 1667 } 1668 if (opt_v == false && pr) 1669 (pr)("Ignore empty sockets. use /v to print all.\n"); 1670 if (opt_p == true && pr) 1671 (pr)("Don't search owner process.\n"); 1672 1673 LIST_FOREACH(fp, &filehead, f_list) { 1674 if (fp->f_type != DTYPE_SOCKET) 1675 continue; 1676 so = (struct socket *)fp->f_data; 1677 if (so == NULL) 1678 continue; 1679 1680 if (so->so_proto->pr_domain->dom_family == AF_INET) 1681 family = "INET"; 1682#ifdef INET6 1683 else if (so->so_proto->pr_domain->dom_family == AF_INET6) 1684 family = "INET6"; 1685#endif 1686 else if (so->so_proto->pr_domain->dom_family == pseudo_AF_KEY) 1687 family = "KEY"; 1688 else if (so->so_proto->pr_domain->dom_family == AF_ROUTE) 1689 family = "ROUTE"; 1690 else 1691 continue; 1692 1693 sb_snd = &so->so_snd; 1694 sb_rcv = &so->so_rcv; 1695 1696 if (opt_v != true && 1697 sb_snd->sb_cc == 0 && sb_rcv->sb_cc == 0) 1698 continue; 1699 1700 pr("---SOCKET %p: type %s\n", so, family); 1701 if (opt_p != true) 1702 sofindproc(so, opt_a == true ? 
1 : 0, pr); 1703 pr("Send Buffer Bytes: %d [bytes]\n", sb_snd->sb_cc); 1704 pr("Send Buffer mbufs:\n"); 1705 m_rec = m = sb_snd->sb_mb; 1706 nrecs = 0; 1707 nmbufs = 0; 1708 while (m_rec) { 1709 nrecs++; 1710 if (opt_m == true) 1711 pr(" mbuf chain %p\n", m_rec); 1712 while (m) { 1713 nmbufs++; 1714 m = m->m_next; 1715 } 1716 m_rec = m = m_rec->m_nextpkt; 1717 } 1718 pr(" Total %d records, %d mbufs.\n", nrecs, nmbufs); 1719 1720 pr("Recv Buffer Usage: %d [bytes]\n", sb_rcv->sb_cc); 1721 pr("Recv Buffer mbufs:\n"); 1722 m_rec = m = sb_rcv->sb_mb; 1723 nrecs = 0; 1724 nmbufs = 0; 1725 while (m_rec) { 1726 nrecs++; 1727 if (opt_m == true) 1728 pr(" mbuf chain %p\n", m_rec); 1729 while (m) { 1730 nmbufs++; 1731 m = m->m_next; 1732 } 1733 m_rec = m = m_rec->m_nextpkt; 1734 } 1735 pr(" Total %d records, %d mbufs.\n", nrecs, nmbufs); 1736 } 1737} 1738#endif /* DDB */ 1739