uipc_socket.c revision 160280
1/*- 2 * Copyright (c) 1982, 1986, 1988, 1990, 1993 3 * The Regents of the University of California. All rights reserved. 4 * Copyright (c) 2004 The FreeBSD Foundation 5 * Copyright (c) 2004-2006 Robert N. M. Watson 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 4. Neither the name of the University nor the names of its contributors 16 * may be used to endorse or promote products derived from this software 17 * without specific prior written permission. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 30 * 31 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94 32 */ 33 34/* 35 * Comments on the socket life cycle: 36 * 37 * soalloc() sets of socket layer state for a socket, called only by 38 * socreate() and sonewconn(). Socket layer private. 
39 * 40 * sdealloc() tears down socket layer state for a socket, called only by 41 * sofree() and sonewconn(). Socket layer private. 42 * 43 * pru_attach() associates protocol layer state with an allocated socket; 44 * called only once, may fail, aborting socket allocation. This is called 45 * from socreate() and sonewconn(). Socket layer private. 46 * 47 * pru_detach() disassociates protocol layer state from an attached socket, 48 * and will be called exactly once for sockets in which pru_attach() has 49 * been successfully called. If pru_attach() returned an error, 50 * pru_detach() will not be called. Socket layer private. 51 * 52 * socreate() creates a socket and attaches protocol state. This is a public 53 * interface that may be used by socket layer consumers to create new 54 * sockets. 55 * 56 * sonewconn() creates a socket and attaches protocol state. This is a 57 * public interface that may be used by protocols to create new sockets when 58 * a new connection is received and will be available for accept() on a 59 * listen socket. 60 * 61 * soclose() destroys a socket after possibly waiting for it to disconnect. 62 * This is a public interface that socket consumers should use to close and 63 * release a socket when done with it. 64 * 65 * soabort() destroys a socket without waiting for it to disconnect (used 66 * only for incoming connections that are already partially or fully 67 * connected). This is used internally by the socket layer when clearing 68 * listen socket queues (due to overflow or close on the listen socket), but 69 * is also a public interface protocols may use to abort connections in 70 * their incomplete listen queues should they no longer be required. Sockets 71 * placed in completed connection listen queues should not be aborted. 72 * 73 * sofree() will free a socket and its protocol state if all references on 74 * the socket have been released, and is the public interface to attempt to 75 * free a socket when a reference is removed. 
This is a socket layer private 76 * interface. 77 * 78 * NOTE: In addition to socreate() and soclose(), which provide a single 79 * socket reference to the consumer to be managed as required, there are two 80 * calls to explicitly manage socket references, soref(), and sorele(). 81 * Currently, these are generally required only when transitioning a socket 82 * from a listen queue to a file descriptor, in order to prevent garbage 83 * collection of the socket at an untimely moment. For a number of reasons, 84 * these interfaces are not preferred, and should be avoided. 85 * 86 * XXXRW: The behavior of sockets after soclose() but before the last 87 * sorele() is poorly defined. We can probably entirely eliminate them with 88 * a little work, since consumers are managing references anyway. 89 */ 90 91#include <sys/cdefs.h> 92__FBSDID("$FreeBSD: head/sys/kern/uipc_socket.c 160280 2006-07-11 21:56:58Z rwatson $"); 93 94#include "opt_inet.h" 95#include "opt_mac.h" 96#include "opt_zero.h" 97#include "opt_compat.h" 98 99#include <sys/param.h> 100#include <sys/systm.h> 101#include <sys/fcntl.h> 102#include <sys/limits.h> 103#include <sys/lock.h> 104#include <sys/mac.h> 105#include <sys/malloc.h> 106#include <sys/mbuf.h> 107#include <sys/mutex.h> 108#include <sys/domain.h> 109#include <sys/file.h> /* for struct knote */ 110#include <sys/kernel.h> 111#include <sys/event.h> 112#include <sys/eventhandler.h> 113#include <sys/poll.h> 114#include <sys/proc.h> 115#include <sys/protosw.h> 116#include <sys/socket.h> 117#include <sys/socketvar.h> 118#include <sys/resourcevar.h> 119#include <sys/signalvar.h> 120#include <sys/sysctl.h> 121#include <sys/uio.h> 122#include <sys/jail.h> 123 124#include <vm/uma.h> 125 126#ifdef COMPAT_IA32 127#include <sys/mount.h> 128#include <compat/freebsd32/freebsd32.h> 129 130extern struct sysentvec ia32_freebsd_sysvec; 131#endif 132 133static int soreceive_rcvoob(struct socket *so, struct uio *uio, 134 int flags); 135 136static void 
filt_sordetach(struct knote *kn); 137static int filt_soread(struct knote *kn, long hint); 138static void filt_sowdetach(struct knote *kn); 139static int filt_sowrite(struct knote *kn, long hint); 140static int filt_solisten(struct knote *kn, long hint); 141 142static struct filterops solisten_filtops = 143 { 1, NULL, filt_sordetach, filt_solisten }; 144static struct filterops soread_filtops = 145 { 1, NULL, filt_sordetach, filt_soread }; 146static struct filterops sowrite_filtops = 147 { 1, NULL, filt_sowdetach, filt_sowrite }; 148 149uma_zone_t socket_zone; 150so_gen_t so_gencnt; /* generation count for sockets */ 151 152int maxsockets; 153 154MALLOC_DEFINE(M_SONAME, "soname", "socket name"); 155MALLOC_DEFINE(M_PCB, "pcb", "protocol control block"); 156 157static int somaxconn = SOMAXCONN; 158static int somaxconn_sysctl(SYSCTL_HANDLER_ARGS); 159/* XXX: we dont have SYSCTL_USHORT */ 160SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLTYPE_UINT | CTLFLAG_RW, 161 0, sizeof(int), somaxconn_sysctl, "I", "Maximum pending socket connection " 162 "queue size"); 163static int numopensockets; 164SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD, 165 &numopensockets, 0, "Number of open sockets"); 166#ifdef ZERO_COPY_SOCKETS 167/* These aren't static because they're used in other files. */ 168int so_zero_copy_send = 1; 169int so_zero_copy_receive = 1; 170SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0, 171 "Zero copy controls"); 172SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW, 173 &so_zero_copy_receive, 0, "Enable zero copy receive"); 174SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW, 175 &so_zero_copy_send, 0, "Enable zero copy send"); 176#endif /* ZERO_COPY_SOCKETS */ 177 178/* 179 * accept_mtx locks down per-socket fields relating to accept queues. See 180 * socketvar.h for an annotation of the protected fields of struct socket. 
 */
struct mtx accept_mtx;
MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);

/*
 * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
 * so_gencnt field.
 */
static struct mtx so_global_mtx;
MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF);

SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");

/*
 * Sysctl handler for kern.ipc.maxsockets.  The limit may only be raised at
 * run-time; attempts to lower it return EINVAL.  When raised, the global
 * file limits are scaled up alongside it, and maxsockets_change
 * eventhandler consumers are notified of the new value.
 */
static int
sysctl_maxsockets(SYSCTL_HANDLER_ARGS)
{
	int error, newmaxsockets;

	newmaxsockets = maxsockets;
	error = sysctl_handle_int(oidp, &newmaxsockets, sizeof(int), req);
	if (error == 0 && req->newptr) {
		if (newmaxsockets > maxsockets) {
			maxsockets = newmaxsockets;
			/*
			 * Keep maxfiles ahead of maxsockets so that sockets
			 * cannot consume the entire file table.
			 */
			if (maxsockets > ((maxfiles / 4) * 3)) {
				maxfiles = (maxsockets * 5) / 4;
				maxfilesperproc = (maxfiles * 9) / 10;
			}
			EVENTHANDLER_INVOKE(maxsockets_change);
		} else
			error = EINVAL;
	}
	return (error);
}

SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT|CTLFLAG_RW,
    &maxsockets, 0, sysctl_maxsockets, "IU",
    "Maximum number of sockets avaliable");

/*
 * Initialise maxsockets from the kern.ipc.maxsockets loader tunable; never
 * less than the larger of maxfiles and nmbclusters.
 */
static void init_maxsockets(void *ignored)
{
	TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
	maxsockets = imax(maxsockets, imax(maxfiles, nmbclusters));
}
SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);

/*
 * Socket operation routines.
 * These routines are called by the routines in
 * sys_socket.c or from a system process, and
 * implement the semantics of socket operations by
 * switching out to the protocol specific routines.
 */

/*
 * Get a socket structure from our zone, and initialize it.
 * Note that it would probably be better to allocate socket
 * and PCB at the same time, but I'm not convinced that all
 * the protocols can be easily modified to do this.
 *
 * soalloc() returns a socket with a ref count of 0.
 */
/*
 * Allocate and zero a socket structure from the socket zone, initialize its
 * socket buffer locks and AIO job queue, and account for it in the global
 * generation count and open-socket count.  mflags is M_WAITOK or M_NOWAIT.
 * Returns NULL if the zone (or, with MAC, the label) allocation fails.
 */
static struct socket *
soalloc(int mflags)
{
	struct socket *so;

	so = uma_zalloc(socket_zone, mflags | M_ZERO);
	if (so == NULL)
		return (NULL);
#ifdef MAC
	if (mac_init_socket(so, mflags) != 0) {
		uma_zfree(socket_zone, so);
		return (NULL);
	}
#endif
	SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
	SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
	TAILQ_INIT(&so->so_aiojobq);
	mtx_lock(&so_global_mtx);
	so->so_gencnt = ++so_gencnt;
	++numopensockets;
	mtx_unlock(&so_global_mtx);
	return (so);
}

/*
 * Free socket-layer state for a socket whose reference count has reached
 * zero and whose protocol state has already been detached (so_pcb == NULL,
 * asserted below).  Returns socket buffer space charged against the
 * owner's resource limits, removes any accept filter, destroys the MAC
 * label, drops the credential reference, and returns the memory to the
 * socket zone.
 */
static void
sodealloc(struct socket *so)
{

	KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
	KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL"));

	mtx_lock(&so_global_mtx);
	so->so_gencnt = ++so_gencnt;
	mtx_unlock(&so_global_mtx);
	/* Release per-uid socket buffer accounting for both directions. */
	if (so->so_rcv.sb_hiwat)
		(void)chgsbsize(so->so_cred->cr_uidinfo,
		    &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
	if (so->so_snd.sb_hiwat)
		(void)chgsbsize(so->so_cred->cr_uidinfo,
		    &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
#ifdef INET
	/* Remove accept filter if one is present. */
	if (so->so_accf != NULL)
		do_setopt_accept_filter(so, NULL);
#endif
#ifdef MAC
	mac_destroy_socket(so);
#endif
	crfree(so->so_cred);
	SOCKBUF_LOCK_DESTROY(&so->so_snd);
	SOCKBUF_LOCK_DESTROY(&so->so_rcv);
	uma_zfree(socket_zone, so);
	mtx_lock(&so_global_mtx);
	--numopensockets;
	mtx_unlock(&so_global_mtx);
}

/*
 * socreate returns a socket with a ref count of 1.  The socket should be
 * closed with soclose().
 */
int
socreate(dom, aso, type, proto, cred, td)
	int dom;
	struct socket **aso;
	int type;
	int proto;
	struct ucred *cred;
	struct thread *td;
{
	struct protosw *prp;
	struct socket *so;
	int error;

	/* Find the protocol switch: by explicit protocol, or by type. */
	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);

	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL ||
	    prp->pr_usrreqs->pru_attach == pru_attach_notsupp)
		return (EPROTONOSUPPORT);

	/*
	 * Jailed processes may optionally be limited to unix, inet, and
	 * route domain sockets.
	 */
	if (jailed(cred) && jail_socket_unixiproute_only &&
	    prp->pr_domain->dom_family != PF_LOCAL &&
	    prp->pr_domain->dom_family != PF_INET &&
	    prp->pr_domain->dom_family != PF_ROUTE) {
		return (EPROTONOSUPPORT);
	}

	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = soalloc(M_WAITOK);
	if (so == NULL)
		return (ENOBUFS);

	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = type;
	so->so_cred = crhold(cred);
	so->so_proto = prp;
#ifdef MAC
	mac_create_socket(cred, so);
#endif
	knlist_init(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv),
	    NULL, NULL, NULL);
	knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd),
	    NULL, NULL, NULL);
	/* The single reference handed to the caller; dropped by soclose(). */
	so->so_count = 1;
	error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
	if (error) {
		/* pru_attach failed, so pru_detach will never be called. */
		sodealloc(so);
		return (error);
	}
	*aso = so;
	return (0);
}

#ifdef REGRESSION
static int regression_sonewconn_earlytest = 1;
SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW,
    &regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test");
#endif

/*
 * When an attempt at a new connection is noted on a socket
 * which accepts connections, sonewconn is called.  If the
 * connection is possible (subject to space constraints, etc.)
 * then we allocate a new structure, properly linked into the
 * data structure of the original socket, and return this.
 * Connstatus may be 0, or SO_ISCONFIRMING, or SO_ISCONNECTED.
 *
 * note: the ref count on the socket is 0 on return
 */
struct socket *
sonewconn(head, connstatus)
	register struct socket *head;
	int connstatus;
{
	register struct socket *so;
	int over;

	/* Refuse if the completed queue is over 1.5x the listen backlog. */
	ACCEPT_LOCK();
	over = (head->so_qlen > 3 * head->so_qlimit / 2);
	ACCEPT_UNLOCK();
#ifdef REGRESSION
	if (regression_sonewconn_earlytest && over)
#else
	if (over)
#endif
		return (NULL);
	so = soalloc(M_NOWAIT);
	if (so == NULL)
		return (NULL);
	/*
	 * With an accept filter installed, the connection is queued as
	 * incomplete regardless of protocol state, so clear connstatus.
	 */
	if ((head->so_options & SO_ACCEPTFILTER) != 0)
		connstatus = 0;
	/* Inherit type, options, linger, and credentials from the head. */
	so->so_head = head;
	so->so_type = head->so_type;
	so->so_options = head->so_options &~ SO_ACCEPTCONN;
	so->so_linger = head->so_linger;
	so->so_state = head->so_state | SS_NOFDREF;
	so->so_proto = head->so_proto;
	so->so_timeo = head->so_timeo;
	so->so_cred = crhold(head->so_cred);
#ifdef MAC
	SOCK_LOCK(head);
	mac_create_socket_from_socket(head, so);
	SOCK_UNLOCK(head);
#endif
	knlist_init(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv),
	    NULL, NULL, NULL);
	knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd),
	    NULL, NULL, NULL);
	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat) ||
	    (*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
		sodealloc(so);
		return (NULL);
	}
	so->so_state |= connstatus;
	ACCEPT_LOCK();
	if (connstatus) {
		/* Already connected: place directly on the completed queue. */
		TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
		so->so_qstate |= SQ_COMP;
		head->so_qlen++;
	} else {
		/*
		 * Keep removing sockets from the head until there's room for
		 * us to insert on the tail.  In pre-locking revisions, this
		 * was a simple if(), but as we could be racing with other
		 * threads and soabort() requires dropping locks, we must
		 * loop waiting for the condition to be true.
		 */
		while (head->so_incqlen > head->so_qlimit) {
			struct socket *sp;
			sp = TAILQ_FIRST(&head->so_incomp);
			TAILQ_REMOVE(&head->so_incomp, sp, so_list);
			head->so_incqlen--;
			sp->so_qstate &= ~SQ_INCOMP;
			sp->so_head = NULL;
			/* soabort() must be called without socket locks. */
			ACCEPT_UNLOCK();
			soabort(sp);
			ACCEPT_LOCK();
		}
		TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
		so->so_qstate |= SQ_INCOMP;
		head->so_incqlen++;
	}
	ACCEPT_UNLOCK();
	if (connstatus) {
		/* Wake up accept()ers sleeping or selecting on the head. */
		sorwakeup(head);
		wakeup_one(&head->so_timeo);
	}
	return (so);
}

/*
 * sobind() simply forwards the request to the protocol's pru_bind method.
 */
int
sobind(so, nam, td)
	struct socket *so;
	struct sockaddr *nam;
	struct thread *td;
{

	return ((*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td));
}

/*
 * solisten() transitions a socket from a non-listening state to a listening
 * state, but can also be used to update the listen queue depth on an
 * existing listen socket.  The protocol will call back into the sockets
 * layer using solisten_proto_check() and solisten_proto() to check and set
 * socket-layer listen state.  Call backs are used so that the protocol can
 * acquire both protocol and socket layer locks in whatever order is required
 * by the protocol.
 *
 * Protocol implementors are advised to hold the socket lock across the
 * socket-layer test and set to avoid races at the socket layer.
481 */ 482int 483solisten(so, backlog, td) 484 struct socket *so; 485 int backlog; 486 struct thread *td; 487{ 488 489 return ((*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td)); 490} 491 492int 493solisten_proto_check(so) 494 struct socket *so; 495{ 496 497 SOCK_LOCK_ASSERT(so); 498 499 if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | 500 SS_ISDISCONNECTING)) 501 return (EINVAL); 502 return (0); 503} 504 505void 506solisten_proto(so, backlog) 507 struct socket *so; 508 int backlog; 509{ 510 511 SOCK_LOCK_ASSERT(so); 512 513 if (backlog < 0 || backlog > somaxconn) 514 backlog = somaxconn; 515 so->so_qlimit = backlog; 516 so->so_options |= SO_ACCEPTCONN; 517} 518 519/* 520 * Attempt to free a socket. This should really be sotryfree(). 521 * 522 * sofree() will succeed if: 523 * 524 * - There are no outstanding file descriptor references or related consumers 525 * (so_count == 0). 526 * 527 * - The socket has been closed by user space, if ever open (SS_NOFDREF). 528 * 529 * - The protocol does not have an outstanding strong reference on the socket 530 * (SS_PROTOREF). 531 * 532 * - The socket is not in a completed connection queue, so a process has been 533 * notified that it is present. If it is removed, the user process may 534 * block in accept() despite select() saying the socket was ready. 535 * 536 * Otherwise, it will quietly abort so that a future call to sofree(), when 537 * conditions are right, can succeed. 
 */
void
sofree(so)
	struct socket *so;
{
	struct socket *head;

	ACCEPT_LOCK_ASSERT();
	SOCK_LOCK_ASSERT(so);

	/*
	 * Bail (dropping both locks) unless all the conditions listed above
	 * hold; a later sofree() call will retry the free.
	 */
	if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 ||
	    (so->so_state & SS_PROTOREF) || (so->so_qstate & SQ_COMP)) {
		SOCK_UNLOCK(so);
		ACCEPT_UNLOCK();
		return;
	}

	/*
	 * If still attached to a listen socket it must, given the SQ_COMP
	 * test above, be on the incomplete queue: unlink it.
	 */
	head = so->so_head;
	if (head != NULL) {
		KASSERT((so->so_qstate & SQ_COMP) != 0 ||
		    (so->so_qstate & SQ_INCOMP) != 0,
		    ("sofree: so_head != NULL, but neither SQ_COMP nor "
		    "SQ_INCOMP"));
		KASSERT((so->so_qstate & SQ_COMP) == 0 ||
		    (so->so_qstate & SQ_INCOMP) == 0,
		    ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
		TAILQ_REMOVE(&head->so_incomp, so, so_list);
		head->so_incqlen--;
		so->so_qstate &= ~SQ_INCOMP;
		so->so_head = NULL;
	}
	KASSERT((so->so_qstate & SQ_COMP) == 0 &&
	    (so->so_qstate & SQ_INCOMP) == 0,
	    ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
	    so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
	SOCK_UNLOCK(so);
	ACCEPT_UNLOCK();

	/* Shut down and release the send buffer; SB_NOINTR makes the
	 * sblock() uninterruptible. */
	SOCKBUF_LOCK(&so->so_snd);
	so->so_snd.sb_flags |= SB_NOINTR;
	(void)sblock(&so->so_snd, M_WAITOK);
	/*
	 * socantsendmore_locked() drops the socket buffer mutex so that it
	 * can safely perform wakeups.  Re-acquire the mutex before
	 * continuing.
	 */
	socantsendmore_locked(so);
	SOCKBUF_LOCK(&so->so_snd);
	sbunlock(&so->so_snd);
	sbrelease_locked(&so->so_snd, so);
	SOCKBUF_UNLOCK(&so->so_snd);
	/* sorflush() performs the equivalent teardown of the receive side. */
	sorflush(so);
	knlist_destroy(&so->so_rcv.sb_sel.si_note);
	knlist_destroy(&so->so_snd.sb_sel.si_note);
	sodealloc(so);
}

/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 *
 * This function will sorele() the socket.  Note that soclose() may be
 * called prior to the ref count reaching zero.
 * The actual socket
 * structure will not be freed until the ref count reaches zero.
 */
int
soclose(so)
	struct socket *so;
{
	int error = 0;

	KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));

	funsetown(&so->so_sigio);
	/*
	 * For a listen socket, abort every queued connection, incomplete
	 * and completed alike.  soabort() must be called without socket
	 * locks, so drop and re-take the accept lock around each call.
	 */
	if (so->so_options & SO_ACCEPTCONN) {
		struct socket *sp;
		ACCEPT_LOCK();
		while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
			TAILQ_REMOVE(&so->so_incomp, sp, so_list);
			so->so_incqlen--;
			sp->so_qstate &= ~SQ_INCOMP;
			sp->so_head = NULL;
			ACCEPT_UNLOCK();
			soabort(sp);
			ACCEPT_LOCK();
		}
		while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
			TAILQ_REMOVE(&so->so_comp, sp, so_list);
			so->so_qlen--;
			sp->so_qstate &= ~SQ_COMP;
			sp->so_head = NULL;
			ACCEPT_UNLOCK();
			soabort(sp);
			ACCEPT_LOCK();
		}
		ACCEPT_UNLOCK();
	}
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		/*
		 * With SO_LINGER set, wait (up to so_linger seconds, or
		 * until interrupted) for the disconnect to complete; a
		 * non-blocking socket does not wait.
		 */
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO))
				goto drop;
			while (so->so_state & SS_ISCONNECTED) {
				error = tsleep(&so->so_timeo,
				    PSOCK | PCATCH, "soclos", so->so_linger * hz);
				if (error)
					break;
			}
		}
	}

drop:
	/* Detach protocol state, then drop the file descriptor reference. */
	(*so->so_proto->pr_usrreqs->pru_detach)(so);
	ACCEPT_LOCK();
	SOCK_LOCK(so);
	KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
	so->so_state |= SS_NOFDREF;
	sorele(so);
	return (error);
}

/*
 * soabort() allows the socket code or protocol code to detach a socket that
 * has been in an incomplete or completed listen queue, but has not yet been
 * accepted.
669 * 670 * This interface is tricky, because it is called on an unreferenced socket, 671 * and must be called only by a thread that has actually removed the socket 672 * from the listen queue it was on, or races with other threads are risked. 673 * 674 * This interface will call into the protocol code, so must not be called 675 * with any socket locks held. Protocols do call it while holding their own 676 * recursible protocol mutexes, but this is something that should be subject 677 * to review in the future. 678 * 679 * XXXRW: Why do we maintain a distinction between pru_abort() and 680 * pru_detach()? 681 */ 682void 683soabort(so) 684 struct socket *so; 685{ 686 687 /* 688 * In as much as is possible, assert that no references to this 689 * socket are held. This is not quite the same as asserting that the 690 * current thread is responsible for arranging for no references, but 691 * is as close as we can get for now. 692 */ 693 KASSERT(so->so_count == 0, ("soabort: so_count")); 694 KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF")); 695 KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF")); 696 KASSERT((so->so_state & SQ_COMP) == 0, ("soabort: SQ_COMP")); 697 KASSERT((so->so_state & SQ_INCOMP) == 0, ("soabort: SQ_INCOMP")); 698 699 (*so->so_proto->pr_usrreqs->pru_abort)(so); 700 ACCEPT_LOCK(); 701 SOCK_LOCK(so); 702 sofree(so); 703} 704 705int 706soaccept(so, nam) 707 struct socket *so; 708 struct sockaddr **nam; 709{ 710 int error; 711 712 SOCK_LOCK(so); 713 KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF")); 714 so->so_state &= ~SS_NOFDREF; 715 SOCK_UNLOCK(so); 716 error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam); 717 return (error); 718} 719 720int 721soconnect(so, nam, td) 722 struct socket *so; 723 struct sockaddr *nam; 724 struct thread *td; 725{ 726 int error; 727 728 if (so->so_options & SO_ACCEPTCONN) 729 return (EOPNOTSUPP); 730 /* 731 * If protocol is connection-based, can only connect once. 
732 * Otherwise, if connected, try to disconnect first. 733 * This allows user to disconnect by connecting to, e.g., 734 * a null address. 735 */ 736 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) && 737 ((so->so_proto->pr_flags & PR_CONNREQUIRED) || 738 (error = sodisconnect(so)))) { 739 error = EISCONN; 740 } else { 741 /* 742 * Prevent accumulated error from previous connection 743 * from biting us. 744 */ 745 so->so_error = 0; 746 error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td); 747 } 748 749 return (error); 750} 751 752int 753soconnect2(so1, so2) 754 struct socket *so1; 755 struct socket *so2; 756{ 757 758 return ((*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2)); 759} 760 761int 762sodisconnect(so) 763 struct socket *so; 764{ 765 int error; 766 767 if ((so->so_state & SS_ISCONNECTED) == 0) 768 return (ENOTCONN); 769 if (so->so_state & SS_ISDISCONNECTING) 770 return (EALREADY); 771 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so); 772 return (error); 773} 774 775#ifdef ZERO_COPY_SOCKETS 776struct so_zerocopy_stats{ 777 int size_ok; 778 int align_ok; 779 int found_ifp; 780}; 781struct so_zerocopy_stats so_zerocp_stats = {0,0,0}; 782#include <netinet/in.h> 783#include <net/route.h> 784#include <netinet/in_pcb.h> 785#include <vm/vm.h> 786#include <vm/vm_page.h> 787#include <vm/vm_object.h> 788#endif /*ZERO_COPY_SOCKETS*/ 789 790/* 791 * sosend_copyin() accepts a uio and prepares an mbuf chain holding part or 792 * all of the data referenced by the uio. If desired, it uses zero-copy. 793 * *space will be updated to reflect data copied in. 794 * 795 * NB: If atomic I/O is requested, the caller must already have checked that 796 * space can hold resid bytes. 797 * 798 * NB: In the event of an error, the caller may need to free the partial 799 * chain pointed to by *mpp. The contents of both *uio and *space may be 800 * modified even in the case of an error. 
801 */ 802static int 803sosend_copyin(struct uio *uio, struct mbuf **retmp, int atomic, long *space, 804 int flags) 805{ 806 struct mbuf *m, **mp, *top; 807 long len, resid; 808 int error; 809#ifdef ZERO_COPY_SOCKETS 810 int cow_send; 811#endif 812 813 *retmp = top = NULL; 814 mp = ⊤ 815 len = 0; 816 resid = uio->uio_resid; 817 error = 0; 818 do { 819#ifdef ZERO_COPY_SOCKETS 820 cow_send = 0; 821#endif /* ZERO_COPY_SOCKETS */ 822 if (resid >= MINCLSIZE) { 823#ifdef ZERO_COPY_SOCKETS 824 if (top == NULL) { 825 MGETHDR(m, M_TRYWAIT, MT_DATA); 826 if (m == NULL) { 827 error = ENOBUFS; 828 goto out; 829 } 830 m->m_pkthdr.len = 0; 831 m->m_pkthdr.rcvif = NULL; 832 } else { 833 MGET(m, M_TRYWAIT, MT_DATA); 834 if (m == NULL) { 835 error = ENOBUFS; 836 goto out; 837 } 838 } 839 if (so_zero_copy_send && 840 resid>=PAGE_SIZE && 841 *space>=PAGE_SIZE && 842 uio->uio_iov->iov_len>=PAGE_SIZE) { 843 so_zerocp_stats.size_ok++; 844 so_zerocp_stats.align_ok++; 845 cow_send = socow_setup(m, uio); 846 len = cow_send; 847 } 848 if (!cow_send) { 849 MCLGET(m, M_TRYWAIT); 850 if ((m->m_flags & M_EXT) == 0) { 851 m_free(m); 852 m = NULL; 853 } else { 854 len = min(min(MCLBYTES, resid), 855 *space); 856 } 857 } 858#else /* ZERO_COPY_SOCKETS */ 859 if (top == NULL) { 860 m = m_getcl(M_TRYWAIT, MT_DATA, M_PKTHDR); 861 m->m_pkthdr.len = 0; 862 m->m_pkthdr.rcvif = NULL; 863 } else 864 m = m_getcl(M_TRYWAIT, MT_DATA, 0); 865 len = min(min(MCLBYTES, resid), *space); 866#endif /* ZERO_COPY_SOCKETS */ 867 } else { 868 if (top == NULL) { 869 m = m_gethdr(M_TRYWAIT, MT_DATA); 870 m->m_pkthdr.len = 0; 871 m->m_pkthdr.rcvif = NULL; 872 873 len = min(min(MHLEN, resid), *space); 874 /* 875 * For datagram protocols, leave room 876 * for protocol headers in first mbuf. 
877 */ 878 if (atomic && m && len < MHLEN) 879 MH_ALIGN(m, len); 880 } else { 881 m = m_get(M_TRYWAIT, MT_DATA); 882 len = min(min(MLEN, resid), *space); 883 } 884 } 885 if (m == NULL) { 886 error = ENOBUFS; 887 goto out; 888 } 889 890 *space -= len; 891#ifdef ZERO_COPY_SOCKETS 892 if (cow_send) 893 error = 0; 894 else 895#endif /* ZERO_COPY_SOCKETS */ 896 error = uiomove(mtod(m, void *), (int)len, uio); 897 resid = uio->uio_resid; 898 m->m_len = len; 899 *mp = m; 900 top->m_pkthdr.len += len; 901 if (error) 902 goto out; 903 mp = &m->m_next; 904 if (resid <= 0) { 905 if (flags & MSG_EOR) 906 top->m_flags |= M_EOR; 907 break; 908 } 909 } while (*space > 0 && atomic); 910out: 911 *retmp = top; 912 return (error); 913} 914 915#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK) 916 917int 918sosend_dgram(so, addr, uio, top, control, flags, td) 919 struct socket *so; 920 struct sockaddr *addr; 921 struct uio *uio; 922 struct mbuf *top; 923 struct mbuf *control; 924 int flags; 925 struct thread *td; 926{ 927 long space, resid; 928 int clen = 0, error, dontroute; 929 int atomic = sosendallatonce(so) || top; 930 931 KASSERT(so->so_type == SOCK_DGRAM, ("sodgram_send: !SOCK_DGRAM")); 932 KASSERT(so->so_proto->pr_flags & PR_ATOMIC, 933 ("sodgram_send: !PR_ATOMIC")); 934 935 if (uio != NULL) 936 resid = uio->uio_resid; 937 else 938 resid = top->m_pkthdr.len; 939 /* 940 * In theory resid should be unsigned. 941 * However, space must be signed, as it might be less than 0 942 * if we over-committed, and we must use a signed comparison 943 * of space and resid. On the other hand, a negative resid 944 * causes us to loop sending 0-length segments to the protocol. 945 * 946 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM 947 * type sockets since that's an error. 
948 */ 949 if (resid < 0) { 950 error = EINVAL; 951 goto out; 952 } 953 954 dontroute = 955 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0; 956 if (td != NULL) 957 td->td_proc->p_stats->p_ru.ru_msgsnd++; 958 if (control != NULL) 959 clen = control->m_len; 960 961 SOCKBUF_LOCK(&so->so_snd); 962 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 963 SOCKBUF_UNLOCK(&so->so_snd); 964 error = EPIPE; 965 goto out; 966 } 967 if (so->so_error) { 968 error = so->so_error; 969 so->so_error = 0; 970 SOCKBUF_UNLOCK(&so->so_snd); 971 goto out; 972 } 973 if ((so->so_state & SS_ISCONNECTED) == 0) { 974 /* 975 * `sendto' and `sendmsg' is allowed on a connection- 976 * based socket if it supports implied connect. 977 * Return ENOTCONN if not connected and no address is 978 * supplied. 979 */ 980 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && 981 (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { 982 if ((so->so_state & SS_ISCONFIRMING) == 0 && 983 !(resid == 0 && clen != 0)) { 984 SOCKBUF_UNLOCK(&so->so_snd); 985 error = ENOTCONN; 986 goto out; 987 } 988 } else if (addr == NULL) { 989 if (so->so_proto->pr_flags & PR_CONNREQUIRED) 990 error = ENOTCONN; 991 else 992 error = EDESTADDRREQ; 993 SOCKBUF_UNLOCK(&so->so_snd); 994 goto out; 995 } 996 } 997 998 /* 999 * Do we need MSG_OOB support in SOCK_DGRAM? Signs here may be a 1000 * problem and need fixing. 1001 */ 1002 space = sbspace(&so->so_snd); 1003 if (flags & MSG_OOB) 1004 space += 1024; 1005 space -= clen; 1006 if (resid > space) { 1007 error = EMSGSIZE; 1008 goto out; 1009 } 1010 SOCKBUF_UNLOCK(&so->so_snd); 1011 if (uio == NULL) { 1012 resid = 0; 1013 if (flags & MSG_EOR) 1014 top->m_flags |= M_EOR; 1015 } else { 1016 error = sosend_copyin(uio, &top, atomic, &space, flags); 1017 if (error) 1018 goto out; 1019 resid = uio->uio_resid; 1020 } 1021 KASSERT(resid == 0, ("sosend_dgram: resid != 0")); 1022 /* 1023 * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock 1024 * than with. 
1025 */ 1026 if (dontroute) { 1027 SOCK_LOCK(so); 1028 so->so_options |= SO_DONTROUTE; 1029 SOCK_UNLOCK(so); 1030 } 1031 /* 1032 * XXX all the SBS_CANTSENDMORE checks previously 1033 * done could be out of date. We could have recieved 1034 * a reset packet in an interrupt or maybe we slept 1035 * while doing page faults in uiomove() etc. We could 1036 * probably recheck again inside the locking protection 1037 * here, but there are probably other places that this 1038 * also happens. We must rethink this. 1039 */ 1040 error = (*so->so_proto->pr_usrreqs->pru_send)(so, 1041 (flags & MSG_OOB) ? PRUS_OOB : 1042 /* 1043 * If the user set MSG_EOF, the protocol 1044 * understands this flag and nothing left to 1045 * send then use PRU_SEND_EOF instead of PRU_SEND. 1046 */ 1047 ((flags & MSG_EOF) && 1048 (so->so_proto->pr_flags & PR_IMPLOPCL) && 1049 (resid <= 0)) ? 1050 PRUS_EOF : 1051 /* If there is more to send set PRUS_MORETOCOME */ 1052 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0, 1053 top, addr, control, td); 1054 if (dontroute) { 1055 SOCK_LOCK(so); 1056 so->so_options &= ~SO_DONTROUTE; 1057 SOCK_UNLOCK(so); 1058 } 1059 clen = 0; 1060 control = NULL; 1061 top = NULL; 1062out: 1063 if (top != NULL) 1064 m_freem(top); 1065 if (control != NULL) 1066 m_freem(control); 1067 return (error); 1068} 1069 1070/* 1071 * Send on a socket. 1072 * If send must go all at once and message is larger than 1073 * send buffering, then hard error. 1074 * Lock against other senders. 1075 * If must go all at once and not enough room now, then 1076 * inform user that this would block and do nothing. 1077 * Otherwise, if nonblocking, send as much as possible. 1078 * The data to be sent is described by "uio" if nonzero, 1079 * otherwise by the mbuf chain "top" (which must be null 1080 * if uio is not). Data provided in mbuf chain must be small 1081 * enough to send all at once. 
 *
 * Returns nonzero on error, timeout or signal; callers
 * must check for short counts if EINTR/ERESTART are returned.
 * Data and control buffers are freed on return.
 */
#define snderr(errno)	{ error = (errno); goto release; }
int
sosend(so, addr, uio, top, control, flags, td)
	struct socket *so;
	struct sockaddr *addr;
	struct uio *uio;
	struct mbuf *top;
	struct mbuf *control;
	int flags;
	struct thread *td;
{
	long space, resid;
	int clen = 0, error, dontroute;
	int atomic = sosendallatonce(so) || top;

	/*
	 * The byte count to send comes either from the uio (normal path)
	 * or from the caller-supplied mbuf chain 'top' (uio == NULL).
	 */
	if (uio != NULL)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned.
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid.  On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
	 *
	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
	 * type sockets since that's an error.
	 */
	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
		error = EINVAL;
		goto out;
	}

	/*
	 * Per-call MSG_DONTROUTE is implemented below by temporarily
	 * toggling SO_DONTROUTE around the pru_send call; it only applies
	 * when the option is not already set and the protocol is atomic.
	 */
	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	if (td != NULL)
		td->td_proc->p_stats->p_ru.ru_msgsnd++;
	if (control != NULL)
		clen = control->m_len;

	SOCKBUF_LOCK(&so->so_snd);
restart:
	SOCKBUF_LOCK_ASSERT(&so->so_snd);
	/* Serialize against other senders on this socket. */
	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out_locked;
	do {
		SOCKBUF_LOCK_ASSERT(&so->so_snd);
		if (so->so_snd.sb_state & SBS_CANTSENDMORE)
			snderr(EPIPE);
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			/*
			 * `sendto' and `sendmsg' is allowed on a connection-
			 * based socket if it supports implied connect.
			 * Return ENOTCONN if not connected and no address is
			 * supplied.
			 */
			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
				    !(resid == 0 && clen != 0))
					snderr(ENOTCONN);
			} else if (addr == NULL)
				snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
				    ENOTCONN : EDESTADDRREQ);
		}
		space = sbspace(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		/* An atomic send that can never fit is a hard error. */
		if ((atomic && resid > so->so_snd.sb_hiwat) ||
		    clen > so->so_snd.sb_hiwat)
			snderr(EMSGSIZE);
		if (space < resid + clen &&
		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
			if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO))
				snderr(EWOULDBLOCK);
			/* Drop sender serialization and sleep for space. */
			sbunlock(&so->so_snd);
			error = sbwait(&so->so_snd);
			if (error)
				goto out_locked;
			goto restart;
		}
		SOCKBUF_UNLOCK(&so->so_snd);
		space -= clen;
		do {
			if (uio == NULL) {
				/* Caller-supplied chain: send in one shot. */
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else {
				error = sosend_copyin(uio, &top, atomic,
				    &space, flags);
				if (error != 0) {
					SOCKBUF_LOCK(&so->so_snd);
					goto release;
				}
				resid = uio->uio_resid;
			}
			if (dontroute) {
				SOCK_LOCK(so);
				so->so_options |= SO_DONTROUTE;
				SOCK_UNLOCK(so);
			}
			/*
			 * XXX all the SBS_CANTSENDMORE checks previously
			 * done could be out of date.  We could have received
			 * a reset packet in an interrupt or maybe we slept
			 * while doing page faults in uiomove() etc.  We could
			 * probably recheck again inside the locking protection
			 * here, but there are probably other places that this
			 * also happens.  We must rethink this.
			 */
			error = (*so->so_proto->pr_usrreqs->pru_send)(so,
			    (flags & MSG_OOB) ? PRUS_OOB :
			/*
			 * If the user set MSG_EOF, the protocol
			 * understands this flag and nothing left to
			 * send then use PRU_SEND_EOF instead of PRU_SEND.
			 */
			    ((flags & MSG_EOF) &&
			     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
			     (resid <= 0)) ?
				PRUS_EOF :
			/* If there is more to send set PRUS_MORETOCOME */
			    (resid > 0 && space > 0) ?
				PRUS_MORETOCOME : 0,
			    top, addr, control, td);
			if (dontroute) {
				SOCK_LOCK(so);
				so->so_options &= ~SO_DONTROUTE;
				SOCK_UNLOCK(so);
			}
			/*
			 * Ownership of 'top' and 'control' has passed to the
			 * protocol; clear them so the 'out' path does not
			 * free them again.
			 */
			clen = 0;
			control = NULL;
			top = NULL;
			if (error) {
				SOCKBUF_LOCK(&so->so_snd);
				goto release;
			}
		} while (resid && space > 0);
		SOCKBUF_LOCK(&so->so_snd);
	} while (resid);

release:
	SOCKBUF_LOCK_ASSERT(&so->so_snd);
	sbunlock(&so->so_snd);
out_locked:
	SOCKBUF_LOCK_ASSERT(&so->so_snd);
	SOCKBUF_UNLOCK(&so->so_snd);
out:
	/* Free whatever data/control mbufs were not handed to pru_send. */
	if (top != NULL)
		m_freem(top);
	if (control != NULL)
		m_freem(control);
	return (error);
}
#undef snderr

/*
 * The part of soreceive() that implements reading non-inline out-of-band
 * data from a socket.  For more complete comments, see soreceive(), from
 * which this code originated.
 *
 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
 * unable to return an mbuf chain to the caller.
 */
static int
soreceive_rcvoob(so, uio, flags)
	struct socket *so;
	struct uio *uio;
	int flags;
{
	struct protosw *pr = so->so_proto;
	struct mbuf *m;
	int error;

	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));

	/*
	 * NOTE(review): with M_TRYWAIT the allocation sleeps; the NULL
	 * check looks defensive — confirm m_get() can actually fail here.
	 */
	m = m_get(M_TRYWAIT, MT_DATA);
	if (m == NULL)
		return (ENOBUFS);
	/* Ask the protocol to fill 'm' with the out-of-band byte(s). */
	error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
	if (error)
		goto bad;
	do {
#ifdef ZERO_COPY_SOCKETS
		if (so_zero_copy_receive) {
			int disposable;

			if ((m->m_flags & M_EXT)
			 && (m->m_ext.ext_type == EXT_DISPOSABLE))
				disposable = 1;
			else
				disposable = 0;

			error = uiomoveco(mtod(m, void *),
					  min(uio->uio_resid, m->m_len),
					  uio, disposable);
		} else
#endif /* ZERO_COPY_SOCKETS */
		error = uiomove(mtod(m, void *),
		    (int) min(uio->uio_resid, m->m_len), uio);
		/* m_free() returns the next mbuf of the chain (or NULL). */
		m = m_free(m);
	} while (uio->uio_resid && error == 0 && m);
bad:
	if (m != NULL)
		m_freem(m);
	return (error);
}

/*
 * Following replacement or removal of the first mbuf on the first mbuf chain
 * of a socket buffer, push necessary state changes back into the socket
 * buffer so that other consumers see the values consistently.  'nextrecord'
 * is the callers locally stored value of the original value of
 * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
 * NOTE: 'nextrecord' may be NULL.
 */
static __inline void
sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
{

	SOCKBUF_LOCK_ASSERT(sb);
	/*
	 * First, update for the new value of nextrecord.  If necessary, make
	 * it the first record.
	 */
	if (sb->sb_mb != NULL)
		sb->sb_mb->m_nextpkt = nextrecord;
	else
		sb->sb_mb = nextrecord;

	/*
	 * Now update any dependent socket buffer fields to reflect the new
	 * state.  This is an expanded inline of SB_EMPTY_FIXUP(), with the
	 * addition of a second clause that takes care of the case where
	 * sb_mb has been updated, but remains the last record.
	 */
	if (sb->sb_mb == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (sb->sb_mb->m_nextpkt == NULL)
		sb->sb_lastrecord = sb->sb_mb;
}


/*
 * Implement receive operations on a socket.
 * We depend on the way that records are added to the sockbuf
 * by sbappend*.  In particular, each record (mbufs linked through m_next)
 * must begin with an address if the protocol so specifies,
 * followed by an optional mbuf or mbufs containing ancillary data,
 * and then zero or more mbufs of data.
 * In order to avoid blocking network interrupts for the entire time here,
 * we splx() while doing the actual copy to user space.
 * Although the sockbuf is locked, new data may still be appended,
 * and thus we must maintain consistency of the sockbuf during that time.
 *
 * The caller may receive the data as a single mbuf chain by supplying
 * an mbuf **mp0 for use in returning the chain.  The uio is then used
 * only for the count in uio_resid.
1354 */ 1355int 1356soreceive(so, psa, uio, mp0, controlp, flagsp) 1357 struct socket *so; 1358 struct sockaddr **psa; 1359 struct uio *uio; 1360 struct mbuf **mp0; 1361 struct mbuf **controlp; 1362 int *flagsp; 1363{ 1364 struct mbuf *m, **mp; 1365 int flags, len, error, offset; 1366 struct protosw *pr = so->so_proto; 1367 struct mbuf *nextrecord; 1368 int moff, type = 0; 1369 int orig_resid = uio->uio_resid; 1370 1371 mp = mp0; 1372 if (psa != NULL) 1373 *psa = NULL; 1374 if (controlp != NULL) 1375 *controlp = NULL; 1376 if (flagsp != NULL) 1377 flags = *flagsp &~ MSG_EOR; 1378 else 1379 flags = 0; 1380 if (flags & MSG_OOB) 1381 return (soreceive_rcvoob(so, uio, flags)); 1382 if (mp != NULL) 1383 *mp = NULL; 1384 if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING) 1385 && uio->uio_resid) 1386 (*pr->pr_usrreqs->pru_rcvd)(so, 0); 1387 1388 SOCKBUF_LOCK(&so->so_rcv); 1389restart: 1390 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1391 error = sblock(&so->so_rcv, SBLOCKWAIT(flags)); 1392 if (error) 1393 goto out; 1394 1395 m = so->so_rcv.sb_mb; 1396 /* 1397 * If we have less data than requested, block awaiting more 1398 * (subject to any timeout) if: 1399 * 1. the current count is less than the low water mark, or 1400 * 2. MSG_WAITALL is set, and it is possible to do the entire 1401 * receive operation at once if we block (resid <= hiwat). 1402 * 3. MSG_DONTWAIT is not set 1403 * If MSG_WAITALL is set but resid is larger than the receive buffer, 1404 * we have to do the receive in sections, and thus risk returning 1405 * a short count if a timeout or signal occurs after we start. 
1406 */ 1407 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 && 1408 so->so_rcv.sb_cc < uio->uio_resid) && 1409 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat || 1410 ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) && 1411 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) { 1412 KASSERT(m != NULL || !so->so_rcv.sb_cc, 1413 ("receive: m == %p so->so_rcv.sb_cc == %u", 1414 m, so->so_rcv.sb_cc)); 1415 if (so->so_error) { 1416 if (m != NULL) 1417 goto dontblock; 1418 error = so->so_error; 1419 if ((flags & MSG_PEEK) == 0) 1420 so->so_error = 0; 1421 goto release; 1422 } 1423 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1424 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 1425 if (m) 1426 goto dontblock; 1427 else 1428 goto release; 1429 } 1430 for (; m != NULL; m = m->m_next) 1431 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { 1432 m = so->so_rcv.sb_mb; 1433 goto dontblock; 1434 } 1435 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 && 1436 (so->so_proto->pr_flags & PR_CONNREQUIRED)) { 1437 error = ENOTCONN; 1438 goto release; 1439 } 1440 if (uio->uio_resid == 0) 1441 goto release; 1442 if ((so->so_state & SS_NBIO) || 1443 (flags & (MSG_DONTWAIT|MSG_NBIO))) { 1444 error = EWOULDBLOCK; 1445 goto release; 1446 } 1447 SBLASTRECORDCHK(&so->so_rcv); 1448 SBLASTMBUFCHK(&so->so_rcv); 1449 sbunlock(&so->so_rcv); 1450 error = sbwait(&so->so_rcv); 1451 if (error) 1452 goto out; 1453 goto restart; 1454 } 1455dontblock: 1456 /* 1457 * From this point onward, we maintain 'nextrecord' as a cache of the 1458 * pointer to the next record in the socket buffer. We must keep the 1459 * various socket buffer pointers and local stack versions of the 1460 * pointers in sync, pushing out modifications before dropping the 1461 * socket buffer mutex, and re-reading them when picking it up. 
1462 * 1463 * Otherwise, we will race with the network stack appending new data 1464 * or records onto the socket buffer by using inconsistent/stale 1465 * versions of the field, possibly resulting in socket buffer 1466 * corruption. 1467 * 1468 * By holding the high-level sblock(), we prevent simultaneous 1469 * readers from pulling off the front of the socket buffer. 1470 */ 1471 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1472 if (uio->uio_td) 1473 uio->uio_td->td_proc->p_stats->p_ru.ru_msgrcv++; 1474 KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb")); 1475 SBLASTRECORDCHK(&so->so_rcv); 1476 SBLASTMBUFCHK(&so->so_rcv); 1477 nextrecord = m->m_nextpkt; 1478 if (pr->pr_flags & PR_ADDR) { 1479 KASSERT(m->m_type == MT_SONAME, 1480 ("m->m_type == %d", m->m_type)); 1481 orig_resid = 0; 1482 if (psa != NULL) 1483 *psa = sodupsockaddr(mtod(m, struct sockaddr *), 1484 M_NOWAIT); 1485 if (flags & MSG_PEEK) { 1486 m = m->m_next; 1487 } else { 1488 sbfree(&so->so_rcv, m); 1489 so->so_rcv.sb_mb = m_free(m); 1490 m = so->so_rcv.sb_mb; 1491 sockbuf_pushsync(&so->so_rcv, nextrecord); 1492 } 1493 } 1494 1495 /* 1496 * Process one or more MT_CONTROL mbufs present before any data mbufs 1497 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we 1498 * just copy the data; if !MSG_PEEK, we call into the protocol to 1499 * perform externalization (or freeing if controlp == NULL). 
1500 */ 1501 if (m != NULL && m->m_type == MT_CONTROL) { 1502 struct mbuf *cm = NULL, *cmn; 1503 struct mbuf **cme = &cm; 1504 1505 do { 1506 if (flags & MSG_PEEK) { 1507 if (controlp != NULL) { 1508 *controlp = m_copy(m, 0, m->m_len); 1509 controlp = &(*controlp)->m_next; 1510 } 1511 m = m->m_next; 1512 } else { 1513 sbfree(&so->so_rcv, m); 1514 so->so_rcv.sb_mb = m->m_next; 1515 m->m_next = NULL; 1516 *cme = m; 1517 cme = &(*cme)->m_next; 1518 m = so->so_rcv.sb_mb; 1519 } 1520 } while (m != NULL && m->m_type == MT_CONTROL); 1521 if ((flags & MSG_PEEK) == 0) 1522 sockbuf_pushsync(&so->so_rcv, nextrecord); 1523 while (cm != NULL) { 1524 cmn = cm->m_next; 1525 cm->m_next = NULL; 1526 if (pr->pr_domain->dom_externalize != NULL) { 1527 SOCKBUF_UNLOCK(&so->so_rcv); 1528 error = (*pr->pr_domain->dom_externalize) 1529 (cm, controlp); 1530 SOCKBUF_LOCK(&so->so_rcv); 1531 } else if (controlp != NULL) 1532 *controlp = cm; 1533 else 1534 m_freem(cm); 1535 if (controlp != NULL) { 1536 orig_resid = 0; 1537 while (*controlp != NULL) 1538 controlp = &(*controlp)->m_next; 1539 } 1540 cm = cmn; 1541 } 1542 if (so->so_rcv.sb_mb) 1543 nextrecord = so->so_rcv.sb_mb->m_nextpkt; 1544 else 1545 nextrecord = NULL; 1546 orig_resid = 0; 1547 } 1548 if (m != NULL) { 1549 if ((flags & MSG_PEEK) == 0) { 1550 KASSERT(m->m_nextpkt == nextrecord, 1551 ("soreceive: post-control, nextrecord !sync")); 1552 if (nextrecord == NULL) { 1553 KASSERT(so->so_rcv.sb_mb == m, 1554 ("soreceive: post-control, sb_mb!=m")); 1555 KASSERT(so->so_rcv.sb_lastrecord == m, 1556 ("soreceive: post-control, lastrecord!=m")); 1557 } 1558 } 1559 type = m->m_type; 1560 if (type == MT_OOBDATA) 1561 flags |= MSG_OOB; 1562 } else { 1563 if ((flags & MSG_PEEK) == 0) { 1564 KASSERT(so->so_rcv.sb_mb == nextrecord, 1565 ("soreceive: sb_mb != nextrecord")); 1566 if (so->so_rcv.sb_mb == NULL) { 1567 KASSERT(so->so_rcv.sb_lastrecord == NULL, 1568 ("soreceive: sb_lastercord != NULL")); 1569 } 1570 } 1571 } 1572 
SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1573 SBLASTRECORDCHK(&so->so_rcv); 1574 SBLASTMBUFCHK(&so->so_rcv); 1575 1576 /* 1577 * Now continue to read any data mbufs off of the head of the socket 1578 * buffer until the read request is satisfied. Note that 'type' is 1579 * used to store the type of any mbuf reads that have happened so far 1580 * such that soreceive() can stop reading if the type changes, which 1581 * causes soreceive() to return only one of regular data and inline 1582 * out-of-band data in a single socket receive operation. 1583 */ 1584 moff = 0; 1585 offset = 0; 1586 while (m != NULL && uio->uio_resid > 0 && error == 0) { 1587 /* 1588 * If the type of mbuf has changed since the last mbuf 1589 * examined ('type'), end the receive operation. 1590 */ 1591 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1592 if (m->m_type == MT_OOBDATA) { 1593 if (type != MT_OOBDATA) 1594 break; 1595 } else if (type == MT_OOBDATA) 1596 break; 1597 else 1598 KASSERT(m->m_type == MT_DATA, 1599 ("m->m_type == %d", m->m_type)); 1600 so->so_rcv.sb_state &= ~SBS_RCVATMARK; 1601 len = uio->uio_resid; 1602 if (so->so_oobmark && len > so->so_oobmark - offset) 1603 len = so->so_oobmark - offset; 1604 if (len > m->m_len - moff) 1605 len = m->m_len - moff; 1606 /* 1607 * If mp is set, just pass back the mbufs. 1608 * Otherwise copy them out via the uio, then free. 1609 * Sockbuf must be consistent here (points to current mbuf, 1610 * it points to next record) when we drop priority; 1611 * we must note any additions to the sockbuf when we 1612 * block interrupts again. 
1613 */ 1614 if (mp == NULL) { 1615 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1616 SBLASTRECORDCHK(&so->so_rcv); 1617 SBLASTMBUFCHK(&so->so_rcv); 1618 SOCKBUF_UNLOCK(&so->so_rcv); 1619#ifdef ZERO_COPY_SOCKETS 1620 if (so_zero_copy_receive) { 1621 int disposable; 1622 1623 if ((m->m_flags & M_EXT) 1624 && (m->m_ext.ext_type == EXT_DISPOSABLE)) 1625 disposable = 1; 1626 else 1627 disposable = 0; 1628 1629 error = uiomoveco(mtod(m, char *) + moff, 1630 (int)len, uio, 1631 disposable); 1632 } else 1633#endif /* ZERO_COPY_SOCKETS */ 1634 error = uiomove(mtod(m, char *) + moff, (int)len, uio); 1635 SOCKBUF_LOCK(&so->so_rcv); 1636 if (error) 1637 goto release; 1638 } else 1639 uio->uio_resid -= len; 1640 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1641 if (len == m->m_len - moff) { 1642 if (m->m_flags & M_EOR) 1643 flags |= MSG_EOR; 1644 if (flags & MSG_PEEK) { 1645 m = m->m_next; 1646 moff = 0; 1647 } else { 1648 nextrecord = m->m_nextpkt; 1649 sbfree(&so->so_rcv, m); 1650 if (mp != NULL) { 1651 *mp = m; 1652 mp = &m->m_next; 1653 so->so_rcv.sb_mb = m = m->m_next; 1654 *mp = NULL; 1655 } else { 1656 so->so_rcv.sb_mb = m_free(m); 1657 m = so->so_rcv.sb_mb; 1658 } 1659 sockbuf_pushsync(&so->so_rcv, nextrecord); 1660 SBLASTRECORDCHK(&so->so_rcv); 1661 SBLASTMBUFCHK(&so->so_rcv); 1662 } 1663 } else { 1664 if (flags & MSG_PEEK) 1665 moff += len; 1666 else { 1667 if (mp != NULL) { 1668 int copy_flag; 1669 1670 if (flags & MSG_DONTWAIT) 1671 copy_flag = M_DONTWAIT; 1672 else 1673 copy_flag = M_TRYWAIT; 1674 if (copy_flag == M_TRYWAIT) 1675 SOCKBUF_UNLOCK(&so->so_rcv); 1676 *mp = m_copym(m, 0, len, copy_flag); 1677 if (copy_flag == M_TRYWAIT) 1678 SOCKBUF_LOCK(&so->so_rcv); 1679 if (*mp == NULL) { 1680 /* 1681 * m_copym() couldn't allocate an mbuf. 1682 * Adjust uio_resid back (it was adjusted 1683 * down by len bytes, which we didn't end 1684 * up "copying" over). 
1685 */ 1686 uio->uio_resid += len; 1687 break; 1688 } 1689 } 1690 m->m_data += len; 1691 m->m_len -= len; 1692 so->so_rcv.sb_cc -= len; 1693 } 1694 } 1695 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1696 if (so->so_oobmark) { 1697 if ((flags & MSG_PEEK) == 0) { 1698 so->so_oobmark -= len; 1699 if (so->so_oobmark == 0) { 1700 so->so_rcv.sb_state |= SBS_RCVATMARK; 1701 break; 1702 } 1703 } else { 1704 offset += len; 1705 if (offset == so->so_oobmark) 1706 break; 1707 } 1708 } 1709 if (flags & MSG_EOR) 1710 break; 1711 /* 1712 * If the MSG_WAITALL flag is set (for non-atomic socket), 1713 * we must not quit until "uio->uio_resid == 0" or an error 1714 * termination. If a signal/timeout occurs, return 1715 * with a short count but without error. 1716 * Keep sockbuf locked against other readers. 1717 */ 1718 while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 && 1719 !sosendallatonce(so) && nextrecord == NULL) { 1720 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1721 if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE) 1722 break; 1723 /* 1724 * Notify the protocol that some data has been 1725 * drained before blocking. 1726 */ 1727 if (pr->pr_flags & PR_WANTRCVD) { 1728 SOCKBUF_UNLOCK(&so->so_rcv); 1729 (*pr->pr_usrreqs->pru_rcvd)(so, flags); 1730 SOCKBUF_LOCK(&so->so_rcv); 1731 } 1732 SBLASTRECORDCHK(&so->so_rcv); 1733 SBLASTMBUFCHK(&so->so_rcv); 1734 error = sbwait(&so->so_rcv); 1735 if (error) 1736 goto release; 1737 m = so->so_rcv.sb_mb; 1738 if (m != NULL) 1739 nextrecord = m->m_nextpkt; 1740 } 1741 } 1742 1743 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1744 if (m != NULL && pr->pr_flags & PR_ATOMIC) { 1745 flags |= MSG_TRUNC; 1746 if ((flags & MSG_PEEK) == 0) 1747 (void) sbdroprecord_locked(&so->so_rcv); 1748 } 1749 if ((flags & MSG_PEEK) == 0) { 1750 if (m == NULL) { 1751 /* 1752 * First part is an inline SB_EMPTY_FIXUP(). Second 1753 * part makes sure sb_lastrecord is up-to-date if 1754 * there is still data in the socket buffer. 
1755 */ 1756 so->so_rcv.sb_mb = nextrecord; 1757 if (so->so_rcv.sb_mb == NULL) { 1758 so->so_rcv.sb_mbtail = NULL; 1759 so->so_rcv.sb_lastrecord = NULL; 1760 } else if (nextrecord->m_nextpkt == NULL) 1761 so->so_rcv.sb_lastrecord = nextrecord; 1762 } 1763 SBLASTRECORDCHK(&so->so_rcv); 1764 SBLASTMBUFCHK(&so->so_rcv); 1765 /* 1766 * If soreceive() is being done from the socket callback, then 1767 * don't need to generate ACK to peer to update window, since 1768 * ACK will be generated on return to TCP. 1769 */ 1770 if (!(flags & MSG_SOCALLBCK) && 1771 (pr->pr_flags & PR_WANTRCVD)) { 1772 SOCKBUF_UNLOCK(&so->so_rcv); 1773 (*pr->pr_usrreqs->pru_rcvd)(so, flags); 1774 SOCKBUF_LOCK(&so->so_rcv); 1775 } 1776 } 1777 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1778 if (orig_resid == uio->uio_resid && orig_resid && 1779 (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) { 1780 sbunlock(&so->so_rcv); 1781 goto restart; 1782 } 1783 1784 if (flagsp != NULL) 1785 *flagsp |= flags; 1786release: 1787 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1788 sbunlock(&so->so_rcv); 1789out: 1790 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1791 SOCKBUF_UNLOCK(&so->so_rcv); 1792 return (error); 1793} 1794 1795int 1796soshutdown(so, how) 1797 struct socket *so; 1798 int how; 1799{ 1800 struct protosw *pr = so->so_proto; 1801 1802 if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR)) 1803 return (EINVAL); 1804 1805 if (how != SHUT_WR) 1806 sorflush(so); 1807 if (how != SHUT_RD) 1808 return ((*pr->pr_usrreqs->pru_shutdown)(so)); 1809 return (0); 1810} 1811 1812void 1813sorflush(so) 1814 struct socket *so; 1815{ 1816 struct sockbuf *sb = &so->so_rcv; 1817 struct protosw *pr = so->so_proto; 1818 struct sockbuf asb; 1819 1820 /* 1821 * XXXRW: This is quite ugly. Previously, this code made a copy of 1822 * the socket buffer, then zero'd the original to clear the buffer 1823 * fields. However, with mutexes in the socket buffer, this causes 1824 * problems. 
We only clear the zeroable bits of the original; 1825 * however, we have to initialize and destroy the mutex in the copy 1826 * so that dom_dispose() and sbrelease() can lock t as needed. 1827 */ 1828 SOCKBUF_LOCK(sb); 1829 sb->sb_flags |= SB_NOINTR; 1830 (void) sblock(sb, M_WAITOK); 1831 /* 1832 * socantrcvmore_locked() drops the socket buffer mutex so that it 1833 * can safely perform wakeups. Re-acquire the mutex before 1834 * continuing. 1835 */ 1836 socantrcvmore_locked(so); 1837 SOCKBUF_LOCK(sb); 1838 sbunlock(sb); 1839 /* 1840 * Invalidate/clear most of the sockbuf structure, but leave 1841 * selinfo and mutex data unchanged. 1842 */ 1843 bzero(&asb, offsetof(struct sockbuf, sb_startzero)); 1844 bcopy(&sb->sb_startzero, &asb.sb_startzero, 1845 sizeof(*sb) - offsetof(struct sockbuf, sb_startzero)); 1846 bzero(&sb->sb_startzero, 1847 sizeof(*sb) - offsetof(struct sockbuf, sb_startzero)); 1848 SOCKBUF_UNLOCK(sb); 1849 1850 SOCKBUF_LOCK_INIT(&asb, "so_rcv"); 1851 if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL) 1852 (*pr->pr_domain->dom_dispose)(asb.sb_mb); 1853 sbrelease(&asb, so); 1854 SOCKBUF_LOCK_DESTROY(&asb); 1855} 1856 1857/* 1858 * Perhaps this routine, and sooptcopyout(), below, ought to come in 1859 * an additional variant to handle the case where the option value needs 1860 * to be some kind of integer, but not a specific size. 1861 * In addition to their use here, these functions are also called by the 1862 * protocol-level pr_ctloutput() routines. 1863 */ 1864int 1865sooptcopyin(sopt, buf, len, minlen) 1866 struct sockopt *sopt; 1867 void *buf; 1868 size_t len; 1869 size_t minlen; 1870{ 1871 size_t valsize; 1872 1873 /* 1874 * If the user gives us more than we wanted, we ignore it, 1875 * but if we don't get the minimum length the caller 1876 * wants, we return EINVAL. On success, sopt->sopt_valsize 1877 * is set to however much we actually retrieved. 
1878 */ 1879 if ((valsize = sopt->sopt_valsize) < minlen) 1880 return EINVAL; 1881 if (valsize > len) 1882 sopt->sopt_valsize = valsize = len; 1883 1884 if (sopt->sopt_td != NULL) 1885 return (copyin(sopt->sopt_val, buf, valsize)); 1886 1887 bcopy(sopt->sopt_val, buf, valsize); 1888 return (0); 1889} 1890 1891/* 1892 * Kernel version of setsockopt(2)/ 1893 * XXX: optlen is size_t, not socklen_t 1894 */ 1895int 1896so_setsockopt(struct socket *so, int level, int optname, void *optval, 1897 size_t optlen) 1898{ 1899 struct sockopt sopt; 1900 1901 sopt.sopt_level = level; 1902 sopt.sopt_name = optname; 1903 sopt.sopt_dir = SOPT_SET; 1904 sopt.sopt_val = optval; 1905 sopt.sopt_valsize = optlen; 1906 sopt.sopt_td = NULL; 1907 return (sosetopt(so, &sopt)); 1908} 1909 1910int 1911sosetopt(so, sopt) 1912 struct socket *so; 1913 struct sockopt *sopt; 1914{ 1915 int error, optval; 1916 struct linger l; 1917 struct timeval tv; 1918 u_long val; 1919#ifdef MAC 1920 struct mac extmac; 1921#endif 1922 1923 error = 0; 1924 if (sopt->sopt_level != SOL_SOCKET) { 1925 if (so->so_proto && so->so_proto->pr_ctloutput) 1926 return ((*so->so_proto->pr_ctloutput) 1927 (so, sopt)); 1928 error = ENOPROTOOPT; 1929 } else { 1930 switch (sopt->sopt_name) { 1931#ifdef INET 1932 case SO_ACCEPTFILTER: 1933 error = do_setopt_accept_filter(so, sopt); 1934 if (error) 1935 goto bad; 1936 break; 1937#endif 1938 case SO_LINGER: 1939 error = sooptcopyin(sopt, &l, sizeof l, sizeof l); 1940 if (error) 1941 goto bad; 1942 1943 SOCK_LOCK(so); 1944 so->so_linger = l.l_linger; 1945 if (l.l_onoff) 1946 so->so_options |= SO_LINGER; 1947 else 1948 so->so_options &= ~SO_LINGER; 1949 SOCK_UNLOCK(so); 1950 break; 1951 1952 case SO_DEBUG: 1953 case SO_KEEPALIVE: 1954 case SO_DONTROUTE: 1955 case SO_USELOOPBACK: 1956 case SO_BROADCAST: 1957 case SO_REUSEADDR: 1958 case SO_REUSEPORT: 1959 case SO_OOBINLINE: 1960 case SO_TIMESTAMP: 1961 case SO_BINTIME: 1962 case SO_NOSIGPIPE: 1963 error = sooptcopyin(sopt, &optval, 
sizeof optval, 1964 sizeof optval); 1965 if (error) 1966 goto bad; 1967 SOCK_LOCK(so); 1968 if (optval) 1969 so->so_options |= sopt->sopt_name; 1970 else 1971 so->so_options &= ~sopt->sopt_name; 1972 SOCK_UNLOCK(so); 1973 break; 1974 1975 case SO_SNDBUF: 1976 case SO_RCVBUF: 1977 case SO_SNDLOWAT: 1978 case SO_RCVLOWAT: 1979 error = sooptcopyin(sopt, &optval, sizeof optval, 1980 sizeof optval); 1981 if (error) 1982 goto bad; 1983 1984 /* 1985 * Values < 1 make no sense for any of these 1986 * options, so disallow them. 1987 */ 1988 if (optval < 1) { 1989 error = EINVAL; 1990 goto bad; 1991 } 1992 1993 switch (sopt->sopt_name) { 1994 case SO_SNDBUF: 1995 case SO_RCVBUF: 1996 if (sbreserve(sopt->sopt_name == SO_SNDBUF ? 1997 &so->so_snd : &so->so_rcv, (u_long)optval, 1998 so, curthread) == 0) { 1999 error = ENOBUFS; 2000 goto bad; 2001 } 2002 break; 2003 2004 /* 2005 * Make sure the low-water is never greater than 2006 * the high-water. 2007 */ 2008 case SO_SNDLOWAT: 2009 SOCKBUF_LOCK(&so->so_snd); 2010 so->so_snd.sb_lowat = 2011 (optval > so->so_snd.sb_hiwat) ? 2012 so->so_snd.sb_hiwat : optval; 2013 SOCKBUF_UNLOCK(&so->so_snd); 2014 break; 2015 case SO_RCVLOWAT: 2016 SOCKBUF_LOCK(&so->so_rcv); 2017 so->so_rcv.sb_lowat = 2018 (optval > so->so_rcv.sb_hiwat) ? 
2019 so->so_rcv.sb_hiwat : optval; 2020 SOCKBUF_UNLOCK(&so->so_rcv); 2021 break; 2022 } 2023 break; 2024 2025 case SO_SNDTIMEO: 2026 case SO_RCVTIMEO: 2027#ifdef COMPAT_IA32 2028 if (curthread->td_proc->p_sysent == &ia32_freebsd_sysvec) { 2029 struct timeval32 tv32; 2030 2031 error = sooptcopyin(sopt, &tv32, sizeof tv32, 2032 sizeof tv32); 2033 CP(tv32, tv, tv_sec); 2034 CP(tv32, tv, tv_usec); 2035 } else 2036#endif 2037 error = sooptcopyin(sopt, &tv, sizeof tv, 2038 sizeof tv); 2039 if (error) 2040 goto bad; 2041 2042 /* assert(hz > 0); */ 2043 if (tv.tv_sec < 0 || tv.tv_sec > INT_MAX / hz || 2044 tv.tv_usec < 0 || tv.tv_usec >= 1000000) { 2045 error = EDOM; 2046 goto bad; 2047 } 2048 /* assert(tick > 0); */ 2049 /* assert(ULONG_MAX - INT_MAX >= 1000000); */ 2050 val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick; 2051 if (val > INT_MAX) { 2052 error = EDOM; 2053 goto bad; 2054 } 2055 if (val == 0 && tv.tv_usec != 0) 2056 val = 1; 2057 2058 switch (sopt->sopt_name) { 2059 case SO_SNDTIMEO: 2060 so->so_snd.sb_timeo = val; 2061 break; 2062 case SO_RCVTIMEO: 2063 so->so_rcv.sb_timeo = val; 2064 break; 2065 } 2066 break; 2067 2068 case SO_LABEL: 2069#ifdef MAC 2070 error = sooptcopyin(sopt, &extmac, sizeof extmac, 2071 sizeof extmac); 2072 if (error) 2073 goto bad; 2074 error = mac_setsockopt_label(sopt->sopt_td->td_ucred, 2075 so, &extmac); 2076#else 2077 error = EOPNOTSUPP; 2078#endif 2079 break; 2080 2081 default: 2082 error = ENOPROTOOPT; 2083 break; 2084 } 2085 if (error == 0 && so->so_proto != NULL && 2086 so->so_proto->pr_ctloutput != NULL) { 2087 (void) ((*so->so_proto->pr_ctloutput) 2088 (so, sopt)); 2089 } 2090 } 2091bad: 2092 return (error); 2093} 2094 2095/* Helper routine for getsockopt */ 2096int 2097sooptcopyout(struct sockopt *sopt, const void *buf, size_t len) 2098{ 2099 int error; 2100 size_t valsize; 2101 2102 error = 0; 2103 2104 /* 2105 * Documented get behavior is that we always return a value, 2106 * possibly truncated to fit in the user's 
buffer. 2107 * Traditional behavior is that we always tell the user 2108 * precisely how much we copied, rather than something useful 2109 * like the total amount we had available for her. 2110 * Note that this interface is not idempotent; the entire answer must 2111 * generated ahead of time. 2112 */ 2113 valsize = min(len, sopt->sopt_valsize); 2114 sopt->sopt_valsize = valsize; 2115 if (sopt->sopt_val != NULL) { 2116 if (sopt->sopt_td != NULL) 2117 error = copyout(buf, sopt->sopt_val, valsize); 2118 else 2119 bcopy(buf, sopt->sopt_val, valsize); 2120 } 2121 return (error); 2122} 2123 2124int 2125sogetopt(so, sopt) 2126 struct socket *so; 2127 struct sockopt *sopt; 2128{ 2129 int error, optval; 2130 struct linger l; 2131 struct timeval tv; 2132#ifdef MAC 2133 struct mac extmac; 2134#endif 2135 2136 error = 0; 2137 if (sopt->sopt_level != SOL_SOCKET) { 2138 if (so->so_proto && so->so_proto->pr_ctloutput) { 2139 return ((*so->so_proto->pr_ctloutput) 2140 (so, sopt)); 2141 } else 2142 return (ENOPROTOOPT); 2143 } else { 2144 switch (sopt->sopt_name) { 2145#ifdef INET 2146 case SO_ACCEPTFILTER: 2147 error = do_getopt_accept_filter(so, sopt); 2148 break; 2149#endif 2150 case SO_LINGER: 2151 SOCK_LOCK(so); 2152 l.l_onoff = so->so_options & SO_LINGER; 2153 l.l_linger = so->so_linger; 2154 SOCK_UNLOCK(so); 2155 error = sooptcopyout(sopt, &l, sizeof l); 2156 break; 2157 2158 case SO_USELOOPBACK: 2159 case SO_DONTROUTE: 2160 case SO_DEBUG: 2161 case SO_KEEPALIVE: 2162 case SO_REUSEADDR: 2163 case SO_REUSEPORT: 2164 case SO_BROADCAST: 2165 case SO_OOBINLINE: 2166 case SO_ACCEPTCONN: 2167 case SO_TIMESTAMP: 2168 case SO_BINTIME: 2169 case SO_NOSIGPIPE: 2170 optval = so->so_options & sopt->sopt_name; 2171integer: 2172 error = sooptcopyout(sopt, &optval, sizeof optval); 2173 break; 2174 2175 case SO_TYPE: 2176 optval = so->so_type; 2177 goto integer; 2178 2179 case SO_ERROR: 2180 SOCK_LOCK(so); 2181 optval = so->so_error; 2182 so->so_error = 0; 2183 SOCK_UNLOCK(so); 2184 goto 
integer; 2185 2186 case SO_SNDBUF: 2187 optval = so->so_snd.sb_hiwat; 2188 goto integer; 2189 2190 case SO_RCVBUF: 2191 optval = so->so_rcv.sb_hiwat; 2192 goto integer; 2193 2194 case SO_SNDLOWAT: 2195 optval = so->so_snd.sb_lowat; 2196 goto integer; 2197 2198 case SO_RCVLOWAT: 2199 optval = so->so_rcv.sb_lowat; 2200 goto integer; 2201 2202 case SO_SNDTIMEO: 2203 case SO_RCVTIMEO: 2204 optval = (sopt->sopt_name == SO_SNDTIMEO ? 2205 so->so_snd.sb_timeo : so->so_rcv.sb_timeo); 2206 2207 tv.tv_sec = optval / hz; 2208 tv.tv_usec = (optval % hz) * tick; 2209#ifdef COMPAT_IA32 2210 if (curthread->td_proc->p_sysent == &ia32_freebsd_sysvec) { 2211 struct timeval32 tv32; 2212 2213 CP(tv, tv32, tv_sec); 2214 CP(tv, tv32, tv_usec); 2215 error = sooptcopyout(sopt, &tv32, sizeof tv32); 2216 } else 2217#endif 2218 error = sooptcopyout(sopt, &tv, sizeof tv); 2219 break; 2220 2221 case SO_LABEL: 2222#ifdef MAC 2223 error = sooptcopyin(sopt, &extmac, sizeof(extmac), 2224 sizeof(extmac)); 2225 if (error) 2226 return (error); 2227 error = mac_getsockopt_label(sopt->sopt_td->td_ucred, 2228 so, &extmac); 2229 if (error) 2230 return (error); 2231 error = sooptcopyout(sopt, &extmac, sizeof extmac); 2232#else 2233 error = EOPNOTSUPP; 2234#endif 2235 break; 2236 2237 case SO_PEERLABEL: 2238#ifdef MAC 2239 error = sooptcopyin(sopt, &extmac, sizeof(extmac), 2240 sizeof(extmac)); 2241 if (error) 2242 return (error); 2243 error = mac_getsockopt_peerlabel( 2244 sopt->sopt_td->td_ucred, so, &extmac); 2245 if (error) 2246 return (error); 2247 error = sooptcopyout(sopt, &extmac, sizeof extmac); 2248#else 2249 error = EOPNOTSUPP; 2250#endif 2251 break; 2252 2253 case SO_LISTENQLIMIT: 2254 optval = so->so_qlimit; 2255 goto integer; 2256 2257 case SO_LISTENQLEN: 2258 optval = so->so_qlen; 2259 goto integer; 2260 2261 case SO_LISTENINCQLEN: 2262 optval = so->so_incqlen; 2263 goto integer; 2264 2265 default: 2266 error = ENOPROTOOPT; 2267 break; 2268 } 2269 return (error); 2270 } 2271} 2272 2273/* 
XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */
/*
 * Allocate an mbuf chain large enough to hold sopt->sopt_valsize bytes of
 * option data, returning the chain head through *mp.  Allocation may sleep
 * (M_TRYWAIT) only when a thread context (sopt_td) is available; otherwise
 * M_DONTWAIT is used.  Returns 0 on success or ENOBUFS on failure; any
 * partially built chain is freed before returning an error.
 */
int
soopt_getm(struct sockopt *sopt, struct mbuf **mp)
{
	struct mbuf *m, *m_prev;
	int sopt_size = sopt->sopt_valsize;

	MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
	if (m == NULL)
		return ENOBUFS;
	if (sopt_size > MLEN) {
		/* Data exceeds a plain mbuf; attach a cluster. */
		MCLGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return ENOBUFS;
		}
		m->m_len = min(MCLBYTES, sopt_size);
	} else {
		m->m_len = min(MLEN, sopt_size);
	}
	sopt_size -= m->m_len;
	*mp = m;
	m_prev = m;

	/* Append additional mbufs until the requested size is covered. */
	while (sopt_size) {
		MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
		if (m == NULL) {
			/* Discard the chain built so far. */
			m_freem(*mp);
			return ENOBUFS;
		}
		if (sopt_size > MLEN) {
			MCLGET(m, sopt->sopt_td != NULL ? M_TRYWAIT :
			    M_DONTWAIT);
			if ((m->m_flags & M_EXT) == 0) {
				/* Free both the new mbuf and the chain so far. */
				m_freem(m);
				m_freem(*mp);
				return ENOBUFS;
			}
			m->m_len = min(MCLBYTES, sopt_size);
		} else {
			m->m_len = min(MLEN, sopt_size);
		}
		sopt_size -= m->m_len;
		m_prev->m_next = m;
		m_prev = m;
	}
	return (0);
}

/* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines.
 */
/*
 * Copy option data described by sopt into the mbuf chain m, advancing
 * sopt_val and decrementing sopt_valsize as each mbuf is filled.  The
 * chain is expected to have been sized by soopt_getm(); running out of
 * buffer space before the chain is exhausted is a caller bug and panics.
 * On copyin failure the whole chain is freed and the error returned.
 */
int
soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
{
	struct mbuf *m0 = m;

	if (sopt->sopt_val == NULL)
		return (0);
	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
		if (sopt->sopt_td != NULL) {
			int error;

			error = copyin(sopt->sopt_val, mtod(m, char *),
			    m->m_len);
			if (error != 0) {
				m_freem(m0);
				return(error);
			}
		} else
			bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
		sopt->sopt_valsize -= m->m_len;
		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
		m = m->m_next;
	}
	/*
	 * soopt_getm() should have sized the chain to fit the data exactly.
	 * NOTE(review): the panic message names ip6_sooptmcopyin(), but this
	 * function is soopt_mcopyin() -- message looks stale; confirm before
	 * changing the string.
	 */
	if (m != NULL)	/* should be allocated enoughly at ip6_sooptmcopyin() */
		panic("ip6_sooptmcopyin");
	return (0);
}

/* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
/*
 * Copy data out of mbuf chain m into the buffer described by sopt,
 * advancing sopt_val as it goes and leaving the total number of bytes
 * copied in sopt_valsize.  If the caller's buffer is too small to hold
 * the whole chain, the chain is freed and EINVAL returned.
 */
int
soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
{
	struct mbuf *m0 = m;
	size_t valsize = 0;

	if (sopt->sopt_val == NULL)
		return (0);
	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
		if (sopt->sopt_td != NULL) {
			int error;

			error = copyout(mtod(m, char *), sopt->sopt_val,
			    m->m_len);
			if (error != 0) {
				m_freem(m0);
				return(error);
			}
		} else
			bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
		sopt->sopt_valsize -= m->m_len;
		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
		valsize += m->m_len;
		m = m->m_next;
	}
	if (m != NULL) {
		/* enough soopt buffer should be given from user-land */
		m_freem(m0);
		return(EINVAL);
	}
	/* Report the actual number of bytes copied out. */
	sopt->sopt_valsize = valsize;
	return (0);
}

/*
 * Out-of-band data has arrived: post SIGURG to the socket's owner (if any)
 * and wake select/poll waiters on the receive buffer.
 */
void
sohasoutofband(so)
	struct socket *so;
{
	if (so->so_sigio != NULL)
		pgsigio(&so->so_sigio, SIGURG, 0);
	selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
}

/*
 * poll(2)/select(2) backend for sockets: report which of the requested
 * events are currently true, or record the thread for wakeup if none are.
 */
int
sopoll(struct socket *so, int events, struct ucred *active_cred,
	struct thread *td)
{
	int revents = 0;

	/* Lock order: send buffer before receive buffer. */
	SOCKBUF_LOCK(&so->so_snd);
	SOCKBUF_LOCK(&so->so_rcv);
	if (events & (POLLIN | POLLRDNORM))
		if (soreadable(so))
			revents |= events & (POLLIN | POLLRDNORM);

	/* POLLINIGNEOF: readable data/connections without treating EOF as input. */
	if (events & POLLINIGNEOF)
		if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat ||
		    !TAILQ_EMPTY(&so->so_comp) || so->so_error)
			revents |= POLLINIGNEOF;

	if (events & (POLLOUT | POLLWRNORM))
		if (sowriteable(so))
			revents |= events & (POLLOUT | POLLWRNORM);

	if (events & (POLLPRI | POLLRDBAND))
		if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK))
			revents |= events & (POLLPRI | POLLRDBAND);

	if (revents == 0) {
		/*
		 * Nothing ready yet: register this thread with the
		 * relevant selinfo so it is woken on a state change.
		 */
		if (events &
		    (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM |
		     POLLRDBAND)) {
			selrecord(td, &so->so_rcv.sb_sel);
			so->so_rcv.sb_flags |= SB_SEL;
		}

		if (events & (POLLOUT | POLLWRNORM)) {
			selrecord(td, &so->so_snd.sb_sel);
			so->so_snd.sb_flags |= SB_SEL;
		}
	}

	SOCKBUF_UNLOCK(&so->so_rcv);
	SOCKBUF_UNLOCK(&so->so_snd);
	return (revents);
}

/*
 * kqueue(2) attach for sockets: select the filter ops and the socket
 * buffer matching the requested filter, then hook the knote into that
 * buffer's knote list.
 */
int
soo_kqfilter(struct file *fp, struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	struct sockbuf *sb;

	switch (kn->kn_filter) {
	case EVFILT_READ:
		/* Listening sockets report completed connections instead of data. */
		if (so->so_options & SO_ACCEPTCONN)
			kn->kn_fop = &solisten_filtops;
		else
			kn->kn_fop = &soread_filtops;
		sb = &so->so_rcv;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &sowrite_filtops;
		sb = &so->so_snd;
		break;
	default:
		return (EINVAL);
	}

	SOCKBUF_LOCK(sb);
	knlist_add(&sb->sb_sel.si_note, kn, 1);
	sb->sb_flags |= SB_KNOTE;
	SOCKBUF_UNLOCK(sb);
	return (0);
}

/*
 * kqueue detach for the read filter: unhook the knote from the receive
 * buffer's list and clear SB_KNOTE once no knotes remain.
 */
static void
filt_sordetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;

	SOCKBUF_LOCK(&so->so_rcv);
	knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1);
	if (knlist_empty(&so->so_rcv.sb_sel.si_note))
		so->so_rcv.sb_flags &=
		    ~SB_KNOTE;
	SOCKBUF_UNLOCK(&so->so_rcv);
}

/*ARGSUSED*/
/*
 * kqueue read filter: runs with the receive sockbuf lock held.  Reports
 * the number of readable data bytes (excluding control data) in kn_data
 * and returns non-zero when the socket should fire the event.
 */
static int
filt_soread(struct knote *kn, long hint)
{
	struct socket *so;

	so = kn->kn_fp->f_data;
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);

	/* Readable byte count excludes control (ancillary) data. */
	kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
	if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
		/* Peer closed: report EOF plus any pending error. */
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		return (1);
	} else if (so->so_error)	/* temporary udp error */
		return (1);
	else if (kn->kn_sfflags & NOTE_LOWAT)
		/* Caller-specified low watermark overrides the socket's. */
		return (kn->kn_data >= kn->kn_sdata);
	else
		return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat);
}

/*
 * kqueue detach for the write filter: unhook the knote from the send
 * buffer's list and clear SB_KNOTE once no knotes remain.
 */
static void
filt_sowdetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;

	SOCKBUF_LOCK(&so->so_snd);
	knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1);
	if (knlist_empty(&so->so_snd.sb_sel.si_note))
		so->so_snd.sb_flags &= ~SB_KNOTE;
	SOCKBUF_UNLOCK(&so->so_snd);
}

/*ARGSUSED*/
/*
 * kqueue write filter: runs with the send sockbuf lock held.  Reports the
 * free space in the send buffer in kn_data and fires when the socket is
 * writable (or has hit EOF/error).
 */
static int
filt_sowrite(struct knote *kn, long hint)
{
	struct socket *so;

	so = kn->kn_fp->f_data;
	SOCKBUF_LOCK_ASSERT(&so->so_snd);
	kn->kn_data = sbspace(&so->so_snd);
	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		return (1);
	} else if (so->so_error)	/* temporary udp error */
		return (1);
	else if (((so->so_state & SS_ISCONNECTED) == 0) &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
		/* Connection-oriented socket not yet connected: not writable. */
		return (0);
	else if (kn->kn_sfflags & NOTE_LOWAT)
		return (kn->kn_data >= kn->kn_sdata);
	else
		return (kn->kn_data >= so->so_snd.sb_lowat);
}

/*ARGSUSED*/
/*
 * kqueue filter for listening sockets: kn_data is the number of queued
 * connections; fires when the completed-connection queue is non-empty.
 */
static int
filt_solisten(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;

	kn->kn_data = so->so_qlen;
	return (!
	    TAILQ_EMPTY(&so->so_comp));
}

/*
 * Check that the socket exists and its credential uid matches the given
 * uid; returns 0 on a match and EPERM otherwise (including a NULL socket).
 */
int
socheckuid(struct socket *so, uid_t uid)
{

	if (so == NULL)
		return (EPERM);
	if (so->so_cred->cr_uid != uid)
		return (EPERM);
	return (0);
}

/*
 * Sysctl handler for somaxconn: validate the new value against the range
 * [1, USHRT_MAX] in a local copy before publishing it, so an invalid write
 * never leaves somaxconn in a bad state.
 */
static int
somaxconn_sysctl(SYSCTL_HANDLER_ARGS)
{
	int error;
	int val;

	val = somaxconn;
	error = sysctl_handle_int(oidp, &val, sizeof(int), req);
	/* Read-only access (or error): nothing more to do. */
	if (error || !req->newptr )
		return (error);

	if (val < 1 || val > USHRT_MAX)
		return (EINVAL);

	somaxconn = val;
	return (0);
}