uipc_socket.c revision 166171
/*-
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.
 * Copyright (c) 2004 The FreeBSD Foundation
 * Copyright (c) 2004-2006 Robert N. M. Watson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 */

/*
 * Comments on the socket life cycle:
 *
 * soalloc() sets up socket layer state for a socket, called only by
 * socreate() and sonewconn(). Socket layer private.
 *
 * sodealloc() tears down socket layer state for a socket, called only by
 * sofree(), socreate() and sonewconn(). Socket layer private.
 *
 * pru_attach() associates protocol layer state with an allocated socket;
 * called only once, may fail, aborting socket allocation. This is called
 * from socreate() and sonewconn(). Socket layer private.
 *
 * pru_detach() disassociates protocol layer state from an attached socket,
 * and will be called exactly once for sockets in which pru_attach() has
 * been successfully called. If pru_attach() returned an error,
 * pru_detach() will not be called. Socket layer private.
 *
 * pru_abort() and pru_close() notify the protocol layer that the last
 * consumer of a socket is starting to tear down the socket, and that the
 * protocol should terminate the connection. Historically, pru_abort() also
 * detached protocol state from the socket state, but this is no longer the
 * case.
 *
 * socreate() creates a socket and attaches protocol state. This is a public
 * interface that may be used by socket layer consumers to create new
 * sockets.
 *
 * sonewconn() creates a socket and attaches protocol state. This is a
 * public interface that may be used by protocols to create new sockets when
 * a new connection is received and will be available for accept() on a
 * listen socket.
 *
 * soclose() destroys a socket after possibly waiting for it to disconnect.
 * This is a public interface that socket consumers should use to close and
 * release a socket when done with it.
 *
 * soabort() destroys a socket without waiting for it to disconnect (used
 * only for incoming connections that are already partially or fully
 * connected). This is used internally by the socket layer when clearing
 * listen socket queues (due to overflow or close on the listen socket), but
 * is also a public interface protocols may use to abort connections in
 * their incomplete listen queues should they no longer be required. Sockets
 * placed in completed connection listen queues should not be aborted for
 * reasons described in the comment above the soclose() implementation. This
 * is not a general purpose close routine, and except in the specific
 * circumstances described here, should not be used.
 *
 * sofree() will free a socket and its protocol state if all references on
 * the socket have been released, and is the public interface to attempt to
 * free a socket when a reference is removed. This is a socket layer private
 * interface.
 *
 * NOTE: In addition to socreate() and soclose(), which provide a single
 * socket reference to the consumer to be managed as required, there are two
 * calls to explicitly manage socket references, soref(), and sorele().
 * Currently, these are generally required only when transitioning a socket
 * from a listen queue to a file descriptor, in order to prevent garbage
 * collection of the socket at an untimely moment. For a number of reasons,
 * these interfaces are not preferred, and should be avoided.
95 */ 96 97#include <sys/cdefs.h> 98__FBSDID("$FreeBSD: head/sys/kern/uipc_socket.c 166171 2007-01-22 14:50:28Z andre $"); 99 100#include "opt_inet.h" 101#include "opt_mac.h" 102#include "opt_zero.h" 103#include "opt_compat.h" 104 105#include <sys/param.h> 106#include <sys/systm.h> 107#include <sys/fcntl.h> 108#include <sys/limits.h> 109#include <sys/lock.h> 110#include <sys/mac.h> 111#include <sys/malloc.h> 112#include <sys/mbuf.h> 113#include <sys/mutex.h> 114#include <sys/domain.h> 115#include <sys/file.h> /* for struct knote */ 116#include <sys/kernel.h> 117#include <sys/event.h> 118#include <sys/eventhandler.h> 119#include <sys/poll.h> 120#include <sys/proc.h> 121#include <sys/protosw.h> 122#include <sys/socket.h> 123#include <sys/socketvar.h> 124#include <sys/resourcevar.h> 125#include <sys/signalvar.h> 126#include <sys/sysctl.h> 127#include <sys/uio.h> 128#include <sys/jail.h> 129 130#include <security/mac/mac_framework.h> 131 132#include <vm/uma.h> 133 134#ifdef COMPAT_IA32 135#include <sys/mount.h> 136#include <compat/freebsd32/freebsd32.h> 137 138extern struct sysentvec ia32_freebsd_sysvec; 139#endif 140 141static int soreceive_rcvoob(struct socket *so, struct uio *uio, 142 int flags); 143 144static void filt_sordetach(struct knote *kn); 145static int filt_soread(struct knote *kn, long hint); 146static void filt_sowdetach(struct knote *kn); 147static int filt_sowrite(struct knote *kn, long hint); 148static int filt_solisten(struct knote *kn, long hint); 149 150static struct filterops solisten_filtops = 151 { 1, NULL, filt_sordetach, filt_solisten }; 152static struct filterops soread_filtops = 153 { 1, NULL, filt_sordetach, filt_soread }; 154static struct filterops sowrite_filtops = 155 { 1, NULL, filt_sowdetach, filt_sowrite }; 156 157uma_zone_t socket_zone; 158so_gen_t so_gencnt; /* generation count for sockets */ 159 160int maxsockets; 161 162MALLOC_DEFINE(M_SONAME, "soname", "socket name"); 163MALLOC_DEFINE(M_PCB, "pcb", "protocol control block"); 164 
165static int somaxconn = SOMAXCONN; 166static int somaxconn_sysctl(SYSCTL_HANDLER_ARGS); 167/* XXX: we dont have SYSCTL_USHORT */ 168SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLTYPE_UINT | CTLFLAG_RW, 169 0, sizeof(int), somaxconn_sysctl, "I", "Maximum pending socket connection " 170 "queue size"); 171static int numopensockets; 172SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD, 173 &numopensockets, 0, "Number of open sockets"); 174#ifdef ZERO_COPY_SOCKETS 175/* These aren't static because they're used in other files. */ 176int so_zero_copy_send = 1; 177int so_zero_copy_receive = 1; 178SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0, 179 "Zero copy controls"); 180SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW, 181 &so_zero_copy_receive, 0, "Enable zero copy receive"); 182SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW, 183 &so_zero_copy_send, 0, "Enable zero copy send"); 184#endif /* ZERO_COPY_SOCKETS */ 185 186/* 187 * accept_mtx locks down per-socket fields relating to accept queues. See 188 * socketvar.h for an annotation of the protected fields of struct socket. 189 */ 190struct mtx accept_mtx; 191MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF); 192 193/* 194 * so_global_mtx protects so_gencnt, numopensockets, and the per-socket 195 * so_gencnt field. 196 */ 197static struct mtx so_global_mtx; 198MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF); 199 200/* 201 * General IPC sysctl name space, used by sockets and a variety of other IPC 202 * types. 203 */ 204SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC"); 205 206/* 207 * Sysctl to get and set the maximum global sockets limit. Notify protocols 208 * of the change so that they can update their dependent limits as required. 
209 */ 210static int 211sysctl_maxsockets(SYSCTL_HANDLER_ARGS) 212{ 213 int error, newmaxsockets; 214 215 newmaxsockets = maxsockets; 216 error = sysctl_handle_int(oidp, &newmaxsockets, sizeof(int), req); 217 if (error == 0 && req->newptr) { 218 if (newmaxsockets > maxsockets) { 219 maxsockets = newmaxsockets; 220 if (maxsockets > ((maxfiles / 4) * 3)) { 221 maxfiles = (maxsockets * 5) / 4; 222 maxfilesperproc = (maxfiles * 9) / 10; 223 } 224 EVENTHANDLER_INVOKE(maxsockets_change); 225 } else 226 error = EINVAL; 227 } 228 return (error); 229} 230 231SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT|CTLFLAG_RW, 232 &maxsockets, 0, sysctl_maxsockets, "IU", 233 "Maximum number of sockets avaliable"); 234 235/* 236 * Initialise maxsockets. 237 */ 238static void init_maxsockets(void *ignored) 239{ 240 TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets); 241 maxsockets = imax(maxsockets, imax(maxfiles, nmbclusters)); 242} 243SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL); 244 245/* 246 * Socket operation routines. These routines are called by the routines in 247 * sys_socket.c or from a system process, and implement the semantics of 248 * socket operations by switching out to the protocol specific routines. 249 */ 250 251/* 252 * Get a socket structure from our zone, and initialize it. Note that it 253 * would probably be better to allocate socket and PCB at the same time, but 254 * I'm not convinced that all the protocols can be easily modified to do 255 * this. 256 * 257 * soalloc() returns a socket with a ref count of 0. 
258 */ 259static struct socket * 260soalloc(int mflags) 261{ 262 struct socket *so; 263 264 so = uma_zalloc(socket_zone, mflags | M_ZERO); 265 if (so == NULL) 266 return (NULL); 267#ifdef MAC 268 if (mac_init_socket(so, mflags) != 0) { 269 uma_zfree(socket_zone, so); 270 return (NULL); 271 } 272#endif 273 SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd"); 274 SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv"); 275 TAILQ_INIT(&so->so_aiojobq); 276 mtx_lock(&so_global_mtx); 277 so->so_gencnt = ++so_gencnt; 278 ++numopensockets; 279 mtx_unlock(&so_global_mtx); 280 return (so); 281} 282 283/* 284 * Free the storage associated with a socket at the socket layer, tear down 285 * locks, labels, etc. All protocol state is assumed already to have been 286 * torn down (and possibly never set up) by the caller. 287 */ 288static void 289sodealloc(struct socket *so) 290{ 291 292 KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count)); 293 KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL")); 294 295 mtx_lock(&so_global_mtx); 296 so->so_gencnt = ++so_gencnt; 297 --numopensockets; /* Could be below, but faster here. */ 298 mtx_unlock(&so_global_mtx); 299 if (so->so_rcv.sb_hiwat) 300 (void)chgsbsize(so->so_cred->cr_uidinfo, 301 &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY); 302 if (so->so_snd.sb_hiwat) 303 (void)chgsbsize(so->so_cred->cr_uidinfo, 304 &so->so_snd.sb_hiwat, 0, RLIM_INFINITY); 305#ifdef INET 306 /* remove acccept filter if one is present. */ 307 if (so->so_accf != NULL) 308 do_setopt_accept_filter(so, NULL); 309#endif 310#ifdef MAC 311 mac_destroy_socket(so); 312#endif 313 crfree(so->so_cred); 314 SOCKBUF_LOCK_DESTROY(&so->so_snd); 315 SOCKBUF_LOCK_DESTROY(&so->so_rcv); 316 uma_zfree(socket_zone, so); 317} 318 319/* 320 * socreate returns a socket with a ref count of 1. The socket should be 321 * closed with soclose(). 
322 */ 323int 324socreate(dom, aso, type, proto, cred, td) 325 int dom; 326 struct socket **aso; 327 int type; 328 int proto; 329 struct ucred *cred; 330 struct thread *td; 331{ 332 struct protosw *prp; 333 struct socket *so; 334 int error; 335 336 if (proto) 337 prp = pffindproto(dom, proto, type); 338 else 339 prp = pffindtype(dom, type); 340 341 if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL || 342 prp->pr_usrreqs->pru_attach == pru_attach_notsupp) 343 return (EPROTONOSUPPORT); 344 345 if (jailed(cred) && jail_socket_unixiproute_only && 346 prp->pr_domain->dom_family != PF_LOCAL && 347 prp->pr_domain->dom_family != PF_INET && 348 prp->pr_domain->dom_family != PF_ROUTE) { 349 return (EPROTONOSUPPORT); 350 } 351 352 if (prp->pr_type != type) 353 return (EPROTOTYPE); 354 so = soalloc(M_WAITOK); 355 if (so == NULL) 356 return (ENOBUFS); 357 358 TAILQ_INIT(&so->so_incomp); 359 TAILQ_INIT(&so->so_comp); 360 so->so_type = type; 361 so->so_cred = crhold(cred); 362 so->so_proto = prp; 363#ifdef MAC 364 mac_create_socket(cred, so); 365#endif 366 knlist_init(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv), 367 NULL, NULL, NULL); 368 knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd), 369 NULL, NULL, NULL); 370 so->so_count = 1; 371 error = (*prp->pr_usrreqs->pru_attach)(so, proto, td); 372 if (error) { 373 KASSERT(so->so_count == 1, ("socreate: so_count %d", 374 so->so_count)); 375 so->so_count = 0; 376 sodealloc(so); 377 return (error); 378 } 379 *aso = so; 380 return (0); 381} 382 383#ifdef REGRESSION 384static int regression_sonewconn_earlytest = 1; 385SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW, 386 ®ression_sonewconn_earlytest, 0, "Perform early sonewconn limit test"); 387#endif 388 389/* 390 * When an attempt at a new connection is noted on a socket which accepts 391 * connections, sonewconn is called. If the connection is possible (subject 392 * to space constraints, etc.) 
then we allocate a new structure, propoerly 393 * linked into the data structure of the original socket, and return this. 394 * Connstatus may be 0, or SO_ISCONFIRMING, or SO_ISCONNECTED. 395 * 396 * Note: the ref count on the socket is 0 on return. 397 */ 398struct socket * 399sonewconn(head, connstatus) 400 register struct socket *head; 401 int connstatus; 402{ 403 register struct socket *so; 404 int over; 405 406 ACCEPT_LOCK(); 407 over = (head->so_qlen > 3 * head->so_qlimit / 2); 408 ACCEPT_UNLOCK(); 409#ifdef REGRESSION 410 if (regression_sonewconn_earlytest && over) 411#else 412 if (over) 413#endif 414 return (NULL); 415 so = soalloc(M_NOWAIT); 416 if (so == NULL) 417 return (NULL); 418 if ((head->so_options & SO_ACCEPTFILTER) != 0) 419 connstatus = 0; 420 so->so_head = head; 421 so->so_type = head->so_type; 422 so->so_options = head->so_options &~ SO_ACCEPTCONN; 423 so->so_linger = head->so_linger; 424 so->so_state = head->so_state | SS_NOFDREF; 425 so->so_proto = head->so_proto; 426 so->so_cred = crhold(head->so_cred); 427#ifdef MAC 428 SOCK_LOCK(head); 429 mac_create_socket_from_socket(head, so); 430 SOCK_UNLOCK(head); 431#endif 432 knlist_init(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv), 433 NULL, NULL, NULL); 434 knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd), 435 NULL, NULL, NULL); 436 if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat) || 437 (*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) { 438 sodealloc(so); 439 return (NULL); 440 } 441 so->so_rcv.sb_lowat = head->so_rcv.sb_lowat; 442 so->so_snd.sb_lowat = head->so_snd.sb_lowat; 443 so->so_rcv.sb_timeo = head->so_rcv.sb_timeo; 444 so->so_snd.sb_timeo = head->so_snd.sb_timeo; 445 so->so_state |= connstatus; 446 ACCEPT_LOCK(); 447 if (connstatus) { 448 TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); 449 so->so_qstate |= SQ_COMP; 450 head->so_qlen++; 451 } else { 452 /* 453 * Keep removing sockets from the head until there's room for 454 * us to insert on the 
tail. In pre-locking revisions, this 455 * was a simple if(), but as we could be racing with other 456 * threads and soabort() requires dropping locks, we must 457 * loop waiting for the condition to be true. 458 */ 459 while (head->so_incqlen > head->so_qlimit) { 460 struct socket *sp; 461 sp = TAILQ_FIRST(&head->so_incomp); 462 TAILQ_REMOVE(&head->so_incomp, sp, so_list); 463 head->so_incqlen--; 464 sp->so_qstate &= ~SQ_INCOMP; 465 sp->so_head = NULL; 466 ACCEPT_UNLOCK(); 467 soabort(sp); 468 ACCEPT_LOCK(); 469 } 470 TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list); 471 so->so_qstate |= SQ_INCOMP; 472 head->so_incqlen++; 473 } 474 ACCEPT_UNLOCK(); 475 if (connstatus) { 476 sorwakeup(head); 477 wakeup_one(&head->so_timeo); 478 } 479 return (so); 480} 481 482int 483sobind(so, nam, td) 484 struct socket *so; 485 struct sockaddr *nam; 486 struct thread *td; 487{ 488 489 return ((*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td)); 490} 491 492/* 493 * solisten() transitions a socket from a non-listening state to a listening 494 * state, but can also be used to update the listen queue depth on an 495 * existing listen socket. The protocol will call back into the sockets 496 * layer using solisten_proto_check() and solisten_proto() to check and set 497 * socket-layer listen state. Call backs are used so that the protocol can 498 * acquire both protocol and socket layer locks in whatever order is required 499 * by the protocol. 500 * 501 * Protocol implementors are advised to hold the socket lock across the 502 * socket-layer test and set to avoid races at the socket layer. 
503 */ 504int 505solisten(so, backlog, td) 506 struct socket *so; 507 int backlog; 508 struct thread *td; 509{ 510 511 return ((*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td)); 512} 513 514int 515solisten_proto_check(so) 516 struct socket *so; 517{ 518 519 SOCK_LOCK_ASSERT(so); 520 521 if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | 522 SS_ISDISCONNECTING)) 523 return (EINVAL); 524 return (0); 525} 526 527void 528solisten_proto(so, backlog) 529 struct socket *so; 530 int backlog; 531{ 532 533 SOCK_LOCK_ASSERT(so); 534 535 if (backlog < 0 || backlog > somaxconn) 536 backlog = somaxconn; 537 so->so_qlimit = backlog; 538 so->so_options |= SO_ACCEPTCONN; 539} 540 541/* 542 * Attempt to free a socket. This should really be sotryfree(). 543 * 544 * sofree() will succeed if: 545 * 546 * - There are no outstanding file descriptor references or related consumers 547 * (so_count == 0). 548 * 549 * - The socket has been closed by user space, if ever open (SS_NOFDREF). 550 * 551 * - The protocol does not have an outstanding strong reference on the socket 552 * (SS_PROTOREF). 553 * 554 * - The socket is not in a completed connection queue, so a process has been 555 * notified that it is present. If it is removed, the user process may 556 * block in accept() despite select() saying the socket was ready. 557 * 558 * Otherwise, it will quietly abort so that a future call to sofree(), when 559 * conditions are right, can succeed. 
560 */ 561void 562sofree(so) 563 struct socket *so; 564{ 565 struct protosw *pr = so->so_proto; 566 struct socket *head; 567 568 ACCEPT_LOCK_ASSERT(); 569 SOCK_LOCK_ASSERT(so); 570 571 if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 || 572 (so->so_state & SS_PROTOREF) || (so->so_qstate & SQ_COMP)) { 573 SOCK_UNLOCK(so); 574 ACCEPT_UNLOCK(); 575 return; 576 } 577 578 head = so->so_head; 579 if (head != NULL) { 580 KASSERT((so->so_qstate & SQ_COMP) != 0 || 581 (so->so_qstate & SQ_INCOMP) != 0, 582 ("sofree: so_head != NULL, but neither SQ_COMP nor " 583 "SQ_INCOMP")); 584 KASSERT((so->so_qstate & SQ_COMP) == 0 || 585 (so->so_qstate & SQ_INCOMP) == 0, 586 ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP")); 587 TAILQ_REMOVE(&head->so_incomp, so, so_list); 588 head->so_incqlen--; 589 so->so_qstate &= ~SQ_INCOMP; 590 so->so_head = NULL; 591 } 592 KASSERT((so->so_qstate & SQ_COMP) == 0 && 593 (so->so_qstate & SQ_INCOMP) == 0, 594 ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)", 595 so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP)); 596 if (so->so_options & SO_ACCEPTCONN) { 597 KASSERT((TAILQ_EMPTY(&so->so_comp)), ("sofree: so_comp populated")); 598 KASSERT((TAILQ_EMPTY(&so->so_incomp)), ("sofree: so_comp populated")); 599 } 600 SOCK_UNLOCK(so); 601 ACCEPT_UNLOCK(); 602 603 /* 604 * From this point on, we assume that no other references to this 605 * socket exist anywhere else in the stack. Therefore, no locks need 606 * to be acquired or held. 607 * 608 * We used to do a lot of socket buffer and socket locking here, as 609 * well as invoke sorflush() and perform wakeups. The direct call to 610 * dom_dispose() and sbrelease_internal() are an inlining of what was 611 * necessary from sorflush(). 612 * 613 * Notice that the socket buffer and kqueue state are torn down 614 * before calling pru_detach. This means that protocols shold not 615 * assume they can perform socket wakeups, etc, in their detach 616 * code. 
617 */ 618 KASSERT((so->so_snd.sb_flags & SB_LOCK) == 0, ("sofree: snd sblock")); 619 KASSERT((so->so_rcv.sb_flags & SB_LOCK) == 0, ("sofree: rcv sblock")); 620 sbdestroy(&so->so_snd, so); 621 if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL) 622 (*pr->pr_domain->dom_dispose)(so->so_rcv.sb_mb); 623 sbdestroy(&so->so_rcv, so); 624 if (pr->pr_usrreqs->pru_detach != NULL) 625 (*pr->pr_usrreqs->pru_detach)(so); 626 knlist_destroy(&so->so_rcv.sb_sel.si_note); 627 knlist_destroy(&so->so_snd.sb_sel.si_note); 628 sodealloc(so); 629} 630 631/* 632 * Close a socket on last file table reference removal. Initiate disconnect 633 * if connected. Free socket when disconnect complete. 634 * 635 * This function will sorele() the socket. Note that soclose() may be called 636 * prior to the ref count reaching zero. The actual socket structure will 637 * not be freed until the ref count reaches zero. 638 */ 639int 640soclose(so) 641 struct socket *so; 642{ 643 int error = 0; 644 645 KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter")); 646 647 funsetown(&so->so_sigio); 648 if (so->so_state & SS_ISCONNECTED) { 649 if ((so->so_state & SS_ISDISCONNECTING) == 0) { 650 error = sodisconnect(so); 651 if (error) 652 goto drop; 653 } 654 if (so->so_options & SO_LINGER) { 655 if ((so->so_state & SS_ISDISCONNECTING) && 656 (so->so_state & SS_NBIO)) 657 goto drop; 658 while (so->so_state & SS_ISCONNECTED) { 659 error = tsleep(&so->so_timeo, 660 PSOCK | PCATCH, "soclos", so->so_linger * hz); 661 if (error) 662 break; 663 } 664 } 665 } 666 667drop: 668 if (so->so_proto->pr_usrreqs->pru_close != NULL) 669 (*so->so_proto->pr_usrreqs->pru_close)(so); 670 if (so->so_options & SO_ACCEPTCONN) { 671 struct socket *sp; 672 ACCEPT_LOCK(); 673 while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) { 674 TAILQ_REMOVE(&so->so_incomp, sp, so_list); 675 so->so_incqlen--; 676 sp->so_qstate &= ~SQ_INCOMP; 677 sp->so_head = NULL; 678 ACCEPT_UNLOCK(); 679 soabort(sp); 680 
ACCEPT_LOCK(); 681 } 682 while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) { 683 TAILQ_REMOVE(&so->so_comp, sp, so_list); 684 so->so_qlen--; 685 sp->so_qstate &= ~SQ_COMP; 686 sp->so_head = NULL; 687 ACCEPT_UNLOCK(); 688 soabort(sp); 689 ACCEPT_LOCK(); 690 } 691 ACCEPT_UNLOCK(); 692 } 693 ACCEPT_LOCK(); 694 SOCK_LOCK(so); 695 KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF")); 696 so->so_state |= SS_NOFDREF; 697 sorele(so); 698 return (error); 699} 700 701/* 702 * soabort() is used to abruptly tear down a connection, such as when a 703 * resource limit is reached (listen queue depth exceeded), or if a listen 704 * socket is closed while there are sockets waiting to be accepted. 705 * 706 * This interface is tricky, because it is called on an unreferenced socket, 707 * and must be called only by a thread that has actually removed the socket 708 * from the listen queue it was on, or races with other threads are risked. 709 * 710 * This interface will call into the protocol code, so must not be called 711 * with any socket locks held. Protocols do call it while holding their own 712 * recursible protocol mutexes, but this is something that should be subject 713 * to review in the future. 714 */ 715void 716soabort(so) 717 struct socket *so; 718{ 719 720 /* 721 * In as much as is possible, assert that no references to this 722 * socket are held. This is not quite the same as asserting that the 723 * current thread is responsible for arranging for no references, but 724 * is as close as we can get for now. 
725 */ 726 KASSERT(so->so_count == 0, ("soabort: so_count")); 727 KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF")); 728 KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF")); 729 KASSERT((so->so_state & SQ_COMP) == 0, ("soabort: SQ_COMP")); 730 KASSERT((so->so_state & SQ_INCOMP) == 0, ("soabort: SQ_INCOMP")); 731 732 if (so->so_proto->pr_usrreqs->pru_abort != NULL) 733 (*so->so_proto->pr_usrreqs->pru_abort)(so); 734 ACCEPT_LOCK(); 735 SOCK_LOCK(so); 736 sofree(so); 737} 738 739int 740soaccept(so, nam) 741 struct socket *so; 742 struct sockaddr **nam; 743{ 744 int error; 745 746 SOCK_LOCK(so); 747 KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF")); 748 so->so_state &= ~SS_NOFDREF; 749 SOCK_UNLOCK(so); 750 error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam); 751 return (error); 752} 753 754int 755soconnect(so, nam, td) 756 struct socket *so; 757 struct sockaddr *nam; 758 struct thread *td; 759{ 760 int error; 761 762 if (so->so_options & SO_ACCEPTCONN) 763 return (EOPNOTSUPP); 764 /* 765 * If protocol is connection-based, can only connect once. 766 * Otherwise, if connected, try to disconnect first. This allows 767 * user to disconnect by connecting to, e.g., a null address. 768 */ 769 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) && 770 ((so->so_proto->pr_flags & PR_CONNREQUIRED) || 771 (error = sodisconnect(so)))) { 772 error = EISCONN; 773 } else { 774 /* 775 * Prevent accumulated error from previous connection from 776 * biting us. 
777 */ 778 so->so_error = 0; 779 error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td); 780 } 781 782 return (error); 783} 784 785int 786soconnect2(so1, so2) 787 struct socket *so1; 788 struct socket *so2; 789{ 790 791 return ((*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2)); 792} 793 794int 795sodisconnect(so) 796 struct socket *so; 797{ 798 int error; 799 800 if ((so->so_state & SS_ISCONNECTED) == 0) 801 return (ENOTCONN); 802 if (so->so_state & SS_ISDISCONNECTING) 803 return (EALREADY); 804 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so); 805 return (error); 806} 807 808#ifdef ZERO_COPY_SOCKETS 809struct so_zerocopy_stats{ 810 int size_ok; 811 int align_ok; 812 int found_ifp; 813}; 814struct so_zerocopy_stats so_zerocp_stats = {0,0,0}; 815#include <netinet/in.h> 816#include <net/route.h> 817#include <netinet/in_pcb.h> 818#include <vm/vm.h> 819#include <vm/vm_page.h> 820#include <vm/vm_object.h> 821 822/* 823 * sosend_copyin() is only used if zero copy sockets are enabled. Otherwise 824 * sosend_dgram() and sosend_generic() use m_uiotombuf(). 825 * 826 * sosend_copyin() accepts a uio and prepares an mbuf chain holding part or 827 * all of the data referenced by the uio. If desired, it uses zero-copy. 828 * *space will be updated to reflect data copied in. 829 * 830 * NB: If atomic I/O is requested, the caller must already have checked that 831 * space can hold resid bytes. 832 * 833 * NB: In the event of an error, the caller may need to free the partial 834 * chain pointed to by *mpp. The contents of both *uio and *space may be 835 * modified even in the case of an error. 
836 */ 837static int 838sosend_copyin(struct uio *uio, struct mbuf **retmp, int atomic, long *space, 839 int flags) 840{ 841 struct mbuf *m, **mp, *top; 842 long len, resid; 843 int error; 844#ifdef ZERO_COPY_SOCKETS 845 int cow_send; 846#endif 847 848 *retmp = top = NULL; 849 mp = ⊤ 850 len = 0; 851 resid = uio->uio_resid; 852 error = 0; 853 do { 854#ifdef ZERO_COPY_SOCKETS 855 cow_send = 0; 856#endif /* ZERO_COPY_SOCKETS */ 857 if (resid >= MINCLSIZE) { 858#ifdef ZERO_COPY_SOCKETS 859 if (top == NULL) { 860 MGETHDR(m, M_TRYWAIT, MT_DATA); 861 if (m == NULL) { 862 error = ENOBUFS; 863 goto out; 864 } 865 m->m_pkthdr.len = 0; 866 m->m_pkthdr.rcvif = NULL; 867 } else { 868 MGET(m, M_TRYWAIT, MT_DATA); 869 if (m == NULL) { 870 error = ENOBUFS; 871 goto out; 872 } 873 } 874 if (so_zero_copy_send && 875 resid>=PAGE_SIZE && 876 *space>=PAGE_SIZE && 877 uio->uio_iov->iov_len>=PAGE_SIZE) { 878 so_zerocp_stats.size_ok++; 879 so_zerocp_stats.align_ok++; 880 cow_send = socow_setup(m, uio); 881 len = cow_send; 882 } 883 if (!cow_send) { 884 MCLGET(m, M_TRYWAIT); 885 if ((m->m_flags & M_EXT) == 0) { 886 m_free(m); 887 m = NULL; 888 } else { 889 len = min(min(MCLBYTES, resid), 890 *space); 891 } 892 } 893#else /* ZERO_COPY_SOCKETS */ 894 if (top == NULL) { 895 m = m_getcl(M_TRYWAIT, MT_DATA, M_PKTHDR); 896 m->m_pkthdr.len = 0; 897 m->m_pkthdr.rcvif = NULL; 898 } else 899 m = m_getcl(M_TRYWAIT, MT_DATA, 0); 900 len = min(min(MCLBYTES, resid), *space); 901#endif /* ZERO_COPY_SOCKETS */ 902 } else { 903 if (top == NULL) { 904 m = m_gethdr(M_TRYWAIT, MT_DATA); 905 m->m_pkthdr.len = 0; 906 m->m_pkthdr.rcvif = NULL; 907 908 len = min(min(MHLEN, resid), *space); 909 /* 910 * For datagram protocols, leave room 911 * for protocol headers in first mbuf. 
912 */ 913 if (atomic && m && len < MHLEN) 914 MH_ALIGN(m, len); 915 } else { 916 m = m_get(M_TRYWAIT, MT_DATA); 917 len = min(min(MLEN, resid), *space); 918 } 919 } 920 if (m == NULL) { 921 error = ENOBUFS; 922 goto out; 923 } 924 925 *space -= len; 926#ifdef ZERO_COPY_SOCKETS 927 if (cow_send) 928 error = 0; 929 else 930#endif /* ZERO_COPY_SOCKETS */ 931 error = uiomove(mtod(m, void *), (int)len, uio); 932 resid = uio->uio_resid; 933 m->m_len = len; 934 *mp = m; 935 top->m_pkthdr.len += len; 936 if (error) 937 goto out; 938 mp = &m->m_next; 939 if (resid <= 0) { 940 if (flags & MSG_EOR) 941 top->m_flags |= M_EOR; 942 break; 943 } 944 } while (*space > 0 && atomic); 945out: 946 *retmp = top; 947 return (error); 948} 949#endif /*ZERO_COPY_SOCKETS*/ 950 951#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK) 952 953int 954sosend_dgram(so, addr, uio, top, control, flags, td) 955 struct socket *so; 956 struct sockaddr *addr; 957 struct uio *uio; 958 struct mbuf *top; 959 struct mbuf *control; 960 int flags; 961 struct thread *td; 962{ 963 long space, resid; 964 int clen = 0, error, dontroute; 965#ifdef ZERO_COPY_SOCKETS 966 int atomic = sosendallatonce(so) || top; 967#endif 968 969 KASSERT(so->so_type == SOCK_DGRAM, ("sodgram_send: !SOCK_DGRAM")); 970 KASSERT(so->so_proto->pr_flags & PR_ATOMIC, 971 ("sodgram_send: !PR_ATOMIC")); 972 973 if (uio != NULL) 974 resid = uio->uio_resid; 975 else 976 resid = top->m_pkthdr.len; 977 /* 978 * In theory resid should be unsigned. However, space must be 979 * signed, as it might be less than 0 if we over-committed, and we 980 * must use a signed comparison of space and resid. On the other 981 * hand, a negative resid causes us to loop sending 0-length 982 * segments to the protocol. 983 * 984 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM 985 * type sockets since that's an error. 
986 */ 987 if (resid < 0) { 988 error = EINVAL; 989 goto out; 990 } 991 992 dontroute = 993 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0; 994 if (td != NULL) 995 td->td_proc->p_stats->p_ru.ru_msgsnd++; 996 if (control != NULL) 997 clen = control->m_len; 998 999 SOCKBUF_LOCK(&so->so_snd); 1000 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 1001 SOCKBUF_UNLOCK(&so->so_snd); 1002 error = EPIPE; 1003 goto out; 1004 } 1005 if (so->so_error) { 1006 error = so->so_error; 1007 so->so_error = 0; 1008 SOCKBUF_UNLOCK(&so->so_snd); 1009 goto out; 1010 } 1011 if ((so->so_state & SS_ISCONNECTED) == 0) { 1012 /* 1013 * `sendto' and `sendmsg' is allowed on a connection-based 1014 * socket if it supports implied connect. Return ENOTCONN if 1015 * not connected and no address is supplied. 1016 */ 1017 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && 1018 (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { 1019 if ((so->so_state & SS_ISCONFIRMING) == 0 && 1020 !(resid == 0 && clen != 0)) { 1021 SOCKBUF_UNLOCK(&so->so_snd); 1022 error = ENOTCONN; 1023 goto out; 1024 } 1025 } else if (addr == NULL) { 1026 if (so->so_proto->pr_flags & PR_CONNREQUIRED) 1027 error = ENOTCONN; 1028 else 1029 error = EDESTADDRREQ; 1030 SOCKBUF_UNLOCK(&so->so_snd); 1031 goto out; 1032 } 1033 } 1034 1035 /* 1036 * Do we need MSG_OOB support in SOCK_DGRAM? Signs here may be a 1037 * problem and need fixing. 1038 */ 1039 space = sbspace(&so->so_snd); 1040 if (flags & MSG_OOB) 1041 space += 1024; 1042 space -= clen; 1043 SOCKBUF_UNLOCK(&so->so_snd); 1044 if (resid > space) { 1045 error = EMSGSIZE; 1046 goto out; 1047 } 1048 if (uio == NULL) { 1049 resid = 0; 1050 if (flags & MSG_EOR) 1051 top->m_flags |= M_EOR; 1052 } else { 1053#ifdef ZERO_COPY_SOCKETS 1054 error = sosend_copyin(uio, &top, atomic, &space, flags); 1055 if (error) 1056 goto out; 1057#else 1058 /* 1059 * Copy the data from userland into a mbuf chain. 1060 * If no data is to be copied in, a single empty mbuf 1061 * is returned. 
1062 */ 1063 top = m_uiotombuf(uio, M_WAITOK, space, max_hdr, 1064 (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0))); 1065 if (top == NULL) { 1066 error = EFAULT; /* only possible error */ 1067 goto out; 1068 } 1069 space -= resid - uio->uio_resid; 1070#endif 1071 resid = uio->uio_resid; 1072 } 1073 KASSERT(resid == 0, ("sosend_dgram: resid != 0")); 1074 /* 1075 * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock 1076 * than with. 1077 */ 1078 if (dontroute) { 1079 SOCK_LOCK(so); 1080 so->so_options |= SO_DONTROUTE; 1081 SOCK_UNLOCK(so); 1082 } 1083 /* 1084 * XXX all the SBS_CANTSENDMORE checks previously done could be out 1085 * of date. We could have recieved a reset packet in an interrupt or 1086 * maybe we slept while doing page faults in uiomove() etc. We could 1087 * probably recheck again inside the locking protection here, but 1088 * there are probably other places that this also happens. We must 1089 * rethink this. 1090 */ 1091 error = (*so->so_proto->pr_usrreqs->pru_send)(so, 1092 (flags & MSG_OOB) ? PRUS_OOB : 1093 /* 1094 * If the user set MSG_EOF, the protocol understands this flag and 1095 * nothing left to send then use PRU_SEND_EOF instead of PRU_SEND. 1096 */ 1097 ((flags & MSG_EOF) && 1098 (so->so_proto->pr_flags & PR_IMPLOPCL) && 1099 (resid <= 0)) ? 1100 PRUS_EOF : 1101 /* If there is more to send set PRUS_MORETOCOME */ 1102 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0, 1103 top, addr, control, td); 1104 if (dontroute) { 1105 SOCK_LOCK(so); 1106 so->so_options &= ~SO_DONTROUTE; 1107 SOCK_UNLOCK(so); 1108 } 1109 clen = 0; 1110 control = NULL; 1111 top = NULL; 1112out: 1113 if (top != NULL) 1114 m_freem(top); 1115 if (control != NULL) 1116 m_freem(control); 1117 return (error); 1118} 1119 1120/* 1121 * Send on a socket. If send must go all at once and message is larger than 1122 * send buffering, then hard error. Lock against other senders. 
If must go 1123 * all at once and not enough room now, then inform user that this would 1124 * block and do nothing. Otherwise, if nonblocking, send as much as 1125 * possible. The data to be sent is described by "uio" if nonzero, otherwise 1126 * by the mbuf chain "top" (which must be null if uio is not). Data provided 1127 * in mbuf chain must be small enough to send all at once. 1128 * 1129 * Returns nonzero on error, timeout or signal; callers must check for short 1130 * counts if EINTR/ERESTART are returned. Data and control buffers are freed 1131 * on return. 1132 */ 1133#define snderr(errno) { error = (errno); goto release; } 1134int 1135sosend_generic(so, addr, uio, top, control, flags, td) 1136 struct socket *so; 1137 struct sockaddr *addr; 1138 struct uio *uio; 1139 struct mbuf *top; 1140 struct mbuf *control; 1141 int flags; 1142 struct thread *td; 1143{ 1144 long space, resid; 1145 int clen = 0, error, dontroute; 1146 int atomic = sosendallatonce(so) || top; 1147 1148 if (uio != NULL) 1149 resid = uio->uio_resid; 1150 else 1151 resid = top->m_pkthdr.len; 1152 /* 1153 * In theory resid should be unsigned. However, space must be 1154 * signed, as it might be less than 0 if we over-committed, and we 1155 * must use a signed comparison of space and resid. On the other 1156 * hand, a negative resid causes us to loop sending 0-length 1157 * segments to the protocol. 1158 * 1159 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM 1160 * type sockets since that's an error. 
1161 */ 1162 if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) { 1163 error = EINVAL; 1164 goto out; 1165 } 1166 1167 dontroute = 1168 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 && 1169 (so->so_proto->pr_flags & PR_ATOMIC); 1170 if (td != NULL) 1171 td->td_proc->p_stats->p_ru.ru_msgsnd++; 1172 if (control != NULL) 1173 clen = control->m_len; 1174 1175 SOCKBUF_LOCK(&so->so_snd); 1176restart: 1177 SOCKBUF_LOCK_ASSERT(&so->so_snd); 1178 error = sblock(&so->so_snd, SBLOCKWAIT(flags)); 1179 if (error) 1180 goto out_locked; 1181 do { 1182 SOCKBUF_LOCK_ASSERT(&so->so_snd); 1183 if (so->so_snd.sb_state & SBS_CANTSENDMORE) 1184 snderr(EPIPE); 1185 if (so->so_error) { 1186 error = so->so_error; 1187 so->so_error = 0; 1188 goto release; 1189 } 1190 if ((so->so_state & SS_ISCONNECTED) == 0) { 1191 /* 1192 * `sendto' and `sendmsg' is allowed on a connection- 1193 * based socket if it supports implied connect. 1194 * Return ENOTCONN if not connected and no address is 1195 * supplied. 1196 */ 1197 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && 1198 (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { 1199 if ((so->so_state & SS_ISCONFIRMING) == 0 && 1200 !(resid == 0 && clen != 0)) 1201 snderr(ENOTCONN); 1202 } else if (addr == NULL) 1203 snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ? 
1204 ENOTCONN : EDESTADDRREQ); 1205 } 1206 space = sbspace(&so->so_snd); 1207 if (flags & MSG_OOB) 1208 space += 1024; 1209 if ((atomic && resid > so->so_snd.sb_hiwat) || 1210 clen > so->so_snd.sb_hiwat) 1211 snderr(EMSGSIZE); 1212 if (space < resid + clen && 1213 (atomic || space < so->so_snd.sb_lowat || space < clen)) { 1214 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) 1215 snderr(EWOULDBLOCK); 1216 sbunlock(&so->so_snd); 1217 error = sbwait(&so->so_snd); 1218 if (error) 1219 goto out_locked; 1220 goto restart; 1221 } 1222 SOCKBUF_UNLOCK(&so->so_snd); 1223 space -= clen; 1224 do { 1225 if (uio == NULL) { 1226 resid = 0; 1227 if (flags & MSG_EOR) 1228 top->m_flags |= M_EOR; 1229 } else { 1230#ifdef ZERO_COPY_SOCKETS 1231 error = sosend_copyin(uio, &top, atomic, 1232 &space, flags); 1233 if (error != 0) { 1234 SOCKBUF_LOCK(&so->so_snd); 1235 goto release; 1236 } 1237#else 1238 /* 1239 * Copy the data from userland into a mbuf 1240 * chain. If no data is to be copied in, 1241 * a single empty mbuf is returned. 1242 */ 1243 top = m_uiotombuf(uio, M_WAITOK, space, 1244 (atomic ? max_hdr : 0), 1245 (atomic ? M_PKTHDR : 0) | 1246 ((flags & MSG_EOR) ? M_EOR : 0)); 1247 if (top == NULL) { 1248 SOCKBUF_LOCK(&so->so_snd); 1249 error = EFAULT; /* only possible error */ 1250 goto release; 1251 } 1252 space -= resid - uio->uio_resid; 1253#endif 1254 resid = uio->uio_resid; 1255 } 1256 if (dontroute) { 1257 SOCK_LOCK(so); 1258 so->so_options |= SO_DONTROUTE; 1259 SOCK_UNLOCK(so); 1260 } 1261 /* 1262 * XXX all the SBS_CANTSENDMORE checks previously 1263 * done could be out of date. We could have recieved 1264 * a reset packet in an interrupt or maybe we slept 1265 * while doing page faults in uiomove() etc. We 1266 * could probably recheck again inside the locking 1267 * protection here, but there are probably other 1268 * places that this also happens. We must rethink 1269 * this. 1270 */ 1271 error = (*so->so_proto->pr_usrreqs->pru_send)(so, 1272 (flags & MSG_OOB) ? 
PRUS_OOB : 1273 /* 1274 * If the user set MSG_EOF, the protocol understands 1275 * this flag and nothing left to send then use 1276 * PRU_SEND_EOF instead of PRU_SEND. 1277 */ 1278 ((flags & MSG_EOF) && 1279 (so->so_proto->pr_flags & PR_IMPLOPCL) && 1280 (resid <= 0)) ? 1281 PRUS_EOF : 1282 /* If there is more to send set PRUS_MORETOCOME. */ 1283 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0, 1284 top, addr, control, td); 1285 if (dontroute) { 1286 SOCK_LOCK(so); 1287 so->so_options &= ~SO_DONTROUTE; 1288 SOCK_UNLOCK(so); 1289 } 1290 clen = 0; 1291 control = NULL; 1292 top = NULL; 1293 if (error) { 1294 SOCKBUF_LOCK(&so->so_snd); 1295 goto release; 1296 } 1297 } while (resid && space > 0); 1298 SOCKBUF_LOCK(&so->so_snd); 1299 } while (resid); 1300 1301release: 1302 SOCKBUF_LOCK_ASSERT(&so->so_snd); 1303 sbunlock(&so->so_snd); 1304out_locked: 1305 SOCKBUF_LOCK_ASSERT(&so->so_snd); 1306 SOCKBUF_UNLOCK(&so->so_snd); 1307out: 1308 if (top != NULL) 1309 m_freem(top); 1310 if (control != NULL) 1311 m_freem(control); 1312 return (error); 1313} 1314#undef snderr 1315 1316int 1317sosend(so, addr, uio, top, control, flags, td) 1318 struct socket *so; 1319 struct sockaddr *addr; 1320 struct uio *uio; 1321 struct mbuf *top; 1322 struct mbuf *control; 1323 int flags; 1324 struct thread *td; 1325{ 1326 1327 /* XXXRW: Temporary debugging. */ 1328 KASSERT(so->so_proto->pr_usrreqs->pru_sosend != sosend, 1329 ("sosend: protocol calls sosend")); 1330 1331 return (so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top, 1332 control, flags, td)); 1333} 1334 1335/* 1336 * The part of soreceive() that implements reading non-inline out-of-band 1337 * data from a socket. For more complete comments, see soreceive(), from 1338 * which this code originated. 1339 * 1340 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is 1341 * unable to return an mbuf chain to the caller. 
1342 */ 1343static int 1344soreceive_rcvoob(so, uio, flags) 1345 struct socket *so; 1346 struct uio *uio; 1347 int flags; 1348{ 1349 struct protosw *pr = so->so_proto; 1350 struct mbuf *m; 1351 int error; 1352 1353 KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0")); 1354 1355 m = m_get(M_TRYWAIT, MT_DATA); 1356 if (m == NULL) 1357 return (ENOBUFS); 1358 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK); 1359 if (error) 1360 goto bad; 1361 do { 1362#ifdef ZERO_COPY_SOCKETS 1363 if (so_zero_copy_receive) { 1364 int disposable; 1365 1366 if ((m->m_flags & M_EXT) 1367 && (m->m_ext.ext_type == EXT_DISPOSABLE)) 1368 disposable = 1; 1369 else 1370 disposable = 0; 1371 1372 error = uiomoveco(mtod(m, void *), 1373 min(uio->uio_resid, m->m_len), 1374 uio, disposable); 1375 } else 1376#endif /* ZERO_COPY_SOCKETS */ 1377 error = uiomove(mtod(m, void *), 1378 (int) min(uio->uio_resid, m->m_len), uio); 1379 m = m_free(m); 1380 } while (uio->uio_resid && error == 0 && m); 1381bad: 1382 if (m != NULL) 1383 m_freem(m); 1384 return (error); 1385} 1386 1387/* 1388 * Following replacement or removal of the first mbuf on the first mbuf chain 1389 * of a socket buffer, push necessary state changes back into the socket 1390 * buffer so that other consumers see the values consistently. 'nextrecord' 1391 * is the callers locally stored value of the original value of 1392 * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes. 1393 * NOTE: 'nextrecord' may be NULL. 1394 */ 1395static __inline void 1396sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord) 1397{ 1398 1399 SOCKBUF_LOCK_ASSERT(sb); 1400 /* 1401 * First, update for the new value of nextrecord. If necessary, make 1402 * it the first record. 1403 */ 1404 if (sb->sb_mb != NULL) 1405 sb->sb_mb->m_nextpkt = nextrecord; 1406 else 1407 sb->sb_mb = nextrecord; 1408 1409 /* 1410 * Now update any dependent socket buffer fields to reflect the new 1411 * state. 
This is an expanded inline of SB_EMPTY_FIXUP(), with the 1412 * addition of a second clause that takes care of the case where 1413 * sb_mb has been updated, but remains the last record. 1414 */ 1415 if (sb->sb_mb == NULL) { 1416 sb->sb_mbtail = NULL; 1417 sb->sb_lastrecord = NULL; 1418 } else if (sb->sb_mb->m_nextpkt == NULL) 1419 sb->sb_lastrecord = sb->sb_mb; 1420} 1421 1422 1423/* 1424 * Implement receive operations on a socket. We depend on the way that 1425 * records are added to the sockbuf by sbappend. In particular, each record 1426 * (mbufs linked through m_next) must begin with an address if the protocol 1427 * so specifies, followed by an optional mbuf or mbufs containing ancillary 1428 * data, and then zero or more mbufs of data. In order to allow parallelism 1429 * between network receive and copying to user space, as well as avoid 1430 * sleeping with a mutex held, we release the socket buffer mutex during the 1431 * user space copy. Although the sockbuf is locked, new data may still be 1432 * appended, and thus we must maintain consistency of the sockbuf during that 1433 * time. 1434 * 1435 * The caller may receive the data as a single mbuf chain by supplying an 1436 * mbuf **mp0 for use in returning the chain. The uio is then used only for 1437 * the count in uio_resid. 
1438 */ 1439int 1440soreceive_generic(so, psa, uio, mp0, controlp, flagsp) 1441 struct socket *so; 1442 struct sockaddr **psa; 1443 struct uio *uio; 1444 struct mbuf **mp0; 1445 struct mbuf **controlp; 1446 int *flagsp; 1447{ 1448 struct mbuf *m, **mp; 1449 int flags, len, error, offset; 1450 struct protosw *pr = so->so_proto; 1451 struct mbuf *nextrecord; 1452 int moff, type = 0; 1453 int orig_resid = uio->uio_resid; 1454 1455 mp = mp0; 1456 if (psa != NULL) 1457 *psa = NULL; 1458 if (controlp != NULL) 1459 *controlp = NULL; 1460 if (flagsp != NULL) 1461 flags = *flagsp &~ MSG_EOR; 1462 else 1463 flags = 0; 1464 if (flags & MSG_OOB) 1465 return (soreceive_rcvoob(so, uio, flags)); 1466 if (mp != NULL) 1467 *mp = NULL; 1468 if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING) 1469 && uio->uio_resid) 1470 (*pr->pr_usrreqs->pru_rcvd)(so, 0); 1471 1472 SOCKBUF_LOCK(&so->so_rcv); 1473restart: 1474 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1475 error = sblock(&so->so_rcv, SBLOCKWAIT(flags)); 1476 if (error) 1477 goto out; 1478 1479 m = so->so_rcv.sb_mb; 1480 /* 1481 * If we have less data than requested, block awaiting more (subject 1482 * to any timeout) if: 1483 * 1. the current count is less than the low water mark, or 1484 * 2. MSG_WAITALL is set, and it is possible to do the entire 1485 * receive operation at once if we block (resid <= hiwat). 1486 * 3. MSG_DONTWAIT is not set 1487 * If MSG_WAITALL is set but resid is larger than the receive buffer, 1488 * we have to do the receive in sections, and thus risk returning a 1489 * short count if a timeout or signal occurs after we start. 
1490 */ 1491 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 && 1492 so->so_rcv.sb_cc < uio->uio_resid) && 1493 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat || 1494 ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) && 1495 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) { 1496 KASSERT(m != NULL || !so->so_rcv.sb_cc, 1497 ("receive: m == %p so->so_rcv.sb_cc == %u", 1498 m, so->so_rcv.sb_cc)); 1499 if (so->so_error) { 1500 if (m != NULL) 1501 goto dontblock; 1502 error = so->so_error; 1503 if ((flags & MSG_PEEK) == 0) 1504 so->so_error = 0; 1505 goto release; 1506 } 1507 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1508 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 1509 if (m) 1510 goto dontblock; 1511 else 1512 goto release; 1513 } 1514 for (; m != NULL; m = m->m_next) 1515 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { 1516 m = so->so_rcv.sb_mb; 1517 goto dontblock; 1518 } 1519 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 && 1520 (so->so_proto->pr_flags & PR_CONNREQUIRED)) { 1521 error = ENOTCONN; 1522 goto release; 1523 } 1524 if (uio->uio_resid == 0) 1525 goto release; 1526 if ((so->so_state & SS_NBIO) || 1527 (flags & (MSG_DONTWAIT|MSG_NBIO))) { 1528 error = EWOULDBLOCK; 1529 goto release; 1530 } 1531 SBLASTRECORDCHK(&so->so_rcv); 1532 SBLASTMBUFCHK(&so->so_rcv); 1533 sbunlock(&so->so_rcv); 1534 error = sbwait(&so->so_rcv); 1535 if (error) 1536 goto out; 1537 goto restart; 1538 } 1539dontblock: 1540 /* 1541 * From this point onward, we maintain 'nextrecord' as a cache of the 1542 * pointer to the next record in the socket buffer. We must keep the 1543 * various socket buffer pointers and local stack versions of the 1544 * pointers in sync, pushing out modifications before dropping the 1545 * socket buffer mutex, and re-reading them when picking it up. 
1546 * 1547 * Otherwise, we will race with the network stack appending new data 1548 * or records onto the socket buffer by using inconsistent/stale 1549 * versions of the field, possibly resulting in socket buffer 1550 * corruption. 1551 * 1552 * By holding the high-level sblock(), we prevent simultaneous 1553 * readers from pulling off the front of the socket buffer. 1554 */ 1555 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1556 if (uio->uio_td) 1557 uio->uio_td->td_proc->p_stats->p_ru.ru_msgrcv++; 1558 KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb")); 1559 SBLASTRECORDCHK(&so->so_rcv); 1560 SBLASTMBUFCHK(&so->so_rcv); 1561 nextrecord = m->m_nextpkt; 1562 if (pr->pr_flags & PR_ADDR) { 1563 KASSERT(m->m_type == MT_SONAME, 1564 ("m->m_type == %d", m->m_type)); 1565 orig_resid = 0; 1566 if (psa != NULL) 1567 *psa = sodupsockaddr(mtod(m, struct sockaddr *), 1568 M_NOWAIT); 1569 if (flags & MSG_PEEK) { 1570 m = m->m_next; 1571 } else { 1572 sbfree(&so->so_rcv, m); 1573 so->so_rcv.sb_mb = m_free(m); 1574 m = so->so_rcv.sb_mb; 1575 sockbuf_pushsync(&so->so_rcv, nextrecord); 1576 } 1577 } 1578 1579 /* 1580 * Process one or more MT_CONTROL mbufs present before any data mbufs 1581 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we 1582 * just copy the data; if !MSG_PEEK, we call into the protocol to 1583 * perform externalization (or freeing if controlp == NULL). 
1584 */ 1585 if (m != NULL && m->m_type == MT_CONTROL) { 1586 struct mbuf *cm = NULL, *cmn; 1587 struct mbuf **cme = &cm; 1588 1589 do { 1590 if (flags & MSG_PEEK) { 1591 if (controlp != NULL) { 1592 *controlp = m_copy(m, 0, m->m_len); 1593 controlp = &(*controlp)->m_next; 1594 } 1595 m = m->m_next; 1596 } else { 1597 sbfree(&so->so_rcv, m); 1598 so->so_rcv.sb_mb = m->m_next; 1599 m->m_next = NULL; 1600 *cme = m; 1601 cme = &(*cme)->m_next; 1602 m = so->so_rcv.sb_mb; 1603 } 1604 } while (m != NULL && m->m_type == MT_CONTROL); 1605 if ((flags & MSG_PEEK) == 0) 1606 sockbuf_pushsync(&so->so_rcv, nextrecord); 1607 while (cm != NULL) { 1608 cmn = cm->m_next; 1609 cm->m_next = NULL; 1610 if (pr->pr_domain->dom_externalize != NULL) { 1611 SOCKBUF_UNLOCK(&so->so_rcv); 1612 error = (*pr->pr_domain->dom_externalize) 1613 (cm, controlp); 1614 SOCKBUF_LOCK(&so->so_rcv); 1615 } else if (controlp != NULL) 1616 *controlp = cm; 1617 else 1618 m_freem(cm); 1619 if (controlp != NULL) { 1620 orig_resid = 0; 1621 while (*controlp != NULL) 1622 controlp = &(*controlp)->m_next; 1623 } 1624 cm = cmn; 1625 } 1626 if (m != NULL) 1627 nextrecord = so->so_rcv.sb_mb->m_nextpkt; 1628 else 1629 nextrecord = so->so_rcv.sb_mb; 1630 orig_resid = 0; 1631 } 1632 if (m != NULL) { 1633 if ((flags & MSG_PEEK) == 0) { 1634 KASSERT(m->m_nextpkt == nextrecord, 1635 ("soreceive: post-control, nextrecord !sync")); 1636 if (nextrecord == NULL) { 1637 KASSERT(so->so_rcv.sb_mb == m, 1638 ("soreceive: post-control, sb_mb!=m")); 1639 KASSERT(so->so_rcv.sb_lastrecord == m, 1640 ("soreceive: post-control, lastrecord!=m")); 1641 } 1642 } 1643 type = m->m_type; 1644 if (type == MT_OOBDATA) 1645 flags |= MSG_OOB; 1646 } else { 1647 if ((flags & MSG_PEEK) == 0) { 1648 KASSERT(so->so_rcv.sb_mb == nextrecord, 1649 ("soreceive: sb_mb != nextrecord")); 1650 if (so->so_rcv.sb_mb == NULL) { 1651 KASSERT(so->so_rcv.sb_lastrecord == NULL, 1652 ("soreceive: sb_lastercord != NULL")); 1653 } 1654 } 1655 } 1656 
SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1657 SBLASTRECORDCHK(&so->so_rcv); 1658 SBLASTMBUFCHK(&so->so_rcv); 1659 1660 /* 1661 * Now continue to read any data mbufs off of the head of the socket 1662 * buffer until the read request is satisfied. Note that 'type' is 1663 * used to store the type of any mbuf reads that have happened so far 1664 * such that soreceive() can stop reading if the type changes, which 1665 * causes soreceive() to return only one of regular data and inline 1666 * out-of-band data in a single socket receive operation. 1667 */ 1668 moff = 0; 1669 offset = 0; 1670 while (m != NULL && uio->uio_resid > 0 && error == 0) { 1671 /* 1672 * If the type of mbuf has changed since the last mbuf 1673 * examined ('type'), end the receive operation. 1674 */ 1675 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1676 if (m->m_type == MT_OOBDATA) { 1677 if (type != MT_OOBDATA) 1678 break; 1679 } else if (type == MT_OOBDATA) 1680 break; 1681 else 1682 KASSERT(m->m_type == MT_DATA, 1683 ("m->m_type == %d", m->m_type)); 1684 so->so_rcv.sb_state &= ~SBS_RCVATMARK; 1685 len = uio->uio_resid; 1686 if (so->so_oobmark && len > so->so_oobmark - offset) 1687 len = so->so_oobmark - offset; 1688 if (len > m->m_len - moff) 1689 len = m->m_len - moff; 1690 /* 1691 * If mp is set, just pass back the mbufs. Otherwise copy 1692 * them out via the uio, then free. Sockbuf must be 1693 * consistent here (points to current mbuf, it points to next 1694 * record) when we drop priority; we must note any additions 1695 * to the sockbuf when we block interrupts again. 
1696 */ 1697 if (mp == NULL) { 1698 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1699 SBLASTRECORDCHK(&so->so_rcv); 1700 SBLASTMBUFCHK(&so->so_rcv); 1701 SOCKBUF_UNLOCK(&so->so_rcv); 1702#ifdef ZERO_COPY_SOCKETS 1703 if (so_zero_copy_receive) { 1704 int disposable; 1705 1706 if ((m->m_flags & M_EXT) 1707 && (m->m_ext.ext_type == EXT_DISPOSABLE)) 1708 disposable = 1; 1709 else 1710 disposable = 0; 1711 1712 error = uiomoveco(mtod(m, char *) + moff, 1713 (int)len, uio, 1714 disposable); 1715 } else 1716#endif /* ZERO_COPY_SOCKETS */ 1717 error = uiomove(mtod(m, char *) + moff, (int)len, uio); 1718 SOCKBUF_LOCK(&so->so_rcv); 1719 if (error) { 1720 /* 1721 * The MT_SONAME mbuf has already been removed 1722 * from the record, so it is necessary to 1723 * remove the data mbufs, if any, to preserve 1724 * the invariant in the case of PR_ADDR that 1725 * requires MT_SONAME mbufs at the head of 1726 * each record. 1727 */ 1728 if (m && pr->pr_flags & PR_ATOMIC 1729 && ((flags & MSG_PEEK) == 0)) { 1730 (void)sbdroprecord_locked(&so->so_rcv); 1731 } 1732 goto release; 1733 } 1734 } else 1735 uio->uio_resid -= len; 1736 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1737 if (len == m->m_len - moff) { 1738 if (m->m_flags & M_EOR) 1739 flags |= MSG_EOR; 1740 if (flags & MSG_PEEK) { 1741 m = m->m_next; 1742 moff = 0; 1743 } else { 1744 nextrecord = m->m_nextpkt; 1745 sbfree(&so->so_rcv, m); 1746 if (mp != NULL) { 1747 *mp = m; 1748 mp = &m->m_next; 1749 so->so_rcv.sb_mb = m = m->m_next; 1750 *mp = NULL; 1751 } else { 1752 so->so_rcv.sb_mb = m_free(m); 1753 m = so->so_rcv.sb_mb; 1754 } 1755 sockbuf_pushsync(&so->so_rcv, nextrecord); 1756 SBLASTRECORDCHK(&so->so_rcv); 1757 SBLASTMBUFCHK(&so->so_rcv); 1758 } 1759 } else { 1760 if (flags & MSG_PEEK) 1761 moff += len; 1762 else { 1763 if (mp != NULL) { 1764 int copy_flag; 1765 1766 if (flags & MSG_DONTWAIT) 1767 copy_flag = M_DONTWAIT; 1768 else 1769 copy_flag = M_TRYWAIT; 1770 if (copy_flag == M_TRYWAIT) 1771 SOCKBUF_UNLOCK(&so->so_rcv); 1772 *mp = 
m_copym(m, 0, len, copy_flag); 1773 if (copy_flag == M_TRYWAIT) 1774 SOCKBUF_LOCK(&so->so_rcv); 1775 if (*mp == NULL) { 1776 /* 1777 * m_copym() couldn't 1778 * allocate an mbuf. Adjust 1779 * uio_resid back (it was 1780 * adjusted down by len 1781 * bytes, which we didn't end 1782 * up "copying" over). 1783 */ 1784 uio->uio_resid += len; 1785 break; 1786 } 1787 } 1788 m->m_data += len; 1789 m->m_len -= len; 1790 so->so_rcv.sb_cc -= len; 1791 } 1792 } 1793 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1794 if (so->so_oobmark) { 1795 if ((flags & MSG_PEEK) == 0) { 1796 so->so_oobmark -= len; 1797 if (so->so_oobmark == 0) { 1798 so->so_rcv.sb_state |= SBS_RCVATMARK; 1799 break; 1800 } 1801 } else { 1802 offset += len; 1803 if (offset == so->so_oobmark) 1804 break; 1805 } 1806 } 1807 if (flags & MSG_EOR) 1808 break; 1809 /* 1810 * If the MSG_WAITALL flag is set (for non-atomic socket), we 1811 * must not quit until "uio->uio_resid == 0" or an error 1812 * termination. If a signal/timeout occurs, return with a 1813 * short count but without error. Keep sockbuf locked 1814 * against other readers. 1815 */ 1816 while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 && 1817 !sosendallatonce(so) && nextrecord == NULL) { 1818 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1819 if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE) 1820 break; 1821 /* 1822 * Notify the protocol that some data has been 1823 * drained before blocking. 
1824 */ 1825 if (pr->pr_flags & PR_WANTRCVD) { 1826 SOCKBUF_UNLOCK(&so->so_rcv); 1827 (*pr->pr_usrreqs->pru_rcvd)(so, flags); 1828 SOCKBUF_LOCK(&so->so_rcv); 1829 } 1830 SBLASTRECORDCHK(&so->so_rcv); 1831 SBLASTMBUFCHK(&so->so_rcv); 1832 error = sbwait(&so->so_rcv); 1833 if (error) 1834 goto release; 1835 m = so->so_rcv.sb_mb; 1836 if (m != NULL) 1837 nextrecord = m->m_nextpkt; 1838 } 1839 } 1840 1841 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1842 if (m != NULL && pr->pr_flags & PR_ATOMIC) { 1843 flags |= MSG_TRUNC; 1844 if ((flags & MSG_PEEK) == 0) 1845 (void) sbdroprecord_locked(&so->so_rcv); 1846 } 1847 if ((flags & MSG_PEEK) == 0) { 1848 if (m == NULL) { 1849 /* 1850 * First part is an inline SB_EMPTY_FIXUP(). Second 1851 * part makes sure sb_lastrecord is up-to-date if 1852 * there is still data in the socket buffer. 1853 */ 1854 so->so_rcv.sb_mb = nextrecord; 1855 if (so->so_rcv.sb_mb == NULL) { 1856 so->so_rcv.sb_mbtail = NULL; 1857 so->so_rcv.sb_lastrecord = NULL; 1858 } else if (nextrecord->m_nextpkt == NULL) 1859 so->so_rcv.sb_lastrecord = nextrecord; 1860 } 1861 SBLASTRECORDCHK(&so->so_rcv); 1862 SBLASTMBUFCHK(&so->so_rcv); 1863 /* 1864 * If soreceive() is being done from the socket callback, 1865 * then don't need to generate ACK to peer to update window, 1866 * since ACK will be generated on return to TCP. 
1867 */ 1868 if (!(flags & MSG_SOCALLBCK) && 1869 (pr->pr_flags & PR_WANTRCVD)) { 1870 SOCKBUF_UNLOCK(&so->so_rcv); 1871 (*pr->pr_usrreqs->pru_rcvd)(so, flags); 1872 SOCKBUF_LOCK(&so->so_rcv); 1873 } 1874 } 1875 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1876 if (orig_resid == uio->uio_resid && orig_resid && 1877 (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) { 1878 sbunlock(&so->so_rcv); 1879 goto restart; 1880 } 1881 1882 if (flagsp != NULL) 1883 *flagsp |= flags; 1884release: 1885 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1886 sbunlock(&so->so_rcv); 1887out: 1888 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1889 SOCKBUF_UNLOCK(&so->so_rcv); 1890 return (error); 1891} 1892 1893int 1894soreceive(so, psa, uio, mp0, controlp, flagsp) 1895 struct socket *so; 1896 struct sockaddr **psa; 1897 struct uio *uio; 1898 struct mbuf **mp0; 1899 struct mbuf **controlp; 1900 int *flagsp; 1901{ 1902 1903 /* XXXRW: Temporary debugging. */ 1904 KASSERT(so->so_proto->pr_usrreqs->pru_soreceive != soreceive, 1905 ("soreceive: protocol calls soreceive")); 1906 1907 return (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio, mp0, 1908 controlp, flagsp)); 1909} 1910 1911int 1912soshutdown(so, how) 1913 struct socket *so; 1914 int how; 1915{ 1916 struct protosw *pr = so->so_proto; 1917 1918 if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR)) 1919 return (EINVAL); 1920 1921 if (how != SHUT_WR) 1922 sorflush(so); 1923 if (how != SHUT_RD) 1924 return ((*pr->pr_usrreqs->pru_shutdown)(so)); 1925 return (0); 1926} 1927 1928void 1929sorflush(so) 1930 struct socket *so; 1931{ 1932 struct sockbuf *sb = &so->so_rcv; 1933 struct protosw *pr = so->so_proto; 1934 struct sockbuf asb; 1935 1936 /* 1937 * XXXRW: This is quite ugly. Previously, this code made a copy of 1938 * the socket buffer, then zero'd the original to clear the buffer 1939 * fields. However, with mutexes in the socket buffer, this causes 1940 * problems. 
We only clear the zeroable bits of the original; 1941 * however, we have to initialize and destroy the mutex in the copy 1942 * so that dom_dispose() and sbrelease() can lock t as needed. 1943 */ 1944 SOCKBUF_LOCK(sb); 1945 sb->sb_flags |= SB_NOINTR; 1946 (void) sblock(sb, M_WAITOK); 1947 /* 1948 * socantrcvmore_locked() drops the socket buffer mutex so that it 1949 * can safely perform wakeups. Re-acquire the mutex before 1950 * continuing. 1951 */ 1952 socantrcvmore_locked(so); 1953 SOCKBUF_LOCK(sb); 1954 sbunlock(sb); 1955 /* 1956 * Invalidate/clear most of the sockbuf structure, but leave selinfo 1957 * and mutex data unchanged. 1958 */ 1959 bzero(&asb, offsetof(struct sockbuf, sb_startzero)); 1960 bcopy(&sb->sb_startzero, &asb.sb_startzero, 1961 sizeof(*sb) - offsetof(struct sockbuf, sb_startzero)); 1962 bzero(&sb->sb_startzero, 1963 sizeof(*sb) - offsetof(struct sockbuf, sb_startzero)); 1964 SOCKBUF_UNLOCK(sb); 1965 1966 SOCKBUF_LOCK_INIT(&asb, "so_rcv"); 1967 if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL) 1968 (*pr->pr_domain->dom_dispose)(asb.sb_mb); 1969 sbrelease(&asb, so); 1970 SOCKBUF_LOCK_DESTROY(&asb); 1971} 1972 1973/* 1974 * Perhaps this routine, and sooptcopyout(), below, ought to come in an 1975 * additional variant to handle the case where the option value needs to be 1976 * some kind of integer, but not a specific size. In addition to their use 1977 * here, these functions are also called by the protocol-level pr_ctloutput() 1978 * routines. 1979 */ 1980int 1981sooptcopyin(sopt, buf, len, minlen) 1982 struct sockopt *sopt; 1983 void *buf; 1984 size_t len; 1985 size_t minlen; 1986{ 1987 size_t valsize; 1988 1989 /* 1990 * If the user gives us more than we wanted, we ignore it, but if we 1991 * don't get the minimum length the caller wants, we return EINVAL. 1992 * On success, sopt->sopt_valsize is set to however much we actually 1993 * retrieved. 
1994 */ 1995 if ((valsize = sopt->sopt_valsize) < minlen) 1996 return EINVAL; 1997 if (valsize > len) 1998 sopt->sopt_valsize = valsize = len; 1999 2000 if (sopt->sopt_td != NULL) 2001 return (copyin(sopt->sopt_val, buf, valsize)); 2002 2003 bcopy(sopt->sopt_val, buf, valsize); 2004 return (0); 2005} 2006 2007/* 2008 * Kernel version of setsockopt(2). 2009 * 2010 * XXX: optlen is size_t, not socklen_t 2011 */ 2012int 2013so_setsockopt(struct socket *so, int level, int optname, void *optval, 2014 size_t optlen) 2015{ 2016 struct sockopt sopt; 2017 2018 sopt.sopt_level = level; 2019 sopt.sopt_name = optname; 2020 sopt.sopt_dir = SOPT_SET; 2021 sopt.sopt_val = optval; 2022 sopt.sopt_valsize = optlen; 2023 sopt.sopt_td = NULL; 2024 return (sosetopt(so, &sopt)); 2025} 2026 2027int 2028sosetopt(so, sopt) 2029 struct socket *so; 2030 struct sockopt *sopt; 2031{ 2032 int error, optval; 2033 struct linger l; 2034 struct timeval tv; 2035 u_long val; 2036#ifdef MAC 2037 struct mac extmac; 2038#endif 2039 2040 error = 0; 2041 if (sopt->sopt_level != SOL_SOCKET) { 2042 if (so->so_proto && so->so_proto->pr_ctloutput) 2043 return ((*so->so_proto->pr_ctloutput) 2044 (so, sopt)); 2045 error = ENOPROTOOPT; 2046 } else { 2047 switch (sopt->sopt_name) { 2048#ifdef INET 2049 case SO_ACCEPTFILTER: 2050 error = do_setopt_accept_filter(so, sopt); 2051 if (error) 2052 goto bad; 2053 break; 2054#endif 2055 case SO_LINGER: 2056 error = sooptcopyin(sopt, &l, sizeof l, sizeof l); 2057 if (error) 2058 goto bad; 2059 2060 SOCK_LOCK(so); 2061 so->so_linger = l.l_linger; 2062 if (l.l_onoff) 2063 so->so_options |= SO_LINGER; 2064 else 2065 so->so_options &= ~SO_LINGER; 2066 SOCK_UNLOCK(so); 2067 break; 2068 2069 case SO_DEBUG: 2070 case SO_KEEPALIVE: 2071 case SO_DONTROUTE: 2072 case SO_USELOOPBACK: 2073 case SO_BROADCAST: 2074 case SO_REUSEADDR: 2075 case SO_REUSEPORT: 2076 case SO_OOBINLINE: 2077 case SO_TIMESTAMP: 2078 case SO_BINTIME: 2079 case SO_NOSIGPIPE: 2080 error = sooptcopyin(sopt, 
&optval, sizeof optval, 2081 sizeof optval); 2082 if (error) 2083 goto bad; 2084 SOCK_LOCK(so); 2085 if (optval) 2086 so->so_options |= sopt->sopt_name; 2087 else 2088 so->so_options &= ~sopt->sopt_name; 2089 SOCK_UNLOCK(so); 2090 break; 2091 2092 case SO_SNDBUF: 2093 case SO_RCVBUF: 2094 case SO_SNDLOWAT: 2095 case SO_RCVLOWAT: 2096 error = sooptcopyin(sopt, &optval, sizeof optval, 2097 sizeof optval); 2098 if (error) 2099 goto bad; 2100 2101 /* 2102 * Values < 1 make no sense for any of these options, 2103 * so disallow them. 2104 */ 2105 if (optval < 1) { 2106 error = EINVAL; 2107 goto bad; 2108 } 2109 2110 switch (sopt->sopt_name) { 2111 case SO_SNDBUF: 2112 case SO_RCVBUF: 2113 if (sbreserve(sopt->sopt_name == SO_SNDBUF ? 2114 &so->so_snd : &so->so_rcv, (u_long)optval, 2115 so, curthread) == 0) { 2116 error = ENOBUFS; 2117 goto bad; 2118 } 2119 break; 2120 2121 /* 2122 * Make sure the low-water is never greater than the 2123 * high-water. 2124 */ 2125 case SO_SNDLOWAT: 2126 SOCKBUF_LOCK(&so->so_snd); 2127 so->so_snd.sb_lowat = 2128 (optval > so->so_snd.sb_hiwat) ? 2129 so->so_snd.sb_hiwat : optval; 2130 SOCKBUF_UNLOCK(&so->so_snd); 2131 break; 2132 case SO_RCVLOWAT: 2133 SOCKBUF_LOCK(&so->so_rcv); 2134 so->so_rcv.sb_lowat = 2135 (optval > so->so_rcv.sb_hiwat) ? 
2136 so->so_rcv.sb_hiwat : optval; 2137 SOCKBUF_UNLOCK(&so->so_rcv); 2138 break; 2139 } 2140 break; 2141 2142 case SO_SNDTIMEO: 2143 case SO_RCVTIMEO: 2144#ifdef COMPAT_IA32 2145 if (curthread->td_proc->p_sysent == &ia32_freebsd_sysvec) { 2146 struct timeval32 tv32; 2147 2148 error = sooptcopyin(sopt, &tv32, sizeof tv32, 2149 sizeof tv32); 2150 CP(tv32, tv, tv_sec); 2151 CP(tv32, tv, tv_usec); 2152 } else 2153#endif 2154 error = sooptcopyin(sopt, &tv, sizeof tv, 2155 sizeof tv); 2156 if (error) 2157 goto bad; 2158 2159 /* assert(hz > 0); */ 2160 if (tv.tv_sec < 0 || tv.tv_sec > INT_MAX / hz || 2161 tv.tv_usec < 0 || tv.tv_usec >= 1000000) { 2162 error = EDOM; 2163 goto bad; 2164 } 2165 /* assert(tick > 0); */ 2166 /* assert(ULONG_MAX - INT_MAX >= 1000000); */ 2167 val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick; 2168 if (val > INT_MAX) { 2169 error = EDOM; 2170 goto bad; 2171 } 2172 if (val == 0 && tv.tv_usec != 0) 2173 val = 1; 2174 2175 switch (sopt->sopt_name) { 2176 case SO_SNDTIMEO: 2177 so->so_snd.sb_timeo = val; 2178 break; 2179 case SO_RCVTIMEO: 2180 so->so_rcv.sb_timeo = val; 2181 break; 2182 } 2183 break; 2184 2185 case SO_LABEL: 2186#ifdef MAC 2187 error = sooptcopyin(sopt, &extmac, sizeof extmac, 2188 sizeof extmac); 2189 if (error) 2190 goto bad; 2191 error = mac_setsockopt_label(sopt->sopt_td->td_ucred, 2192 so, &extmac); 2193#else 2194 error = EOPNOTSUPP; 2195#endif 2196 break; 2197 2198 default: 2199 error = ENOPROTOOPT; 2200 break; 2201 } 2202 if (error == 0 && so->so_proto != NULL && 2203 so->so_proto->pr_ctloutput != NULL) { 2204 (void) ((*so->so_proto->pr_ctloutput) 2205 (so, sopt)); 2206 } 2207 } 2208bad: 2209 return (error); 2210} 2211 2212/* 2213 * Helper routine for getsockopt. 
2214 */ 2215int 2216sooptcopyout(struct sockopt *sopt, const void *buf, size_t len) 2217{ 2218 int error; 2219 size_t valsize; 2220 2221 error = 0; 2222 2223 /* 2224 * Documented get behavior is that we always return a value, possibly 2225 * truncated to fit in the user's buffer. Traditional behavior is 2226 * that we always tell the user precisely how much we copied, rather 2227 * than something useful like the total amount we had available for 2228 * her. Note that this interface is not idempotent; the entire 2229 * answer must generated ahead of time. 2230 */ 2231 valsize = min(len, sopt->sopt_valsize); 2232 sopt->sopt_valsize = valsize; 2233 if (sopt->sopt_val != NULL) { 2234 if (sopt->sopt_td != NULL) 2235 error = copyout(buf, sopt->sopt_val, valsize); 2236 else 2237 bcopy(buf, sopt->sopt_val, valsize); 2238 } 2239 return (error); 2240} 2241 2242int 2243sogetopt(so, sopt) 2244 struct socket *so; 2245 struct sockopt *sopt; 2246{ 2247 int error, optval; 2248 struct linger l; 2249 struct timeval tv; 2250#ifdef MAC 2251 struct mac extmac; 2252#endif 2253 2254 error = 0; 2255 if (sopt->sopt_level != SOL_SOCKET) { 2256 if (so->so_proto && so->so_proto->pr_ctloutput) { 2257 return ((*so->so_proto->pr_ctloutput) 2258 (so, sopt)); 2259 } else 2260 return (ENOPROTOOPT); 2261 } else { 2262 switch (sopt->sopt_name) { 2263#ifdef INET 2264 case SO_ACCEPTFILTER: 2265 error = do_getopt_accept_filter(so, sopt); 2266 break; 2267#endif 2268 case SO_LINGER: 2269 SOCK_LOCK(so); 2270 l.l_onoff = so->so_options & SO_LINGER; 2271 l.l_linger = so->so_linger; 2272 SOCK_UNLOCK(so); 2273 error = sooptcopyout(sopt, &l, sizeof l); 2274 break; 2275 2276 case SO_USELOOPBACK: 2277 case SO_DONTROUTE: 2278 case SO_DEBUG: 2279 case SO_KEEPALIVE: 2280 case SO_REUSEADDR: 2281 case SO_REUSEPORT: 2282 case SO_BROADCAST: 2283 case SO_OOBINLINE: 2284 case SO_ACCEPTCONN: 2285 case SO_TIMESTAMP: 2286 case SO_BINTIME: 2287 case SO_NOSIGPIPE: 2288 optval = so->so_options & sopt->sopt_name; 2289integer: 
2290 error = sooptcopyout(sopt, &optval, sizeof optval); 2291 break; 2292 2293 case SO_TYPE: 2294 optval = so->so_type; 2295 goto integer; 2296 2297 case SO_ERROR: 2298 SOCK_LOCK(so); 2299 optval = so->so_error; 2300 so->so_error = 0; 2301 SOCK_UNLOCK(so); 2302 goto integer; 2303 2304 case SO_SNDBUF: 2305 optval = so->so_snd.sb_hiwat; 2306 goto integer; 2307 2308 case SO_RCVBUF: 2309 optval = so->so_rcv.sb_hiwat; 2310 goto integer; 2311 2312 case SO_SNDLOWAT: 2313 optval = so->so_snd.sb_lowat; 2314 goto integer; 2315 2316 case SO_RCVLOWAT: 2317 optval = so->so_rcv.sb_lowat; 2318 goto integer; 2319 2320 case SO_SNDTIMEO: 2321 case SO_RCVTIMEO: 2322 optval = (sopt->sopt_name == SO_SNDTIMEO ? 2323 so->so_snd.sb_timeo : so->so_rcv.sb_timeo); 2324 2325 tv.tv_sec = optval / hz; 2326 tv.tv_usec = (optval % hz) * tick; 2327#ifdef COMPAT_IA32 2328 if (curthread->td_proc->p_sysent == &ia32_freebsd_sysvec) { 2329 struct timeval32 tv32; 2330 2331 CP(tv, tv32, tv_sec); 2332 CP(tv, tv32, tv_usec); 2333 error = sooptcopyout(sopt, &tv32, sizeof tv32); 2334 } else 2335#endif 2336 error = sooptcopyout(sopt, &tv, sizeof tv); 2337 break; 2338 2339 case SO_LABEL: 2340#ifdef MAC 2341 error = sooptcopyin(sopt, &extmac, sizeof(extmac), 2342 sizeof(extmac)); 2343 if (error) 2344 return (error); 2345 error = mac_getsockopt_label(sopt->sopt_td->td_ucred, 2346 so, &extmac); 2347 if (error) 2348 return (error); 2349 error = sooptcopyout(sopt, &extmac, sizeof extmac); 2350#else 2351 error = EOPNOTSUPP; 2352#endif 2353 break; 2354 2355 case SO_PEERLABEL: 2356#ifdef MAC 2357 error = sooptcopyin(sopt, &extmac, sizeof(extmac), 2358 sizeof(extmac)); 2359 if (error) 2360 return (error); 2361 error = mac_getsockopt_peerlabel( 2362 sopt->sopt_td->td_ucred, so, &extmac); 2363 if (error) 2364 return (error); 2365 error = sooptcopyout(sopt, &extmac, sizeof extmac); 2366#else 2367 error = EOPNOTSUPP; 2368#endif 2369 break; 2370 2371 case SO_LISTENQLIMIT: 2372 optval = so->so_qlimit; 2373 goto integer; 2374 
2375 case SO_LISTENQLEN: 2376 optval = so->so_qlen; 2377 goto integer; 2378 2379 case SO_LISTENINCQLEN: 2380 optval = so->so_incqlen; 2381 goto integer; 2382 2383 default: 2384 error = ENOPROTOOPT; 2385 break; 2386 } 2387 return (error); 2388 } 2389} 2390 2391/* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */ 2392int 2393soopt_getm(struct sockopt *sopt, struct mbuf **mp) 2394{ 2395 struct mbuf *m, *m_prev; 2396 int sopt_size = sopt->sopt_valsize; 2397 2398 MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA); 2399 if (m == NULL) 2400 return ENOBUFS; 2401 if (sopt_size > MLEN) { 2402 MCLGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT); 2403 if ((m->m_flags & M_EXT) == 0) { 2404 m_free(m); 2405 return ENOBUFS; 2406 } 2407 m->m_len = min(MCLBYTES, sopt_size); 2408 } else { 2409 m->m_len = min(MLEN, sopt_size); 2410 } 2411 sopt_size -= m->m_len; 2412 *mp = m; 2413 m_prev = m; 2414 2415 while (sopt_size) { 2416 MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA); 2417 if (m == NULL) { 2418 m_freem(*mp); 2419 return ENOBUFS; 2420 } 2421 if (sopt_size > MLEN) { 2422 MCLGET(m, sopt->sopt_td != NULL ? M_TRYWAIT : 2423 M_DONTWAIT); 2424 if ((m->m_flags & M_EXT) == 0) { 2425 m_freem(m); 2426 m_freem(*mp); 2427 return ENOBUFS; 2428 } 2429 m->m_len = min(MCLBYTES, sopt_size); 2430 } else { 2431 m->m_len = min(MLEN, sopt_size); 2432 } 2433 sopt_size -= m->m_len; 2434 m_prev->m_next = m; 2435 m_prev = m; 2436 } 2437 return (0); 2438} 2439 2440/* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. 
*/ 2441int 2442soopt_mcopyin(struct sockopt *sopt, struct mbuf *m) 2443{ 2444 struct mbuf *m0 = m; 2445 2446 if (sopt->sopt_val == NULL) 2447 return (0); 2448 while (m != NULL && sopt->sopt_valsize >= m->m_len) { 2449 if (sopt->sopt_td != NULL) { 2450 int error; 2451 2452 error = copyin(sopt->sopt_val, mtod(m, char *), 2453 m->m_len); 2454 if (error != 0) { 2455 m_freem(m0); 2456 return(error); 2457 } 2458 } else 2459 bcopy(sopt->sopt_val, mtod(m, char *), m->m_len); 2460 sopt->sopt_valsize -= m->m_len; 2461 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; 2462 m = m->m_next; 2463 } 2464 if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */ 2465 panic("ip6_sooptmcopyin"); 2466 return (0); 2467} 2468 2469/* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */ 2470int 2471soopt_mcopyout(struct sockopt *sopt, struct mbuf *m) 2472{ 2473 struct mbuf *m0 = m; 2474 size_t valsize = 0; 2475 2476 if (sopt->sopt_val == NULL) 2477 return (0); 2478 while (m != NULL && sopt->sopt_valsize >= m->m_len) { 2479 if (sopt->sopt_td != NULL) { 2480 int error; 2481 2482 error = copyout(mtod(m, char *), sopt->sopt_val, 2483 m->m_len); 2484 if (error != 0) { 2485 m_freem(m0); 2486 return(error); 2487 } 2488 } else 2489 bcopy(mtod(m, char *), sopt->sopt_val, m->m_len); 2490 sopt->sopt_valsize -= m->m_len; 2491 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; 2492 valsize += m->m_len; 2493 m = m->m_next; 2494 } 2495 if (m != NULL) { 2496 /* enough soopt buffer should be given from user-land */ 2497 m_freem(m0); 2498 return(EINVAL); 2499 } 2500 sopt->sopt_valsize = valsize; 2501 return (0); 2502} 2503 2504/* 2505 * sohasoutofband(): protocol notifies socket layer of the arrival of new 2506 * out-of-band data, which will then notify socket consumers. 
2507 */ 2508void 2509sohasoutofband(so) 2510 struct socket *so; 2511{ 2512 if (so->so_sigio != NULL) 2513 pgsigio(&so->so_sigio, SIGURG, 0); 2514 selwakeuppri(&so->so_rcv.sb_sel, PSOCK); 2515} 2516 2517int 2518sopoll(struct socket *so, int events, struct ucred *active_cred, 2519 struct thread *td) 2520{ 2521 2522 /* XXXRW: Temporary debugging. */ 2523 KASSERT(so->so_proto->pr_usrreqs->pru_sopoll != sopoll, 2524 ("sopoll: protocol calls sopoll")); 2525 2526 return (so->so_proto->pr_usrreqs->pru_sopoll(so, events, active_cred, 2527 td)); 2528} 2529 2530int 2531sopoll_generic(struct socket *so, int events, struct ucred *active_cred, 2532 struct thread *td) 2533{ 2534 int revents = 0; 2535 2536 SOCKBUF_LOCK(&so->so_snd); 2537 SOCKBUF_LOCK(&so->so_rcv); 2538 if (events & (POLLIN | POLLRDNORM)) 2539 if (soreadable(so)) 2540 revents |= events & (POLLIN | POLLRDNORM); 2541 2542 if (events & POLLINIGNEOF) 2543 if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat || 2544 !TAILQ_EMPTY(&so->so_comp) || so->so_error) 2545 revents |= POLLINIGNEOF; 2546 2547 if (events & (POLLOUT | POLLWRNORM)) 2548 if (sowriteable(so)) 2549 revents |= events & (POLLOUT | POLLWRNORM); 2550 2551 if (events & (POLLPRI | POLLRDBAND)) 2552 if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK)) 2553 revents |= events & (POLLPRI | POLLRDBAND); 2554 2555 if (revents == 0) { 2556 if (events & 2557 (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM | 2558 POLLRDBAND)) { 2559 selrecord(td, &so->so_rcv.sb_sel); 2560 so->so_rcv.sb_flags |= SB_SEL; 2561 } 2562 2563 if (events & (POLLOUT | POLLWRNORM)) { 2564 selrecord(td, &so->so_snd.sb_sel); 2565 so->so_snd.sb_flags |= SB_SEL; 2566 } 2567 } 2568 2569 SOCKBUF_UNLOCK(&so->so_rcv); 2570 SOCKBUF_UNLOCK(&so->so_snd); 2571 return (revents); 2572} 2573 2574int 2575soo_kqfilter(struct file *fp, struct knote *kn) 2576{ 2577 struct socket *so = kn->kn_fp->f_data; 2578 struct sockbuf *sb; 2579 2580 switch (kn->kn_filter) { 2581 case EVFILT_READ: 2582 if (so->so_options & 
SO_ACCEPTCONN) 2583 kn->kn_fop = &solisten_filtops; 2584 else 2585 kn->kn_fop = &soread_filtops; 2586 sb = &so->so_rcv; 2587 break; 2588 case EVFILT_WRITE: 2589 kn->kn_fop = &sowrite_filtops; 2590 sb = &so->so_snd; 2591 break; 2592 default: 2593 return (EINVAL); 2594 } 2595 2596 SOCKBUF_LOCK(sb); 2597 knlist_add(&sb->sb_sel.si_note, kn, 1); 2598 sb->sb_flags |= SB_KNOTE; 2599 SOCKBUF_UNLOCK(sb); 2600 return (0); 2601} 2602 2603static void 2604filt_sordetach(struct knote *kn) 2605{ 2606 struct socket *so = kn->kn_fp->f_data; 2607 2608 SOCKBUF_LOCK(&so->so_rcv); 2609 knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1); 2610 if (knlist_empty(&so->so_rcv.sb_sel.si_note)) 2611 so->so_rcv.sb_flags &= ~SB_KNOTE; 2612 SOCKBUF_UNLOCK(&so->so_rcv); 2613} 2614 2615/*ARGSUSED*/ 2616static int 2617filt_soread(struct knote *kn, long hint) 2618{ 2619 struct socket *so; 2620 2621 so = kn->kn_fp->f_data; 2622 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2623 2624 kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl; 2625 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 2626 kn->kn_flags |= EV_EOF; 2627 kn->kn_fflags = so->so_error; 2628 return (1); 2629 } else if (so->so_error) /* temporary udp error */ 2630 return (1); 2631 else if (kn->kn_sfflags & NOTE_LOWAT) 2632 return (kn->kn_data >= kn->kn_sdata); 2633 else 2634 return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat); 2635} 2636 2637static void 2638filt_sowdetach(struct knote *kn) 2639{ 2640 struct socket *so = kn->kn_fp->f_data; 2641 2642 SOCKBUF_LOCK(&so->so_snd); 2643 knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1); 2644 if (knlist_empty(&so->so_snd.sb_sel.si_note)) 2645 so->so_snd.sb_flags &= ~SB_KNOTE; 2646 SOCKBUF_UNLOCK(&so->so_snd); 2647} 2648 2649/*ARGSUSED*/ 2650static int 2651filt_sowrite(struct knote *kn, long hint) 2652{ 2653 struct socket *so; 2654 2655 so = kn->kn_fp->f_data; 2656 SOCKBUF_LOCK_ASSERT(&so->so_snd); 2657 kn->kn_data = sbspace(&so->so_snd); 2658 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 2659 kn->kn_flags |= EV_EOF; 
2660 kn->kn_fflags = so->so_error; 2661 return (1); 2662 } else if (so->so_error) /* temporary udp error */ 2663 return (1); 2664 else if (((so->so_state & SS_ISCONNECTED) == 0) && 2665 (so->so_proto->pr_flags & PR_CONNREQUIRED)) 2666 return (0); 2667 else if (kn->kn_sfflags & NOTE_LOWAT) 2668 return (kn->kn_data >= kn->kn_sdata); 2669 else 2670 return (kn->kn_data >= so->so_snd.sb_lowat); 2671} 2672 2673/*ARGSUSED*/ 2674static int 2675filt_solisten(struct knote *kn, long hint) 2676{ 2677 struct socket *so = kn->kn_fp->f_data; 2678 2679 kn->kn_data = so->so_qlen; 2680 return (! TAILQ_EMPTY(&so->so_comp)); 2681} 2682 2683int 2684socheckuid(struct socket *so, uid_t uid) 2685{ 2686 2687 if (so == NULL) 2688 return (EPERM); 2689 if (so->so_cred->cr_uid != uid) 2690 return (EPERM); 2691 return (0); 2692} 2693 2694static int 2695somaxconn_sysctl(SYSCTL_HANDLER_ARGS) 2696{ 2697 int error; 2698 int val; 2699 2700 val = somaxconn; 2701 error = sysctl_handle_int(oidp, &val, sizeof(int), req); 2702 if (error || !req->newptr ) 2703 return (error); 2704 2705 if (val < 1 || val > USHRT_MAX) 2706 return (EINVAL); 2707 2708 somaxconn = val; 2709 return (0); 2710} 2711