uipc_socket.c revision 243994
1/*- 2 * Copyright (c) 1982, 1986, 1988, 1990, 1993 3 * The Regents of the University of California. 4 * Copyright (c) 2004 The FreeBSD Foundation 5 * Copyright (c) 2004-2008 Robert N. M. Watson 6 * All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 4. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 * 32 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94 33 */ 34 35/* 36 * Comments on the socket life cycle: 37 * 38 * soalloc() sets of socket layer state for a socket, called only by 39 * socreate() and sonewconn(). Socket layer private. 
40 * 41 * sodealloc() tears down socket layer state for a socket, called only by 42 * sofree() and sonewconn(). Socket layer private. 43 * 44 * pru_attach() associates protocol layer state with an allocated socket; 45 * called only once, may fail, aborting socket allocation. This is called 46 * from socreate() and sonewconn(). Socket layer private. 47 * 48 * pru_detach() disassociates protocol layer state from an attached socket, 49 * and will be called exactly once for sockets in which pru_attach() has 50 * been successfully called. If pru_attach() returned an error, 51 * pru_detach() will not be called. Socket layer private. 52 * 53 * pru_abort() and pru_close() notify the protocol layer that the last 54 * consumer of a socket is starting to tear down the socket, and that the 55 * protocol should terminate the connection. Historically, pru_abort() also 56 * detached protocol state from the socket state, but this is no longer the 57 * case. 58 * 59 * socreate() creates a socket and attaches protocol state. This is a public 60 * interface that may be used by socket layer consumers to create new 61 * sockets. 62 * 63 * sonewconn() creates a socket and attaches protocol state. This is a 64 * public interface that may be used by protocols to create new sockets when 65 * a new connection is received and will be available for accept() on a 66 * listen socket. 67 * 68 * soclose() destroys a socket after possibly waiting for it to disconnect. 69 * This is a public interface that socket consumers should use to close and 70 * release a socket when done with it. 71 * 72 * soabort() destroys a socket without waiting for it to disconnect (used 73 * only for incoming connections that are already partially or fully 74 * connected). 
This is used internally by the socket layer when clearing 75 * listen socket queues (due to overflow or close on the listen socket), but 76 * is also a public interface protocols may use to abort connections in 77 * their incomplete listen queues should they no longer be required. Sockets 78 * placed in completed connection listen queues should not be aborted for 79 * reasons described in the comment above the soclose() implementation. This 80 * is not a general purpose close routine, and except in the specific 81 * circumstances described here, should not be used. 82 * 83 * sofree() will free a socket and its protocol state if all references on 84 * the socket have been released, and is the public interface to attempt to 85 * free a socket when a reference is removed. This is a socket layer private 86 * interface. 87 * 88 * NOTE: In addition to socreate() and soclose(), which provide a single 89 * socket reference to the consumer to be managed as required, there are two 90 * calls to explicitly manage socket references, soref(), and sorele(). 91 * Currently, these are generally required only when transitioning a socket 92 * from a listen queue to a file descriptor, in order to prevent garbage 93 * collection of the socket at an untimely moment. For a number of reasons, 94 * these interfaces are not preferred, and should be avoided. 95 * 96 * NOTE: With regard to VNETs the general rule is that callers do not set 97 * curvnet. Exceptions to this rule include soabort(), sodisconnect(), 98 * sofree() (and with that sorele(), sotryfree()), as well as sonewconn() 99 * and sorflush(), which are usually called from a pre-set VNET context. 100 * sopoll() currently does not need a VNET context to be set. 
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/uipc_socket.c 243994 2012-12-07 22:15:51Z pjd $");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_zero.h"
#include "opt_compat.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mac.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/domain.h>
#include <sys/file.h>			/* for struct knote */
#include <sys/kernel.h>
#include <sys/event.h>
#include <sys/eventhandler.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/resourcevar.h>
#include <net/route.h>
#include <sys/signalvar.h>
#include <sys/stat.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <sys/jail.h>
#include <sys/syslog.h>

#include <net/vnet.h>

#include <security/mac/mac_framework.h>

#include <vm/uma.h>

#ifdef COMPAT_FREEBSD32
#include <sys/mount.h>
#include <sys/sysent.h>
#include <compat/freebsd32/freebsd32.h>
#endif

static int	soreceive_rcvoob(struct socket *so, struct uio *uio,
		    int flags);

/* kqueue filter callbacks, defined later in this file. */
static void	filt_sordetach(struct knote *kn);
static int	filt_soread(struct knote *kn, long hint);
static void	filt_sowdetach(struct knote *kn);
static int	filt_sowrite(struct knote *kn, long hint);
static int	filt_solisten(struct knote *kn, long hint);

/*
 * Filter operation tables for the three socket kqueue filters: incoming
 * connections on a listen socket, readability, and writability.  Note that
 * the listen filter shares filt_sordetach() with the read filter, as both
 * attach their knotes to the receive buffer's note list.
 */
static struct filterops solisten_filtops = {
	.f_isfd = 1,
	.f_detach = filt_sordetach,
	.f_event = filt_solisten,
};
static struct filterops soread_filtops = {
	.f_isfd = 1,
	.f_detach = filt_sordetach,
	.f_event = filt_soread,
};
static struct filterops sowrite_filtops = {
	.f_isfd = 1,
	.f_detach = filt_sowdetach,
	.f_event = filt_sowrite,
};

so_gen_t	so_gencnt;	/* generation count for sockets */

MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

/* Assert that a VNET context is set; fires only on VIMAGE kernels. */
#define	VNET_SO_ASSERT(so)						\
	VNET_ASSERT(curvnet != NULL,					\
	    ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so)));

/*
 * Limit on the number of connections in the listen queue waiting
 * for accept(2).
 * NB: The original sysctl somaxconn is still available but hidden
 * to prevent confusion about the actual purpose of this number.
 */
static int somaxconn = SOMAXCONN;

/*
 * Sysctl handler for kern.ipc.soacceptqueue (and the hidden compat name
 * kern.ipc.somaxconn).  Accepts values in [1, USHRT_MAX]; rejects anything
 * else with EINVAL.
 */
static int
sysctl_somaxconn(SYSCTL_HANDLER_ARGS)
{
	int error;
	int val;

	val = somaxconn;
	error = sysctl_handle_int(oidp, &val, 0, req);
	/* Read-only access, or copy-in/out failed: nothing to update. */
	if (error || !req->newptr)
		return (error);

	if (val < 1 || val > USHRT_MAX)
		return (EINVAL);

	somaxconn = val;
	return (0);
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, soacceptqueue, CTLTYPE_UINT | CTLFLAG_RW,
    0, sizeof(int), sysctl_somaxconn, "I",
    "Maximum listen socket pending connection accept queue size");
SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_SKIP,
    0, sizeof(int), sysctl_somaxconn, "I",
    "Maximum listen socket pending connection accept queue size (compat)");

static int numopensockets;
SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
    &numopensockets, 0, "Number of open sockets");

#if defined(SOCKET_SEND_COW) || defined(SOCKET_RECV_PFLIP)
SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0,
    "Zero copy controls");
#ifdef SOCKET_RECV_PFLIP
int so_zero_copy_receive = 1;
SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW,
    &so_zero_copy_receive, 0, "Enable zero copy receive");
#endif
#ifdef SOCKET_SEND_COW
int so_zero_copy_send = 1;
SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW,
    &so_zero_copy_send, 0, "Enable zero copy send");
#endif /* SOCKET_SEND_COW */
#endif /* SOCKET_SEND_COW || SOCKET_RECV_PFLIP */

/*
 * accept_mtx locks down per-socket fields relating to accept queues.  See
 * socketvar.h for an annotation of the protected fields of struct socket.
 */
struct mtx accept_mtx;
MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);

/*
 * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
 * so_gencnt field.
 */
static struct mtx so_global_mtx;
MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF);

/*
 * General IPC sysctl name space, used by sockets and a variety of other IPC
 * types.
 */
SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");

/*
 * Initialize the socket subsystem and set up the socket
 * memory allocator.
 */
static uma_zone_t socket_zone;
int	maxsockets;

/*
 * Eventhandler callback: re-apply the (possibly updated) maxsockets limit
 * to the socket zone whenever the tunable is changed at run time.
 */
static void
socket_zone_change(void *tag)
{

	maxsockets = uma_zone_set_max(socket_zone, maxsockets);
}

/*
 * SYSINIT hook: create the UMA zone backing struct socket allocations,
 * apply the maxsockets cap, and register for future cap changes.
 */
static void
socket_init(void *tag)
{

	socket_zone = uma_zcreate("socket", sizeof(struct socket), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	maxsockets = uma_zone_set_max(socket_zone, maxsockets);
	EVENTHANDLER_REGISTER(maxsockets_change, socket_zone_change, NULL,
	    EVENTHANDLER_PRI_FIRST);
}
SYSINIT(socket, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_init, NULL);

/*
 * Initialise maxsockets.  This SYSINIT must be run after
 * tunable_mbinit().
 */
static void
init_maxsockets(void *ignored)
{

	TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
	/* Never allow fewer sockets than open files. */
	maxsockets = imax(maxsockets, maxfiles);
}
SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);

/*
 * Sysctl to get and set the maximum global sockets limit.  Notify protocols
 * of the change so that they can update their dependent limits as required.
300 */ 301static int 302sysctl_maxsockets(SYSCTL_HANDLER_ARGS) 303{ 304 int error, newmaxsockets; 305 306 newmaxsockets = maxsockets; 307 error = sysctl_handle_int(oidp, &newmaxsockets, 0, req); 308 if (error == 0 && req->newptr) { 309 if (newmaxsockets > maxsockets && 310 newmaxsockets <= maxfiles) { 311 maxsockets = newmaxsockets; 312 EVENTHANDLER_INVOKE(maxsockets_change); 313 } else 314 error = EINVAL; 315 } 316 return (error); 317} 318SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT|CTLFLAG_RW, 319 &maxsockets, 0, sysctl_maxsockets, "IU", 320 "Maximum number of sockets avaliable"); 321 322/* 323 * Socket operation routines. These routines are called by the routines in 324 * sys_socket.c or from a system process, and implement the semantics of 325 * socket operations by switching out to the protocol specific routines. 326 */ 327 328/* 329 * Get a socket structure from our zone, and initialize it. Note that it 330 * would probably be better to allocate socket and PCB at the same time, but 331 * I'm not convinced that all the protocols can be easily modified to do 332 * this. 333 * 334 * soalloc() returns a socket with a ref count of 0. 
 */
static struct socket *
soalloc(struct vnet *vnet)
{
	struct socket *so;

	/* M_NOWAIT: allocation may fail; every caller handles NULL. */
	so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO);
	if (so == NULL)
		return (NULL);
#ifdef MAC
	if (mac_socket_init(so, M_NOWAIT) != 0) {
		uma_zfree(socket_zone, so);
		return (NULL);
	}
#endif
	SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
	SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
	sx_init(&so->so_snd.sb_sx, "so_snd_sx");
	sx_init(&so->so_rcv.sb_sx, "so_rcv_sx");
	TAILQ_INIT(&so->so_aiojobq);
	/* so_global_mtx protects so_gencnt and numopensockets. */
	mtx_lock(&so_global_mtx);
	so->so_gencnt = ++so_gencnt;
	++numopensockets;
#ifdef VIMAGE
	VNET_ASSERT(vnet != NULL, ("%s:%d vnet is NULL, so=%p",
	    __func__, __LINE__, so));
	vnet->vnet_sockcnt++;
	so->so_vnet = vnet;
#endif
	mtx_unlock(&so_global_mtx);
	return (so);
}

/*
 * Free the storage associated with a socket at the socket layer, tear down
 * locks, labels, etc.  All protocol state is assumed already to have been
 * torn down (and possibly never set up) by the caller.
 */
static void
sodealloc(struct socket *so)
{

	KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
	KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL"));

	mtx_lock(&so_global_mtx);
	so->so_gencnt = ++so_gencnt;
	--numopensockets;	/* Could be below, but faster here. */
#ifdef VIMAGE
	VNET_ASSERT(so->so_vnet != NULL, ("%s:%d so_vnet is NULL, so=%p",
	    __func__, __LINE__, so));
	so->so_vnet->vnet_sockcnt--;
#endif
	mtx_unlock(&so_global_mtx);
	/* Return any accounted socket buffer space to the owning uid. */
	if (so->so_rcv.sb_hiwat)
		(void)chgsbsize(so->so_cred->cr_uidinfo,
		    &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
	if (so->so_snd.sb_hiwat)
		(void)chgsbsize(so->so_cred->cr_uidinfo,
		    &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
#ifdef INET
	/* Remove accept filter if one is present. */
	if (so->so_accf != NULL)
		do_setopt_accept_filter(so, NULL);
#endif
#ifdef MAC
	mac_socket_destroy(so);
#endif
	crfree(so->so_cred);
	sx_destroy(&so->so_snd.sb_sx);
	sx_destroy(&so->so_rcv.sb_sx);
	SOCKBUF_LOCK_DESTROY(&so->so_snd);
	SOCKBUF_LOCK_DESTROY(&so->so_rcv);
	uma_zfree(socket_zone, so);
}

/*
 * socreate returns a socket with a ref count of 1.  The socket should be
 * closed with soclose().
 */
int
socreate(int dom, struct socket **aso, int type, int proto,
    struct ucred *cred, struct thread *td)
{
	struct protosw *prp;
	struct socket *so;
	int error;

	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);

	if (prp == NULL) {
		/* No support for domain. */
		if (pffinddomain(dom) == NULL)
			return (EAFNOSUPPORT);
		/* No support for socket type. */
		if (proto == 0 && type != 0)
			return (EPROTOTYPE);
		return (EPROTONOSUPPORT);
	}
	if (prp->pr_usrreqs->pru_attach == NULL ||
	    prp->pr_usrreqs->pru_attach == pru_attach_notsupp)
		return (EPROTONOSUPPORT);

	/* Jails may restrict which address families may be used. */
	if (prison_check_af(cred, prp->pr_domain->dom_family) != 0)
		return (EPROTONOSUPPORT);

	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = soalloc(CRED_TO_VNET(cred));
	if (so == NULL)
		return (ENOBUFS);

	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = type;
	so->so_cred = crhold(cred);
	/* Inherit the process FIB only for families that support FIBs. */
	if ((prp->pr_domain->dom_family == PF_INET) ||
	    (prp->pr_domain->dom_family == PF_INET6) ||
	    (prp->pr_domain->dom_family == PF_ROUTE))
		so->so_fibnum = td->td_proc->p_fibnum;
	else
		so->so_fibnum = 0;
	so->so_proto = prp;
#ifdef MAC
	mac_socket_create(cred, so);
#endif
	knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv));
	knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd));
	so->so_count = 1;
	/*
	 * Auto-sizing of socket buffers is managed by the protocols and
	 * the appropriate flags must be set in the pru_attach function.
	 */
	CURVNET_SET(so->so_vnet);
	error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
	CURVNET_RESTORE();
	if (error) {
		/* Attach failed: drop our sole reference and free. */
		KASSERT(so->so_count == 1, ("socreate: so_count %d",
		    so->so_count));
		so->so_count = 0;
		sodealloc(so);
		return (error);
	}
	*aso = so;
	return (0);
}

#ifdef REGRESSION
static int regression_sonewconn_earlytest = 1;
SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW,
    &regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test");
#endif

/*
 * When an attempt at a new connection is noted on a socket which accepts
 * connections, sonewconn is called.  If the connection is possible (subject
 * to space constraints, etc.) then we allocate a new structure, properly
 * linked into the data structure of the original socket, and return this.
 * Connstatus may be 0, or SO_ISCONFIRMING, or SO_ISCONNECTED.
 *
 * Note: the ref count on the socket is 0 on return.
499 */ 500struct socket * 501sonewconn(struct socket *head, int connstatus) 502{ 503 struct socket *so; 504 int over; 505 506 ACCEPT_LOCK(); 507 over = (head->so_qlen > 3 * head->so_qlimit / 2); 508 ACCEPT_UNLOCK(); 509#ifdef REGRESSION 510 if (regression_sonewconn_earlytest && over) { 511#else 512 if (over) { 513#endif 514 log(LOG_DEBUG, "%s: pcb %p: Listen queue overflow: " 515 "%i already in queue awaiting acceptance\n", 516 __func__, head->so_pcb, over); 517 return (NULL); 518 } 519 VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p", 520 __func__, __LINE__, head)); 521 so = soalloc(head->so_vnet); 522 if (so == NULL) { 523 log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: " 524 "limit reached or out of memory\n", 525 __func__, head->so_pcb); 526 return (NULL); 527 } 528 if ((head->so_options & SO_ACCEPTFILTER) != 0) 529 connstatus = 0; 530 so->so_head = head; 531 so->so_type = head->so_type; 532 so->so_options = head->so_options &~ SO_ACCEPTCONN; 533 so->so_linger = head->so_linger; 534 so->so_state = head->so_state | SS_NOFDREF; 535 so->so_fibnum = head->so_fibnum; 536 so->so_proto = head->so_proto; 537 so->so_cred = crhold(head->so_cred); 538#ifdef MAC 539 mac_socket_newconn(head, so); 540#endif 541 knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv)); 542 knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd)); 543 VNET_SO_ASSERT(head); 544 if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) { 545 sodealloc(so); 546 log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n", 547 __func__, head->so_pcb); 548 return (NULL); 549 } 550 if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) { 551 sodealloc(so); 552 log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n", 553 __func__, head->so_pcb); 554 return (NULL); 555 } 556 so->so_rcv.sb_lowat = head->so_rcv.sb_lowat; 557 so->so_snd.sb_lowat = head->so_snd.sb_lowat; 558 so->so_rcv.sb_timeo = head->so_rcv.sb_timeo; 559 so->so_snd.sb_timeo = 
head->so_snd.sb_timeo; 560 so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE; 561 so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE; 562 so->so_state |= connstatus; 563 ACCEPT_LOCK(); 564 /* 565 * The accept socket may be tearing down but we just 566 * won a race on the ACCEPT_LOCK. 567 */ 568 if (!(head->so_options & SO_ACCEPTCONN)) { 569 SOCK_LOCK(so); 570 so->so_head = NULL; 571 sofree(so); /* NB: returns ACCEPT_UNLOCK'ed. */ 572 return (NULL); 573 } 574 if (connstatus) { 575 TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); 576 so->so_qstate |= SQ_COMP; 577 head->so_qlen++; 578 } else { 579 /* 580 * Keep removing sockets from the head until there's room for 581 * us to insert on the tail. In pre-locking revisions, this 582 * was a simple if(), but as we could be racing with other 583 * threads and soabort() requires dropping locks, we must 584 * loop waiting for the condition to be true. 585 */ 586 while (head->so_incqlen > head->so_qlimit) { 587 struct socket *sp; 588 sp = TAILQ_FIRST(&head->so_incomp); 589 TAILQ_REMOVE(&head->so_incomp, sp, so_list); 590 head->so_incqlen--; 591 sp->so_qstate &= ~SQ_INCOMP; 592 sp->so_head = NULL; 593 ACCEPT_UNLOCK(); 594 soabort(sp); 595 ACCEPT_LOCK(); 596 } 597 TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list); 598 so->so_qstate |= SQ_INCOMP; 599 head->so_incqlen++; 600 } 601 ACCEPT_UNLOCK(); 602 if (connstatus) { 603 sorwakeup(head); 604 wakeup_one(&head->so_timeo); 605 } 606 return (so); 607} 608 609int 610sobind(struct socket *so, struct sockaddr *nam, struct thread *td) 611{ 612 int error; 613 614 CURVNET_SET(so->so_vnet); 615 error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td); 616 CURVNET_RESTORE(); 617 return error; 618} 619 620/* 621 * solisten() transitions a socket from a non-listening state to a listening 622 * state, but can also be used to update the listen queue depth on an 623 * existing listen socket. 
The protocol will call back into the sockets 624 * layer using solisten_proto_check() and solisten_proto() to check and set 625 * socket-layer listen state. Call backs are used so that the protocol can 626 * acquire both protocol and socket layer locks in whatever order is required 627 * by the protocol. 628 * 629 * Protocol implementors are advised to hold the socket lock across the 630 * socket-layer test and set to avoid races at the socket layer. 631 */ 632int 633solisten(struct socket *so, int backlog, struct thread *td) 634{ 635 int error; 636 637 CURVNET_SET(so->so_vnet); 638 error = (*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td); 639 CURVNET_RESTORE(); 640 return error; 641} 642 643int 644solisten_proto_check(struct socket *so) 645{ 646 647 SOCK_LOCK_ASSERT(so); 648 649 if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | 650 SS_ISDISCONNECTING)) 651 return (EINVAL); 652 return (0); 653} 654 655void 656solisten_proto(struct socket *so, int backlog) 657{ 658 659 SOCK_LOCK_ASSERT(so); 660 661 if (backlog < 0 || backlog > somaxconn) 662 backlog = somaxconn; 663 so->so_qlimit = backlog; 664 so->so_options |= SO_ACCEPTCONN; 665} 666 667/* 668 * Evaluate the reference count and named references on a socket; if no 669 * references remain, free it. This should be called whenever a reference is 670 * released, such as in sorele(), but also when named reference flags are 671 * cleared in socket or protocol code. 672 * 673 * sofree() will free the socket if: 674 * 675 * - There are no outstanding file descriptor references or related consumers 676 * (so_count == 0). 677 * 678 * - The socket has been closed by user space, if ever open (SS_NOFDREF). 679 * 680 * - The protocol does not have an outstanding strong reference on the socket 681 * (SS_PROTOREF). 682 * 683 * - The socket is not in a completed connection queue, so a process has been 684 * notified that it is present. 
 * If it is removed, the user process may
 * block in accept() despite select() saying the socket was ready.
 */
void
sofree(struct socket *so)
{
	struct protosw *pr = so->so_proto;
	struct socket *head;

	ACCEPT_LOCK_ASSERT();
	SOCK_LOCK_ASSERT(so);

	/* Bail if any of the conditions listed above still holds. */
	if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 ||
	    (so->so_state & SS_PROTOREF) || (so->so_qstate & SQ_COMP)) {
		SOCK_UNLOCK(so);
		ACCEPT_UNLOCK();
		return;
	}

	/* Unlink from the listen socket's incomplete queue, if queued. */
	head = so->so_head;
	if (head != NULL) {
		KASSERT((so->so_qstate & SQ_COMP) != 0 ||
		    (so->so_qstate & SQ_INCOMP) != 0,
		    ("sofree: so_head != NULL, but neither SQ_COMP nor "
		    "SQ_INCOMP"));
		KASSERT((so->so_qstate & SQ_COMP) == 0 ||
		    (so->so_qstate & SQ_INCOMP) == 0,
		    ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
		TAILQ_REMOVE(&head->so_incomp, so, so_list);
		head->so_incqlen--;
		so->so_qstate &= ~SQ_INCOMP;
		so->so_head = NULL;
	}
	KASSERT((so->so_qstate & SQ_COMP) == 0 &&
	    (so->so_qstate & SQ_INCOMP) == 0,
	    ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
	    so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
	if (so->so_options & SO_ACCEPTCONN) {
		KASSERT((TAILQ_EMPTY(&so->so_comp)),
		    ("sofree: so_comp populated"));
		KASSERT((TAILQ_EMPTY(&so->so_incomp)),
		    ("sofree: so_incomp populated"));
	}
	SOCK_UNLOCK(so);
	ACCEPT_UNLOCK();

	VNET_SO_ASSERT(so);
	/* Dispose of queued rights (e.g. in-flight fds) before detach. */
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
		(*pr->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
	if (pr->pr_usrreqs->pru_detach != NULL)
		(*pr->pr_usrreqs->pru_detach)(so);

	/*
	 * From this point on, we assume that no other references to this
	 * socket exist anywhere else in the stack.  Therefore, no locks need
	 * to be acquired or held.
	 *
	 * We used to do a lot of socket buffer and socket locking here, as
	 * well as invoke sorflush() and perform wakeups.
	 * The direct call to
	 * dom_dispose() and sbrelease_internal() are an inlining of what was
	 * necessary from sorflush().
	 *
	 * Notice that the socket buffer and kqueue state are torn down
	 * before calling pru_detach.  This means that protocols should not
	 * assume they can perform socket wakeups, etc, in their detach code.
	 */
	sbdestroy(&so->so_snd, so);
	sbdestroy(&so->so_rcv, so);
	seldrain(&so->so_snd.sb_sel);
	seldrain(&so->so_rcv.sb_sel);
	knlist_destroy(&so->so_rcv.sb_sel.si_note);
	knlist_destroy(&so->so_snd.sb_sel.si_note);
	sodealloc(so);
}

/*
 * Close a socket on last file table reference removal.  Initiate disconnect
 * if connected.  Free socket when disconnect complete.
 *
 * This function will sorele() the socket.  Note that soclose() may be called
 * prior to the ref count reaching zero.  The actual socket structure will
 * not be freed until the ref count reaches zero.
 */
int
soclose(struct socket *so)
{
	int error = 0;

	KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));

	CURVNET_SET(so->so_vnet);
	funsetown(&so->so_sigio);
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error) {
				/* Already disconnected is not an error here. */
				if (error == ENOTCONN)
					error = 0;
				goto drop;
			}
		}
		if (so->so_options & SO_LINGER) {
			/* Non-blocking socket: don't wait for the drain. */
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO))
				goto drop;
			/* Wait up to so_linger seconds for disconnect. */
			while (so->so_state & SS_ISCONNECTED) {
				error = tsleep(&so->so_timeo,
				    PSOCK | PCATCH, "soclos",
				    so->so_linger * hz);
				if (error)
					break;
			}
		}
	}

drop:
	if (so->so_proto->pr_usrreqs->pru_close != NULL)
		(*so->so_proto->pr_usrreqs->pru_close)(so);
	ACCEPT_LOCK();
	if (so->so_options & SO_ACCEPTCONN) {
		struct socket *sp;
		/*
		 * Prevent new additions to the accept queues due
		 * to ACCEPT_LOCK races while we are draining
them. 808 */ 809 so->so_options &= ~SO_ACCEPTCONN; 810 while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) { 811 TAILQ_REMOVE(&so->so_incomp, sp, so_list); 812 so->so_incqlen--; 813 sp->so_qstate &= ~SQ_INCOMP; 814 sp->so_head = NULL; 815 ACCEPT_UNLOCK(); 816 soabort(sp); 817 ACCEPT_LOCK(); 818 } 819 while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) { 820 TAILQ_REMOVE(&so->so_comp, sp, so_list); 821 so->so_qlen--; 822 sp->so_qstate &= ~SQ_COMP; 823 sp->so_head = NULL; 824 ACCEPT_UNLOCK(); 825 soabort(sp); 826 ACCEPT_LOCK(); 827 } 828 KASSERT((TAILQ_EMPTY(&so->so_comp)), 829 ("%s: so_comp populated", __func__)); 830 KASSERT((TAILQ_EMPTY(&so->so_incomp)), 831 ("%s: so_incomp populated", __func__)); 832 } 833 SOCK_LOCK(so); 834 KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF")); 835 so->so_state |= SS_NOFDREF; 836 sorele(so); /* NB: Returns with ACCEPT_UNLOCK(). */ 837 CURVNET_RESTORE(); 838 return (error); 839} 840 841/* 842 * soabort() is used to abruptly tear down a connection, such as when a 843 * resource limit is reached (listen queue depth exceeded), or if a listen 844 * socket is closed while there are sockets waiting to be accepted. 845 * 846 * This interface is tricky, because it is called on an unreferenced socket, 847 * and must be called only by a thread that has actually removed the socket 848 * from the listen queue it was on, or races with other threads are risked. 849 * 850 * This interface will call into the protocol code, so must not be called 851 * with any socket locks held. Protocols do call it while holding their own 852 * recursible protocol mutexes, but this is something that should be subject 853 * to review in the future. 854 */ 855void 856soabort(struct socket *so) 857{ 858 859 /* 860 * In as much as is possible, assert that no references to this 861 * socket are held. This is not quite the same as asserting that the 862 * current thread is responsible for arranging for no references, but 863 * is as close as we can get for now. 
864 */ 865 KASSERT(so->so_count == 0, ("soabort: so_count")); 866 KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF")); 867 KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF")); 868 KASSERT((so->so_state & SQ_COMP) == 0, ("soabort: SQ_COMP")); 869 KASSERT((so->so_state & SQ_INCOMP) == 0, ("soabort: SQ_INCOMP")); 870 VNET_SO_ASSERT(so); 871 872 if (so->so_proto->pr_usrreqs->pru_abort != NULL) 873 (*so->so_proto->pr_usrreqs->pru_abort)(so); 874 ACCEPT_LOCK(); 875 SOCK_LOCK(so); 876 sofree(so); 877} 878 879int 880soaccept(struct socket *so, struct sockaddr **nam) 881{ 882 int error; 883 884 SOCK_LOCK(so); 885 KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF")); 886 so->so_state &= ~SS_NOFDREF; 887 SOCK_UNLOCK(so); 888 889 CURVNET_SET(so->so_vnet); 890 error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam); 891 CURVNET_RESTORE(); 892 return (error); 893} 894 895int 896soconnect(struct socket *so, struct sockaddr *nam, struct thread *td) 897{ 898 int error; 899 900 if (so->so_options & SO_ACCEPTCONN) 901 return (EOPNOTSUPP); 902 903 CURVNET_SET(so->so_vnet); 904 /* 905 * If protocol is connection-based, can only connect once. 906 * Otherwise, if connected, try to disconnect first. This allows 907 * user to disconnect by connecting to, e.g., a null address. 908 */ 909 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) && 910 ((so->so_proto->pr_flags & PR_CONNREQUIRED) || 911 (error = sodisconnect(so)))) { 912 error = EISCONN; 913 } else { 914 /* 915 * Prevent accumulated error from previous connection from 916 * biting us. 
		 */
		so->so_error = 0;
		error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td);
	}
	CURVNET_RESTORE();

	return (error);
}

/*
 * soconnect2() joins two sockets directly, as for socketpair(2).
 */
int
soconnect2(struct socket *so1, struct socket *so2)
{
	int error;

	CURVNET_SET(so1->so_vnet);
	error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
	CURVNET_RESTORE();
	return (error);
}

/*
 * sodisconnect() initiates a disconnect via the protocol.  NOTE: unlike most
 * public entry points here, it is called with the VNET already set (see the
 * VNET comment at the top of the file), hence only VNET_SO_ASSERT().
 */
int
sodisconnect(struct socket *so)
{
	int error;

	if ((so->so_state & SS_ISCONNECTED) == 0)
		return (ENOTCONN);
	if (so->so_state & SS_ISDISCONNECTING)
		return (EALREADY);
	VNET_SO_ASSERT(so);
	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
	return (error);
}

#ifdef SOCKET_SEND_COW
/* Counters for the zero-copy send path, exported below for inspection. */
struct so_zerocopy_stats{
	int size_ok;
	int align_ok;
	int found_ifp;
};
struct so_zerocopy_stats so_zerocp_stats = {0,0,0};

/*
 * sosend_copyin() is only used if zero copy sockets are enabled.  Otherwise
 * sosend_dgram() and sosend_generic() use m_uiotombuf().
 *
 * sosend_copyin() accepts a uio and prepares an mbuf chain holding part or
 * all of the data referenced by the uio.  If desired, it uses zero-copy.
 * *space will be updated to reflect data copied in.
 *
 * NB: If atomic I/O is requested, the caller must already have checked that
 * space can hold resid bytes.
 *
 * NB: In the event of an error, the caller may need to free the partial
 * chain pointed to by *mpp.  The contents of both *uio and *space may be
 * modified even in the case of an error.
 */
static int
sosend_copyin(struct uio *uio, struct mbuf **retmp, int atomic, long *space,
    int flags)
{
	struct mbuf *m, **mp, *top;
	long len;
	ssize_t resid;
	int error;
	int cow_send;

	*retmp = top = NULL;
	mp = &top;
	len = 0;
	resid = uio->uio_resid;
	error = 0;
	do {
		cow_send = 0;
		if (resid >= MINCLSIZE) {
			/* Large copy: use a cluster, or zero-copy a page. */
			if (top == NULL) {
				m = m_gethdr(M_WAITOK, MT_DATA);
				m->m_pkthdr.len = 0;
				m->m_pkthdr.rcvif = NULL;
			} else
				m = m_get(M_WAITOK, MT_DATA);
			if (so_zero_copy_send &&
			    resid >= PAGE_SIZE &&
			    *space >= PAGE_SIZE &&
			    uio->uio_iov->iov_len >= PAGE_SIZE) {
				so_zerocp_stats.size_ok++;
				so_zerocp_stats.align_ok++;
				/*
				 * socow_setup() maps the user page into the
				 * mbuf; returns the number of bytes wired
				 * (0 means fall back to the copy path).
				 */
				cow_send = socow_setup(m, uio);
				len = cow_send;
			}
			if (!cow_send) {
				m_clget(m, M_WAITOK);
				len = min(min(MCLBYTES, resid), *space);
			}
		} else {
			if (top == NULL) {
				m = m_gethdr(M_WAITOK, MT_DATA);
				m->m_pkthdr.len = 0;
				m->m_pkthdr.rcvif = NULL;

				len = min(min(MHLEN, resid), *space);
				/*
				 * For datagram protocols, leave room
				 * for protocol headers in first mbuf.
				 */
				if (atomic && m && len < MHLEN)
					MH_ALIGN(m, len);
			} else {
				m = m_get(M_WAITOK, MT_DATA);
				len = min(min(MLEN, resid), *space);
			}
		}
		if (m == NULL) {
			error = ENOBUFS;
			goto out;
		}

		*space -= len;
		if (cow_send)
			error = 0;
		else
			error = uiomove(mtod(m, void *), (int)len, uio);
		resid = uio->uio_resid;
		m->m_len = len;
		*mp = m;
		top->m_pkthdr.len += len;
		if (error)
			goto out;
		mp = &m->m_next;
		if (resid <= 0) {
			if (flags & MSG_EOR)
				top->m_flags |= M_EOR;
			break;
		}
	} while (*space > 0 && atomic);
out:
	*retmp = top;
	return (error);
}
#endif /* SOCKET_SEND_COW */

/* Translate MSG_DONTWAIT into a non-sleeping sblock() request. */
#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)

/*
 * Datagram-optimized version of sosend(): the socket must be SOCK_DGRAM
 * with PR_ATOMIC (both asserted), so the message either fits in the send
 * buffer in one shot or fails with EMSGSIZE — no sblock()/retry loop is
 * needed.  Frees 'top' and 'control' on all paths.
 */
int
sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
	long space;
	ssize_t resid;
	int clen = 0, error, dontroute;
#ifdef SOCKET_SEND_COW
	int atomic = sosendallatonce(so) || top;
#endif

	KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM"));
	KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
	    ("sosend_dgram: !PR_ATOMIC"));

	if (uio != NULL)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned.  However, space must be
	 * signed, as it might be less than 0 if we over-committed, and we
	 * must use a signed comparison of space and resid.  On the other
	 * hand, a negative resid causes us to loop sending 0-length
	 * segments to the protocol.
	 */
	if (resid < 0) {
		error = EINVAL;
		goto out;
	}

	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0;
	if (td != NULL)
		td->td_ru.ru_msgsnd++;
	if (control != NULL)
		clen = control->m_len;

	SOCKBUF_LOCK(&so->so_snd);
	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
		SOCKBUF_UNLOCK(&so->so_snd);
		error = EPIPE;
		goto out;
	}
	if (so->so_error) {
		error = so->so_error;
		so->so_error = 0;
		SOCKBUF_UNLOCK(&so->so_snd);
		goto out;
	}
	if ((so->so_state & SS_ISCONNECTED) == 0) {
		/*
		 * `sendto' and `sendmsg' is allowed on a connection-based
		 * socket if it supports implied connect.  Return ENOTCONN if
		 * not connected and no address is supplied.
		 */
		if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
		    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
			if ((so->so_state & SS_ISCONFIRMING) == 0 &&
			    !(resid == 0 && clen != 0)) {
				SOCKBUF_UNLOCK(&so->so_snd);
				error = ENOTCONN;
				goto out;
			}
		} else if (addr == NULL) {
			if (so->so_proto->pr_flags & PR_CONNREQUIRED)
				error = ENOTCONN;
			else
				error = EDESTADDRREQ;
			SOCKBUF_UNLOCK(&so->so_snd);
			goto out;
		}
	}

	/*
	 * Do we need MSG_OOB support in SOCK_DGRAM?  Signs here may be a
	 * problem and need fixing.
	 */
	space = sbspace(&so->so_snd);
	if (flags & MSG_OOB)
		space += 1024;
	space -= clen;
	SOCKBUF_UNLOCK(&so->so_snd);
	if (resid > space) {
		error = EMSGSIZE;
		goto out;
	}
	if (uio == NULL) {
		resid = 0;
		if (flags & MSG_EOR)
			top->m_flags |= M_EOR;
	} else {
#ifdef SOCKET_SEND_COW
		error = sosend_copyin(uio, &top, atomic, &space, flags);
		if (error)
			goto out;
#else
		/*
		 * Copy the data from userland into a mbuf chain.
		 * If no data is to be copied in, a single empty mbuf
		 * is returned.
		 */
		top = m_uiotombuf(uio, M_WAITOK, space, max_hdr,
		    (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0)));
		if (top == NULL) {
			error = EFAULT;	/* only possible error */
			goto out;
		}
		space -= resid - uio->uio_resid;
#endif /* SOCKET_SEND_COW */
		resid = uio->uio_resid;
	}
	KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
	/*
	 * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock
	 * than with.
	 */
	if (dontroute) {
		SOCK_LOCK(so);
		so->so_options |= SO_DONTROUTE;
		SOCK_UNLOCK(so);
	}
	/*
	 * XXX all the SBS_CANTSENDMORE checks previously done could be out
	 * of date.  We could have recieved a reset packet in an interrupt or
	 * maybe we slept while doing page faults in uiomove() etc.  We could
	 * probably recheck again inside the locking protection here, but
	 * there are probably other places that this also happens.  We must
	 * rethink this.
	 */
	VNET_SO_ASSERT(so);
	error = (*so->so_proto->pr_usrreqs->pru_send)(so,
	    (flags & MSG_OOB) ? PRUS_OOB :
	/*
	 * If the user set MSG_EOF, the protocol understands this flag and
	 * nothing left to send then use PRU_SEND_EOF instead of PRU_SEND.
	 */
	    ((flags & MSG_EOF) &&
	     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
	     (resid <= 0)) ?
		PRUS_EOF :
		/* If there is more to send set PRUS_MORETOCOME */
		(resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
	    top, addr, control, td);
	if (dontroute) {
		SOCK_LOCK(so);
		so->so_options &= ~SO_DONTROUTE;
		SOCK_UNLOCK(so);
	}
	/* Ownership of top/control has passed to the protocol. */
	clen = 0;
	control = NULL;
	top = NULL;
out:
	if (top != NULL)
		m_freem(top);
	if (control != NULL)
		m_freem(control);
	return (error);
}

/*
 * Send on a socket.  If send must go all at once and message is larger than
 * send buffering, then hard error.  Lock against other senders.  If must go
 * all at once and not enough room now, then inform user that this would
 * block and do nothing.  Otherwise, if nonblocking, send as much as
 * possible.  The data to be sent is described by "uio" if nonzero, otherwise
 * by the mbuf chain "top" (which must be null if uio is not).  Data provided
 * in mbuf chain must be small enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers must check for short
 * counts if EINTR/ERESTART are returned.  Data and control buffers are freed
 * on return.
 */
int
sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
	long space;
	ssize_t resid;
	int clen = 0, error, dontroute;
	int atomic = sosendallatonce(so) || top;

	if (uio != NULL)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned.  However, space must be
	 * signed, as it might be less than 0 if we over-committed, and we
	 * must use a signed comparison of space and resid.  On the other
	 * hand, a negative resid causes us to loop sending 0-length
	 * segments to the protocol.
	 *
	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
	 * type sockets since that's an error.
	 */
	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
		error = EINVAL;
		goto out;
	}

	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	if (td != NULL)
		td->td_ru.ru_msgsnd++;
	if (control != NULL)
		clen = control->m_len;

	/* Serialize against other senders for the duration of the send. */
	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out;

restart:
	do {
		SOCKBUF_LOCK(&so->so_snd);
		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = EPIPE;
			goto release;
		}
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			SOCKBUF_UNLOCK(&so->so_snd);
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			/*
			 * `sendto' and `sendmsg' is allowed on a connection-
			 * based socket if it supports implied connect.
			 * Return ENOTCONN if not connected and no address is
			 * supplied.
			 */
			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
				    !(resid == 0 && clen != 0)) {
					SOCKBUF_UNLOCK(&so->so_snd);
					error = ENOTCONN;
					goto release;
				}
			} else if (addr == NULL) {
				SOCKBUF_UNLOCK(&so->so_snd);
				if (so->so_proto->pr_flags & PR_CONNREQUIRED)
					error = ENOTCONN;
				else
					error = EDESTADDRREQ;
				goto release;
			}
		}
		space = sbspace(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if ((atomic && resid > so->so_snd.sb_hiwat) ||
		    clen > so->so_snd.sb_hiwat) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = EMSGSIZE;
			goto release;
		}
		if (space < resid + clen &&
		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
			/* Not enough room: fail or wait for space. */
			if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) {
				SOCKBUF_UNLOCK(&so->so_snd);
				error = EWOULDBLOCK;
				goto release;
			}
			error = sbwait(&so->so_snd);
			SOCKBUF_UNLOCK(&so->so_snd);
			if (error)
				goto release;
			goto restart;
		}
		SOCKBUF_UNLOCK(&so->so_snd);
		space -= clen;
		do {
			if (uio == NULL) {
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else {
#ifdef SOCKET_SEND_COW
				error = sosend_copyin(uio, &top, atomic,
				    &space, flags);
				if (error != 0)
					goto release;
#else
				/*
				 * Copy the data from userland into a mbuf
				 * chain.  If no data is to be copied in,
				 * a single empty mbuf is returned.
				 */
				top = m_uiotombuf(uio, M_WAITOK, space,
				    (atomic ? max_hdr : 0),
				    (atomic ? M_PKTHDR : 0) |
				    ((flags & MSG_EOR) ? M_EOR : 0));
				if (top == NULL) {
					error = EFAULT; /* only possible error */
					goto release;
				}
				space -= resid - uio->uio_resid;
#endif /* SOCKET_SEND_COW */
				resid = uio->uio_resid;
			}
			if (dontroute) {
				SOCK_LOCK(so);
				so->so_options |= SO_DONTROUTE;
				SOCK_UNLOCK(so);
			}
			/*
			 * XXX all the SBS_CANTSENDMORE checks previously
			 * done could be out of date.  We could have recieved
			 * a reset packet in an interrupt or maybe we slept
			 * while doing page faults in uiomove() etc.  We
			 * could probably recheck again inside the locking
			 * protection here, but there are probably other
			 * places that this also happens.  We must rethink
			 * this.
			 */
			VNET_SO_ASSERT(so);
			error = (*so->so_proto->pr_usrreqs->pru_send)(so,
			    (flags & MSG_OOB) ? PRUS_OOB :
			/*
			 * If the user set MSG_EOF, the protocol understands
			 * this flag and nothing left to send then use
			 * PRU_SEND_EOF instead of PRU_SEND.
			 */
			    ((flags & MSG_EOF) &&
			     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
			     (resid <= 0)) ?
				PRUS_EOF :
			/* If there is more to send set PRUS_MORETOCOME. */
			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
			    top, addr, control, td);
			if (dontroute) {
				SOCK_LOCK(so);
				so->so_options &= ~SO_DONTROUTE;
				SOCK_UNLOCK(so);
			}
			/* The protocol now owns top/control. */
			clen = 0;
			control = NULL;
			top = NULL;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	sbunlock(&so->so_snd);
out:
	if (top != NULL)
		m_freem(top);
	if (control != NULL)
		m_freem(control);
	return (error);
}

/*
 * Generic send entry point: dispatches to the protocol's pru_sosend
 * implementation within the socket's vnet.
 */
int
sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
	int error;

	CURVNET_SET(so->so_vnet);
	error = so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top,
	    control, flags, td);
	CURVNET_RESTORE();
	return (error);
}

/*
 * The part of soreceive() that implements reading non-inline out-of-band
 * data from a socket.  For more complete comments, see soreceive(), from
 * which this code originated.
 *
 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
 * unable to return an mbuf chain to the caller.
 */
static int
soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
{
	struct protosw *pr = so->so_proto;
	struct mbuf *m;
	int error;

	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
	VNET_SO_ASSERT(so);

	/* Ask the protocol to fill 'm' with the pending OOB byte(s). */
	m = m_get(M_WAITOK, MT_DATA);
	error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
	if (error)
		goto bad;
	do {
#ifdef SOCKET_RECV_PFLIP
		if (so_zero_copy_receive) {
			int disposable;

			if ((m->m_flags & M_EXT)
			    && (m->m_ext.ext_type == EXT_DISPOSABLE))
				disposable = 1;
			else
				disposable = 0;

			error = uiomoveco(mtod(m, void *),
			    min(uio->uio_resid, m->m_len), uio, disposable);
		} else
#endif /* SOCKET_RECV_PFLIP */
			error = uiomove(mtod(m, void *),
			    (int) min(uio->uio_resid, m->m_len), uio);
		m = m_free(m);
	} while (uio->uio_resid && error == 0 && m);
bad:
	if (m != NULL)
		m_freem(m);
	return (error);
}

/*
 * Following replacement or removal of the first mbuf on the first mbuf chain
 * of a socket buffer, push necessary state changes back into the socket
 * buffer so that other consumers see the values consistently.  'nextrecord'
 * is the callers locally stored value of the original value of
 * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
 * NOTE: 'nextrecord' may be NULL.
 */
static __inline void
sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
{

	SOCKBUF_LOCK_ASSERT(sb);
	/*
	 * First, update for the new value of nextrecord.  If necessary, make
	 * it the first record.
	 */
	if (sb->sb_mb != NULL)
		sb->sb_mb->m_nextpkt = nextrecord;
	else
		sb->sb_mb = nextrecord;

	/*
	 * Now update any dependent socket buffer fields to reflect the new
	 * state.  This is an expanded inline of SB_EMPTY_FIXUP(), with the
	 * addition of a second clause that takes care of the case where
	 * sb_mb has been updated, but remains the last record.
	 */
	if (sb->sb_mb == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (sb->sb_mb->m_nextpkt == NULL)
		sb->sb_lastrecord = sb->sb_mb;
}

/*
 * Implement receive operations on a socket.  We depend on the way that
 * records are added to the sockbuf by sbappend.  In particular, each record
 * (mbufs linked through m_next) must begin with an address if the protocol
 * so specifies, followed by an optional mbuf or mbufs containing ancillary
 * data, and then zero or more mbufs of data.  In order to allow parallelism
 * between network receive and copying to user space, as well as avoid
 * sleeping with a mutex held, we release the socket buffer mutex during the
 * user space copy.  Although the sockbuf is locked, new data may still be
 * appended, and thus we must maintain consistency of the sockbuf during that
 * time.
 *
 * The caller may receive the data as a single mbuf chain by supplying an
 * mbuf **mp0 for use in returning the chain.  The uio is then used only for
 * the count in uio_resid.
 */
int
soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	struct mbuf *m, **mp;
	int flags, error, offset;
	ssize_t len;
	struct protosw *pr = so->so_proto;
	struct mbuf *nextrecord;
	int moff, type = 0;
	ssize_t orig_resid = uio->uio_resid;

	mp = mp0;
	if (psa != NULL)
		*psa = NULL;
	if (controlp != NULL)
		*controlp = NULL;
	if (flagsp != NULL)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB)
		return (soreceive_rcvoob(so, uio, flags));
	if (mp != NULL)
		*mp = NULL;
	if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING)
	    && uio->uio_resid) {
		VNET_SO_ASSERT(so);
		(*pr->pr_usrreqs->pru_rcvd)(so, 0);
	}

	/* Serialize against other readers for the whole operation. */
	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
	if (error)
		return (error);

restart:
	SOCKBUF_LOCK(&so->so_rcv);
	m = so->so_rcv.sb_mb;
	/*
	 * If we have less data than requested, block awaiting more (subject
	 * to any timeout) if:
	 *   1. the current count is less than the low water mark, or
	 *   2. MSG_DONTWAIT is not set
	 */
	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < uio->uio_resid) &&
	    so->so_rcv.sb_cc < so->so_rcv.sb_lowat &&
	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
		KASSERT(m != NULL || !so->so_rcv.sb_cc,
		    ("receive: m == %p so->so_rcv.sb_cc == %u",
		    m, so->so_rcv.sb_cc));
		if (so->so_error) {
			if (m != NULL)
				goto dontblock;
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			SOCKBUF_UNLOCK(&so->so_rcv);
			goto release;
		}
		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
		if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
			if (m == NULL) {
				SOCKBUF_UNLOCK(&so->so_rcv);
				goto release;
			} else
				goto dontblock;
		}
		/* A record boundary or OOB data satisfies the read at once. */
		for (; m != NULL; m = m->m_next)
			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			SOCKBUF_UNLOCK(&so->so_rcv);
			error = ENOTCONN;
			goto release;
		}
		if (uio->uio_resid == 0) {
			SOCKBUF_UNLOCK(&so->so_rcv);
			goto release;
		}
		if ((so->so_state & SS_NBIO) ||
		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
			SOCKBUF_UNLOCK(&so->so_rcv);
			error = EWOULDBLOCK;
			goto release;
		}
		SBLASTRECORDCHK(&so->so_rcv);
		SBLASTMBUFCHK(&so->so_rcv);
		error = sbwait(&so->so_rcv);
		SOCKBUF_UNLOCK(&so->so_rcv);
		if (error)
			goto release;
		goto restart;
	}
dontblock:
	/*
	 * From this point onward, we maintain 'nextrecord' as a cache of the
	 * pointer to the next record in the socket buffer.  We must keep the
	 * various socket buffer pointers and local stack versions of the
	 * pointers in sync, pushing out modifications before dropping the
	 * socket buffer mutex, and re-reading them when picking it up.
	 *
	 * Otherwise, we will race with the network stack appending new data
	 * or records onto the socket buffer by using inconsistent/stale
	 * versions of the field, possibly resulting in socket buffer
	 * corruption.
	 *
	 * By holding the high-level sblock(), we prevent simultaneous
	 * readers from pulling off the front of the socket buffer.
	 */
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	if (uio->uio_td)
		uio->uio_td->td_ru.ru_msgrcv++;
	KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
	SBLASTRECORDCHK(&so->so_rcv);
	SBLASTMBUFCHK(&so->so_rcv);
	nextrecord = m->m_nextpkt;
	if (pr->pr_flags & PR_ADDR) {
		/* PR_ADDR records lead with an MT_SONAME address mbuf. */
		KASSERT(m->m_type == MT_SONAME,
		    ("m->m_type == %d", m->m_type));
		orig_resid = 0;
		if (psa != NULL)
			*psa = sodupsockaddr(mtod(m, struct sockaddr *),
			    M_NOWAIT);
		if (flags & MSG_PEEK) {
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			so->so_rcv.sb_mb = m_free(m);
			m = so->so_rcv.sb_mb;
			sockbuf_pushsync(&so->so_rcv, nextrecord);
		}
	}

	/*
	 * Process one or more MT_CONTROL mbufs present before any data mbufs
	 * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
	 * just copy the data; if !MSG_PEEK, we call into the protocol to
	 * perform externalization (or freeing if controlp == NULL).
	 */
	if (m != NULL && m->m_type == MT_CONTROL) {
		struct mbuf *cm = NULL, *cmn;
		struct mbuf **cme = &cm;

		do {
			if (flags & MSG_PEEK) {
				if (controlp != NULL) {
					*controlp = m_copy(m, 0, m->m_len);
					controlp = &(*controlp)->m_next;
				}
				m = m->m_next;
			} else {
				sbfree(&so->so_rcv, m);
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = NULL;
				*cme = m;
				cme = &(*cme)->m_next;
				m = so->so_rcv.sb_mb;
			}
		} while (m != NULL && m->m_type == MT_CONTROL);
		if ((flags & MSG_PEEK) == 0)
			sockbuf_pushsync(&so->so_rcv, nextrecord);
		while (cm != NULL) {
			cmn = cm->m_next;
			cm->m_next = NULL;
			if (pr->pr_domain->dom_externalize != NULL) {
				/* Drop the sockbuf lock; may sleep. */
				SOCKBUF_UNLOCK(&so->so_rcv);
				VNET_SO_ASSERT(so);
				error = (*pr->pr_domain->dom_externalize)
				    (cm, controlp);
				SOCKBUF_LOCK(&so->so_rcv);
			} else if (controlp != NULL)
				*controlp = cm;
			else
				m_freem(cm);
			if (controlp != NULL) {
				orig_resid = 0;
				while (*controlp != NULL)
					controlp = &(*controlp)->m_next;
			}
			cm = cmn;
		}
		if (m != NULL)
			nextrecord = so->so_rcv.sb_mb->m_nextpkt;
		else
			nextrecord = so->so_rcv.sb_mb;
		orig_resid = 0;
	}
	if (m != NULL) {
		if ((flags & MSG_PEEK) == 0) {
			KASSERT(m->m_nextpkt == nextrecord,
			    ("soreceive: post-control, nextrecord !sync"));
			if (nextrecord == NULL) {
				KASSERT(so->so_rcv.sb_mb == m,
				    ("soreceive: post-control, sb_mb!=m"));
				KASSERT(so->so_rcv.sb_lastrecord == m,
				    ("soreceive: post-control, lastrecord!=m"));
			}
		}
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
	} else {
		if ((flags & MSG_PEEK) == 0) {
			KASSERT(so->so_rcv.sb_mb == nextrecord,
			    ("soreceive: sb_mb != nextrecord"));
			if (so->so_rcv.sb_mb == NULL) {
				KASSERT(so->so_rcv.sb_lastrecord == NULL,
				    ("soreceive: sb_lastercord != NULL"));
			}
		}
	}
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	SBLASTRECORDCHK(&so->so_rcv);
	SBLASTMBUFCHK(&so->so_rcv);

	/*
	 * Now continue to read any data mbufs off of the head of the socket
	 * buffer until the read request is satisfied.  Note that 'type' is
	 * used to store the type of any mbuf reads that have happened so far
	 * such that soreceive() can stop reading if the type changes, which
	 * causes soreceive() to return only one of regular data and inline
	 * out-of-band data in a single socket receive operation.
	 */
	moff = 0;
	offset = 0;
	while (m != NULL && uio->uio_resid > 0 && error == 0) {
		/*
		 * If the type of mbuf has changed since the last mbuf
		 * examined ('type'), end the receive operation.
		 */
		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
		if (m->m_type == MT_OOBDATA || m->m_type == MT_CONTROL) {
			if (type != m->m_type)
				break;
		} else if (type == MT_OOBDATA)
			break;
		else
			KASSERT(m->m_type == MT_DATA,
			    ("m->m_type == %d", m->m_type));
		so->so_rcv.sb_state &= ~SBS_RCVATMARK;
		len = uio->uio_resid;
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * If mp is set, just pass back the mbufs.  Otherwise copy
		 * them out via the uio, then free.  Sockbuf must be
		 * consistent here (points to current mbuf, it points to next
		 * record) when we drop priority; we must note any additions
		 * to the sockbuf when we block interrupts again.
		 */
		if (mp == NULL) {
			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
			SBLASTRECORDCHK(&so->so_rcv);
			SBLASTMBUFCHK(&so->so_rcv);
			/* uiomove() may fault; drop the sockbuf lock. */
			SOCKBUF_UNLOCK(&so->so_rcv);
#ifdef SOCKET_RECV_PFLIP
			if (so_zero_copy_receive) {
				int disposable;

				if ((m->m_flags & M_EXT)
				    && (m->m_ext.ext_type == EXT_DISPOSABLE))
					disposable = 1;
				else
					disposable = 0;

				error = uiomoveco(mtod(m, char *) + moff,
				    (int)len, uio, disposable);
			} else
#endif /* SOCKET_RECV_PFLIP */
				error = uiomove(mtod(m, char *) + moff, (int)len, uio);
			SOCKBUF_LOCK(&so->so_rcv);
			if (error) {
				/*
				 * The MT_SONAME mbuf has already been removed
				 * from the record, so it is necessary to
				 * remove the data mbufs, if any, to preserve
				 * the invariant in the case of PR_ADDR that
				 * requires MT_SONAME mbufs at the head of
				 * each record.
				 */
				if (m && pr->pr_flags & PR_ATOMIC &&
				    ((flags & MSG_PEEK) == 0))
					(void)sbdroprecord_locked(&so->so_rcv);
				SOCKBUF_UNLOCK(&so->so_rcv);
				goto release;
			}
		} else
			uio->uio_resid -= len;
		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
		if (len == m->m_len - moff) {
			/* The whole mbuf was consumed. */
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(&so->so_rcv, m);
				if (mp != NULL) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = NULL;
				} else {
					so->so_rcv.sb_mb = m_free(m);
					m = so->so_rcv.sb_mb;
				}
				sockbuf_pushsync(&so->so_rcv, nextrecord);
				SBLASTRECORDCHK(&so->so_rcv);
				SBLASTMBUFCHK(&so->so_rcv);
			}
		} else {
			/* Partial mbuf consumed. */
			if (flags & MSG_PEEK)
				moff += len;
			else {
				if (mp != NULL) {
					int copy_flag;

					if (flags & MSG_DONTWAIT)
						copy_flag = M_NOWAIT;
					else
						copy_flag = M_WAIT;
					if (copy_flag == M_WAITOK)
						SOCKBUF_UNLOCK(&so->so_rcv);
					*mp = m_copym(m, 0, len, copy_flag);
					if (copy_flag == M_WAITOK)
						SOCKBUF_LOCK(&so->so_rcv);
					if (*mp == NULL) {
						/*
						 * m_copym() couldn't
						 * allocate an mbuf.  Adjust
						 * uio_resid back (it was
						 * adjusted down by len
						 * bytes, which we didn't end
						 * up "copying" over).
						 */
						uio->uio_resid += len;
						break;
					}
				}
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
			}
		}
		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_rcv.sb_state |= SBS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket), we
		 * must not quit until "uio->uio_resid == 0" or an error
		 * termination.  If a signal/timeout occurs, return with a
		 * short count but without error.  Keep sockbuf locked
		 * against other readers.
		 */
		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
		    !sosendallatonce(so) && nextrecord == NULL) {
			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
			if (so->so_error ||
			    so->so_rcv.sb_state & SBS_CANTRCVMORE)
				break;
			/*
			 * Notify the protocol that some data has been
			 * drained before blocking.
			 */
			if (pr->pr_flags & PR_WANTRCVD) {
				SOCKBUF_UNLOCK(&so->so_rcv);
				VNET_SO_ASSERT(so);
				(*pr->pr_usrreqs->pru_rcvd)(so, flags);
				SOCKBUF_LOCK(&so->so_rcv);
			}
			SBLASTRECORDCHK(&so->so_rcv);
			SBLASTMBUFCHK(&so->so_rcv);
			/*
			 * We could receive some data while was notifying
			 * the protocol.  Skip blocking in this case.
			 */
			if (so->so_rcv.sb_mb == NULL) {
				error = sbwait(&so->so_rcv);
				if (error) {
					SOCKBUF_UNLOCK(&so->so_rcv);
					goto release;
				}
			}
			m = so->so_rcv.sb_mb;
			if (m != NULL)
				nextrecord = m->m_nextpkt;
		}
	}

	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	if (m != NULL && pr->pr_flags & PR_ATOMIC) {
		/* Atomic protocols discard the unread remainder. */
		flags |= MSG_TRUNC;
		if ((flags & MSG_PEEK) == 0)
			(void) sbdroprecord_locked(&so->so_rcv);
	}
	if ((flags & MSG_PEEK) == 0) {
		if (m == NULL) {
			/*
			 * First part is an inline SB_EMPTY_FIXUP().  Second
			 * part makes sure sb_lastrecord is up-to-date if
			 * there is still data in the socket buffer.
			 */
			so->so_rcv.sb_mb = nextrecord;
			if (so->so_rcv.sb_mb == NULL) {
				so->so_rcv.sb_mbtail = NULL;
				so->so_rcv.sb_lastrecord = NULL;
			} else if (nextrecord->m_nextpkt == NULL)
				so->so_rcv.sb_lastrecord = nextrecord;
		}
		SBLASTRECORDCHK(&so->so_rcv);
		SBLASTMBUFCHK(&so->so_rcv);
		/*
		 * If soreceive() is being done from the socket callback,
		 * then don't need to generate ACK to peer to update window,
		 * since ACK will be generated on return to TCP.
		 */
		if (!(flags & MSG_SOCALLBCK) &&
		    (pr->pr_flags & PR_WANTRCVD)) {
			SOCKBUF_UNLOCK(&so->so_rcv);
			VNET_SO_ASSERT(so);
			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
			SOCKBUF_LOCK(&so->so_rcv);
		}
	}
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	if (orig_resid == uio->uio_resid && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
		SOCKBUF_UNLOCK(&so->so_rcv);
		goto restart;
	}
	SOCKBUF_UNLOCK(&so->so_rcv);

	if (flagsp != NULL)
		*flagsp |= flags;
release:
	sbunlock(&so->so_rcv);
	return (error);
}

/*
 * Optimized version of soreceive() for stream (TCP) sockets.
 * XXXAO: (MSG_WAITALL | MSG_PEEK) isn't properly handled.
1990 */ 1991int 1992soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio, 1993 struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 1994{ 1995 int len = 0, error = 0, flags, oresid; 1996 struct sockbuf *sb; 1997 struct mbuf *m, *n = NULL; 1998 1999 /* We only do stream sockets. */ 2000 if (so->so_type != SOCK_STREAM) 2001 return (EINVAL); 2002 if (psa != NULL) 2003 *psa = NULL; 2004 if (controlp != NULL) 2005 return (EINVAL); 2006 if (flagsp != NULL) 2007 flags = *flagsp &~ MSG_EOR; 2008 else 2009 flags = 0; 2010 if (flags & MSG_OOB) 2011 return (soreceive_rcvoob(so, uio, flags)); 2012 if (mp0 != NULL) 2013 *mp0 = NULL; 2014 2015 sb = &so->so_rcv; 2016 2017 /* Prevent other readers from entering the socket. */ 2018 error = sblock(sb, SBLOCKWAIT(flags)); 2019 if (error) 2020 goto out; 2021 SOCKBUF_LOCK(sb); 2022 2023 /* Easy one, no space to copyout anything. */ 2024 if (uio->uio_resid == 0) { 2025 error = EINVAL; 2026 goto out; 2027 } 2028 oresid = uio->uio_resid; 2029 2030 /* We will never ever get anything unless we are or were connected. */ 2031 if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) { 2032 error = ENOTCONN; 2033 goto out; 2034 } 2035 2036restart: 2037 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2038 2039 /* Abort if socket has reported problems. */ 2040 if (so->so_error) { 2041 if (sb->sb_cc > 0) 2042 goto deliver; 2043 if (oresid > uio->uio_resid) 2044 goto out; 2045 error = so->so_error; 2046 if (!(flags & MSG_PEEK)) 2047 so->so_error = 0; 2048 goto out; 2049 } 2050 2051 /* Door is closed. Deliver what is left, if any. */ 2052 if (sb->sb_state & SBS_CANTRCVMORE) { 2053 if (sb->sb_cc > 0) 2054 goto deliver; 2055 else 2056 goto out; 2057 } 2058 2059 /* Socket buffer is empty and we shall not block. */ 2060 if (sb->sb_cc == 0 && 2061 ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) { 2062 error = EAGAIN; 2063 goto out; 2064 } 2065 2066 /* Socket buffer got some data that we shall deliver now. 
*/ 2067 if (sb->sb_cc > 0 && !(flags & MSG_WAITALL) && 2068 ((sb->sb_flags & SS_NBIO) || 2069 (flags & (MSG_DONTWAIT|MSG_NBIO)) || 2070 sb->sb_cc >= sb->sb_lowat || 2071 sb->sb_cc >= uio->uio_resid || 2072 sb->sb_cc >= sb->sb_hiwat) ) { 2073 goto deliver; 2074 } 2075 2076 /* On MSG_WAITALL we must wait until all data or error arrives. */ 2077 if ((flags & MSG_WAITALL) && 2078 (sb->sb_cc >= uio->uio_resid || sb->sb_cc >= sb->sb_hiwat)) 2079 goto deliver; 2080 2081 /* 2082 * Wait and block until (more) data comes in. 2083 * NB: Drops the sockbuf lock during wait. 2084 */ 2085 error = sbwait(sb); 2086 if (error) 2087 goto out; 2088 goto restart; 2089 2090deliver: 2091 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2092 KASSERT(sb->sb_cc > 0, ("%s: sockbuf empty", __func__)); 2093 KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__)); 2094 2095 /* Statistics. */ 2096 if (uio->uio_td) 2097 uio->uio_td->td_ru.ru_msgrcv++; 2098 2099 /* Fill uio until full or current end of socket buffer is reached. */ 2100 len = min(uio->uio_resid, sb->sb_cc); 2101 if (mp0 != NULL) { 2102 /* Dequeue as many mbufs as possible. */ 2103 if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) { 2104 if (*mp0 == NULL) 2105 *mp0 = sb->sb_mb; 2106 else 2107 m_cat(*mp0, sb->sb_mb); 2108 for (m = sb->sb_mb; 2109 m != NULL && m->m_len <= len; 2110 m = m->m_next) { 2111 len -= m->m_len; 2112 uio->uio_resid -= m->m_len; 2113 sbfree(sb, m); 2114 n = m; 2115 } 2116 n->m_next = NULL; 2117 sb->sb_mb = m; 2118 sb->sb_lastrecord = sb->sb_mb; 2119 if (sb->sb_mb == NULL) 2120 SB_EMPTY_FIXUP(sb); 2121 } 2122 /* Copy the remainder. */ 2123 if (len > 0) { 2124 KASSERT(sb->sb_mb != NULL, 2125 ("%s: len > 0 && sb->sb_mb empty", __func__)); 2126 2127 m = m_copym(sb->sb_mb, 0, len, M_NOWAIT); 2128 if (m == NULL) 2129 len = 0; /* Don't flush data from sockbuf. 
*/ 2130 else 2131 uio->uio_resid -= len; 2132 if (*mp0 != NULL) 2133 m_cat(*mp0, m); 2134 else 2135 *mp0 = m; 2136 if (*mp0 == NULL) { 2137 error = ENOBUFS; 2138 goto out; 2139 } 2140 } 2141 } else { 2142 /* NB: Must unlock socket buffer as uiomove may sleep. */ 2143 SOCKBUF_UNLOCK(sb); 2144 error = m_mbuftouio(uio, sb->sb_mb, len); 2145 SOCKBUF_LOCK(sb); 2146 if (error) 2147 goto out; 2148 } 2149 SBLASTRECORDCHK(sb); 2150 SBLASTMBUFCHK(sb); 2151 2152 /* 2153 * Remove the delivered data from the socket buffer unless we 2154 * were only peeking. 2155 */ 2156 if (!(flags & MSG_PEEK)) { 2157 if (len > 0) 2158 sbdrop_locked(sb, len); 2159 2160 /* Notify protocol that we drained some data. */ 2161 if ((so->so_proto->pr_flags & PR_WANTRCVD) && 2162 (((flags & MSG_WAITALL) && uio->uio_resid > 0) || 2163 !(flags & MSG_SOCALLBCK))) { 2164 SOCKBUF_UNLOCK(sb); 2165 VNET_SO_ASSERT(so); 2166 (*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags); 2167 SOCKBUF_LOCK(sb); 2168 } 2169 } 2170 2171 /* 2172 * For MSG_WAITALL we may have to loop again and wait for 2173 * more data to come in. 2174 */ 2175 if ((flags & MSG_WAITALL) && uio->uio_resid > 0) 2176 goto restart; 2177out: 2178 SOCKBUF_LOCK_ASSERT(sb); 2179 SBLASTRECORDCHK(sb); 2180 SBLASTMBUFCHK(sb); 2181 SOCKBUF_UNLOCK(sb); 2182 sbunlock(sb); 2183 return (error); 2184} 2185 2186/* 2187 * Optimized version of soreceive() for simple datagram cases from userspace. 2188 * Unlike in the stream case, we're able to drop a datagram if copyout() 2189 * fails, and because we handle datagrams atomically, we don't need to use a 2190 * sleep lock to prevent I/O interlacing. 
 */
int
soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	struct mbuf *m, *m2;
	int flags, error;
	ssize_t len;
	struct protosw *pr = so->so_proto;
	struct mbuf *nextrecord;

	if (psa != NULL)
		*psa = NULL;
	if (controlp != NULL)
		*controlp = NULL;
	if (flagsp != NULL)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;

	/*
	 * For any complicated cases, fall back to the full
	 * soreceive_generic().
	 */
	if (mp0 != NULL || (flags & MSG_PEEK) || (flags & MSG_OOB))
		return (soreceive_generic(so, psa, uio, mp0, controlp,
		    flagsp));

	/*
	 * Enforce restrictions on use.
	 */
	KASSERT((pr->pr_flags & PR_WANTRCVD) == 0,
	    ("soreceive_dgram: wantrcvd"));
	KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic"));
	KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0,
	    ("soreceive_dgram: SBS_RCVATMARK"));
	KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0,
	    ("soreceive_dgram: P_CONNREQUIRED"));

	/*
	 * Loop blocking while waiting for a datagram.  Error, EOF,
	 * non-blocking mode and a zero-length request all return without
	 * dequeuing anything; sbwait() drops and re-takes the sockbuf lock.
	 */
	SOCKBUF_LOCK(&so->so_rcv);
	while ((m = so->so_rcv.sb_mb) == NULL) {
		KASSERT(so->so_rcv.sb_cc == 0,
		    ("soreceive_dgram: sb_mb NULL but sb_cc %u",
		    so->so_rcv.sb_cc));
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			SOCKBUF_UNLOCK(&so->so_rcv);
			return (error);
		}
		if (so->so_rcv.sb_state & SBS_CANTRCVMORE ||
		    uio->uio_resid == 0) {
			SOCKBUF_UNLOCK(&so->so_rcv);
			return (0);
		}
		if ((so->so_state & SS_NBIO) ||
		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
			SOCKBUF_UNLOCK(&so->so_rcv);
			return (EWOULDBLOCK);
		}
		SBLASTRECORDCHK(&so->so_rcv);
		SBLASTMBUFCHK(&so->so_rcv);
		error = sbwait(&so->so_rcv);
		if (error) {
			SOCKBUF_UNLOCK(&so->so_rcv);
			return (error);
		}
	}
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);

	if (uio->uio_td)
		uio->uio_td->td_ru.ru_msgrcv++;
	SBLASTRECORDCHK(&so->so_rcv);
	SBLASTMBUFCHK(&so->so_rcv);
	nextrecord = m->m_nextpkt;
	if (nextrecord == NULL) {
		KASSERT(so->so_rcv.sb_lastrecord == m,
		    ("soreceive_dgram: lastrecord != m"));
	}

	KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord,
	    ("soreceive_dgram: m_nextpkt != nextrecord"));

	/*
	 * Pull 'm' and its chain off the front of the packet queue.
	 */
	so->so_rcv.sb_mb = NULL;
	sockbuf_pushsync(&so->so_rcv, nextrecord);

	/*
	 * Walk 'm's chain and free that many bytes from the socket buffer.
	 */
	for (m2 = m; m2 != NULL; m2 = m2->m_next)
		sbfree(&so->so_rcv, m2);

	/*
	 * Do a few last checks before we let go of the lock.  The whole
	 * record is off the sockbuf now, so the rest runs unlocked.
	 */
	SBLASTRECORDCHK(&so->so_rcv);
	SBLASTMBUFCHK(&so->so_rcv);
	SOCKBUF_UNLOCK(&so->so_rcv);

	if (pr->pr_flags & PR_ADDR) {
		KASSERT(m->m_type == MT_SONAME,
		    ("m->m_type == %d", m->m_type));
		if (psa != NULL)
			*psa = sodupsockaddr(mtod(m, struct sockaddr *),
			    M_NOWAIT);
		m = m_free(m);
	}
	if (m == NULL) {
		/* XXXRW: Can this happen? */
		return (0);
	}

	/*
	 * Packet to copyout() is now in 'm' and it is disconnected from the
	 * queue.
	 *
	 * Process one or more MT_CONTROL mbufs present before any data mbufs
	 * in the first mbuf chain on the socket buffer.  We call into the
	 * protocol to perform externalization (or freeing if controlp ==
	 * NULL).
	 */
	if (m->m_type == MT_CONTROL) {
		struct mbuf *cm = NULL, *cmn;
		struct mbuf **cme = &cm;

		/* Detach the leading run of MT_CONTROL mbufs into 'cm'. */
		do {
			m2 = m->m_next;
			m->m_next = NULL;
			*cme = m;
			cme = &(*cme)->m_next;
			m = m2;
		} while (m != NULL && m->m_type == MT_CONTROL);
		while (cm != NULL) {
			cmn = cm->m_next;
			cm->m_next = NULL;
			/*
			 * NOTE(review): an error from dom_externalize is
			 * stored in 'error' but not returned to the caller;
			 * the function still returns 0 below — confirm this
			 * is intentional best-effort handling.
			 */
			if (pr->pr_domain->dom_externalize != NULL) {
				error = (*pr->pr_domain->dom_externalize)
				    (cm, controlp);
			} else if (controlp != NULL)
				*controlp = cm;
			else
				m_freem(cm);
			if (controlp != NULL) {
				while (*controlp != NULL)
					controlp = &(*controlp)->m_next;
			}
			cm = cmn;
		}
	}
	KASSERT(m->m_type == MT_DATA, ("soreceive_dgram: !data"));

	/* Copy out the data, trimming partially-consumed mbufs in place. */
	while (m != NULL && uio->uio_resid > 0) {
		len = uio->uio_resid;
		if (len > m->m_len)
			len = m->m_len;
		error = uiomove(mtod(m, char *), (int)len, uio);
		if (error) {
			/* Datagram semantics permit dropping the record. */
			m_freem(m);
			return (error);
		}
		if (len == m->m_len)
			m = m_free(m);
		else {
			m->m_data += len;
			m->m_len -= len;
		}
	}
	/* Anything left over did not fit: report truncation and drop it. */
	if (m != NULL)
		flags |= MSG_TRUNC;
	m_freem(m);
	if (flagsp != NULL)
		*flagsp |= flags;
	return (0);
}

/*
 * Dispatch receive through the protocol's pru_soreceive method, with the
 * socket's vnet in scope.
 */
int
soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	int error;

	CURVNET_SET(so->so_vnet);
	error = (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio, mp0,
	    controlp, flagsp));
	CURVNET_RESTORE();
	return (error);
}

/*
 * Kernel side of shutdown(2): flush and/or notify the protocol depending
 * on 'how' (SHUT_RD, SHUT_WR or SHUT_RDWR).
 */
int
soshutdown(struct socket *so, int how)
{
	struct protosw *pr = so->so_proto;
	int error;

	if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
		return (EINVAL);

	CURVNET_SET(so->so_vnet);
	if (pr->pr_usrreqs->pru_flush != NULL)
		(*pr->pr_usrreqs->pru_flush)(so, how);
	if (how != SHUT_WR)
		sorflush(so);
	if (how != SHUT_RD) {
		error = (*pr->pr_usrreqs->pru_shutdown)(so);
		CURVNET_RESTORE();
		return (error);
	}
	CURVNET_RESTORE();
	return (0);
}

/*
 * Flush the receive buffer: mark it unreadable, then dispose of all queued
 * data (and any in-flight rights) outside the sockbuf mutex.
 */
void
sorflush(struct socket *so)
{
	struct sockbuf *sb = &so->so_rcv;
	struct protosw *pr = so->so_proto;
	struct sockbuf asb;

	VNET_SO_ASSERT(so);

	/*
	 * In order to avoid calling dom_dispose with the socket buffer mutex
	 * held, and in order to generally avoid holding the lock for a long
	 * time, we make a copy of the socket buffer and clear the original
	 * (except locks, state).  The new socket buffer copy won't have
	 * initialized locks so we can only call routines that won't use or
	 * assert those locks.
	 *
	 * Dislodge threads currently blocked in receive and wait to acquire
	 * a lock against other simultaneous readers before clearing the
	 * socket buffer.  Don't let our acquire be interrupted by a signal
	 * despite any existing socket disposition on interruptable waiting.
	 */
	socantrcvmore(so);
	(void) sblock(sb, SBL_WAIT | SBL_NOINTR);

	/*
	 * Invalidate/clear most of the sockbuf structure, but leave selinfo
	 * and mutex data unchanged.
	 */
	SOCKBUF_LOCK(sb);
	bzero(&asb, offsetof(struct sockbuf, sb_startzero));
	bcopy(&sb->sb_startzero, &asb.sb_startzero,
	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
	bzero(&sb->sb_startzero,
	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
	SOCKBUF_UNLOCK(sb);
	sbunlock(sb);

	/*
	 * Dispose of special rights and flush the socket buffer.  Don't call
	 * any unsafe routines (that rely on locks being initialized) on asb.
	 */
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
	sbrelease_internal(&asb, so);
}

/*
 * Perhaps this routine, and sooptcopyout(), below, ought to come in an
 * additional variant to handle the case where the option value needs to be
 * some kind of integer, but not a specific size.  In addition to their use
 * here, these functions are also called by the protocol-level pr_ctloutput()
 * routines.
 */
int
sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
{
	size_t valsize;

	/*
	 * If the user gives us more than we wanted, we ignore it, but if we
	 * don't get the minimum length the caller wants, we return EINVAL.
	 * On success, sopt->sopt_valsize is set to however much we actually
	 * retrieved.
	 */
	if ((valsize = sopt->sopt_valsize) < minlen)
		return EINVAL;
	if (valsize > len)
		sopt->sopt_valsize = valsize = len;

	/* A non-NULL sopt_td means the value lives in userspace. */
	if (sopt->sopt_td != NULL)
		return (copyin(sopt->sopt_val, buf, valsize));

	bcopy(sopt->sopt_val, buf, valsize);
	return (0);
}

/*
 * Kernel version of setsockopt(2).
 *
 * XXX: optlen is size_t, not socklen_t
 */
int
so_setsockopt(struct socket *so, int level, int optname, void *optval,
    size_t optlen)
{
	struct sockopt sopt;

	sopt.sopt_level = level;
	sopt.sopt_name = optname;
	sopt.sopt_dir = SOPT_SET;
	sopt.sopt_val = optval;
	sopt.sopt_valsize = optlen;
	sopt.sopt_td = NULL;	/* NULL td: value is in kernel space */
	return (sosetopt(so, &sopt));
}

/*
 * Handle SOL_SOCKET-level options here; pass any other level (and, on
 * success, socket-level options too) down to the protocol's pr_ctloutput.
 */
int
sosetopt(struct socket *so, struct sockopt *sopt)
{
	int error, optval;
	struct linger l;
	struct timeval tv;
	u_long val;
	uint32_t val32;
#ifdef MAC
	struct mac extmac;
#endif

	CURVNET_SET(so->so_vnet);
	error = 0;
	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto->pr_ctloutput != NULL) {
			error = (*so->so_proto->pr_ctloutput)(so, sopt);
			CURVNET_RESTORE();
			return (error);
		}
		error = ENOPROTOOPT;
	} else {
		switch (sopt->sopt_name) {
#ifdef INET
		case SO_ACCEPTFILTER:
			error = do_setopt_accept_filter(so, sopt);
			if (error)
				goto bad;
			break;
#endif
		case SO_LINGER:
			error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
			if (error)
				goto bad;

			SOCK_LOCK(so);
			so->so_linger = l.l_linger;
			if (l.l_onoff)
				so->so_options |= SO_LINGER;
			else
				so->so_options &= ~SO_LINGER;
			SOCK_UNLOCK(so);
			break;

		/* Simple boolean options mirrored directly in so_options. */
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_DONTROUTE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_BINTIME:
		case SO_NOSIGPIPE:
		case SO_NO_DDP:
		case SO_NO_OFFLOAD:
			error = sooptcopyin(sopt, &optval, sizeof optval,
			    sizeof optval);
			if (error)
				goto bad;
			SOCK_LOCK(so);
			if (optval)
				so->so_options |= sopt->sopt_name;
			else
				so->so_options &= ~sopt->sopt_name;
			SOCK_UNLOCK(so);
			break;

		case SO_SETFIB:
			error = sooptcopyin(sopt, &optval, sizeof optval,
			    sizeof optval);
			if (error)
				goto bad;

			if (optval < 0 || optval >= rt_numfibs) {
				error = EINVAL;
				goto bad;
			}
			/* Only families with multiple FIBs honor the value. */
			if (((so->so_proto->pr_domain->dom_family == PF_INET) ||
			   (so->so_proto->pr_domain->dom_family == PF_INET6) ||
			   (so->so_proto->pr_domain->dom_family == PF_ROUTE)))
				so->so_fibnum = optval;
			else
				so->so_fibnum = 0;
			break;

		case SO_USER_COOKIE:
			error = sooptcopyin(sopt, &val32, sizeof val32,
			    sizeof val32);
			if (error)
				goto bad;
			so->so_user_cookie = val32;
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
			error = sooptcopyin(sopt, &optval, sizeof optval,
			    sizeof optval);
			if (error)
				goto bad;

			/*
			 * Values < 1 make no sense for any of these options,
			 * so disallow them.
			 */
			if (optval < 1) {
				error = EINVAL;
				goto bad;
			}

			switch (sopt->sopt_name) {
			case SO_SNDBUF:
			case SO_RCVBUF:
				if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
				    &so->so_snd : &so->so_rcv, (u_long)optval,
				    so, curthread) == 0) {
					error = ENOBUFS;
					goto bad;
				}
				/* Manual sizing disables auto-tuning. */
				(sopt->sopt_name == SO_SNDBUF ? &so->so_snd :
				    &so->so_rcv)->sb_flags &= ~SB_AUTOSIZE;
				break;

			/*
			 * Make sure the low-water is never greater than the
			 * high-water.
			 */
			case SO_SNDLOWAT:
				SOCKBUF_LOCK(&so->so_snd);
				so->so_snd.sb_lowat =
				    (optval > so->so_snd.sb_hiwat) ?
				    so->so_snd.sb_hiwat : optval;
				SOCKBUF_UNLOCK(&so->so_snd);
				break;
			case SO_RCVLOWAT:
				SOCKBUF_LOCK(&so->so_rcv);
				so->so_rcv.sb_lowat =
				    (optval > so->so_rcv.sb_hiwat) ?
				    so->so_rcv.sb_hiwat : optval;
				SOCKBUF_UNLOCK(&so->so_rcv);
				break;
			}
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
#ifdef COMPAT_FREEBSD32
			if (SV_CURPROC_FLAG(SV_ILP32)) {
				struct timeval32 tv32;

				error = sooptcopyin(sopt, &tv32, sizeof tv32,
				    sizeof tv32);
				CP(tv32, tv, tv_sec);
				CP(tv32, tv, tv_usec);
			} else
#endif
				error = sooptcopyin(sopt, &tv, sizeof tv,
				    sizeof tv);
			if (error)
				goto bad;

			/* assert(hz > 0); */
			if (tv.tv_sec < 0 || tv.tv_sec > INT_MAX / hz ||
			    tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
				error = EDOM;
				goto bad;
			}
			/* assert(tick > 0); */
			/* assert(ULONG_MAX - INT_MAX >= 1000000); */
			val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick;
			if (val > INT_MAX) {
				error = EDOM;
				goto bad;
			}
			/* Round a sub-tick timeout up to one tick. */
			if (val == 0 && tv.tv_usec != 0)
				val = 1;

			switch (sopt->sopt_name) {
			case SO_SNDTIMEO:
				so->so_snd.sb_timeo = val;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.sb_timeo = val;
				break;
			}
			break;

		case SO_LABEL:
#ifdef MAC
			error = sooptcopyin(sopt, &extmac, sizeof extmac,
			    sizeof extmac);
			if (error)
				goto bad;
			error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
			    so, &extmac);
#else
			error = EOPNOTSUPP;
#endif
			break;

		default:
			error = ENOPROTOOPT;
			break;
		}
		/* Give the protocol a chance to act on the new value too. */
		if (error == 0 && so->so_proto->pr_ctloutput != NULL)
			(void)(*so->so_proto->pr_ctloutput)(so, sopt);
	}
bad:
	CURVNET_RESTORE();
	return (error);
}

/*
 * Helper routine for getsockopt.
 */
int
sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
{
	int error;
	size_t valsize;

	error = 0;

	/*
	 * Documented get behavior is that we always return a value, possibly
	 * truncated to fit in the user's buffer.  Traditional behavior is
	 * that we always tell the user precisely how much we copied, rather
	 * than something useful like the total amount we had available for
	 * her.  Note that this interface is not idempotent; the entire
	 * answer must generated ahead of time.
	 */
	valsize = min(len, sopt->sopt_valsize);
	sopt->sopt_valsize = valsize;
	if (sopt->sopt_val != NULL) {
		if (sopt->sopt_td != NULL)
			error = copyout(buf, sopt->sopt_val, valsize);
		else
			bcopy(buf, sopt->sopt_val, valsize);
	}
	return (error);
}

/*
 * Kernel side of getsockopt(2): report SOL_SOCKET-level options here and
 * hand any other level to the protocol's pr_ctloutput.
 */
int
sogetopt(struct socket *so, struct sockopt *sopt)
{
	int error, optval;
	struct linger l;
	struct timeval tv;
#ifdef MAC
	struct mac extmac;
#endif

	CURVNET_SET(so->so_vnet);
	error = 0;
	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto->pr_ctloutput != NULL)
			error = (*so->so_proto->pr_ctloutput)(so, sopt);
		else
			error = ENOPROTOOPT;
		CURVNET_RESTORE();
		return (error);
	} else {
		switch (sopt->sopt_name) {
#ifdef INET
		case SO_ACCEPTFILTER:
			error = do_getopt_accept_filter(so, sopt);
			break;
#endif
		case SO_LINGER:
			SOCK_LOCK(so);
			l.l_onoff = so->so_options & SO_LINGER;
			l.l_linger = so->so_linger;
			SOCK_UNLOCK(so);
			error = sooptcopyout(sopt, &l, sizeof l);
			break;

		case SO_USELOOPBACK:
		case SO_DONTROUTE:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_ACCEPTCONN:
		case SO_TIMESTAMP:
		case SO_BINTIME:
		case SO_NOSIGPIPE:
			optval = so->so_options & sopt->sopt_name;
integer:
			/* Shared exit for all options returning an int. */
			error = sooptcopyout(sopt, &optval, sizeof optval);
			break;

		case SO_TYPE:
			optval = so->so_type;
			goto integer;

		case SO_PROTOCOL:
			optval = so->so_proto->pr_protocol;
			goto integer;

		case SO_ERROR:
			/* Reading SO_ERROR clears the pending error. */
			SOCK_LOCK(so);
			optval = so->so_error;
			so->so_error = 0;
			SOCK_UNLOCK(so);
			goto integer;

		case SO_SNDBUF:
			optval = so->so_snd.sb_hiwat;
			goto integer;

		case SO_RCVBUF:
			optval = so->so_rcv.sb_hiwat;
			goto integer;

		case SO_SNDLOWAT:
			optval = so->so_snd.sb_lowat;
			goto integer;

		case SO_RCVLOWAT:
			optval = so->so_rcv.sb_lowat;
			goto integer;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			optval = (sopt->sopt_name == SO_SNDTIMEO ?
			    so->so_snd.sb_timeo : so->so_rcv.sb_timeo);

			/* Convert ticks back to a timeval. */
			tv.tv_sec = optval / hz;
			tv.tv_usec = (optval % hz) * tick;
#ifdef COMPAT_FREEBSD32
			if (SV_CURPROC_FLAG(SV_ILP32)) {
				struct timeval32 tv32;

				CP(tv, tv32, tv_sec);
				CP(tv, tv32, tv_usec);
				error = sooptcopyout(sopt, &tv32, sizeof tv32);
			} else
#endif
				error = sooptcopyout(sopt, &tv, sizeof tv);
			break;

		case SO_LABEL:
#ifdef MAC
			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
			    sizeof(extmac));
			if (error)
				goto bad;
			error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
			    so, &extmac);
			if (error)
				goto bad;
			error = sooptcopyout(sopt, &extmac, sizeof extmac);
#else
			error = EOPNOTSUPP;
#endif
			break;

		case SO_PEERLABEL:
#ifdef MAC
			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
			    sizeof(extmac));
			if (error)
				goto bad;
			error = mac_getsockopt_peerlabel(
			    sopt->sopt_td->td_ucred, so, &extmac);
			if (error)
				goto bad;
			error = sooptcopyout(sopt, &extmac, sizeof extmac);
#else
			error = EOPNOTSUPP;
#endif
			break;

		case SO_LISTENQLIMIT:
			optval = so->so_qlimit;
			goto integer;

		case SO_LISTENQLEN:
			optval = so->so_qlen;
			goto integer;

		case SO_LISTENINCQLEN:
			optval = so->so_incqlen;
			goto integer;

		default:
			error = ENOPROTOOPT;
			break;
		}
	}
#ifdef MAC
bad:
#endif
	CURVNET_RESTORE();
	return (error);
}

/*
 * Allocate an mbuf chain large enough to hold sopt->sopt_valsize bytes,
 * using clusters for segments larger than MLEN.  May sleep only when a
 * thread context (sopt_td) is available.
 */
int
soopt_getm(struct sockopt *sopt, struct mbuf **mp)
{
	struct mbuf *m, *m_prev;
	int sopt_size = sopt->sopt_valsize;

	MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA);
	if (m == NULL)
		return ENOBUFS;
	if (sopt_size > MLEN) {
		MCLGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return ENOBUFS;
		}
		m->m_len = min(MCLBYTES, sopt_size);
	} else {
		m->m_len = min(MLEN, sopt_size);
	}
	sopt_size -= m->m_len;
	*mp = m;
	m_prev = m;

	while (sopt_size) {
		MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA);
		if (m == NULL) {
			m_freem(*mp);
			return ENOBUFS;
		}
		if (sopt_size > MLEN) {
			MCLGET(m, sopt->sopt_td != NULL ? M_WAITOK :
			    M_NOWAIT);
			if ((m->m_flags & M_EXT) == 0) {
				m_freem(m);
				m_freem(*mp);
				return ENOBUFS;
			}
			m->m_len = min(MCLBYTES, sopt_size);
		} else {
			m->m_len = min(MLEN, sopt_size);
		}
		sopt_size -= m->m_len;
		m_prev->m_next = m;
		m_prev = m;
	}
	return (0);
}

/*
 * Copy a socket-option value from sopt into the mbuf chain 'm', advancing
 * sopt_val/sopt_valsize as we go.  The chain is freed on copyin failure.
 */
int
soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
{
	struct mbuf *m0 = m;

	if (sopt->sopt_val == NULL)
		return (0);
	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
		if (sopt->sopt_td != NULL) {
			int error;

			error = copyin(sopt->sopt_val, mtod(m, char *),
			    m->m_len);
			if (error != 0) {
				m_freem(m0);
				return(error);
			}
		} else
			bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
		sopt->sopt_valsize -= m->m_len;
		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
		m = m->m_next;
	}
	if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */
		panic("ip6_sooptmcopyin");
	return (0);
}

/*
 * Copy a socket-option value out of the mbuf chain 'm' into sopt, setting
 * sopt_valsize to the number of bytes actually copied.
 */
int
soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
{
	struct mbuf *m0 = m;
	size_t valsize = 0;

	if (sopt->sopt_val == NULL)
		return (0);
	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
		if (sopt->sopt_td != NULL) {
			int error;

			error = copyout(mtod(m, char *), sopt->sopt_val,
			    m->m_len);
			if (error != 0) {
				m_freem(m0);
				return(error);
			}
		} else
			bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
		sopt->sopt_valsize -= m->m_len;
		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
		valsize += m->m_len;
		m = m->m_next;
	}
	if (m != NULL) {
		/* enough soopt buffer should be given from user-land */
		m_freem(m0);
		return(EINVAL);
	}
	sopt->sopt_valsize = valsize;
	return (0);
}

/*
 * sohasoutofband(): protocol notifies socket layer of the arrival of new
 * out-of-band data, which will then notify socket consumers.
 */
void
sohasoutofband(struct socket *so)
{

	if (so->so_sigio != NULL)
		pgsigio(&so->so_sigio, SIGURG, 0);
	selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
}

int
sopoll(struct socket *so, int events, struct ucred *active_cred,
    struct thread *td)
{

	/*
	 * We do not need to set or assert curvnet as long as everyone uses
	 * sopoll_generic().
	 */
	return (so->so_proto->pr_usrreqs->pru_sopoll(so, events, active_cred,
	    td));
}

/*
 * Default poll implementation: evaluate readable/writable/OOB conditions
 * with both sockbuf locks held, and record the selector if nothing is
 * ready yet.
 */
int
sopoll_generic(struct socket *so, int events, struct ucred *active_cred,
    struct thread *td)
{
	int revents = 0;

	/* Lock order: send buffer before receive buffer. */
	SOCKBUF_LOCK(&so->so_snd);
	SOCKBUF_LOCK(&so->so_rcv);
	if (events & (POLLIN | POLLRDNORM))
		if (soreadabledata(so))
			revents |= events & (POLLIN | POLLRDNORM);

	if (events & (POLLOUT | POLLWRNORM))
		if (sowriteable(so))
			revents |= events & (POLLOUT | POLLWRNORM);

	if (events & (POLLPRI | POLLRDBAND))
		if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK))
			revents |= events & (POLLPRI | POLLRDBAND);

	if ((events & POLLINIGNEOF) == 0) {
		if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
			revents |= events & (POLLIN | POLLRDNORM);
			if (so->so_snd.sb_state & SBS_CANTSENDMORE)
				revents |= POLLHUP;
		}
	}

	if (revents == 0) {
		if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
			selrecord(td, &so->so_rcv.sb_sel);
			so->so_rcv.sb_flags |= SB_SEL;
		}

		if (events & (POLLOUT | POLLWRNORM)) {
			selrecord(td, &so->so_snd.sb_sel);
			so->so_snd.sb_flags |= SB_SEL;
		}
	}

	SOCKBUF_UNLOCK(&so->so_rcv);
	SOCKBUF_UNLOCK(&so->so_snd);
	return (revents);
}

/*
 * Attach a kqueue filter to the appropriate sockbuf; listening sockets get
 * the listen filter on EVFILT_READ.
 */
int
soo_kqfilter(struct file *fp, struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	struct sockbuf *sb;

	switch (kn->kn_filter) {
	case EVFILT_READ:
		if (so->so_options & SO_ACCEPTCONN)
			kn->kn_fop = &solisten_filtops;
		else
			kn->kn_fop = &soread_filtops;
		sb = &so->so_rcv;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &sowrite_filtops;
		sb = &so->so_snd;
		break;
	default:
		return (EINVAL);
	}

	SOCKBUF_LOCK(sb);
	knlist_add(&sb->sb_sel.si_note, kn, 1);
	sb->sb_flags |= SB_KNOTE;
	SOCKBUF_UNLOCK(sb);
	return (0);
}

/*
 * Some routines that return EOPNOTSUPP for entry points that are not
 * supported by a protocol.  Fill in as needed.
 */
int
pru_accept_notsupp(struct socket *so, struct sockaddr **nam)
{

	return EOPNOTSUPP;
}

int
pru_attach_notsupp(struct socket *so, int proto, struct thread *td)
{

	return EOPNOTSUPP;
}

int
pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
{

	return EOPNOTSUPP;
}

int
pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
{

	return EOPNOTSUPP;
}

int
pru_connect2_notsupp(struct socket *so1, struct socket *so2)
{

	return EOPNOTSUPP;
}

int
pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data,
    struct ifnet *ifp, struct thread *td)
{

	return EOPNOTSUPP;
}

int
pru_disconnect_notsupp(struct socket *so)
{

	return EOPNOTSUPP;
}

int
pru_listen_notsupp(struct socket *so, int backlog, struct thread *td)
{

	return EOPNOTSUPP;
}

int
pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam)
{

	return EOPNOTSUPP;
}

int
pru_rcvd_notsupp(struct socket *so, int flags)
{

	return EOPNOTSUPP;
}

int
pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags)
{

	return EOPNOTSUPP;
}

int
pru_send_notsupp(struct socket *so, int flags, struct mbuf *m,
    struct sockaddr *addr, struct mbuf *control, struct thread *td)
{

	return EOPNOTSUPP;
}

/*
 * This isn't really a ``null'' operation, but it's the default one and
 * doesn't do anything destructive.
 */
int
pru_sense_null(struct socket *so, struct stat *sb)
{

	/* Report the send buffer high-water mark as the block size. */
	sb->st_blksize = so->so_snd.sb_hiwat;
	return 0;
}

int
pru_shutdown_notsupp(struct socket *so)
{

	return EOPNOTSUPP;
}

int
pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam)
{

	return EOPNOTSUPP;
}

int
pru_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{

	return EOPNOTSUPP;
}

int
pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr,
    struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{

	return EOPNOTSUPP;
}

int
pru_sopoll_notsupp(struct socket *so, int events, struct ucred *cred,
    struct thread *td)
{

	return EOPNOTSUPP;
}

/* Detach a read knote from the receive sockbuf's note list. */
static void
filt_sordetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;

	SOCKBUF_LOCK(&so->so_rcv);
	knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1);
	if (knlist_empty(&so->so_rcv.sb_sel.si_note))
		so->so_rcv.sb_flags &= ~SB_KNOTE;
	SOCKBUF_UNLOCK(&so->so_rcv);
}

/*
 * Read filter: report readability; kn_data is the byte count available
 * excluding control data.  Called with the receive sockbuf lock held.
 */
/*ARGSUSED*/
static int
filt_soread(struct knote *kn, long hint)
{
	struct socket *so;

	so = kn->kn_fp->f_data;
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);

	kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
	if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		return (1);
	} else if (so->so_error)	/* temporary udp error */
		return (1);
	else if (kn->kn_sfflags & NOTE_LOWAT)
		return (kn->kn_data >= kn->kn_sdata);
	else
		return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat);
}

/* Detach a write knote from the send sockbuf's note list. */
static void
filt_sowdetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;

	SOCKBUF_LOCK(&so->so_snd);
	knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1);
	if (knlist_empty(&so->so_snd.sb_sel.si_note))
		so->so_snd.sb_flags &= ~SB_KNOTE;
	SOCKBUF_UNLOCK(&so->so_snd);
}

/*
 * Write filter: report writability; kn_data is the free space in the send
 * buffer.  Called with the send sockbuf lock held.
 */
/*ARGSUSED*/
static int
filt_sowrite(struct knote *kn, long hint)
{
	struct socket *so;

	so = kn->kn_fp->f_data;
	SOCKBUF_LOCK_ASSERT(&so->so_snd);
	kn->kn_data = sbspace(&so->so_snd);
	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		return (1);
	} else if (so->so_error)	/* temporary udp error */
		return (1);
	else if (((so->so_state & SS_ISCONNECTED) == 0) &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
		return (0);
	else if (kn->kn_sfflags & NOTE_LOWAT)
		return (kn->kn_data >= kn->kn_sdata);
	else
		return (kn->kn_data >= so->so_snd.sb_lowat);
}

/*
 * Listen filter: a listening socket is "readable" when completed
 * connections are queued; kn_data is the completed-queue length.
 */
/*ARGSUSED*/
static int
filt_solisten(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;

	kn->kn_data = so->so_qlen;
	return (!TAILQ_EMPTY(&so->so_comp));
}

/* Return 0 iff the socket exists and is owned by the given uid. */
int
socheckuid(struct socket *so, uid_t uid)
{

	if (so == NULL)
		return (EPERM);
	if (so->so_cred->cr_uid != uid)
		return (EPERM);
	return (0);
}

/*
 * These functions are used by protocols to notify the socket layer (and its
 * consumers) of state changes in the sockets driven by protocol-side events.
 */

/*
 * Procedures to manipulate state flags of socket and do appropriate wakeups.
 *
 * Normal sequence from the active (originating) side is that
 * soisconnecting() is called during processing of connect() call, resulting
 * in an eventual call to soisconnected() if/when the connection is
 * established.
When the connection is torn down soisdisconnecting() is 3360 * called during processing of disconnect() call, and soisdisconnected() is 3361 * called when the connection to the peer is totally severed. The semantics 3362 * of these routines are such that connectionless protocols can call 3363 * soisconnected() and soisdisconnected() only, bypassing the in-progress 3364 * calls when setting up a ``connection'' takes no time. 3365 * 3366 * From the passive side, a socket is created with two queues of sockets: 3367 * so_incomp for connections in progress and so_comp for connections already 3368 * made and awaiting user acceptance. As a protocol is preparing incoming 3369 * connections, it creates a socket structure queued on so_incomp by calling 3370 * sonewconn(). When the connection is established, soisconnected() is 3371 * called, and transfers the socket structure to so_comp, making it available 3372 * to accept(). 3373 * 3374 * If a socket is closed with sockets on either so_incomp or so_comp, these 3375 * sockets are dropped. 3376 * 3377 * If higher-level protocols are implemented in the kernel, the wakeups done 3378 * here will sometimes cause software-interrupt process scheduling. 
3379 */ 3380void 3381soisconnecting(struct socket *so) 3382{ 3383 3384 SOCK_LOCK(so); 3385 so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING); 3386 so->so_state |= SS_ISCONNECTING; 3387 SOCK_UNLOCK(so); 3388} 3389 3390void 3391soisconnected(struct socket *so) 3392{ 3393 struct socket *head; 3394 int ret; 3395 3396restart: 3397 ACCEPT_LOCK(); 3398 SOCK_LOCK(so); 3399 so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING); 3400 so->so_state |= SS_ISCONNECTED; 3401 head = so->so_head; 3402 if (head != NULL && (so->so_qstate & SQ_INCOMP)) { 3403 if ((so->so_options & SO_ACCEPTFILTER) == 0) { 3404 SOCK_UNLOCK(so); 3405 TAILQ_REMOVE(&head->so_incomp, so, so_list); 3406 head->so_incqlen--; 3407 so->so_qstate &= ~SQ_INCOMP; 3408 TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); 3409 head->so_qlen++; 3410 so->so_qstate |= SQ_COMP; 3411 ACCEPT_UNLOCK(); 3412 sorwakeup(head); 3413 wakeup_one(&head->so_timeo); 3414 } else { 3415 ACCEPT_UNLOCK(); 3416 soupcall_set(so, SO_RCV, 3417 head->so_accf->so_accept_filter->accf_callback, 3418 head->so_accf->so_accept_filter_arg); 3419 so->so_options &= ~SO_ACCEPTFILTER; 3420 ret = head->so_accf->so_accept_filter->accf_callback(so, 3421 head->so_accf->so_accept_filter_arg, M_NOWAIT); 3422 if (ret == SU_ISCONNECTED) 3423 soupcall_clear(so, SO_RCV); 3424 SOCK_UNLOCK(so); 3425 if (ret == SU_ISCONNECTED) 3426 goto restart; 3427 } 3428 return; 3429 } 3430 SOCK_UNLOCK(so); 3431 ACCEPT_UNLOCK(); 3432 wakeup(&so->so_timeo); 3433 sorwakeup(so); 3434 sowwakeup(so); 3435} 3436 3437void 3438soisdisconnecting(struct socket *so) 3439{ 3440 3441 /* 3442 * Note: This code assumes that SOCK_LOCK(so) and 3443 * SOCKBUF_LOCK(&so->so_rcv) are the same. 
3444 */ 3445 SOCKBUF_LOCK(&so->so_rcv); 3446 so->so_state &= ~SS_ISCONNECTING; 3447 so->so_state |= SS_ISDISCONNECTING; 3448 so->so_rcv.sb_state |= SBS_CANTRCVMORE; 3449 sorwakeup_locked(so); 3450 SOCKBUF_LOCK(&so->so_snd); 3451 so->so_snd.sb_state |= SBS_CANTSENDMORE; 3452 sowwakeup_locked(so); 3453 wakeup(&so->so_timeo); 3454} 3455 3456void 3457soisdisconnected(struct socket *so) 3458{ 3459 3460 /* 3461 * Note: This code assumes that SOCK_LOCK(so) and 3462 * SOCKBUF_LOCK(&so->so_rcv) are the same. 3463 */ 3464 SOCKBUF_LOCK(&so->so_rcv); 3465 so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); 3466 so->so_state |= SS_ISDISCONNECTED; 3467 so->so_rcv.sb_state |= SBS_CANTRCVMORE; 3468 sorwakeup_locked(so); 3469 SOCKBUF_LOCK(&so->so_snd); 3470 so->so_snd.sb_state |= SBS_CANTSENDMORE; 3471 sbdrop_locked(&so->so_snd, so->so_snd.sb_cc); 3472 sowwakeup_locked(so); 3473 wakeup(&so->so_timeo); 3474} 3475 3476/* 3477 * Make a copy of a sockaddr in a malloced buffer of type M_SONAME. 3478 */ 3479struct sockaddr * 3480sodupsockaddr(const struct sockaddr *sa, int mflags) 3481{ 3482 struct sockaddr *sa2; 3483 3484 sa2 = malloc(sa->sa_len, M_SONAME, mflags); 3485 if (sa2) 3486 bcopy(sa, sa2, sa->sa_len); 3487 return sa2; 3488} 3489 3490/* 3491 * Register per-socket buffer upcalls. 3492 */ 3493void 3494soupcall_set(struct socket *so, int which, 3495 int (*func)(struct socket *, void *, int), void *arg) 3496{ 3497 struct sockbuf *sb; 3498 3499 switch (which) { 3500 case SO_RCV: 3501 sb = &so->so_rcv; 3502 break; 3503 case SO_SND: 3504 sb = &so->so_snd; 3505 break; 3506 default: 3507 panic("soupcall_set: bad which"); 3508 } 3509 SOCKBUF_LOCK_ASSERT(sb); 3510#if 0 3511 /* XXX: accf_http actually wants to do this on purpose. 
*/ 3512 KASSERT(sb->sb_upcall == NULL, ("soupcall_set: overwriting upcall")); 3513#endif 3514 sb->sb_upcall = func; 3515 sb->sb_upcallarg = arg; 3516 sb->sb_flags |= SB_UPCALL; 3517} 3518 3519void 3520soupcall_clear(struct socket *so, int which) 3521{ 3522 struct sockbuf *sb; 3523 3524 switch (which) { 3525 case SO_RCV: 3526 sb = &so->so_rcv; 3527 break; 3528 case SO_SND: 3529 sb = &so->so_snd; 3530 break; 3531 default: 3532 panic("soupcall_clear: bad which"); 3533 } 3534 SOCKBUF_LOCK_ASSERT(sb); 3535 KASSERT(sb->sb_upcall != NULL, ("soupcall_clear: no upcall to clear")); 3536 sb->sb_upcall = NULL; 3537 sb->sb_upcallarg = NULL; 3538 sb->sb_flags &= ~SB_UPCALL; 3539} 3540 3541/* 3542 * Create an external-format (``xsocket'') structure using the information in 3543 * the kernel-format socket structure pointed to by so. This is done to 3544 * reduce the spew of irrelevant information over this interface, to isolate 3545 * user code from changes in the kernel structure, and potentially to provide 3546 * information-hiding if we decide that some of this information should be 3547 * hidden from users. 3548 */ 3549void 3550sotoxsocket(struct socket *so, struct xsocket *xso) 3551{ 3552 3553 xso->xso_len = sizeof *xso; 3554 xso->xso_so = so; 3555 xso->so_type = so->so_type; 3556 xso->so_options = so->so_options; 3557 xso->so_linger = so->so_linger; 3558 xso->so_state = so->so_state; 3559 xso->so_pcb = so->so_pcb; 3560 xso->xso_protocol = so->so_proto->pr_protocol; 3561 xso->xso_family = so->so_proto->pr_domain->dom_family; 3562 xso->so_qlen = so->so_qlen; 3563 xso->so_incqlen = so->so_incqlen; 3564 xso->so_qlimit = so->so_qlimit; 3565 xso->so_timeo = so->so_timeo; 3566 xso->so_error = so->so_error; 3567 xso->so_pgid = so->so_sigio ? 
so->so_sigio->sio_pgid : 0; 3568 xso->so_oobmark = so->so_oobmark; 3569 sbtoxsockbuf(&so->so_snd, &xso->so_snd); 3570 sbtoxsockbuf(&so->so_rcv, &xso->so_rcv); 3571 xso->so_uid = so->so_cred->cr_uid; 3572} 3573 3574 3575/* 3576 * Socket accessor functions to provide external consumers with 3577 * a safe interface to socket state 3578 * 3579 */ 3580 3581void 3582so_listeners_apply_all(struct socket *so, void (*func)(struct socket *, void *), 3583 void *arg) 3584{ 3585 3586 TAILQ_FOREACH(so, &so->so_comp, so_list) 3587 func(so, arg); 3588} 3589 3590struct sockbuf * 3591so_sockbuf_rcv(struct socket *so) 3592{ 3593 3594 return (&so->so_rcv); 3595} 3596 3597struct sockbuf * 3598so_sockbuf_snd(struct socket *so) 3599{ 3600 3601 return (&so->so_snd); 3602} 3603 3604int 3605so_state_get(const struct socket *so) 3606{ 3607 3608 return (so->so_state); 3609} 3610 3611void 3612so_state_set(struct socket *so, int val) 3613{ 3614 3615 so->so_state = val; 3616} 3617 3618int 3619so_options_get(const struct socket *so) 3620{ 3621 3622 return (so->so_options); 3623} 3624 3625void 3626so_options_set(struct socket *so, int val) 3627{ 3628 3629 so->so_options = val; 3630} 3631 3632int 3633so_error_get(const struct socket *so) 3634{ 3635 3636 return (so->so_error); 3637} 3638 3639void 3640so_error_set(struct socket *so, int val) 3641{ 3642 3643 so->so_error = val; 3644} 3645 3646int 3647so_linger_get(const struct socket *so) 3648{ 3649 3650 return (so->so_linger); 3651} 3652 3653void 3654so_linger_set(struct socket *so, int val) 3655{ 3656 3657 so->so_linger = val; 3658} 3659 3660struct protosw * 3661so_protosw_get(const struct socket *so) 3662{ 3663 3664 return (so->so_proto); 3665} 3666 3667void 3668so_protosw_set(struct socket *so, struct protosw *val) 3669{ 3670 3671 so->so_proto = val; 3672} 3673 3674void 3675so_sorwakeup(struct socket *so) 3676{ 3677 3678 sorwakeup(so); 3679} 3680 3681void 3682so_sowwakeup(struct socket *so) 3683{ 3684 3685 sowwakeup(so); 3686} 3687 3688void 
3689so_sorwakeup_locked(struct socket *so) 3690{ 3691 3692 sorwakeup_locked(so); 3693} 3694 3695void 3696so_sowwakeup_locked(struct socket *so) 3697{ 3698 3699 sowwakeup_locked(so); 3700} 3701 3702void 3703so_lock(struct socket *so) 3704{ 3705 3706 SOCK_LOCK(so); 3707} 3708 3709void 3710so_unlock(struct socket *so) 3711{ 3712 3713 SOCK_UNLOCK(so); 3714} 3715