uipc_socket.c revision 243638
1/*- 2 * Copyright (c) 1982, 1986, 1988, 1990, 1993 3 * The Regents of the University of California. 4 * Copyright (c) 2004 The FreeBSD Foundation 5 * Copyright (c) 2004-2008 Robert N. M. Watson 6 * All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 4. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 * 32 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94 33 */ 34 35/* 36 * Comments on the socket life cycle: 37 * 38 * soalloc() sets of socket layer state for a socket, called only by 39 * socreate() and sonewconn(). Socket layer private. 
40 * 41 * sodealloc() tears down socket layer state for a socket, called only by 42 * sofree() and sonewconn(). Socket layer private. 43 * 44 * pru_attach() associates protocol layer state with an allocated socket; 45 * called only once, may fail, aborting socket allocation. This is called 46 * from socreate() and sonewconn(). Socket layer private. 47 * 48 * pru_detach() disassociates protocol layer state from an attached socket, 49 * and will be called exactly once for sockets in which pru_attach() has 50 * been successfully called. If pru_attach() returned an error, 51 * pru_detach() will not be called. Socket layer private. 52 * 53 * pru_abort() and pru_close() notify the protocol layer that the last 54 * consumer of a socket is starting to tear down the socket, and that the 55 * protocol should terminate the connection. Historically, pru_abort() also 56 * detached protocol state from the socket state, but this is no longer the 57 * case. 58 * 59 * socreate() creates a socket and attaches protocol state. This is a public 60 * interface that may be used by socket layer consumers to create new 61 * sockets. 62 * 63 * sonewconn() creates a socket and attaches protocol state. This is a 64 * public interface that may be used by protocols to create new sockets when 65 * a new connection is received and will be available for accept() on a 66 * listen socket. 67 * 68 * soclose() destroys a socket after possibly waiting for it to disconnect. 69 * This is a public interface that socket consumers should use to close and 70 * release a socket when done with it. 71 * 72 * soabort() destroys a socket without waiting for it to disconnect (used 73 * only for incoming connections that are already partially or fully 74 * connected). 
This is used internally by the socket layer when clearing 75 * listen socket queues (due to overflow or close on the listen socket), but 76 * is also a public interface protocols may use to abort connections in 77 * their incomplete listen queues should they no longer be required. Sockets 78 * placed in completed connection listen queues should not be aborted for 79 * reasons described in the comment above the soclose() implementation. This 80 * is not a general purpose close routine, and except in the specific 81 * circumstances described here, should not be used. 82 * 83 * sofree() will free a socket and its protocol state if all references on 84 * the socket have been released, and is the public interface to attempt to 85 * free a socket when a reference is removed. This is a socket layer private 86 * interface. 87 * 88 * NOTE: In addition to socreate() and soclose(), which provide a single 89 * socket reference to the consumer to be managed as required, there are two 90 * calls to explicitly manage socket references, soref(), and sorele(). 91 * Currently, these are generally required only when transitioning a socket 92 * from a listen queue to a file descriptor, in order to prevent garbage 93 * collection of the socket at an untimely moment. For a number of reasons, 94 * these interfaces are not preferred, and should be avoided. 95 * 96 * NOTE: With regard to VNETs the general rule is that callers do not set 97 * curvnet. Exceptions to this rule include soabort(), sodisconnect(), 98 * sofree() (and with that sorele(), sotryfree()), as well as sonewconn() 99 * and sorflush(), which are usually called from a pre-set VNET context. 100 * sopoll() currently does not need a VNET context to be set. 
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/uipc_socket.c 243638 2012-11-27 22:35:48Z andre $");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_zero.h"
#include "opt_compat.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mac.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/domain.h>
#include <sys/file.h>			/* for struct knote */
#include <sys/kernel.h>
#include <sys/event.h>
#include <sys/eventhandler.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/resourcevar.h>
#include <net/route.h>
#include <sys/signalvar.h>
#include <sys/stat.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <sys/jail.h>
#include <sys/syslog.h>

#include <net/vnet.h>

#include <security/mac/mac_framework.h>

#include <vm/uma.h>

#ifdef COMPAT_FREEBSD32
#include <sys/mount.h>
#include <sys/sysent.h>
#include <compat/freebsd32/freebsd32.h>
#endif

static int	soreceive_rcvoob(struct socket *so, struct uio *uio,
		    int flags);

/* kqueue filter backends for EVFILT_READ/EVFILT_WRITE on sockets. */
static void	filt_sordetach(struct knote *kn);
static int	filt_soread(struct knote *kn, long hint);
static void	filt_sowdetach(struct knote *kn);
static int	filt_sowrite(struct knote *kn, long hint);
static int	filt_solisten(struct knote *kn, long hint);

/*
 * Listen sockets share the read-side detach routine with regular sockets
 * but use a distinct event test (accept queue non-empty).
 */
static struct filterops solisten_filtops = {
	.f_isfd = 1,
	.f_detach = filt_sordetach,
	.f_event = filt_solisten,
};
static struct filterops soread_filtops = {
	.f_isfd = 1,
	.f_detach = filt_sordetach,
	.f_event = filt_soread,
};
static struct filterops sowrite_filtops = {
	.f_isfd = 1,
	.f_detach = filt_sowdetach,
	.f_event = filt_sowrite,
};

so_gen_t	so_gencnt;	/* generation count for sockets */

MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

#define	VNET_SO_ASSERT(so)						\
	VNET_ASSERT(curvnet != NULL,					\
	    ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so)));

/*
 * Limit on the number of connections in the listen queue waiting
 * for accept(2).
 * NB: The original sysctl somaxconn is still available but hidden
 * to prevent confusion about the actual purpose of this number.
 */
static int somaxconn = SOMAXCONN;

/*
 * Sysctl handler shared by kern.ipc.soacceptqueue and the hidden
 * compatibility name kern.ipc.somaxconn.  Accepts values in
 * [1, USHRT_MAX]; the limit is stored in a plain int and only read
 * advisorily, so no locking is performed here.
 */
static int
sysctl_somaxconn(SYSCTL_HANDLER_ARGS)
{
	int error;
	int val;

	val = somaxconn;
	error = sysctl_handle_int(oidp, &val, 0, req);
	/* Read-only request, or copyin/copyout failure: nothing to update. */
	if (error || !req->newptr)
		return (error);

	if (val < 1 || val > USHRT_MAX)
		return (EINVAL);

	somaxconn = val;
	return (0);
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, soacceptqueue, CTLTYPE_UINT | CTLFLAG_RW,
    0, sizeof(int), sysctl_somaxconn, "I",
    "Maximum listen socket pending connection accept queue size");
SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_SKIP,
    0, sizeof(int), sysctl_somaxconn, "I",
    "Maximum listen socket pending connection accept queue size (compat)");

static int numopensockets;
SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
    &numopensockets, 0, "Number of open sockets");

#if defined(SOCKET_SEND_COW) || defined(SOCKET_RECV_PFLIP)
SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0,
    "Zero copy controls");
#ifdef SOCKET_RECV_PFLIP
int so_zero_copy_receive = 1;
SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW,
    &so_zero_copy_receive, 0, "Enable zero copy receive");
#endif
#ifdef SOCKET_SEND_COW
int so_zero_copy_send = 1;
SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW,
    &so_zero_copy_send, 0, "Enable zero copy send");
#endif /* SOCKET_SEND_COW */
#endif /* SOCKET_SEND_COW || SOCKET_RECV_PFLIP */

/*
 * accept_mtx locks down per-socket fields relating to accept queues.  See
 * socketvar.h for an annotation of the protected fields of struct socket.
 */
struct mtx accept_mtx;
MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);

/*
 * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
 * so_gencnt field.
 */
static struct mtx so_global_mtx;
MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF);

/*
 * General IPC sysctl name space, used by sockets and a variety of other IPC
 * types.
 */
SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");

/*
 * Initialize the socket subsystem and set up the socket
 * memory allocator.
 */
uma_zone_t socket_zone;
int	maxsockets;

/*
 * maxsockets_change event handler: propagate a new maxsockets value to the
 * UMA zone limit.
 */
static void
socket_zone_change(void *tag)
{

	uma_zone_set_max(socket_zone, maxsockets);
}

/*
 * SYSINIT hook: create the socket zone (NOFREE, so the zone can never be
 * shrunk later) and register for maxsockets changes.
 */
static void
socket_init(void *tag)
{

	socket_zone = uma_zcreate("socket", sizeof(struct socket), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	uma_zone_set_max(socket_zone, maxsockets);
	EVENTHANDLER_REGISTER(maxsockets_change, socket_zone_change, NULL,
	    EVENTHANDLER_PRI_FIRST);
}
SYSINIT(socket, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_init, NULL);

/*
 * Initialise maxsockets.  This SYSINIT must be run after
 * tunable_mbinit().  The tunable may raise the default; the floor is
 * maxfiles.
 */
static void
init_maxsockets(void *ignored)
{

	TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
	maxsockets = imax(maxsockets, maxfiles);
}
SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);

/*
 * Sysctl to get and set the maximum global sockets limit.  Notify protocols
 * of the change so that they can update their dependent limits as required.
300 */ 301static int 302sysctl_maxsockets(SYSCTL_HANDLER_ARGS) 303{ 304 int error, newmaxsockets; 305 306 newmaxsockets = maxsockets; 307 error = sysctl_handle_int(oidp, &newmaxsockets, 0, req); 308 if (error == 0 && req->newptr) { 309 if (newmaxsockets > maxsockets && 310 newmaxsockets <= maxfiles) { 311 maxsockets = newmaxsockets; 312 EVENTHANDLER_INVOKE(maxsockets_change); 313 } else 314 error = EINVAL; 315 } 316 return (error); 317} 318SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT|CTLFLAG_RW, 319 &maxsockets, 0, sysctl_maxsockets, "IU", 320 "Maximum number of sockets avaliable"); 321 322/* 323 * Socket operation routines. These routines are called by the routines in 324 * sys_socket.c or from a system process, and implement the semantics of 325 * socket operations by switching out to the protocol specific routines. 326 */ 327 328/* 329 * Get a socket structure from our zone, and initialize it. Note that it 330 * would probably be better to allocate socket and PCB at the same time, but 331 * I'm not convinced that all the protocols can be easily modified to do 332 * this. 333 * 334 * soalloc() returns a socket with a ref count of 0. 
335 */ 336static struct socket * 337soalloc(struct vnet *vnet) 338{ 339 struct socket *so; 340 341 so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO); 342 if (so == NULL) 343 return (NULL); 344#ifdef MAC 345 if (mac_socket_init(so, M_NOWAIT) != 0) { 346 uma_zfree(socket_zone, so); 347 return (NULL); 348 } 349#endif 350 SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd"); 351 SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv"); 352 sx_init(&so->so_snd.sb_sx, "so_snd_sx"); 353 sx_init(&so->so_rcv.sb_sx, "so_rcv_sx"); 354 TAILQ_INIT(&so->so_aiojobq); 355 mtx_lock(&so_global_mtx); 356 so->so_gencnt = ++so_gencnt; 357 ++numopensockets; 358#ifdef VIMAGE 359 VNET_ASSERT(vnet != NULL, ("%s:%d vnet is NULL, so=%p", 360 __func__, __LINE__, so)); 361 vnet->vnet_sockcnt++; 362 so->so_vnet = vnet; 363#endif 364 mtx_unlock(&so_global_mtx); 365 return (so); 366} 367 368/* 369 * Free the storage associated with a socket at the socket layer, tear down 370 * locks, labels, etc. All protocol state is assumed already to have been 371 * torn down (and possibly never set up) by the caller. 372 */ 373static void 374sodealloc(struct socket *so) 375{ 376 377 KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count)); 378 KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL")); 379 380 mtx_lock(&so_global_mtx); 381 so->so_gencnt = ++so_gencnt; 382 --numopensockets; /* Could be below, but faster here. */ 383#ifdef VIMAGE 384 VNET_ASSERT(so->so_vnet != NULL, ("%s:%d so_vnet is NULL, so=%p", 385 __func__, __LINE__, so)); 386 so->so_vnet->vnet_sockcnt--; 387#endif 388 mtx_unlock(&so_global_mtx); 389 if (so->so_rcv.sb_hiwat) 390 (void)chgsbsize(so->so_cred->cr_uidinfo, 391 &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY); 392 if (so->so_snd.sb_hiwat) 393 (void)chgsbsize(so->so_cred->cr_uidinfo, 394 &so->so_snd.sb_hiwat, 0, RLIM_INFINITY); 395#ifdef INET 396 /* remove acccept filter if one is present. 
*/ 397 if (so->so_accf != NULL) 398 do_setopt_accept_filter(so, NULL); 399#endif 400#ifdef MAC 401 mac_socket_destroy(so); 402#endif 403 crfree(so->so_cred); 404 sx_destroy(&so->so_snd.sb_sx); 405 sx_destroy(&so->so_rcv.sb_sx); 406 SOCKBUF_LOCK_DESTROY(&so->so_snd); 407 SOCKBUF_LOCK_DESTROY(&so->so_rcv); 408 uma_zfree(socket_zone, so); 409} 410 411/* 412 * socreate returns a socket with a ref count of 1. The socket should be 413 * closed with soclose(). 414 */ 415int 416socreate(int dom, struct socket **aso, int type, int proto, 417 struct ucred *cred, struct thread *td) 418{ 419 struct protosw *prp; 420 struct socket *so; 421 int error; 422 423 if (proto) 424 prp = pffindproto(dom, proto, type); 425 else 426 prp = pffindtype(dom, type); 427 428 if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL || 429 prp->pr_usrreqs->pru_attach == pru_attach_notsupp) 430 return (EPROTONOSUPPORT); 431 432 if (prison_check_af(cred, prp->pr_domain->dom_family) != 0) 433 return (EPROTONOSUPPORT); 434 435 if (prp->pr_type != type) 436 return (EPROTOTYPE); 437 so = soalloc(CRED_TO_VNET(cred)); 438 if (so == NULL) 439 return (ENOBUFS); 440 441 TAILQ_INIT(&so->so_incomp); 442 TAILQ_INIT(&so->so_comp); 443 so->so_type = type; 444 so->so_cred = crhold(cred); 445 if ((prp->pr_domain->dom_family == PF_INET) || 446 (prp->pr_domain->dom_family == PF_INET6) || 447 (prp->pr_domain->dom_family == PF_ROUTE)) 448 so->so_fibnum = td->td_proc->p_fibnum; 449 else 450 so->so_fibnum = 0; 451 so->so_proto = prp; 452#ifdef MAC 453 mac_socket_create(cred, so); 454#endif 455 knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv)); 456 knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd)); 457 so->so_count = 1; 458 /* 459 * Auto-sizing of socket buffers is managed by the protocols and 460 * the appropriate flags must be set in the pru_attach function. 
461 */ 462 CURVNET_SET(so->so_vnet); 463 error = (*prp->pr_usrreqs->pru_attach)(so, proto, td); 464 CURVNET_RESTORE(); 465 if (error) { 466 KASSERT(so->so_count == 1, ("socreate: so_count %d", 467 so->so_count)); 468 so->so_count = 0; 469 sodealloc(so); 470 return (error); 471 } 472 *aso = so; 473 return (0); 474} 475 476#ifdef REGRESSION 477static int regression_sonewconn_earlytest = 1; 478SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW, 479 ®ression_sonewconn_earlytest, 0, "Perform early sonewconn limit test"); 480#endif 481 482/* 483 * When an attempt at a new connection is noted on a socket which accepts 484 * connections, sonewconn is called. If the connection is possible (subject 485 * to space constraints, etc.) then we allocate a new structure, propoerly 486 * linked into the data structure of the original socket, and return this. 487 * Connstatus may be 0, or SO_ISCONFIRMING, or SO_ISCONNECTED. 488 * 489 * Note: the ref count on the socket is 0 on return. 490 */ 491struct socket * 492sonewconn(struct socket *head, int connstatus) 493{ 494 struct socket *so; 495 int over; 496 497 ACCEPT_LOCK(); 498 over = (head->so_qlen > 3 * head->so_qlimit / 2); 499 ACCEPT_UNLOCK(); 500#ifdef REGRESSION 501 if (regression_sonewconn_earlytest && over) { 502#else 503 if (over) { 504#endif 505 log(LOG_DEBUG, "%s: pcb %p: Listen queue overflow: " 506 "%i already in queue awaiting acceptance\n", 507 __func__, head->so_pcb, over); 508 return (NULL); 509 } 510 VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p", 511 __func__, __LINE__, head)); 512 so = soalloc(head->so_vnet); 513 if (so == NULL) { 514 log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: " 515 "limit reached or out of memory\n", 516 __func__, head->so_pcb); 517 return (NULL); 518 } 519 if ((head->so_options & SO_ACCEPTFILTER) != 0) 520 connstatus = 0; 521 so->so_head = head; 522 so->so_type = head->so_type; 523 so->so_options = head->so_options &~ SO_ACCEPTCONN; 524 
so->so_linger = head->so_linger; 525 so->so_state = head->so_state | SS_NOFDREF; 526 so->so_fibnum = head->so_fibnum; 527 so->so_proto = head->so_proto; 528 so->so_cred = crhold(head->so_cred); 529#ifdef MAC 530 mac_socket_newconn(head, so); 531#endif 532 knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv)); 533 knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd)); 534 VNET_SO_ASSERT(head); 535 if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) { 536 sodealloc(so); 537 log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n", 538 __func__, head->so_pcb); 539 return (NULL); 540 } 541 if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) { 542 sodealloc(so); 543 log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n", 544 __func__, head->so_pcb); 545 return (NULL); 546 } 547 so->so_rcv.sb_lowat = head->so_rcv.sb_lowat; 548 so->so_snd.sb_lowat = head->so_snd.sb_lowat; 549 so->so_rcv.sb_timeo = head->so_rcv.sb_timeo; 550 so->so_snd.sb_timeo = head->so_snd.sb_timeo; 551 so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE; 552 so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE; 553 so->so_state |= connstatus; 554 ACCEPT_LOCK(); 555 /* 556 * The accept socket may be tearing down but we just 557 * won a race on the ACCEPT_LOCK. 558 */ 559 if (!(head->so_options & SO_ACCEPTCONN)) { 560 SOCK_LOCK(so); 561 so->so_head = NULL; 562 sofree(so); /* NB: returns ACCEPT_UNLOCK'ed. */ 563 return (NULL); 564 } 565 if (connstatus) { 566 TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); 567 so->so_qstate |= SQ_COMP; 568 head->so_qlen++; 569 } else { 570 /* 571 * Keep removing sockets from the head until there's room for 572 * us to insert on the tail. In pre-locking revisions, this 573 * was a simple if(), but as we could be racing with other 574 * threads and soabort() requires dropping locks, we must 575 * loop waiting for the condition to be true. 
576 */ 577 while (head->so_incqlen > head->so_qlimit) { 578 struct socket *sp; 579 sp = TAILQ_FIRST(&head->so_incomp); 580 TAILQ_REMOVE(&head->so_incomp, sp, so_list); 581 head->so_incqlen--; 582 sp->so_qstate &= ~SQ_INCOMP; 583 sp->so_head = NULL; 584 ACCEPT_UNLOCK(); 585 soabort(sp); 586 ACCEPT_LOCK(); 587 } 588 TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list); 589 so->so_qstate |= SQ_INCOMP; 590 head->so_incqlen++; 591 } 592 ACCEPT_UNLOCK(); 593 if (connstatus) { 594 sorwakeup(head); 595 wakeup_one(&head->so_timeo); 596 } 597 return (so); 598} 599 600int 601sobind(struct socket *so, struct sockaddr *nam, struct thread *td) 602{ 603 int error; 604 605 CURVNET_SET(so->so_vnet); 606 error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td); 607 CURVNET_RESTORE(); 608 return error; 609} 610 611/* 612 * solisten() transitions a socket from a non-listening state to a listening 613 * state, but can also be used to update the listen queue depth on an 614 * existing listen socket. The protocol will call back into the sockets 615 * layer using solisten_proto_check() and solisten_proto() to check and set 616 * socket-layer listen state. Call backs are used so that the protocol can 617 * acquire both protocol and socket layer locks in whatever order is required 618 * by the protocol. 619 * 620 * Protocol implementors are advised to hold the socket lock across the 621 * socket-layer test and set to avoid races at the socket layer. 
622 */ 623int 624solisten(struct socket *so, int backlog, struct thread *td) 625{ 626 int error; 627 628 CURVNET_SET(so->so_vnet); 629 error = (*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td); 630 CURVNET_RESTORE(); 631 return error; 632} 633 634int 635solisten_proto_check(struct socket *so) 636{ 637 638 SOCK_LOCK_ASSERT(so); 639 640 if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | 641 SS_ISDISCONNECTING)) 642 return (EINVAL); 643 return (0); 644} 645 646void 647solisten_proto(struct socket *so, int backlog) 648{ 649 650 SOCK_LOCK_ASSERT(so); 651 652 if (backlog < 0 || backlog > somaxconn) 653 backlog = somaxconn; 654 so->so_qlimit = backlog; 655 so->so_options |= SO_ACCEPTCONN; 656} 657 658/* 659 * Evaluate the reference count and named references on a socket; if no 660 * references remain, free it. This should be called whenever a reference is 661 * released, such as in sorele(), but also when named reference flags are 662 * cleared in socket or protocol code. 663 * 664 * sofree() will free the socket if: 665 * 666 * - There are no outstanding file descriptor references or related consumers 667 * (so_count == 0). 668 * 669 * - The socket has been closed by user space, if ever open (SS_NOFDREF). 670 * 671 * - The protocol does not have an outstanding strong reference on the socket 672 * (SS_PROTOREF). 673 * 674 * - The socket is not in a completed connection queue, so a process has been 675 * notified that it is present. If it is removed, the user process may 676 * block in accept() despite select() saying the socket was ready. 
677 */ 678void 679sofree(struct socket *so) 680{ 681 struct protosw *pr = so->so_proto; 682 struct socket *head; 683 684 ACCEPT_LOCK_ASSERT(); 685 SOCK_LOCK_ASSERT(so); 686 687 if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 || 688 (so->so_state & SS_PROTOREF) || (so->so_qstate & SQ_COMP)) { 689 SOCK_UNLOCK(so); 690 ACCEPT_UNLOCK(); 691 return; 692 } 693 694 head = so->so_head; 695 if (head != NULL) { 696 KASSERT((so->so_qstate & SQ_COMP) != 0 || 697 (so->so_qstate & SQ_INCOMP) != 0, 698 ("sofree: so_head != NULL, but neither SQ_COMP nor " 699 "SQ_INCOMP")); 700 KASSERT((so->so_qstate & SQ_COMP) == 0 || 701 (so->so_qstate & SQ_INCOMP) == 0, 702 ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP")); 703 TAILQ_REMOVE(&head->so_incomp, so, so_list); 704 head->so_incqlen--; 705 so->so_qstate &= ~SQ_INCOMP; 706 so->so_head = NULL; 707 } 708 KASSERT((so->so_qstate & SQ_COMP) == 0 && 709 (so->so_qstate & SQ_INCOMP) == 0, 710 ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)", 711 so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP)); 712 if (so->so_options & SO_ACCEPTCONN) { 713 KASSERT((TAILQ_EMPTY(&so->so_comp)), ("sofree: so_comp populated")); 714 KASSERT((TAILQ_EMPTY(&so->so_incomp)), ("sofree: so_incomp populated")); 715 } 716 SOCK_UNLOCK(so); 717 ACCEPT_UNLOCK(); 718 719 VNET_SO_ASSERT(so); 720 if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL) 721 (*pr->pr_domain->dom_dispose)(so->so_rcv.sb_mb); 722 if (pr->pr_usrreqs->pru_detach != NULL) 723 (*pr->pr_usrreqs->pru_detach)(so); 724 725 /* 726 * From this point on, we assume that no other references to this 727 * socket exist anywhere else in the stack. Therefore, no locks need 728 * to be acquired or held. 729 * 730 * We used to do a lot of socket buffer and socket locking here, as 731 * well as invoke sorflush() and perform wakeups. The direct call to 732 * dom_dispose() and sbrelease_internal() are an inlining of what was 733 * necessary from sorflush(). 
734 * 735 * Notice that the socket buffer and kqueue state are torn down 736 * before calling pru_detach. This means that protocols shold not 737 * assume they can perform socket wakeups, etc, in their detach code. 738 */ 739 sbdestroy(&so->so_snd, so); 740 sbdestroy(&so->so_rcv, so); 741 seldrain(&so->so_snd.sb_sel); 742 seldrain(&so->so_rcv.sb_sel); 743 knlist_destroy(&so->so_rcv.sb_sel.si_note); 744 knlist_destroy(&so->so_snd.sb_sel.si_note); 745 sodealloc(so); 746} 747 748/* 749 * Close a socket on last file table reference removal. Initiate disconnect 750 * if connected. Free socket when disconnect complete. 751 * 752 * This function will sorele() the socket. Note that soclose() may be called 753 * prior to the ref count reaching zero. The actual socket structure will 754 * not be freed until the ref count reaches zero. 755 */ 756int 757soclose(struct socket *so) 758{ 759 int error = 0; 760 761 KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter")); 762 763 CURVNET_SET(so->so_vnet); 764 funsetown(&so->so_sigio); 765 if (so->so_state & SS_ISCONNECTED) { 766 if ((so->so_state & SS_ISDISCONNECTING) == 0) { 767 error = sodisconnect(so); 768 if (error) { 769 if (error == ENOTCONN) 770 error = 0; 771 goto drop; 772 } 773 } 774 if (so->so_options & SO_LINGER) { 775 if ((so->so_state & SS_ISDISCONNECTING) && 776 (so->so_state & SS_NBIO)) 777 goto drop; 778 while (so->so_state & SS_ISCONNECTED) { 779 error = tsleep(&so->so_timeo, 780 PSOCK | PCATCH, "soclos", so->so_linger * hz); 781 if (error) 782 break; 783 } 784 } 785 } 786 787drop: 788 if (so->so_proto->pr_usrreqs->pru_close != NULL) 789 (*so->so_proto->pr_usrreqs->pru_close)(so); 790 ACCEPT_LOCK(); 791 if (so->so_options & SO_ACCEPTCONN) { 792 struct socket *sp; 793 /* 794 * Prevent new additions to the accept queues due 795 * to ACCEPT_LOCK races while we are draining them. 
796 */ 797 so->so_options &= ~SO_ACCEPTCONN; 798 while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) { 799 TAILQ_REMOVE(&so->so_incomp, sp, so_list); 800 so->so_incqlen--; 801 sp->so_qstate &= ~SQ_INCOMP; 802 sp->so_head = NULL; 803 ACCEPT_UNLOCK(); 804 soabort(sp); 805 ACCEPT_LOCK(); 806 } 807 while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) { 808 TAILQ_REMOVE(&so->so_comp, sp, so_list); 809 so->so_qlen--; 810 sp->so_qstate &= ~SQ_COMP; 811 sp->so_head = NULL; 812 ACCEPT_UNLOCK(); 813 soabort(sp); 814 ACCEPT_LOCK(); 815 } 816 KASSERT((TAILQ_EMPTY(&so->so_comp)), 817 ("%s: so_comp populated", __func__)); 818 KASSERT((TAILQ_EMPTY(&so->so_incomp)), 819 ("%s: so_incomp populated", __func__)); 820 } 821 SOCK_LOCK(so); 822 KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF")); 823 so->so_state |= SS_NOFDREF; 824 sorele(so); /* NB: Returns with ACCEPT_UNLOCK(). */ 825 CURVNET_RESTORE(); 826 return (error); 827} 828 829/* 830 * soabort() is used to abruptly tear down a connection, such as when a 831 * resource limit is reached (listen queue depth exceeded), or if a listen 832 * socket is closed while there are sockets waiting to be accepted. 833 * 834 * This interface is tricky, because it is called on an unreferenced socket, 835 * and must be called only by a thread that has actually removed the socket 836 * from the listen queue it was on, or races with other threads are risked. 837 * 838 * This interface will call into the protocol code, so must not be called 839 * with any socket locks held. Protocols do call it while holding their own 840 * recursible protocol mutexes, but this is something that should be subject 841 * to review in the future. 842 */ 843void 844soabort(struct socket *so) 845{ 846 847 /* 848 * In as much as is possible, assert that no references to this 849 * socket are held. This is not quite the same as asserting that the 850 * current thread is responsible for arranging for no references, but 851 * is as close as we can get for now. 
852 */ 853 KASSERT(so->so_count == 0, ("soabort: so_count")); 854 KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF")); 855 KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF")); 856 KASSERT((so->so_state & SQ_COMP) == 0, ("soabort: SQ_COMP")); 857 KASSERT((so->so_state & SQ_INCOMP) == 0, ("soabort: SQ_INCOMP")); 858 VNET_SO_ASSERT(so); 859 860 if (so->so_proto->pr_usrreqs->pru_abort != NULL) 861 (*so->so_proto->pr_usrreqs->pru_abort)(so); 862 ACCEPT_LOCK(); 863 SOCK_LOCK(so); 864 sofree(so); 865} 866 867int 868soaccept(struct socket *so, struct sockaddr **nam) 869{ 870 int error; 871 872 SOCK_LOCK(so); 873 KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF")); 874 so->so_state &= ~SS_NOFDREF; 875 SOCK_UNLOCK(so); 876 877 CURVNET_SET(so->so_vnet); 878 error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam); 879 CURVNET_RESTORE(); 880 return (error); 881} 882 883int 884soconnect(struct socket *so, struct sockaddr *nam, struct thread *td) 885{ 886 int error; 887 888 if (so->so_options & SO_ACCEPTCONN) 889 return (EOPNOTSUPP); 890 891 CURVNET_SET(so->so_vnet); 892 /* 893 * If protocol is connection-based, can only connect once. 894 * Otherwise, if connected, try to disconnect first. This allows 895 * user to disconnect by connecting to, e.g., a null address. 896 */ 897 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) && 898 ((so->so_proto->pr_flags & PR_CONNREQUIRED) || 899 (error = sodisconnect(so)))) { 900 error = EISCONN; 901 } else { 902 /* 903 * Prevent accumulated error from previous connection from 904 * biting us. 
905 */ 906 so->so_error = 0; 907 error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td); 908 } 909 CURVNET_RESTORE(); 910 911 return (error); 912} 913 914int 915soconnect2(struct socket *so1, struct socket *so2) 916{ 917 int error; 918 919 CURVNET_SET(so1->so_vnet); 920 error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2); 921 CURVNET_RESTORE(); 922 return (error); 923} 924 925int 926sodisconnect(struct socket *so) 927{ 928 int error; 929 930 if ((so->so_state & SS_ISCONNECTED) == 0) 931 return (ENOTCONN); 932 if (so->so_state & SS_ISDISCONNECTING) 933 return (EALREADY); 934 VNET_SO_ASSERT(so); 935 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so); 936 return (error); 937} 938 939#ifdef SOCKET_SEND_COW 940struct so_zerocopy_stats{ 941 int size_ok; 942 int align_ok; 943 int found_ifp; 944}; 945struct so_zerocopy_stats so_zerocp_stats = {0,0,0}; 946 947/* 948 * sosend_copyin() is only used if zero copy sockets are enabled. Otherwise 949 * sosend_dgram() and sosend_generic() use m_uiotombuf(). 950 * 951 * sosend_copyin() accepts a uio and prepares an mbuf chain holding part or 952 * all of the data referenced by the uio. If desired, it uses zero-copy. 953 * *space will be updated to reflect data copied in. 954 * 955 * NB: If atomic I/O is requested, the caller must already have checked that 956 * space can hold resid bytes. 957 * 958 * NB: In the event of an error, the caller may need to free the partial 959 * chain pointed to by *mpp. The contents of both *uio and *space may be 960 * modified even in the case of an error. 
 */
static int
sosend_copyin(struct uio *uio, struct mbuf **retmp, int atomic, long *space,
    int flags)
{
	struct mbuf *m, **mp, *top;	/* mp chases the chain's tail link */
	long len;
	ssize_t resid;
	int error;
	int cow_send;			/* nonzero: this mbuf was COW-mapped */

	*retmp = top = NULL;
	mp = &top;
	len = 0;
	resid = uio->uio_resid;
	error = 0;
	do {
		cow_send = 0;
		if (resid >= MINCLSIZE) {
			/* First mbuf of the chain carries the pkthdr. */
			if (top == NULL) {
				m = m_gethdr(M_WAITOK, MT_DATA);
				m->m_pkthdr.len = 0;
				m->m_pkthdr.rcvif = NULL;
			} else
				m = m_get(M_WAITOK, MT_DATA);
			/*
			 * Try the zero-copy path only for at least a full
			 * page of payload, buffer space, and iovec run.
			 */
			if (so_zero_copy_send &&
			    resid >= PAGE_SIZE &&
			    *space >= PAGE_SIZE &&
			    uio->uio_iov->iov_len >= PAGE_SIZE) {
				so_zerocp_stats.size_ok++;
				so_zerocp_stats.align_ok++;
				/* socow_setup() returns bytes mapped (0 on
				 * failure), in which case we fall back to a
				 * plain cluster copy below. */
				cow_send = socow_setup(m, uio);
				len = cow_send;
			}
			if (!cow_send) {
				m_clget(m, M_WAITOK);
				len = min(min(MCLBYTES, resid), *space);
			}
		} else {
			if (top == NULL) {
				m = m_gethdr(M_WAIT, MT_DATA);
				m->m_pkthdr.len = 0;
				m->m_pkthdr.rcvif = NULL;

				len = min(min(MHLEN, resid), *space);
				/*
				 * For datagram protocols, leave room
				 * for protocol headers in first mbuf.
				 */
				if (atomic && m && len < MHLEN)
					MH_ALIGN(m, len);
			} else {
				m = m_get(M_WAIT, MT_DATA);
				len = min(min(MLEN, resid), *space);
			}
		}
		if (m == NULL) {
			error = ENOBUFS;
			goto out;
		}

		*space -= len;
		/* COW already attached the data; otherwise copy it in. */
		if (cow_send)
			error = 0;
		else
			error = uiomove(mtod(m, void *), (int)len, uio);
		resid = uio->uio_resid;
		m->m_len = len;
		*mp = m;
		top->m_pkthdr.len += len;
		if (error)
			goto out;
		mp = &m->m_next;
		if (resid <= 0) {
			if (flags & MSG_EOR)
				top->m_flags |= M_EOR;
			break;
		}
	} while (*space > 0 && atomic);
out:
	/* On error the caller frees the partial chain returned here. */
	*retmp = top;
	return (error);
}
#endif /* SOCKET_SEND_COW */

/* Map MSG_DONTWAIT onto the sblock() wait flag. */
#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? \
0 : SBL_WAIT)

/*
 * Fast-path send for SOCK_DGRAM sockets with PR_ATOMIC protocols (asserted
 * below).  Unlike sosend_generic(), it performs a single pass: if the
 * datagram does not fit in the send buffer it fails with EMSGSIZE rather
 * than blocking, and it does not take the sblock() sleep lock.
 */
int
sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
	long space;
	ssize_t resid;
	int clen = 0, error, dontroute;
#ifdef SOCKET_SEND_COW
	int atomic = sosendallatonce(so) || top;
#endif

	KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM"));
	KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
	    ("sosend_dgram: !PR_ATOMIC"));

	/* Data may come either from userland (uio) or a caller mbuf chain. */
	if (uio != NULL)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned.  However, space must be
	 * signed, as it might be less than 0 if we over-committed, and we
	 * must use a signed comparison of space and resid.  On the other
	 * hand, a negative resid causes us to loop sending 0-length
	 * segments to the protocol.
	 */
	if (resid < 0) {
		error = EINVAL;
		goto out;
	}

	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0;
	if (td != NULL)
		td->td_ru.ru_msgsnd++;
	if (control != NULL)
		clen = control->m_len;

	SOCKBUF_LOCK(&so->so_snd);
	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
		SOCKBUF_UNLOCK(&so->so_snd);
		error = EPIPE;
		goto out;
	}
	if (so->so_error) {
		error = so->so_error;
		so->so_error = 0;
		SOCKBUF_UNLOCK(&so->so_snd);
		goto out;
	}
	if ((so->so_state & SS_ISCONNECTED) == 0) {
		/*
		 * `sendto' and `sendmsg' is allowed on a connection-based
		 * socket if it supports implied connect.  Return ENOTCONN if
		 * not connected and no address is supplied.
		 */
		if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
		    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
			if ((so->so_state & SS_ISCONFIRMING) == 0 &&
			    !(resid == 0 && clen != 0)) {
				SOCKBUF_UNLOCK(&so->so_snd);
				error = ENOTCONN;
				goto out;
			}
		} else if (addr == NULL) {
			if (so->so_proto->pr_flags & PR_CONNREQUIRED)
				error = ENOTCONN;
			else
				error = EDESTADDRREQ;
			SOCKBUF_UNLOCK(&so->so_snd);
			goto out;
		}
	}

	/*
	 * Do we need MSG_OOB support in SOCK_DGRAM?  Signs here may be a
	 * problem and need fixing.
	 */
	space = sbspace(&so->so_snd);
	if (flags & MSG_OOB)
		space += 1024;
	space -= clen;
	SOCKBUF_UNLOCK(&so->so_snd);
	/* Single-shot: a datagram that doesn't fit is an error, not a wait. */
	if (resid > space) {
		error = EMSGSIZE;
		goto out;
	}
	if (uio == NULL) {
		resid = 0;
		if (flags & MSG_EOR)
			top->m_flags |= M_EOR;
	} else {
#ifdef SOCKET_SEND_COW
		error = sosend_copyin(uio, &top, atomic, &space, flags);
		if (error)
			goto out;
#else
		/*
		 * Copy the data from userland into a mbuf chain.
		 * If no data is to be copied in, a single empty mbuf
		 * is returned.
		 */
		top = m_uiotombuf(uio, M_WAITOK, space, max_hdr,
		    (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0)));
		if (top == NULL) {
			error = EFAULT;	/* only possible error */
			goto out;
		}
		space -= resid - uio->uio_resid;
#endif /* SOCKET_SEND_COW */
		resid = uio->uio_resid;
	}
	KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
	/*
	 * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock
	 * than with.
	 */
	if (dontroute) {
		SOCK_LOCK(so);
		so->so_options |= SO_DONTROUTE;
		SOCK_UNLOCK(so);
	}
	/*
	 * XXX all the SBS_CANTSENDMORE checks previously done could be out
	 * of date.  We could have received a reset packet in an interrupt or
	 * maybe we slept while doing page faults in uiomove() etc.  We could
	 * probably recheck again inside the locking protection here, but
	 * there are probably other places that this also happens.  We must
	 * rethink this.
	 */
	VNET_SO_ASSERT(so);
	error = (*so->so_proto->pr_usrreqs->pru_send)(so,
	    (flags & MSG_OOB) ? PRUS_OOB :
	/*
	 * If the user set MSG_EOF, the protocol understands this flag and
	 * nothing left to send then use PRU_SEND_EOF instead of PRU_SEND.
	 */
	    ((flags & MSG_EOF) &&
	     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
	     (resid <= 0)) ?
		PRUS_EOF :
	/* If there is more to send set PRUS_MORETOCOME */
	    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
	    top, addr, control, td);
	if (dontroute) {
		SOCK_LOCK(so);
		so->so_options &= ~SO_DONTROUTE;
		SOCK_UNLOCK(so);
	}
	/* Ownership of top and control passed to the protocol above. */
	clen = 0;
	control = NULL;
	top = NULL;
out:
	if (top != NULL)
		m_freem(top);
	if (control != NULL)
		m_freem(control);
	return (error);
}

/*
 * Send on a socket.  If send must go all at once and message is larger than
 * send buffering, then hard error.  Lock against other senders.  If must go
 * all at once and not enough room now, then inform user that this would
 * block and do nothing.  Otherwise, if nonblocking, send as much as
 * possible.  The data to be sent is described by "uio" if nonzero, otherwise
 * by the mbuf chain "top" (which must be null if uio is not).  Data provided
 * in mbuf chain must be small enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers must check for short
 * counts if EINTR/ERESTART are returned.  Data and control buffers are freed
 * on return.
 */
int
sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
	long space;
	ssize_t resid;
	int clen = 0, error, dontroute;
	/* atomic: record-oriented send, or caller supplied a whole chain. */
	int atomic = sosendallatonce(so) || top;

	if (uio != NULL)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned.  However, space must be
	 * signed, as it might be less than 0 if we over-committed, and we
	 * must use a signed comparison of space and resid.  On the other
	 * hand, a negative resid causes us to loop sending 0-length
	 * segments to the protocol.
	 *
	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
	 * type sockets since that's an error.
	 */
	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
		error = EINVAL;
		goto out;
	}

	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	if (td != NULL)
		td->td_ru.ru_msgsnd++;
	if (control != NULL)
		clen = control->m_len;

	/* Serialize against other senders for the duration of the call. */
	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out;

restart:
	/* Outer loop: one pass per buffer-space check / protocol hand-off. */
	do {
		SOCKBUF_LOCK(&so->so_snd);
		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = EPIPE;
			goto release;
		}
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			SOCKBUF_UNLOCK(&so->so_snd);
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			/*
			 * `sendto' and `sendmsg' is allowed on a connection-
			 * based socket if it supports implied connect.
			 * Return ENOTCONN if not connected and no address is
			 * supplied.
			 */
			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
				    !(resid == 0 && clen != 0)) {
					SOCKBUF_UNLOCK(&so->so_snd);
					error = ENOTCONN;
					goto release;
				}
			} else if (addr == NULL) {
				SOCKBUF_UNLOCK(&so->so_snd);
				if (so->so_proto->pr_flags & PR_CONNREQUIRED)
					error = ENOTCONN;
				else
					error = EDESTADDRREQ;
				goto release;
			}
		}
		space = sbspace(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		/* A record that can never fit is a hard error. */
		if ((atomic && resid > so->so_snd.sb_hiwat) ||
		    clen > so->so_snd.sb_hiwat) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = EMSGSIZE;
			goto release;
		}
		if (space < resid + clen &&
		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
			if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) {
				SOCKBUF_UNLOCK(&so->so_snd);
				error = EWOULDBLOCK;
				goto release;
			}
			/* Sleep for buffer space, then re-check everything. */
			error = sbwait(&so->so_snd);
			SOCKBUF_UNLOCK(&so->so_snd);
			if (error)
				goto release;
			goto restart;
		}
		SOCKBUF_UNLOCK(&so->so_snd);
		space -= clen;
		/* Inner loop: fill mbufs and push them to the protocol. */
		do {
			if (uio == NULL) {
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else {
#ifdef SOCKET_SEND_COW
				error = sosend_copyin(uio, &top, atomic,
				    &space, flags);
				if (error != 0)
					goto release;
#else
				/*
				 * Copy the data from userland into a mbuf
				 * chain.  If no data is to be copied in,
				 * a single empty mbuf is returned.
				 */
				top = m_uiotombuf(uio, M_WAITOK, space,
				    (atomic ? max_hdr : 0),
				    (atomic ? M_PKTHDR : 0) |
				    ((flags & MSG_EOR) ? M_EOR : 0));
				if (top == NULL) {
					error = EFAULT;	/* only possible error */
					goto release;
				}
				space -= resid - uio->uio_resid;
#endif /* SOCKET_SEND_COW */
				resid = uio->uio_resid;
			}
			if (dontroute) {
				SOCK_LOCK(so);
				so->so_options |= SO_DONTROUTE;
				SOCK_UNLOCK(so);
			}
			/*
			 * XXX all the SBS_CANTSENDMORE checks previously
			 * done could be out of date.  We could have received
			 * a reset packet in an interrupt or maybe we slept
			 * while doing page faults in uiomove() etc.  We
			 * could probably recheck again inside the locking
			 * protection here, but there are probably other
			 * places that this also happens.  We must rethink
			 * this.
			 */
			VNET_SO_ASSERT(so);
			error = (*so->so_proto->pr_usrreqs->pru_send)(so,
			    (flags & MSG_OOB) ? PRUS_OOB :
			/*
			 * If the user set MSG_EOF, the protocol understands
			 * this flag and nothing left to send then use
			 * PRU_SEND_EOF instead of PRU_SEND.
			 */
			    ((flags & MSG_EOF) &&
			     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
			     (resid <= 0)) ?
				PRUS_EOF :
			/* If there is more to send set PRUS_MORETOCOME. */
			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
			    top, addr, control, td);
			if (dontroute) {
				SOCK_LOCK(so);
				so->so_options &= ~SO_DONTROUTE;
				SOCK_UNLOCK(so);
			}
			/* Ownership of top/control passed to the protocol. */
			clen = 0;
			control = NULL;
			top = NULL;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	sbunlock(&so->so_snd);
out:
	if (top != NULL)
		m_freem(top);
	if (control != NULL)
		m_freem(control);
	return (error);
}

/*
 * Send entry point: dispatch to the protocol's pru_sosend under the
 * socket's vnet context.
 */
int
sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
	int error;

	CURVNET_SET(so->so_vnet);
	error = so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top,
	    control, flags, td);
	CURVNET_RESTORE();
	return (error);
}

/*
 * The part of soreceive() that implements reading non-inline out-of-band
 * data from a socket.  For more complete comments, see soreceive(), from
 * which this code originated.
 *
 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
 * unable to return an mbuf chain to the caller.
 */
static int
soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
{
	struct protosw *pr = so->so_proto;
	struct mbuf *m;
	int error;

	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
	VNET_SO_ASSERT(so);

	/* The protocol fills this mbuf with the pending OOB byte(s). */
	m = m_get(M_WAIT, MT_DATA);
	error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
	if (error)
		goto bad;
	do {
#ifdef SOCKET_RECV_PFLIP
		if (so_zero_copy_receive) {
			int disposable;

			if ((m->m_flags & M_EXT)
			 && (m->m_ext.ext_type == EXT_DISPOSABLE))
				disposable = 1;
			else
				disposable = 0;

			error = uiomoveco(mtod(m, void *),
			    min(uio->uio_resid, m->m_len),
			    uio, disposable);
		} else
#endif /* SOCKET_RECV_PFLIP */
		error = uiomove(mtod(m, void *),
		    (int) min(uio->uio_resid, m->m_len), uio);
		/* m_free() returns the next mbuf of the chain, if any. */
		m = m_free(m);
	} while (uio->uio_resid && error == 0 && m);
bad:
	if (m != NULL)
		m_freem(m);
	return (error);
}

/*
 * Following replacement or removal of the first mbuf on the first mbuf chain
 * of a socket buffer, push necessary state changes back into the socket
 * buffer so that other consumers see the values consistently.  'nextrecord'
 * is the callers locally stored value of the original value of
 * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
 * NOTE: 'nextrecord' may be NULL.
 */
static __inline void
sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
{

	SOCKBUF_LOCK_ASSERT(sb);
	/*
	 * First, update for the new value of nextrecord.  If necessary, make
	 * it the first record.
	 */
	if (sb->sb_mb != NULL)
		sb->sb_mb->m_nextpkt = nextrecord;
	else
		sb->sb_mb = nextrecord;

	/*
	 * Now update any dependent socket buffer fields to reflect the new
	 * state.  This is an expanded inline of SB_EMPTY_FIXUP(), with the
	 * addition of a second clause that takes care of the case where
	 * sb_mb has been updated, but remains the last record.
	 */
	if (sb->sb_mb == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (sb->sb_mb->m_nextpkt == NULL)
		sb->sb_lastrecord = sb->sb_mb;
}


/*
 * Implement receive operations on a socket.  We depend on the way that
 * records are added to the sockbuf by sbappend.  In particular, each record
 * (mbufs linked through m_next) must begin with an address if the protocol
 * so specifies, followed by an optional mbuf or mbufs containing ancillary
 * data, and then zero or more mbufs of data.  In order to allow parallelism
 * between network receive and copying to user space, as well as avoid
 * sleeping with a mutex held, we release the socket buffer mutex during the
 * user space copy.  Although the sockbuf is locked, new data may still be
 * appended, and thus we must maintain consistency of the sockbuf during that
 * time.
 *
 * The caller may receive the data as a single mbuf chain by supplying an
 * mbuf **mp0 for use in returning the chain.  The uio is then used only for
 * the count in uio_resid.
 */
int
soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	struct mbuf *m, **mp;
	int flags, error, offset;
	ssize_t len;
	struct protosw *pr = so->so_proto;
	struct mbuf *nextrecord;
	int moff, type = 0;
	/* orig_resid != 0 at the bottom means "got nothing yet; retry". */
	ssize_t orig_resid = uio->uio_resid;

	mp = mp0;
	if (psa != NULL)
		*psa = NULL;
	if (controlp != NULL)
		*controlp = NULL;
	if (flagsp != NULL)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB)
		return (soreceive_rcvoob(so, uio, flags));
	if (mp != NULL)
		*mp = NULL;
	if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING)
	    && uio->uio_resid) {
		VNET_SO_ASSERT(so);
		(*pr->pr_usrreqs->pru_rcvd)(so, 0);
	}

	/* Exclude other readers for the duration of the receive. */
	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
	if (error)
		return (error);

restart:
	SOCKBUF_LOCK(&so->so_rcv);
	m = so->so_rcv.sb_mb;
	/*
	 * If we have less data than requested, block awaiting more (subject
	 * to any timeout) if:
	 *   1. the current count is less than the low water mark, or
	 *   2. MSG_DONTWAIT is not set
	 */
	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < uio->uio_resid) &&
	    so->so_rcv.sb_cc < so->so_rcv.sb_lowat &&
	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
		KASSERT(m != NULL || !so->so_rcv.sb_cc,
		    ("receive: m == %p so->so_rcv.sb_cc == %u",
		    m, so->so_rcv.sb_cc));
		if (so->so_error) {
			if (m != NULL)
				goto dontblock;
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			SOCKBUF_UNLOCK(&so->so_rcv);
			goto release;
		}
		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
		if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
			if (m == NULL) {
				SOCKBUF_UNLOCK(&so->so_rcv);
				goto release;
			} else
				goto dontblock;
		}
		/* OOB or end-of-record data present: deliver what we have. */
		for (; m != NULL; m = m->m_next)
			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			SOCKBUF_UNLOCK(&so->so_rcv);
			error = ENOTCONN;
			goto release;
		}
		if (uio->uio_resid == 0) {
			SOCKBUF_UNLOCK(&so->so_rcv);
			goto release;
		}
		if ((so->so_state & SS_NBIO) ||
		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
			SOCKBUF_UNLOCK(&so->so_rcv);
			error = EWOULDBLOCK;
			goto release;
		}
		SBLASTRECORDCHK(&so->so_rcv);
		SBLASTMBUFCHK(&so->so_rcv);
		/* Sleep until data arrives, then re-evaluate from the top. */
		error = sbwait(&so->so_rcv);
		SOCKBUF_UNLOCK(&so->so_rcv);
		if (error)
			goto release;
		goto restart;
	}
dontblock:
	/*
	 * From this point onward, we maintain 'nextrecord' as a cache of the
	 * pointer to the next record in the socket buffer.  We must keep the
	 * various socket buffer pointers and local stack versions of the
	 * pointers in sync, pushing out modifications before dropping the
	 * socket buffer mutex, and re-reading them when picking it up.
	 *
	 * Otherwise, we will race with the network stack appending new data
	 * or records onto the socket buffer by using inconsistent/stale
	 * versions of the field, possibly resulting in socket buffer
	 * corruption.
	 *
	 * By holding the high-level sblock(), we prevent simultaneous
	 * readers from pulling off the front of the socket buffer.
	 */
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	if (uio->uio_td)
		uio->uio_td->td_ru.ru_msgrcv++;
	KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
	SBLASTRECORDCHK(&so->so_rcv);
	SBLASTMBUFCHK(&so->so_rcv);
	nextrecord = m->m_nextpkt;
	/* PR_ADDR protocols put an MT_SONAME mbuf at the record's head. */
	if (pr->pr_flags & PR_ADDR) {
		KASSERT(m->m_type == MT_SONAME,
		    ("m->m_type == %d", m->m_type));
		orig_resid = 0;
		if (psa != NULL)
			*psa = sodupsockaddr(mtod(m, struct sockaddr *),
			    M_NOWAIT);
		if (flags & MSG_PEEK) {
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			so->so_rcv.sb_mb = m_free(m);
			m = so->so_rcv.sb_mb;
			sockbuf_pushsync(&so->so_rcv, nextrecord);
		}
	}

	/*
	 * Process one or more MT_CONTROL mbufs present before any data mbufs
	 * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
	 * just copy the data; if !MSG_PEEK, we call into the protocol to
	 * perform externalization (or freeing if controlp == NULL).
	 */
	if (m != NULL && m->m_type == MT_CONTROL) {
		struct mbuf *cm = NULL, *cmn;
		struct mbuf **cme = &cm;

		do {
			if (flags & MSG_PEEK) {
				if (controlp != NULL) {
					*controlp = m_copy(m, 0, m->m_len);
					controlp = &(*controlp)->m_next;
				}
				m = m->m_next;
			} else {
				/* Unlink control mbufs onto the cm list. */
				sbfree(&so->so_rcv, m);
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = NULL;
				*cme = m;
				cme = &(*cme)->m_next;
				m = so->so_rcv.sb_mb;
			}
		} while (m != NULL && m->m_type == MT_CONTROL);
		if ((flags & MSG_PEEK) == 0)
			sockbuf_pushsync(&so->so_rcv, nextrecord);
		while (cm != NULL) {
			cmn = cm->m_next;
			cm->m_next = NULL;
			if (pr->pr_domain->dom_externalize != NULL) {
				/* Drop the sockbuf lock: may sleep. */
				SOCKBUF_UNLOCK(&so->so_rcv);
				VNET_SO_ASSERT(so);
				error = (*pr->pr_domain->dom_externalize)
				    (cm, controlp);
				SOCKBUF_LOCK(&so->so_rcv);
			} else if (controlp != NULL)
				*controlp = cm;
			else
				m_freem(cm);
			if (controlp != NULL) {
				orig_resid = 0;
				while (*controlp != NULL)
					controlp = &(*controlp)->m_next;
			}
			cm = cmn;
		}
		if (m != NULL)
			nextrecord = so->so_rcv.sb_mb->m_nextpkt;
		else
			nextrecord = so->so_rcv.sb_mb;
		orig_resid = 0;
	}
	if (m != NULL) {
		if ((flags & MSG_PEEK) == 0) {
			KASSERT(m->m_nextpkt == nextrecord,
			    ("soreceive: post-control, nextrecord !sync"));
			if (nextrecord == NULL) {
				KASSERT(so->so_rcv.sb_mb == m,
				    ("soreceive: post-control, sb_mb!=m"));
				KASSERT(so->so_rcv.sb_lastrecord == m,
				    ("soreceive: post-control, lastrecord!=m"));
			}
		}
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
	} else {
		if ((flags & MSG_PEEK) == 0) {
			KASSERT(so->so_rcv.sb_mb == nextrecord,
			    ("soreceive: sb_mb != nextrecord"));
			if (so->so_rcv.sb_mb == NULL) {
				KASSERT(so->so_rcv.sb_lastrecord == NULL,
				    ("soreceive: sb_lastercord != NULL"));
			}
		}
	}
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	SBLASTRECORDCHK(&so->so_rcv);
	SBLASTMBUFCHK(&so->so_rcv);

	/*
	 * Now continue to read any data mbufs off of the head of the socket
	 * buffer until the read request is satisfied.  Note that 'type' is
	 * used to store the type of any mbuf reads that have happened so far
	 * such that soreceive() can stop reading if the type changes, which
	 * causes soreceive() to return only one of regular data and inline
	 * out-of-band data in a single socket receive operation.
	 */
	moff = 0;
	offset = 0;
	while (m != NULL && uio->uio_resid > 0 && error == 0) {
		/*
		 * If the type of mbuf has changed since the last mbuf
		 * examined ('type'), end the receive operation.
		 */
		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
		if (m->m_type == MT_OOBDATA || m->m_type == MT_CONTROL) {
			if (type != m->m_type)
				break;
		} else if (type == MT_OOBDATA)
			break;
		else
			KASSERT(m->m_type == MT_DATA,
			    ("m->m_type == %d", m->m_type));
		so->so_rcv.sb_state &= ~SBS_RCVATMARK;
		len = uio->uio_resid;
		/* Don't read past the out-of-band mark. */
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * If mp is set, just pass back the mbufs.  Otherwise copy
		 * them out via the uio, then free.  Sockbuf must be
		 * consistent here (points to current mbuf, it points to next
		 * record) when we drop priority; we must note any additions
		 * to the sockbuf when we block interrupts again.
		 */
		if (mp == NULL) {
			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
			SBLASTRECORDCHK(&so->so_rcv);
			SBLASTMBUFCHK(&so->so_rcv);
			/* Drop the sockbuf lock across the user copy. */
			SOCKBUF_UNLOCK(&so->so_rcv);
#ifdef SOCKET_RECV_PFLIP
			if (so_zero_copy_receive) {
				int disposable;

				if ((m->m_flags & M_EXT)
				 && (m->m_ext.ext_type == EXT_DISPOSABLE))
					disposable = 1;
				else
					disposable = 0;

				error = uiomoveco(mtod(m, char *) + moff,
				    (int)len, uio,
				    disposable);
			} else
#endif /* SOCKET_RECV_PFLIP */
			error = uiomove(mtod(m, char *) + moff, (int)len, uio);
			SOCKBUF_LOCK(&so->so_rcv);
			if (error) {
				/*
				 * The MT_SONAME mbuf has already been removed
				 * from the record, so it is necessary to
				 * remove the data mbufs, if any, to preserve
				 * the invariant in the case of PR_ADDR that
				 * requires MT_SONAME mbufs at the head of
				 * each record.
				 */
				if (m && pr->pr_flags & PR_ATOMIC &&
				    ((flags & MSG_PEEK) == 0))
					(void)sbdroprecord_locked(&so->so_rcv);
				SOCKBUF_UNLOCK(&so->so_rcv);
				goto release;
			}
		} else
			uio->uio_resid -= len;
		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
		if (len == m->m_len - moff) {
			/* Consumed the whole mbuf. */
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(&so->so_rcv, m);
				if (mp != NULL) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = NULL;
				} else {
					so->so_rcv.sb_mb = m_free(m);
					m = so->so_rcv.sb_mb;
				}
				sockbuf_pushsync(&so->so_rcv, nextrecord);
				SBLASTRECORDCHK(&so->so_rcv);
				SBLASTMBUFCHK(&so->so_rcv);
			}
		} else {
			/* Partial mbuf consumed. */
			if (flags & MSG_PEEK)
				moff += len;
			else {
				if (mp != NULL) {
					int copy_flag;

					if (flags & MSG_DONTWAIT)
						copy_flag = M_DONTWAIT;
					else
						copy_flag = M_WAIT;
					if (copy_flag == M_WAIT)
						SOCKBUF_UNLOCK(&so->so_rcv);
					*mp = m_copym(m, 0, len, copy_flag);
					if (copy_flag == M_WAIT)
						SOCKBUF_LOCK(&so->so_rcv);
					if (*mp == NULL) {
						/*
						 * m_copym() couldn't
						 * allocate an mbuf.  Adjust
						 * uio_resid back (it was
						 * adjusted down by len
						 * bytes, which we didn't end
						 * up "copying" over).
						 */
						uio->uio_resid += len;
						break;
					}
				}
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
			}
		}
		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_rcv.sb_state |= SBS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket), we
		 * must not quit until "uio->uio_resid == 0" or an error
		 * termination.  If a signal/timeout occurs, return with a
		 * short count but without error.  Keep sockbuf locked
		 * against other readers.
		 */
		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
		    !sosendallatonce(so) && nextrecord == NULL) {
			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
			if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE)
				break;
			/*
			 * Notify the protocol that some data has been
			 * drained before blocking.
			 */
			if (pr->pr_flags & PR_WANTRCVD) {
				SOCKBUF_UNLOCK(&so->so_rcv);
				VNET_SO_ASSERT(so);
				(*pr->pr_usrreqs->pru_rcvd)(so, flags);
				SOCKBUF_LOCK(&so->so_rcv);
			}
			SBLASTRECORDCHK(&so->so_rcv);
			SBLASTMBUFCHK(&so->so_rcv);
			/*
			 * We could receive some data while was notifying
			 * the protocol.  Skip blocking in this case.
			 */
			if (so->so_rcv.sb_mb == NULL) {
				error = sbwait(&so->so_rcv);
				if (error) {
					SOCKBUF_UNLOCK(&so->so_rcv);
					goto release;
				}
			}
			m = so->so_rcv.sb_mb;
			if (m != NULL)
				nextrecord = m->m_nextpkt;
		}
	}

	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	/* For atomic protocols, drop any unread remainder of the record. */
	if (m != NULL && pr->pr_flags & PR_ATOMIC) {
		flags |= MSG_TRUNC;
		if ((flags & MSG_PEEK) == 0)
			(void) sbdroprecord_locked(&so->so_rcv);
	}
	if ((flags & MSG_PEEK) == 0) {
		if (m == NULL) {
			/*
			 * First part is an inline SB_EMPTY_FIXUP().  Second
			 * part makes sure sb_lastrecord is up-to-date if
			 * there is still data in the socket buffer.
			 */
			so->so_rcv.sb_mb = nextrecord;
			if (so->so_rcv.sb_mb == NULL) {
				so->so_rcv.sb_mbtail = NULL;
				so->so_rcv.sb_lastrecord = NULL;
			} else if (nextrecord->m_nextpkt == NULL)
				so->so_rcv.sb_lastrecord = nextrecord;
		}
		SBLASTRECORDCHK(&so->so_rcv);
		SBLASTMBUFCHK(&so->so_rcv);
		/*
		 * If soreceive() is being done from the socket callback,
		 * then don't need to generate ACK to peer to update window,
		 * since ACK will be generated on return to TCP.
		 */
		if (!(flags & MSG_SOCALLBCK) &&
		    (pr->pr_flags & PR_WANTRCVD)) {
			SOCKBUF_UNLOCK(&so->so_rcv);
			VNET_SO_ASSERT(so);
			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
			SOCKBUF_LOCK(&so->so_rcv);
		}
	}
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	/* Nothing delivered and no terminating condition: try again. */
	if (orig_resid == uio->uio_resid && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
		SOCKBUF_UNLOCK(&so->so_rcv);
		goto restart;
	}
	SOCKBUF_UNLOCK(&so->so_rcv);

	if (flagsp != NULL)
		*flagsp |= flags;
release:
	sbunlock(&so->so_rcv);
	return (error);
}

/*
 * Optimized version of soreceive() for stream (TCP) sockets.
 * XXXAO: (MSG_WAITALL | MSG_PEEK) isn't properly handled.
1980 */ 1981int 1982soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio, 1983 struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 1984{ 1985 int len = 0, error = 0, flags, oresid; 1986 struct sockbuf *sb; 1987 struct mbuf *m, *n = NULL; 1988 1989 /* We only do stream sockets. */ 1990 if (so->so_type != SOCK_STREAM) 1991 return (EINVAL); 1992 if (psa != NULL) 1993 *psa = NULL; 1994 if (controlp != NULL) 1995 return (EINVAL); 1996 if (flagsp != NULL) 1997 flags = *flagsp &~ MSG_EOR; 1998 else 1999 flags = 0; 2000 if (flags & MSG_OOB) 2001 return (soreceive_rcvoob(so, uio, flags)); 2002 if (mp0 != NULL) 2003 *mp0 = NULL; 2004 2005 sb = &so->so_rcv; 2006 2007 /* Prevent other readers from entering the socket. */ 2008 error = sblock(sb, SBLOCKWAIT(flags)); 2009 if (error) 2010 goto out; 2011 SOCKBUF_LOCK(sb); 2012 2013 /* Easy one, no space to copyout anything. */ 2014 if (uio->uio_resid == 0) { 2015 error = EINVAL; 2016 goto out; 2017 } 2018 oresid = uio->uio_resid; 2019 2020 /* We will never ever get anything unless we are or were connected. */ 2021 if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) { 2022 error = ENOTCONN; 2023 goto out; 2024 } 2025 2026restart: 2027 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2028 2029 /* Abort if socket has reported problems. */ 2030 if (so->so_error) { 2031 if (sb->sb_cc > 0) 2032 goto deliver; 2033 if (oresid > uio->uio_resid) 2034 goto out; 2035 error = so->so_error; 2036 if (!(flags & MSG_PEEK)) 2037 so->so_error = 0; 2038 goto out; 2039 } 2040 2041 /* Door is closed. Deliver what is left, if any. */ 2042 if (sb->sb_state & SBS_CANTRCVMORE) { 2043 if (sb->sb_cc > 0) 2044 goto deliver; 2045 else 2046 goto out; 2047 } 2048 2049 /* Socket buffer is empty and we shall not block. */ 2050 if (sb->sb_cc == 0 && 2051 ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) { 2052 error = EAGAIN; 2053 goto out; 2054 } 2055 2056 /* Socket buffer got some data that we shall deliver now. 
*/ 2057 if (sb->sb_cc > 0 && !(flags & MSG_WAITALL) && 2058 ((sb->sb_flags & SS_NBIO) || 2059 (flags & (MSG_DONTWAIT|MSG_NBIO)) || 2060 sb->sb_cc >= sb->sb_lowat || 2061 sb->sb_cc >= uio->uio_resid || 2062 sb->sb_cc >= sb->sb_hiwat) ) { 2063 goto deliver; 2064 } 2065 2066 /* On MSG_WAITALL we must wait until all data or error arrives. */ 2067 if ((flags & MSG_WAITALL) && 2068 (sb->sb_cc >= uio->uio_resid || sb->sb_cc >= sb->sb_hiwat)) 2069 goto deliver; 2070 2071 /* 2072 * Wait and block until (more) data comes in. 2073 * NB: Drops the sockbuf lock during wait. 2074 */ 2075 error = sbwait(sb); 2076 if (error) 2077 goto out; 2078 goto restart; 2079 2080deliver: 2081 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2082 KASSERT(sb->sb_cc > 0, ("%s: sockbuf empty", __func__)); 2083 KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__)); 2084 2085 /* Statistics. */ 2086 if (uio->uio_td) 2087 uio->uio_td->td_ru.ru_msgrcv++; 2088 2089 /* Fill uio until full or current end of socket buffer is reached. */ 2090 len = min(uio->uio_resid, sb->sb_cc); 2091 if (mp0 != NULL) { 2092 /* Dequeue as many mbufs as possible. */ 2093 if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) { 2094 if (*mp0 == NULL) 2095 *mp0 = sb->sb_mb; 2096 else 2097 m_cat(*mp0, sb->sb_mb); 2098 for (m = sb->sb_mb; 2099 m != NULL && m->m_len <= len; 2100 m = m->m_next) { 2101 len -= m->m_len; 2102 uio->uio_resid -= m->m_len; 2103 sbfree(sb, m); 2104 n = m; 2105 } 2106 n->m_next = NULL; 2107 sb->sb_mb = m; 2108 sb->sb_lastrecord = sb->sb_mb; 2109 if (sb->sb_mb == NULL) 2110 SB_EMPTY_FIXUP(sb); 2111 } 2112 /* Copy the remainder. */ 2113 if (len > 0) { 2114 KASSERT(sb->sb_mb != NULL, 2115 ("%s: len > 0 && sb->sb_mb empty", __func__)); 2116 2117 m = m_copym(sb->sb_mb, 0, len, M_DONTWAIT); 2118 if (m == NULL) 2119 len = 0; /* Don't flush data from sockbuf. 
*/ 2120 else 2121 uio->uio_resid -= len; 2122 if (*mp0 != NULL) 2123 m_cat(*mp0, m); 2124 else 2125 *mp0 = m; 2126 if (*mp0 == NULL) { 2127 error = ENOBUFS; 2128 goto out; 2129 } 2130 } 2131 } else { 2132 /* NB: Must unlock socket buffer as uiomove may sleep. */ 2133 SOCKBUF_UNLOCK(sb); 2134 error = m_mbuftouio(uio, sb->sb_mb, len); 2135 SOCKBUF_LOCK(sb); 2136 if (error) 2137 goto out; 2138 } 2139 SBLASTRECORDCHK(sb); 2140 SBLASTMBUFCHK(sb); 2141 2142 /* 2143 * Remove the delivered data from the socket buffer unless we 2144 * were only peeking. 2145 */ 2146 if (!(flags & MSG_PEEK)) { 2147 if (len > 0) 2148 sbdrop_locked(sb, len); 2149 2150 /* Notify protocol that we drained some data. */ 2151 if ((so->so_proto->pr_flags & PR_WANTRCVD) && 2152 (((flags & MSG_WAITALL) && uio->uio_resid > 0) || 2153 !(flags & MSG_SOCALLBCK))) { 2154 SOCKBUF_UNLOCK(sb); 2155 VNET_SO_ASSERT(so); 2156 (*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags); 2157 SOCKBUF_LOCK(sb); 2158 } 2159 } 2160 2161 /* 2162 * For MSG_WAITALL we may have to loop again and wait for 2163 * more data to come in. 2164 */ 2165 if ((flags & MSG_WAITALL) && uio->uio_resid > 0) 2166 goto restart; 2167out: 2168 SOCKBUF_LOCK_ASSERT(sb); 2169 SBLASTRECORDCHK(sb); 2170 SBLASTMBUFCHK(sb); 2171 SOCKBUF_UNLOCK(sb); 2172 sbunlock(sb); 2173 return (error); 2174} 2175 2176/* 2177 * Optimized version of soreceive() for simple datagram cases from userspace. 2178 * Unlike in the stream case, we're able to drop a datagram if copyout() 2179 * fails, and because we handle datagrams atomically, we don't need to use a 2180 * sleep lock to prevent I/O interlacing. 
 */
int
soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	struct mbuf *m, *m2;
	int flags, error;
	ssize_t len;
	struct protosw *pr = so->so_proto;
	struct mbuf *nextrecord;

	if (psa != NULL)
		*psa = NULL;
	if (controlp != NULL)
		*controlp = NULL;
	if (flagsp != NULL)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;

	/*
	 * For any complicated cases, fall back to the full
	 * soreceive_generic().
	 */
	if (mp0 != NULL || (flags & MSG_PEEK) || (flags & MSG_OOB))
		return (soreceive_generic(so, psa, uio, mp0, controlp,
		    flagsp));

	/*
	 * Enforce restrictions on use.
	 */
	KASSERT((pr->pr_flags & PR_WANTRCVD) == 0,
	    ("soreceive_dgram: wantrcvd"));
	KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic"));
	KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0,
	    ("soreceive_dgram: SBS_RCVATMARK"));
	KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0,
	    ("soreceive_dgram: P_CONNREQUIRED"));

	/*
	 * Loop blocking while waiting for a datagram.  Each failure path
	 * below unlocks the receive sockbuf before returning.
	 */
	SOCKBUF_LOCK(&so->so_rcv);
	while ((m = so->so_rcv.sb_mb) == NULL) {
		KASSERT(so->so_rcv.sb_cc == 0,
		    ("soreceive_dgram: sb_mb NULL but sb_cc %u",
		    so->so_rcv.sb_cc));
		if (so->so_error) {
			/* Pending error: report it once and clear it. */
			error = so->so_error;
			so->so_error = 0;
			SOCKBUF_UNLOCK(&so->so_rcv);
			return (error);
		}
		if (so->so_rcv.sb_state & SBS_CANTRCVMORE ||
		    uio->uio_resid == 0) {
			SOCKBUF_UNLOCK(&so->so_rcv);
			return (0);
		}
		if ((so->so_state & SS_NBIO) ||
		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
			SOCKBUF_UNLOCK(&so->so_rcv);
			return (EWOULDBLOCK);
		}
		SBLASTRECORDCHK(&so->so_rcv);
		SBLASTMBUFCHK(&so->so_rcv);
		/* NB: sbwait() drops the sockbuf lock while sleeping. */
		error = sbwait(&so->so_rcv);
		if (error) {
			SOCKBUF_UNLOCK(&so->so_rcv);
			return (error);
		}
	}
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);

	if (uio->uio_td)
		uio->uio_td->td_ru.ru_msgrcv++;
	SBLASTRECORDCHK(&so->so_rcv);
	SBLASTMBUFCHK(&so->so_rcv);
	nextrecord = m->m_nextpkt;
	if (nextrecord == NULL) {
		KASSERT(so->so_rcv.sb_lastrecord == m,
		    ("soreceive_dgram: lastrecord != m"));
	}

	KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord,
	    ("soreceive_dgram: m_nextpkt != nextrecord"));

	/*
	 * Pull 'm' and its chain off the front of the packet queue.
	 */
	so->so_rcv.sb_mb = NULL;
	sockbuf_pushsync(&so->so_rcv, nextrecord);

	/*
	 * Walk 'm's chain and free that many bytes from the socket buffer.
	 */
	for (m2 = m; m2 != NULL; m2 = m2->m_next)
		sbfree(&so->so_rcv, m2);

	/*
	 * Do a few last checks before we let go of the lock.  After this
	 * point the record is private to us; no lock is held.
	 */
	SBLASTRECORDCHK(&so->so_rcv);
	SBLASTMBUFCHK(&so->so_rcv);
	SOCKBUF_UNLOCK(&so->so_rcv);

	if (pr->pr_flags & PR_ADDR) {
		/* The first mbuf holds the source address for the caller. */
		KASSERT(m->m_type == MT_SONAME,
		    ("m->m_type == %d", m->m_type));
		if (psa != NULL)
			*psa = sodupsockaddr(mtod(m, struct sockaddr *),
			    M_NOWAIT);
		m = m_free(m);
	}
	if (m == NULL) {
		/* XXXRW: Can this happen? */
		return (0);
	}

	/*
	 * Packet to copyout() is now in 'm' and it is disconnected from the
	 * queue.
	 *
	 * Process one or more MT_CONTROL mbufs present before any data mbufs
	 * in the first mbuf chain on the socket buffer.  We call into the
	 * protocol to perform externalization (or freeing if controlp ==
	 * NULL).
	 */
	if (m->m_type == MT_CONTROL) {
		struct mbuf *cm = NULL, *cmn;
		struct mbuf **cme = &cm;

		/* Detach the run of leading MT_CONTROL mbufs into 'cm'. */
		do {
			m2 = m->m_next;
			m->m_next = NULL;
			*cme = m;
			cme = &(*cme)->m_next;
			m = m2;
		} while (m != NULL && m->m_type == MT_CONTROL);
		while (cm != NULL) {
			cmn = cm->m_next;
			cm->m_next = NULL;
			if (pr->pr_domain->dom_externalize != NULL) {
				error = (*pr->pr_domain->dom_externalize)
				    (cm, controlp);
			} else if (controlp != NULL)
				*controlp = cm;
			else
				m_freem(cm);
			if (controlp != NULL) {
				/* Advance to the tail of the control chain. */
				while (*controlp != NULL)
					controlp = &(*controlp)->m_next;
			}
			cm = cmn;
		}
	}
	KASSERT(m->m_type == MT_DATA, ("soreceive_dgram: !data"));

	/* Copy data mbufs into the uio, freeing each fully-consumed mbuf. */
	while (m != NULL && uio->uio_resid > 0) {
		len = uio->uio_resid;
		if (len > m->m_len)
			len = m->m_len;
		error = uiomove(mtod(m, char *), (int)len, uio);
		if (error) {
			/* Datagram semantics: drop the rest on error. */
			m_freem(m);
			return (error);
		}
		if (len == m->m_len)
			m = m_free(m);
		else {
			m->m_data += len;
			m->m_len -= len;
		}
	}
	/* Leftover data did not fit: the datagram is truncated. */
	if (m != NULL)
		flags |= MSG_TRUNC;
	m_freem(m);
	if (flagsp != NULL)
		*flagsp |= flags;
	return (0);
}

/*
 * soreceive() dispatches to the protocol's pru_soreceive implementation
 * with the socket's vnet set for the duration of the call.
 */
int
soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	int error;

	CURVNET_SET(so->so_vnet);
	error = (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio, mp0,
	    controlp, flagsp));
	CURVNET_RESTORE();
	return (error);
}

/*
 * Shut down one or both directions of a socket: flush via the protocol's
 * pru_flush (if any), discard pending receive data for SHUT_RD/SHUT_RDWR,
 * and invoke pru_shutdown for SHUT_WR/SHUT_RDWR.
 */
int
soshutdown(struct socket *so, int how)
{
	struct protosw *pr = so->so_proto;
	int error;

	if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
		return (EINVAL);

	CURVNET_SET(so->so_vnet);
	if (pr->pr_usrreqs->pru_flush != NULL) {
		(*pr->pr_usrreqs->pru_flush)(so, how);
	}
	if (how != SHUT_WR)
		sorflush(so);
	if (how != SHUT_RD) {
		error = (*pr->pr_usrreqs->pru_shutdown)(so);
		CURVNET_RESTORE();
		return (error);
	}
	CURVNET_RESTORE();
	return (0);
}

/*
 * Discard all data queued on a socket's receive buffer and let the
 * protocol dispose of any special rights (e.g. passed file descriptors).
 */
void
sorflush(struct socket *so)
{
	struct sockbuf *sb = &so->so_rcv;
	struct protosw *pr = so->so_proto;
	struct sockbuf asb;

	VNET_SO_ASSERT(so);

	/*
	 * In order to avoid calling dom_dispose with the socket buffer mutex
	 * held, and in order to generally avoid holding the lock for a long
	 * time, we make a copy of the socket buffer and clear the original
	 * (except locks, state).  The new socket buffer copy won't have
	 * initialized locks so we can only call routines that won't use or
	 * assert those locks.
	 *
	 * Dislodge threads currently blocked in receive and wait to acquire
	 * a lock against other simultaneous readers before clearing the
	 * socket buffer.  Don't let our acquire be interrupted by a signal
	 * despite any existing socket disposition on interruptable waiting.
	 */
	socantrcvmore(so);
	(void) sblock(sb, SBL_WAIT | SBL_NOINTR);

	/*
	 * Invalidate/clear most of the sockbuf structure, but leave selinfo
	 * and mutex data unchanged.
	 */
	SOCKBUF_LOCK(sb);
	/* Copy the zeroable region of *sb into asb, then clear it in *sb. */
	bzero(&asb, offsetof(struct sockbuf, sb_startzero));
	bcopy(&sb->sb_startzero, &asb.sb_startzero,
	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
	bzero(&sb->sb_startzero,
	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
	SOCKBUF_UNLOCK(sb);
	sbunlock(sb);

	/*
	 * Dispose of special rights and flush the socket buffer.  Don't call
	 * any unsafe routines (that rely on locks being initialized) on asb.
	 */
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
	sbrelease_internal(&asb, so);
}

/*
 * Perhaps this routine, and sooptcopyout(), below, ought to come in an
 * additional variant to handle the case where the option value needs to be
 * some kind of integer, but not a specific size.  In addition to their use
 * here, these functions are also called by the protocol-level pr_ctloutput()
 * routines.
 */
int
sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
{
	size_t	valsize;

	/*
	 * If the user gives us more than we wanted, we ignore it, but if we
	 * don't get the minimum length the caller wants, we return EINVAL.
	 * On success, sopt->sopt_valsize is set to however much we actually
	 * retrieved.
	 */
	if ((valsize = sopt->sopt_valsize) < minlen)
		return EINVAL;
	if (valsize > len)
		sopt->sopt_valsize = valsize = len;

	/* sopt_td != NULL means the value lives in user space. */
	if (sopt->sopt_td != NULL)
		return (copyin(sopt->sopt_val, buf, valsize));

	bcopy(sopt->sopt_val, buf, valsize);
	return (0);
}

/*
 * Kernel version of setsockopt(2).
 *
 * XXX: optlen is size_t, not socklen_t
 */
int
so_setsockopt(struct socket *so, int level, int optname, void *optval,
    size_t optlen)
{
	struct sockopt sopt;

	/* sopt_td == NULL marks optval as a kernel-space buffer. */
	sopt.sopt_level = level;
	sopt.sopt_name = optname;
	sopt.sopt_dir = SOPT_SET;
	sopt.sopt_val = optval;
	sopt.sopt_valsize = optlen;
	sopt.sopt_td = NULL;
	return (sosetopt(so, &sopt));
}

/*
 * Set a socket option.  SOL_SOCKET-level options are handled here; any
 * other level is passed straight to the protocol's pr_ctloutput.  After a
 * successful SOL_SOCKET change the protocol is also notified (its return
 * value is intentionally ignored).
 */
int
sosetopt(struct socket *so, struct sockopt *sopt)
{
	int error, optval;
	struct linger l;
	struct timeval tv;
	u_long val;
	uint32_t val32;
#ifdef MAC
	struct mac extmac;
#endif

	CURVNET_SET(so->so_vnet);
	error = 0;
	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto->pr_ctloutput != NULL) {
			error = (*so->so_proto->pr_ctloutput)(so, sopt);
			CURVNET_RESTORE();
			return (error);
		}
		error = ENOPROTOOPT;
	} else {
		switch (sopt->sopt_name) {
#ifdef INET
		case SO_ACCEPTFILTER:
			error = do_setopt_accept_filter(so, sopt);
			if (error)
				goto bad;
			break;
#endif
		case SO_LINGER:
			error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
			if (error)
				goto bad;

			SOCK_LOCK(so);
			so->so_linger = l.l_linger;
			if (l.l_onoff)
				so->so_options |= SO_LINGER;
			else
				so->so_options &= ~SO_LINGER;
			SOCK_UNLOCK(so);
			break;

		/* Simple boolean options mirrored in so_options. */
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_DONTROUTE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_BINTIME:
		case SO_NOSIGPIPE:
		case SO_NO_DDP:
		case SO_NO_OFFLOAD:
			error = sooptcopyin(sopt, &optval, sizeof optval,
			    sizeof optval);
			if (error)
				goto bad;
			SOCK_LOCK(so);
			if (optval)
				so->so_options |= sopt->sopt_name;
			else
				so->so_options &= ~sopt->sopt_name;
			SOCK_UNLOCK(so);
			break;

		case SO_SETFIB:
			error = sooptcopyin(sopt, &optval, sizeof optval,
			    sizeof optval);
			if (error)
				goto bad;

			if (optval < 0 || optval >= rt_numfibs) {
				error = EINVAL;
				goto bad;
			}
			/* Only address families with FIB support honor it. */
			if (((so->so_proto->pr_domain->dom_family == PF_INET) ||
			   (so->so_proto->pr_domain->dom_family == PF_INET6) ||
			   (so->so_proto->pr_domain->dom_family == PF_ROUTE)))
				so->so_fibnum = optval;
			else
				so->so_fibnum = 0;
			break;

		case SO_USER_COOKIE:
			error = sooptcopyin(sopt, &val32, sizeof val32,
			    sizeof val32);
			if (error)
				goto bad;
			so->so_user_cookie = val32;
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
			error = sooptcopyin(sopt, &optval, sizeof optval,
			    sizeof optval);
			if (error)
				goto bad;

			/*
			 * Values < 1 make no sense for any of these options,
			 * so disallow them.
			 */
			if (optval < 1) {
				error = EINVAL;
				goto bad;
			}

			switch (sopt->sopt_name) {
			case SO_SNDBUF:
			case SO_RCVBUF:
				if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
				    &so->so_snd : &so->so_rcv, (u_long)optval,
				    so, curthread) == 0) {
					error = ENOBUFS;
					goto bad;
				}
				/* Explicit sizing disables auto-tuning. */
				(sopt->sopt_name == SO_SNDBUF ? &so->so_snd :
				    &so->so_rcv)->sb_flags &= ~SB_AUTOSIZE;
				break;

			/*
			 * Make sure the low-water is never greater than the
			 * high-water.
			 */
			case SO_SNDLOWAT:
				SOCKBUF_LOCK(&so->so_snd);
				so->so_snd.sb_lowat =
				    (optval > so->so_snd.sb_hiwat) ?
				    so->so_snd.sb_hiwat : optval;
				SOCKBUF_UNLOCK(&so->so_snd);
				break;
			case SO_RCVLOWAT:
				SOCKBUF_LOCK(&so->so_rcv);
				so->so_rcv.sb_lowat =
				    (optval > so->so_rcv.sb_hiwat) ?
				    so->so_rcv.sb_hiwat : optval;
				SOCKBUF_UNLOCK(&so->so_rcv);
				break;
			}
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
#ifdef COMPAT_FREEBSD32
			if (SV_CURPROC_FLAG(SV_ILP32)) {
				struct timeval32 tv32;

				error = sooptcopyin(sopt, &tv32, sizeof tv32,
				    sizeof tv32);
				CP(tv32, tv, tv_sec);
				CP(tv32, tv, tv_usec);
			} else
#endif
				error = sooptcopyin(sopt, &tv, sizeof tv,
				    sizeof tv);
			if (error)
				goto bad;

			/* Convert to ticks, rejecting unrepresentable values. */
			/* assert(hz > 0); */
			if (tv.tv_sec < 0 || tv.tv_sec > INT_MAX / hz ||
			    tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
				error = EDOM;
				goto bad;
			}
			/* assert(tick > 0); */
			/* assert(ULONG_MAX - INT_MAX >= 1000000); */
			val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick;
			if (val > INT_MAX) {
				error = EDOM;
				goto bad;
			}
			/* A non-zero timeout must never round down to zero. */
			if (val == 0 && tv.tv_usec != 0)
				val = 1;

			switch (sopt->sopt_name) {
			case SO_SNDTIMEO:
				so->so_snd.sb_timeo = val;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.sb_timeo = val;
				break;
			}
			break;

		case SO_LABEL:
#ifdef MAC
			error = sooptcopyin(sopt, &extmac, sizeof extmac,
			    sizeof extmac);
			if (error)
				goto bad;
			error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
			    so, &extmac);
#else
			error = EOPNOTSUPP;
#endif
			break;

		default:
			error = ENOPROTOOPT;
			break;
		}
		/* Let the protocol observe successful SOL_SOCKET changes. */
		if (error == 0 && so->so_proto->pr_ctloutput != NULL)
			(void)(*so->so_proto->pr_ctloutput)(so, sopt);
	}
bad:
	CURVNET_RESTORE();
	return (error);
}

/*
 * Helper routine for getsockopt.
 */
int
sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
{
	int	error;
	size_t	valsize;

	error = 0;

	/*
	 * Documented get behavior is that we always return a value, possibly
	 * truncated to fit in the user's buffer.
	 * Traditional behavior is
	 * that we always tell the user precisely how much we copied, rather
	 * than something useful like the total amount we had available for
	 * her.  Note that this interface is not idempotent; the entire
	 * answer must be generated ahead of time.
	 */
	valsize = min(len, sopt->sopt_valsize);
	sopt->sopt_valsize = valsize;
	if (sopt->sopt_val != NULL) {
		/* sopt_td != NULL means the destination is in user space. */
		if (sopt->sopt_td != NULL)
			error = copyout(buf, sopt->sopt_val, valsize);
		else
			bcopy(buf, sopt->sopt_val, valsize);
	}
	return (error);
}

/*
 * Get a socket option.  SOL_SOCKET-level options are answered here; any
 * other level is passed to the protocol's pr_ctloutput.
 */
int
sogetopt(struct socket *so, struct sockopt *sopt)
{
	int error, optval;
	struct linger l;
	struct timeval tv;
#ifdef MAC
	struct mac extmac;
#endif

	CURVNET_SET(so->so_vnet);
	error = 0;
	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto->pr_ctloutput != NULL)
			error = (*so->so_proto->pr_ctloutput)(so, sopt);
		else
			error = ENOPROTOOPT;
		CURVNET_RESTORE();
		return (error);
	} else {
		switch (sopt->sopt_name) {
#ifdef INET
		case SO_ACCEPTFILTER:
			error = do_getopt_accept_filter(so, sopt);
			break;
#endif
		case SO_LINGER:
			SOCK_LOCK(so);
			l.l_onoff = so->so_options & SO_LINGER;
			l.l_linger = so->so_linger;
			SOCK_UNLOCK(so);
			error = sooptcopyout(sopt, &l, sizeof l);
			break;

		/* Boolean options report the bit stored in so_options. */
		case SO_USELOOPBACK:
		case SO_DONTROUTE:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_ACCEPTCONN:
		case SO_TIMESTAMP:
		case SO_BINTIME:
		case SO_NOSIGPIPE:
			optval = so->so_options & sopt->sopt_name;
integer:
			/* Common exit for all integer-valued options. */
			error = sooptcopyout(sopt, &optval, sizeof optval);
			break;

		case SO_TYPE:
			optval = so->so_type;
			goto integer;

		case SO_PROTOCOL:
			optval = so->so_proto->pr_protocol;
			goto integer;

		case SO_ERROR:
			/* Reading the error clears it, per tradition. */
			SOCK_LOCK(so);
			optval = so->so_error;
			so->so_error = 0;
			SOCK_UNLOCK(so);
			goto integer;

		case SO_SNDBUF:
			optval = so->so_snd.sb_hiwat;
			goto integer;

		case SO_RCVBUF:
			optval = so->so_rcv.sb_hiwat;
			goto integer;

		case SO_SNDLOWAT:
			optval = so->so_snd.sb_lowat;
			goto integer;

		case SO_RCVLOWAT:
			optval = so->so_rcv.sb_lowat;
			goto integer;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			optval = (sopt->sopt_name == SO_SNDTIMEO ?
			    so->so_snd.sb_timeo : so->so_rcv.sb_timeo);

			/* Convert ticks back to a timeval. */
			tv.tv_sec = optval / hz;
			tv.tv_usec = (optval % hz) * tick;
#ifdef COMPAT_FREEBSD32
			if (SV_CURPROC_FLAG(SV_ILP32)) {
				struct timeval32 tv32;

				CP(tv, tv32, tv_sec);
				CP(tv, tv32, tv_usec);
				error = sooptcopyout(sopt, &tv32, sizeof tv32);
			} else
#endif
				error = sooptcopyout(sopt, &tv, sizeof tv);
			break;

		case SO_LABEL:
#ifdef MAC
			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
			    sizeof(extmac));
			if (error)
				goto bad;
			error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
			    so, &extmac);
			if (error)
				goto bad;
			error = sooptcopyout(sopt, &extmac, sizeof extmac);
#else
			error = EOPNOTSUPP;
#endif
			break;

		case SO_PEERLABEL:
#ifdef MAC
			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
			    sizeof(extmac));
			if (error)
				goto bad;
			error = mac_getsockopt_peerlabel(
			    sopt->sopt_td->td_ucred, so, &extmac);
			if (error)
				goto bad;
			error = sooptcopyout(sopt, &extmac, sizeof extmac);
#else
			error = EOPNOTSUPP;
#endif
			break;

		case SO_LISTENQLIMIT:
			optval = so->so_qlimit;
			goto integer;

		case SO_LISTENQLEN:
			optval = so->so_qlen;
			goto integer;

		case SO_LISTENINCQLEN:
			optval = so->so_incqlen;
			goto integer;

		default:
			error = ENOPROTOOPT;
2888 break; 2889 } 2890 } 2891#ifdef MAC 2892bad: 2893#endif 2894 CURVNET_RESTORE(); 2895 return (error); 2896} 2897 2898int 2899soopt_getm(struct sockopt *sopt, struct mbuf **mp) 2900{ 2901 struct mbuf *m, *m_prev; 2902 int sopt_size = sopt->sopt_valsize; 2903 2904 MGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT, MT_DATA); 2905 if (m == NULL) 2906 return ENOBUFS; 2907 if (sopt_size > MLEN) { 2908 MCLGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT); 2909 if ((m->m_flags & M_EXT) == 0) { 2910 m_free(m); 2911 return ENOBUFS; 2912 } 2913 m->m_len = min(MCLBYTES, sopt_size); 2914 } else { 2915 m->m_len = min(MLEN, sopt_size); 2916 } 2917 sopt_size -= m->m_len; 2918 *mp = m; 2919 m_prev = m; 2920 2921 while (sopt_size) { 2922 MGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT, MT_DATA); 2923 if (m == NULL) { 2924 m_freem(*mp); 2925 return ENOBUFS; 2926 } 2927 if (sopt_size > MLEN) { 2928 MCLGET(m, sopt->sopt_td != NULL ? M_WAIT : 2929 M_DONTWAIT); 2930 if ((m->m_flags & M_EXT) == 0) { 2931 m_freem(m); 2932 m_freem(*mp); 2933 return ENOBUFS; 2934 } 2935 m->m_len = min(MCLBYTES, sopt_size); 2936 } else { 2937 m->m_len = min(MLEN, sopt_size); 2938 } 2939 sopt_size -= m->m_len; 2940 m_prev->m_next = m; 2941 m_prev = m; 2942 } 2943 return (0); 2944} 2945 2946int 2947soopt_mcopyin(struct sockopt *sopt, struct mbuf *m) 2948{ 2949 struct mbuf *m0 = m; 2950 2951 if (sopt->sopt_val == NULL) 2952 return (0); 2953 while (m != NULL && sopt->sopt_valsize >= m->m_len) { 2954 if (sopt->sopt_td != NULL) { 2955 int error; 2956 2957 error = copyin(sopt->sopt_val, mtod(m, char *), 2958 m->m_len); 2959 if (error != 0) { 2960 m_freem(m0); 2961 return(error); 2962 } 2963 } else 2964 bcopy(sopt->sopt_val, mtod(m, char *), m->m_len); 2965 sopt->sopt_valsize -= m->m_len; 2966 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; 2967 m = m->m_next; 2968 } 2969 if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */ 2970 panic("ip6_sooptmcopyin"); 2971 return (0); 2972} 2973 2974int 
soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
{
	struct mbuf *m0 = m;
	size_t valsize = 0;

	/* Copy option data out of the mbuf chain into the sopt buffer. */
	if (sopt->sopt_val == NULL)
		return (0);
	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
		if (sopt->sopt_td != NULL) {
			int error;

			error = copyout(mtod(m, char *), sopt->sopt_val,
			    m->m_len);
			if (error != 0) {
				m_freem(m0);
				return(error);
			}
		} else
			bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
		sopt->sopt_valsize -= m->m_len;
		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
		valsize += m->m_len;
		m = m->m_next;
	}
	if (m != NULL) {
		/* enough soopt buffer should be given from user-land */
		m_freem(m0);
		return(EINVAL);
	}
	/* Report how many bytes were actually copied out. */
	sopt->sopt_valsize = valsize;
	return (0);
}

/*
 * sohasoutofband(): protocol notifies socket layer of the arrival of new
 * out-of-band data, which will then notify socket consumers.
 */
void
sohasoutofband(struct socket *so)
{

	if (so->so_sigio != NULL)
		pgsigio(&so->so_sigio, SIGURG, 0);
	selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
}

/* Dispatch poll to the protocol's pru_sopoll implementation. */
int
sopoll(struct socket *so, int events, struct ucred *active_cred,
    struct thread *td)
{

	/*
	 * We do not need to set or assert curvnet as long as everyone uses
	 * sopoll_generic().
	 */
	return (so->so_proto->pr_usrreqs->pru_sopoll(so, events, active_cred,
	    td));
}

/*
 * Generic poll implementation: compute the subset of 'events' that is
 * currently true for the socket, or register for wakeups (selrecord) when
 * none are.  Both sockbuf locks are held across the evaluation.
 */
int
sopoll_generic(struct socket *so, int events, struct ucred *active_cred,
    struct thread *td)
{
	int revents = 0;

	SOCKBUF_LOCK(&so->so_snd);
	SOCKBUF_LOCK(&so->so_rcv);
	if (events & (POLLIN | POLLRDNORM))
		if (soreadabledata(so))
			revents |= events & (POLLIN | POLLRDNORM);

	if (events & (POLLOUT | POLLWRNORM))
		if (sowriteable(so))
			revents |= events & (POLLOUT | POLLWRNORM);

	if (events & (POLLPRI | POLLRDBAND))
		if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK))
			revents |= events & (POLLPRI | POLLRDBAND);

	/* Unless asked to ignore EOF, a closed read side is readable. */
	if ((events & POLLINIGNEOF) == 0) {
		if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
			revents |= events & (POLLIN | POLLRDNORM);
			if (so->so_snd.sb_state & SBS_CANTSENDMORE)
				revents |= POLLHUP;
		}
	}

	if (revents == 0) {
		/* Nothing ready yet: arrange to be woken up. */
		if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
			selrecord(td, &so->so_rcv.sb_sel);
			so->so_rcv.sb_flags |= SB_SEL;
		}

		if (events & (POLLOUT | POLLWRNORM)) {
			selrecord(td, &so->so_snd.sb_sel);
			so->so_snd.sb_flags |= SB_SEL;
		}
	}

	SOCKBUF_UNLOCK(&so->so_rcv);
	SOCKBUF_UNLOCK(&so->so_snd);
	return (revents);
}

/*
 * Attach a kevent filter to the appropriate sockbuf: listening sockets get
 * the listen filter, other reads get the read filter, writes the write
 * filter.
 */
int
soo_kqfilter(struct file *fp, struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	struct sockbuf *sb;

	switch (kn->kn_filter) {
	case EVFILT_READ:
		if (so->so_options & SO_ACCEPTCONN)
			kn->kn_fop = &solisten_filtops;
		else
			kn->kn_fop = &soread_filtops;
		sb = &so->so_rcv;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &sowrite_filtops;
		sb = &so->so_snd;
		break;
	default:
		return (EINVAL);
	}

	SOCKBUF_LOCK(sb);
	knlist_add(&sb->sb_sel.si_note, kn, 1);
	sb->sb_flags |= SB_KNOTE;
	SOCKBUF_UNLOCK(sb);
	return
	    (0);
}

/*
 * Some routines that return EOPNOTSUPP for entry points that are not
 * supported by a protocol.  Fill in as needed.
 */
int
pru_accept_notsupp(struct socket *so, struct sockaddr **nam)
{

	return EOPNOTSUPP;
}

int
pru_attach_notsupp(struct socket *so, int proto, struct thread *td)
{

	return EOPNOTSUPP;
}

int
pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
{

	return EOPNOTSUPP;
}

int
pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
{

	return EOPNOTSUPP;
}

int
pru_connect2_notsupp(struct socket *so1, struct socket *so2)
{

	return EOPNOTSUPP;
}

int
pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data,
    struct ifnet *ifp, struct thread *td)
{

	return EOPNOTSUPP;
}

int
pru_disconnect_notsupp(struct socket *so)
{

	return EOPNOTSUPP;
}

int
pru_listen_notsupp(struct socket *so, int backlog, struct thread *td)
{

	return EOPNOTSUPP;
}

int
pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam)
{

	return EOPNOTSUPP;
}

int
pru_rcvd_notsupp(struct socket *so, int flags)
{

	return EOPNOTSUPP;
}

int
pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags)
{

	return EOPNOTSUPP;
}

int
pru_send_notsupp(struct socket *so, int flags, struct mbuf *m,
    struct sockaddr *addr, struct mbuf *control, struct thread *td)
{

	return EOPNOTSUPP;
}

/*
 * This isn't really a ``null'' operation, but it's the default one and
 * doesn't do anything destructive.
 */
int
pru_sense_null(struct socket *so, struct stat *sb)
{

	/* Report the send buffer high-water mark as the block size. */
	sb->st_blksize = so->so_snd.sb_hiwat;
	return 0;
}

int
pru_shutdown_notsupp(struct socket *so)
{

	return EOPNOTSUPP;
}

int
pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam)
{

	return EOPNOTSUPP;
}

int
pru_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{

	return EOPNOTSUPP;
}

int
pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr,
    struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{

	return EOPNOTSUPP;
}

int
pru_sopoll_notsupp(struct socket *so, int events, struct ucred *cred,
    struct thread *td)
{

	return EOPNOTSUPP;
}

/* Detach a read kevent filter from the receive sockbuf. */
static void
filt_sordetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;

	SOCKBUF_LOCK(&so->so_rcv);
	knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1);
	if (knlist_empty(&so->so_rcv.sb_sel.si_note))
		so->so_rcv.sb_flags &= ~SB_KNOTE;
	SOCKBUF_UNLOCK(&so->so_rcv);
}

/*
 * Read filter: report readable bytes in kn_data and decide whether the
 * knote should fire.  Caller holds the receive sockbuf lock.
 */
/*ARGSUSED*/
static int
filt_soread(struct knote *kn, long hint)
{
	struct socket *so;

	so = kn->kn_fp->f_data;
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);

	kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
	if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		return (1);
	} else if (so->so_error)	/* temporary udp error */
		return (1);
	else if (kn->kn_sfflags & NOTE_LOWAT)
		return (kn->kn_data >= kn->kn_sdata);
	else
		return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat);
}

/* Detach a write kevent filter from the send sockbuf. */
static void
filt_sowdetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;

	SOCKBUF_LOCK(&so->so_snd);
	knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1);
	if (knlist_empty(&so->so_snd.sb_sel.si_note))
		so->so_snd.sb_flags &= ~SB_KNOTE;
	SOCKBUF_UNLOCK(&so->so_snd);
}

/*
 * Write filter: report available send-buffer space in kn_data and decide
 * whether the knote should fire.  Caller holds the send sockbuf lock.
 */
/*ARGSUSED*/
static int
filt_sowrite(struct knote *kn, long hint)
{
	struct socket *so;

	so = kn->kn_fp->f_data;
	SOCKBUF_LOCK_ASSERT(&so->so_snd);
	kn->kn_data = sbspace(&so->so_snd);
	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		return (1);
	} else if (so->so_error)	/* temporary udp error */
		return (1);
	else if (((so->so_state & SS_ISCONNECTED) == 0) &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
		/* Not writable until connected, when a connection is required. */
		return (0);
	else if (kn->kn_sfflags & NOTE_LOWAT)
		return (kn->kn_data >= kn->kn_sdata);
	else
		return (kn->kn_data >= so->so_snd.sb_lowat);
}

/*
 * Listen filter: fires when there is at least one connection ready to be
 * accepted; kn_data reports the completed-connection queue length.
 */
/*ARGSUSED*/
static int
filt_solisten(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;

	kn->kn_data = so->so_qlen;
	return (! TAILQ_EMPTY(&so->so_comp));
}

/* Return 0 iff the socket exists and is owned by 'uid'. */
int
socheckuid(struct socket *so, uid_t uid)
{

	if (so == NULL)
		return (EPERM);
	if (so->so_cred->cr_uid != uid)
		return (EPERM);
	return (0);
}

/*
 * These functions are used by protocols to notify the socket layer (and its
 * consumers) of state changes in the sockets driven by protocol-side events.
 */

/*
 * Procedures to manipulate state flags of socket and do appropriate wakeups.
 *
 * Normal sequence from the active (originating) side is that
 * soisconnecting() is called during processing of connect() call, resulting
 * in an eventual call to soisconnected() if/when the connection is
 * established.
When the connection is torn down soisdisconnecting() is 3351 * called during processing of disconnect() call, and soisdisconnected() is 3352 * called when the connection to the peer is totally severed. The semantics 3353 * of these routines are such that connectionless protocols can call 3354 * soisconnected() and soisdisconnected() only, bypassing the in-progress 3355 * calls when setting up a ``connection'' takes no time. 3356 * 3357 * From the passive side, a socket is created with two queues of sockets: 3358 * so_incomp for connections in progress and so_comp for connections already 3359 * made and awaiting user acceptance. As a protocol is preparing incoming 3360 * connections, it creates a socket structure queued on so_incomp by calling 3361 * sonewconn(). When the connection is established, soisconnected() is 3362 * called, and transfers the socket structure to so_comp, making it available 3363 * to accept(). 3364 * 3365 * If a socket is closed with sockets on either so_incomp or so_comp, these 3366 * sockets are dropped. 3367 * 3368 * If higher-level protocols are implemented in the kernel, the wakeups done 3369 * here will sometimes cause software-interrupt process scheduling. 
3370 */ 3371void 3372soisconnecting(struct socket *so) 3373{ 3374 3375 SOCK_LOCK(so); 3376 so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING); 3377 so->so_state |= SS_ISCONNECTING; 3378 SOCK_UNLOCK(so); 3379} 3380 3381void 3382soisconnected(struct socket *so) 3383{ 3384 struct socket *head; 3385 int ret; 3386 3387restart: 3388 ACCEPT_LOCK(); 3389 SOCK_LOCK(so); 3390 so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING); 3391 so->so_state |= SS_ISCONNECTED; 3392 head = so->so_head; 3393 if (head != NULL && (so->so_qstate & SQ_INCOMP)) { 3394 if ((so->so_options & SO_ACCEPTFILTER) == 0) { 3395 SOCK_UNLOCK(so); 3396 TAILQ_REMOVE(&head->so_incomp, so, so_list); 3397 head->so_incqlen--; 3398 so->so_qstate &= ~SQ_INCOMP; 3399 TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); 3400 head->so_qlen++; 3401 so->so_qstate |= SQ_COMP; 3402 ACCEPT_UNLOCK(); 3403 sorwakeup(head); 3404 wakeup_one(&head->so_timeo); 3405 } else { 3406 ACCEPT_UNLOCK(); 3407 soupcall_set(so, SO_RCV, 3408 head->so_accf->so_accept_filter->accf_callback, 3409 head->so_accf->so_accept_filter_arg); 3410 so->so_options &= ~SO_ACCEPTFILTER; 3411 ret = head->so_accf->so_accept_filter->accf_callback(so, 3412 head->so_accf->so_accept_filter_arg, M_DONTWAIT); 3413 if (ret == SU_ISCONNECTED) 3414 soupcall_clear(so, SO_RCV); 3415 SOCK_UNLOCK(so); 3416 if (ret == SU_ISCONNECTED) 3417 goto restart; 3418 } 3419 return; 3420 } 3421 SOCK_UNLOCK(so); 3422 ACCEPT_UNLOCK(); 3423 wakeup(&so->so_timeo); 3424 sorwakeup(so); 3425 sowwakeup(so); 3426} 3427 3428void 3429soisdisconnecting(struct socket *so) 3430{ 3431 3432 /* 3433 * Note: This code assumes that SOCK_LOCK(so) and 3434 * SOCKBUF_LOCK(&so->so_rcv) are the same. 
3435 */ 3436 SOCKBUF_LOCK(&so->so_rcv); 3437 so->so_state &= ~SS_ISCONNECTING; 3438 so->so_state |= SS_ISDISCONNECTING; 3439 so->so_rcv.sb_state |= SBS_CANTRCVMORE; 3440 sorwakeup_locked(so); 3441 SOCKBUF_LOCK(&so->so_snd); 3442 so->so_snd.sb_state |= SBS_CANTSENDMORE; 3443 sowwakeup_locked(so); 3444 wakeup(&so->so_timeo); 3445} 3446 3447void 3448soisdisconnected(struct socket *so) 3449{ 3450 3451 /* 3452 * Note: This code assumes that SOCK_LOCK(so) and 3453 * SOCKBUF_LOCK(&so->so_rcv) are the same. 3454 */ 3455 SOCKBUF_LOCK(&so->so_rcv); 3456 so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); 3457 so->so_state |= SS_ISDISCONNECTED; 3458 so->so_rcv.sb_state |= SBS_CANTRCVMORE; 3459 sorwakeup_locked(so); 3460 SOCKBUF_LOCK(&so->so_snd); 3461 so->so_snd.sb_state |= SBS_CANTSENDMORE; 3462 sbdrop_locked(&so->so_snd, so->so_snd.sb_cc); 3463 sowwakeup_locked(so); 3464 wakeup(&so->so_timeo); 3465} 3466 3467/* 3468 * Make a copy of a sockaddr in a malloced buffer of type M_SONAME. 3469 */ 3470struct sockaddr * 3471sodupsockaddr(const struct sockaddr *sa, int mflags) 3472{ 3473 struct sockaddr *sa2; 3474 3475 sa2 = malloc(sa->sa_len, M_SONAME, mflags); 3476 if (sa2) 3477 bcopy(sa, sa2, sa->sa_len); 3478 return sa2; 3479} 3480 3481/* 3482 * Register per-socket buffer upcalls. 3483 */ 3484void 3485soupcall_set(struct socket *so, int which, 3486 int (*func)(struct socket *, void *, int), void *arg) 3487{ 3488 struct sockbuf *sb; 3489 3490 switch (which) { 3491 case SO_RCV: 3492 sb = &so->so_rcv; 3493 break; 3494 case SO_SND: 3495 sb = &so->so_snd; 3496 break; 3497 default: 3498 panic("soupcall_set: bad which"); 3499 } 3500 SOCKBUF_LOCK_ASSERT(sb); 3501#if 0 3502 /* XXX: accf_http actually wants to do this on purpose. 
*/ 3503 KASSERT(sb->sb_upcall == NULL, ("soupcall_set: overwriting upcall")); 3504#endif 3505 sb->sb_upcall = func; 3506 sb->sb_upcallarg = arg; 3507 sb->sb_flags |= SB_UPCALL; 3508} 3509 3510void 3511soupcall_clear(struct socket *so, int which) 3512{ 3513 struct sockbuf *sb; 3514 3515 switch (which) { 3516 case SO_RCV: 3517 sb = &so->so_rcv; 3518 break; 3519 case SO_SND: 3520 sb = &so->so_snd; 3521 break; 3522 default: 3523 panic("soupcall_clear: bad which"); 3524 } 3525 SOCKBUF_LOCK_ASSERT(sb); 3526 KASSERT(sb->sb_upcall != NULL, ("soupcall_clear: no upcall to clear")); 3527 sb->sb_upcall = NULL; 3528 sb->sb_upcallarg = NULL; 3529 sb->sb_flags &= ~SB_UPCALL; 3530} 3531 3532/* 3533 * Create an external-format (``xsocket'') structure using the information in 3534 * the kernel-format socket structure pointed to by so. This is done to 3535 * reduce the spew of irrelevant information over this interface, to isolate 3536 * user code from changes in the kernel structure, and potentially to provide 3537 * information-hiding if we decide that some of this information should be 3538 * hidden from users. 3539 */ 3540void 3541sotoxsocket(struct socket *so, struct xsocket *xso) 3542{ 3543 3544 xso->xso_len = sizeof *xso; 3545 xso->xso_so = so; 3546 xso->so_type = so->so_type; 3547 xso->so_options = so->so_options; 3548 xso->so_linger = so->so_linger; 3549 xso->so_state = so->so_state; 3550 xso->so_pcb = so->so_pcb; 3551 xso->xso_protocol = so->so_proto->pr_protocol; 3552 xso->xso_family = so->so_proto->pr_domain->dom_family; 3553 xso->so_qlen = so->so_qlen; 3554 xso->so_incqlen = so->so_incqlen; 3555 xso->so_qlimit = so->so_qlimit; 3556 xso->so_timeo = so->so_timeo; 3557 xso->so_error = so->so_error; 3558 xso->so_pgid = so->so_sigio ? 
so->so_sigio->sio_pgid : 0; 3559 xso->so_oobmark = so->so_oobmark; 3560 sbtoxsockbuf(&so->so_snd, &xso->so_snd); 3561 sbtoxsockbuf(&so->so_rcv, &xso->so_rcv); 3562 xso->so_uid = so->so_cred->cr_uid; 3563} 3564 3565 3566/* 3567 * Socket accessor functions to provide external consumers with 3568 * a safe interface to socket state 3569 * 3570 */ 3571 3572void 3573so_listeners_apply_all(struct socket *so, void (*func)(struct socket *, void *), void *arg) 3574{ 3575 3576 TAILQ_FOREACH(so, &so->so_comp, so_list) 3577 func(so, arg); 3578} 3579 3580struct sockbuf * 3581so_sockbuf_rcv(struct socket *so) 3582{ 3583 3584 return (&so->so_rcv); 3585} 3586 3587struct sockbuf * 3588so_sockbuf_snd(struct socket *so) 3589{ 3590 3591 return (&so->so_snd); 3592} 3593 3594int 3595so_state_get(const struct socket *so) 3596{ 3597 3598 return (so->so_state); 3599} 3600 3601void 3602so_state_set(struct socket *so, int val) 3603{ 3604 3605 so->so_state = val; 3606} 3607 3608int 3609so_options_get(const struct socket *so) 3610{ 3611 3612 return (so->so_options); 3613} 3614 3615void 3616so_options_set(struct socket *so, int val) 3617{ 3618 3619 so->so_options = val; 3620} 3621 3622int 3623so_error_get(const struct socket *so) 3624{ 3625 3626 return (so->so_error); 3627} 3628 3629void 3630so_error_set(struct socket *so, int val) 3631{ 3632 3633 so->so_error = val; 3634} 3635 3636int 3637so_linger_get(const struct socket *so) 3638{ 3639 3640 return (so->so_linger); 3641} 3642 3643void 3644so_linger_set(struct socket *so, int val) 3645{ 3646 3647 so->so_linger = val; 3648} 3649 3650struct protosw * 3651so_protosw_get(const struct socket *so) 3652{ 3653 3654 return (so->so_proto); 3655} 3656 3657void 3658so_protosw_set(struct socket *so, struct protosw *val) 3659{ 3660 3661 so->so_proto = val; 3662} 3663 3664void 3665so_sorwakeup(struct socket *so) 3666{ 3667 3668 sorwakeup(so); 3669} 3670 3671void 3672so_sowwakeup(struct socket *so) 3673{ 3674 3675 sowwakeup(so); 3676} 3677 3678void 
3679so_sorwakeup_locked(struct socket *so) 3680{ 3681 3682 sorwakeup_locked(so); 3683} 3684 3685void 3686so_sowwakeup_locked(struct socket *so) 3687{ 3688 3689 sowwakeup_locked(so); 3690} 3691 3692void 3693so_lock(struct socket *so) 3694{ 3695 SOCK_LOCK(so); 3696} 3697 3698void 3699so_unlock(struct socket *so) 3700{ 3701 SOCK_UNLOCK(so); 3702} 3703