uipc_socket.c revision 142055
1/*- 2 * Copyright (c) 2004 The FreeBSD Foundation 3 * Copyright (c) 2004-2005 Robert Watson 4 * Copyright (c) 1982, 1986, 1988, 1990, 1993 5 * The Regents of the University of California. All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 4. Neither the name of the University nor the names of its contributors 16 * may be used to endorse or promote products derived from this software 17 * without specific prior written permission. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 
30 * 31 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94 32 */ 33 34#include <sys/cdefs.h> 35__FBSDID("$FreeBSD: head/sys/kern/uipc_socket.c 142055 2005-02-18 18:43:33Z rwatson $"); 36 37#include "opt_inet.h" 38#include "opt_mac.h" 39#include "opt_zero.h" 40 41#include <sys/param.h> 42#include <sys/systm.h> 43#include <sys/fcntl.h> 44#include <sys/limits.h> 45#include <sys/lock.h> 46#include <sys/mac.h> 47#include <sys/malloc.h> 48#include <sys/mbuf.h> 49#include <sys/mutex.h> 50#include <sys/domain.h> 51#include <sys/file.h> /* for struct knote */ 52#include <sys/kernel.h> 53#include <sys/event.h> 54#include <sys/poll.h> 55#include <sys/proc.h> 56#include <sys/protosw.h> 57#include <sys/socket.h> 58#include <sys/socketvar.h> 59#include <sys/resourcevar.h> 60#include <sys/signalvar.h> 61#include <sys/sysctl.h> 62#include <sys/uio.h> 63#include <sys/jail.h> 64 65#include <vm/uma.h> 66 67 68static int soreceive_rcvoob(struct socket *so, struct uio *uio, 69 int flags); 70 71#ifdef INET 72static int do_setopt_accept_filter(struct socket *so, struct sockopt *sopt); 73#endif 74 75static void filt_sordetach(struct knote *kn); 76static int filt_soread(struct knote *kn, long hint); 77static void filt_sowdetach(struct knote *kn); 78static int filt_sowrite(struct knote *kn, long hint); 79static int filt_solisten(struct knote *kn, long hint); 80 81static struct filterops solisten_filtops = 82 { 1, NULL, filt_sordetach, filt_solisten }; 83static struct filterops soread_filtops = 84 { 1, NULL, filt_sordetach, filt_soread }; 85static struct filterops sowrite_filtops = 86 { 1, NULL, filt_sowdetach, filt_sowrite }; 87 88uma_zone_t socket_zone; 89so_gen_t so_gencnt; /* generation count for sockets */ 90 91MALLOC_DEFINE(M_SONAME, "soname", "socket name"); 92MALLOC_DEFINE(M_PCB, "pcb", "protocol control block"); 93 94SYSCTL_DECL(_kern_ipc); 95 96static int somaxconn = SOMAXCONN; 97static int somaxconn_sysctl(SYSCTL_HANDLER_ARGS); 98/* XXX: we dont have SYSCTL_USHORT */ 
/* somaxconn is a u_short in spirit; exported via a proc handler for bounds. */
SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLTYPE_UINT | CTLFLAG_RW,
    0, sizeof(int), somaxconn_sysctl, "I", "Maximum pending socket connection "
    "queue size");
static int numopensockets;
SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
    &numopensockets, 0, "Number of open sockets");
#ifdef ZERO_COPY_SOCKETS
/* These aren't static because they're used in other files. */
int so_zero_copy_send = 1;
int so_zero_copy_receive = 1;
SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0,
    "Zero copy controls");
SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW,
    &so_zero_copy_receive, 0, "Enable zero copy receive");
SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW,
    &so_zero_copy_send, 0, "Enable zero copy send");
#endif /* ZERO_COPY_SOCKETS */

/*
 * accept_mtx locks down per-socket fields relating to accept queues.  See
 * socketvar.h for an annotation of the protected fields of struct socket.
 */
struct mtx accept_mtx;
MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);

/*
 * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
 * so_gencnt field.
 *
 * NOTE(review): the witness name "so_glabel" below looks like a typo for
 * "so_global" -- confirm before changing, as lock names are visible to
 * debugging/witness tooling.
 */
static struct mtx so_global_mtx;
MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF);

/*
 * Socket operation routines.
 * These routines are called by the routines in
 * sys_socket.c or from a system process, and
 * implement the semantics of socket operations by
 * switching out to the protocol specific routines.
 */

/*
 * Get a socket structure from our zone, and initialize it.
 * Note that it would probably be better to allocate socket
 * and PCB at the same time, but I'm not convinced that all
 * the protocols can be easily modified to do this.
 *
 * soalloc() returns a socket with a ref count of 0.
 */
struct socket *
soalloc(int mflags)
{
	struct socket *so;

	so = uma_zalloc(socket_zone, mflags | M_ZERO);
	if (so != NULL) {
#ifdef MAC
		/* MAC label allocation can fail; undo the zone allocation. */
		if (mac_init_socket(so, mflags) != 0) {
			uma_zfree(socket_zone, so);
			return (NULL);
		}
#endif
		SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
		SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
		/* sx_init(&so->so_sxlock, "socket sxlock"); */
		TAILQ_INIT(&so->so_aiojobq);
		/* so_gencnt and numopensockets are under so_global_mtx. */
		mtx_lock(&so_global_mtx);
		so->so_gencnt = ++so_gencnt;
		++numopensockets;
		mtx_unlock(&so_global_mtx);
	}
	return (so);
}

/*
 * socreate returns a socket with a ref count of 1.  The socket should be
 * closed with soclose().
 */
int
socreate(dom, aso, type, proto, cred, td)
	int dom;
	struct socket **aso;
	int type;
	int proto;
	struct ucred *cred;
	struct thread *td;
{
	struct protosw *prp;
	struct socket *so;
	int error;

	/* A non-zero proto selects an exact protocol; 0 matches by type. */
	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);

	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL ||
	    prp->pr_usrreqs->pru_attach == pru_attach_notsupp)
		return (EPROTONOSUPPORT);

	/*
	 * Jailed processes may optionally be restricted to UNIX, IPv4, and
	 * routing sockets only.
	 */
	if (jailed(cred) && jail_socket_unixiproute_only &&
	    prp->pr_domain->dom_family != PF_LOCAL &&
	    prp->pr_domain->dom_family != PF_INET &&
	    prp->pr_domain->dom_family != PF_ROUTE) {
		return (EPROTONOSUPPORT);
	}

	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = soalloc(M_WAITOK);
	if (so == NULL)
		return (ENOBUFS);

	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = type;
	so->so_cred = crhold(cred);
	so->so_proto = prp;
#ifdef MAC
	mac_create_socket(cred, so);
#endif
	SOCK_LOCK(so);
	knlist_init(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv));
	knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd));
	/* Take the caller's reference before handing to the protocol. */
	soref(so);
	SOCK_UNLOCK(so);
	error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
	if (error) {
		/*
		 * Attach failed: mark the socket as having no file
		 * descriptor reference and drop our ref, which frees it.
		 */
		ACCEPT_LOCK();
		SOCK_LOCK(so);
		so->so_state |= SS_NOFDREF;
		sorele(so);
		return (error);
	}
	*aso = so;
	return (0);
}

/*
 * Bind a socket to a local address; simply dispatches to the protocol's
 * pru_bind method.
 */
int
sobind(so, nam, td)
	struct socket *so;
	struct sockaddr *nam;
	struct thread *td;
{

	return ((*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td));
}

/*
 * Release all resources held by a socket and return it to the zone.  The
 * caller must guarantee the reference count has reached zero.
 */
void
sodealloc(struct socket *so)
{

	KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
	mtx_lock(&so_global_mtx);
	so->so_gencnt = ++so_gencnt;
	mtx_unlock(&so_global_mtx);
	/* Return reserved socket buffer space to the owner's resource limits. */
	if (so->so_rcv.sb_hiwat)
		(void)chgsbsize(so->so_cred->cr_uidinfo,
		    &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
	if (so->so_snd.sb_hiwat)
		(void)chgsbsize(so->so_cred->cr_uidinfo,
		    &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
#ifdef INET
	/* remove accept filter if one is present. */
	if (so->so_accf != NULL)
		do_setopt_accept_filter(so, NULL);
#endif
#ifdef MAC
	mac_destroy_socket(so);
#endif
	crfree(so->so_cred);
	SOCKBUF_LOCK_DESTROY(&so->so_snd);
	SOCKBUF_LOCK_DESTROY(&so->so_rcv);
	/* sx_destroy(&so->so_sxlock); */
	uma_zfree(socket_zone, so);
	mtx_lock(&so_global_mtx);
	--numopensockets;
	mtx_unlock(&so_global_mtx);
}

/*
 * Mark a socket as listening via the protocol's pru_listen, then set
 * SO_ACCEPTCONN and clamp the backlog to [0, somaxconn].
 */
int
solisten(so, backlog, td)
	struct socket *so;
	int backlog;
	struct thread *td;
{
	int error;

	/*
	 * XXXRW: Ordering issue here -- perhaps we need to set
	 * SO_ACCEPTCONN before the call to pru_listen()?
	 * XXXRW: General atomic test-and-set concerns here also.
	 */
	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
	    SS_ISDISCONNECTING))
		return (EINVAL);
	error = (*so->so_proto->pr_usrreqs->pru_listen)(so, td);
	if (error)
		return (error);
	ACCEPT_LOCK();
	SOCK_LOCK(so);
	so->so_options |= SO_ACCEPTCONN;
	SOCK_UNLOCK(so);
	if (backlog < 0 || backlog > somaxconn)
		backlog = somaxconn;
	so->so_qlimit = backlog;
	ACCEPT_UNLOCK();
	return (0);
}

/*
 * Attempt to free a socket.  This should really be sotryfree().
 *
 * We free the socket if the protocol is no longer interested in the socket,
 * there's no file descriptor reference, and the refcount is 0.  While the
 * calling macro sotryfree() tests the refcount, sofree() has to test it
 * again as it's possible to race with an accept()ing thread if the socket is
 * in an listen queue of a listen socket, as being in the listen queue
 * doesn't elevate the reference count.  sofree() acquires the accept mutex
 * early for this test in order to avoid that race.
 */
void
sofree(so)
	struct socket *so;
{
	struct socket *head;

	/* Caller must hold both the accept mutex and the socket lock. */
	ACCEPT_LOCK_ASSERT();
	SOCK_LOCK_ASSERT(so);

	if (so->so_pcb != NULL || (so->so_state & SS_NOFDREF) == 0 ||
	    so->so_count != 0) {
		SOCK_UNLOCK(so);
		ACCEPT_UNLOCK();
		return;
	}

	head = so->so_head;
	if (head != NULL) {
		KASSERT((so->so_qstate & SQ_COMP) != 0 ||
		    (so->so_qstate & SQ_INCOMP) != 0,
		    ("sofree: so_head != NULL, but neither SQ_COMP nor "
		    "SQ_INCOMP"));
		KASSERT((so->so_qstate & SQ_COMP) == 0 ||
		    (so->so_qstate & SQ_INCOMP) == 0,
		    ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
		/*
		 * accept(2) is responsible for draining the completed
		 * connection queue and freeing those sockets, so
		 * we just return here if this socket is currently
		 * on the completed connection queue.  Otherwise,
		 * accept(2) may hang after select(2) has indicated
		 * that a listening socket was ready.  If it's an
		 * incomplete connection, we remove it from the queue
		 * and free it; otherwise, it won't be released until
		 * the listening socket is closed.
		 */
		if ((so->so_qstate & SQ_COMP) != 0) {
			SOCK_UNLOCK(so);
			ACCEPT_UNLOCK();
			return;
		}
		TAILQ_REMOVE(&head->so_incomp, so, so_list);
		head->so_incqlen--;
		so->so_qstate &= ~SQ_INCOMP;
		so->so_head = NULL;
	}
	KASSERT((so->so_qstate & SQ_COMP) == 0 &&
	    (so->so_qstate & SQ_INCOMP) == 0,
	    ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
	    so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
	SOCK_UNLOCK(so);
	ACCEPT_UNLOCK();
	/* Drain and release the send buffer, ignoring signals (SB_NOINTR). */
	SOCKBUF_LOCK(&so->so_snd);
	so->so_snd.sb_flags |= SB_NOINTR;
	(void)sblock(&so->so_snd, M_WAITOK);
	/*
	 * socantsendmore_locked() drops the socket buffer mutex so that it
	 * can safely perform wakeups.  Re-acquire the mutex before
	 * continuing.
	 */
	socantsendmore_locked(so);
	SOCKBUF_LOCK(&so->so_snd);
	sbunlock(&so->so_snd);
	sbrelease_locked(&so->so_snd, so);
	SOCKBUF_UNLOCK(&so->so_snd);
	sorflush(so);
	knlist_destroy(&so->so_rcv.sb_sel.si_note);
	knlist_destroy(&so->so_snd.sb_sel.si_note);
	sodealloc(so);
}

/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 *
 * This function will sorele() the socket.  Note that soclose() may be
 * called prior to the ref count reaching zero.  The actual socket
 * structure will not be freed until the ref count reaches zero.
 */
int
soclose(so)
	struct socket *so;
{
	int error = 0;

	KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));

	funsetown(&so->so_sigio);
	if (so->so_options & SO_ACCEPTCONN) {
		struct socket *sp;
		/*
		 * A listening socket: abort every connection still sitting
		 * on the incomplete and complete queues.  The accept mutex
		 * is dropped around each soabort() call, since soabort()
		 * must not be entered with socket locks held.
		 */
		ACCEPT_LOCK();
		while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
			TAILQ_REMOVE(&so->so_incomp, sp, so_list);
			so->so_incqlen--;
			sp->so_qstate &= ~SQ_INCOMP;
			sp->so_head = NULL;
			ACCEPT_UNLOCK();
			(void) soabort(sp);
			ACCEPT_LOCK();
		}
		while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
			TAILQ_REMOVE(&so->so_comp, sp, so_list);
			so->so_qlen--;
			sp->so_qstate &= ~SQ_COMP;
			sp->so_head = NULL;
			ACCEPT_UNLOCK();
			(void) soabort(sp);
			ACCEPT_LOCK();
		}
		ACCEPT_UNLOCK();
	}
	if (so->so_pcb == NULL)
		goto discard;
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			/* Non-blocking socket already disconnecting: don't wait. */
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO))
				goto drop;
			/* Linger: sleep until disconnected, signal, or timeout. */
			while (so->so_state & SS_ISCONNECTED) {
				error = tsleep(&so->so_timeo,
				    PSOCK | PCATCH, "soclos", so->so_linger * hz);
				if (error)
					break;
			}
		}
	}
drop:
	if (so->so_pcb != NULL) {
		/* Preserve the first error seen; pru_detach errors come second. */
		int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
		if (error == 0)
			error = error2;
	}
discard:
	ACCEPT_LOCK();
	SOCK_LOCK(so);
	KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
	so->so_state |= SS_NOFDREF;
	sorele(so);
	return (error);
}

/*
 * soabort() must not be called with any socket locks held, as it calls
 * into the protocol, which will call back into the socket code causing
 * it to acquire additional socket locks that may cause recursion or lock
 * order reversals.
 */
int
soabort(so)
	struct socket *so;
{
	int error;

	error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
	if (error) {
		ACCEPT_LOCK();
		SOCK_LOCK(so);
		sotryfree(so);	/* note: does not decrement the ref count */
		return error;
	}
	return (0);
}

/*
 * Complete an accept: clear SS_NOFDREF (the new socket now has a file
 * descriptor) and ask the protocol for the peer's address via pru_accept.
 */
int
soaccept(so, nam)
	struct socket *so;
	struct sockaddr **nam;
{
	int error;

	SOCK_LOCK(so);
	KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
	so->so_state &= ~SS_NOFDREF;
	SOCK_UNLOCK(so);
	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
	return (error);
}

/*
 * Initiate a connection to the given address via the protocol's
 * pru_connect.  Listening sockets cannot connect.
 */
int
soconnect(so, nam, td)
	struct socket *so;
	struct sockaddr *nam;
	struct thread *td;
{
	int error;

	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnect(so)))) {
		error = EISCONN;
	} else {
		SOCK_LOCK(so);
		/*
		 * Prevent accumulated error from previous connection
		 * from biting us.
		 */
		so->so_error = 0;
		SOCK_UNLOCK(so);
		error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td);
	}

	return (error);
}

/*
 * Connect two sockets to each other (socketpair(2) support); dispatches
 * to so1's pru_connect2.
 */
int
soconnect2(so1, so2)
	struct socket *so1;
	struct socket *so2;
{

	return ((*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2));
}

/*
 * Begin disconnecting a connected socket via the protocol's
 * pru_disconnect.  Returns ENOTCONN if not connected, EALREADY if a
 * disconnect is already in progress.
 */
int
sodisconnect(so)
	struct socket *so;
{
	int error;

	if ((so->so_state & SS_ISCONNECTED) == 0)
		return (ENOTCONN);
	if (so->so_state & SS_ISDISCONNECTING)
		return (EALREADY);
	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
	return (error);
}

/* Map MSG_DONTWAIT onto a non-sleeping socket buffer lock request. */
#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
/*
 * Send on a socket.
 * If send must go all at once and message is larger than
 * send buffering, then hard error.
 * Lock against other senders.
 * If must go all at once and not enough room now, then
 * inform user that this would block and do nothing.
 * Otherwise, if nonblocking, send as much as possible.
 * The data to be sent is described by "uio" if nonzero,
 * otherwise by the mbuf chain "top" (which must be null
 * if uio is not).  Data provided in mbuf chain must be small
 * enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers
 * must check for short counts if EINTR/ERESTART are returned.
 * Data and control buffers are freed on return.
 */

#ifdef ZERO_COPY_SOCKETS
/* Counters tracking why zero-copy was or wasn't attempted per send. */
struct so_zerocopy_stats{
	int size_ok;
	int align_ok;
	int found_ifp;
};
struct so_zerocopy_stats so_zerocp_stats = {0,0,0};
#include <netinet/in.h>
#include <net/route.h>
#include <netinet/in_pcb.h>
#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>
#endif /*ZERO_COPY_SOCKETS*/

int
sosend(so, addr, uio, top, control, flags, td)
	struct socket *so;
	struct sockaddr *addr;
	struct uio *uio;
	struct mbuf *top;
	struct mbuf *control;
	int flags;
	struct thread *td;
{
	struct mbuf **mp;
	struct mbuf *m;
	long space, len = 0, resid;
	int clen = 0, error, dontroute;
	int atomic = sosendallatonce(so) || top;
#ifdef ZERO_COPY_SOCKETS
	int cow_send;
#endif /* ZERO_COPY_SOCKETS */

	if (uio != NULL)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned.
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid.  On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
	 *
	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
	 * type sockets since that's an error.
	 */
	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
		error = EINVAL;
		goto out;
	}

	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	if (td != NULL)
		td->td_proc->p_stats->p_ru.ru_msgsnd++;
	if (control != NULL)
		clen = control->m_len;
#define	snderr(errno)	{ error = (errno); goto release; }

	SOCKBUF_LOCK(&so->so_snd);
restart:
	SOCKBUF_LOCK_ASSERT(&so->so_snd);
	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out_locked;
	do {
		SOCKBUF_LOCK_ASSERT(&so->so_snd);
		if (so->so_snd.sb_state & SBS_CANTSENDMORE)
			snderr(EPIPE);
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			/*
			 * `sendto' and `sendmsg' is allowed on a connection-
			 * based socket if it supports implied connect.
			 * Return ENOTCONN if not connected and no address is
			 * supplied.
			 */
			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
				    !(resid == 0 && clen != 0))
					snderr(ENOTCONN);
			} else if (addr == NULL)
				snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
				    ENOTCONN : EDESTADDRREQ);
		}
		space = sbspace(&so->so_snd);
		/* OOB data is allowed to slightly overcommit the buffer. */
		if (flags & MSG_OOB)
			space += 1024;
		if ((atomic && resid > so->so_snd.sb_hiwat) ||
		    clen > so->so_snd.sb_hiwat)
			snderr(EMSGSIZE);
		if (space < resid + clen &&
		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
			if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO))
				snderr(EWOULDBLOCK);
			/* Wait for buffer space, then re-run all checks. */
			sbunlock(&so->so_snd);
			error = sbwait(&so->so_snd);
			if (error)
				goto out_locked;
			goto restart;
		}
		SOCKBUF_UNLOCK(&so->so_snd);
		mp = &top;
		space -= clen;
		do {
		    if (uio == NULL) {
			/*
			 * Data is prepackaged in "top".
			 */
			resid = 0;
			if (flags & MSG_EOR)
				top->m_flags |= M_EOR;
		    } else do {
#ifdef ZERO_COPY_SOCKETS
			cow_send = 0;
#endif /* ZERO_COPY_SOCKETS */
			if (resid >= MINCLSIZE) {
#ifdef ZERO_COPY_SOCKETS
				if (top == NULL) {
					MGETHDR(m, M_TRYWAIT, MT_DATA);
					if (m == NULL) {
						error = ENOBUFS;
						SOCKBUF_LOCK(&so->so_snd);
						goto release;
					}
					m->m_pkthdr.len = 0;
					m->m_pkthdr.rcvif = (struct ifnet *)0;
				} else {
					MGET(m, M_TRYWAIT, MT_DATA);
					if (m == NULL) {
						error = ENOBUFS;
						SOCKBUF_LOCK(&so->so_snd);
						goto release;
					}
				}
				/*
				 * Try copy-on-write send only for page-sized,
				 * page-aligned user buffers with room in the
				 * socket buffer.
				 */
				if (so_zero_copy_send &&
				    resid>=PAGE_SIZE &&
				    space>=PAGE_SIZE &&
				    uio->uio_iov->iov_len>=PAGE_SIZE) {
					so_zerocp_stats.size_ok++;
					if (!((vm_offset_t)
					  uio->uio_iov->iov_base & PAGE_MASK)){
						so_zerocp_stats.align_ok++;
						cow_send = socow_setup(m, uio);
					}
				}
				if (!cow_send) {
					MCLGET(m, M_TRYWAIT);
					if ((m->m_flags & M_EXT) == 0) {
						m_free(m);
						m = NULL;
					} else {
						len = min(min(MCLBYTES, resid), space);
					}
				} else
					len = PAGE_SIZE;
#else /* ZERO_COPY_SOCKETS */
				if (top == NULL) {
					m = m_getcl(M_TRYWAIT, MT_DATA, M_PKTHDR);
					m->m_pkthdr.len = 0;
					m->m_pkthdr.rcvif = (struct ifnet *)0;
				} else
					m = m_getcl(M_TRYWAIT, MT_DATA, 0);
				len = min(min(MCLBYTES, resid), space);
#endif /* ZERO_COPY_SOCKETS */
			} else {
				if (top == NULL) {
					m = m_gethdr(M_TRYWAIT, MT_DATA);
					m->m_pkthdr.len = 0;
					m->m_pkthdr.rcvif = (struct ifnet *)0;

					len = min(min(MHLEN, resid), space);
					/*
					 * For datagram protocols, leave room
					 * for protocol headers in first mbuf.
					 */
					if (atomic && m && len < MHLEN)
						MH_ALIGN(m, len);
				} else {
					m = m_get(M_TRYWAIT, MT_DATA);
					len = min(min(MLEN, resid), space);
				}
			}
			if (m == NULL) {
				error = ENOBUFS;
				SOCKBUF_LOCK(&so->so_snd);
				goto release;
			}

			space -= len;
#ifdef ZERO_COPY_SOCKETS
			/* COW path already wired the data; no copy needed. */
			if (cow_send)
				error = 0;
			else
#endif /* ZERO_COPY_SOCKETS */
			error = uiomove(mtod(m, void *), (int)len, uio);
			resid = uio->uio_resid;
			m->m_len = len;
			*mp = m;
			top->m_pkthdr.len += len;
			if (error) {
				SOCKBUF_LOCK(&so->so_snd);
				goto release;
			}
			mp = &m->m_next;
			if (resid <= 0) {
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
				break;
			}
		    } while (space > 0 && atomic);
		    if (dontroute) {
			    SOCK_LOCK(so);
			    so->so_options |= SO_DONTROUTE;
			    SOCK_UNLOCK(so);
		    }
		    /*
		     * XXX all the SBS_CANTSENDMORE checks previously
		     * done could be out of date.  We could have received
		     * a reset packet in an interrupt or maybe we slept
		     * while doing page faults in uiomove() etc.  We could
		     * probably recheck again inside the locking protection
		     * here, but there are probably other places that this
		     * also happens.  We must rethink this.
		     */
		    error = (*so->so_proto->pr_usrreqs->pru_send)(so,
			(flags & MSG_OOB) ? PRUS_OOB :
			/*
			 * If the user set MSG_EOF, the protocol
			 * understands this flag and nothing left to
			 * send then use PRU_SEND_EOF instead of PRU_SEND.
			 */
			((flags & MSG_EOF) &&
			 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
			 (resid <= 0)) ?
				PRUS_EOF :
			/* If there is more to send set PRUS_MORETOCOME */
			(resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
			top, addr, control, td);
		    if (dontroute) {
			    SOCK_LOCK(so);
			    so->so_options &= ~SO_DONTROUTE;
			    SOCK_UNLOCK(so);
		    }
		    /* Ownership of top/control passed to the protocol. */
		    clen = 0;
		    control = NULL;
		    top = NULL;
		    mp = &top;
		    if (error) {
			    SOCKBUF_LOCK(&so->so_snd);
			    goto release;
		    }
		} while (resid && space > 0);
		SOCKBUF_LOCK(&so->so_snd);
	} while (resid);

release:
	SOCKBUF_LOCK_ASSERT(&so->so_snd);
	sbunlock(&so->so_snd);
out_locked:
	SOCKBUF_LOCK_ASSERT(&so->so_snd);
	SOCKBUF_UNLOCK(&so->so_snd);
out:
	if (top != NULL)
		m_freem(top);
	if (control != NULL)
		m_freem(control);
	return (error);
}

/*
 * The part of soreceive() that implements reading non-inline out-of-band
 * data from a socket.  For more complete comments, see soreceive(), from
 * which this code originated.
 *
 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
 * unable to return an mbuf chain to the caller.
 */
static int
soreceive_rcvoob(so, uio, flags)
	struct socket *so;
	struct uio *uio;
	int flags;
{
	struct protosw *pr = so->so_proto;
	struct mbuf *m;
	int error;

	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));

	m = m_get(M_TRYWAIT, MT_DATA);
	if (m == NULL)
		return (ENOBUFS);
	/* Ask the protocol to fill the mbuf with the OOB byte(s). */
	error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
	if (error)
		goto bad;
	do {
#ifdef ZERO_COPY_SOCKETS
		if (so_zero_copy_receive) {
			int disposable;

			/* Disposable ext buffers may be handed to the user. */
			if ((m->m_flags & M_EXT)
			 && (m->m_ext.ext_type == EXT_DISPOSABLE))
				disposable = 1;
			else
				disposable = 0;

			error = uiomoveco(mtod(m, void *),
					  min(uio->uio_resid, m->m_len),
					  uio, disposable);
		} else
#endif /* ZERO_COPY_SOCKETS */
		error = uiomove(mtod(m, void *),
		    (int) min(uio->uio_resid, m->m_len), uio);
		m = m_free(m);
	} while (uio->uio_resid && error == 0 && m);
bad:
	if (m != NULL)
		m_freem(m);
	return (error);
}

/*
 * Following replacement or removal of the first mbuf on the first mbuf chain
 * of a socket buffer, push necessary state changes back into the socket
 * buffer so that other consumers see the values consistently.  'nextrecord'
 * is the caller's locally stored value of the original value of
 * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
 * NOTE: 'nextrecord' may be NULL.
 */
static __inline void
sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
{

	SOCKBUF_LOCK_ASSERT(sb);
	/*
	 * First, update for the new value of nextrecord.  If necessary, make
	 * it the first record.
	 */
	if (sb->sb_mb != NULL)
		sb->sb_mb->m_nextpkt = nextrecord;
	else
		sb->sb_mb = nextrecord;

	/*
	 * Now update any dependent socket buffer fields to reflect the new
	 * state.  This is an expanded inline of SB_EMPTY_FIXUP(), with the
	 * addition of a second clause that takes care of the case where
	 * sb_mb has been updated, but remains the last record.
	 */
	if (sb->sb_mb == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (sb->sb_mb->m_nextpkt == NULL)
		sb->sb_lastrecord = sb->sb_mb;
}


/*
 * Implement receive operations on a socket.
 * We depend on the way that records are added to the sockbuf
 * by sbappend*.  In particular, each record (mbufs linked through m_next)
 * must begin with an address if the protocol so specifies,
 * followed by an optional mbuf or mbufs containing ancillary data,
 * and then zero or more mbufs of data.
 * In order to avoid blocking network interrupts for the entire time here,
 * we splx() while doing the actual copy to user space.
 * Although the sockbuf is locked, new data may still be appended,
 * and thus we must maintain consistency of the sockbuf during that time.
 *
 * The caller may receive the data as a single mbuf chain by supplying
 * an mbuf **mp0 for use in returning the chain.  The uio is then used
 * only for the count in uio_resid.
960 */ 961int 962soreceive(so, psa, uio, mp0, controlp, flagsp) 963 struct socket *so; 964 struct sockaddr **psa; 965 struct uio *uio; 966 struct mbuf **mp0; 967 struct mbuf **controlp; 968 int *flagsp; 969{ 970 struct mbuf *m, **mp; 971 int flags, len, error, offset; 972 struct protosw *pr = so->so_proto; 973 struct mbuf *nextrecord; 974 int moff, type = 0; 975 int orig_resid = uio->uio_resid; 976 977 mp = mp0; 978 if (psa != NULL) 979 *psa = NULL; 980 if (controlp != NULL) 981 *controlp = NULL; 982 if (flagsp != NULL) 983 flags = *flagsp &~ MSG_EOR; 984 else 985 flags = 0; 986 if (flags & MSG_OOB) 987 return (soreceive_rcvoob(so, uio, flags)); 988 if (mp != NULL) 989 *mp = NULL; 990 if (so->so_state & SS_ISCONFIRMING && uio->uio_resid) 991 (*pr->pr_usrreqs->pru_rcvd)(so, 0); 992 993 SOCKBUF_LOCK(&so->so_rcv); 994restart: 995 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 996 error = sblock(&so->so_rcv, SBLOCKWAIT(flags)); 997 if (error) 998 goto out; 999 1000 m = so->so_rcv.sb_mb; 1001 /* 1002 * If we have less data than requested, block awaiting more 1003 * (subject to any timeout) if: 1004 * 1. the current count is less than the low water mark, or 1005 * 2. MSG_WAITALL is set, and it is possible to do the entire 1006 * receive operation at once if we block (resid <= hiwat). 1007 * 3. MSG_DONTWAIT is not set 1008 * If MSG_WAITALL is set but resid is larger than the receive buffer, 1009 * we have to do the receive in sections, and thus risk returning 1010 * a short count if a timeout or signal occurs after we start. 
1011 */ 1012 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 && 1013 so->so_rcv.sb_cc < uio->uio_resid) && 1014 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat || 1015 ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) && 1016 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) { 1017 KASSERT(m != NULL || !so->so_rcv.sb_cc, 1018 ("receive: m == %p so->so_rcv.sb_cc == %u", 1019 m, so->so_rcv.sb_cc)); 1020 if (so->so_error) { 1021 if (m != NULL) 1022 goto dontblock; 1023 error = so->so_error; 1024 if ((flags & MSG_PEEK) == 0) 1025 so->so_error = 0; 1026 goto release; 1027 } 1028 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1029 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 1030 if (m) 1031 goto dontblock; 1032 else 1033 goto release; 1034 } 1035 for (; m != NULL; m = m->m_next) 1036 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { 1037 m = so->so_rcv.sb_mb; 1038 goto dontblock; 1039 } 1040 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 && 1041 (so->so_proto->pr_flags & PR_CONNREQUIRED)) { 1042 error = ENOTCONN; 1043 goto release; 1044 } 1045 if (uio->uio_resid == 0) 1046 goto release; 1047 if ((so->so_state & SS_NBIO) || 1048 (flags & (MSG_DONTWAIT|MSG_NBIO))) { 1049 error = EWOULDBLOCK; 1050 goto release; 1051 } 1052 SBLASTRECORDCHK(&so->so_rcv); 1053 SBLASTMBUFCHK(&so->so_rcv); 1054 sbunlock(&so->so_rcv); 1055 error = sbwait(&so->so_rcv); 1056 if (error) 1057 goto out; 1058 goto restart; 1059 } 1060dontblock: 1061 /* 1062 * From this point onward, we maintain 'nextrecord' as a cache of the 1063 * pointer to the next record in the socket buffer. We must keep the 1064 * various socket buffer pointers and local stack versions of the 1065 * pointers in sync, pushing out modifications before dropping the 1066 * socket buffer mutex, and re-reading them when picking it up. 
1067 * 1068 * Otherwise, we will race with the network stack appending new data 1069 * or records onto the socket buffer by using inconsistent/stale 1070 * versions of the field, possibly resulting in socket buffer 1071 * corruption. 1072 * 1073 * By holding the high-level sblock(), we prevent simultaneous 1074 * readers from pulling off the front of the socket buffer. 1075 */ 1076 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1077 if (uio->uio_td) 1078 uio->uio_td->td_proc->p_stats->p_ru.ru_msgrcv++; 1079 KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb")); 1080 SBLASTRECORDCHK(&so->so_rcv); 1081 SBLASTMBUFCHK(&so->so_rcv); 1082 nextrecord = m->m_nextpkt; 1083 if (pr->pr_flags & PR_ADDR) { 1084 KASSERT(m->m_type == MT_SONAME, 1085 ("m->m_type == %d", m->m_type)); 1086 orig_resid = 0; 1087 if (psa != NULL) 1088 *psa = sodupsockaddr(mtod(m, struct sockaddr *), 1089 M_NOWAIT); 1090 if (flags & MSG_PEEK) { 1091 m = m->m_next; 1092 } else { 1093 sbfree(&so->so_rcv, m); 1094 so->so_rcv.sb_mb = m_free(m); 1095 m = so->so_rcv.sb_mb; 1096 sockbuf_pushsync(&so->so_rcv, nextrecord); 1097 } 1098 } 1099 1100 /* 1101 * Process one or more MT_CONTROL mbufs present before any data mbufs 1102 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we 1103 * just copy the data; if !MSG_PEEK, we call into the protocol to 1104 * perform externalization (or freeing if controlp == NULL). 
1105 */ 1106 if (m != NULL && m->m_type == MT_CONTROL) { 1107 struct mbuf *cm = NULL, *cmn; 1108 struct mbuf **cme = &cm; 1109 1110 do { 1111 if (flags & MSG_PEEK) { 1112 if (controlp != NULL) { 1113 *controlp = m_copy(m, 0, m->m_len); 1114 controlp = &(*controlp)->m_next; 1115 } 1116 m = m->m_next; 1117 } else { 1118 sbfree(&so->so_rcv, m); 1119 so->so_rcv.sb_mb = m->m_next; 1120 m->m_next = NULL; 1121 *cme = m; 1122 cme = &(*cme)->m_next; 1123 m = so->so_rcv.sb_mb; 1124 } 1125 } while (m != NULL && m->m_type == MT_CONTROL); 1126 if ((flags & MSG_PEEK) == 0) 1127 sockbuf_pushsync(&so->so_rcv, nextrecord); 1128 while (cm != NULL) { 1129 cmn = cm->m_next; 1130 cm->m_next = NULL; 1131 if (pr->pr_domain->dom_externalize != NULL) { 1132 SOCKBUF_UNLOCK(&so->so_rcv); 1133 error = (*pr->pr_domain->dom_externalize) 1134 (cm, controlp); 1135 SOCKBUF_LOCK(&so->so_rcv); 1136 } else if (controlp != NULL) 1137 *controlp = cm; 1138 else 1139 m_freem(cm); 1140 if (controlp != NULL) { 1141 orig_resid = 0; 1142 while (*controlp != NULL) 1143 controlp = &(*controlp)->m_next; 1144 } 1145 cm = cmn; 1146 } 1147 nextrecord = so->so_rcv.sb_mb->m_nextpkt; 1148 orig_resid = 0; 1149 } 1150 if (m != NULL) { 1151 if ((flags & MSG_PEEK) == 0) { 1152 KASSERT(m->m_nextpkt == nextrecord, 1153 ("soreceive: post-control, nextrecord !sync")); 1154 if (nextrecord == NULL) { 1155 KASSERT(so->so_rcv.sb_mb == m, 1156 ("soreceive: post-control, sb_mb!=m")); 1157 KASSERT(so->so_rcv.sb_lastrecord == m, 1158 ("soreceive: post-control, lastrecord!=m")); 1159 } 1160 } 1161 type = m->m_type; 1162 if (type == MT_OOBDATA) 1163 flags |= MSG_OOB; 1164 } else { 1165 if ((flags & MSG_PEEK) == 0) { 1166 KASSERT(so->so_rcv.sb_mb == nextrecord, 1167 ("soreceive: sb_mb != nextrecord")); 1168 if (so->so_rcv.sb_mb == NULL) { 1169 KASSERT(so->so_rcv.sb_lastrecord == NULL, 1170 ("soreceive: sb_lastercord != NULL")); 1171 } 1172 } 1173 } 1174 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1175 SBLASTRECORDCHK(&so->so_rcv); 1176 
SBLASTMBUFCHK(&so->so_rcv); 1177 1178 /* 1179 * Now continue to read any data mbufs off of the head of the socket 1180 * buffer until the read request is satisfied. Note that 'type' is 1181 * used to store the type of any mbuf reads that have happened so far 1182 * such that soreceive() can stop reading if the type changes, which 1183 * causes soreceive() to return only one of regular data and inline 1184 * out-of-band data in a single socket receive operation. 1185 */ 1186 moff = 0; 1187 offset = 0; 1188 while (m != NULL && uio->uio_resid > 0 && error == 0) { 1189 /* 1190 * If the type of mbuf has changed since the last mbuf 1191 * examined ('type'), end the receive operation. 1192 */ 1193 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1194 if (m->m_type == MT_OOBDATA) { 1195 if (type != MT_OOBDATA) 1196 break; 1197 } else if (type == MT_OOBDATA) 1198 break; 1199 else 1200 KASSERT(m->m_type == MT_DATA || m->m_type == MT_HEADER, 1201 ("m->m_type == %d", m->m_type)); 1202 so->so_rcv.sb_state &= ~SBS_RCVATMARK; 1203 len = uio->uio_resid; 1204 if (so->so_oobmark && len > so->so_oobmark - offset) 1205 len = so->so_oobmark - offset; 1206 if (len > m->m_len - moff) 1207 len = m->m_len - moff; 1208 /* 1209 * If mp is set, just pass back the mbufs. 1210 * Otherwise copy them out via the uio, then free. 1211 * Sockbuf must be consistent here (points to current mbuf, 1212 * it points to next record) when we drop priority; 1213 * we must note any additions to the sockbuf when we 1214 * block interrupts again. 
1215 */ 1216 if (mp == NULL) { 1217 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1218 SBLASTRECORDCHK(&so->so_rcv); 1219 SBLASTMBUFCHK(&so->so_rcv); 1220 SOCKBUF_UNLOCK(&so->so_rcv); 1221#ifdef ZERO_COPY_SOCKETS 1222 if (so_zero_copy_receive) { 1223 int disposable; 1224 1225 if ((m->m_flags & M_EXT) 1226 && (m->m_ext.ext_type == EXT_DISPOSABLE)) 1227 disposable = 1; 1228 else 1229 disposable = 0; 1230 1231 error = uiomoveco(mtod(m, char *) + moff, 1232 (int)len, uio, 1233 disposable); 1234 } else 1235#endif /* ZERO_COPY_SOCKETS */ 1236 error = uiomove(mtod(m, char *) + moff, (int)len, uio); 1237 SOCKBUF_LOCK(&so->so_rcv); 1238 if (error) 1239 goto release; 1240 } else 1241 uio->uio_resid -= len; 1242 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1243 if (len == m->m_len - moff) { 1244 if (m->m_flags & M_EOR) 1245 flags |= MSG_EOR; 1246 if (flags & MSG_PEEK) { 1247 m = m->m_next; 1248 moff = 0; 1249 } else { 1250 nextrecord = m->m_nextpkt; 1251 sbfree(&so->so_rcv, m); 1252 if (mp != NULL) { 1253 *mp = m; 1254 mp = &m->m_next; 1255 so->so_rcv.sb_mb = m = m->m_next; 1256 *mp = NULL; 1257 } else { 1258 so->so_rcv.sb_mb = m_free(m); 1259 m = so->so_rcv.sb_mb; 1260 } 1261 if (m != NULL) { 1262 m->m_nextpkt = nextrecord; 1263 if (nextrecord == NULL) 1264 so->so_rcv.sb_lastrecord = m; 1265 } else { 1266 so->so_rcv.sb_mb = nextrecord; 1267 SB_EMPTY_FIXUP(&so->so_rcv); 1268 } 1269 SBLASTRECORDCHK(&so->so_rcv); 1270 SBLASTMBUFCHK(&so->so_rcv); 1271 } 1272 } else { 1273 if (flags & MSG_PEEK) 1274 moff += len; 1275 else { 1276 if (mp != NULL) { 1277 int copy_flag; 1278 1279 if (flags & MSG_DONTWAIT) 1280 copy_flag = M_DONTWAIT; 1281 else 1282 copy_flag = M_TRYWAIT; 1283 if (copy_flag == M_TRYWAIT) 1284 SOCKBUF_UNLOCK(&so->so_rcv); 1285 *mp = m_copym(m, 0, len, copy_flag); 1286 if (copy_flag == M_TRYWAIT) 1287 SOCKBUF_LOCK(&so->so_rcv); 1288 if (*mp == NULL) { 1289 /* 1290 * m_copym() couldn't allocate an mbuf. 
1291 * Adjust uio_resid back (it was adjusted 1292 * down by len bytes, which we didn't end 1293 * up "copying" over). 1294 */ 1295 uio->uio_resid += len; 1296 break; 1297 } 1298 } 1299 m->m_data += len; 1300 m->m_len -= len; 1301 so->so_rcv.sb_cc -= len; 1302 } 1303 } 1304 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1305 if (so->so_oobmark) { 1306 if ((flags & MSG_PEEK) == 0) { 1307 so->so_oobmark -= len; 1308 if (so->so_oobmark == 0) { 1309 so->so_rcv.sb_state |= SBS_RCVATMARK; 1310 break; 1311 } 1312 } else { 1313 offset += len; 1314 if (offset == so->so_oobmark) 1315 break; 1316 } 1317 } 1318 if (flags & MSG_EOR) 1319 break; 1320 /* 1321 * If the MSG_WAITALL flag is set (for non-atomic socket), 1322 * we must not quit until "uio->uio_resid == 0" or an error 1323 * termination. If a signal/timeout occurs, return 1324 * with a short count but without error. 1325 * Keep sockbuf locked against other readers. 1326 */ 1327 while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 && 1328 !sosendallatonce(so) && nextrecord == NULL) { 1329 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1330 if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE) 1331 break; 1332 /* 1333 * Notify the protocol that some data has been 1334 * drained before blocking. 1335 */ 1336 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb != NULL) { 1337 SOCKBUF_UNLOCK(&so->so_rcv); 1338 (*pr->pr_usrreqs->pru_rcvd)(so, flags); 1339 SOCKBUF_LOCK(&so->so_rcv); 1340 } 1341 SBLASTRECORDCHK(&so->so_rcv); 1342 SBLASTMBUFCHK(&so->so_rcv); 1343 error = sbwait(&so->so_rcv); 1344 if (error) 1345 goto release; 1346 m = so->so_rcv.sb_mb; 1347 if (m != NULL) 1348 nextrecord = m->m_nextpkt; 1349 } 1350 } 1351 1352 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1353 if (m != NULL && pr->pr_flags & PR_ATOMIC) { 1354 flags |= MSG_TRUNC; 1355 if ((flags & MSG_PEEK) == 0) 1356 (void) sbdroprecord_locked(&so->so_rcv); 1357 } 1358 if ((flags & MSG_PEEK) == 0) { 1359 if (m == NULL) { 1360 /* 1361 * First part is an inline SB_EMPTY_FIXUP(). 
Second 1362 * part makes sure sb_lastrecord is up-to-date if 1363 * there is still data in the socket buffer. 1364 */ 1365 so->so_rcv.sb_mb = nextrecord; 1366 if (so->so_rcv.sb_mb == NULL) { 1367 so->so_rcv.sb_mbtail = NULL; 1368 so->so_rcv.sb_lastrecord = NULL; 1369 } else if (nextrecord->m_nextpkt == NULL) 1370 so->so_rcv.sb_lastrecord = nextrecord; 1371 } 1372 SBLASTRECORDCHK(&so->so_rcv); 1373 SBLASTMBUFCHK(&so->so_rcv); 1374 /* 1375 * If soreceive() is being done from the socket callback, then 1376 * don't need to generate ACK to peer to update window, since 1377 * ACK will be generated on return to TCP. 1378 */ 1379 if (!(flags & MSG_SOCALLBCK) && 1380 (pr->pr_flags & PR_WANTRCVD) && so->so_pcb) { 1381 SOCKBUF_UNLOCK(&so->so_rcv); 1382 (*pr->pr_usrreqs->pru_rcvd)(so, flags); 1383 SOCKBUF_LOCK(&so->so_rcv); 1384 } 1385 } 1386 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1387 if (orig_resid == uio->uio_resid && orig_resid && 1388 (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) { 1389 sbunlock(&so->so_rcv); 1390 goto restart; 1391 } 1392 1393 if (flagsp != NULL) 1394 *flagsp |= flags; 1395release: 1396 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1397 sbunlock(&so->so_rcv); 1398out: 1399 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1400 SOCKBUF_UNLOCK(&so->so_rcv); 1401 return (error); 1402} 1403 1404int 1405soshutdown(so, how) 1406 struct socket *so; 1407 int how; 1408{ 1409 struct protosw *pr = so->so_proto; 1410 1411 if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR)) 1412 return (EINVAL); 1413 1414 if (how != SHUT_WR) 1415 sorflush(so); 1416 if (how != SHUT_RD) 1417 return ((*pr->pr_usrreqs->pru_shutdown)(so)); 1418 return (0); 1419} 1420 1421void 1422sorflush(so) 1423 struct socket *so; 1424{ 1425 struct sockbuf *sb = &so->so_rcv; 1426 struct protosw *pr = so->so_proto; 1427 struct sockbuf asb; 1428 1429 /* 1430 * XXXRW: This is quite ugly. 
Previously, this code made a copy of 1431 * the socket buffer, then zero'd the original to clear the buffer 1432 * fields. However, with mutexes in the socket buffer, this causes 1433 * problems. We only clear the zeroable bits of the original; 1434 * however, we have to initialize and destroy the mutex in the copy 1435 * so that dom_dispose() and sbrelease() can lock t as needed. 1436 */ 1437 SOCKBUF_LOCK(sb); 1438 sb->sb_flags |= SB_NOINTR; 1439 (void) sblock(sb, M_WAITOK); 1440 /* 1441 * socantrcvmore_locked() drops the socket buffer mutex so that it 1442 * can safely perform wakeups. Re-acquire the mutex before 1443 * continuing. 1444 */ 1445 socantrcvmore_locked(so); 1446 SOCKBUF_LOCK(sb); 1447 sbunlock(sb); 1448 /* 1449 * Invalidate/clear most of the sockbuf structure, but leave 1450 * selinfo and mutex data unchanged. 1451 */ 1452 bzero(&asb, offsetof(struct sockbuf, sb_startzero)); 1453 bcopy(&sb->sb_startzero, &asb.sb_startzero, 1454 sizeof(*sb) - offsetof(struct sockbuf, sb_startzero)); 1455 bzero(&sb->sb_startzero, 1456 sizeof(*sb) - offsetof(struct sockbuf, sb_startzero)); 1457 SOCKBUF_UNLOCK(sb); 1458 1459 SOCKBUF_LOCK_INIT(&asb, "so_rcv"); 1460 if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL) 1461 (*pr->pr_domain->dom_dispose)(asb.sb_mb); 1462 sbrelease(&asb, so); 1463 SOCKBUF_LOCK_DESTROY(&asb); 1464} 1465 1466#ifdef INET 1467static int 1468do_setopt_accept_filter(so, sopt) 1469 struct socket *so; 1470 struct sockopt *sopt; 1471{ 1472 struct accept_filter_arg *afap; 1473 struct accept_filter *afp; 1474 struct so_accf *newaf; 1475 int error = 0; 1476 1477 newaf = NULL; 1478 afap = NULL; 1479 1480 /* 1481 * XXXRW: Configuring accept filters should be an atomic test-and-set 1482 * operation to prevent races during setup and attach. There may be 1483 * more general issues of racing and ordering here that are not yet 1484 * addressed by locking. 
1485 */ 1486 /* do not set/remove accept filters on non listen sockets */ 1487 SOCK_LOCK(so); 1488 if ((so->so_options & SO_ACCEPTCONN) == 0) { 1489 SOCK_UNLOCK(so); 1490 return (EINVAL); 1491 } 1492 1493 /* removing the filter */ 1494 if (sopt == NULL) { 1495 if (so->so_accf != NULL) { 1496 struct so_accf *af = so->so_accf; 1497 if (af->so_accept_filter != NULL && 1498 af->so_accept_filter->accf_destroy != NULL) { 1499 af->so_accept_filter->accf_destroy(so); 1500 } 1501 if (af->so_accept_filter_str != NULL) { 1502 FREE(af->so_accept_filter_str, M_ACCF); 1503 } 1504 FREE(af, M_ACCF); 1505 so->so_accf = NULL; 1506 } 1507 so->so_options &= ~SO_ACCEPTFILTER; 1508 SOCK_UNLOCK(so); 1509 return (0); 1510 } 1511 SOCK_UNLOCK(so); 1512 1513 /*- 1514 * Adding a filter. 1515 * 1516 * Do memory allocation, copyin, and filter lookup now while we're 1517 * not holding any locks. Avoids sleeping with a mutex, as well as 1518 * introducing a lock order between accept filter locks and socket 1519 * locks here. 1520 */ 1521 MALLOC(afap, struct accept_filter_arg *, sizeof(*afap), M_TEMP, 1522 M_WAITOK); 1523 /* don't put large objects on the kernel stack */ 1524 error = sooptcopyin(sopt, afap, sizeof *afap, sizeof *afap); 1525 afap->af_name[sizeof(afap->af_name)-1] = '\0'; 1526 afap->af_arg[sizeof(afap->af_arg)-1] = '\0'; 1527 if (error) { 1528 FREE(afap, M_TEMP); 1529 return (error); 1530 } 1531 afp = accept_filt_get(afap->af_name); 1532 if (afp == NULL) { 1533 FREE(afap, M_TEMP); 1534 return (ENOENT); 1535 } 1536 1537 /* 1538 * Allocate the new accept filter instance storage. We may have to 1539 * free it again later if we fail to attach it. If attached 1540 * properly, 'newaf' is NULLed to avoid a free() while in use. 
1541 */ 1542 MALLOC(newaf, struct so_accf *, sizeof(*newaf), M_ACCF, M_WAITOK | 1543 M_ZERO); 1544 if (afp->accf_create != NULL && afap->af_name[0] != '\0') { 1545 int len = strlen(afap->af_name) + 1; 1546 MALLOC(newaf->so_accept_filter_str, char *, len, M_ACCF, 1547 M_WAITOK); 1548 strcpy(newaf->so_accept_filter_str, afap->af_name); 1549 } 1550 1551 SOCK_LOCK(so); 1552 /* must remove previous filter first */ 1553 if (so->so_accf != NULL) { 1554 error = EINVAL; 1555 goto out; 1556 } 1557 /* 1558 * Invoke the accf_create() method of the filter if required. 1559 * XXXRW: the socket mutex is held over this call, so the create 1560 * method cannot block. This may be something we have to change, but 1561 * it would require addressing possible races. 1562 */ 1563 if (afp->accf_create != NULL) { 1564 newaf->so_accept_filter_arg = 1565 afp->accf_create(so, afap->af_arg); 1566 if (newaf->so_accept_filter_arg == NULL) { 1567 error = EINVAL; 1568 goto out; 1569 } 1570 } 1571 newaf->so_accept_filter = afp; 1572 so->so_accf = newaf; 1573 so->so_options |= SO_ACCEPTFILTER; 1574 newaf = NULL; 1575out: 1576 SOCK_UNLOCK(so); 1577 if (newaf != NULL) { 1578 if (newaf->so_accept_filter_str != NULL) 1579 FREE(newaf->so_accept_filter_str, M_ACCF); 1580 FREE(newaf, M_ACCF); 1581 } 1582 if (afap != NULL) 1583 FREE(afap, M_TEMP); 1584 return (error); 1585} 1586#endif /* INET */ 1587 1588/* 1589 * Perhaps this routine, and sooptcopyout(), below, ought to come in 1590 * an additional variant to handle the case where the option value needs 1591 * to be some kind of integer, but not a specific size. 1592 * In addition to their use here, these functions are also called by the 1593 * protocol-level pr_ctloutput() routines. 
1594 */ 1595int 1596sooptcopyin(sopt, buf, len, minlen) 1597 struct sockopt *sopt; 1598 void *buf; 1599 size_t len; 1600 size_t minlen; 1601{ 1602 size_t valsize; 1603 1604 /* 1605 * If the user gives us more than we wanted, we ignore it, 1606 * but if we don't get the minimum length the caller 1607 * wants, we return EINVAL. On success, sopt->sopt_valsize 1608 * is set to however much we actually retrieved. 1609 */ 1610 if ((valsize = sopt->sopt_valsize) < minlen) 1611 return EINVAL; 1612 if (valsize > len) 1613 sopt->sopt_valsize = valsize = len; 1614 1615 if (sopt->sopt_td != NULL) 1616 return (copyin(sopt->sopt_val, buf, valsize)); 1617 1618 bcopy(sopt->sopt_val, buf, valsize); 1619 return 0; 1620} 1621 1622/* 1623 * Kernel version of setsockopt(2)/ 1624 * XXX: optlen is size_t, not socklen_t 1625 */ 1626int 1627so_setsockopt(struct socket *so, int level, int optname, void *optval, 1628 size_t optlen) 1629{ 1630 struct sockopt sopt; 1631 1632 sopt.sopt_level = level; 1633 sopt.sopt_name = optname; 1634 sopt.sopt_dir = SOPT_SET; 1635 sopt.sopt_val = optval; 1636 sopt.sopt_valsize = optlen; 1637 sopt.sopt_td = NULL; 1638 return (sosetopt(so, &sopt)); 1639} 1640 1641int 1642sosetopt(so, sopt) 1643 struct socket *so; 1644 struct sockopt *sopt; 1645{ 1646 int error, optval; 1647 struct linger l; 1648 struct timeval tv; 1649 u_long val; 1650#ifdef MAC 1651 struct mac extmac; 1652#endif 1653 1654 error = 0; 1655 if (sopt->sopt_level != SOL_SOCKET) { 1656 if (so->so_proto && so->so_proto->pr_ctloutput) 1657 return ((*so->so_proto->pr_ctloutput) 1658 (so, sopt)); 1659 error = ENOPROTOOPT; 1660 } else { 1661 switch (sopt->sopt_name) { 1662#ifdef INET 1663 case SO_ACCEPTFILTER: 1664 error = do_setopt_accept_filter(so, sopt); 1665 if (error) 1666 goto bad; 1667 break; 1668#endif 1669 case SO_LINGER: 1670 error = sooptcopyin(sopt, &l, sizeof l, sizeof l); 1671 if (error) 1672 goto bad; 1673 1674 SOCK_LOCK(so); 1675 so->so_linger = l.l_linger; 1676 if (l.l_onoff) 1677 
so->so_options |= SO_LINGER; 1678 else 1679 so->so_options &= ~SO_LINGER; 1680 SOCK_UNLOCK(so); 1681 break; 1682 1683 case SO_DEBUG: 1684 case SO_KEEPALIVE: 1685 case SO_DONTROUTE: 1686 case SO_USELOOPBACK: 1687 case SO_BROADCAST: 1688 case SO_REUSEADDR: 1689 case SO_REUSEPORT: 1690 case SO_OOBINLINE: 1691 case SO_TIMESTAMP: 1692 case SO_BINTIME: 1693 case SO_NOSIGPIPE: 1694 error = sooptcopyin(sopt, &optval, sizeof optval, 1695 sizeof optval); 1696 if (error) 1697 goto bad; 1698 SOCK_LOCK(so); 1699 if (optval) 1700 so->so_options |= sopt->sopt_name; 1701 else 1702 so->so_options &= ~sopt->sopt_name; 1703 SOCK_UNLOCK(so); 1704 break; 1705 1706 case SO_SNDBUF: 1707 case SO_RCVBUF: 1708 case SO_SNDLOWAT: 1709 case SO_RCVLOWAT: 1710 error = sooptcopyin(sopt, &optval, sizeof optval, 1711 sizeof optval); 1712 if (error) 1713 goto bad; 1714 1715 /* 1716 * Values < 1 make no sense for any of these 1717 * options, so disallow them. 1718 */ 1719 if (optval < 1) { 1720 error = EINVAL; 1721 goto bad; 1722 } 1723 1724 switch (sopt->sopt_name) { 1725 case SO_SNDBUF: 1726 case SO_RCVBUF: 1727 if (sbreserve(sopt->sopt_name == SO_SNDBUF ? 1728 &so->so_snd : &so->so_rcv, (u_long)optval, 1729 so, curthread) == 0) { 1730 error = ENOBUFS; 1731 goto bad; 1732 } 1733 break; 1734 1735 /* 1736 * Make sure the low-water is never greater than 1737 * the high-water. 1738 */ 1739 case SO_SNDLOWAT: 1740 SOCKBUF_LOCK(&so->so_snd); 1741 so->so_snd.sb_lowat = 1742 (optval > so->so_snd.sb_hiwat) ? 1743 so->so_snd.sb_hiwat : optval; 1744 SOCKBUF_UNLOCK(&so->so_snd); 1745 break; 1746 case SO_RCVLOWAT: 1747 SOCKBUF_LOCK(&so->so_rcv); 1748 so->so_rcv.sb_lowat = 1749 (optval > so->so_rcv.sb_hiwat) ? 
1750 so->so_rcv.sb_hiwat : optval; 1751 SOCKBUF_UNLOCK(&so->so_rcv); 1752 break; 1753 } 1754 break; 1755 1756 case SO_SNDTIMEO: 1757 case SO_RCVTIMEO: 1758 error = sooptcopyin(sopt, &tv, sizeof tv, 1759 sizeof tv); 1760 if (error) 1761 goto bad; 1762 1763 /* assert(hz > 0); */ 1764 if (tv.tv_sec < 0 || tv.tv_sec > INT_MAX / hz || 1765 tv.tv_usec < 0 || tv.tv_usec >= 1000000) { 1766 error = EDOM; 1767 goto bad; 1768 } 1769 /* assert(tick > 0); */ 1770 /* assert(ULONG_MAX - INT_MAX >= 1000000); */ 1771 val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick; 1772 if (val > INT_MAX) { 1773 error = EDOM; 1774 goto bad; 1775 } 1776 if (val == 0 && tv.tv_usec != 0) 1777 val = 1; 1778 1779 switch (sopt->sopt_name) { 1780 case SO_SNDTIMEO: 1781 so->so_snd.sb_timeo = val; 1782 break; 1783 case SO_RCVTIMEO: 1784 so->so_rcv.sb_timeo = val; 1785 break; 1786 } 1787 break; 1788 case SO_LABEL: 1789#ifdef MAC 1790 error = sooptcopyin(sopt, &extmac, sizeof extmac, 1791 sizeof extmac); 1792 if (error) 1793 goto bad; 1794 error = mac_setsockopt_label(sopt->sopt_td->td_ucred, 1795 so, &extmac); 1796#else 1797 error = EOPNOTSUPP; 1798#endif 1799 break; 1800 default: 1801 error = ENOPROTOOPT; 1802 break; 1803 } 1804 if (error == 0 && so->so_proto != NULL && 1805 so->so_proto->pr_ctloutput != NULL) { 1806 (void) ((*so->so_proto->pr_ctloutput) 1807 (so, sopt)); 1808 } 1809 } 1810bad: 1811 return (error); 1812} 1813 1814/* Helper routine for getsockopt */ 1815int 1816sooptcopyout(struct sockopt *sopt, const void *buf, size_t len) 1817{ 1818 int error; 1819 size_t valsize; 1820 1821 error = 0; 1822 1823 /* 1824 * Documented get behavior is that we always return a value, 1825 * possibly truncated to fit in the user's buffer. 1826 * Traditional behavior is that we always tell the user 1827 * precisely how much we copied, rather than something useful 1828 * like the total amount we had available for her. 
1829 * Note that this interface is not idempotent; the entire answer must 1830 * generated ahead of time. 1831 */ 1832 valsize = min(len, sopt->sopt_valsize); 1833 sopt->sopt_valsize = valsize; 1834 if (sopt->sopt_val != NULL) { 1835 if (sopt->sopt_td != NULL) 1836 error = copyout(buf, sopt->sopt_val, valsize); 1837 else 1838 bcopy(buf, sopt->sopt_val, valsize); 1839 } 1840 return error; 1841} 1842 1843int 1844sogetopt(so, sopt) 1845 struct socket *so; 1846 struct sockopt *sopt; 1847{ 1848 int error, optval; 1849 struct linger l; 1850 struct timeval tv; 1851#ifdef INET 1852 struct accept_filter_arg *afap; 1853#endif 1854#ifdef MAC 1855 struct mac extmac; 1856#endif 1857 1858 error = 0; 1859 if (sopt->sopt_level != SOL_SOCKET) { 1860 if (so->so_proto && so->so_proto->pr_ctloutput) { 1861 return ((*so->so_proto->pr_ctloutput) 1862 (so, sopt)); 1863 } else 1864 return (ENOPROTOOPT); 1865 } else { 1866 switch (sopt->sopt_name) { 1867#ifdef INET 1868 case SO_ACCEPTFILTER: 1869 /* Unlocked read. 
*/ 1870 if ((so->so_options & SO_ACCEPTCONN) == 0) 1871 return (EINVAL); 1872 MALLOC(afap, struct accept_filter_arg *, sizeof(*afap), 1873 M_TEMP, M_WAITOK | M_ZERO); 1874 SOCK_LOCK(so); 1875 if ((so->so_options & SO_ACCEPTFILTER) != 0) { 1876 strcpy(afap->af_name, so->so_accf->so_accept_filter->accf_name); 1877 if (so->so_accf->so_accept_filter_str != NULL) 1878 strcpy(afap->af_arg, so->so_accf->so_accept_filter_str); 1879 } 1880 SOCK_UNLOCK(so); 1881 error = sooptcopyout(sopt, afap, sizeof(*afap)); 1882 FREE(afap, M_TEMP); 1883 break; 1884#endif 1885 1886 case SO_LINGER: 1887 SOCK_LOCK(so); 1888 l.l_onoff = so->so_options & SO_LINGER; 1889 l.l_linger = so->so_linger; 1890 SOCK_UNLOCK(so); 1891 error = sooptcopyout(sopt, &l, sizeof l); 1892 break; 1893 1894 case SO_USELOOPBACK: 1895 case SO_DONTROUTE: 1896 case SO_DEBUG: 1897 case SO_KEEPALIVE: 1898 case SO_REUSEADDR: 1899 case SO_REUSEPORT: 1900 case SO_BROADCAST: 1901 case SO_OOBINLINE: 1902 case SO_TIMESTAMP: 1903 case SO_BINTIME: 1904 case SO_NOSIGPIPE: 1905 optval = so->so_options & sopt->sopt_name; 1906integer: 1907 error = sooptcopyout(sopt, &optval, sizeof optval); 1908 break; 1909 1910 case SO_TYPE: 1911 optval = so->so_type; 1912 goto integer; 1913 1914 case SO_ERROR: 1915 optval = so->so_error; 1916 so->so_error = 0; 1917 goto integer; 1918 1919 case SO_SNDBUF: 1920 optval = so->so_snd.sb_hiwat; 1921 goto integer; 1922 1923 case SO_RCVBUF: 1924 optval = so->so_rcv.sb_hiwat; 1925 goto integer; 1926 1927 case SO_SNDLOWAT: 1928 optval = so->so_snd.sb_lowat; 1929 goto integer; 1930 1931 case SO_RCVLOWAT: 1932 optval = so->so_rcv.sb_lowat; 1933 goto integer; 1934 1935 case SO_SNDTIMEO: 1936 case SO_RCVTIMEO: 1937 optval = (sopt->sopt_name == SO_SNDTIMEO ? 
1938 so->so_snd.sb_timeo : so->so_rcv.sb_timeo); 1939 1940 tv.tv_sec = optval / hz; 1941 tv.tv_usec = (optval % hz) * tick; 1942 error = sooptcopyout(sopt, &tv, sizeof tv); 1943 break; 1944 case SO_LABEL: 1945#ifdef MAC 1946 error = sooptcopyin(sopt, &extmac, sizeof(extmac), 1947 sizeof(extmac)); 1948 if (error) 1949 return (error); 1950 error = mac_getsockopt_label(sopt->sopt_td->td_ucred, 1951 so, &extmac); 1952 if (error) 1953 return (error); 1954 error = sooptcopyout(sopt, &extmac, sizeof extmac); 1955#else 1956 error = EOPNOTSUPP; 1957#endif 1958 break; 1959 case SO_PEERLABEL: 1960#ifdef MAC 1961 error = sooptcopyin(sopt, &extmac, sizeof(extmac), 1962 sizeof(extmac)); 1963 if (error) 1964 return (error); 1965 error = mac_getsockopt_peerlabel( 1966 sopt->sopt_td->td_ucred, so, &extmac); 1967 if (error) 1968 return (error); 1969 error = sooptcopyout(sopt, &extmac, sizeof extmac); 1970#else 1971 error = EOPNOTSUPP; 1972#endif 1973 break; 1974 default: 1975 error = ENOPROTOOPT; 1976 break; 1977 } 1978 return (error); 1979 } 1980} 1981 1982/* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */ 1983int 1984soopt_getm(struct sockopt *sopt, struct mbuf **mp) 1985{ 1986 struct mbuf *m, *m_prev; 1987 int sopt_size = sopt->sopt_valsize; 1988 1989 MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA); 1990 if (m == NULL) 1991 return ENOBUFS; 1992 if (sopt_size > MLEN) { 1993 MCLGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT); 1994 if ((m->m_flags & M_EXT) == 0) { 1995 m_free(m); 1996 return ENOBUFS; 1997 } 1998 m->m_len = min(MCLBYTES, sopt_size); 1999 } else { 2000 m->m_len = min(MLEN, sopt_size); 2001 } 2002 sopt_size -= m->m_len; 2003 *mp = m; 2004 m_prev = m; 2005 2006 while (sopt_size) { 2007 MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA); 2008 if (m == NULL) { 2009 m_freem(*mp); 2010 return ENOBUFS; 2011 } 2012 if (sopt_size > MLEN) { 2013 MCLGET(m, sopt->sopt_td != NULL ? 
M_TRYWAIT : 2014 M_DONTWAIT); 2015 if ((m->m_flags & M_EXT) == 0) { 2016 m_freem(m); 2017 m_freem(*mp); 2018 return ENOBUFS; 2019 } 2020 m->m_len = min(MCLBYTES, sopt_size); 2021 } else { 2022 m->m_len = min(MLEN, sopt_size); 2023 } 2024 sopt_size -= m->m_len; 2025 m_prev->m_next = m; 2026 m_prev = m; 2027 } 2028 return 0; 2029} 2030 2031/* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */ 2032int 2033soopt_mcopyin(struct sockopt *sopt, struct mbuf *m) 2034{ 2035 struct mbuf *m0 = m; 2036 2037 if (sopt->sopt_val == NULL) 2038 return 0; 2039 while (m != NULL && sopt->sopt_valsize >= m->m_len) { 2040 if (sopt->sopt_td != NULL) { 2041 int error; 2042 2043 error = copyin(sopt->sopt_val, mtod(m, char *), 2044 m->m_len); 2045 if (error != 0) { 2046 m_freem(m0); 2047 return(error); 2048 } 2049 } else 2050 bcopy(sopt->sopt_val, mtod(m, char *), m->m_len); 2051 sopt->sopt_valsize -= m->m_len; 2052 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; 2053 m = m->m_next; 2054 } 2055 if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */ 2056 panic("ip6_sooptmcopyin"); 2057 return 0; 2058} 2059 2060/* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. 
*/ 2061int 2062soopt_mcopyout(struct sockopt *sopt, struct mbuf *m) 2063{ 2064 struct mbuf *m0 = m; 2065 size_t valsize = 0; 2066 2067 if (sopt->sopt_val == NULL) 2068 return 0; 2069 while (m != NULL && sopt->sopt_valsize >= m->m_len) { 2070 if (sopt->sopt_td != NULL) { 2071 int error; 2072 2073 error = copyout(mtod(m, char *), sopt->sopt_val, 2074 m->m_len); 2075 if (error != 0) { 2076 m_freem(m0); 2077 return(error); 2078 } 2079 } else 2080 bcopy(mtod(m, char *), sopt->sopt_val, m->m_len); 2081 sopt->sopt_valsize -= m->m_len; 2082 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; 2083 valsize += m->m_len; 2084 m = m->m_next; 2085 } 2086 if (m != NULL) { 2087 /* enough soopt buffer should be given from user-land */ 2088 m_freem(m0); 2089 return(EINVAL); 2090 } 2091 sopt->sopt_valsize = valsize; 2092 return 0; 2093} 2094 2095void 2096sohasoutofband(so) 2097 struct socket *so; 2098{ 2099 if (so->so_sigio != NULL) 2100 pgsigio(&so->so_sigio, SIGURG, 0); 2101 selwakeuppri(&so->so_rcv.sb_sel, PSOCK); 2102} 2103 2104int 2105sopoll(struct socket *so, int events, struct ucred *active_cred, 2106 struct thread *td) 2107{ 2108 int revents = 0; 2109 2110 SOCKBUF_LOCK(&so->so_snd); 2111 SOCKBUF_LOCK(&so->so_rcv); 2112 if (events & (POLLIN | POLLRDNORM)) 2113 if (soreadable(so)) 2114 revents |= events & (POLLIN | POLLRDNORM); 2115 2116 if (events & POLLINIGNEOF) 2117 if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat || 2118 !TAILQ_EMPTY(&so->so_comp) || so->so_error) 2119 revents |= POLLINIGNEOF; 2120 2121 if (events & (POLLOUT | POLLWRNORM)) 2122 if (sowriteable(so)) 2123 revents |= events & (POLLOUT | POLLWRNORM); 2124 2125 if (events & (POLLPRI | POLLRDBAND)) 2126 if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK)) 2127 revents |= events & (POLLPRI | POLLRDBAND); 2128 2129 if (revents == 0) { 2130 if (events & 2131 (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM | 2132 POLLRDBAND)) { 2133 selrecord(td, &so->so_rcv.sb_sel); 2134 so->so_rcv.sb_flags |= SB_SEL; 2135 } 2136 
2137 if (events & (POLLOUT | POLLWRNORM)) { 2138 selrecord(td, &so->so_snd.sb_sel); 2139 so->so_snd.sb_flags |= SB_SEL; 2140 } 2141 } 2142 2143 SOCKBUF_UNLOCK(&so->so_rcv); 2144 SOCKBUF_UNLOCK(&so->so_snd); 2145 return (revents); 2146} 2147 2148int 2149soo_kqfilter(struct file *fp, struct knote *kn) 2150{ 2151 struct socket *so = kn->kn_fp->f_data; 2152 struct sockbuf *sb; 2153 2154 switch (kn->kn_filter) { 2155 case EVFILT_READ: 2156 if (so->so_options & SO_ACCEPTCONN) 2157 kn->kn_fop = &solisten_filtops; 2158 else 2159 kn->kn_fop = &soread_filtops; 2160 sb = &so->so_rcv; 2161 break; 2162 case EVFILT_WRITE: 2163 kn->kn_fop = &sowrite_filtops; 2164 sb = &so->so_snd; 2165 break; 2166 default: 2167 return (EINVAL); 2168 } 2169 2170 SOCKBUF_LOCK(sb); 2171 knlist_add(&sb->sb_sel.si_note, kn, 1); 2172 sb->sb_flags |= SB_KNOTE; 2173 SOCKBUF_UNLOCK(sb); 2174 return (0); 2175} 2176 2177static void 2178filt_sordetach(struct knote *kn) 2179{ 2180 struct socket *so = kn->kn_fp->f_data; 2181 2182 SOCKBUF_LOCK(&so->so_rcv); 2183 knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1); 2184 if (knlist_empty(&so->so_rcv.sb_sel.si_note)) 2185 so->so_rcv.sb_flags &= ~SB_KNOTE; 2186 SOCKBUF_UNLOCK(&so->so_rcv); 2187} 2188 2189/*ARGSUSED*/ 2190static int 2191filt_soread(struct knote *kn, long hint) 2192{ 2193 struct socket *so; 2194 2195 so = kn->kn_fp->f_data; 2196 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2197 2198 kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl; 2199 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 2200 kn->kn_flags |= EV_EOF; 2201 kn->kn_fflags = so->so_error; 2202 return (1); 2203 } else if (so->so_error) /* temporary udp error */ 2204 return (1); 2205 else if (kn->kn_sfflags & NOTE_LOWAT) 2206 return (kn->kn_data >= kn->kn_sdata); 2207 else 2208 return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat); 2209} 2210 2211static void 2212filt_sowdetach(struct knote *kn) 2213{ 2214 struct socket *so = kn->kn_fp->f_data; 2215 2216 SOCKBUF_LOCK(&so->so_snd); 2217 
knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1); 2218 if (knlist_empty(&so->so_snd.sb_sel.si_note)) 2219 so->so_snd.sb_flags &= ~SB_KNOTE; 2220 SOCKBUF_UNLOCK(&so->so_snd); 2221} 2222 2223/*ARGSUSED*/ 2224static int 2225filt_sowrite(struct knote *kn, long hint) 2226{ 2227 struct socket *so; 2228 2229 so = kn->kn_fp->f_data; 2230 SOCKBUF_LOCK_ASSERT(&so->so_snd); 2231 kn->kn_data = sbspace(&so->so_snd); 2232 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 2233 kn->kn_flags |= EV_EOF; 2234 kn->kn_fflags = so->so_error; 2235 return (1); 2236 } else if (so->so_error) /* temporary udp error */ 2237 return (1); 2238 else if (((so->so_state & SS_ISCONNECTED) == 0) && 2239 (so->so_proto->pr_flags & PR_CONNREQUIRED)) 2240 return (0); 2241 else if (kn->kn_sfflags & NOTE_LOWAT) 2242 return (kn->kn_data >= kn->kn_sdata); 2243 else 2244 return (kn->kn_data >= so->so_snd.sb_lowat); 2245} 2246 2247/*ARGSUSED*/ 2248static int 2249filt_solisten(struct knote *kn, long hint) 2250{ 2251 struct socket *so = kn->kn_fp->f_data; 2252 2253 kn->kn_data = so->so_qlen; 2254 return (! TAILQ_EMPTY(&so->so_comp)); 2255} 2256 2257int 2258socheckuid(struct socket *so, uid_t uid) 2259{ 2260 2261 if (so == NULL) 2262 return (EPERM); 2263 if (so->so_cred->cr_uid != uid) 2264 return (EPERM); 2265 return (0); 2266} 2267 2268static int 2269somaxconn_sysctl(SYSCTL_HANDLER_ARGS) 2270{ 2271 int error; 2272 int val; 2273 2274 val = somaxconn; 2275 error = sysctl_handle_int(oidp, &val, sizeof(int), req); 2276 if (error || !req->newptr ) 2277 return (error); 2278 2279 if (val < 1 || val > USHRT_MAX) 2280 return (EINVAL); 2281 2282 somaxconn = val; 2283 return (0); 2284} 2285