uipc_socket.c revision 136373
1/* 2 * Copyright (c) 2004 The FreeBSD Foundation 3 * Copyright (c) 2004 Robert Watson 4 * Copyright (c) 1982, 1986, 1988, 1990, 1993 5 * The Regents of the University of California. All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 4. Neither the name of the University nor the names of its contributors 16 * may be used to endorse or promote products derived from this software 17 * without specific prior written permission. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 
30 * 31 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94 32 */ 33 34#include <sys/cdefs.h> 35__FBSDID("$FreeBSD: head/sys/kern/uipc_socket.c 136373 2004-10-11 08:11:26Z rwatson $"); 36 37#include "opt_inet.h" 38#include "opt_mac.h" 39#include "opt_zero.h" 40 41#include <sys/param.h> 42#include <sys/systm.h> 43#include <sys/fcntl.h> 44#include <sys/limits.h> 45#include <sys/lock.h> 46#include <sys/mac.h> 47#include <sys/malloc.h> 48#include <sys/mbuf.h> 49#include <sys/mutex.h> 50#include <sys/domain.h> 51#include <sys/file.h> /* for struct knote */ 52#include <sys/kernel.h> 53#include <sys/event.h> 54#include <sys/poll.h> 55#include <sys/proc.h> 56#include <sys/protosw.h> 57#include <sys/socket.h> 58#include <sys/socketvar.h> 59#include <sys/resourcevar.h> 60#include <sys/signalvar.h> 61#include <sys/sysctl.h> 62#include <sys/uio.h> 63#include <sys/jail.h> 64 65#include <vm/uma.h> 66 67 68static int soreceive_rcvoob(struct socket *so, struct uio *uio, 69 int flags); 70 71#ifdef INET 72static int do_setopt_accept_filter(struct socket *so, struct sockopt *sopt); 73#endif 74 75static void filt_sordetach(struct knote *kn); 76static int filt_soread(struct knote *kn, long hint); 77static void filt_sowdetach(struct knote *kn); 78static int filt_sowrite(struct knote *kn, long hint); 79static int filt_solisten(struct knote *kn, long hint); 80 81static struct filterops solisten_filtops = 82 { 1, NULL, filt_sordetach, filt_solisten }; 83static struct filterops soread_filtops = 84 { 1, NULL, filt_sordetach, filt_soread }; 85static struct filterops sowrite_filtops = 86 { 1, NULL, filt_sowdetach, filt_sowrite }; 87 88uma_zone_t socket_zone; 89so_gen_t so_gencnt; /* generation count for sockets */ 90 91MALLOC_DEFINE(M_SONAME, "soname", "socket name"); 92MALLOC_DEFINE(M_PCB, "pcb", "protocol control block"); 93 94SYSCTL_DECL(_kern_ipc); 95 96static int somaxconn = SOMAXCONN; 97SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW, 98 &somaxconn, 0, "Maximum pending socket 
connection queue size"); 99static int numopensockets; 100SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD, 101 &numopensockets, 0, "Number of open sockets"); 102#ifdef ZERO_COPY_SOCKETS 103/* These aren't static because they're used in other files. */ 104int so_zero_copy_send = 1; 105int so_zero_copy_receive = 1; 106SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0, 107 "Zero copy controls"); 108SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW, 109 &so_zero_copy_receive, 0, "Enable zero copy receive"); 110SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW, 111 &so_zero_copy_send, 0, "Enable zero copy send"); 112#endif /* ZERO_COPY_SOCKETS */ 113 114/* 115 * accept_mtx locks down per-socket fields relating to accept queues. See 116 * socketvar.h for an annotation of the protected fields of struct socket. 117 */ 118struct mtx accept_mtx; 119MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF); 120 121/* 122 * so_global_mtx protects so_gencnt, numopensockets, and the per-socket 123 * so_gencnt field. 124 * 125 * XXXRW: These variables might be better manipulated using atomic operations 126 * for improved efficiency. 127 */ 128static struct mtx so_global_mtx; 129MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF); 130 131/* 132 * Socket operation routines. 133 * These routines are called by the routines in 134 * sys_socket.c or from a system process, and 135 * implement the semantics of socket operations by 136 * switching out to the protocol specific routines. 137 */ 138 139/* 140 * Get a socket structure from our zone, and initialize it. 141 * Note that it would probably be better to allocate socket 142 * and PCB at the same time, but I'm not convinced that all 143 * the protocols can be easily modified to do this. 144 * 145 * soalloc() returns a socket with a ref count of 0. 
 */
struct socket *
soalloc(int mflags)
{
	struct socket *so;
#ifdef MAC
	int error;
#endif

	so = uma_zalloc(socket_zone, mflags | M_ZERO);
	if (so != NULL) {
#ifdef MAC
		/* MAC label setup may fail; undo the zone allocation. */
		error = mac_init_socket(so, mflags);
		if (error != 0) {
			uma_zfree(socket_zone, so);
			so = NULL;
			return so;
		}
#endif
		SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
		SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
		/* sx_init(&so->so_sxlock, "socket sxlock"); */
		TAILQ_INIT(&so->so_aiojobq);
		mtx_lock(&so_global_mtx);
		so->so_gencnt = ++so_gencnt;
		++numopensockets;
		mtx_unlock(&so_global_mtx);
	}
	return so;
}

/*
 * socreate returns a socket with a ref count of 1.  The socket should be
 * closed with soclose().
 */
int
socreate(dom, aso, type, proto, cred, td)
	int dom;
	struct socket **aso;
	int type;
	int proto;
	struct ucred *cred;
	struct thread *td;
{
	struct protosw *prp;
	struct socket *so;
	int error;

	/* An explicit protocol number overrides lookup by type alone. */
	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);

	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL)
		return (EPROTONOSUPPORT);

	/*
	 * Jailed processes may optionally be restricted to UNIX, IPv4, and
	 * routing sockets only.
	 */
	if (jailed(cred) && jail_socket_unixiproute_only &&
	    prp->pr_domain->dom_family != PF_LOCAL &&
	    prp->pr_domain->dom_family != PF_INET &&
	    prp->pr_domain->dom_family != PF_ROUTE) {
		return (EPROTONOSUPPORT);
	}

	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = soalloc(M_WAITOK);
	if (so == NULL)
		return (ENOBUFS);

	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = type;
	so->so_cred = crhold(cred);
	so->so_proto = prp;
#ifdef MAC
	mac_create_socket(cred, so);
#endif
	SOCK_LOCK(so);
	knlist_init(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv));
	knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd));
	soref(so);
	SOCK_UNLOCK(so);
	error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
	if (error) {
		/*
		 * Attach failed: mark the socket as having no file
		 * descriptor reference and release our reference, which
		 * frees it.
		 */
		SOCK_LOCK(so);
		so->so_state |= SS_NOFDREF;
		sorele(so);
		return (error);
	}
	*aso = so;
	return (0);
}

/*
 * Bind a local address to the socket by handing the request down to the
 * protocol's pru_bind method.
 */
int
sobind(so, nam, td)
	struct socket *so;
	struct sockaddr *nam;
	struct thread *td;
{

	return ((*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td));
}

/*
 * Release all resources held by a socket and return it to the socket zone.
 * The caller must hold the last reference (so_count == 0).
 */
void
sodealloc(struct socket *so)
{

	KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
	mtx_lock(&so_global_mtx);
	so->so_gencnt = ++so_gencnt;
	mtx_unlock(&so_global_mtx);
	/* Return any socket buffer reservations to the owning uid. */
	if (so->so_rcv.sb_hiwat)
		(void)chgsbsize(so->so_cred->cr_uidinfo,
		    &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
	if (so->so_snd.sb_hiwat)
		(void)chgsbsize(so->so_cred->cr_uidinfo,
		    &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
#ifdef INET
	/* remove accept filter if one is present. */
	if (so->so_accf != NULL)
		do_setopt_accept_filter(so, NULL);
#endif
#ifdef MAC
	mac_destroy_socket(so);
#endif
	crfree(so->so_cred);
	SOCKBUF_LOCK_DESTROY(&so->so_snd);
	SOCKBUF_LOCK_DESTROY(&so->so_rcv);
	/* sx_destroy(&so->so_sxlock); */
	uma_zfree(socket_zone, so);
	/*
	 * XXXRW: Seems like a shame to grab the mutex again down here, but
	 * we don't want to decrement the socket count until after we free
	 * the socket, and we can't increment the gencnt on the socket after
	 * we free, it so...
	 */
	mtx_lock(&so_global_mtx);
	--numopensockets;
	mtx_unlock(&so_global_mtx);
}

/*
 * Mark the socket as willing to accept connections and set its backlog.
 */
int
solisten(so, backlog, td)
	struct socket *so;
	int backlog;
	struct thread *td;
{
	int error;

	/*
	 * XXXRW: Ordering issue here -- perhaps we need to set
	 * SO_ACCEPTCONN before the call to pru_listen()?
	 * XXXRW: General atomic test-and-set concerns here also.
	 */
	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
			    SS_ISDISCONNECTING))
		return (EINVAL);
	error = (*so->so_proto->pr_usrreqs->pru_listen)(so, td);
	if (error)
		return (error);
	ACCEPT_LOCK();
	if (TAILQ_EMPTY(&so->so_comp)) {
		SOCK_LOCK(so);
		so->so_options |= SO_ACCEPTCONN;
		SOCK_UNLOCK(so);
	}
	/* Negative or oversized backlogs are clamped to somaxconn. */
	if (backlog < 0 || backlog > somaxconn)
		backlog = somaxconn;
	so->so_qlimit = backlog;
	ACCEPT_UNLOCK();
	return (0);
}

/*
 * Attempt to free a socket.  This should really be sotryfree().
 *
 * We free the socket if the protocol is no longer interested in the socket,
 * there's no file descriptor reference, and the refcount is 0.  While the
 * calling macro sotryfree() tests the refcount, sofree() has to test it
 * again as it's possible to race with an accept()ing thread if the socket is
 * in an listen queue of a listen socket, as being in the listen queue
 * doesn't elevate the reference count.  sofree() acquires the accept mutex
 * early for this test in order to avoid that race.
 */
void
sofree(so)
	struct socket *so;
{
	struct socket *head;

	/*
	 * Drop and re-take the socket lock around ACCEPT_LOCK() to honor
	 * lock order, then re-test the free conditions under both locks.
	 */
	SOCK_UNLOCK(so);
	ACCEPT_LOCK();
	SOCK_LOCK(so);

	if (so->so_pcb != NULL || (so->so_state & SS_NOFDREF) == 0 ||
	    so->so_count != 0) {
		SOCK_UNLOCK(so);
		ACCEPT_UNLOCK();
		return;
	}

	head = so->so_head;
	if (head != NULL) {
		KASSERT((so->so_qstate & SQ_COMP) != 0 ||
		    (so->so_qstate & SQ_INCOMP) != 0,
		    ("sofree: so_head != NULL, but neither SQ_COMP nor "
		    "SQ_INCOMP"));
		KASSERT((so->so_qstate & SQ_COMP) == 0 ||
		    (so->so_qstate & SQ_INCOMP) == 0,
		    ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
		/*
		 * accept(2) is responsible for draining the completed
		 * connection queue and freeing those sockets, so
		 * we just return here if this socket is currently
		 * on the completed connection queue.  Otherwise,
		 * accept(2) may hang after select(2) has indicated
		 * that a listening socket was ready.  If it's an
		 * incomplete connection, we remove it from the queue
		 * and free it; otherwise, it won't be released until
		 * the listening socket is closed.
		 */
		if ((so->so_qstate & SQ_COMP) != 0) {
			SOCK_UNLOCK(so);
			ACCEPT_UNLOCK();
			return;
		}
		TAILQ_REMOVE(&head->so_incomp, so, so_list);
		head->so_incqlen--;
		so->so_qstate &= ~SQ_INCOMP;
		so->so_head = NULL;
	}
	KASSERT((so->so_qstate & SQ_COMP) == 0 &&
	    (so->so_qstate & SQ_INCOMP) == 0,
	    ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
	    so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
	SOCK_UNLOCK(so);
	ACCEPT_UNLOCK();
	SOCKBUF_LOCK(&so->so_snd);
	so->so_snd.sb_flags |= SB_NOINTR;
	(void)sblock(&so->so_snd, M_WAITOK);
	/*
	 * socantsendmore_locked() drops the socket buffer mutex so that it
	 * can safely perform wakeups.  Re-acquire the mutex before
	 * continuing.
	 */
	socantsendmore_locked(so);
	SOCKBUF_LOCK(&so->so_snd);
	sbunlock(&so->so_snd);
	sbrelease_locked(&so->so_snd, so);
	SOCKBUF_UNLOCK(&so->so_snd);
	sorflush(so);
	knlist_destroy(&so->so_rcv.sb_sel.si_note);
	knlist_destroy(&so->so_snd.sb_sel.si_note);
	sodealloc(so);
}

/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 *
 * This function will sorele() the socket.  Note that soclose() may be
 * called prior to the ref count reaching zero.  The actual socket
 * structure will not be freed until the ref count reaches zero.
 */
int
soclose(so)
	struct socket *so;
{
	int error = 0;

	KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));

	funsetown(&so->so_sigio);
	if (so->so_options & SO_ACCEPTCONN) {
		struct socket *sp;
		/*
		 * Abort every pending connection on both the incomplete and
		 * completed queues.  The accept lock is dropped around each
		 * soabort() call since soabort() must not be called with
		 * socket locks held.
		 */
		ACCEPT_LOCK();
		while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
			TAILQ_REMOVE(&so->so_incomp, sp, so_list);
			so->so_incqlen--;
			sp->so_qstate &= ~SQ_INCOMP;
			sp->so_head = NULL;
			ACCEPT_UNLOCK();
			(void) soabort(sp);
			ACCEPT_LOCK();
		}
		while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
			TAILQ_REMOVE(&so->so_comp, sp, so_list);
			so->so_qlen--;
			sp->so_qstate &= ~SQ_COMP;
			sp->so_head = NULL;
			ACCEPT_UNLOCK();
			(void) soabort(sp);
			ACCEPT_LOCK();
		}
		ACCEPT_UNLOCK();
	}
	if (so->so_pcb == NULL)
		goto discard;
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			/* Non-blocking socket already disconnecting: bail. */
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO))
				goto drop;
			/* Wait up to so_linger seconds for the disconnect. */
			while (so->so_state & SS_ISCONNECTED) {
				error = tsleep(&so->so_timeo,
				    PSOCK | PCATCH, "soclos", so->so_linger * hz);
				if (error)
					break;
			}
		}
	}
drop:
	if (so->so_pcb != NULL) {
		int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
		if (error == 0)
			error = error2;
	}
discard:
	SOCK_LOCK(so);
	KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
	so->so_state |= SS_NOFDREF;
	sorele(so);
	return (error);
}

/*
 * soabort() must not be called with any socket locks held, as it calls
 * into the protocol, which will call back into the socket code causing
 * it to acquire additional socket locks that may cause recursion or lock
 * order reversals.
482 */ 483int 484soabort(so) 485 struct socket *so; 486{ 487 int error; 488 489 error = (*so->so_proto->pr_usrreqs->pru_abort)(so); 490 if (error) { 491 SOCK_LOCK(so); 492 sotryfree(so); /* note: does not decrement the ref count */ 493 return error; 494 } 495 return (0); 496} 497 498int 499soaccept(so, nam) 500 struct socket *so; 501 struct sockaddr **nam; 502{ 503 int error; 504 505 SOCK_LOCK(so); 506 KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF")); 507 so->so_state &= ~SS_NOFDREF; 508 SOCK_UNLOCK(so); 509 error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam); 510 return (error); 511} 512 513int 514soconnect(so, nam, td) 515 struct socket *so; 516 struct sockaddr *nam; 517 struct thread *td; 518{ 519 int error; 520 521 if (so->so_options & SO_ACCEPTCONN) 522 return (EOPNOTSUPP); 523 /* 524 * If protocol is connection-based, can only connect once. 525 * Otherwise, if connected, try to disconnect first. 526 * This allows user to disconnect by connecting to, e.g., 527 * a null address. 528 */ 529 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) && 530 ((so->so_proto->pr_flags & PR_CONNREQUIRED) || 531 (error = sodisconnect(so)))) 532 error = EISCONN; 533 else 534 error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td); 535 return (error); 536} 537 538int 539soconnect2(so1, so2) 540 struct socket *so1; 541 struct socket *so2; 542{ 543 544 return ((*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2)); 545} 546 547int 548sodisconnect(so) 549 struct socket *so; 550{ 551 int error; 552 553 if ((so->so_state & SS_ISCONNECTED) == 0) 554 return (ENOTCONN); 555 if (so->so_state & SS_ISDISCONNECTING) 556 return (EALREADY); 557 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so); 558 return (error); 559} 560 561#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK) 562/* 563 * Send on a socket. 564 * If send must go all at once and message is larger than 565 * send buffering, then hard error. 566 * Lock against other senders. 
567 * If must go all at once and not enough room now, then 568 * inform user that this would block and do nothing. 569 * Otherwise, if nonblocking, send as much as possible. 570 * The data to be sent is described by "uio" if nonzero, 571 * otherwise by the mbuf chain "top" (which must be null 572 * if uio is not). Data provided in mbuf chain must be small 573 * enough to send all at once. 574 * 575 * Returns nonzero on error, timeout or signal; callers 576 * must check for short counts if EINTR/ERESTART are returned. 577 * Data and control buffers are freed on return. 578 */ 579 580#ifdef ZERO_COPY_SOCKETS 581struct so_zerocopy_stats{ 582 int size_ok; 583 int align_ok; 584 int found_ifp; 585}; 586struct so_zerocopy_stats so_zerocp_stats = {0,0,0}; 587#include <netinet/in.h> 588#include <net/route.h> 589#include <netinet/in_pcb.h> 590#include <vm/vm.h> 591#include <vm/vm_page.h> 592#include <vm/vm_object.h> 593#endif /*ZERO_COPY_SOCKETS*/ 594 595int 596sosend(so, addr, uio, top, control, flags, td) 597 struct socket *so; 598 struct sockaddr *addr; 599 struct uio *uio; 600 struct mbuf *top; 601 struct mbuf *control; 602 int flags; 603 struct thread *td; 604{ 605 struct mbuf **mp; 606 struct mbuf *m; 607 long space, len = 0, resid; 608 int clen = 0, error, dontroute; 609 int atomic = sosendallatonce(so) || top; 610#ifdef ZERO_COPY_SOCKETS 611 int cow_send; 612#endif /* ZERO_COPY_SOCKETS */ 613 614 if (uio != NULL) 615 resid = uio->uio_resid; 616 else 617 resid = top->m_pkthdr.len; 618 /* 619 * In theory resid should be unsigned. 620 * However, space must be signed, as it might be less than 0 621 * if we over-committed, and we must use a signed comparison 622 * of space and resid. On the other hand, a negative resid 623 * causes us to loop sending 0-length segments to the protocol. 624 * 625 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM 626 * type sockets since that's an error. 
627 */ 628 if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) { 629 error = EINVAL; 630 goto out; 631 } 632 633 dontroute = 634 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 && 635 (so->so_proto->pr_flags & PR_ATOMIC); 636 if (td != NULL) 637 td->td_proc->p_stats->p_ru.ru_msgsnd++; 638 if (control != NULL) 639 clen = control->m_len; 640#define snderr(errno) { error = (errno); goto release; } 641 642 SOCKBUF_LOCK(&so->so_snd); 643restart: 644 SOCKBUF_LOCK_ASSERT(&so->so_snd); 645 error = sblock(&so->so_snd, SBLOCKWAIT(flags)); 646 if (error) 647 goto out_locked; 648 do { 649 SOCKBUF_LOCK_ASSERT(&so->so_snd); 650 if (so->so_snd.sb_state & SBS_CANTSENDMORE) 651 snderr(EPIPE); 652 if (so->so_error) { 653 error = so->so_error; 654 so->so_error = 0; 655 goto release; 656 } 657 if ((so->so_state & SS_ISCONNECTED) == 0) { 658 /* 659 * `sendto' and `sendmsg' is allowed on a connection- 660 * based socket if it supports implied connect. 661 * Return ENOTCONN if not connected and no address is 662 * supplied. 663 */ 664 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && 665 (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { 666 if ((so->so_state & SS_ISCONFIRMING) == 0 && 667 !(resid == 0 && clen != 0)) 668 snderr(ENOTCONN); 669 } else if (addr == NULL) 670 snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ? 671 ENOTCONN : EDESTADDRREQ); 672 } 673 space = sbspace(&so->so_snd); 674 if (flags & MSG_OOB) 675 space += 1024; 676 if ((atomic && resid > so->so_snd.sb_hiwat) || 677 clen > so->so_snd.sb_hiwat) 678 snderr(EMSGSIZE); 679 if (space < resid + clen && 680 (atomic || space < so->so_snd.sb_lowat || space < clen)) { 681 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) 682 snderr(EWOULDBLOCK); 683 sbunlock(&so->so_snd); 684 error = sbwait(&so->so_snd); 685 if (error) 686 goto out_locked; 687 goto restart; 688 } 689 SOCKBUF_UNLOCK(&so->so_snd); 690 mp = ⊤ 691 space -= clen; 692 do { 693 if (uio == NULL) { 694 /* 695 * Data is prepackaged in "top". 
696 */ 697 resid = 0; 698 if (flags & MSG_EOR) 699 top->m_flags |= M_EOR; 700 } else do { 701#ifdef ZERO_COPY_SOCKETS 702 cow_send = 0; 703#endif /* ZERO_COPY_SOCKETS */ 704 if (resid >= MINCLSIZE) { 705#ifdef ZERO_COPY_SOCKETS 706 if (top == NULL) { 707 MGETHDR(m, M_TRYWAIT, MT_DATA); 708 if (m == NULL) { 709 error = ENOBUFS; 710 SOCKBUF_LOCK(&so->so_snd); 711 goto release; 712 } 713 m->m_pkthdr.len = 0; 714 m->m_pkthdr.rcvif = (struct ifnet *)0; 715 } else { 716 MGET(m, M_TRYWAIT, MT_DATA); 717 if (m == NULL) { 718 error = ENOBUFS; 719 SOCKBUF_LOCK(&so->so_snd); 720 goto release; 721 } 722 } 723 if (so_zero_copy_send && 724 resid>=PAGE_SIZE && 725 space>=PAGE_SIZE && 726 uio->uio_iov->iov_len>=PAGE_SIZE) { 727 so_zerocp_stats.size_ok++; 728 if (!((vm_offset_t) 729 uio->uio_iov->iov_base & PAGE_MASK)){ 730 so_zerocp_stats.align_ok++; 731 cow_send = socow_setup(m, uio); 732 } 733 } 734 if (!cow_send) { 735 MCLGET(m, M_TRYWAIT); 736 if ((m->m_flags & M_EXT) == 0) { 737 m_free(m); 738 m = NULL; 739 } else { 740 len = min(min(MCLBYTES, resid), space); 741 } 742 } else 743 len = PAGE_SIZE; 744#else /* ZERO_COPY_SOCKETS */ 745 if (top == NULL) { 746 m = m_getcl(M_TRYWAIT, MT_DATA, M_PKTHDR); 747 m->m_pkthdr.len = 0; 748 m->m_pkthdr.rcvif = (struct ifnet *)0; 749 } else 750 m = m_getcl(M_TRYWAIT, MT_DATA, 0); 751 len = min(min(MCLBYTES, resid), space); 752#endif /* ZERO_COPY_SOCKETS */ 753 } else { 754 if (top == NULL) { 755 m = m_gethdr(M_TRYWAIT, MT_DATA); 756 m->m_pkthdr.len = 0; 757 m->m_pkthdr.rcvif = (struct ifnet *)0; 758 759 len = min(min(MHLEN, resid), space); 760 /* 761 * For datagram protocols, leave room 762 * for protocol headers in first mbuf. 
763 */ 764 if (atomic && m && len < MHLEN) 765 MH_ALIGN(m, len); 766 } else { 767 m = m_get(M_TRYWAIT, MT_DATA); 768 len = min(min(MLEN, resid), space); 769 } 770 } 771 if (m == NULL) { 772 error = ENOBUFS; 773 SOCKBUF_LOCK(&so->so_snd); 774 goto release; 775 } 776 777 space -= len; 778#ifdef ZERO_COPY_SOCKETS 779 if (cow_send) 780 error = 0; 781 else 782#endif /* ZERO_COPY_SOCKETS */ 783 error = uiomove(mtod(m, void *), (int)len, uio); 784 resid = uio->uio_resid; 785 m->m_len = len; 786 *mp = m; 787 top->m_pkthdr.len += len; 788 if (error) { 789 SOCKBUF_LOCK(&so->so_snd); 790 goto release; 791 } 792 mp = &m->m_next; 793 if (resid <= 0) { 794 if (flags & MSG_EOR) 795 top->m_flags |= M_EOR; 796 break; 797 } 798 } while (space > 0 && atomic); 799 if (dontroute) { 800 SOCK_LOCK(so); 801 so->so_options |= SO_DONTROUTE; 802 SOCK_UNLOCK(so); 803 } 804 /* 805 * XXX all the SBS_CANTSENDMORE checks previously 806 * done could be out of date. We could have recieved 807 * a reset packet in an interrupt or maybe we slept 808 * while doing page faults in uiomove() etc. We could 809 * probably recheck again inside the locking protection 810 * here, but there are probably other places that this 811 * also happens. We must rethink this. 812 */ 813 error = (*so->so_proto->pr_usrreqs->pru_send)(so, 814 (flags & MSG_OOB) ? PRUS_OOB : 815 /* 816 * If the user set MSG_EOF, the protocol 817 * understands this flag and nothing left to 818 * send then use PRU_SEND_EOF instead of PRU_SEND. 819 */ 820 ((flags & MSG_EOF) && 821 (so->so_proto->pr_flags & PR_IMPLOPCL) && 822 (resid <= 0)) ? 823 PRUS_EOF : 824 /* If there is more to send set PRUS_MORETOCOME */ 825 (resid > 0 && space > 0) ? 
PRUS_MORETOCOME : 0, 826 top, addr, control, td); 827 if (dontroute) { 828 SOCK_LOCK(so); 829 so->so_options &= ~SO_DONTROUTE; 830 SOCK_UNLOCK(so); 831 } 832 clen = 0; 833 control = NULL; 834 top = NULL; 835 mp = ⊤ 836 if (error) { 837 SOCKBUF_LOCK(&so->so_snd); 838 goto release; 839 } 840 } while (resid && space > 0); 841 SOCKBUF_LOCK(&so->so_snd); 842 } while (resid); 843 844release: 845 SOCKBUF_LOCK_ASSERT(&so->so_snd); 846 sbunlock(&so->so_snd); 847out_locked: 848 SOCKBUF_LOCK_ASSERT(&so->so_snd); 849 SOCKBUF_UNLOCK(&so->so_snd); 850out: 851 if (top != NULL) 852 m_freem(top); 853 if (control != NULL) 854 m_freem(control); 855 return (error); 856} 857 858/* 859 * The part of soreceive() that implements reading non-inline out-of-band 860 * data from a socket. For more complete comments, see soreceive(), from 861 * which this code originated. 862 * 863 * XXXRW: Note that soreceive_rcvoob(), unlike the remainder of soreiceve(), 864 * is unable to return an mbuf chain to the caller. 865 */ 866static int 867soreceive_rcvoob(so, uio, flags) 868 struct socket *so; 869 struct uio *uio; 870 int flags; 871{ 872 struct protosw *pr = so->so_proto; 873 struct mbuf *m; 874 int error; 875 876 KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0")); 877 878 m = m_get(M_TRYWAIT, MT_DATA); 879 if (m == NULL) 880 return (ENOBUFS); 881 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK); 882 if (error) 883 goto bad; 884 do { 885#ifdef ZERO_COPY_SOCKETS 886 if (so_zero_copy_receive) { 887 vm_page_t pg; 888 int disposable; 889 890 if ((m->m_flags & M_EXT) 891 && (m->m_ext.ext_type == EXT_DISPOSABLE)) 892 disposable = 1; 893 else 894 disposable = 0; 895 896 pg = PHYS_TO_VM_PAGE(vtophys(mtod(m, caddr_t))); 897 if (uio->uio_offset == -1) 898 uio->uio_offset =IDX_TO_OFF(pg->pindex); 899 900 error = uiomoveco(mtod(m, void *), 901 min(uio->uio_resid, m->m_len), 902 uio, pg->object, 903 disposable); 904 } else 905#endif /* ZERO_COPY_SOCKETS */ 906 error = 
uiomove(mtod(m, void *), 907 (int) min(uio->uio_resid, m->m_len), uio); 908 m = m_free(m); 909 } while (uio->uio_resid && error == 0 && m); 910bad: 911 if (m != NULL) 912 m_freem(m); 913 return (error); 914} 915 916/* 917 * Following replacement or removal of the first mbuf on the first mbuf chain 918 * of a socket buffer, push necessary state changes back into the socket 919 * buffer so that other consumers see the values consistently. 'nextrecord' 920 * is the callers locally stored value of the original value of 921 * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes. 922 * NOTE: 'nextrecord' may be NULL. 923 */ 924static __inline void 925sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord) 926{ 927 928 SOCKBUF_LOCK_ASSERT(sb); 929 /* 930 * First, update for the new value of nextrecord. If necessary, make 931 * it the first record. 932 */ 933 if (sb->sb_mb != NULL) 934 sb->sb_mb->m_nextpkt = nextrecord; 935 else 936 sb->sb_mb = nextrecord; 937 938 /* 939 * Now update any dependent socket buffer fields to reflect the new 940 * state. This is an expanded inline of SB_EMPTY_FIXUP(), with the 941 * addition of a second clause that takes care of the case where 942 * sb_mb has been updated, but remains the last record. 943 */ 944 if (sb->sb_mb == NULL) { 945 sb->sb_mbtail = NULL; 946 sb->sb_lastrecord = NULL; 947 } else if (sb->sb_mb->m_nextpkt == NULL) 948 sb->sb_lastrecord = sb->sb_mb; 949} 950 951 952/* 953 * Implement receive operations on a socket. 954 * We depend on the way that records are added to the sockbuf 955 * by sbappend*. In particular, each record (mbufs linked through m_next) 956 * must begin with an address if the protocol so specifies, 957 * followed by an optional mbuf or mbufs containing ancillary data, 958 * and then zero or more mbufs of data. 959 * In order to avoid blocking network interrupts for the entire time here, 960 * we splx() while doing the actual copy to user space. 
961 * Although the sockbuf is locked, new data may still be appended, 962 * and thus we must maintain consistency of the sockbuf during that time. 963 * 964 * The caller may receive the data as a single mbuf chain by supplying 965 * an mbuf **mp0 for use in returning the chain. The uio is then used 966 * only for the count in uio_resid. 967 */ 968int 969soreceive(so, psa, uio, mp0, controlp, flagsp) 970 struct socket *so; 971 struct sockaddr **psa; 972 struct uio *uio; 973 struct mbuf **mp0; 974 struct mbuf **controlp; 975 int *flagsp; 976{ 977 struct mbuf *m, **mp; 978 int flags, len, error, offset; 979 struct protosw *pr = so->so_proto; 980 struct mbuf *nextrecord; 981 int moff, type = 0; 982 int orig_resid = uio->uio_resid; 983 984 mp = mp0; 985 if (psa != NULL) 986 *psa = NULL; 987 if (controlp != NULL) 988 *controlp = NULL; 989 if (flagsp != NULL) 990 flags = *flagsp &~ MSG_EOR; 991 else 992 flags = 0; 993 if (flags & MSG_OOB) 994 return (soreceive_rcvoob(so, uio, flags)); 995 if (mp != NULL) 996 *mp = NULL; 997 if (so->so_state & SS_ISCONFIRMING && uio->uio_resid) 998 (*pr->pr_usrreqs->pru_rcvd)(so, 0); 999 1000 SOCKBUF_LOCK(&so->so_rcv); 1001restart: 1002 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1003 error = sblock(&so->so_rcv, SBLOCKWAIT(flags)); 1004 if (error) 1005 goto out; 1006 1007 m = so->so_rcv.sb_mb; 1008 /* 1009 * If we have less data than requested, block awaiting more 1010 * (subject to any timeout) if: 1011 * 1. the current count is less than the low water mark, or 1012 * 2. MSG_WAITALL is set, and it is possible to do the entire 1013 * receive operation at once if we block (resid <= hiwat). 1014 * 3. MSG_DONTWAIT is not set 1015 * If MSG_WAITALL is set but resid is larger than the receive buffer, 1016 * we have to do the receive in sections, and thus risk returning 1017 * a short count if a timeout or signal occurs after we start. 
1018 */ 1019 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 && 1020 so->so_rcv.sb_cc < uio->uio_resid) && 1021 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat || 1022 ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) && 1023 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) { 1024 KASSERT(m != NULL || !so->so_rcv.sb_cc, 1025 ("receive: m == %p so->so_rcv.sb_cc == %u", 1026 m, so->so_rcv.sb_cc)); 1027 if (so->so_error) { 1028 if (m != NULL) 1029 goto dontblock; 1030 error = so->so_error; 1031 if ((flags & MSG_PEEK) == 0) 1032 so->so_error = 0; 1033 goto release; 1034 } 1035 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1036 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 1037 if (m) 1038 goto dontblock; 1039 else 1040 goto release; 1041 } 1042 for (; m != NULL; m = m->m_next) 1043 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { 1044 m = so->so_rcv.sb_mb; 1045 goto dontblock; 1046 } 1047 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 && 1048 (so->so_proto->pr_flags & PR_CONNREQUIRED)) { 1049 error = ENOTCONN; 1050 goto release; 1051 } 1052 if (uio->uio_resid == 0) 1053 goto release; 1054 if ((so->so_state & SS_NBIO) || 1055 (flags & (MSG_DONTWAIT|MSG_NBIO))) { 1056 error = EWOULDBLOCK; 1057 goto release; 1058 } 1059 SBLASTRECORDCHK(&so->so_rcv); 1060 SBLASTMBUFCHK(&so->so_rcv); 1061 sbunlock(&so->so_rcv); 1062 error = sbwait(&so->so_rcv); 1063 if (error) 1064 goto out; 1065 goto restart; 1066 } 1067dontblock: 1068 /* 1069 * From this point onward, we maintain 'nextrecord' as a cache of the 1070 * pointer to the next record in the socket buffer. We must keep the 1071 * various socket buffer pointers and local stack versions of the 1072 * pointers in sync, pushing out modifications before dropping the 1073 * socket buffer mutex, and re-reading them when picking it up. 
1074 * 1075 * Otherwise, we will race with the network stack appending new data 1076 * or records onto the socket buffer by using inconsistent/stale 1077 * versions of the field, possibly resulting in socket buffer 1078 * corruption. 1079 * 1080 * By holding the high-level sblock(), we prevent simultaneous 1081 * readers from pulling off the front of the socket buffer. 1082 */ 1083 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1084 if (uio->uio_td) 1085 uio->uio_td->td_proc->p_stats->p_ru.ru_msgrcv++; 1086 KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb")); 1087 SBLASTRECORDCHK(&so->so_rcv); 1088 SBLASTMBUFCHK(&so->so_rcv); 1089 nextrecord = m->m_nextpkt; 1090 if (pr->pr_flags & PR_ADDR) { 1091 KASSERT(m->m_type == MT_SONAME, 1092 ("m->m_type == %d", m->m_type)); 1093 orig_resid = 0; 1094 if (psa != NULL) 1095 *psa = sodupsockaddr(mtod(m, struct sockaddr *), 1096 M_NOWAIT); 1097 if (flags & MSG_PEEK) { 1098 m = m->m_next; 1099 } else { 1100 sbfree(&so->so_rcv, m); 1101 so->so_rcv.sb_mb = m_free(m); 1102 m = so->so_rcv.sb_mb; 1103 sockbuf_pushsync(&so->so_rcv, nextrecord); 1104 } 1105 } 1106 1107 /* 1108 * Process one or more MT_CONTROL mbufs present before any data mbufs 1109 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we 1110 * just copy the data; if !MSG_PEEK, we call into the protocol to 1111 * perform externalization (or freeing if controlp == NULL). 
1112 */ 1113 if (m != NULL && m->m_type == MT_CONTROL) { 1114 struct mbuf *cm = NULL, *cmn; 1115 struct mbuf **cme = &cm; 1116 1117 do { 1118 if (flags & MSG_PEEK) { 1119 if (controlp != NULL) { 1120 *controlp = m_copy(m, 0, m->m_len); 1121 controlp = &(*controlp)->m_next; 1122 } 1123 m = m->m_next; 1124 } else { 1125 sbfree(&so->so_rcv, m); 1126 so->so_rcv.sb_mb = m->m_next; 1127 m->m_next = NULL; 1128 *cme = m; 1129 cme = &(*cme)->m_next; 1130 m = so->so_rcv.sb_mb; 1131 } 1132 } while (m != NULL && m->m_type == MT_CONTROL); 1133 if ((flags & MSG_PEEK) == 0) 1134 sockbuf_pushsync(&so->so_rcv, nextrecord); 1135 while (cm != NULL) { 1136 cmn = cm->m_next; 1137 cm->m_next = NULL; 1138 if (pr->pr_domain->dom_externalize != NULL) { 1139 SOCKBUF_UNLOCK(&so->so_rcv); 1140 error = (*pr->pr_domain->dom_externalize) 1141 (cm, controlp); 1142 SOCKBUF_LOCK(&so->so_rcv); 1143 } else if (controlp != NULL) 1144 *controlp = cm; 1145 else 1146 m_freem(cm); 1147 if (controlp != NULL) { 1148 orig_resid = 0; 1149 while (*controlp != NULL) 1150 controlp = &(*controlp)->m_next; 1151 } 1152 cm = cmn; 1153 } 1154 nextrecord = so->so_rcv.sb_mb->m_nextpkt; 1155 orig_resid = 0; 1156 } 1157 if (m != NULL) { 1158 if ((flags & MSG_PEEK) == 0) { 1159 KASSERT(m->m_nextpkt == nextrecord, 1160 ("soreceive: post-control, nextrecord !sync")); 1161 if (nextrecord == NULL) { 1162 KASSERT(so->so_rcv.sb_mb == m, 1163 ("soreceive: post-control, sb_mb!=m")); 1164 KASSERT(so->so_rcv.sb_lastrecord == m, 1165 ("soreceive: post-control, lastrecord!=m")); 1166 } 1167 } 1168 type = m->m_type; 1169 if (type == MT_OOBDATA) 1170 flags |= MSG_OOB; 1171 } else { 1172 if ((flags & MSG_PEEK) == 0) { 1173 KASSERT(so->so_rcv.sb_mb == nextrecord, 1174 ("soreceive: sb_mb != nextrecord")); 1175 if (so->so_rcv.sb_mb == NULL) { 1176 KASSERT(so->so_rcv.sb_lastrecord == NULL, 1177 ("soreceive: sb_lastercord != NULL")); 1178 } 1179 } 1180 } 1181 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1182 SBLASTRECORDCHK(&so->so_rcv); 1183 
SBLASTMBUFCHK(&so->so_rcv); 1184 1185 /* 1186 * Now continue to read any data mbufs off of the head of the socket 1187 * buffer until the read request is satisfied. Note that 'type' is 1188 * used to store the type of any mbuf reads that have happened so far 1189 * such that soreceive() can stop reading if the type changes, which 1190 * causes soreceive() to return only one of regular data and inline 1191 * out-of-band data in a single socket receive operation. 1192 */ 1193 moff = 0; 1194 offset = 0; 1195 while (m != NULL && uio->uio_resid > 0 && error == 0) { 1196 /* 1197 * If the type of mbuf has changed since the last mbuf 1198 * examined ('type'), end the receive operation. 1199 */ 1200 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1201 if (m->m_type == MT_OOBDATA) { 1202 if (type != MT_OOBDATA) 1203 break; 1204 } else if (type == MT_OOBDATA) 1205 break; 1206 else 1207 KASSERT(m->m_type == MT_DATA || m->m_type == MT_HEADER, 1208 ("m->m_type == %d", m->m_type)); 1209 so->so_rcv.sb_state &= ~SBS_RCVATMARK; 1210 len = uio->uio_resid; 1211 if (so->so_oobmark && len > so->so_oobmark - offset) 1212 len = so->so_oobmark - offset; 1213 if (len > m->m_len - moff) 1214 len = m->m_len - moff; 1215 /* 1216 * If mp is set, just pass back the mbufs. 1217 * Otherwise copy them out via the uio, then free. 1218 * Sockbuf must be consistent here (points to current mbuf, 1219 * it points to next record) when we drop priority; 1220 * we must note any additions to the sockbuf when we 1221 * block interrupts again. 
1222 */ 1223 if (mp == NULL) { 1224 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1225 SBLASTRECORDCHK(&so->so_rcv); 1226 SBLASTMBUFCHK(&so->so_rcv); 1227 SOCKBUF_UNLOCK(&so->so_rcv); 1228#ifdef ZERO_COPY_SOCKETS 1229 if (so_zero_copy_receive) { 1230 vm_page_t pg; 1231 int disposable; 1232 1233 if ((m->m_flags & M_EXT) 1234 && (m->m_ext.ext_type == EXT_DISPOSABLE)) 1235 disposable = 1; 1236 else 1237 disposable = 0; 1238 1239 pg = PHYS_TO_VM_PAGE(vtophys(mtod(m, caddr_t) + 1240 moff)); 1241 1242 if (uio->uio_offset == -1) 1243 uio->uio_offset =IDX_TO_OFF(pg->pindex); 1244 1245 error = uiomoveco(mtod(m, char *) + moff, 1246 (int)len, uio,pg->object, 1247 disposable); 1248 } else 1249#endif /* ZERO_COPY_SOCKETS */ 1250 error = uiomove(mtod(m, char *) + moff, (int)len, uio); 1251 SOCKBUF_LOCK(&so->so_rcv); 1252 if (error) 1253 goto release; 1254 } else 1255 uio->uio_resid -= len; 1256 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1257 if (len == m->m_len - moff) { 1258 if (m->m_flags & M_EOR) 1259 flags |= MSG_EOR; 1260 if (flags & MSG_PEEK) { 1261 m = m->m_next; 1262 moff = 0; 1263 } else { 1264 nextrecord = m->m_nextpkt; 1265 sbfree(&so->so_rcv, m); 1266 if (mp != NULL) { 1267 *mp = m; 1268 mp = &m->m_next; 1269 so->so_rcv.sb_mb = m = m->m_next; 1270 *mp = NULL; 1271 } else { 1272 so->so_rcv.sb_mb = m_free(m); 1273 m = so->so_rcv.sb_mb; 1274 } 1275 if (m != NULL) { 1276 m->m_nextpkt = nextrecord; 1277 if (nextrecord == NULL) 1278 so->so_rcv.sb_lastrecord = m; 1279 } else { 1280 so->so_rcv.sb_mb = nextrecord; 1281 SB_EMPTY_FIXUP(&so->so_rcv); 1282 } 1283 SBLASTRECORDCHK(&so->so_rcv); 1284 SBLASTMBUFCHK(&so->so_rcv); 1285 } 1286 } else { 1287 if (flags & MSG_PEEK) 1288 moff += len; 1289 else { 1290 if (mp != NULL) { 1291 SOCKBUF_UNLOCK(&so->so_rcv); 1292 *mp = m_copym(m, 0, len, M_TRYWAIT); 1293 SOCKBUF_LOCK(&so->so_rcv); 1294 } 1295 m->m_data += len; 1296 m->m_len -= len; 1297 so->so_rcv.sb_cc -= len; 1298 } 1299 } 1300 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1301 if (so->so_oobmark) { 1302 if 
((flags & MSG_PEEK) == 0) { 1303 so->so_oobmark -= len; 1304 if (so->so_oobmark == 0) { 1305 so->so_rcv.sb_state |= SBS_RCVATMARK; 1306 break; 1307 } 1308 } else { 1309 offset += len; 1310 if (offset == so->so_oobmark) 1311 break; 1312 } 1313 } 1314 if (flags & MSG_EOR) 1315 break; 1316 /* 1317 * If the MSG_WAITALL flag is set (for non-atomic socket), 1318 * we must not quit until "uio->uio_resid == 0" or an error 1319 * termination. If a signal/timeout occurs, return 1320 * with a short count but without error. 1321 * Keep sockbuf locked against other readers. 1322 */ 1323 while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 && 1324 !sosendallatonce(so) && nextrecord == NULL) { 1325 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1326 if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE) 1327 break; 1328 /* 1329 * Notify the protocol that some data has been 1330 * drained before blocking. 1331 */ 1332 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb != NULL) { 1333 SOCKBUF_UNLOCK(&so->so_rcv); 1334 (*pr->pr_usrreqs->pru_rcvd)(so, flags); 1335 SOCKBUF_LOCK(&so->so_rcv); 1336 } 1337 SBLASTRECORDCHK(&so->so_rcv); 1338 SBLASTMBUFCHK(&so->so_rcv); 1339 error = sbwait(&so->so_rcv); 1340 if (error) 1341 goto release; 1342 m = so->so_rcv.sb_mb; 1343 if (m != NULL) 1344 nextrecord = m->m_nextpkt; 1345 } 1346 } 1347 1348 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1349 if (m != NULL && pr->pr_flags & PR_ATOMIC) { 1350 flags |= MSG_TRUNC; 1351 if ((flags & MSG_PEEK) == 0) 1352 (void) sbdroprecord_locked(&so->so_rcv); 1353 } 1354 if ((flags & MSG_PEEK) == 0) { 1355 if (m == NULL) { 1356 /* 1357 * First part is an inline SB_EMPTY_FIXUP(). Second 1358 * part makes sure sb_lastrecord is up-to-date if 1359 * there is still data in the socket buffer. 
			 */
			so->so_rcv.sb_mb = nextrecord;
			if (so->so_rcv.sb_mb == NULL) {
				so->so_rcv.sb_mbtail = NULL;
				so->so_rcv.sb_lastrecord = NULL;
			} else if (nextrecord->m_nextpkt == NULL)
				so->so_rcv.sb_lastrecord = nextrecord;
		}
		SBLASTRECORDCHK(&so->so_rcv);
		SBLASTMBUFCHK(&so->so_rcv);
		/*
		 * Give the protocol a chance to react to the drained data
		 * (e.g. update flow control state).  The call may block, so
		 * the socket buffer mutex is dropped around it.
		 */
		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
			SOCKBUF_UNLOCK(&so->so_rcv);
			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
			SOCKBUF_LOCK(&so->so_rcv);
		}
	}
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	/*
	 * If nothing was consumed and no termination condition applies,
	 * start the whole receive over rather than return a zero count.
	 */
	if (orig_resid == uio->uio_resid && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
		sbunlock(&so->so_rcv);
		goto restart;
	}

	if (flagsp != NULL)
		*flagsp |= flags;
release:
	/* 'release' drops the sleepable sblock(); mutex is still held. */
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	sbunlock(&so->so_rcv);
out:
	/* 'out' releases the socket buffer mutex and returns. */
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	SOCKBUF_UNLOCK(&so->so_rcv);
	return (error);
}

/*
 * soshutdown() -- disable further receives and/or sends on a socket.
 *
 * SHUT_RD and SHUT_RDWR flush the receive side via sorflush(); SHUT_WR and
 * SHUT_RDWR additionally notify the protocol through pru_shutdown().
 * Returns EINVAL for an unrecognized 'how'.
 */
int
soshutdown(so, how)
	struct socket *so;
	int how;
{
	struct protosw *pr = so->so_proto;

	if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
		return (EINVAL);

	if (how != SHUT_WR)
		sorflush(so);
	if (how != SHUT_RD)
		return ((*pr->pr_usrreqs->pru_shutdown)(so));
	return (0);
}

/*
 * sorflush() -- flush and release all data buffered in a socket's receive
 * buffer, disposing of any in-flight rights (file descriptors) via the
 * domain's dom_dispose() hook.
 */
void
sorflush(so)
	struct socket *so;
{
	struct sockbuf *sb = &so->so_rcv;
	struct protosw *pr = so->so_proto;
	struct sockbuf asb;

	/*
	 * XXXRW: This is quite ugly.  The existing code made a copy of the
	 * socket buffer, then zero'd the original to clear the buffer
	 * fields.  However, with mutexes in the socket buffer, this causes
	 * problems.  We only clear the zeroable bits of the original;
	 * however, we have to initialize and destroy the mutex in the copy
	 * so that dom_dispose() and sbrelease() can lock it as needed.
	 */
	SOCKBUF_LOCK(sb);
	/* SB_NOINTR keeps the sblock() below from being interrupted. */
	sb->sb_flags |= SB_NOINTR;
	(void) sblock(sb, M_WAITOK);
	/*
	 * socantrcvmore_locked() drops the socket buffer mutex so that it
	 * can safely perform wakeups.  Re-acquire the mutex before
	 * continuing.
	 */
	socantrcvmore_locked(so);
	SOCKBUF_LOCK(sb);
	sbunlock(sb);
	/*
	 * Invalidate/clear most of the sockbuf structure, but leave
	 * selinfo and mutex data unchanged.  'asb' receives the old
	 * contents (including the mbuf chain) for disposal below.
	 */
	bzero(&asb, offsetof(struct sockbuf, sb_startzero));
	bcopy(&sb->sb_startzero, &asb.sb_startzero,
	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
	bzero(&sb->sb_startzero,
	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
	SOCKBUF_UNLOCK(sb);

	SOCKBUF_LOCK_INIT(&asb, "so_rcv");
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
	sbrelease(&asb, so);
	SOCKBUF_LOCK_DESTROY(&asb);
}

#ifdef INET
/*
 * do_setopt_accept_filter() -- install (sopt != NULL) or remove
 * (sopt == NULL) an accept filter on a listening socket.  Only valid on
 * sockets with SO_ACCEPTCONN set.
 */
static int
do_setopt_accept_filter(so, sopt)
	struct socket *so;
	struct sockopt *sopt;
{
	struct accept_filter_arg *afap;
	struct accept_filter *afp;
	struct so_accf *newaf;
	int error = 0;

	newaf = NULL;
	afap = NULL;

	/*
	 * XXXRW: Configuring accept filters should be an atomic test-and-set
	 * operation to prevent races during setup and attach.  There may be
	 * more general issues of racing and ordering here that are not yet
	 * addressed by locking.
	 */
	/* do not set/remove accept filters on non listen sockets */
	SOCK_LOCK(so);
	if ((so->so_options & SO_ACCEPTCONN) == 0) {
		SOCK_UNLOCK(so);
		return (EINVAL);
	}

	/* removing the filter: destroy it, free its strings, clear state */
	if (sopt == NULL) {
		if (so->so_accf != NULL) {
			struct so_accf *af = so->so_accf;
			if (af->so_accept_filter != NULL &&
				af->so_accept_filter->accf_destroy != NULL) {
				af->so_accept_filter->accf_destroy(so);
			}
			if (af->so_accept_filter_str != NULL) {
				FREE(af->so_accept_filter_str, M_ACCF);
			}
			FREE(af, M_ACCF);
			so->so_accf = NULL;
		}
		so->so_options &= ~SO_ACCEPTFILTER;
		SOCK_UNLOCK(so);
		return (0);
	}
	SOCK_UNLOCK(so);

	/*-
	 * Adding a filter.
	 *
	 * Do memory allocation, copyin, and filter lookup now while we're
	 * not holding any locks.  Avoids sleeping with a mutex, as well as
	 * introducing a lock order between accept filter locks and socket
	 * locks here.
	 */
	MALLOC(afap, struct accept_filter_arg *, sizeof(*afap), M_TEMP,
	    M_WAITOK);
	/* don't put large objects on the kernel stack */
	error = sooptcopyin(sopt, afap, sizeof *afap, sizeof *afap);
	/* force NUL termination on both user-supplied strings */
	afap->af_name[sizeof(afap->af_name)-1] = '\0';
	afap->af_arg[sizeof(afap->af_arg)-1] = '\0';
	if (error) {
		FREE(afap, M_TEMP);
		return (error);
	}
	afp = accept_filt_get(afap->af_name);
	if (afp == NULL) {
		FREE(afap, M_TEMP);
		return (ENOENT);
	}

	/*
	 * Allocate the new accept filter instance storage.  We may have to
	 * free it again later if we fail to attach it.  If attached
	 * properly, 'newaf' is NULLed to avoid a free() while in use.
	 */
	MALLOC(newaf, struct so_accf *, sizeof(*newaf), M_ACCF, M_WAITOK |
	    M_ZERO);
	if (afp->accf_create != NULL && afap->af_name[0] != '\0') {
		int len = strlen(afap->af_name) + 1;
		MALLOC(newaf->so_accept_filter_str, char *, len, M_ACCF,
		    M_WAITOK);
		strcpy(newaf->so_accept_filter_str, afap->af_name);
	}

	SOCK_LOCK(so);
	/* must remove previous filter first */
	if (so->so_accf != NULL) {
		error = EINVAL;
		goto out;
	}
	/*
	 * Invoke the accf_create() method of the filter if required.
	 * XXXRW: the socket mutex is held over this call, so the create
	 * method cannot block.  This may be something we have to change, but
	 * it would require addressing possible races.
	 */
	if (afp->accf_create != NULL) {
		newaf->so_accept_filter_arg =
		    afp->accf_create(so, afap->af_arg);
		if (newaf->so_accept_filter_arg == NULL) {
			error = EINVAL;
			goto out;
		}
	}
	newaf->so_accept_filter = afp;
	so->so_accf = newaf;
	so->so_options |= SO_ACCEPTFILTER;
	newaf = NULL;	/* attached: must not be freed below */
out:
	SOCK_UNLOCK(so);
	/* on failure, release the unattached instance and its name copy */
	if (newaf != NULL) {
		if (newaf->so_accept_filter_str != NULL)
			FREE(newaf->so_accept_filter_str, M_ACCF);
		FREE(newaf, M_ACCF);
	}
	if (afap != NULL)
		FREE(afap, M_TEMP);
	return (error);
}
#endif /* INET */

/*
 * Perhaps this routine, and sooptcopyout(), below, ought to come in
 * an additional variant to handle the case where the option value needs
 * to be some kind of integer, but not a specific size.
 * In addition to their use here, these functions are also called by the
 * protocol-level pr_ctloutput() routines.
 */
/*
 * sooptcopyin() -- copy a set-option value from 'sopt' into the kernel
 * buffer 'buf' of capacity 'len'.  Fails with EINVAL if the caller supplied
 * fewer than 'minlen' bytes; extra bytes beyond 'len' are ignored.  On
 * success sopt->sopt_valsize reflects the amount actually retrieved.
 * Copies from user space via copyin() when sopt_td is set, otherwise the
 * value already lives in kernel space and bcopy() suffices.
 */
int
sooptcopyin(sopt, buf, len, minlen)
	struct sockopt *sopt;
	void *buf;
	size_t len;
	size_t minlen;
{
	size_t valsize;

	/*
	 * If the user gives us more than we wanted, we ignore it,
	 * but if we don't get the minimum length the caller
	 * wants, we return EINVAL.  On success, sopt->sopt_valsize
	 * is set to however much we actually retrieved.
	 */
	if ((valsize = sopt->sopt_valsize) < minlen)
		return EINVAL;
	if (valsize > len)
		sopt->sopt_valsize = valsize = len;

	if (sopt->sopt_td != NULL)
		return (copyin(sopt->sopt_val, buf, valsize));

	bcopy(sopt->sopt_val, buf, valsize);
	return 0;
}

/*
 * Kernel version of setsockopt(2).
 * XXX: optlen is size_t, not socklen_t
 */
int
so_setsockopt(struct socket *so, int level, int optname, void *optval,
    size_t optlen)
{
	struct sockopt sopt;

	sopt.sopt_level = level;
	sopt.sopt_name = optname;
	sopt.sopt_dir = SOPT_SET;
	sopt.sopt_val = optval;
	sopt.sopt_valsize = optlen;
	sopt.sopt_td = NULL;	/* kernel caller: no copyin needed */
	return (sosetopt(so, &sopt));
}

/*
 * sosetopt() -- set a socket option.  Non-SOL_SOCKET levels are handed
 * directly to the protocol's pr_ctloutput().  SOL_SOCKET options are
 * processed here; after successful processing the protocol is also
 * notified (return value deliberately ignored) so it can track changes.
 */
int
sosetopt(so, sopt)
	struct socket *so;
	struct sockopt *sopt;
{
	int error, optval;
	struct linger l;
	struct timeval tv;
	u_long val;
#ifdef MAC
	struct mac extmac;
#endif

	error = 0;
	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput)
			return ((*so->so_proto->pr_ctloutput)
				  (so, sopt));
		error = ENOPROTOOPT;
	} else {
		switch (sopt->sopt_name) {
#ifdef INET
		case SO_ACCEPTFILTER:
			error = do_setopt_accept_filter(so, sopt);
			if (error)
				goto bad;
			break;
#endif
		case SO_LINGER:
			error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
			if (error)
				goto bad;

			SOCK_LOCK(so);
			so->so_linger = l.l_linger;
			if (l.l_onoff)
				so->so_options |= SO_LINGER;
			else
				so->so_options &= ~SO_LINGER;
			SOCK_UNLOCK(so);
			break;

		/* boolean options mapped 1:1 onto so_options bits */
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_DONTROUTE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_BINTIME:
		case SO_NOSIGPIPE:
			error = sooptcopyin(sopt, &optval, sizeof optval,
					    sizeof optval);
			if (error)
				goto bad;
			SOCK_LOCK(so);
			if (optval)
				so->so_options |= sopt->sopt_name;
			else
				so->so_options &= ~sopt->sopt_name;
			SOCK_UNLOCK(so);
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
			error = sooptcopyin(sopt, &optval, sizeof optval,
					    sizeof optval);
			if (error)
				goto bad;

			/*
			 * Values < 1 make no sense for any of these
			 * options, so disallow them.
			 */
			if (optval < 1) {
				error = EINVAL;
				goto bad;
			}

			switch (sopt->sopt_name) {
			case SO_SNDBUF:
			case SO_RCVBUF:
				if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
				    &so->so_snd : &so->so_rcv, (u_long)optval,
				    so, curthread) == 0) {
					error = ENOBUFS;
					goto bad;
				}
				break;

			/*
			 * Make sure the low-water is never greater than
			 * the high-water.
			 */
			case SO_SNDLOWAT:
				SOCKBUF_LOCK(&so->so_snd);
				so->so_snd.sb_lowat =
				    (optval > so->so_snd.sb_hiwat) ?
				    so->so_snd.sb_hiwat : optval;
				SOCKBUF_UNLOCK(&so->so_snd);
				break;
			case SO_RCVLOWAT:
				SOCKBUF_LOCK(&so->so_rcv);
				so->so_rcv.sb_lowat =
				    (optval > so->so_rcv.sb_hiwat) ?
				    so->so_rcv.sb_hiwat : optval;
				SOCKBUF_UNLOCK(&so->so_rcv);
				break;
			}
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			error = sooptcopyin(sopt, &tv, sizeof tv,
					    sizeof tv);
			if (error)
				goto bad;

			/* reject out-of-range timevals before converting */
			/* assert(hz > 0); */
			if (tv.tv_sec < 0 || tv.tv_sec > SHRT_MAX / hz ||
			    tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
				error = EDOM;
				goto bad;
			}
			/* assert(tick > 0); */
			/* assert(ULONG_MAX - SHRT_MAX >= 1000000); */
			val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick;
			if (val > SHRT_MAX) {
				error = EDOM;
				goto bad;
			}
			/* round sub-tick timeouts up to one tick */
			if (val == 0 && tv.tv_usec != 0)
				val = 1;

			switch (sopt->sopt_name) {
			case SO_SNDTIMEO:
				so->so_snd.sb_timeo = val;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.sb_timeo = val;
				break;
			}
			break;
		case SO_LABEL:
#ifdef MAC
			error = sooptcopyin(sopt, &extmac, sizeof extmac,
			    sizeof extmac);
			if (error)
				goto bad;
			error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
			    so, &extmac);
#else
			error = EOPNOTSUPP;
#endif
			break;
		default:
			error = ENOPROTOOPT;
			break;
		}
		/* let the protocol observe the new setting; errors ignored */
		if (error == 0 && so->so_proto != NULL &&
		    so->so_proto->pr_ctloutput != NULL) {
			(void) ((*so->so_proto->pr_ctloutput)
				  (so, sopt));
		}
	}
bad:
	return (error);
}

/* Helper routine for getsockopt */
int
sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
{
	int error;
	size_t valsize;

	error = 0;

	/*
	 * Documented get behavior is that we always return a value,
	 * possibly truncated to fit in the user's buffer.
	 * Traditional behavior is that we always tell the user
	 * precisely how much we copied, rather than something useful
	 * like the total amount we had available for her.
	 * Note that this interface is not idempotent; the entire answer must
	 * be generated ahead of time.
	 */
	valsize = min(len, sopt->sopt_valsize);
	sopt->sopt_valsize = valsize;
	if (sopt->sopt_val != NULL) {
		if (sopt->sopt_td != NULL)
			error = copyout(buf, sopt->sopt_val, valsize);
		else
			bcopy(buf, sopt->sopt_val, valsize);
	}
	return error;
}

/*
 * sogetopt() -- retrieve a socket option.  Non-SOL_SOCKET levels are
 * delegated to the protocol's pr_ctloutput(); SOL_SOCKET options are
 * answered here.  Most integer options share the 'integer:' copyout path.
 */
int
sogetopt(so, sopt)
	struct socket *so;
	struct sockopt *sopt;
{
	int error, optval;
	struct linger l;
	struct timeval tv;
#ifdef INET
	struct accept_filter_arg *afap;
#endif
#ifdef MAC
	struct mac extmac;
#endif

	error = 0;
	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput) {
			return ((*so->so_proto->pr_ctloutput)
				  (so, sopt));
		} else
			return (ENOPROTOOPT);
	} else {
		switch (sopt->sopt_name) {
#ifdef INET
		case SO_ACCEPTFILTER:
			/* Unlocked read. */
			if ((so->so_options & SO_ACCEPTCONN) == 0)
				return (EINVAL);
			MALLOC(afap, struct accept_filter_arg *, sizeof(*afap),
			    M_TEMP, M_WAITOK | M_ZERO);
			SOCK_LOCK(so);
			if ((so->so_options & SO_ACCEPTFILTER) != 0) {
				strcpy(afap->af_name, so->so_accf->so_accept_filter->accf_name);
				if (so->so_accf->so_accept_filter_str != NULL)
					strcpy(afap->af_arg, so->so_accf->so_accept_filter_str);
			}
			SOCK_UNLOCK(so);
			error = sooptcopyout(sopt, afap, sizeof(*afap));
			FREE(afap, M_TEMP);
			break;
#endif

		case SO_LINGER:
			/*
			 * XXXRW: We grab the lock here to get a consistent
			 * snapshot of both fields.  This may not really
			 * be necessary.
			 */
			SOCK_LOCK(so);
			l.l_onoff = so->so_options & SO_LINGER;
			l.l_linger = so->so_linger;
			SOCK_UNLOCK(so);
			error = sooptcopyout(sopt, &l, sizeof l);
			break;

		/* boolean options: report the raw so_options bit */
		case SO_USELOOPBACK:
		case SO_DONTROUTE:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_BINTIME:
		case SO_NOSIGPIPE:
			optval = so->so_options & sopt->sopt_name;
integer:
			error = sooptcopyout(sopt, &optval, sizeof optval);
			break;

		case SO_TYPE:
			optval = so->so_type;
			goto integer;

		case SO_ERROR:
			/* reading SO_ERROR clears the pending error */
			optval = so->so_error;
			so->so_error = 0;
			goto integer;

		case SO_SNDBUF:
			optval = so->so_snd.sb_hiwat;
			goto integer;

		case SO_RCVBUF:
			optval = so->so_rcv.sb_hiwat;
			goto integer;

		case SO_SNDLOWAT:
			optval = so->so_snd.sb_lowat;
			goto integer;

		case SO_RCVLOWAT:
			optval = so->so_rcv.sb_lowat;
			goto integer;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			optval = (sopt->sopt_name == SO_SNDTIMEO ?
				  so->so_snd.sb_timeo : so->so_rcv.sb_timeo);

			/* convert ticks back into a struct timeval */
			tv.tv_sec = optval / hz;
			tv.tv_usec = (optval % hz) * tick;
			error = sooptcopyout(sopt, &tv, sizeof tv);
			break;
		case SO_LABEL:
#ifdef MAC
			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
			    sizeof(extmac));
			if (error)
				return (error);
			error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
			    so, &extmac);
			if (error)
				return (error);
			error = sooptcopyout(sopt, &extmac, sizeof extmac);
#else
			error = EOPNOTSUPP;
#endif
			break;
		case SO_PEERLABEL:
#ifdef MAC
			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
			    sizeof(extmac));
			if (error)
				return (error);
			error = mac_getsockopt_peerlabel(
			    sopt->sopt_td->td_ucred, so, &extmac);
			if (error)
				return (error);
			error = sooptcopyout(sopt, &extmac, sizeof extmac);
#else
			error = EOPNOTSUPP;
#endif
			break;
		default:
			error = ENOPROTOOPT;
			break;
		}
		return (error);
	}
}

/* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */
/*
 * soopt_getm() -- allocate an mbuf chain large enough to hold
 * sopt->sopt_valsize bytes, using clusters for segments larger than MLEN.
 * On failure any partial chain is freed and ENOBUFS returned.
 */
int
soopt_getm(struct sockopt *sopt, struct mbuf **mp)
{
	struct mbuf *m, *m_prev;
	/*
	 * NOTE(review): sopt_valsize is assigned into an int here; a value
	 * exceeding INT_MAX would be truncated/negative -- confirm callers
	 * always bound sopt_valsize, or widen this to size_t.
	 */
	int sopt_size = sopt->sopt_valsize;

	/* sleep for memory only when there is a thread context to sleep in */
	MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
	if (m == NULL)
		return ENOBUFS;
	if (sopt_size > MLEN) {
		MCLGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return ENOBUFS;
		}
		m->m_len = min(MCLBYTES, sopt_size);
	} else {
		m->m_len = min(MLEN, sopt_size);
	}
	sopt_size -= m->m_len;
	*mp = m;
	m_prev = m;

	/* keep appending mbufs until the whole value fits */
	while (sopt_size) {
		MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
		if (m == NULL) {
			m_freem(*mp);
			return ENOBUFS;
		}
		if (sopt_size > MLEN) {
			MCLGET(m, sopt->sopt_td != NULL ? M_TRYWAIT :
			    M_DONTWAIT);
			if ((m->m_flags & M_EXT) == 0) {
				m_freem(m);
				m_freem(*mp);
				return ENOBUFS;
			}
			m->m_len = min(MCLBYTES, sopt_size);
		} else {
			m->m_len = min(MLEN, sopt_size);
		}
		sopt_size -= m->m_len;
		m_prev->m_next = m;
		m_prev = m;
	}
	return 0;
}

/* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
/*
 * soopt_mcopyin() -- fill the mbuf chain 'm' (sized by soopt_getm()) with
 * the option value from 'sopt', advancing sopt_val/sopt_valsize as it goes.
 * The chain is freed on copyin failure.
 */
int
soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
{
	struct mbuf *m0 = m;

	if (sopt->sopt_val == NULL)
		return 0;
	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
		if (sopt->sopt_td != NULL) {
			int error;

			error = copyin(sopt->sopt_val, mtod(m, char *),
				       m->m_len);
			if (error != 0) {
				m_freem(m0);
				return(error);
			}
		} else
			bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
		sopt->sopt_valsize -= m->m_len;
		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
		m = m->m_next;
	}
	/* chain should have been sized large enough by ip6_sooptmcopyin() */
	if (m != NULL)
		panic("ip6_sooptmcopyin");
	return 0;
}

/* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines.
 */
/*
 * soopt_mcopyout() -- copy the contents of mbuf chain 'm' out to the
 * buffer described by 'sopt', setting sopt_valsize to the total copied.
 * Returns EINVAL (freeing the chain) if the user buffer is too small.
 */
int
soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
{
	struct mbuf *m0 = m;
	size_t valsize = 0;

	if (sopt->sopt_val == NULL)
		return 0;
	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
		if (sopt->sopt_td != NULL) {
			int error;

			error = copyout(mtod(m, char *), sopt->sopt_val,
				       m->m_len);
			if (error != 0) {
				m_freem(m0);
				return(error);
			}
		} else
			bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
		sopt->sopt_valsize -= m->m_len;
		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
		valsize += m->m_len;
		m = m->m_next;
	}
	if (m != NULL) {
		/* enough soopt buffer should be given from user-land */
		m_freem(m0);
		return(EINVAL);
	}
	sopt->sopt_valsize = valsize;
	return 0;
}

/*
 * sohasoutofband() -- notify interested parties that out-of-band data has
 * arrived: deliver SIGURG to the owning pgrp/process and wake selectors
 * on the receive buffer.
 */
void
sohasoutofband(so)
	struct socket *so;
{
	if (so->so_sigio != NULL)
		pgsigio(&so->so_sigio, SIGURG, 0);
	selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
}

/*
 * sopoll() -- poll(2)/select(2) backend for sockets.  Both socket buffer
 * mutexes are held (so_snd taken before so_rcv) so readable/writable state
 * is evaluated consistently; if no event is ready, the thread is recorded
 * on the relevant selinfo for a later wakeup.
 */
int
sopoll(struct socket *so, int events, struct ucred *active_cred,
    struct thread *td)
{
	int revents = 0;

	SOCKBUF_LOCK(&so->so_snd);
	SOCKBUF_LOCK(&so->so_rcv);
	if (events & (POLLIN | POLLRDNORM))
		if (soreadable(so))
			revents |= events & (POLLIN | POLLRDNORM);

	if (events & POLLINIGNEOF)
		if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat ||
		    !TAILQ_EMPTY(&so->so_comp) || so->so_error)
			revents |= POLLINIGNEOF;

	if (events & (POLLOUT | POLLWRNORM))
		if (sowriteable(so))
			revents |= events & (POLLOUT | POLLWRNORM);

	if (events & (POLLPRI | POLLRDBAND))
		if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK))
			revents |= events & (POLLPRI | POLLRDBAND);

	if (revents == 0) {
		if (events &
		    (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM |
		     POLLRDBAND)) {
			selrecord(td, &so->so_rcv.sb_sel);
			so->so_rcv.sb_flags |= SB_SEL;
		}

		if (events & (POLLOUT | POLLWRNORM)) {
			selrecord(td, &so->so_snd.sb_sel);
			so->so_snd.sb_flags |= SB_SEL;
		}
	}

	SOCKBUF_UNLOCK(&so->so_rcv);
	SOCKBUF_UNLOCK(&so->so_snd);
	return (revents);
}

/*
 * soo_kqfilter() -- attach a knote to a socket.  EVFILT_READ on a
 * listening socket uses the listen filter; otherwise the read/write
 * filters attach to the corresponding socket buffer's knote list.
 */
int
soo_kqfilter(struct file *fp, struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	struct sockbuf *sb;

	switch (kn->kn_filter) {
	case EVFILT_READ:
		if (so->so_options & SO_ACCEPTCONN)
			kn->kn_fop = &solisten_filtops;
		else
			kn->kn_fop = &soread_filtops;
		sb = &so->so_rcv;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &sowrite_filtops;
		sb = &so->so_snd;
		break;
	default:
		return (EINVAL);
	}

	SOCKBUF_LOCK(sb);
	knlist_add(&sb->sb_sel.si_note, kn, 1);
	sb->sb_flags |= SB_KNOTE;
	SOCKBUF_UNLOCK(sb);
	return (0);
}

/*
 * filt_sordetach() -- detach a read knote; clear SB_KNOTE when the last
 * knote is removed from the receive buffer.
 */
static void
filt_sordetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;

	SOCKBUF_LOCK(&so->so_rcv);
	knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1);
	if (knlist_empty(&so->so_rcv.sb_sel.si_note))
		so->so_rcv.sb_flags &= ~SB_KNOTE;
	SOCKBUF_UNLOCK(&so->so_rcv);
}

/*ARGSUSED*/
/*
 * filt_soread() -- read filter: kn_data is the readable byte count
 * (excluding control data); fires on EOF, pending error, or data at/above
 * the low-water mark (or NOTE_LOWAT threshold).  Caller holds the receive
 * buffer mutex.
 */
static int
filt_soread(struct knote *kn, long hint)
{
	struct socket *so;

	so = kn->kn_fp->f_data;
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);

	kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
	if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		return (1);
	} else if (so->so_error)	/* temporary udp error */
		return (1);
	else if (kn->kn_sfflags & NOTE_LOWAT)
		return (kn->kn_data >= kn->kn_sdata);
	else
		return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat);
}

/*
 * filt_sowdetach() -- detach a write knote; clear SB_KNOTE when the last
 * knote is removed from the send buffer.
 */
static void
filt_sowdetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;

	SOCKBUF_LOCK(&so->so_snd);
	knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1);
	if (knlist_empty(&so->so_snd.sb_sel.si_note))
		so->so_snd.sb_flags &= ~SB_KNOTE;
	SOCKBUF_UNLOCK(&so->so_snd);
}

/*ARGSUSED*/
/*
 * filt_sowrite() -- write filter: kn_data is the free space in the send
 * buffer; fires on EOF or pending error, is suppressed while a
 * connection-required socket is unconnected, and otherwise compares
 * available space against NOTE_LOWAT or the buffer's low-water mark.
 * Caller holds the send buffer mutex.
 */
static int
filt_sowrite(struct knote *kn, long hint)
{
	struct socket *so;

	so = kn->kn_fp->f_data;
	SOCKBUF_LOCK_ASSERT(&so->so_snd);
	kn->kn_data = sbspace(&so->so_snd);
	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		return (1);
	} else if (so->so_error)	/* temporary udp error */
		return (1);
	else if (((so->so_state & SS_ISCONNECTED) == 0) &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
		return (0);
	else if (kn->kn_sfflags & NOTE_LOWAT)
		return (kn->kn_data >= kn->kn_sdata);
	else
		return (kn->kn_data >= so->so_snd.sb_lowat);
}

/*ARGSUSED*/
/*
 * filt_solisten() -- listen filter: kn_data is the listen queue length;
 * fires when a completed connection is ready to be accepted.
 */
static int
filt_solisten(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;

	kn->kn_data = so->so_qlen;
	return (! TAILQ_EMPTY(&so->so_comp));
}

/*
 * socheckuid() -- return 0 if the socket's credential uid matches 'uid',
 * otherwise EPERM (including for a NULL socket).
 */
int
socheckuid(struct socket *so, uid_t uid)
{

	if (so == NULL)
		return (EPERM);
	if (so->so_cred->cr_uid == uid)
		return (0);
	return (EPERM);
}