uipc_socket.c revision 134240
133965Sjdp/* 233965Sjdp * Copyright (c) 2004 The FreeBSD Foundation 333965Sjdp * Copyright (c) 2004 Robert Watson 433965Sjdp * Copyright (c) 1982, 1986, 1988, 1990, 1993 533965Sjdp * The Regents of the University of California. All rights reserved. 633965Sjdp * 733965Sjdp * Redistribution and use in source and binary forms, with or without 833965Sjdp * modification, are permitted provided that the following conditions 933965Sjdp * are met: 1033965Sjdp * 1. Redistributions of source code must retain the above copyright 1133965Sjdp * notice, this list of conditions and the following disclaimer. 1233965Sjdp * 2. Redistributions in binary form must reproduce the above copyright 1333965Sjdp * notice, this list of conditions and the following disclaimer in the 1433965Sjdp * documentation and/or other materials provided with the distribution. 1533965Sjdp * 4. Neither the name of the University nor the names of its contributors 1633965Sjdp * may be used to endorse or promote products derived from this software 1733965Sjdp * without specific prior written permission. 1833965Sjdp * 1933965Sjdp * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 2033965Sjdp * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 2133965Sjdp * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 2233965Sjdp * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 2333965Sjdp * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 2433965Sjdp * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 2533965Sjdp * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 2633965Sjdp * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 2733965Sjdp * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 2833965Sjdp * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 2933965Sjdp * SUCH DAMAGE. 
 *
 * @(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/uipc_socket.c 134240 2004-08-24 05:28:18Z rwatson $");

#include "opt_inet.h"
#include "opt_mac.h"
#include "opt_zero.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mac.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/domain.h>
#include <sys/file.h>			/* for struct knote */
#include <sys/kernel.h>
#include <sys/event.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <sys/jail.h>

#include <vm/uma.h>


static int	soreceive_rcvoob(struct socket *so, struct uio *uio,
		    int flags);

#ifdef INET
static int	do_setopt_accept_filter(struct socket *so, struct sockopt *sopt);
#endif

/* kqueue filter hooks for socket read/write/listen events. */
static void	filt_sordetach(struct knote *kn);
static int	filt_soread(struct knote *kn, long hint);
static void	filt_sowdetach(struct knote *kn);
static int	filt_sowrite(struct knote *kn, long hint);
static int	filt_solisten(struct knote *kn, long hint);

static struct filterops solisten_filtops =
	{ 1, NULL, filt_sordetach, filt_solisten };
static struct filterops soread_filtops =
	{ 1, NULL, filt_sordetach, filt_soread };
static struct filterops sowrite_filtops =
	{ 1, NULL, filt_sowdetach, filt_sowrite };

uma_zone_t socket_zone;			/* UMA zone all sockets come from */
so_gen_t so_gencnt;			/* generation count for sockets */

MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

SYSCTL_DECL(_kern_ipc);

static int somaxconn = SOMAXCONN;
SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW,
    &somaxconn, 0, "Maximum pending socket connection queue size");
static int numopensockets;
SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
    &numopensockets, 0, "Number of open sockets");
#ifdef ZERO_COPY_SOCKETS
/* These aren't static because they're used in other files. */
int so_zero_copy_send = 1;
int so_zero_copy_receive = 1;
SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0,
    "Zero copy controls");
SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW,
    &so_zero_copy_receive, 0, "Enable zero copy receive");
SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW,
    &so_zero_copy_send, 0, "Enable zero copy send");
#endif /* ZERO_COPY_SOCKETS */

/*
 * accept_mtx locks down per-socket fields relating to accept queues.  See
 * socketvar.h for an annotation of the protected fields of struct socket.
 */
struct mtx accept_mtx;
MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);

/*
 * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
 * so_gencnt field.
 *
 * XXXRW: These variables might be better manipulated using atomic operations
 * for improved efficiency.
 */
static struct mtx so_global_mtx;
MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF);

/*
 * Socket operation routines.
 * These routines are called by the routines in
 * sys_socket.c or from a system process, and
 * implement the semantics of socket operations by
 * switching out to the protocol specific routines.
 */

/*
 * Get a socket structure from our zone, and initialize it.
 * Note that it would probably be better to allocate socket
 * and PCB at the same time, but I'm not convinced that all
 * the protocols can be easily modified to do this.
 *
 * soalloc() returns a socket with a ref count of 0.
 */
struct socket *
soalloc(int mflags)
{
	struct socket *so;
#ifdef MAC
	int error;
#endif

	/* M_ZERO gives us a fully zeroed socket to start from. */
	so = uma_zalloc(socket_zone, mflags | M_ZERO);
	if (so != NULL) {
#ifdef MAC
		/* MAC label allocation may fail; unwind the zone alloc. */
		error = mac_init_socket(so, mflags);
		if (error != 0) {
			uma_zfree(socket_zone, so);
			so = NULL;
			return so;
		}
#endif
		SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
		SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
		/* sx_init(&so->so_sxlock, "socket sxlock"); */
		TAILQ_INIT(&so->so_aiojobq);
		/* so_global_mtx guards so_gencnt and numopensockets. */
		mtx_lock(&so_global_mtx);
		so->so_gencnt = ++so_gencnt;
		++numopensockets;
		mtx_unlock(&so_global_mtx);
	}
	return so;
}

/*
 * socreate returns a socket with a ref count of 1.  The socket should be
 * closed with soclose().
 */
int
socreate(dom, aso, type, proto, cred, td)
	int dom;
	struct socket **aso;
	int type;
	int proto;
	struct ucred *cred;
	struct thread *td;
{
	struct protosw *prp;
	struct socket *so;
	int error;

	/* An explicit protocol overrides lookup by (domain, type). */
	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);

	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL)
		return (EPROTONOSUPPORT);

	/*
	 * Jailed processes may optionally be restricted to UNIX, IPv4,
	 * and routing sockets only.
	 */
	if (jailed(cred) && jail_socket_unixiproute_only &&
	    prp->pr_domain->dom_family != PF_LOCAL &&
	    prp->pr_domain->dom_family != PF_INET &&
	    prp->pr_domain->dom_family != PF_ROUTE) {
		return (EPROTONOSUPPORT);
	}

	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = soalloc(M_WAITOK);
	if (so == NULL)
		return (ENOBUFS);

	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = type;
	so->so_cred = crhold(cred);
	so->so_proto = prp;
#ifdef MAC
	mac_create_socket(cred, so);
#endif
	SOCK_LOCK(so);
	knlist_init(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv));
	knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd));
	soref(so);
	SOCK_UNLOCK(so);
	error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
	if (error) {
		/*
		 * Attach failed: mark the socket as having no file
		 * descriptor reference and drop our reference, which
		 * frees it via sofree().
		 */
		SOCK_LOCK(so);
		so->so_state |= SS_NOFDREF;
		sorele(so);
		return (error);
	}
	*aso = so;
	return (0);
}

/*
 * Bind a local address to a socket; simply delegates to the protocol's
 * pru_bind method.
 */
int
sobind(so, nam, td)
	struct socket *so;
	struct sockaddr *nam;
	struct thread *td;
{

	return ((*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td));
}

/*
 * Release the storage associated with a socket.  The caller must hold the
 * last reference (so_count == 0, asserted below).
 */
void
sodealloc(struct socket *so)
{

	KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
	mtx_lock(&so_global_mtx);
	so->so_gencnt = ++so_gencnt;
	mtx_unlock(&so_global_mtx);
	/* Return any buffer space charged against the owner's rlimits. */
	if (so->so_rcv.sb_hiwat)
		(void)chgsbsize(so->so_cred->cr_uidinfo,
		    &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
	if (so->so_snd.sb_hiwat)
		(void)chgsbsize(so->so_cred->cr_uidinfo,
		    &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
#ifdef INET
	/* remove accept filter if one is present. */
	if (so->so_accf != NULL)
		do_setopt_accept_filter(so, NULL);
#endif
#ifdef MAC
	mac_destroy_socket(so);
#endif
	crfree(so->so_cred);
	SOCKBUF_LOCK_DESTROY(&so->so_snd);
	SOCKBUF_LOCK_DESTROY(&so->so_rcv);
	/* sx_destroy(&so->so_sxlock); */
	uma_zfree(socket_zone, so);
	/*
	 * XXXRW: Seems like a shame to grab the mutex again down here, but
	 * we don't want to decrement the socket count until after we free
	 * the socket, and we can't increment the gencnt on the socket after
	 * we free, it so...
	 */
	mtx_lock(&so_global_mtx);
	--numopensockets;
	mtx_unlock(&so_global_mtx);
}

/*
 * Mark the socket as willing to accept connections (SO_ACCEPTCONN) and
 * set its listen backlog, after giving the protocol a chance to veto via
 * pru_listen().
 */
int
solisten(so, backlog, td)
	struct socket *so;
	int backlog;
	struct thread *td;
{
	int error;

	/*
	 * XXXRW: Ordering issue here -- perhaps we need to set
	 * SO_ACCEPTCONN before the call to pru_listen()?
	 * XXXRW: General atomic test-and-set concerns here also.
	 */
	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
			    SS_ISDISCONNECTING))
		return (EINVAL);
	error = (*so->so_proto->pr_usrreqs->pru_listen)(so, td);
	if (error)
		return (error);
	ACCEPT_LOCK();
	if (TAILQ_EMPTY(&so->so_comp)) {
		SOCK_LOCK(so);
		so->so_options |= SO_ACCEPTCONN;
		SOCK_UNLOCK(so);
	}
	/* Clamp out-of-range backlogs to the system-wide maximum. */
	if (backlog < 0 || backlog > somaxconn)
		backlog = somaxconn;
	so->so_qlimit = backlog;
	ACCEPT_UNLOCK();
	return (0);
}

/*
 * Tear down and free a socket whose reference count has reached zero.
 * Called with the socket lock held; the lock is always released before
 * return.  Bails out early (leaving the socket intact) if a protocol
 * control block or a file descriptor reference still exists.
 */
void
sofree(so)
	struct socket *so;
{
	struct socket *head;

	KASSERT(so->so_count == 0, ("socket %p so_count not 0", so));
	SOCK_LOCK_ASSERT(so);

	if (so->so_pcb != NULL || (so->so_state & SS_NOFDREF) == 0) {
		SOCK_UNLOCK(so);
		return;
	}

	SOCK_UNLOCK(so);
	ACCEPT_LOCK();
	head = so->so_head;
	if (head != NULL) {
		KASSERT((so->so_qstate & SQ_COMP) != 0 ||
		    (so->so_qstate & SQ_INCOMP) != 0,
		    ("sofree: so_head != NULL, but neither SQ_COMP nor "
		    "SQ_INCOMP"));
		KASSERT((so->so_qstate & SQ_COMP) == 0 ||
		    (so->so_qstate & SQ_INCOMP) == 0,
		    ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
		/*
		 * accept(2) is responsible for draining the completed
		 * connection queue and freeing those sockets, so
		 * we just return here if this socket is currently
		 * on the completed connection queue.  Otherwise,
		 * accept(2) may hang after select(2) has indicated
		 * that a listening socket was ready.  If it's an
		 * incomplete connection, we remove it from the queue
		 * and free it; otherwise, it won't be released until
		 * the listening socket is closed.
		 */
		if ((so->so_qstate & SQ_COMP) != 0) {
			ACCEPT_UNLOCK();
			return;
		}
		TAILQ_REMOVE(&head->so_incomp, so, so_list);
		head->so_incqlen--;
		so->so_qstate &= ~SQ_INCOMP;
		so->so_head = NULL;
	}
	KASSERT((so->so_qstate & SQ_COMP) == 0 &&
	    (so->so_qstate & SQ_INCOMP) == 0,
	    ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
	    so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
	ACCEPT_UNLOCK();
	/*
	 * Lock the send buffer against other senders (SB_NOINTR makes the
	 * sblock uninterruptible) before shutting down the send side and
	 * releasing its buffers.
	 */
	SOCKBUF_LOCK(&so->so_snd);
	so->so_snd.sb_flags |= SB_NOINTR;
	(void)sblock(&so->so_snd, M_WAITOK);
	/*
	 * socantsendmore_locked() drops the socket buffer mutex so that it
	 * can safely perform wakeups.  Re-acquire the mutex before
	 * continuing.
	 */
	socantsendmore_locked(so);
	SOCKBUF_LOCK(&so->so_snd);
	sbunlock(&so->so_snd);
	sbrelease_locked(&so->so_snd, so);
	SOCKBUF_UNLOCK(&so->so_snd);
	sorflush(so);
	knlist_destroy(&so->so_rcv.sb_sel.si_note);
	knlist_destroy(&so->so_snd.sb_sel.si_note);
	sodealloc(so);
}

/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 *
 * This function will sorele() the socket.  Note that soclose() may be
 * called prior to the ref count reaching zero.  The actual socket
 * structure will not be freed until the ref count reaches zero.
 */
int
soclose(so)
	struct socket *so;
{
	int error = 0;

	KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));

	funsetown(&so->so_sigio);
	if (so->so_options & SO_ACCEPTCONN) {
		struct socket *sp;
		/*
		 * Abort every pending connection on both the incomplete
		 * and completed queues.  The accept lock is dropped around
		 * each soabort() call, so re-fetch the queue head each
		 * iteration.
		 */
		ACCEPT_LOCK();
		while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
			TAILQ_REMOVE(&so->so_incomp, sp, so_list);
			so->so_incqlen--;
			sp->so_qstate &= ~SQ_INCOMP;
			sp->so_head = NULL;
			ACCEPT_UNLOCK();
			(void) soabort(sp);
			ACCEPT_LOCK();
		}
		while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
			TAILQ_REMOVE(&so->so_comp, sp, so_list);
			so->so_qlen--;
			sp->so_qstate &= ~SQ_COMP;
			sp->so_head = NULL;
			ACCEPT_UNLOCK();
			(void) soabort(sp);
			ACCEPT_LOCK();
		}
		ACCEPT_UNLOCK();
	}
	if (so->so_pcb == NULL)
		goto discard;
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			/* Non-blocking socket: don't wait for the linger. */
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO))
				goto drop;
			/* Sleep until disconnected, signal, or timeout. */
			while (so->so_state & SS_ISCONNECTED) {
				error = tsleep(&so->so_timeo,
				    PSOCK | PCATCH, "soclos", so->so_linger * hz);
				if (error)
					break;
			}
		}
	}
drop:
	if (so->so_pcb != NULL) {
		int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
		/* Preserve the first error seen. */
		if (error == 0)
			error = error2;
	}
discard:
	SOCK_LOCK(so);
	KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
	so->so_state |= SS_NOFDREF;
	sorele(so);
	return (error);
}

/*
 * soabort() must not be called with any socket locks held, as it calls
 * into the protocol, which will call back into the socket code causing
 * it to acquire additional socket locks that may cause recursion or lock
 * order reversals.
 */
int
soabort(so)
	struct socket *so;
{
	int error;

	error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
	if (error) {
		SOCK_LOCK(so);
		sotryfree(so);	/* note: does not decrement the ref count */
		return error;
	}
	return (0);
}

/*
 * Hand a newly accepted connection to its caller: clear SS_NOFDREF (the
 * socket now has a file descriptor reference) and let the protocol fill
 * in the peer's address.
 */
int
soaccept(so, nam)
	struct socket *so;
	struct sockaddr **nam;
{
	int error;

	SOCK_LOCK(so);
	KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
	so->so_state &= ~SS_NOFDREF;
	SOCK_UNLOCK(so);
	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
	return (error);
}

/*
 * Initiate a connection to the given address via the protocol's
 * pru_connect method.  Not permitted on listening sockets.
 */
int
soconnect(so, nam, td)
	struct socket *so;
	struct sockaddr *nam;
	struct thread *td;
{
	int error;

	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnect(so))))
		error = EISCONN;
	else
		error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td);
	return (error);
}

/*
 * Connect a pair of sockets to each other (e.g. socketpair(2));
 * delegates to the protocol's pru_connect2 method.
 */
int
soconnect2(so1, so2)
	struct socket *so1;
	struct socket *so2;
{

	return ((*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2));
}

/*
 * Initiate a disconnect on a connected socket; fails if not connected
 * (ENOTCONN) or if a disconnect is already in progress (EALREADY).
 */
int
sodisconnect(so)
	struct socket *so;
{
	int error;

	if ((so->so_state & SS_ISCONNECTED) == 0)
		return (ENOTCONN);
	if (so->so_state & SS_ISDISCONNECTING)
		return (EALREADY);
	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
	return (error);
}

#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
/*
 * Send on a socket.
 * If send must go all at once and message is larger than
 * send buffering, then hard error.
 * Lock against other senders.
 * If must go all at once and not enough room now, then
 * inform user that this would block and do nothing.
 * Otherwise, if nonblocking, send as much as possible.
 * The data to be sent is described by "uio" if nonzero,
 * otherwise by the mbuf chain "top" (which must be null
 * if uio is not).  Data provided in mbuf chain must be small
 * enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers
 * must check for short counts if EINTR/ERESTART are returned.
 * Data and control buffers are freed on return.
 */

#ifdef ZERO_COPY_SOCKETS
/* Counters tracking how often zero-copy send preconditions are met. */
struct so_zerocopy_stats{
	int size_ok;
	int align_ok;
	int found_ifp;
};
struct so_zerocopy_stats so_zerocp_stats = {0,0,0};
#include <netinet/in.h>
#include <net/route.h>
#include <netinet/in_pcb.h>
#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>
#endif /*ZERO_COPY_SOCKETS*/

int
sosend(so, addr, uio, top, control, flags, td)
	struct socket *so;
	struct sockaddr *addr;
	struct uio *uio;
	struct mbuf *top;
	struct mbuf *control;
	int flags;
	struct thread *td;
{
	struct mbuf **mp;
	struct mbuf *m;
	long space, len = 0, resid;
	int clen = 0, error, dontroute;
	int atomic = sosendallatonce(so) || top;
#ifdef ZERO_COPY_SOCKETS
	int cow_send;
#endif /* ZERO_COPY_SOCKETS */

	if (uio != NULL)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned.
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid.  On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
	 *
	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
	 * type sockets since that's an error.
	 */
	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
		error = EINVAL;
		goto out;
	}

	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	if (td != NULL)
		td->td_proc->p_stats->p_ru.ru_msgsnd++;
	if (control != NULL)
		clen = control->m_len;
#define	snderr(errno)	{ error = (errno); goto release; }

	SOCKBUF_LOCK(&so->so_snd);
restart:
	SOCKBUF_LOCK_ASSERT(&so->so_snd);
	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out_locked;
	do {
		SOCKBUF_LOCK_ASSERT(&so->so_snd);
		if (so->so_snd.sb_state & SBS_CANTSENDMORE)
			snderr(EPIPE);
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			/*
			 * `sendto' and `sendmsg' are allowed on a connection-
			 * based socket if it supports implied connect.
			 * Return ENOTCONN if not connected and no address is
			 * supplied.
			 */
			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
				    !(resid == 0 && clen != 0))
					snderr(ENOTCONN);
			} else if (addr == NULL)
				snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
				    ENOTCONN : EDESTADDRREQ);
		}
		space = sbspace(&so->so_snd);
		/* OOB data is allowed to slightly overrun the buffer. */
		if (flags & MSG_OOB)
			space += 1024;
		if ((atomic && resid > so->so_snd.sb_hiwat) ||
		    clen > so->so_snd.sb_hiwat)
			snderr(EMSGSIZE);
		if (space < resid + clen &&
		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
			if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO))
				snderr(EWOULDBLOCK);
			/* Wait for buffer space, then re-check everything. */
			sbunlock(&so->so_snd);
			error = sbwait(&so->so_snd);
			if (error)
				goto out_locked;
			goto restart;
		}
		SOCKBUF_UNLOCK(&so->so_snd);
		mp = &top;
		space -= clen;
		do {
		    if (uio == NULL) {
			/*
			 * Data is prepackaged in "top".
			 */
			resid = 0;
			if (flags & MSG_EOR)
				top->m_flags |= M_EOR;
		    } else do {
			/* Copy user data into an mbuf chain rooted at top. */
#ifdef ZERO_COPY_SOCKETS
			cow_send = 0;
#endif /* ZERO_COPY_SOCKETS */
			if (resid >= MINCLSIZE) {
#ifdef ZERO_COPY_SOCKETS
				if (top == NULL) {
					MGETHDR(m, M_TRYWAIT, MT_DATA);
					if (m == NULL) {
						error = ENOBUFS;
						SOCKBUF_LOCK(&so->so_snd);
						goto release;
					}
					m->m_pkthdr.len = 0;
					m->m_pkthdr.rcvif = (struct ifnet *)0;
				} else {
					MGET(m, M_TRYWAIT, MT_DATA);
					if (m == NULL) {
						error = ENOBUFS;
						SOCKBUF_LOCK(&so->so_snd);
						goto release;
					}
				}
				/*
				 * Zero-copy is attempted only for page-sized,
				 * page-aligned user buffers.
				 */
				if (so_zero_copy_send &&
				    resid>=PAGE_SIZE &&
				    space>=PAGE_SIZE &&
				    uio->uio_iov->iov_len>=PAGE_SIZE) {
					so_zerocp_stats.size_ok++;
					if (!((vm_offset_t)
					  uio->uio_iov->iov_base & PAGE_MASK)){
						so_zerocp_stats.align_ok++;
						cow_send = socow_setup(m, uio);
					}
				}
				if (!cow_send) {
					MCLGET(m, M_TRYWAIT);
					if ((m->m_flags & M_EXT) == 0) {
						m_free(m);
						m = NULL;
					} else {
						len = min(min(MCLBYTES, resid), space);
					}
				} else
					len = PAGE_SIZE;
#else /* ZERO_COPY_SOCKETS */
				if (top == NULL) {
					m = m_getcl(M_TRYWAIT, MT_DATA, M_PKTHDR);
					m->m_pkthdr.len = 0;
					m->m_pkthdr.rcvif = (struct ifnet *)0;
				} else
					m = m_getcl(M_TRYWAIT, MT_DATA, 0);
				len = min(min(MCLBYTES, resid), space);
#endif /* ZERO_COPY_SOCKETS */
			} else {
				if (top == NULL) {
					m = m_gethdr(M_TRYWAIT, MT_DATA);
					m->m_pkthdr.len = 0;
					m->m_pkthdr.rcvif = (struct ifnet *)0;

					len = min(min(MHLEN, resid), space);
					/*
					 * For datagram protocols, leave room
					 * for protocol headers in first mbuf.
					 */
					if (atomic && m && len < MHLEN)
						MH_ALIGN(m, len);
				} else {
					m = m_get(M_TRYWAIT, MT_DATA);
					len = min(min(MLEN, resid), space);
				}
			}
			if (m == NULL) {
				error = ENOBUFS;
				SOCKBUF_LOCK(&so->so_snd);
				goto release;
			}

			space -= len;
#ifdef ZERO_COPY_SOCKETS
			if (cow_send)
				error = 0;
			else
#endif /* ZERO_COPY_SOCKETS */
			error = uiomove(mtod(m, void *), (int)len, uio);
			resid = uio->uio_resid;
			m->m_len = len;
			*mp = m;
			top->m_pkthdr.len += len;
			if (error) {
				SOCKBUF_LOCK(&so->so_snd);
				goto release;
			}
			mp = &m->m_next;
			if (resid <= 0) {
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
				break;
			}
		    } while (space > 0 && atomic);
		    if (dontroute) {
			    SOCK_LOCK(so);
			    so->so_options |= SO_DONTROUTE;
			    SOCK_UNLOCK(so);
		    }
		    /*
		     * XXX all the SBS_CANTSENDMORE checks previously
		     * done could be out of date.  We could have received
		     * a reset packet in an interrupt or maybe we slept
		     * while doing page faults in uiomove() etc.  We could
		     * probably recheck again inside the locking protection
		     * here, but there are probably other places that this
		     * also happens.  We must rethink this.
		     */
		    error = (*so->so_proto->pr_usrreqs->pru_send)(so,
			(flags & MSG_OOB) ? PRUS_OOB :
			/*
			 * If the user set MSG_EOF, the protocol
			 * understands this flag and nothing left to
			 * send then use PRU_SEND_EOF instead of PRU_SEND.
			 */
			((flags & MSG_EOF) &&
			 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
			 (resid <= 0)) ?
				PRUS_EOF :
			/* If there is more to send set PRUS_MORETOCOME */
			(resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
			top, addr, control, td);
		    if (dontroute) {
			    SOCK_LOCK(so);
			    so->so_options &= ~SO_DONTROUTE;
			    SOCK_UNLOCK(so);
		    }
		    /* Ownership of top/control passed to the protocol. */
		    clen = 0;
		    control = NULL;
		    top = NULL;
		    mp = &top;
		    if (error) {
			SOCKBUF_LOCK(&so->so_snd);
			goto release;
		    }
		} while (resid && space > 0);
		SOCKBUF_LOCK(&so->so_snd);
	} while (resid);

release:
	SOCKBUF_LOCK_ASSERT(&so->so_snd);
	sbunlock(&so->so_snd);
out_locked:
	SOCKBUF_LOCK_ASSERT(&so->so_snd);
	SOCKBUF_UNLOCK(&so->so_snd);
out:
	if (top != NULL)
		m_freem(top);
	if (control != NULL)
		m_freem(control);
	return (error);
}

/*
 * The part of soreceive() that implements reading non-inline out-of-band
 * data from a socket.  For more complete comments, see soreceive(), from
 * which this code originated.
 *
 * XXXRW: Note that soreceive_rcvoob(), unlike the remainder of soreceive(),
 * is unable to return an mbuf chain to the caller.
85133965Sjdp */ 85233965Sjdpstatic int 85333965Sjdpsoreceive_rcvoob(so, uio, flags) 85433965Sjdp struct socket *so; 85533965Sjdp struct uio *uio; 85633965Sjdp int flags; 85733965Sjdp{ 85833965Sjdp struct protosw *pr = so->so_proto; 85933965Sjdp struct mbuf *m; 86033965Sjdp int error; 86133965Sjdp 86233965Sjdp KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0")); 86333965Sjdp 86433965Sjdp m = m_get(M_TRYWAIT, MT_DATA); 86533965Sjdp if (m == NULL) 86633965Sjdp return (ENOBUFS); 86733965Sjdp error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK); 86833965Sjdp if (error) 86933965Sjdp goto bad; 87033965Sjdp do { 87133965Sjdp#ifdef ZERO_COPY_SOCKETS 87233965Sjdp if (so_zero_copy_receive) { 87333965Sjdp vm_page_t pg; 87433965Sjdp int disposable; 87533965Sjdp 87633965Sjdp if ((m->m_flags & M_EXT) 87733965Sjdp && (m->m_ext.ext_type == EXT_DISPOSABLE)) 87833965Sjdp disposable = 1; 87933965Sjdp else 88033965Sjdp disposable = 0; 88133965Sjdp 88233965Sjdp pg = PHYS_TO_VM_PAGE(vtophys(mtod(m, caddr_t))); 88333965Sjdp if (uio->uio_offset == -1) 88433965Sjdp uio->uio_offset =IDX_TO_OFF(pg->pindex); 88533965Sjdp 88633965Sjdp error = uiomoveco(mtod(m, void *), 88733965Sjdp min(uio->uio_resid, m->m_len), 88833965Sjdp uio, pg->object, 88933965Sjdp disposable); 89033965Sjdp } else 89133965Sjdp#endif /* ZERO_COPY_SOCKETS */ 89233965Sjdp error = uiomove(mtod(m, void *), 89333965Sjdp (int) min(uio->uio_resid, m->m_len), uio); 89433965Sjdp m = m_free(m); 89533965Sjdp } while (uio->uio_resid && error == 0 && m); 89633965Sjdpbad: 89733965Sjdp if (m != NULL) 89833965Sjdp m_freem(m); 89933965Sjdp return (error); 90033965Sjdp} 90133965Sjdp 90233965Sjdp/* 90333965Sjdp * Following replacement or removal of the first mbuf on the first mbuf chain 90433965Sjdp * of a socket buffer, push necessary state changes back into the socket 90533965Sjdp * buffer so that other consumers see the values consistently. 
'nextrecord' 90633965Sjdp * is the callers locally stored value of the original value of 90733965Sjdp * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes. 90833965Sjdp * NOTE: 'nextrecord' may be NULL. 90933965Sjdp */ 91033965Sjdpstatic __inline void 91133965Sjdpsockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord) 91233965Sjdp{ 91333965Sjdp 91433965Sjdp SOCKBUF_LOCK_ASSERT(sb); 91533965Sjdp /* 91633965Sjdp * First, update for the new value of nextrecord. If necessary, make 91733965Sjdp * it the first record. 91833965Sjdp */ 91933965Sjdp if (sb->sb_mb != NULL) 92033965Sjdp sb->sb_mb->m_nextpkt = nextrecord; 92133965Sjdp else 92233965Sjdp sb->sb_mb = nextrecord; 92333965Sjdp 92433965Sjdp /* 92533965Sjdp * Now update any dependent socket buffer fields to reflect the new 92633965Sjdp * state. This is an expanded inline of SB_EMPTY_FIXUP(), with the 92733965Sjdp * addition of a second clause that takes care of the case where 92833965Sjdp * sb_mb has been updated, but remains the last record. 92933965Sjdp */ 93033965Sjdp if (sb->sb_mb == NULL) { 93133965Sjdp sb->sb_mbtail = NULL; 93233965Sjdp sb->sb_lastrecord = NULL; 93333965Sjdp } else if (sb->sb_mb->m_nextpkt == NULL) 93433965Sjdp sb->sb_lastrecord = sb->sb_mb; 93533965Sjdp} 93633965Sjdp 93733965Sjdp 93833965Sjdp/* 93933965Sjdp * Implement receive operations on a socket. 94033965Sjdp * We depend on the way that records are added to the sockbuf 94133965Sjdp * by sbappend*. In particular, each record (mbufs linked through m_next) 94233965Sjdp * must begin with an address if the protocol so specifies, 94333965Sjdp * followed by an optional mbuf or mbufs containing ancillary data, 94433965Sjdp * and then zero or more mbufs of data. 94533965Sjdp * In order to avoid blocking network interrupts for the entire time here, 94633965Sjdp * we splx() while doing the actual copy to user space. 
94733965Sjdp * Although the sockbuf is locked, new data may still be appended, 94833965Sjdp * and thus we must maintain consistency of the sockbuf during that time. 94933965Sjdp * 95033965Sjdp * The caller may receive the data as a single mbuf chain by supplying 95133965Sjdp * an mbuf **mp0 for use in returning the chain. The uio is then used 95233965Sjdp * only for the count in uio_resid. 95333965Sjdp */ 95433965Sjdpint 95533965Sjdpsoreceive(so, psa, uio, mp0, controlp, flagsp) 95633965Sjdp struct socket *so; 95733965Sjdp struct sockaddr **psa; 95833965Sjdp struct uio *uio; 95933965Sjdp struct mbuf **mp0; 96033965Sjdp struct mbuf **controlp; 96133965Sjdp int *flagsp; 96233965Sjdp{ 96333965Sjdp struct mbuf *m, **mp; 96433965Sjdp int flags, len, error, offset; 96533965Sjdp struct protosw *pr = so->so_proto; 96633965Sjdp struct mbuf *nextrecord; 96733965Sjdp int moff, type = 0; 96833965Sjdp int orig_resid = uio->uio_resid; 96933965Sjdp 97033965Sjdp mp = mp0; 97133965Sjdp if (psa != NULL) 97233965Sjdp *psa = NULL; 97333965Sjdp if (controlp != NULL) 97433965Sjdp *controlp = NULL; 97533965Sjdp if (flagsp != NULL) 97633965Sjdp flags = *flagsp &~ MSG_EOR; 97733965Sjdp else 97833965Sjdp flags = 0; 97933965Sjdp if (flags & MSG_OOB) 98033965Sjdp return (soreceive_rcvoob(so, uio, flags)); 98133965Sjdp if (mp != NULL) 98233965Sjdp *mp = NULL; 98333965Sjdp if (so->so_state & SS_ISCONFIRMING && uio->uio_resid) 98433965Sjdp (*pr->pr_usrreqs->pru_rcvd)(so, 0); 98533965Sjdp 98633965Sjdp SOCKBUF_LOCK(&so->so_rcv); 98733965Sjdprestart: 98833965Sjdp SOCKBUF_LOCK_ASSERT(&so->so_rcv); 98933965Sjdp error = sblock(&so->so_rcv, SBLOCKWAIT(flags)); 99033965Sjdp if (error) 99133965Sjdp goto out; 99233965Sjdp 99333965Sjdp m = so->so_rcv.sb_mb; 99433965Sjdp /* 99533965Sjdp * If we have less data than requested, block awaiting more 99633965Sjdp * (subject to any timeout) if: 99733965Sjdp * 1. the current count is less than the low water mark, or 99833965Sjdp * 2. 
MSG_WAITALL is set, and it is possible to do the entire 99933965Sjdp * receive operation at once if we block (resid <= hiwat). 100033965Sjdp * 3. MSG_DONTWAIT is not set 100133965Sjdp * If MSG_WAITALL is set but resid is larger than the receive buffer, 100233965Sjdp * we have to do the receive in sections, and thus risk returning 100333965Sjdp * a short count if a timeout or signal occurs after we start. 100433965Sjdp */ 100533965Sjdp if (m == NULL || (((flags & MSG_DONTWAIT) == 0 && 100633965Sjdp so->so_rcv.sb_cc < uio->uio_resid) && 100733965Sjdp (so->so_rcv.sb_cc < so->so_rcv.sb_lowat || 100833965Sjdp ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) && 100933965Sjdp m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) { 101033965Sjdp KASSERT(m != NULL || !so->so_rcv.sb_cc, 101133965Sjdp ("receive: m == %p so->so_rcv.sb_cc == %u", 101233965Sjdp m, so->so_rcv.sb_cc)); 101333965Sjdp if (so->so_error) { 101433965Sjdp if (m != NULL) 101533965Sjdp goto dontblock; 101633965Sjdp error = so->so_error; 101733965Sjdp if ((flags & MSG_PEEK) == 0) 101833965Sjdp so->so_error = 0; 101933965Sjdp goto release; 102033965Sjdp } 102133965Sjdp SOCKBUF_LOCK_ASSERT(&so->so_rcv); 102233965Sjdp if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 102333965Sjdp if (m) 102433965Sjdp goto dontblock; 102533965Sjdp else 102633965Sjdp goto release; 102733965Sjdp } 102833965Sjdp for (; m != NULL; m = m->m_next) 102933965Sjdp if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { 103033965Sjdp m = so->so_rcv.sb_mb; 103133965Sjdp goto dontblock; 103233965Sjdp } 103333965Sjdp if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 && 103433965Sjdp (so->so_proto->pr_flags & PR_CONNREQUIRED)) { 103533965Sjdp error = ENOTCONN; 103633965Sjdp goto release; 103733965Sjdp } 103833965Sjdp if (uio->uio_resid == 0) 103933965Sjdp goto release; 104033965Sjdp if ((so->so_state & SS_NBIO) || 104133965Sjdp (flags & (MSG_DONTWAIT|MSG_NBIO))) { 104233965Sjdp error = EWOULDBLOCK; 104333965Sjdp 
goto release; 104433965Sjdp } 104533965Sjdp SBLASTRECORDCHK(&so->so_rcv); 104633965Sjdp SBLASTMBUFCHK(&so->so_rcv); 104733965Sjdp sbunlock(&so->so_rcv); 104833965Sjdp error = sbwait(&so->so_rcv); 104933965Sjdp if (error) 105033965Sjdp goto out; 105133965Sjdp goto restart; 105233965Sjdp } 105333965Sjdpdontblock: 105433965Sjdp /* 105533965Sjdp * From this point onward, we maintain 'nextrecord' as a cache of the 105633965Sjdp * pointer to the next record in the socket buffer. We must keep the 105733965Sjdp * various socket buffer pointers and local stack versions of the 105833965Sjdp * pointers in sync, pushing out modifications before dropping the 105933965Sjdp * socket buffer mutex, and re-reading them when picking it up. 106033965Sjdp * 106133965Sjdp * Otherwise, we will race with the network stack appending new data 106233965Sjdp * or records onto the socket buffer by using inconsistent/stale 106333965Sjdp * versions of the field, possibly resulting in socket buffer 106433965Sjdp * corruption. 106533965Sjdp * 106633965Sjdp * By holding the high-level sblock(), we prevent simultaneous 106733965Sjdp * readers from pulling off the front of the socket buffer. 
106833965Sjdp */ 106933965Sjdp SOCKBUF_LOCK_ASSERT(&so->so_rcv); 107033965Sjdp if (uio->uio_td) 107133965Sjdp uio->uio_td->td_proc->p_stats->p_ru.ru_msgrcv++; 107233965Sjdp KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb")); 107333965Sjdp SBLASTRECORDCHK(&so->so_rcv); 107433965Sjdp SBLASTMBUFCHK(&so->so_rcv); 107533965Sjdp nextrecord = m->m_nextpkt; 107633965Sjdp if (pr->pr_flags & PR_ADDR) { 107733965Sjdp KASSERT(m->m_type == MT_SONAME, 107833965Sjdp ("m->m_type == %d", m->m_type)); 107933965Sjdp orig_resid = 0; 108033965Sjdp if (psa != NULL) 108133965Sjdp *psa = sodupsockaddr(mtod(m, struct sockaddr *), 108233965Sjdp M_NOWAIT); 108333965Sjdp if (flags & MSG_PEEK) { 108433965Sjdp m = m->m_next; 108533965Sjdp } else { 108633965Sjdp sbfree(&so->so_rcv, m); 108733965Sjdp so->so_rcv.sb_mb = m_free(m); 108833965Sjdp m = so->so_rcv.sb_mb; 108933965Sjdp sockbuf_pushsync(&so->so_rcv, nextrecord); 109033965Sjdp } 109133965Sjdp } 109233965Sjdp 109333965Sjdp /* 109433965Sjdp * Process one or more MT_CONTROL mbufs present before any data mbufs 109533965Sjdp * in the first mbuf chain on the socket buffer. If MSG_PEEK, we 109633965Sjdp * just copy the data; if !MSG_PEEK, we call into the protocol to 109733965Sjdp * perform externalization (or freeing if controlp == NULL). 
109833965Sjdp */ 109933965Sjdp if (m != NULL && m->m_type == MT_CONTROL) { 110033965Sjdp struct mbuf *cm = NULL, *cmn; 110133965Sjdp struct mbuf **cme = &cm; 110233965Sjdp 110333965Sjdp do { 110433965Sjdp if (flags & MSG_PEEK) { 110533965Sjdp if (controlp != NULL) { 110633965Sjdp *controlp = m_copy(m, 0, m->m_len); 110733965Sjdp controlp = &(*controlp)->m_next; 110833965Sjdp } 110933965Sjdp m = m->m_next; 111033965Sjdp } else { 111133965Sjdp sbfree(&so->so_rcv, m); 111233965Sjdp so->so_rcv.sb_mb = m->m_next; 111333965Sjdp m->m_next = NULL; 111433965Sjdp *cme = m; 111533965Sjdp cme = &(*cme)->m_next; 111633965Sjdp m = so->so_rcv.sb_mb; 111733965Sjdp } 111833965Sjdp } while (m != NULL && m->m_type == MT_CONTROL); 111933965Sjdp if ((flags & MSG_PEEK) == 0) 112033965Sjdp sockbuf_pushsync(&so->so_rcv, nextrecord); 112133965Sjdp while (cm != NULL) { 112233965Sjdp cmn = cm->m_next; 112333965Sjdp cm->m_next = NULL; 112433965Sjdp if (pr->pr_domain->dom_externalize != NULL) { 112533965Sjdp SOCKBUF_UNLOCK(&so->so_rcv); 112633965Sjdp error = (*pr->pr_domain->dom_externalize) 112733965Sjdp (cm, controlp); 112833965Sjdp SOCKBUF_LOCK(&so->so_rcv); 112933965Sjdp } else if (controlp != NULL) 113033965Sjdp *controlp = cm; 113133965Sjdp else 113233965Sjdp m_freem(cm); 113333965Sjdp if (controlp != NULL) { 113433965Sjdp orig_resid = 0; 113533965Sjdp while (*controlp != NULL) 113633965Sjdp controlp = &(*controlp)->m_next; 113733965Sjdp } 113833965Sjdp cm = cmn; 113933965Sjdp } 114033965Sjdp nextrecord = so->so_rcv.sb_mb->m_nextpkt; 114133965Sjdp orig_resid = 0; 114233965Sjdp } 114333965Sjdp if (m != NULL) { 114433965Sjdp if ((flags & MSG_PEEK) == 0) { 114533965Sjdp KASSERT(m->m_nextpkt == nextrecord, 114633965Sjdp ("soreceive: post-control, nextrecord !sync")); 114733965Sjdp if (nextrecord == NULL) { 114833965Sjdp KASSERT(so->so_rcv.sb_mb == m, 114933965Sjdp ("soreceive: post-control, sb_mb!=m")); 115033965Sjdp KASSERT(so->so_rcv.sb_lastrecord == m, 115133965Sjdp ("soreceive: 
post-control, lastrecord!=m")); 115233965Sjdp } 115333965Sjdp } 115433965Sjdp type = m->m_type; 115533965Sjdp if (type == MT_OOBDATA) 115633965Sjdp flags |= MSG_OOB; 115733965Sjdp } else { 115833965Sjdp if ((flags & MSG_PEEK) == 0) { 115933965Sjdp KASSERT(so->so_rcv.sb_mb == nextrecord, 116033965Sjdp ("soreceive: sb_mb != nextrecord")); 116133965Sjdp if (so->so_rcv.sb_mb == NULL) { 116233965Sjdp KASSERT(so->so_rcv.sb_lastrecord == NULL, 116333965Sjdp ("soreceive: sb_lastercord != NULL")); 116433965Sjdp } 116533965Sjdp } 116633965Sjdp } 116733965Sjdp SOCKBUF_LOCK_ASSERT(&so->so_rcv); 116833965Sjdp SBLASTRECORDCHK(&so->so_rcv); 116933965Sjdp SBLASTMBUFCHK(&so->so_rcv); 117033965Sjdp 117133965Sjdp /* 117233965Sjdp * Now continue to read any data mbufs off of the head of the socket 117333965Sjdp * buffer until the read request is satisfied. Note that 'type' is 117433965Sjdp * used to store the type of any mbuf reads that have happened so far 117533965Sjdp * such that soreceive() can stop reading if the type changes, which 117633965Sjdp * causes soreceive() to return only one of regular data and inline 117733965Sjdp * out-of-band data in a single socket receive operation. 117833965Sjdp */ 117933965Sjdp moff = 0; 118033965Sjdp offset = 0; 118133965Sjdp while (m != NULL && uio->uio_resid > 0 && error == 0) { 118233965Sjdp /* 118333965Sjdp * If the type of mbuf has changed since the last mbuf 118433965Sjdp * examined ('type'), end the receive operation. 
118533965Sjdp */ 118633965Sjdp SOCKBUF_LOCK_ASSERT(&so->so_rcv); 118733965Sjdp if (m->m_type == MT_OOBDATA) { 118833965Sjdp if (type != MT_OOBDATA) 118933965Sjdp break; 119033965Sjdp } else if (type == MT_OOBDATA) 119133965Sjdp break; 119233965Sjdp else 119333965Sjdp KASSERT(m->m_type == MT_DATA || m->m_type == MT_HEADER, 119433965Sjdp ("m->m_type == %d", m->m_type)); 119533965Sjdp so->so_rcv.sb_state &= ~SBS_RCVATMARK; 119633965Sjdp len = uio->uio_resid; 119733965Sjdp if (so->so_oobmark && len > so->so_oobmark - offset) 119833965Sjdp len = so->so_oobmark - offset; 119933965Sjdp if (len > m->m_len - moff) 120033965Sjdp len = m->m_len - moff; 120133965Sjdp /* 120233965Sjdp * If mp is set, just pass back the mbufs. 120333965Sjdp * Otherwise copy them out via the uio, then free. 120433965Sjdp * Sockbuf must be consistent here (points to current mbuf, 120533965Sjdp * it points to next record) when we drop priority; 120633965Sjdp * we must note any additions to the sockbuf when we 120733965Sjdp * block interrupts again. 
120833965Sjdp */ 120933965Sjdp if (mp == NULL) { 121033965Sjdp SOCKBUF_LOCK_ASSERT(&so->so_rcv); 121133965Sjdp SBLASTRECORDCHK(&so->so_rcv); 121233965Sjdp SBLASTMBUFCHK(&so->so_rcv); 121333965Sjdp SOCKBUF_UNLOCK(&so->so_rcv); 121433965Sjdp#ifdef ZERO_COPY_SOCKETS 121533965Sjdp if (so_zero_copy_receive) { 121633965Sjdp vm_page_t pg; 121733965Sjdp int disposable; 121833965Sjdp 121933965Sjdp if ((m->m_flags & M_EXT) 122033965Sjdp && (m->m_ext.ext_type == EXT_DISPOSABLE)) 122133965Sjdp disposable = 1; 122233965Sjdp else 122333965Sjdp disposable = 0; 122433965Sjdp 122533965Sjdp pg = PHYS_TO_VM_PAGE(vtophys(mtod(m, caddr_t) + 122633965Sjdp moff)); 122733965Sjdp 122833965Sjdp if (uio->uio_offset == -1) 122933965Sjdp uio->uio_offset =IDX_TO_OFF(pg->pindex); 123033965Sjdp 123133965Sjdp error = uiomoveco(mtod(m, char *) + moff, 123233965Sjdp (int)len, uio,pg->object, 123333965Sjdp disposable); 123433965Sjdp } else 123533965Sjdp#endif /* ZERO_COPY_SOCKETS */ 123633965Sjdp error = uiomove(mtod(m, char *) + moff, (int)len, uio); 123733965Sjdp SOCKBUF_LOCK(&so->so_rcv); 123833965Sjdp if (error) 123933965Sjdp goto release; 124033965Sjdp } else 124133965Sjdp uio->uio_resid -= len; 124233965Sjdp SOCKBUF_LOCK_ASSERT(&so->so_rcv); 124333965Sjdp if (len == m->m_len - moff) { 124433965Sjdp if (m->m_flags & M_EOR) 124533965Sjdp flags |= MSG_EOR; 124633965Sjdp if (flags & MSG_PEEK) { 124733965Sjdp m = m->m_next; 124833965Sjdp moff = 0; 124933965Sjdp } else { 125033965Sjdp nextrecord = m->m_nextpkt; 125133965Sjdp sbfree(&so->so_rcv, m); 125233965Sjdp if (mp != NULL) { 125333965Sjdp *mp = m; 125433965Sjdp mp = &m->m_next; 125533965Sjdp so->so_rcv.sb_mb = m = m->m_next; 125633965Sjdp *mp = NULL; 125733965Sjdp } else { 125833965Sjdp so->so_rcv.sb_mb = m_free(m); 125933965Sjdp m = so->so_rcv.sb_mb; 126033965Sjdp } 126133965Sjdp if (m != NULL) { 126233965Sjdp m->m_nextpkt = nextrecord; 126333965Sjdp if (nextrecord == NULL) 126433965Sjdp so->so_rcv.sb_lastrecord = m; 126533965Sjdp } else { 
126633965Sjdp so->so_rcv.sb_mb = nextrecord; 126733965Sjdp SB_EMPTY_FIXUP(&so->so_rcv); 126833965Sjdp } 126933965Sjdp SBLASTRECORDCHK(&so->so_rcv); 127033965Sjdp SBLASTMBUFCHK(&so->so_rcv); 127133965Sjdp } 127233965Sjdp } else { 127333965Sjdp if (flags & MSG_PEEK) 127433965Sjdp moff += len; 127533965Sjdp else { 127633965Sjdp if (mp != NULL) { 127733965Sjdp SOCKBUF_UNLOCK(&so->so_rcv); 127833965Sjdp *mp = m_copym(m, 0, len, M_TRYWAIT); 127933965Sjdp SOCKBUF_LOCK(&so->so_rcv); 128033965Sjdp } 128133965Sjdp m->m_data += len; 128233965Sjdp m->m_len -= len; 128333965Sjdp so->so_rcv.sb_cc -= len; 128433965Sjdp } 128533965Sjdp } 128633965Sjdp SOCKBUF_LOCK_ASSERT(&so->so_rcv); 128733965Sjdp if (so->so_oobmark) { 128833965Sjdp if ((flags & MSG_PEEK) == 0) { 128933965Sjdp so->so_oobmark -= len; 129033965Sjdp if (so->so_oobmark == 0) { 129133965Sjdp so->so_rcv.sb_state |= SBS_RCVATMARK; 129233965Sjdp break; 129333965Sjdp } 129433965Sjdp } else { 129533965Sjdp offset += len; 129633965Sjdp if (offset == so->so_oobmark) 129733965Sjdp break; 129833965Sjdp } 129933965Sjdp } 130033965Sjdp if (flags & MSG_EOR) 130133965Sjdp break; 130233965Sjdp /* 130333965Sjdp * If the MSG_WAITALL flag is set (for non-atomic socket), 130433965Sjdp * we must not quit until "uio->uio_resid == 0" or an error 130533965Sjdp * termination. If a signal/timeout occurs, return 130633965Sjdp * with a short count but without error. 130733965Sjdp * Keep sockbuf locked against other readers. 130833965Sjdp */ 130933965Sjdp while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 && 131033965Sjdp !sosendallatonce(so) && nextrecord == NULL) { 131133965Sjdp SOCKBUF_LOCK_ASSERT(&so->so_rcv); 131233965Sjdp if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE) 131333965Sjdp break; 131433965Sjdp /* 131533965Sjdp * Notify the protocol that some data has been 131633965Sjdp * drained before blocking. 
131733965Sjdp */ 131833965Sjdp if (pr->pr_flags & PR_WANTRCVD && so->so_pcb != NULL) { 131933965Sjdp SOCKBUF_UNLOCK(&so->so_rcv); 132033965Sjdp (*pr->pr_usrreqs->pru_rcvd)(so, flags); 132133965Sjdp SOCKBUF_LOCK(&so->so_rcv); 132233965Sjdp } 132333965Sjdp SBLASTRECORDCHK(&so->so_rcv); 132433965Sjdp SBLASTMBUFCHK(&so->so_rcv); 132533965Sjdp error = sbwait(&so->so_rcv); 132633965Sjdp if (error) 132733965Sjdp goto release; 132833965Sjdp m = so->so_rcv.sb_mb; 132933965Sjdp if (m != NULL) 133033965Sjdp nextrecord = m->m_nextpkt; 133133965Sjdp } 133233965Sjdp } 133333965Sjdp 133433965Sjdp SOCKBUF_LOCK_ASSERT(&so->so_rcv); 133533965Sjdp if (m != NULL && pr->pr_flags & PR_ATOMIC) { 133633965Sjdp flags |= MSG_TRUNC; 133733965Sjdp if ((flags & MSG_PEEK) == 0) 133833965Sjdp (void) sbdroprecord_locked(&so->so_rcv); 133933965Sjdp } 134033965Sjdp if ((flags & MSG_PEEK) == 0) { 134133965Sjdp if (m == NULL) { 134233965Sjdp /* 134333965Sjdp * First part is an inline SB_EMPTY_FIXUP(). Second 134433965Sjdp * part makes sure sb_lastrecord is up-to-date if 134533965Sjdp * there is still data in the socket buffer. 
134633965Sjdp */ 134733965Sjdp so->so_rcv.sb_mb = nextrecord; 134833965Sjdp if (so->so_rcv.sb_mb == NULL) { 134933965Sjdp so->so_rcv.sb_mbtail = NULL; 135033965Sjdp so->so_rcv.sb_lastrecord = NULL; 135133965Sjdp } else if (nextrecord->m_nextpkt == NULL) 135233965Sjdp so->so_rcv.sb_lastrecord = nextrecord; 135333965Sjdp } 135433965Sjdp SBLASTRECORDCHK(&so->so_rcv); 135533965Sjdp SBLASTMBUFCHK(&so->so_rcv); 135633965Sjdp if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) { 135733965Sjdp SOCKBUF_UNLOCK(&so->so_rcv); 135833965Sjdp (*pr->pr_usrreqs->pru_rcvd)(so, flags); 135933965Sjdp SOCKBUF_LOCK(&so->so_rcv); 136033965Sjdp } 136133965Sjdp } 136233965Sjdp SOCKBUF_LOCK_ASSERT(&so->so_rcv); 136333965Sjdp if (orig_resid == uio->uio_resid && orig_resid && 136433965Sjdp (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) { 136533965Sjdp sbunlock(&so->so_rcv); 136633965Sjdp goto restart; 136733965Sjdp } 136833965Sjdp 136933965Sjdp if (flagsp != NULL) 137033965Sjdp *flagsp |= flags; 137133965Sjdprelease: 137233965Sjdp SOCKBUF_LOCK_ASSERT(&so->so_rcv); 137333965Sjdp sbunlock(&so->so_rcv); 137433965Sjdpout: 137533965Sjdp SOCKBUF_LOCK_ASSERT(&so->so_rcv); 137633965Sjdp SOCKBUF_UNLOCK(&so->so_rcv); 137733965Sjdp return (error); 137833965Sjdp} 137933965Sjdp 138033965Sjdpint 138133965Sjdpsoshutdown(so, how) 138233965Sjdp struct socket *so; 138333965Sjdp int how; 138433965Sjdp{ 138533965Sjdp struct protosw *pr = so->so_proto; 138633965Sjdp 138733965Sjdp if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR)) 138833965Sjdp return (EINVAL); 138933965Sjdp 139033965Sjdp if (how != SHUT_WR) 139133965Sjdp sorflush(so); 139233965Sjdp if (how != SHUT_RD) 139333965Sjdp return ((*pr->pr_usrreqs->pru_shutdown)(so)); 139433965Sjdp return (0); 139533965Sjdp} 139633965Sjdp 139733965Sjdpvoid 139833965Sjdpsorflush(so) 139933965Sjdp struct socket *so; 140033965Sjdp{ 140133965Sjdp struct sockbuf *sb = &so->so_rcv; 140233965Sjdp struct protosw *pr = so->so_proto; 140333965Sjdp 
struct sockbuf asb; 140433965Sjdp 140533965Sjdp /* 140633965Sjdp * XXXRW: This is quite ugly. The existing code made a copy of the 140733965Sjdp * socket buffer, then zero'd the original to clear the buffer 140833965Sjdp * fields. However, with mutexes in the socket buffer, this causes 140933965Sjdp * problems. We only clear the zeroable bits of the original; 141033965Sjdp * however, we have to initialize and destroy the mutex in the copy 141133965Sjdp * so that dom_dispose() and sbrelease() can lock t as needed. 141233965Sjdp */ 141333965Sjdp SOCKBUF_LOCK(sb); 141433965Sjdp sb->sb_flags |= SB_NOINTR; 141533965Sjdp (void) sblock(sb, M_WAITOK); 141633965Sjdp /* 141733965Sjdp * socantrcvmore_locked() drops the socket buffer mutex so that it 141833965Sjdp * can safely perform wakeups. Re-acquire the mutex before 141933965Sjdp * continuing. 142033965Sjdp */ 142133965Sjdp socantrcvmore_locked(so); 142233965Sjdp SOCKBUF_LOCK(sb); 142333965Sjdp sbunlock(sb); 142433965Sjdp /* 142533965Sjdp * Invalidate/clear most of the sockbuf structure, but leave 142633965Sjdp * selinfo and mutex data unchanged. 
142733965Sjdp */ 142833965Sjdp bzero(&asb, offsetof(struct sockbuf, sb_startzero)); 142933965Sjdp bcopy(&sb->sb_startzero, &asb.sb_startzero, 143033965Sjdp sizeof(*sb) - offsetof(struct sockbuf, sb_startzero)); 143133965Sjdp bzero(&sb->sb_startzero, 143233965Sjdp sizeof(*sb) - offsetof(struct sockbuf, sb_startzero)); 143333965Sjdp SOCKBUF_UNLOCK(sb); 143433965Sjdp 143533965Sjdp SOCKBUF_LOCK_INIT(&asb, "so_rcv"); 143633965Sjdp if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL) 143733965Sjdp (*pr->pr_domain->dom_dispose)(asb.sb_mb); 143833965Sjdp sbrelease(&asb, so); 143933965Sjdp SOCKBUF_LOCK_DESTROY(&asb); 144033965Sjdp} 144133965Sjdp 144233965Sjdp#ifdef INET 144333965Sjdpstatic int 144433965Sjdpdo_setopt_accept_filter(so, sopt) 144533965Sjdp struct socket *so; 144633965Sjdp struct sockopt *sopt; 144733965Sjdp{ 144833965Sjdp struct accept_filter_arg *afap; 144933965Sjdp struct accept_filter *afp; 145033965Sjdp struct so_accf *newaf; 145133965Sjdp int error = 0; 145233965Sjdp 145333965Sjdp newaf = NULL; 145433965Sjdp afap = NULL; 145533965Sjdp 145633965Sjdp /* 145733965Sjdp * XXXRW: Configuring accept filters should be an atomic test-and-set 145833965Sjdp * operation to prevent races during setup and attach. There may be 145933965Sjdp * more general issues of racing and ordering here that are not yet 146033965Sjdp * addressed by locking. 
146133965Sjdp */ 146233965Sjdp /* do not set/remove accept filters on non listen sockets */ 146333965Sjdp SOCK_LOCK(so); 146433965Sjdp if ((so->so_options & SO_ACCEPTCONN) == 0) { 146533965Sjdp SOCK_UNLOCK(so); 146633965Sjdp return (EINVAL); 146733965Sjdp } 146833965Sjdp 146933965Sjdp /* removing the filter */ 147033965Sjdp if (sopt == NULL) { 147133965Sjdp if (so->so_accf != NULL) { 147233965Sjdp struct so_accf *af = so->so_accf; 147333965Sjdp if (af->so_accept_filter != NULL && 147433965Sjdp af->so_accept_filter->accf_destroy != NULL) { 147533965Sjdp af->so_accept_filter->accf_destroy(so); 147633965Sjdp } 147733965Sjdp if (af->so_accept_filter_str != NULL) { 147833965Sjdp FREE(af->so_accept_filter_str, M_ACCF); 147933965Sjdp } 148033965Sjdp FREE(af, M_ACCF); 148133965Sjdp so->so_accf = NULL; 148233965Sjdp } 148333965Sjdp so->so_options &= ~SO_ACCEPTFILTER; 148433965Sjdp SOCK_UNLOCK(so); 148533965Sjdp return (0); 148633965Sjdp } 148733965Sjdp SOCK_UNLOCK(so); 148833965Sjdp 148933965Sjdp /*- 149033965Sjdp * Adding a filter. 149133965Sjdp * 149233965Sjdp * Do memory allocation, copyin, and filter lookup now while we're 149333965Sjdp * not holding any locks. Avoids sleeping with a mutex, as well as 149433965Sjdp * introducing a lock order between accept filter locks and socket 149533965Sjdp * locks here. 
149633965Sjdp */ 149733965Sjdp MALLOC(afap, struct accept_filter_arg *, sizeof(*afap), M_TEMP, 149833965Sjdp M_WAITOK); 149933965Sjdp /* don't put large objects on the kernel stack */ 150033965Sjdp error = sooptcopyin(sopt, afap, sizeof *afap, sizeof *afap); 150133965Sjdp afap->af_name[sizeof(afap->af_name)-1] = '\0'; 150233965Sjdp afap->af_arg[sizeof(afap->af_arg)-1] = '\0'; 150333965Sjdp if (error) { 150433965Sjdp FREE(afap, M_TEMP); 150533965Sjdp return (error); 150633965Sjdp } 150733965Sjdp afp = accept_filt_get(afap->af_name); 150833965Sjdp if (afp == NULL) { 150933965Sjdp FREE(afap, M_TEMP); 151033965Sjdp return (ENOENT); 151133965Sjdp } 151233965Sjdp 151333965Sjdp /* 151433965Sjdp * Allocate the new accept filter instance storage. We may have to 151533965Sjdp * free it again later if we fail to attach it. If attached 151633965Sjdp * properly, 'newaf' is NULLed to avoid a free() while in use. 151733965Sjdp */ 151833965Sjdp MALLOC(newaf, struct so_accf *, sizeof(*newaf), M_ACCF, M_WAITOK | 151933965Sjdp M_ZERO); 152033965Sjdp if (afp->accf_create != NULL && afap->af_name[0] != '\0') { 152133965Sjdp int len = strlen(afap->af_name) + 1; 152233965Sjdp MALLOC(newaf->so_accept_filter_str, char *, len, M_ACCF, 152333965Sjdp M_WAITOK); 152433965Sjdp strcpy(newaf->so_accept_filter_str, afap->af_name); 152533965Sjdp } 152633965Sjdp 152733965Sjdp SOCK_LOCK(so); 152833965Sjdp /* must remove previous filter first */ 152933965Sjdp if (so->so_accf != NULL) { 153033965Sjdp error = EINVAL; 153133965Sjdp goto out; 153233965Sjdp } 153333965Sjdp /* 153433965Sjdp * Invoke the accf_create() method of the filter if required. 153533965Sjdp * XXXRW: the socket mutex is held over this call, so the create 153633965Sjdp * method cannot block. This may be something we have to change, but 153733965Sjdp * it would require addressing possible races. 
153833965Sjdp */ 153933965Sjdp if (afp->accf_create != NULL) { 154033965Sjdp newaf->so_accept_filter_arg = 154133965Sjdp afp->accf_create(so, afap->af_arg); 154233965Sjdp if (newaf->so_accept_filter_arg == NULL) { 154333965Sjdp error = EINVAL; 154433965Sjdp goto out; 154533965Sjdp } 154633965Sjdp } 154733965Sjdp newaf->so_accept_filter = afp; 154833965Sjdp so->so_accf = newaf; 154933965Sjdp so->so_options |= SO_ACCEPTFILTER; 155033965Sjdp newaf = NULL; 155133965Sjdpout: 155233965Sjdp SOCK_UNLOCK(so); 155333965Sjdp if (newaf != NULL) { 155433965Sjdp if (newaf->so_accept_filter_str != NULL) 155533965Sjdp FREE(newaf->so_accept_filter_str, M_ACCF); 155633965Sjdp FREE(newaf, M_ACCF); 155733965Sjdp } 155833965Sjdp if (afap != NULL) 155933965Sjdp FREE(afap, M_TEMP); 156033965Sjdp return (error); 156133965Sjdp} 156233965Sjdp#endif /* INET */ 156333965Sjdp 156433965Sjdp/* 156533965Sjdp * Perhaps this routine, and sooptcopyout(), below, ought to come in 156633965Sjdp * an additional variant to handle the case where the option value needs 156733965Sjdp * to be some kind of integer, but not a specific size. 156833965Sjdp * In addition to their use here, these functions are also called by the 156933965Sjdp * protocol-level pr_ctloutput() routines. 157033965Sjdp */ 157133965Sjdpint 157233965Sjdpsooptcopyin(sopt, buf, len, minlen) 157333965Sjdp struct sockopt *sopt; 157433965Sjdp void *buf; 157533965Sjdp size_t len; 157633965Sjdp size_t minlen; 157733965Sjdp{ 157833965Sjdp size_t valsize; 157933965Sjdp 158033965Sjdp /* 158133965Sjdp * If the user gives us more than we wanted, we ignore it, 158233965Sjdp * but if we don't get the minimum length the caller 158333965Sjdp * wants, we return EINVAL. On success, sopt->sopt_valsize 158433965Sjdp * is set to however much we actually retrieved. 
 */
	if ((valsize = sopt->sopt_valsize) < minlen)
		return EINVAL;
	/* Never read more than the caller's buffer can hold. */
	if (valsize > len)
		sopt->sopt_valsize = valsize = len;

	/*
	 * A non-NULL sopt_td means sopt_val is a user-space pointer and
	 * must be fetched with copyin(); otherwise it is a kernel
	 * pointer and a plain bcopy() suffices.
	 */
	if (sopt->sopt_td != NULL)
		return (copyin(sopt->sopt_val, buf, valsize));

	bcopy(sopt->sopt_val, buf, valsize);
	return 0;
}

/*
 * Kernel version of setsockopt(2).
 *
 * Builds a struct sockopt with a NULL thread pointer (so optval is
 * treated as a kernel address) and hands it to sosetopt().
 *
 * XXX: optlen is size_t, not socklen_t
 */
int
so_setsockopt(struct socket *so, int level, int optname, void *optval,
    size_t optlen)
{
	struct sockopt sopt;

	sopt.sopt_level = level;
	sopt.sopt_name = optname;
	sopt.sopt_dir = SOPT_SET;
	sopt.sopt_val = optval;
	sopt.sopt_valsize = optlen;
	sopt.sopt_td = NULL;	/* NULL td: sopt_val is a kernel pointer */
	return (sosetopt(so, &sopt));
}

/*
 * Set a socket option.
 *
 * Options at a level other than SOL_SOCKET are passed straight through
 * to the protocol's pr_ctloutput handler.  SOL_SOCKET options are
 * handled here; on success they are additionally offered to the
 * protocol's pr_ctloutput so it may take note of them (its return
 * value is deliberately ignored in that case).
 */
int
sosetopt(so, sopt)
	struct socket *so;
	struct sockopt *sopt;
{
	int	error, optval;
	struct	linger l;
	struct	timeval tv;
	u_long  val;
#ifdef MAC
	struct mac extmac;
#endif

	error = 0;
	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput)
			return ((*so->so_proto->pr_ctloutput)
				  (so, sopt));
		error = ENOPROTOOPT;
	} else {
		switch (sopt->sopt_name) {
#ifdef INET
		case SO_ACCEPTFILTER:
			error = do_setopt_accept_filter(so, sopt);
			if (error)
				goto bad;
			break;
#endif
		case SO_LINGER:
			error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
			if (error)
				goto bad;

			/* Update linger time and flag under the socket lock. */
			SOCK_LOCK(so);
			so->so_linger = l.l_linger;
			if (l.l_onoff)
				so->so_options |= SO_LINGER;
			else
				so->so_options &= ~SO_LINGER;
			SOCK_UNLOCK(so);
			break;

		/*
		 * Simple boolean options: the option name doubles as the
		 * bit to set or clear in so_options.
		 */
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_DONTROUTE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_BINTIME:
		case SO_NOSIGPIPE:
			error = sooptcopyin(sopt, &optval, sizeof optval,
					    sizeof optval);
			if (error)
				goto bad;
			SOCK_LOCK(so);
			if (optval)
				so->so_options |= sopt->sopt_name;
			else
				so->so_options &= ~sopt->sopt_name;
			SOCK_UNLOCK(so);
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
			error = sooptcopyin(sopt, &optval, sizeof optval,
					    sizeof optval);
			if (error)
				goto bad;

			/*
			 * Values < 1 make no sense for any of these
			 * options, so disallow them.
			 */
			if (optval < 1) {
				error = EINVAL;
				goto bad;
			}

			switch (sopt->sopt_name) {
			case SO_SNDBUF:
			case SO_RCVBUF:
				/* sbreserve() returns 0 on failure. */
				if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
				    &so->so_snd : &so->so_rcv, (u_long)optval,
				    so, curthread) == 0) {
					error = ENOBUFS;
					goto bad;
				}
				break;

			/*
			 * Make sure the low-water is never greater than
			 * the high-water.
			 */
			case SO_SNDLOWAT:
				SOCKBUF_LOCK(&so->so_snd);
				so->so_snd.sb_lowat =
				    (optval > so->so_snd.sb_hiwat) ?
				    so->so_snd.sb_hiwat : optval;
				SOCKBUF_UNLOCK(&so->so_snd);
				break;
			case SO_RCVLOWAT:
				SOCKBUF_LOCK(&so->so_rcv);
				so->so_rcv.sb_lowat =
				    (optval > so->so_rcv.sb_hiwat) ?
				    so->so_rcv.sb_hiwat : optval;
				SOCKBUF_UNLOCK(&so->so_rcv);
				break;
			}
			break;

		/*
		 * Timeouts are stored in the socket buffer in clock
		 * ticks; convert the user's timeval and bound the
		 * result to SHRT_MAX ticks.
		 */
		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			error = sooptcopyin(sopt, &tv, sizeof tv,
					    sizeof tv);
			if (error)
				goto bad;

			/* assert(hz > 0); */
			if (tv.tv_sec < 0 || tv.tv_sec > SHRT_MAX / hz ||
			    tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
				error = EDOM;
				goto bad;
			}
			/* assert(tick > 0); */
			/* assert(ULONG_MAX - SHRT_MAX >= 1000000); */
			val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick;
			if (val > SHRT_MAX) {
				error = EDOM;
				goto bad;
			}
			/* Round a non-zero timeout up to at least one tick. */
			if (val == 0 && tv.tv_usec != 0)
				val = 1;

			switch (sopt->sopt_name) {
			case SO_SNDTIMEO:
				so->so_snd.sb_timeo = val;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.sb_timeo = val;
				break;
			}
			break;
		case SO_LABEL:
#ifdef MAC
			error = sooptcopyin(sopt, &extmac, sizeof extmac,
			    sizeof extmac);
			if (error)
				goto bad;
			error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
			    so, &extmac);
#else
			error = EOPNOTSUPP;
#endif
			break;
		default:
			error = ENOPROTOOPT;
			break;
		}
		/*
		 * Give the protocol a chance to note a successfully set
		 * SOL_SOCKET option; its result is intentionally ignored.
		 */
		if (error == 0 && so->so_proto != NULL &&
		    so->so_proto->pr_ctloutput != NULL) {
			(void) ((*so->so_proto->pr_ctloutput)
				  (so, sopt));
		}
	}
bad:
	return (error);
}

/* Helper routine for getsockopt */
int
sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
{
	int	error;
	size_t	valsize;

	error = 0;

	/*
	 * Documented get behavior is that we always return a value,
	 * possibly truncated to fit in the user's buffer.
	 * Traditional behavior is that we always tell the user
	 * precisely how much we copied, rather than something useful
	 * like the total amount we had available for her.
	 * Note that this interface is not idempotent; the entire answer
	 * must be generated ahead of time.
	 */
	valsize = min(len, sopt->sopt_valsize);
	sopt->sopt_valsize = valsize;
	if (sopt->sopt_val != NULL) {
		/* Non-NULL sopt_td: destination is a user-space buffer. */
		if (sopt->sopt_td != NULL)
			error = copyout(buf, sopt->sopt_val, valsize);
		else
			bcopy(buf, sopt->sopt_val, valsize);
	}
	return error;
}

/*
 * Get a socket option.
 *
 * Mirrors sosetopt(): non-SOL_SOCKET levels are forwarded to the
 * protocol's pr_ctloutput; SOL_SOCKET options are answered here from
 * the socket and socket-buffer state.
 */
int
sogetopt(so, sopt)
	struct socket *so;
	struct sockopt *sopt;
{
	int	error, optval;
	struct	linger l;
	struct	timeval tv;
#ifdef INET
	struct accept_filter_arg *afap;
#endif
#ifdef MAC
	struct mac extmac;
#endif

	error = 0;
	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput) {
			return ((*so->so_proto->pr_ctloutput)
				  (so, sopt));
		} else
			return (ENOPROTOOPT);
	} else {
		switch (sopt->sopt_name) {
#ifdef INET
		case SO_ACCEPTFILTER:
			/* Unlocked read. */
			if ((so->so_options & SO_ACCEPTCONN) == 0)
				return (EINVAL);
			MALLOC(afap, struct accept_filter_arg *, sizeof(*afap),
				M_TEMP, M_WAITOK | M_ZERO);
			/* Snapshot the filter name/argument under the lock. */
			SOCK_LOCK(so);
			if ((so->so_options & SO_ACCEPTFILTER) != 0) {
				strcpy(afap->af_name, so->so_accf->so_accept_filter->accf_name);
				if (so->so_accf->so_accept_filter_str != NULL)
					strcpy(afap->af_arg, so->so_accf->so_accept_filter_str);
			}
			SOCK_UNLOCK(so);
			error = sooptcopyout(sopt, afap, sizeof(*afap));
			FREE(afap, M_TEMP);
			break;
#endif

		case SO_LINGER:
			/*
			 * XXXRW: We grab the lock here to get a consistent
			 * snapshot of both fields.  This may not really
			 * be necessary.
			 */
			SOCK_LOCK(so);
			l.l_onoff = so->so_options & SO_LINGER;
			l.l_linger = so->so_linger;
			SOCK_UNLOCK(so);
			error = sooptcopyout(sopt, &l, sizeof l);
			break;

		/* Boolean options: report the so_options bit directly. */
		case SO_USELOOPBACK:
		case SO_DONTROUTE:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_BINTIME:
		case SO_NOSIGPIPE:
			optval = so->so_options & sopt->sopt_name;
integer:
			/* Common exit for every option returning an int. */
			error = sooptcopyout(sopt, &optval, sizeof optval);
			break;

		case SO_TYPE:
			optval = so->so_type;
			goto integer;

		case SO_ERROR:
			/* Reading SO_ERROR clears the pending error. */
			optval = so->so_error;
			so->so_error = 0;
			goto integer;

		case SO_SNDBUF:
			optval = so->so_snd.sb_hiwat;
			goto integer;

		case SO_RCVBUF:
			optval = so->so_rcv.sb_hiwat;
			goto integer;

		case SO_SNDLOWAT:
			optval = so->so_snd.sb_lowat;
			goto integer;

		case SO_RCVLOWAT:
			optval = so->so_rcv.sb_lowat;
			goto integer;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			optval = (sopt->sopt_name == SO_SNDTIMEO ?
				  so->so_snd.sb_timeo : so->so_rcv.sb_timeo);

			/* Convert ticks back to a timeval for the caller. */
			tv.tv_sec = optval / hz;
			tv.tv_usec = (optval % hz) * tick;
			error = sooptcopyout(sopt, &tv, sizeof tv);
			break;
		case SO_LABEL:
#ifdef MAC
			/* Copy in the caller's label buffer descriptor first. */
			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
			    sizeof(extmac));
			if (error)
				return (error);
			error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
			    so, &extmac);
			if (error)
				return (error);
			error = sooptcopyout(sopt, &extmac, sizeof extmac);
#else
			error = EOPNOTSUPP;
#endif
			break;
		case SO_PEERLABEL:
#ifdef MAC
			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
			    sizeof(extmac));
			if (error)
				return (error);
			error = mac_getsockopt_peerlabel(
			    sopt->sopt_td->td_ucred, so, &extmac);
			if (error)
				return (error);
			error = sooptcopyout(sopt, &extmac, sizeof extmac);
#else
			error = EOPNOTSUPP;
#endif
			break;
		default:
			error = ENOPROTOOPT;
			break;
		}
		return (error);
	}
}
196333965Sjdp/* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */ 196433965Sjdpint 196533965Sjdpsoopt_getm(struct sockopt *sopt, struct mbuf **mp) 196633965Sjdp{ 196733965Sjdp struct mbuf *m, *m_prev; 196833965Sjdp int sopt_size = sopt->sopt_valsize; 196933965Sjdp 197033965Sjdp MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA); 197133965Sjdp if (m == NULL) 197233965Sjdp return ENOBUFS; 197333965Sjdp if (sopt_size > MLEN) { 197433965Sjdp MCLGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT); 197533965Sjdp if ((m->m_flags & M_EXT) == 0) { 197633965Sjdp m_free(m); 197733965Sjdp return ENOBUFS; 197833965Sjdp } 197933965Sjdp m->m_len = min(MCLBYTES, sopt_size); 198033965Sjdp } else { 198133965Sjdp m->m_len = min(MLEN, sopt_size); 198233965Sjdp } 198333965Sjdp sopt_size -= m->m_len; 198433965Sjdp *mp = m; 198533965Sjdp m_prev = m; 198633965Sjdp 198733965Sjdp while (sopt_size) { 198833965Sjdp MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA); 198933965Sjdp if (m == NULL) { 199033965Sjdp m_freem(*mp); 199133965Sjdp return ENOBUFS; 199233965Sjdp } 199333965Sjdp if (sopt_size > MLEN) { 199433965Sjdp MCLGET(m, sopt->sopt_td != NULL ? M_TRYWAIT : 199533965Sjdp M_DONTWAIT); 199633965Sjdp if ((m->m_flags & M_EXT) == 0) { 199733965Sjdp m_freem(m); 199833965Sjdp m_freem(*mp); 199933965Sjdp return ENOBUFS; 200033965Sjdp } 200133965Sjdp m->m_len = min(MCLBYTES, sopt_size); 200233965Sjdp } else { 200333965Sjdp m->m_len = min(MLEN, sopt_size); 200433965Sjdp } 200533965Sjdp sopt_size -= m->m_len; 200633965Sjdp m_prev->m_next = m; 200733965Sjdp m_prev = m; 200833965Sjdp } 200933965Sjdp return 0; 201033965Sjdp} 201133965Sjdp 201233965Sjdp/* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. 
*/ 201333965Sjdpint 201433965Sjdpsoopt_mcopyin(struct sockopt *sopt, struct mbuf *m) 201533965Sjdp{ 201633965Sjdp struct mbuf *m0 = m; 201733965Sjdp 201833965Sjdp if (sopt->sopt_val == NULL) 201933965Sjdp return 0; 202033965Sjdp while (m != NULL && sopt->sopt_valsize >= m->m_len) { 202133965Sjdp if (sopt->sopt_td != NULL) { 202233965Sjdp int error; 202333965Sjdp 202433965Sjdp error = copyin(sopt->sopt_val, mtod(m, char *), 202533965Sjdp m->m_len); 202633965Sjdp if (error != 0) { 202733965Sjdp m_freem(m0); 202833965Sjdp return(error); 202933965Sjdp } 203033965Sjdp } else 203133965Sjdp bcopy(sopt->sopt_val, mtod(m, char *), m->m_len); 203233965Sjdp sopt->sopt_valsize -= m->m_len; 203333965Sjdp sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; 203433965Sjdp m = m->m_next; 203533965Sjdp } 203633965Sjdp if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */ 203733965Sjdp panic("ip6_sooptmcopyin"); 203833965Sjdp return 0; 203933965Sjdp} 204033965Sjdp 204133965Sjdp/* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. 
 */
int
soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
{
	struct mbuf *m0 = m;
	size_t valsize = 0;

	/* Nothing to do if the caller supplied no value buffer. */
	if (sopt->sopt_val == NULL)
		return 0;
	/*
	 * Copy each mbuf's data out to the caller's buffer, advancing
	 * the sockopt cursor and accumulating the total copied length.
	 */
	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
		if (sopt->sopt_td != NULL) {
			int error;

			error = copyout(mtod(m, char *), sopt->sopt_val,
			    m->m_len);
			if (error != 0) {
				m_freem(m0);
				return(error);
			}
		} else
			bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
		sopt->sopt_valsize -= m->m_len;
		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
		valsize += m->m_len;
		m = m->m_next;
	}
	if (m != NULL) {
		/* enough soopt buffer should be given from user-land */
		m_freem(m0);
		return(EINVAL);
	}
	/* Report how many bytes were actually copied out. */
	sopt->sopt_valsize = valsize;
	return 0;
}

/*
 * Notify the owner and any selectors/pollers of the socket that
 * out-of-band data has arrived: deliver SIGURG to the registered
 * sigio recipient and wake up anyone selecting on the receive buffer.
 */
void
sohasoutofband(so)
	struct socket *so;
{
	if (so->so_sigio != NULL)
		pgsigio(&so->so_sigio, SIGURG, 0);
	selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
}

/*
 * poll(2)/select(2) backend for sockets: report which of the requested
 * events are currently true, and if none are, register this thread with
 * the relevant socket buffers' selinfo so it is woken on a change.
 */
int
sopoll(struct socket *so, int events, struct ucred *active_cred,
    struct thread *td)
{
	int revents = 0;

	if (events & (POLLIN | POLLRDNORM))
		if (soreadable(so))
			revents |= events & (POLLIN | POLLRDNORM);

	/* POLLINIGNEOF: readable data/connections without counting EOF. */
	if (events & POLLINIGNEOF)
		if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat ||
		    !TAILQ_EMPTY(&so->so_comp) || so->so_error)
			revents |= POLLINIGNEOF;

	if (events & (POLLOUT | POLLWRNORM))
		if (sowriteable(so))
			revents |= events & (POLLOUT | POLLWRNORM);

	if (events & (POLLPRI | POLLRDBAND))
		if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK))
			revents |= events & (POLLPRI | POLLRDBAND);

	if (revents == 0) {
		/*
		 * Nothing is ready: record the selector on whichever
		 * buffers correspond to the requested events, under the
		 * respective sockbuf locks.
		 */
		if (events &
		    (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM |
		     POLLRDBAND)) {
			SOCKBUF_LOCK(&so->so_rcv);
			selrecord(td, &so->so_rcv.sb_sel);
			so->so_rcv.sb_flags |= SB_SEL;
			SOCKBUF_UNLOCK(&so->so_rcv);
		}

		if (events & (POLLOUT | POLLWRNORM)) {
			SOCKBUF_LOCK(&so->so_snd);
			selrecord(td, &so->so_snd.sb_sel);
			so->so_snd.sb_flags |= SB_SEL;
			SOCKBUF_UNLOCK(&so->so_snd);
		}
	}

	return (revents);
}

/*
 * kqueue(2) attach routine for sockets: pick the filter ops and socket
 * buffer matching the requested filter (listening sockets get the
 * solisten filter for EVFILT_READ) and link the knote into that
 * buffer's note list.
 */
int
soo_kqfilter(struct file *fp, struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	struct sockbuf *sb;

	switch (kn->kn_filter) {
	case EVFILT_READ:
		if (so->so_options & SO_ACCEPTCONN)
			kn->kn_fop = &solisten_filtops;
		else
			kn->kn_fop = &soread_filtops;
		sb = &so->so_rcv;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &sowrite_filtops;
		sb = &so->so_snd;
		break;
	default:
		return (EINVAL);
	}

	SOCKBUF_LOCK(sb);
	knlist_add(&sb->sb_sel.si_note, kn, 1);
	sb->sb_flags |= SB_KNOTE;
	SOCKBUF_UNLOCK(sb);
	return (0);
}

/*
 * Detach a read knote; clear SB_KNOTE once the last knote is removed
 * from the receive buffer's note list.
 */
static void
filt_sordetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;

	SOCKBUF_LOCK(&so->so_rcv);
	knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1);
	if (knlist_empty(&so->so_rcv.sb_sel.si_note))
		so->so_rcv.sb_flags &= ~SB_KNOTE;
	SOCKBUF_UNLOCK(&so->so_rcv);
}

/*
 * EVFILT_READ event predicate.  Returns non-zero (event is active) on
 * EOF or a pending socket error, or when the available data meets the
 * caller-supplied NOTE_LOWAT threshold (falling back to the socket's
 * own receive low-water mark).  Called with the rcv sockbuf locked.
 */
/*ARGSUSED*/
static int
filt_soread(struct knote *kn, long hint)
{
	struct socket *so;

	so = kn->kn_fp->f_data;
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);

	/* Bytes of data available, excluding control-message bytes. */
	kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
	if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		return (1);
	} else if (so->so_error)	/* temporary udp error */
		return (1);
	else if (kn->kn_sfflags & NOTE_LOWAT)
		return (kn->kn_data >= kn->kn_sdata);
	else
		return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat);
}

/*
 * Detach a write knote; mirror of filt_sordetach() for the send buffer.
 */
static void
filt_sowdetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;

	SOCKBUF_LOCK(&so->so_snd);
	knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1);
	if (knlist_empty(&so->so_snd.sb_sel.si_note))
		so->so_snd.sb_flags &= ~SB_KNOTE;
	SOCKBUF_UNLOCK(&so->so_snd);
}

/*
 * EVFILT_WRITE event predicate.  Active on EOF or pending error;
 * inactive on an unconnected connection-oriented socket; otherwise
 * compares free send-buffer space against NOTE_LOWAT or the socket's
 * send low-water mark.  Called with the snd sockbuf locked.
 */
/*ARGSUSED*/
static int
filt_sowrite(struct knote *kn, long hint)
{
	struct socket *so;

	so = kn->kn_fp->f_data;
	SOCKBUF_LOCK_ASSERT(&so->so_snd);
	kn->kn_data = sbspace(&so->so_snd);
	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		return (1);
	} else if (so->so_error)	/* temporary udp error */
		return (1);
	else if (((so->so_state & SS_ISCONNECTED) == 0) &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
		return (0);
	else if (kn->kn_sfflags & NOTE_LOWAT)
		return (kn->kn_data >= kn->kn_sdata);
	else
		return (kn->kn_data >= so->so_snd.sb_lowat);
}

/*
 * Event predicate for listening sockets: active when the completed
 * connection queue is non-empty; kn_data reports the queue length.
 */
/*ARGSUSED*/
static int
filt_solisten(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;

	kn->kn_data = so->so_qlen;
	return (! TAILQ_EMPTY(&so->so_comp));
}

/*
 * Check whether the socket's credential matches the given uid.
 * Returns 0 on a match, EPERM otherwise (including a NULL socket).
 */
int
socheckuid(struct socket *so, uid_t uid)
{

	if (so == NULL)
		return (EPERM);
	if (so->so_cred->cr_uid == uid)
		return (0);
	return (EPERM);
}