uipc_socket.c revision 53541
1109998Smarkm/*
2296341Sdelphij * Copyright (c) 1982, 1986, 1988, 1990, 1993
3296341Sdelphij *	The Regents of the University of California.  All rights reserved.
4296341Sdelphij *
5109998Smarkm * Redistribution and use in source and binary forms, with or without
6109998Smarkm * modification, are permitted provided that the following conditions
7109998Smarkm * are met:
8109998Smarkm * 1. Redistributions of source code must retain the above copyright
9109998Smarkm *    notice, this list of conditions and the following disclaimer.
10109998Smarkm * 2. Redistributions in binary form must reproduce the above copyright
11109998Smarkm *    notice, this list of conditions and the following disclaimer in the
12109998Smarkm *    documentation and/or other materials provided with the distribution.
13109998Smarkm * 3. All advertising materials mentioning features or use of this software
14296341Sdelphij *    must display the following acknowledgement:
15109998Smarkm *	This product includes software developed by the University of
16109998Smarkm *	California, Berkeley and its contributors.
17109998Smarkm * 4. Neither the name of the University nor the names of its contributors
18109998Smarkm *    may be used to endorse or promote products derived from this software
19109998Smarkm *    without specific prior written permission.
20109998Smarkm *
21109998Smarkm * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22109998Smarkm * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23109998Smarkm * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24109998Smarkm * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25109998Smarkm * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26109998Smarkm * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27109998Smarkm * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28109998Smarkm * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29109998Smarkm * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30109998Smarkm * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31109998Smarkm * SUCH DAMAGE.
32109998Smarkm *
33109998Smarkm *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
34109998Smarkm * $FreeBSD: head/sys/kern/uipc_socket.c 53541 1999-11-22 02:45:11Z shin $
35109998Smarkm */
36109998Smarkm
37109998Smarkm#include <sys/param.h>
38109998Smarkm#include <sys/systm.h>
39109998Smarkm#include <sys/fcntl.h>
40109998Smarkm#include <sys/malloc.h>
41109998Smarkm#include <sys/mbuf.h>
42109998Smarkm#include <sys/domain.h>
43109998Smarkm#include <sys/kernel.h>
44109998Smarkm#include <sys/malloc.h>
45109998Smarkm#include <sys/poll.h>
46109998Smarkm#include <sys/proc.h>
47109998Smarkm#include <sys/protosw.h>
48109998Smarkm#include <sys/socket.h>
49109998Smarkm#include <sys/socketvar.h>
50109998Smarkm#include <sys/resourcevar.h>
51109998Smarkm#include <sys/signalvar.h>
52109998Smarkm#include <sys/sysctl.h>
53109998Smarkm#include <sys/uio.h>
54109998Smarkm#include <vm/vm_zone.h>
55109998Smarkm
56109998Smarkm#include <machine/limits.h>
57109998Smarkm
struct	vm_zone *socket_zone;	/* zone from which struct sockets are allocated */
so_gen_t	so_gencnt;	/* generation count for sockets */

MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

SYSCTL_DECL(_kern_ipc);

/* Cap on listen(2) backlogs; tunable at runtime via kern.ipc.somaxconn. */
static int somaxconn = SOMAXCONN;
SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW,
    &somaxconn, 0, "Maximum pending socket connection queue size");
69296341Sdelphij
70296341Sdelphij/*
71296341Sdelphij * Socket operation routines.
72296341Sdelphij * These routines are called by the routines in
73296341Sdelphij * sys_socket.c or from a system process, and
74296341Sdelphij * implement the semantics of socket operations by
75296341Sdelphij * switching out to the protocol specific routines.
76296341Sdelphij */
77296341Sdelphij
78296341Sdelphij/*
79296341Sdelphij * Get a socket structure from our zone, and initialize it.
80296341Sdelphij * We don't implement `waitok' yet (see comments in uipc_domain.c).
81109998Smarkm * Note that it would probably be better to allocate socket
82296341Sdelphij * and PCB at the same time, but I'm not convinced that all
83238405Sjkim * the protocols can be easily modified to do this.
84296341Sdelphij */
struct socket *
soalloc(waitok)
	int waitok;	/* currently unused; see comments in uipc_domain.c */
{
	struct socket *so;

	so = zalloci(socket_zone);
	if (so) {
		/* XXX race condition for reentrant kernel */
		bzero(so, sizeof *so);
		/* Stamp a fresh generation number so stale references are detectable. */
		so->so_gencnt = ++so_gencnt;
		/* Remember the zone so sodealloc() can return the socket to it. */
		so->so_zone = socket_zone;
	}
	return so;	/* NULL if the zone allocation failed */
}
100296341Sdelphij
/*
 * Create a new socket of the given domain/type/protocol on behalf of
 * process p.  On success *aso points at the new socket and 0 is
 * returned; on failure an errno is returned and *aso is untouched.
 */
int
socreate(dom, aso, type, proto, p)
	int dom;
	struct socket **aso;
	register int type;
	int proto;
	struct proc *p;
{
	register struct protosw *prp;
	register struct socket *so;
	register int error;

	/* An explicit protocol wins; otherwise match on socket type alone. */
	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);
	if (prp == 0 || prp->pr_usrreqs->pru_attach == 0)
		return (EPROTONOSUPPORT);
	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = soalloc(p != 0);
	if (so == 0)
		return (ENOBUFS);

	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = type;
	/*
	 * NOTE(review): p is dereferenced unconditionally here although
	 * soalloc(p != 0) above suggests callers may pass p == NULL —
	 * verify that every caller supplies a valid proc.
	 */
	so->so_cred = p->p_ucred;
	crhold(so->so_cred);
	so->so_proto = prp;
	/* Let the protocol attach its PCB; on failure undo our reference. */
	error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
	if (error) {
		/* Mark NOFDREF so sofree() will actually release the socket. */
		so->so_state |= SS_NOFDREF;
		sofree(so);
		return (error);
	}
	*aso = so;
	return (0);
}
140296341Sdelphij
141296341Sdelphijint
142296341Sdelphijsobind(so, nam, p)
143296341Sdelphij	struct socket *so;
144296341Sdelphij	struct sockaddr *nam;
145296341Sdelphij	struct proc *p;
146296341Sdelphij{
147296341Sdelphij	int s = splnet();
148296341Sdelphij	int error;
149296341Sdelphij
150296341Sdelphij	error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
151296341Sdelphij	splx(s);
152296341Sdelphij	return (error);
153296341Sdelphij}
154296341Sdelphij
/*
 * Final disposal of a socket: return any socket-buffer space charged
 * against the owner's resource accounting, drop the credential
 * reference taken in socreate(), and give the memory back to the zone.
 */
void
sodealloc(so)
	struct socket *so;
{

	/* Bump the generation count so cached references see a dead socket. */
	so->so_gencnt = ++so_gencnt;
	/* Undo the per-uid sockbuf-size charges for both directions. */
	if (so->so_rcv.sb_hiwat)
		(void)chgsbsize(so->so_cred->cr_uid,
		    -(rlim_t)so->so_rcv.sb_hiwat);
	if (so->so_snd.sb_hiwat)
		(void)chgsbsize(so->so_cred->cr_uid,
		    -(rlim_t)so->so_snd.sb_hiwat);
	crfree(so->so_cred);
	zfreei(so->so_zone, so);
}
170296341Sdelphij
/*
 * Mark a socket as accepting connections (listen(2)).  The protocol is
 * consulted first via pru_listen; on success the accept queue limit is
 * set, clamped to [0, somaxconn].  Returns 0 or an errno.
 */
int
solisten(so, backlog, p)
	register struct socket *so;
	int backlog;
	struct proc *p;
{
	int s, error;

	s = splnet();
	error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
	if (error) {
		splx(s);
		return (error);
	}
	/*
	 * Only set SO_ACCEPTCONN if no completed connections are already
	 * queued; otherwise the flag state is left as-is.
	 */
	if (TAILQ_EMPTY(&so->so_comp))
		so->so_options |= SO_ACCEPTCONN;
	/* Negative or oversized backlogs fall back to the system maximum. */
	if (backlog < 0 || backlog > somaxconn)
		backlog = somaxconn;
	so->so_qlimit = backlog;
	splx(s);
	return (0);
}
193296341Sdelphij
/*
 * Release a socket if — and only if — it has no PCB and no file
 * descriptor reference.  Handles removal from a listening parent's
 * incomplete queue; sockets still on the completed (accept) queue are
 * deliberately left alone.
 */
void
sofree(so)
	register struct socket *so;
{
	struct socket *head = so->so_head;

	/* Still attached to a protocol, or still has an fd: not ours to free. */
	if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0)
		return;
	if (head != NULL) {
		if (so->so_state & SS_INCOMP) {
			TAILQ_REMOVE(&head->so_incomp, so, so_list);
			head->so_incqlen--;
		} else if (so->so_state & SS_COMP) {
			/*
			 * We must not decommission a socket that's
			 * on the accept(2) queue.  If we do, then
			 * accept(2) may hang after select(2) indicated
			 * that the listening socket was ready.
			 */
			return;
		} else {
			panic("sofree: not queued");
		}
		/*
		 * NOTE(review): so_qlen is decremented here even when the
		 * socket came off so_incomp (which has its own so_incqlen
		 * counter above) — confirm this matches the queue-length
		 * accounting used elsewhere (e.g. soclose()).
		 */
		head->so_qlen--;
		so->so_state &= ~SS_INCOMP;
		so->so_head = NULL;
	}
	/* Drop buffered send data, flush the receive side, then free. */
	sbrelease(&so->so_snd, so);
	sorflush(so);
	sodealloc(so);
}
225296341Sdelphij
226296341Sdelphij/*
227296341Sdelphij * Close a socket on last file table reference removal.
228296341Sdelphij * Initiate disconnect if connected.
229296341Sdelphij * Free socket when disconnect complete.
230296341Sdelphij */
int
soclose(so)
	register struct socket *so;
{
	int s = splnet();		/* conservative */
	int error = 0;

	/* Drop SIGIO ownership before tearing anything down. */
	funsetown(so->so_sigio);
	if (so->so_options & SO_ACCEPTCONN) {
		struct socket *sp, *sonext;

		/*
		 * A listening socket: abort every pending connection,
		 * both incomplete and completed-but-not-accepted.
		 * soabort() may free sp, so the next pointer is sampled
		 * before each call.
		 */
		sp = TAILQ_FIRST(&so->so_incomp);
		for (; sp != NULL; sp = sonext) {
			sonext = TAILQ_NEXT(sp, so_list);
			(void) soabort(sp);
		}
		for (sp = TAILQ_FIRST(&so->so_comp); sp != NULL; sp = sonext) {
			sonext = TAILQ_NEXT(sp, so_list);
			/* Dequeue from so_comp since sofree() won't do it */
			TAILQ_REMOVE(&so->so_comp, sp, so_list);
			so->so_qlen--;
			sp->so_state &= ~SS_COMP;
			sp->so_head = NULL;
			(void) soabort(sp);
		}
	}
	if (so->so_pcb == 0)
		goto discard;
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			/* Non-blocking socket mid-disconnect: don't wait. */
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO))
				goto drop;
			/*
			 * SO_LINGER: sleep until the disconnect completes,
			 * the linger timeout expires, or a signal arrives.
			 */
			while (so->so_state & SS_ISCONNECTED) {
				error = tsleep((caddr_t)&so->so_timeo,
				    PSOCK | PCATCH, "soclos", so->so_linger * hz);
				if (error)
					break;
			}
		}
	}
drop:
	/* Detach the PCB; preserve any earlier error over the detach error. */
	if (so->so_pcb) {
		int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
		if (error == 0)
			error = error2;
	}
discard:
	if (so->so_state & SS_NOFDREF)
		panic("soclose: NOFDREF");
	so->so_state |= SS_NOFDREF;
	sofree(so);
	splx(s);
	return (error);
}
291296341Sdelphij
292296341Sdelphij/*
293296341Sdelphij * Must be called at splnet...
294296341Sdelphij */
295296341Sdelphijint
296296341Sdelphijsoabort(so)
297296341Sdelphij	struct socket *so;
298296341Sdelphij{
299296341Sdelphij
300296341Sdelphij	return (*so->so_proto->pr_usrreqs->pru_abort)(so);
301296341Sdelphij}
302296341Sdelphij
/*
 * Complete an accept(2): clear the no-fd-reference flag and ask the
 * protocol for the peer's address via pru_accept.  If the connection
 * already disconnected, succeed with a null address instead.
 */
int
soaccept(so, nam)
	register struct socket *so;
	struct sockaddr **nam;
{
	int s = splnet();
	int error;

	/* The socket must still be fd-less when handed to accept. */
	if ((so->so_state & SS_NOFDREF) == 0)
		panic("soaccept: !NOFDREF");
	so->so_state &= ~SS_NOFDREF;
 	if ((so->so_state & SS_ISDISCONNECTED) == 0)
		error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
	else {
		/* Peer went away before accept completed: no address, no error. */
		if (nam)
			*nam = 0;
		error = 0;
	}
	splx(s);
	return (error);
}
324296341Sdelphij
/*
 * Initiate a connection (connect(2)).  Listening sockets cannot
 * connect.  Connection-based protocols may connect only once;
 * datagram-style sockets may be implicitly disconnected first.
 */
int
soconnect(so, nam, p)
	register struct socket *so;
	struct sockaddr *nam;
	struct proc *p;
{
	int s;
	int error;

	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	s = splnet();
	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	/*
	 * Note the embedded assignment: for connectionless protocols a
	 * failing sodisconnect() also lands in the EISCONN branch,
	 * overwriting its error with EISCONN.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnect(so))))
		error = EISCONN;
	else
		error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, p);
	splx(s);
	return (error);
}
352296341Sdelphij
353296341Sdelphijint
354296341Sdelphijsoconnect2(so1, so2)
355296341Sdelphij	register struct socket *so1;
356296341Sdelphij	struct socket *so2;
357296341Sdelphij{
358296341Sdelphij	int s = splnet();
359296341Sdelphij	int error;
360296341Sdelphij
361296341Sdelphij	error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
362296341Sdelphij	splx(s);
363296341Sdelphij	return (error);
364296341Sdelphij}
365296341Sdelphij
366296341Sdelphijint
367296341Sdelphijsodisconnect(so)
368296341Sdelphij	register struct socket *so;
369296341Sdelphij{
370296341Sdelphij	int s = splnet();
371296341Sdelphij	int error;
372296341Sdelphij
373296341Sdelphij	if ((so->so_state & SS_ISCONNECTED) == 0) {
374296341Sdelphij		error = ENOTCONN;
375296341Sdelphij		goto bad;
376296341Sdelphij	}
377296341Sdelphij	if (so->so_state & SS_ISDISCONNECTING) {
378296341Sdelphij		error = EALREADY;
379296341Sdelphij		goto bad;
380296341Sdelphij	}
381296341Sdelphij	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
382296341Sdelphijbad:
383296341Sdelphij	splx(s);
384296341Sdelphij	return (error);
385296341Sdelphij}
386296341Sdelphij
387296341Sdelphij#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
388296341Sdelphij/*
389296341Sdelphij * Send on a socket.
390296341Sdelphij * If send must go all at once and message is larger than
391296341Sdelphij * send buffering, then hard error.
392296341Sdelphij * Lock against other senders.
393296341Sdelphij * If must go all at once and not enough room now, then
394296341Sdelphij * inform user that this would block and do nothing.
395296341Sdelphij * Otherwise, if nonblocking, send as much as possible.
396296341Sdelphij * The data to be sent is described by "uio" if nonzero,
397296341Sdelphij * otherwise by the mbuf chain "top" (which must be null
398296341Sdelphij * if uio is not).  Data provided in mbuf chain must be small
399296341Sdelphij * enough to send all at once.
400296341Sdelphij *
401296341Sdelphij * Returns nonzero on error, timeout or signal; callers
402296341Sdelphij * must check for short counts if EINTR/ERESTART are returned.
403296341Sdelphij * Data and control buffers are freed on return.
404296341Sdelphij */
int
sosend(so, addr, uio, top, control, flags, p)
	register struct socket *so;
	struct sockaddr *addr;
	struct uio *uio;
	struct mbuf *top;
	struct mbuf *control;
	int flags;
	struct proc *p;
{
	struct mbuf **mp;
	register struct mbuf *m;
	register long space, len, resid;
	int clen = 0, error, s, dontroute, mlen;
	/* "atomic": the whole message must be handed to the protocol at once. */
	int atomic = sosendallatonce(so) || top;

	if (uio)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned.
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid.  On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
	 *
	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
	 * type sockets since that's an error.
	 */
	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
		error = EINVAL;
		goto out;
	}

	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	if (p)
		p->p_stats->p_ru.ru_msgsnd++;
	if (control)
		clen = control->m_len;
/* Error exit from inside the splnet section: drop spl, release sb lock. */
#define	snderr(errno)	{ error = errno; splx(s); goto release; }

restart:
	/* Serialize senders on the send buffer; may sleep unless MSG_DONTWAIT. */
	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out;
	do {
		s = splnet();
		if (so->so_state & SS_CANTSENDMORE)
			snderr(EPIPE);
		if (so->so_error) {
			/* Report and clear any asynchronous socket error. */
			error = so->so_error;
			so->so_error = 0;
			splx(s);
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			/*
			 * `sendto' and `sendmsg' are allowed on a connection-
			 * based socket if it supports implied connect.
			 * Return ENOTCONN if not connected and no address is
			 * supplied.
			 */
			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
				    !(resid == 0 && clen != 0))
					snderr(ENOTCONN);
			} else if (addr == 0)
			    snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
				   ENOTCONN : EDESTADDRREQ);
		}
		space = sbspace(&so->so_snd);
		/* Allow OOB data to slightly overcommit the send buffer. */
		if (flags & MSG_OOB)
			space += 1024;
		/* A message that can never fit fails immediately. */
		if ((atomic && resid > so->so_snd.sb_hiwat) ||
		    clen > so->so_snd.sb_hiwat)
			snderr(EMSGSIZE);
		if (space < resid + clen && uio &&
		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
			/* Not enough room now: fail (non-blocking) or wait. */
			if (so->so_state & SS_NBIO)
				snderr(EWOULDBLOCK);
			sbunlock(&so->so_snd);
			error = sbwait(&so->so_snd);
			splx(s);
			if (error)
				goto out;
			goto restart;
		}
		splx(s);
		mp = &top;
		space -= clen;
		do {
		    if (uio == NULL) {
			/*
			 * Data is prepackaged in "top".
			 */
			resid = 0;
			if (flags & MSG_EOR)
				top->m_flags |= M_EOR;
		    } else do {
			/* First mbuf of the chain gets a packet header. */
			if (top == 0) {
				MGETHDR(m, M_WAIT, MT_DATA);
				mlen = MHLEN;
				m->m_pkthdr.len = 0;
				m->m_pkthdr.rcvif = (struct ifnet *)0;
			} else {
				MGET(m, M_WAIT, MT_DATA);
				mlen = MLEN;
			}
			if (resid >= MINCLSIZE) {
				/* Worth a cluster; fall back if none available. */
				MCLGET(m, M_WAIT);
				if ((m->m_flags & M_EXT) == 0)
					goto nopages;
				mlen = MCLBYTES;
				len = min(min(mlen, resid), space);
			} else {
nopages:
				len = min(min(mlen, resid), space);
				/*
				 * For datagram protocols, leave room
				 * for protocol headers in first mbuf.
				 */
				if (atomic && top == 0 && len < mlen)
					MH_ALIGN(m, len);
			}
			space -= len;
			/* Copy user data; may sleep on page faults. */
			error = uiomove(mtod(m, caddr_t), (int)len, uio);
			resid = uio->uio_resid;
			m->m_len = len;
			*mp = m;
			top->m_pkthdr.len += len;
			if (error)
				goto release;
			mp = &m->m_next;
			if (resid <= 0) {
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
				break;
			}
		    } while (space > 0 && atomic);
		    if (dontroute)
			    so->so_options |= SO_DONTROUTE;
		    s = splnet();				/* XXX */
		    /*
		     * XXX all the SS_CANTSENDMORE checks previously
		     * done could be out of date.  We could have received
		     * a reset packet in an interrupt or maybe we slept
		     * while doing page faults in uiomove() etc. We could
		     * probably recheck again inside the splnet() protection
		     * here, but there are probably other places that this
		     * also happens.  We must rethink this.
		     */
		    error = (*so->so_proto->pr_usrreqs->pru_send)(so,
			(flags & MSG_OOB) ? PRUS_OOB :
			/*
			 * If the user set MSG_EOF, the protocol
			 * understands this flag and nothing left to
			 * send then use PRU_SEND_EOF instead of PRU_SEND.
			 */
			((flags & MSG_EOF) &&
			 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
			 (resid <= 0)) ?
				PRUS_EOF :
			/* If there is more to send set PRUS_MORETOCOME */
			(resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
			top, addr, control, p);
		    splx(s);
		    if (dontroute)
			    so->so_options &= ~SO_DONTROUTE;
		    /* Chain and control now belong to the protocol. */
		    clen = 0;
		    control = 0;
		    top = 0;
		    mp = &top;
		    if (error)
			goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	sbunlock(&so->so_snd);
out:
	/* Contract: data and control buffers are always consumed or freed. */
	if (top)
		m_freem(top);
	if (control)
		m_freem(control);
	return (error);
}
595296341Sdelphij
596296341Sdelphij/*
597296341Sdelphij * Implement receive operations on a socket.
598296341Sdelphij * We depend on the way that records are added to the sockbuf
599109998Smarkm * by sbappend*.  In particular, each record (mbufs linked through m_next)
600296341Sdelphij * must begin with an address if the protocol so specifies,
601296341Sdelphij * followed by an optional mbuf or mbufs containing ancillary data,
602296341Sdelphij * and then zero or more mbufs of data.
603296341Sdelphij * In order to avoid blocking network interrupts for the entire time here,
604296341Sdelphij * we splx() while doing the actual copy to user space.
605109998Smarkm * Although the sockbuf is locked, new data may still be appended,
606296341Sdelphij * and thus we must maintain consistency of the sockbuf during that time.
607296341Sdelphij *
608296341Sdelphij * The caller may receive the data as a single mbuf chain by supplying
609296341Sdelphij * an mbuf **mp0 for use in returning the chain.  The uio is then used
610296341Sdelphij * only for the count in uio_resid.
611296341Sdelphij */
612296341Sdelphijint
613296341Sdelphijsoreceive(so, psa, uio, mp0, controlp, flagsp)
614296341Sdelphij	register struct socket *so;
615296341Sdelphij	struct sockaddr **psa;
616296341Sdelphij	struct uio *uio;
617296341Sdelphij	struct mbuf **mp0;
618296341Sdelphij	struct mbuf **controlp;
619296341Sdelphij	int *flagsp;
620296341Sdelphij{
621296341Sdelphij	register struct mbuf *m, **mp;
622296341Sdelphij	register int flags, len, error, s, offset;
623296341Sdelphij	struct protosw *pr = so->so_proto;
624296341Sdelphij	struct mbuf *nextrecord;
625296341Sdelphij	int moff, type = 0;
626296341Sdelphij	int orig_resid = uio->uio_resid;
627296341Sdelphij
628296341Sdelphij	mp = mp0;
629296341Sdelphij	if (psa)
630109998Smarkm		*psa = 0;
631296341Sdelphij	if (controlp)
632109998Smarkm		*controlp = 0;
633296341Sdelphij	if (flagsp)
634296341Sdelphij		flags = *flagsp &~ MSG_EOR;
635296341Sdelphij	else
636296341Sdelphij		flags = 0;
637296341Sdelphij	if (flags & MSG_OOB) {
638296341Sdelphij		m = m_get(M_WAIT, MT_DATA);
639296341Sdelphij		error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
640296341Sdelphij		if (error)
641296341Sdelphij			goto bad;
642296341Sdelphij		do {
643296341Sdelphij			error = uiomove(mtod(m, caddr_t),
644109998Smarkm			    (int) min(uio->uio_resid, m->m_len), uio);
645296341Sdelphij			m = m_free(m);
646296341Sdelphij		} while (uio->uio_resid && error == 0 && m);
647296341Sdelphijbad:
648296341Sdelphij		if (m)
649109998Smarkm			m_freem(m);
650296341Sdelphij		return (error);
651296341Sdelphij	}
652109998Smarkm	if (mp)
653296341Sdelphij		*mp = (struct mbuf *)0;
654296341Sdelphij	if (so->so_state & SS_ISCONFIRMING && uio->uio_resid)
655296341Sdelphij		(*pr->pr_usrreqs->pru_rcvd)(so, 0);
656296341Sdelphij
657296341Sdelphijrestart:
658296341Sdelphij	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
659296341Sdelphij	if (error)
660296341Sdelphij		return (error);
661296341Sdelphij	s = splnet();
662296341Sdelphij
663296341Sdelphij	m = so->so_rcv.sb_mb;
664296341Sdelphij	/*
665296341Sdelphij	 * If we have less data than requested, block awaiting more
666296341Sdelphij	 * (subject to any timeout) if:
667296341Sdelphij	 *   1. the current count is less than the low water mark, or
668296341Sdelphij	 *   2. MSG_WAITALL is set, and it is possible to do the entire
669296341Sdelphij	 *	receive operation at once if we block (resid <= hiwat).
670296341Sdelphij	 *   3. MSG_DONTWAIT is not set
671296341Sdelphij	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
672238405Sjkim	 * we have to do the receive in sections, and thus risk returning
673296341Sdelphij	 * a short count if a timeout or signal occurs after we start.
674296341Sdelphij	 */
675296341Sdelphij	if (m == 0 || (((flags & MSG_DONTWAIT) == 0 &&
676296341Sdelphij	    so->so_rcv.sb_cc < uio->uio_resid) &&
677296341Sdelphij	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
678296341Sdelphij	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
679109998Smarkm	    m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) {
680296341Sdelphij		KASSERT(m != 0 || !so->so_rcv.sb_cc, ("receive 1"));
681296341Sdelphij		if (so->so_error) {
682109998Smarkm			if (m)
683296341Sdelphij				goto dontblock;
684296341Sdelphij			error = so->so_error;
685296341Sdelphij			if ((flags & MSG_PEEK) == 0)
686296341Sdelphij				so->so_error = 0;
687296341Sdelphij			goto release;
688296341Sdelphij		}
689296341Sdelphij		if (so->so_state & SS_CANTRCVMORE) {
690296341Sdelphij			if (m)
691296341Sdelphij				goto dontblock;
692109998Smarkm			else
693296341Sdelphij				goto release;
694296341Sdelphij		}
695296341Sdelphij		for (; m; m = m->m_next)
696296341Sdelphij			if (m->m_type == MT_OOBDATA  || (m->m_flags & M_EOR)) {
697296341Sdelphij				m = so->so_rcv.sb_mb;
698109998Smarkm				goto dontblock;
699296341Sdelphij			}
700296341Sdelphij		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
701296341Sdelphij		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
702296341Sdelphij			error = ENOTCONN;
703296341Sdelphij			goto release;
704296341Sdelphij		}
705296341Sdelphij		if (uio->uio_resid == 0)
706109998Smarkm			goto release;
707296341Sdelphij		if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT)) {
708296341Sdelphij			error = EWOULDBLOCK;
709296341Sdelphij			goto release;
710296341Sdelphij		}
711296341Sdelphij		sbunlock(&so->so_rcv);
712296341Sdelphij		error = sbwait(&so->so_rcv);
713296341Sdelphij		splx(s);
714296341Sdelphij		if (error)
715296341Sdelphij			return (error);
716296341Sdelphij		goto restart;
717296341Sdelphij	}
718296341Sdelphijdontblock:
719296341Sdelphij	if (uio->uio_procp)
720296341Sdelphij		uio->uio_procp->p_stats->p_ru.ru_msgrcv++;
721296341Sdelphij	nextrecord = m->m_nextpkt;
722296341Sdelphij	if (pr->pr_flags & PR_ADDR) {
723296341Sdelphij		KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
724296341Sdelphij		orig_resid = 0;
725296341Sdelphij		if (psa)
726296341Sdelphij			*psa = dup_sockaddr(mtod(m, struct sockaddr *),
727296341Sdelphij					    mp0 == 0);
728296341Sdelphij		if (flags & MSG_PEEK) {
729296341Sdelphij			m = m->m_next;
730296341Sdelphij		} else {
731296341Sdelphij			sbfree(&so->so_rcv, m);
732296341Sdelphij			MFREE(m, so->so_rcv.sb_mb);
733296341Sdelphij			m = so->so_rcv.sb_mb;
734296341Sdelphij		}
735109998Smarkm	}
736296341Sdelphij	while (m && m->m_type == MT_CONTROL && error == 0) {
737296341Sdelphij		if (flags & MSG_PEEK) {
738296341Sdelphij			if (controlp)
739296341Sdelphij				*controlp = m_copy(m, 0, m->m_len);
740109998Smarkm			m = m->m_next;
741296341Sdelphij		} else {
742109998Smarkm			sbfree(&so->so_rcv, m);
743296341Sdelphij			if (controlp) {
744296341Sdelphij				if (pr->pr_domain->dom_externalize &&
745296341Sdelphij				    mtod(m, struct cmsghdr *)->cmsg_type ==
746296341Sdelphij				    SCM_RIGHTS)
747296341Sdelphij				   error = (*pr->pr_domain->dom_externalize)(m);
748296341Sdelphij				*controlp = m;
749296341Sdelphij				so->so_rcv.sb_mb = m->m_next;
750296341Sdelphij				m->m_next = 0;
751296341Sdelphij				m = so->so_rcv.sb_mb;
752109998Smarkm			} else {
753296341Sdelphij				MFREE(m, so->so_rcv.sb_mb);
754109998Smarkm				m = so->so_rcv.sb_mb;
755296341Sdelphij			}
756296341Sdelphij		}
757296341Sdelphij		if (controlp) {
758296341Sdelphij			orig_resid = 0;
759296341Sdelphij			controlp = &(*controlp)->m_next;
760296341Sdelphij		}
761296341Sdelphij	}
762296341Sdelphij	if (m) {
763109998Smarkm		if ((flags & MSG_PEEK) == 0)
764296341Sdelphij			m->m_nextpkt = nextrecord;
765296341Sdelphij		type = m->m_type;
766109998Smarkm		if (type == MT_OOBDATA)
767296341Sdelphij			flags |= MSG_OOB;
768296341Sdelphij	}
769296341Sdelphij	moff = 0;
770296341Sdelphij	offset = 0;
771296341Sdelphij	while (m && uio->uio_resid > 0 && error == 0) {
772296341Sdelphij		if (m->m_type == MT_OOBDATA) {
773296341Sdelphij			if (type != MT_OOBDATA)
774296341Sdelphij				break;
775296341Sdelphij		} else if (type == MT_OOBDATA)
776296341Sdelphij			break;
777296341Sdelphij		else
778296341Sdelphij		    KASSERT(m->m_type == MT_DATA || m->m_type == MT_HEADER,
779296341Sdelphij			("receive 3"));
780296341Sdelphij		so->so_state &= ~SS_RCVATMARK;
781296341Sdelphij		len = uio->uio_resid;
782296341Sdelphij		if (so->so_oobmark && len > so->so_oobmark - offset)
783109998Smarkm			len = so->so_oobmark - offset;
784296341Sdelphij		if (len > m->m_len - moff)
785296341Sdelphij			len = m->m_len - moff;
786296341Sdelphij		/*
787296341Sdelphij		 * If mp is set, just pass back the mbufs.
788296341Sdelphij		 * Otherwise copy them out via the uio, then free.
789296341Sdelphij		 * Sockbuf must be consistent here (points to current mbuf,
790296341Sdelphij		 * it points to next record) when we drop priority;
791296341Sdelphij		 * we must note any additions to the sockbuf when we
792296341Sdelphij		 * block interrupts again.
793296341Sdelphij		 */
794109998Smarkm		if (mp == 0) {
795296341Sdelphij			splx(s);
796109998Smarkm			error = uiomove(mtod(m, caddr_t) + moff, (int)len, uio);
797296341Sdelphij			s = splnet();
798296341Sdelphij			if (error)
799296341Sdelphij				goto release;
800296341Sdelphij		} else
801109998Smarkm			uio->uio_resid -= len;
802296341Sdelphij		if (len == m->m_len - moff) {
803296341Sdelphij			if (m->m_flags & M_EOR)
804296341Sdelphij				flags |= MSG_EOR;
805296341Sdelphij			if (flags & MSG_PEEK) {
806296341Sdelphij				m = m->m_next;
807296341Sdelphij				moff = 0;
808296341Sdelphij			} else {
809296341Sdelphij				nextrecord = m->m_nextpkt;
810296341Sdelphij				sbfree(&so->so_rcv, m);
811109998Smarkm				if (mp) {
812296341Sdelphij					*mp = m;
813296341Sdelphij					mp = &m->m_next;
814296341Sdelphij					so->so_rcv.sb_mb = m = m->m_next;
815109998Smarkm					*mp = (struct mbuf *)0;
816296341Sdelphij				} else {
817296341Sdelphij					MFREE(m, so->so_rcv.sb_mb);
818296341Sdelphij					m = so->so_rcv.sb_mb;
819296341Sdelphij				}
820296341Sdelphij				if (m)
821109998Smarkm					m->m_nextpkt = nextrecord;
822296341Sdelphij			}
823109998Smarkm		} else {
824296341Sdelphij			if (flags & MSG_PEEK)
825296341Sdelphij				moff += len;
826109998Smarkm			else {
827296341Sdelphij				if (mp)
828109998Smarkm					*mp = m_copym(m, 0, len, M_WAIT);
829296341Sdelphij				m->m_data += len;
830296341Sdelphij				m->m_len -= len;
831296341Sdelphij				so->so_rcv.sb_cc -= len;
832296341Sdelphij			}
833296341Sdelphij		}
834296341Sdelphij		if (so->so_oobmark) {
835296341Sdelphij			if ((flags & MSG_PEEK) == 0) {
836296341Sdelphij				so->so_oobmark -= len;
837296341Sdelphij				if (so->so_oobmark == 0) {
838296341Sdelphij					so->so_state |= SS_RCVATMARK;
839296341Sdelphij					break;
840296341Sdelphij				}
841296341Sdelphij			} else {
842296341Sdelphij				offset += len;
843296341Sdelphij				if (offset == so->so_oobmark)
844296341Sdelphij					break;
845296341Sdelphij			}
846296341Sdelphij		}
847296341Sdelphij		if (flags & MSG_EOR)
848296341Sdelphij			break;
849296341Sdelphij		/*
850296341Sdelphij		 * If the MSG_WAITALL flag is set (for non-atomic socket),
851109998Smarkm		 * we must not quit until "uio->uio_resid == 0" or an error
852296341Sdelphij		 * termination.  If a signal/timeout occurs, return
853296341Sdelphij		 * with a short count but without error.
854296341Sdelphij		 * Keep sockbuf locked against other readers.
855296341Sdelphij		 */
856296341Sdelphij		while (flags & MSG_WAITALL && m == 0 && uio->uio_resid > 0 &&
857296341Sdelphij		    !sosendallatonce(so) && !nextrecord) {
858296341Sdelphij			if (so->so_error || so->so_state & SS_CANTRCVMORE)
859296341Sdelphij				break;
860109998Smarkm			error = sbwait(&so->so_rcv);
861109998Smarkm			if (error) {
862296341Sdelphij				sbunlock(&so->so_rcv);
863296341Sdelphij				splx(s);
864296341Sdelphij				return (0);
865296341Sdelphij			}
866296341Sdelphij			m = so->so_rcv.sb_mb;
867296341Sdelphij			if (m)
868296341Sdelphij				nextrecord = m->m_nextpkt;
869296341Sdelphij		}
870296341Sdelphij	}
871296341Sdelphij
872296341Sdelphij	if (m && pr->pr_flags & PR_ATOMIC) {
873296341Sdelphij		flags |= MSG_TRUNC;
874296341Sdelphij		if ((flags & MSG_PEEK) == 0)
875296341Sdelphij			(void) sbdroprecord(&so->so_rcv);
876296341Sdelphij	}
877296341Sdelphij	if ((flags & MSG_PEEK) == 0) {
878296341Sdelphij		if (m == 0)
879296341Sdelphij			so->so_rcv.sb_mb = nextrecord;
880296341Sdelphij		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
881109998Smarkm			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
882296341Sdelphij	}
883296341Sdelphij	if (orig_resid == uio->uio_resid && orig_resid &&
884296341Sdelphij	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
885296341Sdelphij		sbunlock(&so->so_rcv);
886109998Smarkm		splx(s);
887296341Sdelphij		goto restart;
888296341Sdelphij	}
889296341Sdelphij
890296341Sdelphij	if (flagsp)
891296341Sdelphij		*flagsp |= flags;
892296341Sdelphijrelease:
893296341Sdelphij	sbunlock(&so->so_rcv);
894296341Sdelphij	splx(s);
895296341Sdelphij	return (error);
896296341Sdelphij}
897296341Sdelphij
898296341Sdelphijint
899296341Sdelphijsoshutdown(so, how)
900296341Sdelphij	register struct socket *so;
901296341Sdelphij	register int how;
902296341Sdelphij{
903296341Sdelphij	register struct protosw *pr = so->so_proto;
904296341Sdelphij
905296341Sdelphij	how++;
906296341Sdelphij	if (how & FREAD)
907296341Sdelphij		sorflush(so);
908296341Sdelphij	if (how & FWRITE)
909296341Sdelphij		return ((*pr->pr_usrreqs->pru_shutdown)(so));
910296341Sdelphij	return (0);
911296341Sdelphij}
912296341Sdelphij
/*
 * Forcibly flush a socket's receive buffer (used e.g. when the read
 * side is shut down).  All queued data is discarded; any in-flight
 * access rights are disposed of via the domain's dom_dispose hook.
 */
void
sorflush(so)
	register struct socket *so;
{
	register struct sockbuf *sb = &so->so_rcv;
	register struct protosw *pr = so->so_proto;
	register int s;
	struct sockbuf asb;

	/*
	 * Take the sockbuf lock with SB_NOINTR set so the sleep in
	 * sblock() cannot be interrupted by a signal: the flush must
	 * not be abandoned halfway.
	 */
	sb->sb_flags |= SB_NOINTR;
	(void) sblock(sb, M_WAITOK);
	s = splimp();
	socantrcvmore(so);
	sbunlock(sb);
	/*
	 * Snapshot the buffer and zero the original while at splimp,
	 * so interrupt-level code sees an empty, consistent sockbuf;
	 * the snapshot is torn down below at normal priority.
	 */
	asb = *sb;
	bzero((caddr_t)sb, sizeof (*sb));
	splx(s);
	/* Let the domain reclaim passed rights before the mbufs go away. */
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
	sbrelease(&asb, so);
}
934296341Sdelphij
935109998Smarkm/*
936296341Sdelphij * Perhaps this routine, and sooptcopyout(), below, ought to come in
937296341Sdelphij * an additional variant to handle the case where the option value needs
938296341Sdelphij * to be some kind of integer, but not a specific size.
939109998Smarkm * In addition to their use here, these functions are also called by the
940296341Sdelphij * protocol-level pr_ctloutput() routines.
941296341Sdelphij */
942296341Sdelphijint
943296341Sdelphijsooptcopyin(sopt, buf, len, minlen)
944109998Smarkm	struct	sockopt *sopt;
945296341Sdelphij	void	*buf;
946296341Sdelphij	size_t	len;
947296341Sdelphij	size_t	minlen;
948296341Sdelphij{
949296341Sdelphij	size_t	valsize;
950109998Smarkm
951296341Sdelphij	/*
952296341Sdelphij	 * If the user gives us more than we wanted, we ignore it,
953296341Sdelphij	 * but if we don't get the minimum length the caller
954296341Sdelphij	 * wants, we return EINVAL.  On success, sopt->sopt_valsize
955296341Sdelphij	 * is set to however much we actually retrieved.
956296341Sdelphij	 */
957296341Sdelphij	if ((valsize = sopt->sopt_valsize) < minlen)
958296341Sdelphij		return EINVAL;
959296341Sdelphij	if (valsize > len)
960109998Smarkm		sopt->sopt_valsize = valsize = len;
961296341Sdelphij
962296341Sdelphij	if (sopt->sopt_p != 0)
963296341Sdelphij		return (copyin(sopt->sopt_val, buf, valsize));
964109998Smarkm
965296341Sdelphij	bcopy(sopt->sopt_val, buf, valsize);
966296341Sdelphij	return 0;
967296341Sdelphij}
968296341Sdelphij
/*
 * Set a socket option.  SOL_SOCKET-level options are handled here;
 * any other level is passed down to the protocol's pr_ctloutput.
 * Returns 0 on success or an errno value.
 */
int
sosetopt(so, sopt)
	struct socket *so;
	struct sockopt *sopt;
{
	int	error, optval;
	struct	linger l;
	struct	timeval tv;
	u_long  val;

	error = 0;
	if (sopt->sopt_level != SOL_SOCKET) {
		/* Not a socket-level option: defer to the protocol. */
		if (so->so_proto && so->so_proto->pr_ctloutput)
			return ((*so->so_proto->pr_ctloutput)
				  (so, sopt));
		error = ENOPROTOOPT;
	} else {
		switch (sopt->sopt_name) {
		case SO_LINGER:
			error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
			if (error)
				goto bad;

			so->so_linger = l.l_linger;
			if (l.l_onoff)
				so->so_options |= SO_LINGER;
			else
				so->so_options &= ~SO_LINGER;
			break;

		/* Boolean options map directly onto so_options bits. */
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_DONTROUTE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
			error = sooptcopyin(sopt, &optval, sizeof optval,
					    sizeof optval);
			if (error)
				goto bad;
			if (optval)
				so->so_options |= sopt->sopt_name;
			else
				so->so_options &= ~sopt->sopt_name;
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
			error = sooptcopyin(sopt, &optval, sizeof optval,
					    sizeof optval);
			if (error)
				goto bad;

			/*
			 * Values < 1 make no sense for any of these
			 * options, so disallow them.
			 */
			if (optval < 1) {
				error = EINVAL;
				goto bad;
			}

			switch (sopt->sopt_name) {
			case SO_SNDBUF:
			case SO_RCVBUF:
				if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
				    &so->so_snd : &so->so_rcv, (u_long)optval,
				    so, curproc) == 0) {
					error = ENOBUFS;
					goto bad;
				}
				break;

			/*
			 * Make sure the low-water is never greater than
			 * the high-water.
			 */
			case SO_SNDLOWAT:
				so->so_snd.sb_lowat =
				    (optval > so->so_snd.sb_hiwat) ?
				    so->so_snd.sb_hiwat : optval;
				break;
			case SO_RCVLOWAT:
				so->so_rcv.sb_lowat =
				    (optval > so->so_rcv.sb_hiwat) ?
				    so->so_rcv.sb_hiwat : optval;
				break;
			}
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			error = sooptcopyin(sopt, &tv, sizeof tv,
					    sizeof tv);
			if (error)
				goto bad;

			/* assert(hz > 0); */
			if (tv.tv_sec < 0 || tv.tv_sec > SHRT_MAX / hz ||
			    tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
				error = EDOM;
				goto bad;
			}
			/* assert(tick > 0); */
			/* assert(ULONG_MAX - SHRT_MAX >= 1000000); */
			/*
			 * Convert the timeval to clock ticks; the result
			 * must fit in SHRT_MAX (checked below).
			 */
			val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick;
			if (val > SHRT_MAX) {
				error = EDOM;
				goto bad;
			}

			switch (sopt->sopt_name) {
			case SO_SNDTIMEO:
				so->so_snd.sb_timeo = val;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.sb_timeo = val;
				break;
			}
			break;

		default:
			error = ENOPROTOOPT;
			break;
		}
		/*
		 * Also show socket-level options to the protocol;
		 * its return value is deliberately ignored.
		 */
		if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) {
			(void) ((*so->so_proto->pr_ctloutput)
				  (so, sopt));
		}
	}
bad:
	return (error);
}
1107109998Smarkm
1108109998Smarkm/* Helper routine for getsockopt */
1109296341Sdelphijint
1110296341Sdelphijsooptcopyout(sopt, buf, len)
1111296341Sdelphij	struct	sockopt *sopt;
1112296341Sdelphij	void	*buf;
1113296341Sdelphij	size_t	len;
1114296341Sdelphij{
1115296341Sdelphij	int	error;
1116296341Sdelphij	size_t	valsize;
1117296341Sdelphij
1118296341Sdelphij	error = 0;
1119296341Sdelphij
1120296341Sdelphij	/*
1121296341Sdelphij	 * Documented get behavior is that we always return a value,
1122296341Sdelphij	 * possibly truncated to fit in the user's buffer.
1123296341Sdelphij	 * Traditional behavior is that we always tell the user
1124109998Smarkm	 * precisely how much we copied, rather than something useful
1125296341Sdelphij	 * like the total amount we had available for her.
1126296341Sdelphij	 * Note that this interface is not idempotent; the entire answer must
1127296341Sdelphij	 * generated ahead of time.
1128296341Sdelphij	 */
1129296341Sdelphij	valsize = min(len, sopt->sopt_valsize);
1130109998Smarkm	sopt->sopt_valsize = valsize;
1131296341Sdelphij	if (sopt->sopt_val != 0) {
1132109998Smarkm		if (sopt->sopt_p != 0)
1133296341Sdelphij			error = copyout(buf, sopt->sopt_val, valsize);
1134296341Sdelphij		else
1135296341Sdelphij			bcopy(buf, sopt->sopt_val, valsize);
1136296341Sdelphij	}
1137296341Sdelphij	return error;
1138109998Smarkm}
1139296341Sdelphij
/*
 * Get a socket option.  SOL_SOCKET-level options are answered here;
 * any other level is passed down to the protocol's pr_ctloutput.
 * Returns 0 on success or an errno value.
 */
int
sogetopt(so, sopt)
	struct socket *so;
	struct sockopt *sopt;
{
	int	error, optval;
	struct	linger l;
	struct	timeval tv;

	error = 0;
	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput) {
			return ((*so->so_proto->pr_ctloutput)
				  (so, sopt));
		} else
			return (ENOPROTOOPT);
	} else {
		switch (sopt->sopt_name) {
		case SO_LINGER:
			l.l_onoff = so->so_options & SO_LINGER;
			l.l_linger = so->so_linger;
			error = sooptcopyout(sopt, &l, sizeof l);
			break;

		/* Boolean options are bits in so_options. */
		case SO_USELOOPBACK:
		case SO_DONTROUTE:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
			optval = so->so_options & sopt->sopt_name;
integer:
			/* Common exit for all integer-valued options. */
			error = sooptcopyout(sopt, &optval, sizeof optval);
			break;

		case SO_TYPE:
			optval = so->so_type;
			goto integer;

		case SO_ERROR:
			/* Reading the pending error clears it. */
			optval = so->so_error;
			so->so_error = 0;
			goto integer;

		case SO_SNDBUF:
			optval = so->so_snd.sb_hiwat;
			goto integer;

		case SO_RCVBUF:
			optval = so->so_rcv.sb_hiwat;
			goto integer;

		case SO_SNDLOWAT:
			optval = so->so_snd.sb_lowat;
			goto integer;

		case SO_RCVLOWAT:
			optval = so->so_rcv.sb_lowat;
			goto integer;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			optval = (sopt->sopt_name == SO_SNDTIMEO ?
				  so->so_snd.sb_timeo : so->so_rcv.sb_timeo);

			/* Convert ticks back into a struct timeval. */
			tv.tv_sec = optval / hz;
			tv.tv_usec = (optval % hz) * tick;
			error = sooptcopyout(sopt, &tv, sizeof tv);
			break;

		default:
			error = ENOPROTOOPT;
			break;
		}
		return (error);
	}
}
1220296341Sdelphij
1221296341Sdelphij/* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */
1222194206Ssimonint
1223296341Sdelphijsoopt_getm(struct sockopt *sopt, struct mbuf **mp)
1224296341Sdelphij{
1225296341Sdelphij	struct mbuf *m, *m_prev;
1226296341Sdelphij	int sopt_size = sopt->sopt_valsize;
1227194206Ssimon
1228296341Sdelphij	MGET(m, sopt->sopt_p ? M_WAIT : M_DONTWAIT, MT_DATA);
1229296341Sdelphij	if (m == 0)
1230296341Sdelphij		return ENOBUFS;
1231296341Sdelphij	if (sopt_size > MLEN) {
1232296341Sdelphij		MCLGET(m, sopt->sopt_p ? M_WAIT : M_DONTWAIT);
1233296341Sdelphij		if ((m->m_flags & M_EXT) == 0) {
1234296341Sdelphij			m_free(m);
1235296341Sdelphij			return ENOBUFS;
1236296341Sdelphij		}
1237296341Sdelphij		m->m_len = min(MCLBYTES, sopt_size);
1238296341Sdelphij	} else {
1239194206Ssimon		m->m_len = min(MLEN, sopt_size);
1240296341Sdelphij	}
1241296341Sdelphij	sopt_size -= m->m_len;
1242296341Sdelphij	*mp = m;
1243194206Ssimon	m_prev = m;
1244296341Sdelphij
1245296341Sdelphij	while (sopt_size) {
1246296341Sdelphij		MGET(m, sopt->sopt_p ? M_WAIT : M_DONTWAIT, MT_DATA);
1247296341Sdelphij		if (m == 0) {
1248296341Sdelphij			m_freem(*mp);
1249238405Sjkim			return ENOBUFS;
1250296341Sdelphij		}
1251296341Sdelphij		if (sopt_size > MLEN) {
1252238405Sjkim			MCLGET(m, sopt->sopt_p ? M_WAIT : M_DONTWAIT);
1253296341Sdelphij			if ((m->m_flags & M_EXT) == 0) {
1254296341Sdelphij				m_freem(*mp);
1255296341Sdelphij				return ENOBUFS;
1256296341Sdelphij			}
1257296341Sdelphij			m->m_len = min(MCLBYTES, sopt_size);
1258296341Sdelphij		} else {
1259296341Sdelphij			m->m_len = min(MLEN, sopt_size);
1260296341Sdelphij		}
1261296341Sdelphij		sopt_size -= m->m_len;
1262296341Sdelphij		m_prev->m_next = m;
1263296341Sdelphij		m_prev = m;
1264296341Sdelphij	}
1265296341Sdelphij	return 0;
1266296341Sdelphij}
1267296341Sdelphij
1268296341Sdelphij/* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
1269296341Sdelphijint
1270296341Sdelphijsoopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
1271296341Sdelphij{
1272296341Sdelphij	struct mbuf *m0 = m;
1273296341Sdelphij
1274296341Sdelphij	if (sopt->sopt_val == NULL)
1275296341Sdelphij		return 0;
1276296341Sdelphij	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
1277296341Sdelphij		if (sopt->sopt_p != NULL) {
1278296341Sdelphij			int error;
1279238405Sjkim
1280296341Sdelphij			error = copyin(sopt->sopt_val, mtod(m, char *),
1281296341Sdelphij				       m->m_len);
1282296341Sdelphij			if (error != 0) {
1283296341Sdelphij				m_freem(m0);
1284194206Ssimon				return(error);
1285296341Sdelphij			}
1286296341Sdelphij		} else
1287194206Ssimon			bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
1288194206Ssimon		sopt->sopt_valsize -= m->m_len;
1289296341Sdelphij		(caddr_t)sopt->sopt_val += m->m_len;
1290296341Sdelphij		m = m->m_next;
1291296341Sdelphij	}
1292296341Sdelphij	if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */
1293296341Sdelphij		panic("ip6_sooptmcopyin");
1294296341Sdelphij	return 0;
1295296341Sdelphij}
1296296341Sdelphij
1297296341Sdelphij/* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
1298296341Sdelphijint
1299296341Sdelphijsoopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
1300296341Sdelphij{
1301296341Sdelphij	struct mbuf *m0 = m;
1302296341Sdelphij	size_t valsize = 0;
1303296341Sdelphij
1304296341Sdelphij	if (sopt->sopt_val == NULL)
1305296341Sdelphij		return 0;
1306296341Sdelphij	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
1307296341Sdelphij		if (sopt->sopt_p != NULL) {
1308296341Sdelphij			int error;
1309296341Sdelphij
1310296341Sdelphij			error = copyout(mtod(m, char *), sopt->sopt_val,
1311296341Sdelphij				       m->m_len);
1312296341Sdelphij			if (error != 0) {
1313296341Sdelphij				m_freem(m0);
1314296341Sdelphij				return(error);
1315296341Sdelphij			}
1316296341Sdelphij		} else
1317296341Sdelphij			bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
1318296341Sdelphij	       sopt->sopt_valsize -= m->m_len;
1319296341Sdelphij	       (caddr_t)sopt->sopt_val += m->m_len;
1320296341Sdelphij	       valsize += m->m_len;
1321296341Sdelphij	       m = m->m_next;
1322296341Sdelphij	}
1323296341Sdelphij	if (m != NULL) {
1324194206Ssimon		/* enough soopt buffer should be given from user-land */
1325194206Ssimon		m_freem(m0);
1326		return(EINVAL);
1327	}
1328	sopt->sopt_valsize = valsize;
1329	return 0;
1330}
1331
/*
 * Out-of-band data has arrived on this socket: deliver SIGURG to the
 * registered owner (if any) and wake anyone selecting on the receive
 * buffer.
 */
void
sohasoutofband(so)
	register struct socket *so;
{
	if (so->so_sigio != NULL)
		pgsigio(so->so_sigio, SIGURG, 0);
	selwakeup(&so->so_rcv.sb_sel);
}
1340
1341int
1342sopoll(struct socket *so, int events, struct ucred *cred, struct proc *p)
1343{
1344	int revents = 0;
1345	int s = splnet();
1346
1347	if (events & (POLLIN | POLLRDNORM))
1348		if (soreadable(so))
1349			revents |= events & (POLLIN | POLLRDNORM);
1350
1351	if (events & (POLLOUT | POLLWRNORM))
1352		if (sowriteable(so))
1353			revents |= events & (POLLOUT | POLLWRNORM);
1354
1355	if (events & (POLLPRI | POLLRDBAND))
1356		if (so->so_oobmark || (so->so_state & SS_RCVATMARK))
1357			revents |= events & (POLLPRI | POLLRDBAND);
1358
1359	if (revents == 0) {
1360		if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
1361			selrecord(p, &so->so_rcv.sb_sel);
1362			so->so_rcv.sb_flags |= SB_SEL;
1363		}
1364
1365		if (events & (POLLOUT | POLLWRNORM)) {
1366			selrecord(p, &so->so_snd.sb_sel);
1367			so->so_snd.sb_flags |= SB_SEL;
1368		}
1369	}
1370
1371	splx(s);
1372	return (revents);
1373}
1374