uipc_socket.c revision 42902
1/*
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 3. All advertising materials mentioning features or use of this software
14 *    must display the following acknowledgement:
15 *	This product includes software developed by the University of
16 *	California, Berkeley and its contributors.
17 * 4. Neither the name of the University nor the names of its contributors
18 *    may be used to endorse or promote products derived from this software
19 *    without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 *
33 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
34 *	$Id: uipc_socket.c,v 1.49 1999/01/10 01:58:25 eivind Exp $
35 */
36
37#include <sys/param.h>
38#include <sys/systm.h>
39#include <sys/proc.h>
40#include <sys/fcntl.h>
41#include <sys/malloc.h>
42#include <sys/mbuf.h>
43#include <sys/domain.h>
44#include <sys/kernel.h>
45#include <sys/poll.h>
46#include <sys/protosw.h>
47#include <sys/socket.h>
48#include <sys/socketvar.h>
49#include <sys/resourcevar.h>
50#include <sys/signalvar.h>
51#include <sys/sysctl.h>
52#include <sys/uio.h>
53#include <vm/vm_zone.h>
54
55#include <machine/limits.h>
56
struct	vm_zone *socket_zone;	/* zone all struct sockets are allocated from */
so_gen_t	so_gencnt;	/* generation count for sockets */

MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

/* Upper bound on listen(2) backlogs; tunable via kern.ipc.somaxconn. */
static int somaxconn = SOMAXCONN;
SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW, &somaxconn,
	   0, "");
66
67/*
68 * Socket operation routines.
69 * These routines are called by the routines in
70 * sys_socket.c or from a system process, and
71 * implement the semantics of socket operations by
72 * switching out to the protocol specific routines.
73 */
74
75/*
76 * Get a socket structure from our zone, and initialize it.
77 * We don't implement `waitok' yet (see comments in uipc_domain.c).
78 * Note that it would probably be better to allocate socket
79 * and PCB at the same time, but I'm not convinced that all
80 * the protocols can be easily modified to do this.
81 */
82struct socket *
83soalloc(waitok)
84	int waitok;
85{
86	struct socket *so;
87
88	so = zalloci(socket_zone);
89	if (so) {
90		/* XXX race condition for reentrant kernel */
91		bzero(so, sizeof *so);
92		so->so_gencnt = ++so_gencnt;
93		so->so_zone = socket_zone;
94	}
95	return so;
96}
97
/*
 * Create a new socket: find the protocol switch entry for the
 * (dom, type, proto) triple, allocate a socket, and attach the
 * protocol.  On success *aso points at the new socket; on failure
 * an errno value is returned and nothing is allocated.
 */
int
socreate(dom, aso, type, proto, p)
	int dom;
	struct socket **aso;
	register int type;
	int proto;
	struct proc *p;
{
	register struct protosw *prp;
	register struct socket *so;
	register int error;

	/* An explicit protocol wins; otherwise select by socket type. */
	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);
	if (prp == 0 || prp->pr_usrreqs->pru_attach == 0)
		return (EPROTONOSUPPORT);
	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = soalloc(p != 0);
	if (so == 0)
		return (ENOBUFS);

	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = type;
	if (p != 0)
		so->so_uid = p->p_ucred->cr_uid;
	so->so_proto = prp;
	error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
	if (error) {
		/*
		 * Pretend no descriptor references remain so that
		 * sofree() will actually release the socket.
		 */
		so->so_state |= SS_NOFDREF;
		sofree(so);
		return (error);
	}
	*aso = so;
	return (0);
}
137
138int
139sobind(so, nam, p)
140	struct socket *so;
141	struct sockaddr *nam;
142	struct proc *p;
143{
144	int s = splnet();
145	int error;
146
147	error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
148	splx(s);
149	return (error);
150}
151
152void
153sodealloc(so)
154	struct socket *so;
155{
156	so->so_gencnt = ++so_gencnt;
157	zfreei(so->so_zone, so);
158}
159
/*
 * listen(2): ask the protocol to begin listening, then mark the
 * socket as accepting connections and set the queue limit.  A
 * negative or over-large backlog is clamped to somaxconn.
 */
int
solisten(so, backlog, p)
	register struct socket *so;
	int backlog;
	struct proc *p;
{
	int s, error;

	s = splnet();
	error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
	if (error) {
		splx(s);
		return (error);
	}
	/* Only set SO_ACCEPTCONN if no completed connections are queued. */
	if (so->so_comp.tqh_first == NULL)
		so->so_options |= SO_ACCEPTCONN;
	if (backlog < 0 || backlog > somaxconn)
		backlog = somaxconn;
	so->so_qlimit = backlog;
	splx(s);
	return (0);
}
182
/*
 * Release a socket that is no longer referenced.  Does nothing
 * unless the protocol has detached (so_pcb == NULL) and no file
 * descriptor references remain (SS_NOFDREF).  A socket still on a
 * listening socket's incomplete or completed queue is unlinked
 * from there first.
 */
void
sofree(so)
	register struct socket *so;
{
	struct socket *head = so->so_head;

	if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0)
		return;
	if (head != NULL) {
		if (so->so_state & SS_INCOMP) {
			TAILQ_REMOVE(&head->so_incomp, so, so_list);
			head->so_incqlen--;
		} else if (so->so_state & SS_COMP) {
			TAILQ_REMOVE(&head->so_comp, so, so_list);
		} else {
			/* A socket with a head must be on one of the queues. */
			panic("sofree: not queued");
		}
		head->so_qlen--;
		so->so_state &= ~(SS_INCOMP|SS_COMP);
		so->so_head = NULL;
	}
	/* Free buffered send data, flush the receive side, then the socket. */
	sbrelease(&so->so_snd);
	sorflush(so);
	sodealloc(so);
}
208
209/*
210 * Close a socket on last file table reference removal.
211 * Initiate disconnect if connected.
212 * Free socket when disconnect complete.
213 */
int
soclose(so)
	register struct socket *so;
{
	int s = splnet();		/* conservative */
	int error = 0;

	/* Drop any SIGIO/SIGURG ownership registration. */
	funsetown(so->so_sigio);
	if (so->so_options & SO_ACCEPTCONN) {
		struct socket *sp, *sonext;

		/* Abort every connection still queued on this listener. */
		for (sp = so->so_incomp.tqh_first; sp != NULL; sp = sonext) {
			sonext = sp->so_list.tqe_next;
			(void) soabort(sp);
		}
		for (sp = so->so_comp.tqh_first; sp != NULL; sp = sonext) {
			sonext = sp->so_list.tqe_next;
			(void) soabort(sp);
		}
	}
	if (so->so_pcb == 0)
		goto discard;
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			/* Non-blocking socket mid-disconnect: don't wait. */
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO))
				goto drop;
			/*
			 * Linger: sleep until the disconnect completes, a
			 * signal arrives, or the so_linger bound expires.
			 */
			while (so->so_state & SS_ISCONNECTED) {
				error = tsleep((caddr_t)&so->so_timeo,
				    PSOCK | PCATCH, "soclos", so->so_linger);
				if (error)
					break;
			}
		}
	}
drop:
	if (so->so_pcb) {
		/* Detach the protocol; preserve the first error seen. */
		int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
		if (error == 0)
			error = error2;
	}
discard:
	if (so->so_state & SS_NOFDREF)
		panic("soclose: NOFDREF");
	so->so_state |= SS_NOFDREF;
	sofree(so);
	splx(s);
	return (error);
}
268
269/*
270 * Must be called at splnet...
271 */
272int
273soabort(so)
274	struct socket *so;
275{
276
277	return (*so->so_proto->pr_usrreqs->pru_abort)(so);
278}
279
280int
281soaccept(so, nam)
282	register struct socket *so;
283	struct sockaddr **nam;
284{
285	int s = splnet();
286	int error;
287
288	if ((so->so_state & SS_NOFDREF) == 0)
289		panic("soaccept: !NOFDREF");
290	so->so_state &= ~SS_NOFDREF;
291	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
292	splx(s);
293	return (error);
294}
295
/*
 * Initiate a connection to `nam'.  Not permitted on a listening
 * socket.  Connection-based protocols may only connect once;
 * connectionless ones are implicitly disconnected first.
 */
int
soconnect(so, nam, p)
	register struct socket *so;
	struct sockaddr *nam;
	struct proc *p;
{
	int s;
	int error;

	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	s = splnet();
	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnect(so))))
		error = EISCONN;
	else
		error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, p);
	splx(s);
	return (error);
}
323
324int
325soconnect2(so1, so2)
326	register struct socket *so1;
327	struct socket *so2;
328{
329	int s = splnet();
330	int error;
331
332	error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
333	splx(s);
334	return (error);
335}
336
337int
338sodisconnect(so)
339	register struct socket *so;
340{
341	int s = splnet();
342	int error;
343
344	if ((so->so_state & SS_ISCONNECTED) == 0) {
345		error = ENOTCONN;
346		goto bad;
347	}
348	if (so->so_state & SS_ISDISCONNECTING) {
349		error = EALREADY;
350		goto bad;
351	}
352	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
353bad:
354	splx(s);
355	return (error);
356}
357
/* Map MSG_DONTWAIT onto the non-blocking sblock() wait flag. */
#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
359/*
360 * Send on a socket.
361 * If send must go all at once and message is larger than
362 * send buffering, then hard error.
363 * Lock against other senders.
364 * If must go all at once and not enough room now, then
365 * inform user that this would block and do nothing.
366 * Otherwise, if nonblocking, send as much as possible.
367 * The data to be sent is described by "uio" if nonzero,
368 * otherwise by the mbuf chain "top" (which must be null
369 * if uio is not).  Data provided in mbuf chain must be small
370 * enough to send all at once.
371 *
372 * Returns nonzero on error, timeout or signal; callers
373 * must check for short counts if EINTR/ERESTART are returned.
374 * Data and control buffers are freed on return.
375 */
int
sosend(so, addr, uio, top, control, flags, p)
	register struct socket *so;
	struct sockaddr *addr;
	struct uio *uio;
	struct mbuf *top;
	struct mbuf *control;
	int flags;
	struct proc *p;
{
	struct mbuf **mp;
	register struct mbuf *m;
	register long space, len, resid;
	int clen = 0, error, s, dontroute, mlen;
	int atomic = sosendallatonce(so) || top;

	/* Data comes either from the uio or prepackaged in "top". */
	if (uio)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned.
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid.  On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
	 *
	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
	 * type sockets since that's an error.
	 */
	if (resid < 0 || so->so_type == SOCK_STREAM && (flags & MSG_EOR)) {
		error = EINVAL;
		goto out;
	}

	/* MSG_DONTROUTE only applies temporarily to atomic protocols. */
	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	if (p)
		p->p_stats->p_ru.ru_msgsnd++;
	if (control)
		clen = control->m_len;
/* Bail out of the locked region with `errno'. */
#define	snderr(errno)	{ error = errno; splx(s); goto release; }

restart:
	/* Serialize against other senders on this socket. */
	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out;
	do {
		s = splnet();
		if (so->so_state & SS_CANTSENDMORE)
			snderr(EPIPE);
		if (so->so_error) {
			/* Pending asynchronous error: report and clear it. */
			error = so->so_error;
			so->so_error = 0;
			splx(s);
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			/*
			 * `sendto' and `sendmsg' is allowed on a connection-
			 * based socket if it supports implied connect.
			 * Return ENOTCONN if not connected and no address is
			 * supplied.
			 */
			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
				    !(resid == 0 && clen != 0))
					snderr(ENOTCONN);
			} else if (addr == 0)
			    snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
				   ENOTCONN : EDESTADDRREQ);
		}
		space = sbspace(&so->so_snd);
		/* OOB data gets a little extra headroom. */
		if (flags & MSG_OOB)
			space += 1024;
		if ((atomic && resid > so->so_snd.sb_hiwat) ||
		    clen > so->so_snd.sb_hiwat)
			snderr(EMSGSIZE);
		/* Not enough room now: fail (non-blocking) or wait and retry. */
		if (space < resid + clen && uio &&
		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
			if (so->so_state & SS_NBIO)
				snderr(EWOULDBLOCK);
			sbunlock(&so->so_snd);
			error = sbwait(&so->so_snd);
			splx(s);
			if (error)
				goto out;
			goto restart;
		}
		splx(s);
		mp = &top;
		space -= clen;
		do {
		    if (uio == NULL) {
			/*
			 * Data is prepackaged in "top".
			 */
			resid = 0;
			if (flags & MSG_EOR)
				top->m_flags |= M_EOR;
		    } else do {
			/* Build an mbuf chain from the uio. */
			if (top == 0) {
				MGETHDR(m, M_WAIT, MT_DATA);
				mlen = MHLEN;
				m->m_pkthdr.len = 0;
				m->m_pkthdr.rcvif = (struct ifnet *)0;
			} else {
				MGET(m, M_WAIT, MT_DATA);
				mlen = MLEN;
			}
			if (resid >= MINCLSIZE) {
				MCLGET(m, M_WAIT);
				if ((m->m_flags & M_EXT) == 0)
					goto nopages;
				mlen = MCLBYTES;
				len = min(min(mlen, resid), space);
			} else {
nopages:
				len = min(min(mlen, resid), space);
				/*
				 * For datagram protocols, leave room
				 * for protocol headers in first mbuf.
				 */
				if (atomic && top == 0 && len < mlen)
					MH_ALIGN(m, len);
			}
			space -= len;
			error = uiomove(mtod(m, caddr_t), (int)len, uio);
			resid = uio->uio_resid;
			m->m_len = len;
			*mp = m;
			top->m_pkthdr.len += len;
			if (error)
				goto release;
			mp = &m->m_next;
			if (resid <= 0) {
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
				break;
			}
		    } while (space > 0 && atomic);
		    if (dontroute)
			    so->so_options |= SO_DONTROUTE;
		    s = splnet();				/* XXX */
		    error = (*so->so_proto->pr_usrreqs->pru_send)(so,
			(flags & MSG_OOB) ? PRUS_OOB :
			/*
			 * If the user set MSG_EOF, the protocol
			 * understands this flag and nothing left to
			 * send then use PRU_SEND_EOF instead of PRU_SEND.
			 */
			((flags & MSG_EOF) &&
			 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
			 (resid <= 0)) ?
				PRUS_EOF :
			/* If there is more to send set PRUS_MORETOCOME */
			(resid > 0) ? PRUS_MORETOCOME : 0,
			top, addr, control, p);
		    splx(s);
		    if (dontroute)
			    so->so_options &= ~SO_DONTROUTE;
		    /* The protocol now owns top/control; don't free them below. */
		    clen = 0;
		    control = 0;
		    top = 0;
		    mp = &top;
		    if (error)
			goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	sbunlock(&so->so_snd);
out:
	if (top)
		m_freem(top);
	if (control)
		m_freem(control);
	return (error);
}
557
558/*
559 * Implement receive operations on a socket.
560 * We depend on the way that records are added to the sockbuf
561 * by sbappend*.  In particular, each record (mbufs linked through m_next)
562 * must begin with an address if the protocol so specifies,
563 * followed by an optional mbuf or mbufs containing ancillary data,
564 * and then zero or more mbufs of data.
565 * In order to avoid blocking network interrupts for the entire time here,
566 * we splx() while doing the actual copy to user space.
567 * Although the sockbuf is locked, new data may still be appended,
568 * and thus we must maintain consistency of the sockbuf during that time.
569 *
570 * The caller may receive the data as a single mbuf chain by supplying
571 * an mbuf **mp0 for use in returning the chain.  The uio is then used
572 * only for the count in uio_resid.
573 */
int
soreceive(so, psa, uio, mp0, controlp, flagsp)
	register struct socket *so;
	struct sockaddr **psa;
	struct uio *uio;
	struct mbuf **mp0;
	struct mbuf **controlp;
	int *flagsp;
{
	register struct mbuf *m, **mp;
	register int flags, len, error, s, offset;
	struct protosw *pr = so->so_proto;
	struct mbuf *nextrecord;
	int moff, type = 0;
	int orig_resid = uio->uio_resid;

	mp = mp0;
	if (psa)
		*psa = 0;
	if (controlp)
		*controlp = 0;
	if (flagsp)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	/*
	 * Out-of-band data is fetched straight from the protocol,
	 * bypassing the receive buffer entirely.
	 */
	if (flags & MSG_OOB) {
		m = m_get(M_WAIT, MT_DATA);
		error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
		if (error)
			goto bad;
		do {
			error = uiomove(mtod(m, caddr_t),
			    (int) min(uio->uio_resid, m->m_len), uio);
			m = m_free(m);
		} while (uio->uio_resid && error == 0 && m);
bad:
		if (m)
			m_freem(m);
		return (error);
	}
	if (mp)
		*mp = (struct mbuf *)0;
	if (so->so_state & SS_ISCONFIRMING && uio->uio_resid)
		(*pr->pr_usrreqs->pru_rcvd)(so, 0);

restart:
	/* Lock the receive buffer against other readers. */
	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
	if (error)
		return (error);
	s = splnet();

	m = so->so_rcv.sb_mb;
	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark, or
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat).
	 *   3. MSG_DONTWAIT is not set
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
	 */
	if (m == 0 || (((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < uio->uio_resid) &&
	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
	    m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) {
		KASSERT(m != 0 || !so->so_rcv.sb_cc, ("receive 1"));
		if (so->so_error) {
			if (m)
				goto dontblock;
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
			if (m)
				goto dontblock;
			else
				goto release;
		}
		/* Pending OOB data or a complete record: deliver it now. */
		for (; m; m = m->m_next)
			if (m->m_type == MT_OOBDATA  || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio->uio_resid == 0)
			goto release;
		if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT)) {
			error = EWOULDBLOCK;
			goto release;
		}
		/* Wait for data to arrive, then start over. */
		sbunlock(&so->so_rcv);
		error = sbwait(&so->so_rcv);
		splx(s);
		if (error)
			return (error);
		goto restart;
	}
dontblock:
	if (uio->uio_procp)
		uio->uio_procp->p_stats->p_ru.ru_msgrcv++;
	nextrecord = m->m_nextpkt;
	/* An address-bearing protocol's record begins with a name mbuf. */
	if (pr->pr_flags & PR_ADDR) {
		KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
		orig_resid = 0;
		if (psa)
			*psa = dup_sockaddr(mtod(m, struct sockaddr *),
					    mp0 == 0);
		if (flags & MSG_PEEK) {
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			MFREE(m, so->so_rcv.sb_mb);
			m = so->so_rcv.sb_mb;
		}
	}
	/* Then any ancillary (control) data mbufs. */
	while (m && m->m_type == MT_CONTROL && error == 0) {
		if (flags & MSG_PEEK) {
			if (controlp)
				*controlp = m_copy(m, 0, m->m_len);
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			if (controlp) {
				if (pr->pr_domain->dom_externalize &&
				    mtod(m, struct cmsghdr *)->cmsg_type ==
				    SCM_RIGHTS)
				   error = (*pr->pr_domain->dom_externalize)(m);
				*controlp = m;
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = 0;
				m = so->so_rcv.sb_mb;
			} else {
				MFREE(m, so->so_rcv.sb_mb);
				m = so->so_rcv.sb_mb;
			}
		}
		if (controlp) {
			orig_resid = 0;
			controlp = &(*controlp)->m_next;
		}
	}
	if (m) {
		if ((flags & MSG_PEEK) == 0)
			m->m_nextpkt = nextrecord;
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
	}
	moff = 0;
	offset = 0;
	/* Main loop: move data mbufs out via the uio (or hand back via mp). */
	while (m && uio->uio_resid > 0 && error == 0) {
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
				break;
		} else if (type == MT_OOBDATA)
			break;
		else
		    KASSERT(m->m_type == MT_DATA || m->m_type == MT_HEADER,
			("receive 3"));
		so->so_state &= ~SS_RCVATMARK;
		len = uio->uio_resid;
		/* Stop short of the out-of-band mark. */
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * Sockbuf must be consistent here (points to current mbuf,
		 * it points to next record) when we drop priority;
		 * we must note any additions to the sockbuf when we
		 * block interrupts again.
		 */
		if (mp == 0) {
			splx(s);
			error = uiomove(mtod(m, caddr_t) + moff, (int)len, uio);
			s = splnet();
			if (error)
				goto release;
		} else
			uio->uio_resid -= len;
		if (len == m->m_len - moff) {
			/* Consumed the whole mbuf: advance to the next one. */
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(&so->so_rcv, m);
				if (mp) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = (struct mbuf *)0;
				} else {
					MFREE(m, so->so_rcv.sb_mb);
					m = so->so_rcv.sb_mb;
				}
				if (m)
					m->m_nextpkt = nextrecord;
			}
		} else {
			/* Partial mbuf consumed: adjust it in place. */
			if (flags & MSG_PEEK)
				moff += len;
			else {
				if (mp)
					*mp = m_copym(m, 0, len, M_WAIT);
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
			}
		}
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_state |= SS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket),
		 * we must not quit until "uio->uio_resid == 0" or an error
		 * termination.  If a signal/timeout occurs, return
		 * with a short count but without error.
		 * Keep sockbuf locked against other readers.
		 */
		while (flags & MSG_WAITALL && m == 0 && uio->uio_resid > 0 &&
		    !sosendallatonce(so) && !nextrecord) {
			if (so->so_error || so->so_state & SS_CANTRCVMORE)
				break;
			error = sbwait(&so->so_rcv);
			if (error) {
				sbunlock(&so->so_rcv);
				splx(s);
				return (0);
			}
			m = so->so_rcv.sb_mb;
			if (m)
				nextrecord = m->m_nextpkt;
		}
	}

	/* Atomic protocols deliver whole records: drop any remainder. */
	if (m && pr->pr_flags & PR_ATOMIC) {
		flags |= MSG_TRUNC;
		if ((flags & MSG_PEEK) == 0)
			(void) sbdroprecord(&so->so_rcv);
	}
	if ((flags & MSG_PEEK) == 0) {
		if (m == 0)
			so->so_rcv.sb_mb = nextrecord;
		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
	}
	/* Nothing was transferred and nothing definitive happened: retry. */
	if (orig_resid == uio->uio_resid && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
		sbunlock(&so->so_rcv);
		splx(s);
		goto restart;
	}

	if (flagsp)
		*flagsp |= flags;
release:
	sbunlock(&so->so_rcv);
	splx(s);
	return (error);
}
859
/*
 * shutdown(2): disable further receives and/or sends on a socket.
 * NOTE(review): `how' is incremented to map 0/1/2 onto the
 * FREAD/FWRITE bits; no range validation is done here — presumably
 * the caller has checked it.
 */
int
soshutdown(so, how)
	register struct socket *so;
	register int how;
{
	register struct protosw *pr = so->so_proto;

	how++;
	if (how & FREAD)
		sorflush(so);
	if (how & FWRITE)
		return ((*pr->pr_usrreqs->pru_shutdown)(so));
	return (0);
}
874
/*
 * Flush the receive side of a socket: mark it unable to receive
 * more, snapshot the receive buffer into a local copy, zero the
 * real one, then release the snapshot's mbufs (disposing of any
 * in-transit access rights first).
 */
void
sorflush(so)
	register struct socket *so;
{
	register struct sockbuf *sb = &so->so_rcv;
	register struct protosw *pr = so->so_proto;
	register int s;
	struct sockbuf asb;

	/* Take the buffer lock uninterruptibly; the flush must happen. */
	sb->sb_flags |= SB_NOINTR;
	(void) sblock(sb, M_WAITOK);
	s = splimp();
	socantrcvmore(so);
	sbunlock(sb);
	/* Snapshot and clear the sockbuf while interrupts are blocked. */
	asb = *sb;
	bzero((caddr_t)sb, sizeof (*sb));
	splx(s);
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
	sbrelease(&asb);
}
896
897/*
898 * Perhaps this routine, and sooptcopyout(), below, ought to come in
899 * an additional variant to handle the case where the option value needs
900 * to be some kind of integer, but not a specific size.
901 * In addition to their use here, these functions are also called by the
902 * protocol-level pr_ctloutput() routines.
903 */
904int
905sooptcopyin(sopt, buf, len, minlen)
906	struct	sockopt *sopt;
907	void	*buf;
908	size_t	len;
909	size_t	minlen;
910{
911	size_t	valsize;
912
913	/*
914	 * If the user gives us more than we wanted, we ignore it,
915	 * but if we don't get the minimum length the caller
916	 * wants, we return EINVAL.  On success, sopt->sopt_valsize
917	 * is set to however much we actually retrieved.
918	 */
919	if ((valsize = sopt->sopt_valsize) < minlen)
920		return EINVAL;
921	if (valsize > len)
922		sopt->sopt_valsize = valsize = len;
923
924	if (sopt->sopt_p != 0)
925		return (copyin(sopt->sopt_val, buf, valsize));
926
927	bcopy(sopt->sopt_val, buf, valsize);
928	return 0;
929}
930
/*
 * setsockopt(2) processing at the socket level.  Options outside
 * SOL_SOCKET are handed to the protocol's ctloutput routine;
 * socket-level options that succeed are additionally passed down
 * so the protocol can take note of them.
 */
int
sosetopt(so, sopt)
	struct socket *so;
	struct sockopt *sopt;
{
	int	error, optval;
	struct	linger l;
	struct	timeval tv;
	short	val;

	error = 0;
	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput)
			return ((*so->so_proto->pr_ctloutput)
				  (so, sopt));
		error = ENOPROTOOPT;
	} else {
		switch (sopt->sopt_name) {
		case SO_LINGER:
			error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
			if (error)
				goto bad;

			so->so_linger = l.l_linger;
			if (l.l_onoff)
				so->so_options |= SO_LINGER;
			else
				so->so_options &= ~SO_LINGER;
			break;

		/* Boolean options: set or clear the so_options bit. */
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_DONTROUTE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
			error = sooptcopyin(sopt, &optval, sizeof optval,
					    sizeof optval);
			if (error)
				goto bad;
			if (optval)
				so->so_options |= sopt->sopt_name;
			else
				so->so_options &= ~sopt->sopt_name;
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
			error = sooptcopyin(sopt, &optval, sizeof optval,
					    sizeof optval);
			if (error)
				goto bad;

			/*
			 * Values < 1 make no sense for any of these
			 * options, so disallow them.
			 */
			if (optval < 1) {
				error = EINVAL;
				goto bad;
			}

			switch (sopt->sopt_name) {
			case SO_SNDBUF:
			case SO_RCVBUF:
				if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
					      &so->so_snd : &so->so_rcv,
					      (u_long) optval) == 0) {
					error = ENOBUFS;
					goto bad;
				}
				break;

			/*
			 * Make sure the low-water is never greater than
			 * the high-water.
			 */
			case SO_SNDLOWAT:
				so->so_snd.sb_lowat =
				    (optval > so->so_snd.sb_hiwat) ?
				    so->so_snd.sb_hiwat : optval;
				break;
			case SO_RCVLOWAT:
				so->so_rcv.sb_lowat =
				    (optval > so->so_rcv.sb_hiwat) ?
				    so->so_rcv.sb_hiwat : optval;
				break;
			}
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			error = sooptcopyin(sopt, &tv, sizeof tv,
					    sizeof tv);
			if (error)
				goto bad;

			/* Convert to ticks; reject values overflowing a short. */
			if (tv.tv_sec > SHRT_MAX / hz - hz) {
				error = EDOM;
				goto bad;
			}
			val = tv.tv_sec * hz + tv.tv_usec / tick;

			switch (sopt->sopt_name) {
			case SO_SNDTIMEO:
				so->so_snd.sb_timeo = val;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.sb_timeo = val;
				break;
			}
			break;

		default:
			error = ENOPROTOOPT;
			break;
		}
		/* Let the protocol see successful socket-level options too. */
		if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) {
			(void) ((*so->so_proto->pr_ctloutput)
				  (so, sopt));
		}
	}
bad:
	return (error);
}
1061
1062/* Helper routine for getsockopt */
1063int
1064sooptcopyout(sopt, buf, len)
1065	struct	sockopt *sopt;
1066	void	*buf;
1067	size_t	len;
1068{
1069	int	error;
1070	size_t	valsize;
1071
1072	error = 0;
1073
1074	/*
1075	 * Documented get behavior is that we always return a value,
1076	 * possibly truncated to fit in the user's buffer.
1077	 * Traditional behavior is that we always tell the user
1078	 * precisely how much we copied, rather than something useful
1079	 * like the total amount we had available for her.
1080	 * Note that this interface is not idempotent; the entire answer must
1081	 * generated ahead of time.
1082	 */
1083	valsize = min(len, sopt->sopt_valsize);
1084	sopt->sopt_valsize = valsize;
1085	if (sopt->sopt_val != 0) {
1086		if (sopt->sopt_p != 0)
1087			error = copyout(buf, sopt->sopt_val, valsize);
1088		else
1089			bcopy(buf, sopt->sopt_val, valsize);
1090	}
1091	return error;
1092}
1093
/*
 * getsockopt(2) processing at the socket level.  Options outside
 * SOL_SOCKET are handed to the protocol's ctloutput routine.
 */
int
sogetopt(so, sopt)
	struct socket *so;
	struct sockopt *sopt;
{
	int	error, optval;
	struct	linger l;
	struct	timeval tv;

	error = 0;
	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput) {
			return ((*so->so_proto->pr_ctloutput)
				  (so, sopt));
		} else
			return (ENOPROTOOPT);
	} else {
		switch (sopt->sopt_name) {
		case SO_LINGER:
			l.l_onoff = so->so_options & SO_LINGER;
			l.l_linger = so->so_linger;
			error = sooptcopyout(sopt, &l, sizeof l);
			break;

		/* Boolean options report their so_options bit as an int. */
		case SO_USELOOPBACK:
		case SO_DONTROUTE:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
			optval = so->so_options & sopt->sopt_name;
integer:
			error = sooptcopyout(sopt, &optval, sizeof optval);
			break;

		case SO_TYPE:
			optval = so->so_type;
			goto integer;

		case SO_ERROR:
			/* Reading the pending error clears it. */
			optval = so->so_error;
			so->so_error = 0;
			goto integer;

		case SO_SNDBUF:
			optval = so->so_snd.sb_hiwat;
			goto integer;

		case SO_RCVBUF:
			optval = so->so_rcv.sb_hiwat;
			goto integer;

		case SO_SNDLOWAT:
			optval = so->so_snd.sb_lowat;
			goto integer;

		case SO_RCVLOWAT:
			optval = so->so_rcv.sb_lowat;
			goto integer;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			/* Convert the tick count back to a struct timeval. */
			optval = (sopt->sopt_name == SO_SNDTIMEO ?
				  so->so_snd.sb_timeo : so->so_rcv.sb_timeo);

			tv.tv_sec = optval / hz;
			tv.tv_usec = (optval % hz) * tick;
			error = sooptcopyout(sopt, &tv, sizeof tv);
			break;

		default:
			error = ENOPROTOOPT;
			break;
		}
		return (error);
	}
}
1174
/*
 * Called when out-of-band data arrives on a socket: deliver SIGURG
 * to the registered owner and wake up anyone selecting on receive.
 */
void
sohasoutofband(so)
	register struct socket *so;
{
	if (so->so_sigio != NULL)
		pgsigio(so->so_sigio, SIGURG, 0);
	selwakeup(&so->so_rcv.sb_sel);
}
1183
1184int
1185sopoll(struct socket *so, int events, struct ucred *cred, struct proc *p)
1186{
1187	int revents = 0;
1188	int s = splnet();
1189
1190	if (events & (POLLIN | POLLRDNORM))
1191		if (soreadable(so))
1192			revents |= events & (POLLIN | POLLRDNORM);
1193
1194	if (events & (POLLOUT | POLLWRNORM))
1195		if (sowriteable(so))
1196			revents |= events & (POLLOUT | POLLWRNORM);
1197
1198	if (events & (POLLPRI | POLLRDBAND))
1199		if (so->so_oobmark || (so->so_state & SS_RCVATMARK))
1200			revents |= events & (POLLPRI | POLLRDBAND);
1201
1202	if (revents == 0) {
1203		if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
1204			selrecord(p, &so->so_rcv.sb_sel);
1205			so->so_rcv.sb_flags |= SB_SEL;
1206		}
1207
1208		if (events & (POLLOUT | POLLWRNORM)) {
1209			selrecord(p, &so->so_snd.sb_sel);
1210			so->so_snd.sb_flags |= SB_SEL;
1211		}
1212	}
1213
1214	splx(s);
1215	return (revents);
1216}
1217