/* uipc_socket.c revision 38705 */
1/*
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 3. All advertising materials mentioning features or use of this software
14 *    must display the following acknowledgement:
15 *	This product includes software developed by the University of
16 *	California, Berkeley and its contributors.
17 * 4. Neither the name of the University nor the names of its contributors
18 *    may be used to endorse or promote products derived from this software
19 *    without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 *
33 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
34 *	$Id: uipc_socket.c,v 1.44 1998/08/31 15:34:55 wollman Exp $
35 */
36
37#include <sys/param.h>
38#include <sys/systm.h>
39#include <sys/proc.h>
40#include <sys/fcntl.h>
41#include <sys/malloc.h>
42#include <sys/mbuf.h>
43#include <sys/domain.h>
44#include <sys/kernel.h>
45#include <sys/poll.h>
46#include <sys/protosw.h>
47#include <sys/socket.h>
48#include <sys/socketvar.h>
49#include <sys/resourcevar.h>
50#include <sys/signalvar.h>
51#include <sys/sysctl.h>
52#include <sys/uio.h>
53#include <vm/vm_zone.h>
54
55#include <machine/limits.h>
56
/* Zone from which all socket structures are allocated (see soalloc()). */
struct	vm_zone *socket_zone;
so_gen_t	so_gencnt;	/* generation count for sockets */

MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

/* Upper bound on the listen(2) backlog; tunable via the sysctl below. */
static int somaxconn = SOMAXCONN;
SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW, &somaxconn,
	   0, "");
66
67/*
68 * Socket operation routines.
69 * These routines are called by the routines in
70 * sys_socket.c or from a system process, and
71 * implement the semantics of socket operations by
72 * switching out to the protocol specific routines.
73 */
74
75/*
76 * Get a socket structure from our zone, and initialize it.
77 * We don't implement `waitok' yet (see comments in uipc_domain.c).
78 * Note that it would probably be better to allocate socket
79 * and PCB at the same time, but I'm not convinced that all
80 * the protocols can be easily modified to do this.
81 */
82struct socket *
83soalloc(waitok)
84	int waitok;
85{
86	struct socket *so;
87
88	so = zalloci(socket_zone);
89	if (so) {
90		/* XXX race condition for reentrant kernel */
91		bzero(so, sizeof *so);
92		so->so_gencnt = ++so_gencnt;
93		so->so_zone = socket_zone;
94	}
95	return so;
96}
97
/*
 * Create a new socket of the given type in the given domain.
 * The protocol switch entry is looked up by explicit protocol number
 * when `proto' is nonzero, otherwise by socket type; the socket is
 * allocated and the protocol's pru_attach hook is called to set up
 * its control block.  `p' may be NULL for in-kernel consumers.
 *
 * Returns 0 on success with *aso pointing at the new socket, or
 * EPROTONOSUPPORT, EPROTOTYPE, ENOBUFS, or the pru_attach error.
 */
int
socreate(dom, aso, type, proto, p)
	int dom;
	struct socket **aso;
	register int type;
	int proto;
	struct proc *p;
{
	register struct protosw *prp;
	register struct socket *so;
	register int error;

	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);
	if (prp == 0 || prp->pr_usrreqs->pru_attach == 0)
		return (EPROTONOSUPPORT);
	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = soalloc(p != 0);
	if (so == 0)
		return (ENOBUFS);

	/* Accept queues used if this socket later listens. */
	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = type;
	if (p != 0)
		so->so_uid = p->p_ucred->cr_uid;
	so->so_proto = prp;
	error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
	if (error) {
		/*
		 * Attach failed: mark the socket as having no file
		 * reference so sofree() will actually release it.
		 */
		so->so_state |= SS_NOFDREF;
		sofree(so);
		return (error);
	}
	*aso = so;
	return (0);
}
137
138int
139sobind(so, nam, p)
140	struct socket *so;
141	struct sockaddr *nam;
142	struct proc *p;
143{
144	int s = splnet();
145	int error;
146
147	error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
148	splx(s);
149	return (error);
150}
151
152void
153sodealloc(so)
154	struct socket *so;
155{
156	so->so_gencnt = ++so_gencnt;
157	zfreei(so->so_zone, so);
158}
159
/*
 * Mark a socket as willing to accept connections.
 * pru_listen gives the protocol a chance to refuse (e.g. if the
 * socket is unbound).  On success SO_ACCEPTCONN is set (unless
 * completed connections are already queued) and the backlog limit is
 * installed, clamped to the somaxconn sysctl; a negative backlog is
 * also mapped to the maximum.  Runs at splnet.
 */
int
solisten(so, backlog, p)
	register struct socket *so;
	int backlog;
	struct proc *p;
{
	int s, error;

	s = splnet();
	error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
	if (error) {
		splx(s);
		return (error);
	}
	if (so->so_comp.tqh_first == NULL)
		so->so_options |= SO_ACCEPTCONN;
	if (backlog < 0 || backlog > somaxconn)
		backlog = somaxconn;
	so->so_qlimit = backlog;
	splx(s);
	return (0);
}
182
/*
 * Release a socket that has no remaining references.
 * A no-op unless the protocol control block is gone AND the last file
 * descriptor reference has been dropped (SS_NOFDREF).  A socket still
 * sitting on a listening socket's incomplete or complete accept queue
 * is unlinked from that queue first; a queued socket that is on
 * neither queue indicates corruption and panics.  Finally both socket
 * buffers are released and the socket returns to its zone.
 */
void
sofree(so)
	register struct socket *so;
{
	struct socket *head = so->so_head;

	if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0)
		return;
	if (head != NULL) {
		if (so->so_state & SS_INCOMP) {
			TAILQ_REMOVE(&head->so_incomp, so, so_list);
			head->so_incqlen--;
		} else if (so->so_state & SS_COMP) {
			TAILQ_REMOVE(&head->so_comp, so, so_list);
		} else {
			panic("sofree: not queued");
		}
		head->so_qlen--;
		so->so_state &= ~(SS_INCOMP|SS_COMP);
		so->so_head = NULL;
	}
	/* sorflush() handles so_rcv; so_snd is released directly. */
	sbrelease(&so->so_snd);
	sorflush(so);
	sodealloc(so);
}
208
209/*
210 * Close a socket on last file table reference removal.
211 * Initiate disconnect if connected.
212 * Free socket when disconnect complete.
213 */
int
soclose(so)
	register struct socket *so;
{
	int s = splnet();		/* conservative */
	int error = 0;

	/*
	 * A listening socket first aborts every connection still on its
	 * incomplete and completed accept queues.  The next pointer is
	 * saved before each soabort() since the entry is freed under us.
	 */
	if (so->so_options & SO_ACCEPTCONN) {
		struct socket *sp, *sonext;

		for (sp = so->so_incomp.tqh_first; sp != NULL; sp = sonext) {
			sonext = sp->so_list.tqe_next;
			(void) soabort(sp);
		}
		for (sp = so->so_comp.tqh_first; sp != NULL; sp = sonext) {
			sonext = sp->so_list.tqe_next;
			(void) soabort(sp);
		}
	}
	if (so->so_pcb == 0)
		goto discard;
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		/*
		 * With SO_LINGER set, wait (up to so_linger ticks, or
		 * interruptibly by a signal) for the disconnect to
		 * complete -- unless the socket is non-blocking.
		 */
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO))
				goto drop;
			while (so->so_state & SS_ISCONNECTED) {
				error = tsleep((caddr_t)&so->so_timeo,
				    PSOCK | PCATCH, "soclos", so->so_linger);
				if (error)
					break;
			}
		}
	}
drop:
	/* Detach the PCB; the first error encountered wins. */
	if (so->so_pcb) {
		int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
		if (error == 0)
			error = error2;
	}
discard:
	if (so->so_state & SS_NOFDREF)
		panic("soclose: NOFDREF");
	so->so_state |= SS_NOFDREF;
	sofree(so);
	splx(s);
	return (error);
}
267
268/*
269 * Must be called at splnet...
270 */
271int
272soabort(so)
273	struct socket *so;
274{
275
276	return (*so->so_proto->pr_usrreqs->pru_abort)(so);
277}
278
279int
280soaccept(so, nam)
281	register struct socket *so;
282	struct sockaddr **nam;
283{
284	int s = splnet();
285	int error;
286
287	if ((so->so_state & SS_NOFDREF) == 0)
288		panic("soaccept: !NOFDREF");
289	so->so_state &= ~SS_NOFDREF;
290	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
291	splx(s);
292	return (error);
293}
294
/*
 * Initiate a connection to the given address.
 * Not permitted on a listening socket (EOPNOTSUPP).  Runs at splnet.
 * Returns EISCONN for a second connect on a connection-oriented
 * protocol, the sodisconnect() error mapped to EISCONN if an implicit
 * disconnect fails, or the protocol's pru_connect result.
 */
int
soconnect(so, nam, p)
	register struct socket *so;
	struct sockaddr *nam;
	struct proc *p;
{
	int s;
	int error;

	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	s = splnet();
	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	/* NB: the sodisconnect() call is an intentional side effect. */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnect(so))))
		error = EISCONN;
	else
		error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, p);
	splx(s);
	return (error);
}
322
323int
324soconnect2(so1, so2)
325	register struct socket *so1;
326	struct socket *so2;
327{
328	int s = splnet();
329	int error;
330
331	error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
332	splx(s);
333	return (error);
334}
335
336int
337sodisconnect(so)
338	register struct socket *so;
339{
340	int s = splnet();
341	int error;
342
343	if ((so->so_state & SS_ISCONNECTED) == 0) {
344		error = ENOTCONN;
345		goto bad;
346	}
347	if (so->so_state & SS_ISDISCONNECTING) {
348		error = EALREADY;
349		goto bad;
350	}
351	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
352bad:
353	splx(s);
354	return (error);
355}
356
/* Map the caller's MSG_DONTWAIT flag onto the sblock() wait mode. */
#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
358/*
359 * Send on a socket.
360 * If send must go all at once and message is larger than
361 * send buffering, then hard error.
362 * Lock against other senders.
363 * If must go all at once and not enough room now, then
364 * inform user that this would block and do nothing.
365 * Otherwise, if nonblocking, send as much as possible.
366 * The data to be sent is described by "uio" if nonzero,
367 * otherwise by the mbuf chain "top" (which must be null
368 * if uio is not).  Data provided in mbuf chain must be small
369 * enough to send all at once.
370 *
371 * Returns nonzero on error, timeout or signal; callers
372 * must check for short counts if EINTR/ERESTART are returned.
373 * Data and control buffers are freed on return.
374 */
int
sosend(so, addr, uio, top, control, flags, p)
	register struct socket *so;
	struct sockaddr *addr;
	struct uio *uio;
	struct mbuf *top;
	struct mbuf *control;
	int flags;
	struct proc *p;
{
	struct mbuf **mp;
	register struct mbuf *m;
	register long space, len, resid;
	int clen = 0, error, s, dontroute, mlen;
	/* atomic: record must be handed to the protocol in one piece */
	int atomic = sosendallatonce(so) || top;

	if (uio)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned.
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid.  On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
	 *
	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
	 * type sockets since that's an error.
	 */
	if (resid < 0 || so->so_type == SOCK_STREAM && (flags & MSG_EOR)) {
		error = EINVAL;
		goto out;
	}

	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	if (p)
		p->p_stats->p_ru.ru_msgsnd++;
	if (control)
		clen = control->m_len;
/* Error exit that releases splnet and the send-buffer lock. */
#define	snderr(errno)	{ error = errno; splx(s); goto release; }

restart:
	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out;
	do {
		s = splnet();
		if (so->so_state & SS_CANTSENDMORE)
			snderr(EPIPE);
		/* Pending asynchronous error: report it once and clear. */
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			splx(s);
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			/*
			 * `sendto' and `sendmsg' is allowed on a connection-
			 * based socket if it supports implied connect.
			 * Return ENOTCONN if not connected and no address is
			 * supplied.
			 */
			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
				    !(resid == 0 && clen != 0))
					snderr(ENOTCONN);
			} else if (addr == 0)
			    snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
				   ENOTCONN : EDESTADDRREQ);
		}
		space = sbspace(&so->so_snd);
		/* OOB data is allowed to slightly overcommit the buffer. */
		if (flags & MSG_OOB)
			space += 1024;
		if ((atomic && resid > so->so_snd.sb_hiwat) ||
		    clen > so->so_snd.sb_hiwat)
			snderr(EMSGSIZE);
		/*
		 * Not enough room: fail for non-blocking sockets,
		 * otherwise drop the lock, sleep for space, and retry
		 * from the top.
		 */
		if (space < resid + clen && uio &&
		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
			if (so->so_state & SS_NBIO)
				snderr(EWOULDBLOCK);
			sbunlock(&so->so_snd);
			error = sbwait(&so->so_snd);
			splx(s);
			if (error)
				goto out;
			goto restart;
		}
		splx(s);
		mp = &top;
		space -= clen;
		do {
		    if (uio == NULL) {
			/*
			 * Data is prepackaged in "top".
			 */
			resid = 0;
			if (flags & MSG_EOR)
				top->m_flags |= M_EOR;
		    } else do {
			/* Build the chain: packet header first, then plain mbufs. */
			if (top == 0) {
				MGETHDR(m, M_WAIT, MT_DATA);
				mlen = MHLEN;
				m->m_pkthdr.len = 0;
				m->m_pkthdr.rcvif = (struct ifnet *)0;
			} else {
				MGET(m, M_WAIT, MT_DATA);
				mlen = MLEN;
			}
			if (resid >= MINCLSIZE) {
				MCLGET(m, M_WAIT);
				if ((m->m_flags & M_EXT) == 0)
					goto nopages;
				mlen = MCLBYTES;
				len = min(min(mlen, resid), space);
			} else {
nopages:
				len = min(min(mlen, resid), space);
				/*
				 * For datagram protocols, leave room
				 * for protocol headers in first mbuf.
				 */
				if (atomic && top == 0 && len < mlen)
					MH_ALIGN(m, len);
			}
			space -= len;
			error = uiomove(mtod(m, caddr_t), (int)len, uio);
			resid = uio->uio_resid;
			m->m_len = len;
			*mp = m;
			top->m_pkthdr.len += len;
			if (error)
				goto release;
			mp = &m->m_next;
			if (resid <= 0) {
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
				break;
			}
		    } while (space > 0 && atomic);
		    if (dontroute)
			    so->so_options |= SO_DONTROUTE;
		    s = splnet();				/* XXX */
		    error = (*so->so_proto->pr_usrreqs->pru_send)(so,
			(flags & MSG_OOB) ? PRUS_OOB :
			/*
			 * If the user set MSG_EOF, the protocol
			 * understands this flag and nothing left to
			 * send then use PRU_SEND_EOF instead of PRU_SEND.
			 */
			((flags & MSG_EOF) &&
			 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
			 (resid <= 0)) ?
				PRUS_EOF : 0,
			top, addr, control, p);
		    splx(s);
		    if (dontroute)
			    so->so_options &= ~SO_DONTROUTE;
		    /* Ownership of top/control passed to the protocol. */
		    clen = 0;
		    control = 0;
		    top = 0;
		    mp = &top;
		    if (error)
			goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	sbunlock(&so->so_snd);
out:
	/* On error, any chain/control not yet handed off is freed here. */
	if (top)
		m_freem(top);
	if (control)
		m_freem(control);
	return (error);
}
554
555/*
556 * Implement receive operations on a socket.
557 * We depend on the way that records are added to the sockbuf
558 * by sbappend*.  In particular, each record (mbufs linked through m_next)
559 * must begin with an address if the protocol so specifies,
560 * followed by an optional mbuf or mbufs containing ancillary data,
561 * and then zero or more mbufs of data.
562 * In order to avoid blocking network interrupts for the entire time here,
563 * we splx() while doing the actual copy to user space.
564 * Although the sockbuf is locked, new data may still be appended,
565 * and thus we must maintain consistency of the sockbuf during that time.
566 *
567 * The caller may receive the data as a single mbuf chain by supplying
568 * an mbuf **mp0 for use in returning the chain.  The uio is then used
569 * only for the count in uio_resid.
570 */
int
soreceive(so, psa, uio, mp0, controlp, flagsp)
	register struct socket *so;
	struct sockaddr **psa;
	struct uio *uio;
	struct mbuf **mp0;
	struct mbuf **controlp;
	int *flagsp;
{
	register struct mbuf *m, **mp;
	register int flags, len, error, s, offset;
	struct protosw *pr = so->so_proto;
	struct mbuf *nextrecord;
	int moff, type = 0;
	/* Used to detect that nothing was consumed, forcing a retry below. */
	int orig_resid = uio->uio_resid;

	mp = mp0;
	if (psa)
		*psa = 0;
	if (controlp)
		*controlp = 0;
	if (flagsp)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	/*
	 * Out-of-band data is fetched directly from the protocol via
	 * pru_rcvoob and copied out, bypassing the receive buffer.
	 */
	if (flags & MSG_OOB) {
		m = m_get(M_WAIT, MT_DATA);
		error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
		if (error)
			goto bad;
		do {
			error = uiomove(mtod(m, caddr_t),
			    (int) min(uio->uio_resid, m->m_len), uio);
			m = m_free(m);
		} while (uio->uio_resid && error == 0 && m);
bad:
		if (m)
			m_freem(m);
		return (error);
	}
	if (mp)
		*mp = (struct mbuf *)0;
	if (so->so_state & SS_ISCONFIRMING && uio->uio_resid)
		(*pr->pr_usrreqs->pru_rcvd)(so, 0);

restart:
	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
	if (error)
		return (error);
	s = splnet();

	m = so->so_rcv.sb_mb;
	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark, or
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat).
	 *   3. MSG_DONTWAIT is not set
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
	 */
	if (m == 0 || (((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < uio->uio_resid) &&
	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
	    m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) {
#ifdef DIAGNOSTIC
		if (m == 0 && so->so_rcv.sb_cc)
			panic("receive 1");
#endif
		if (so->so_error) {
			if (m)
				goto dontblock;
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
			if (m)
				goto dontblock;
			else
				goto release;
		}
		/* OOB data or a record end never waits for more data. */
		for (; m; m = m->m_next)
			if (m->m_type == MT_OOBDATA  || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio->uio_resid == 0)
			goto release;
		if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT)) {
			error = EWOULDBLOCK;
			goto release;
		}
		/* Sleep for data, then re-evaluate from the top. */
		sbunlock(&so->so_rcv);
		error = sbwait(&so->so_rcv);
		splx(s);
		if (error)
			return (error);
		goto restart;
	}
dontblock:
	if (uio->uio_procp)
		uio->uio_procp->p_stats->p_ru.ru_msgrcv++;
	nextrecord = m->m_nextpkt;
	/*
	 * For address-bearing protocols the record starts with an
	 * MT_SONAME mbuf, which is returned via *psa and consumed
	 * (unless peeking).
	 */
	if (pr->pr_flags & PR_ADDR) {
#ifdef DIAGNOSTIC
		if (m->m_type != MT_SONAME)
			panic("receive 1a");
#endif
		orig_resid = 0;
		if (psa)
			*psa = dup_sockaddr(mtod(m, struct sockaddr *),
					    mp0 == 0);
		if (flags & MSG_PEEK) {
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			MFREE(m, so->so_rcv.sb_mb);
			m = so->so_rcv.sb_mb;
		}
	}
	/*
	 * Next come any MT_CONTROL mbufs (ancillary data).  Passed file
	 * descriptors (SCM_RIGHTS) are externalized into the receiving
	 * process by the domain before being handed back.
	 */
	while (m && m->m_type == MT_CONTROL && error == 0) {
		if (flags & MSG_PEEK) {
			if (controlp)
				*controlp = m_copy(m, 0, m->m_len);
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			if (controlp) {
				if (pr->pr_domain->dom_externalize &&
				    mtod(m, struct cmsghdr *)->cmsg_type ==
				    SCM_RIGHTS)
				   error = (*pr->pr_domain->dom_externalize)(m);
				*controlp = m;
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = 0;
				m = so->so_rcv.sb_mb;
			} else {
				MFREE(m, so->so_rcv.sb_mb);
				m = so->so_rcv.sb_mb;
			}
		}
		if (controlp) {
			orig_resid = 0;
			controlp = &(*controlp)->m_next;
		}
	}
	if (m) {
		if ((flags & MSG_PEEK) == 0)
			m->m_nextpkt = nextrecord;
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
	}
	/*
	 * Main data loop: copy mbufs out through the uio (or hand them
	 * back through *mp), stopping at the OOB mark, a record end,
	 * or when the request is satisfied.
	 */
	moff = 0;
	offset = 0;
	while (m && uio->uio_resid > 0 && error == 0) {
		/* Never mix in-band and out-of-band data in one call. */
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
				break;
		} else if (type == MT_OOBDATA)
			break;
#ifdef DIAGNOSTIC
		else if (m->m_type != MT_DATA && m->m_type != MT_HEADER)
			panic("receive 3");
#endif
		so->so_state &= ~SS_RCVATMARK;
		len = uio->uio_resid;
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * Sockbuf must be consistent here (points to current mbuf,
		 * it points to next record) when we drop priority;
		 * we must note any additions to the sockbuf when we
		 * block interrupts again.
		 */
		if (mp == 0) {
			splx(s);
			error = uiomove(mtod(m, caddr_t) + moff, (int)len, uio);
			s = splnet();
			if (error)
				goto release;
		} else
			uio->uio_resid -= len;
		if (len == m->m_len - moff) {
			/* Consumed the whole mbuf: advance or unlink it. */
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(&so->so_rcv, m);
				if (mp) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = (struct mbuf *)0;
				} else {
					MFREE(m, so->so_rcv.sb_mb);
					m = so->so_rcv.sb_mb;
				}
				if (m)
					m->m_nextpkt = nextrecord;
			}
		} else {
			/* Partial mbuf: trim the consumed prefix in place. */
			if (flags & MSG_PEEK)
				moff += len;
			else {
				if (mp)
					*mp = m_copym(m, 0, len, M_WAIT);
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
			}
		}
		/* Stop exactly at the out-of-band mark. */
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_state |= SS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket),
		 * we must not quit until "uio->uio_resid == 0" or an error
		 * termination.  If a signal/timeout occurs, return
		 * with a short count but without error.
		 * Keep sockbuf locked against other readers.
		 */
		while (flags & MSG_WAITALL && m == 0 && uio->uio_resid > 0 &&
		    !sosendallatonce(so) && !nextrecord) {
			if (so->so_error || so->so_state & SS_CANTRCVMORE)
				break;
			error = sbwait(&so->so_rcv);
			if (error) {
				sbunlock(&so->so_rcv);
				splx(s);
				/* Deliberate: short count, no error (see above). */
				return (0);
			}
			m = so->so_rcv.sb_mb;
			if (m)
				nextrecord = m->m_nextpkt;
		}
	}

	/* An atomic protocol's unread record remainder is dropped. */
	if (m && pr->pr_flags & PR_ATOMIC) {
		flags |= MSG_TRUNC;
		if ((flags & MSG_PEEK) == 0)
			(void) sbdroprecord(&so->so_rcv);
	}
	if ((flags & MSG_PEEK) == 0) {
		if (m == 0)
			so->so_rcv.sb_mb = nextrecord;
		/* Tell the protocol data was taken (e.g. to open a window). */
		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
	}
	/* Nothing was transferred and nothing ended the record: retry. */
	if (orig_resid == uio->uio_resid && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
		sbunlock(&so->so_rcv);
		splx(s);
		goto restart;
	}

	if (flagsp)
		*flagsp |= flags;
release:
	sbunlock(&so->so_rcv);
	splx(s);
	return (error);
}
863
864int
865soshutdown(so, how)
866	register struct socket *so;
867	register int how;
868{
869	register struct protosw *pr = so->so_proto;
870
871	how++;
872	if (how & FREAD)
873		sorflush(so);
874	if (how & FWRITE)
875		return ((*pr->pr_usrreqs->pru_shutdown)(so));
876	return (0);
877}
878
/*
 * Flush and tear down a socket's receive buffer.
 * The buffer is locked uninterruptibly (SB_NOINTR), the socket is
 * marked unable to receive more, and the buffer is snapshotted into
 * `asb' and zeroed in place so concurrent arrivals see an empty
 * buffer.  Rights (passed descriptors) held in the old contents are
 * disposed of by the domain's dom_dispose hook before the mbufs are
 * finally released.
 */
void
sorflush(so)
	register struct socket *so;
{
	register struct sockbuf *sb = &so->so_rcv;
	register struct protosw *pr = so->so_proto;
	register int s;
	struct sockbuf asb;

	sb->sb_flags |= SB_NOINTR;
	(void) sblock(sb, M_WAITOK);
	s = splimp();
	socantrcvmore(so);
	sbunlock(sb);
	/* Snapshot then clear, all while interrupts are blocked. */
	asb = *sb;
	bzero((caddr_t)sb, sizeof (*sb));
	splx(s);
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
	sbrelease(&asb);
}
900
901/*
902 * Perhaps this routine, and sooptcopyout(), below, ought to come in
903 * an additional variant to handle the case where the option value needs
904 * to be some kind of integer, but not a specific size.
905 * In addition to their use here, these functions are also called by the
906 * protocol-level pr_ctloutput() routines.
907 */
908int
909sooptcopyin(sopt, buf, len, minlen)
910	struct	sockopt *sopt;
911	void	*buf;
912	size_t	len;
913	size_t	minlen;
914{
915	size_t	valsize;
916
917	/*
918	 * If the user gives us more than we wanted, we ignore it,
919	 * but if we don't get the minimum length the caller
920	 * wants, we return EINVAL.  On success, sopt->sopt_valsize
921	 * is set to however much we actually retrieved.
922	 */
923	if ((valsize = sopt->sopt_valsize) < minlen)
924		return EINVAL;
925	if (valsize > len)
926		sopt->sopt_valsize = valsize = len;
927
928	if (sopt->sopt_p != 0)
929		return (copyin(sopt->sopt_val, buf, valsize));
930
931	bcopy(sopt->sopt_val, buf, valsize);
932	return 0;
933}
934
935int
936sosetopt(so, sopt)
937	struct socket *so;
938	struct sockopt *sopt;
939{
940	int	error, optval;
941	struct	linger l;
942	struct	timeval tv;
943	short	val;
944
945	error = 0;
946	if (sopt->sopt_level != SOL_SOCKET) {
947		if (so->so_proto && so->so_proto->pr_ctloutput)
948			return ((*so->so_proto->pr_ctloutput)
949				  (so, sopt));
950		error = ENOPROTOOPT;
951	} else {
952		switch (sopt->sopt_name) {
953		case SO_LINGER:
954			error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
955			if (error)
956				goto bad;
957
958			so->so_linger = l.l_linger;
959			if (l.l_onoff)
960				so->so_options |= SO_LINGER;
961			else
962				so->so_options &= ~SO_LINGER;
963			break;
964
965		case SO_DEBUG:
966		case SO_KEEPALIVE:
967		case SO_DONTROUTE:
968		case SO_USELOOPBACK:
969		case SO_BROADCAST:
970		case SO_REUSEADDR:
971		case SO_REUSEPORT:
972		case SO_OOBINLINE:
973		case SO_TIMESTAMP:
974			error = sooptcopyin(sopt, &optval, sizeof optval,
975					    sizeof optval);
976			if (error)
977				goto bad;
978			if (optval)
979				so->so_options |= sopt->sopt_name;
980			else
981				so->so_options &= ~sopt->sopt_name;
982			break;
983
984		case SO_SNDBUF:
985		case SO_RCVBUF:
986		case SO_SNDLOWAT:
987		case SO_RCVLOWAT:
988			error = sooptcopyin(sopt, &optval, sizeof optval,
989					    sizeof optval);
990			if (error)
991				goto bad;
992
993			/*
994			 * Values < 1 make no sense for any of these
995			 * options, so disallow them.
996			 */
997			if (optval < 1) {
998				error = EINVAL;
999				goto bad;
1000			}
1001
1002			switch (sopt->sopt_name) {
1003			case SO_SNDBUF:
1004			case SO_RCVBUF:
1005				if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
1006					      &so->so_snd : &so->so_rcv,
1007					      (u_long) optval) == 0) {
1008					error = ENOBUFS;
1009					goto bad;
1010				}
1011				break;
1012
1013			/*
1014			 * Make sure the low-water is never greater than
1015			 * the high-water.
1016			 */
1017			case SO_SNDLOWAT:
1018				so->so_snd.sb_lowat =
1019				    (optval > so->so_snd.sb_hiwat) ?
1020				    so->so_snd.sb_hiwat : optval;
1021				break;
1022			case SO_RCVLOWAT:
1023				so->so_rcv.sb_lowat =
1024				    (optval > so->so_rcv.sb_hiwat) ?
1025				    so->so_rcv.sb_hiwat : optval;
1026				break;
1027			}
1028			break;
1029
1030		case SO_SNDTIMEO:
1031		case SO_RCVTIMEO:
1032			error = sooptcopyin(sopt, &tv, sizeof tv,
1033					    sizeof tv);
1034			if (error)
1035				goto bad;
1036
1037			if (tv.tv_sec > SHRT_MAX / hz - hz) {
1038				error = EDOM;
1039				goto bad;
1040			}
1041			val = tv.tv_sec * hz + tv.tv_usec / tick;
1042
1043			switch (sopt->sopt_name) {
1044			case SO_SNDTIMEO:
1045				so->so_snd.sb_timeo = val;
1046				break;
1047			case SO_RCVTIMEO:
1048				so->so_rcv.sb_timeo = val;
1049				break;
1050			}
1051			break;
1052
1053		default:
1054			error = ENOPROTOOPT;
1055			break;
1056		}
1057		if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) {
1058			(void) ((*so->so_proto->pr_ctloutput)
1059				  (so, sopt));
1060		}
1061	}
1062bad:
1063	return (error);
1064}
1065
1066/* Helper routine for getsockopt */
1067int
1068sooptcopyout(sopt, buf, len)
1069	struct	sockopt *sopt;
1070	void	*buf;
1071	size_t	len;
1072{
1073	int	error;
1074	size_t	valsize;
1075
1076	error = 0;
1077
1078	/*
1079	 * Documented get behavior is that we always return a value,
1080	 * possibly truncated to fit in the user's buffer.
1081	 * Traditional behavior is that we always tell the user
1082	 * precisely how much we copied, rather than something useful
1083	 * like the total amount we had available for her.
1084	 * Note that this interface is not idempotent; the entire answer must
1085	 * generated ahead of time.
1086	 */
1087	valsize = min(len, sopt->sopt_valsize);
1088	sopt->sopt_valsize = valsize;
1089	if (sopt->sopt_val != 0) {
1090		if (sopt->sopt_p != 0)
1091			error = copyout(buf, sopt->sopt_val, valsize);
1092		else
1093			bcopy(buf, sopt->sopt_val, valsize);
1094	}
1095	return error;
1096}
1097
/*
 * Get a socket option.  Options at a level other than SOL_SOCKET are
 * forwarded to the protocol's pr_ctloutput routine.  Socket-level
 * options are read out of the socket structure and returned via
 * sooptcopyout().  Returns 0, ENOPROTOOPT, or a copyout error.
 */
int
sogetopt(so, sopt)
	struct socket *so;
	struct sockopt *sopt;
{
	int	error, optval;
	struct	linger l;
	struct	timeval tv;

	error = 0;
	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput) {
			return ((*so->so_proto->pr_ctloutput)
				  (so, sopt));
		} else
			return (ENOPROTOOPT);
	} else {
		switch (sopt->sopt_name) {
		case SO_LINGER:
			l.l_onoff = so->so_options & SO_LINGER;
			l.l_linger = so->so_linger;
			error = sooptcopyout(sopt, &l, sizeof l);
			break;

		/* Boolean options: report the bit straight from so_options. */
		case SO_USELOOPBACK:
		case SO_DONTROUTE:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
			optval = so->so_options & sopt->sopt_name;
integer:
			/* Common exit for all integer-valued options. */
			error = sooptcopyout(sopt, &optval, sizeof optval);
			break;

		case SO_TYPE:
			optval = so->so_type;
			goto integer;

		case SO_ERROR:
			/* Reading the pending error clears it. */
			optval = so->so_error;
			so->so_error = 0;
			goto integer;

		case SO_SNDBUF:
			optval = so->so_snd.sb_hiwat;
			goto integer;

		case SO_RCVBUF:
			optval = so->so_rcv.sb_hiwat;
			goto integer;

		case SO_SNDLOWAT:
			optval = so->so_snd.sb_lowat;
			goto integer;

		case SO_RCVLOWAT:
			optval = so->so_rcv.sb_lowat;
			goto integer;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			optval = (sopt->sopt_name == SO_SNDTIMEO ?
				  so->so_snd.sb_timeo : so->so_rcv.sb_timeo);

			/* Convert the tick count back into a timeval. */
			tv.tv_sec = optval / hz;
			tv.tv_usec = (optval % hz) * tick;
			error = sooptcopyout(sopt, &tv, sizeof tv);
			break;

		default:
			error = ENOPROTOOPT;
			break;
		}
		return (error);
	}
}
1178
1179void
1180sohasoutofband(so)
1181	register struct socket *so;
1182{
1183	struct proc *p;
1184
1185	if (so->so_pgid < 0)
1186		gsignal(-so->so_pgid, SIGURG);
1187	else if (so->so_pgid > 0 && (p = pfind(so->so_pgid)) != 0)
1188		psignal(p, SIGURG);
1189	selwakeup(&so->so_rcv.sb_sel);
1190}
1191
1192int
1193sopoll(struct socket *so, int events, struct ucred *cred, struct proc *p)
1194{
1195	int revents = 0;
1196	int s = splnet();
1197
1198	if (events & (POLLIN | POLLRDNORM))
1199		if (soreadable(so))
1200			revents |= events & (POLLIN | POLLRDNORM);
1201
1202	if (events & (POLLOUT | POLLWRNORM))
1203		if (sowriteable(so))
1204			revents |= events & (POLLOUT | POLLWRNORM);
1205
1206	if (events & (POLLPRI | POLLRDBAND))
1207		if (so->so_oobmark || (so->so_state & SS_RCVATMARK))
1208			revents |= events & (POLLPRI | POLLRDBAND);
1209
1210	if (revents == 0) {
1211		if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
1212			selrecord(p, &so->so_rcv.sb_sel);
1213			so->so_rcv.sb_flags |= SB_SEL;
1214		}
1215
1216		if (events & (POLLOUT | POLLWRNORM)) {
1217			selrecord(p, &so->so_snd.sb_sel);
1218			so->so_snd.sb_flags |= SB_SEL;
1219		}
1220	}
1221
1222	splx(s);
1223	return (revents);
1224}
1225