/* uipc_socket.c revision 24131 */
/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 *	$Id: uipc_socket.c,v 1.24 1997/02/24 20:30:56 wollman Exp $
 */

37#include <sys/param.h>
38#include <sys/queue.h>
39#include <sys/systm.h>
40#include <sys/proc.h>
41#include <sys/fcntl.h>
42#include <sys/malloc.h>
43#include <sys/mbuf.h>
44#include <sys/domain.h>
45#include <sys/kernel.h>
46#include <sys/protosw.h>
47#include <sys/socket.h>
48#include <sys/socketvar.h>
49#include <sys/resourcevar.h>
50#include <sys/signalvar.h>
51#include <sys/sysctl.h>
52
53static int somaxconn = SOMAXCONN;
54SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW, &somaxconn,
55	   0, "");
56
57/*
58 * Socket operation routines.
59 * These routines are called by the routines in
60 * sys_socket.c or from a system process, and
61 * implement the semantics of socket operations by
62 * switching out to the protocol specific routines.
63 */
64/*ARGSUSED*/
65int
66socreate(dom, aso, type, proto, p)
67	int dom;
68	struct socket **aso;
69	register int type;
70	int proto;
71	struct proc *p;
72{
73	register struct protosw *prp;
74	register struct socket *so;
75	register int error;
76
77	if (proto)
78		prp = pffindproto(dom, proto, type);
79	else
80		prp = pffindtype(dom, type);
81	if (prp == 0 || prp->pr_usrreqs == 0)
82		return (EPROTONOSUPPORT);
83	if (prp->pr_type != type)
84		return (EPROTOTYPE);
85	MALLOC(so, struct socket *, sizeof(*so), M_SOCKET, M_WAIT);
86	bzero((caddr_t)so, sizeof(*so));
87	TAILQ_INIT(&so->so_incomp);
88	TAILQ_INIT(&so->so_comp);
89	so->so_type = type;
90	if (p->p_ucred->cr_uid == 0)
91		so->so_state = SS_PRIV;
92	so->so_proto = prp;
93	error = (*prp->pr_usrreqs->pru_attach)(so, proto);
94	if (error) {
95		so->so_state |= SS_NOFDREF;
96		sofree(so);
97		return (error);
98	}
99	*aso = so;
100	return (0);
101}
102
103int
104sobind(so, nam)
105	struct socket *so;
106	struct mbuf *nam;
107{
108	int s = splnet();
109	int error;
110
111	error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam);
112	splx(s);
113	return (error);
114}
115
116int
117solisten(so, backlog)
118	register struct socket *so;
119	int backlog;
120{
121	int s = splnet(), error;
122
123	error = (*so->so_proto->pr_usrreqs->pru_listen)(so);
124	if (error) {
125		splx(s);
126		return (error);
127	}
128	if (so->so_comp.tqh_first == NULL)
129		so->so_options |= SO_ACCEPTCONN;
130	if (backlog < 0 || backlog > somaxconn)
131		backlog = somaxconn;
132	so->so_qlimit = backlog;
133	splx(s);
134	return (0);
135}
136
137void
138sofree(so)
139	register struct socket *so;
140{
141	struct socket *head = so->so_head;
142
143	if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0)
144		return;
145	if (head != NULL) {
146		if (so->so_state & SS_INCOMP) {
147			TAILQ_REMOVE(&head->so_incomp, so, so_list);
148			head->so_incqlen--;
149		} else if (so->so_state & SS_COMP) {
150			TAILQ_REMOVE(&head->so_comp, so, so_list);
151		} else {
152			panic("sofree: not queued");
153		}
154		head->so_qlen--;
155		so->so_state &= ~(SS_INCOMP|SS_COMP);
156		so->so_head = NULL;
157	}
158	sbrelease(&so->so_snd);
159	sorflush(so);
160	FREE(so, M_SOCKET);
161}
162
163/*
164 * Close a socket on last file table reference removal.
165 * Initiate disconnect if connected.
166 * Free socket when disconnect complete.
167 */
168int
169soclose(so)
170	register struct socket *so;
171{
172	int s = splnet();		/* conservative */
173	int error = 0;
174
175	if (so->so_options & SO_ACCEPTCONN) {
176		struct socket *sp, *sonext;
177
178		for (sp = so->so_incomp.tqh_first; sp != NULL; sp = sonext) {
179			sonext = sp->so_list.tqe_next;
180			(void) soabort(sp);
181		}
182		for (sp = so->so_comp.tqh_first; sp != NULL; sp = sonext) {
183			sonext = sp->so_list.tqe_next;
184			(void) soabort(sp);
185		}
186	}
187	if (so->so_pcb == 0)
188		goto discard;
189	if (so->so_state & SS_ISCONNECTED) {
190		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
191			error = sodisconnect(so);
192			if (error)
193				goto drop;
194		}
195		if (so->so_options & SO_LINGER) {
196			if ((so->so_state & SS_ISDISCONNECTING) &&
197			    (so->so_state & SS_NBIO))
198				goto drop;
199			while (so->so_state & SS_ISCONNECTED) {
200				error = tsleep((caddr_t)&so->so_timeo,
201				    PSOCK | PCATCH, "soclos", so->so_linger);
202				if (error)
203					break;
204			}
205		}
206	}
207drop:
208	if (so->so_pcb) {
209		int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
210		if (error == 0)
211			error = error2;
212	}
213discard:
214	if (so->so_state & SS_NOFDREF)
215		panic("soclose: NOFDREF");
216	so->so_state |= SS_NOFDREF;
217	sofree(so);
218	splx(s);
219	return (error);
220}
221
222/*
223 * Must be called at splnet...
224 */
225int
226soabort(so)
227	struct socket *so;
228{
229
230	return (*so->so_proto->pr_usrreqs->pru_abort)(so);
231}
232
233int
234soaccept(so, nam)
235	register struct socket *so;
236	struct mbuf *nam;
237{
238	int s = splnet();
239	int error;
240
241	if ((so->so_state & SS_NOFDREF) == 0)
242		panic("soaccept: !NOFDREF");
243	so->so_state &= ~SS_NOFDREF;
244	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
245	splx(s);
246	return (error);
247}
248
249int
250soconnect(so, nam)
251	register struct socket *so;
252	struct mbuf *nam;
253{
254	int s;
255	int error;
256
257	if (so->so_options & SO_ACCEPTCONN)
258		return (EOPNOTSUPP);
259	s = splnet();
260	/*
261	 * If protocol is connection-based, can only connect once.
262	 * Otherwise, if connected, try to disconnect first.
263	 * This allows user to disconnect by connecting to, e.g.,
264	 * a null address.
265	 */
266	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
267	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
268	    (error = sodisconnect(so))))
269		error = EISCONN;
270	else
271		error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam);
272	splx(s);
273	return (error);
274}
275
276int
277soconnect2(so1, so2)
278	register struct socket *so1;
279	struct socket *so2;
280{
281	int s = splnet();
282	int error;
283
284	error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
285	splx(s);
286	return (error);
287}
288
289int
290sodisconnect(so)
291	register struct socket *so;
292{
293	int s = splnet();
294	int error;
295
296	if ((so->so_state & SS_ISCONNECTED) == 0) {
297		error = ENOTCONN;
298		goto bad;
299	}
300	if (so->so_state & SS_ISDISCONNECTING) {
301		error = EALREADY;
302		goto bad;
303	}
304	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
305bad:
306	splx(s);
307	return (error);
308}
309
310#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
311/*
312 * Send on a socket.
313 * If send must go all at once and message is larger than
314 * send buffering, then hard error.
315 * Lock against other senders.
316 * If must go all at once and not enough room now, then
317 * inform user that this would block and do nothing.
318 * Otherwise, if nonblocking, send as much as possible.
319 * The data to be sent is described by "uio" if nonzero,
320 * otherwise by the mbuf chain "top" (which must be null
321 * if uio is not).  Data provided in mbuf chain must be small
322 * enough to send all at once.
323 *
324 * Returns nonzero on error, timeout or signal; callers
325 * must check for short counts if EINTR/ERESTART are returned.
326 * Data and control buffers are freed on return.
327 */
328int
329sosend(so, addr, uio, top, control, flags)
330	register struct socket *so;
331	struct mbuf *addr;
332	struct uio *uio;
333	struct mbuf *top;
334	struct mbuf *control;
335	int flags;
336{
337	struct proc *p = curproc;		/* XXX */
338	struct mbuf **mp;
339	register struct mbuf *m;
340	register long space, len, resid;
341	int clen = 0, error, s, dontroute, mlen;
342	int atomic = sosendallatonce(so) || top;
343
344	if (uio)
345		resid = uio->uio_resid;
346	else
347		resid = top->m_pkthdr.len;
348	/*
349	 * In theory resid should be unsigned.
350	 * However, space must be signed, as it might be less than 0
351	 * if we over-committed, and we must use a signed comparison
352	 * of space and resid.  On the other hand, a negative resid
353	 * causes us to loop sending 0-length segments to the protocol.
354	 */
355	if (resid < 0)
356		return (EINVAL);
357	dontroute =
358	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
359	    (so->so_proto->pr_flags & PR_ATOMIC);
360	p->p_stats->p_ru.ru_msgsnd++;
361	if (control)
362		clen = control->m_len;
363#define	snderr(errno)	{ error = errno; splx(s); goto release; }
364
365restart:
366	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
367	if (error)
368		goto out;
369	do {
370		s = splnet();
371		if (so->so_state & SS_CANTSENDMORE)
372			snderr(EPIPE);
373		if (so->so_error)
374			snderr(so->so_error);
375		if ((so->so_state & SS_ISCONNECTED) == 0) {
376			/*
377			 * `sendto' and `sendmsg' is allowed on a connection-
378			 * based socket if it supports implied connect.
379			 * Return ENOTCONN if not connected and no address is
380			 * supplied.
381			 */
382			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
383			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
384				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
385				    !(resid == 0 && clen != 0))
386					snderr(ENOTCONN);
387			} else if (addr == 0)
388			    snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
389				   ENOTCONN : EDESTADDRREQ);
390		}
391		space = sbspace(&so->so_snd);
392		if (flags & MSG_OOB)
393			space += 1024;
394		if ((atomic && resid > so->so_snd.sb_hiwat) ||
395		    clen > so->so_snd.sb_hiwat)
396			snderr(EMSGSIZE);
397		if (space < resid + clen && uio &&
398		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
399			if (so->so_state & SS_NBIO)
400				snderr(EWOULDBLOCK);
401			sbunlock(&so->so_snd);
402			error = sbwait(&so->so_snd);
403			splx(s);
404			if (error)
405				goto out;
406			goto restart;
407		}
408		splx(s);
409		mp = &top;
410		space -= clen;
411		do {
412		    if (uio == NULL) {
413			/*
414			 * Data is prepackaged in "top".
415			 */
416			resid = 0;
417			if (flags & MSG_EOR)
418				top->m_flags |= M_EOR;
419		    } else do {
420			if (top == 0) {
421				MGETHDR(m, M_WAIT, MT_DATA);
422				mlen = MHLEN;
423				m->m_pkthdr.len = 0;
424				m->m_pkthdr.rcvif = (struct ifnet *)0;
425			} else {
426				MGET(m, M_WAIT, MT_DATA);
427				mlen = MLEN;
428			}
429			if (resid >= MINCLSIZE) {
430				MCLGET(m, M_WAIT);
431				if ((m->m_flags & M_EXT) == 0)
432					goto nopages;
433				mlen = MCLBYTES;
434				len = min(min(mlen, resid), space);
435			} else {
436nopages:
437				len = min(min(mlen, resid), space);
438				/*
439				 * For datagram protocols, leave room
440				 * for protocol headers in first mbuf.
441				 */
442				if (atomic && top == 0 && len < mlen)
443					MH_ALIGN(m, len);
444			}
445			space -= len;
446			error = uiomove(mtod(m, caddr_t), (int)len, uio);
447			resid = uio->uio_resid;
448			m->m_len = len;
449			*mp = m;
450			top->m_pkthdr.len += len;
451			if (error)
452				goto release;
453			mp = &m->m_next;
454			if (resid <= 0) {
455				if (flags & MSG_EOR)
456					top->m_flags |= M_EOR;
457				break;
458			}
459		    } while (space > 0 && atomic);
460		    if (dontroute)
461			    so->so_options |= SO_DONTROUTE;
462		    s = splnet();				/* XXX */
463		    error = (*so->so_proto->pr_usrreqs->pru_send)(so,
464			(flags & MSG_OOB) ? PRUS_OOB :
465			/*
466			 * If the user set MSG_EOF, the protocol
467			 * understands this flag and nothing left to
468			 * send then use PRU_SEND_EOF instead of PRU_SEND.
469			 */
470			((flags & MSG_EOF) &&
471			 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
472			 (resid <= 0)) ?
473				PRUS_EOF : 0,
474			top, addr, control);
475		    splx(s);
476		    if (dontroute)
477			    so->so_options &= ~SO_DONTROUTE;
478		    clen = 0;
479		    control = 0;
480		    top = 0;
481		    mp = &top;
482		    if (error)
483			goto release;
484		} while (resid && space > 0);
485	} while (resid);
486
487release:
488	sbunlock(&so->so_snd);
489out:
490	if (top)
491		m_freem(top);
492	if (control)
493		m_freem(control);
494	return (error);
495}
496
497/*
498 * Implement receive operations on a socket.
499 * We depend on the way that records are added to the sockbuf
500 * by sbappend*.  In particular, each record (mbufs linked through m_next)
501 * must begin with an address if the protocol so specifies,
502 * followed by an optional mbuf or mbufs containing ancillary data,
503 * and then zero or more mbufs of data.
504 * In order to avoid blocking network interrupts for the entire time here,
505 * we splx() while doing the actual copy to user space.
506 * Although the sockbuf is locked, new data may still be appended,
507 * and thus we must maintain consistency of the sockbuf during that time.
508 *
509 * The caller may receive the data as a single mbuf chain by supplying
510 * an mbuf **mp0 for use in returning the chain.  The uio is then used
511 * only for the count in uio_resid.
512 */
513int
514soreceive(so, paddr, uio, mp0, controlp, flagsp)
515	register struct socket *so;
516	struct mbuf **paddr;
517	struct uio *uio;
518	struct mbuf **mp0;
519	struct mbuf **controlp;
520	int *flagsp;
521{
522	register struct mbuf *m, **mp;
523	register int flags, len, error, s, offset;
524	struct protosw *pr = so->so_proto;
525	struct mbuf *nextrecord;
526	int moff, type = 0;
527	int orig_resid = uio->uio_resid;
528
529	mp = mp0;
530	if (paddr)
531		*paddr = 0;
532	if (controlp)
533		*controlp = 0;
534	if (flagsp)
535		flags = *flagsp &~ MSG_EOR;
536	else
537		flags = 0;
538	if (flags & MSG_OOB) {
539		m = m_get(M_WAIT, MT_DATA);
540		error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
541		if (error)
542			goto bad;
543		do {
544			error = uiomove(mtod(m, caddr_t),
545			    (int) min(uio->uio_resid, m->m_len), uio);
546			m = m_free(m);
547		} while (uio->uio_resid && error == 0 && m);
548bad:
549		if (m)
550			m_freem(m);
551		return (error);
552	}
553	if (mp)
554		*mp = (struct mbuf *)0;
555	if (so->so_state & SS_ISCONFIRMING && uio->uio_resid)
556		(*pr->pr_usrreqs->pru_rcvd)(so, 0);
557
558restart:
559	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
560	if (error)
561		return (error);
562	s = splnet();
563
564	m = so->so_rcv.sb_mb;
565	/*
566	 * If we have less data than requested, block awaiting more
567	 * (subject to any timeout) if:
568	 *   1. the current count is less than the low water mark, or
569	 *   2. MSG_WAITALL is set, and it is possible to do the entire
570	 *	receive operation at once if we block (resid <= hiwat).
571	 *   3. MSG_DONTWAIT is not set
572	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
573	 * we have to do the receive in sections, and thus risk returning
574	 * a short count if a timeout or signal occurs after we start.
575	 */
576	if (m == 0 || (((flags & MSG_DONTWAIT) == 0 &&
577	    so->so_rcv.sb_cc < uio->uio_resid) &&
578	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
579	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
580	    m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) {
581#ifdef DIAGNOSTIC
582		if (m == 0 && so->so_rcv.sb_cc)
583			panic("receive 1");
584#endif
585		if (so->so_error) {
586			if (m)
587				goto dontblock;
588			error = so->so_error;
589			if ((flags & MSG_PEEK) == 0)
590				so->so_error = 0;
591			goto release;
592		}
593		if (so->so_state & SS_CANTRCVMORE) {
594			if (m)
595				goto dontblock;
596			else
597				goto release;
598		}
599		for (; m; m = m->m_next)
600			if (m->m_type == MT_OOBDATA  || (m->m_flags & M_EOR)) {
601				m = so->so_rcv.sb_mb;
602				goto dontblock;
603			}
604		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
605		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
606			error = ENOTCONN;
607			goto release;
608		}
609		if (uio->uio_resid == 0)
610			goto release;
611		if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT)) {
612			error = EWOULDBLOCK;
613			goto release;
614		}
615		sbunlock(&so->so_rcv);
616		error = sbwait(&so->so_rcv);
617		splx(s);
618		if (error)
619			return (error);
620		goto restart;
621	}
622dontblock:
623	if (uio->uio_procp)
624		uio->uio_procp->p_stats->p_ru.ru_msgrcv++;
625	nextrecord = m->m_nextpkt;
626	if (pr->pr_flags & PR_ADDR) {
627#ifdef DIAGNOSTIC
628		if (m->m_type != MT_SONAME)
629			panic("receive 1a");
630#endif
631		orig_resid = 0;
632		if (flags & MSG_PEEK) {
633			if (paddr)
634				*paddr = m_copy(m, 0, m->m_len);
635			m = m->m_next;
636		} else {
637			sbfree(&so->so_rcv, m);
638			if (paddr) {
639				*paddr = m;
640				so->so_rcv.sb_mb = m->m_next;
641				m->m_next = 0;
642				m = so->so_rcv.sb_mb;
643			} else {
644				MFREE(m, so->so_rcv.sb_mb);
645				m = so->so_rcv.sb_mb;
646			}
647		}
648	}
649	while (m && m->m_type == MT_CONTROL && error == 0) {
650		if (flags & MSG_PEEK) {
651			if (controlp)
652				*controlp = m_copy(m, 0, m->m_len);
653			m = m->m_next;
654		} else {
655			sbfree(&so->so_rcv, m);
656			if (controlp) {
657				if (pr->pr_domain->dom_externalize &&
658				    mtod(m, struct cmsghdr *)->cmsg_type ==
659				    SCM_RIGHTS)
660				   error = (*pr->pr_domain->dom_externalize)(m);
661				*controlp = m;
662				so->so_rcv.sb_mb = m->m_next;
663				m->m_next = 0;
664				m = so->so_rcv.sb_mb;
665			} else {
666				MFREE(m, so->so_rcv.sb_mb);
667				m = so->so_rcv.sb_mb;
668			}
669		}
670		if (controlp) {
671			orig_resid = 0;
672			controlp = &(*controlp)->m_next;
673		}
674	}
675	if (m) {
676		if ((flags & MSG_PEEK) == 0)
677			m->m_nextpkt = nextrecord;
678		type = m->m_type;
679		if (type == MT_OOBDATA)
680			flags |= MSG_OOB;
681	}
682	moff = 0;
683	offset = 0;
684	while (m && uio->uio_resid > 0 && error == 0) {
685		if (m->m_type == MT_OOBDATA) {
686			if (type != MT_OOBDATA)
687				break;
688		} else if (type == MT_OOBDATA)
689			break;
690#ifdef DIAGNOSTIC
691		else if (m->m_type != MT_DATA && m->m_type != MT_HEADER)
692			panic("receive 3");
693#endif
694		so->so_state &= ~SS_RCVATMARK;
695		len = uio->uio_resid;
696		if (so->so_oobmark && len > so->so_oobmark - offset)
697			len = so->so_oobmark - offset;
698		if (len > m->m_len - moff)
699			len = m->m_len - moff;
700		/*
701		 * If mp is set, just pass back the mbufs.
702		 * Otherwise copy them out via the uio, then free.
703		 * Sockbuf must be consistent here (points to current mbuf,
704		 * it points to next record) when we drop priority;
705		 * we must note any additions to the sockbuf when we
706		 * block interrupts again.
707		 */
708		if (mp == 0) {
709			splx(s);
710			error = uiomove(mtod(m, caddr_t) + moff, (int)len, uio);
711			s = splnet();
712			if (error)
713				goto release;
714		} else
715			uio->uio_resid -= len;
716		if (len == m->m_len - moff) {
717			if (m->m_flags & M_EOR)
718				flags |= MSG_EOR;
719			if (flags & MSG_PEEK) {
720				m = m->m_next;
721				moff = 0;
722			} else {
723				nextrecord = m->m_nextpkt;
724				sbfree(&so->so_rcv, m);
725				if (mp) {
726					*mp = m;
727					mp = &m->m_next;
728					so->so_rcv.sb_mb = m = m->m_next;
729					*mp = (struct mbuf *)0;
730				} else {
731					MFREE(m, so->so_rcv.sb_mb);
732					m = so->so_rcv.sb_mb;
733				}
734				if (m)
735					m->m_nextpkt = nextrecord;
736			}
737		} else {
738			if (flags & MSG_PEEK)
739				moff += len;
740			else {
741				if (mp)
742					*mp = m_copym(m, 0, len, M_WAIT);
743				m->m_data += len;
744				m->m_len -= len;
745				so->so_rcv.sb_cc -= len;
746			}
747		}
748		if (so->so_oobmark) {
749			if ((flags & MSG_PEEK) == 0) {
750				so->so_oobmark -= len;
751				if (so->so_oobmark == 0) {
752					so->so_state |= SS_RCVATMARK;
753					break;
754				}
755			} else {
756				offset += len;
757				if (offset == so->so_oobmark)
758					break;
759			}
760		}
761		if (flags & MSG_EOR)
762			break;
763		/*
764		 * If the MSG_WAITALL flag is set (for non-atomic socket),
765		 * we must not quit until "uio->uio_resid == 0" or an error
766		 * termination.  If a signal/timeout occurs, return
767		 * with a short count but without error.
768		 * Keep sockbuf locked against other readers.
769		 */
770		while (flags & MSG_WAITALL && m == 0 && uio->uio_resid > 0 &&
771		    !sosendallatonce(so) && !nextrecord) {
772			if (so->so_error || so->so_state & SS_CANTRCVMORE)
773				break;
774			error = sbwait(&so->so_rcv);
775			if (error) {
776				sbunlock(&so->so_rcv);
777				splx(s);
778				return (0);
779			}
780			m = so->so_rcv.sb_mb;
781			if (m)
782				nextrecord = m->m_nextpkt;
783		}
784	}
785
786	if (m && pr->pr_flags & PR_ATOMIC) {
787		flags |= MSG_TRUNC;
788		if ((flags & MSG_PEEK) == 0)
789			(void) sbdroprecord(&so->so_rcv);
790	}
791	if ((flags & MSG_PEEK) == 0) {
792		if (m == 0)
793			so->so_rcv.sb_mb = nextrecord;
794		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
795			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
796	}
797	if (orig_resid == uio->uio_resid && orig_resid &&
798	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
799		sbunlock(&so->so_rcv);
800		splx(s);
801		goto restart;
802	}
803
804	if (flagsp)
805		*flagsp |= flags;
806release:
807	sbunlock(&so->so_rcv);
808	splx(s);
809	return (error);
810}
811
812int
813soshutdown(so, how)
814	register struct socket *so;
815	register int how;
816{
817	register struct protosw *pr = so->so_proto;
818
819	how++;
820	if (how & FREAD)
821		sorflush(so);
822	if (how & FWRITE)
823		return ((*pr->pr_usrreqs->pru_shutdown)(so));
824	return (0);
825}
826
827void
828sorflush(so)
829	register struct socket *so;
830{
831	register struct sockbuf *sb = &so->so_rcv;
832	register struct protosw *pr = so->so_proto;
833	register int s;
834	struct sockbuf asb;
835
836	sb->sb_flags |= SB_NOINTR;
837	(void) sblock(sb, M_WAITOK);
838	s = splimp();
839	socantrcvmore(so);
840	sbunlock(sb);
841	asb = *sb;
842	bzero((caddr_t)sb, sizeof (*sb));
843	splx(s);
844	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
845		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
846	sbrelease(&asb);
847}
848
849int
850sosetopt(so, level, optname, m0)
851	register struct socket *so;
852	int level, optname;
853	struct mbuf *m0;
854{
855	int error = 0;
856	register struct mbuf *m = m0;
857
858	if (level != SOL_SOCKET) {
859		if (so->so_proto && so->so_proto->pr_ctloutput)
860			return ((*so->so_proto->pr_ctloutput)
861				  (PRCO_SETOPT, so, level, optname, &m0));
862		error = ENOPROTOOPT;
863	} else {
864		switch (optname) {
865
866		case SO_LINGER:
867			if (m == NULL || m->m_len != sizeof (struct linger)) {
868				error = EINVAL;
869				goto bad;
870			}
871			so->so_linger = mtod(m, struct linger *)->l_linger;
872			/* fall thru... */
873
874		case SO_DEBUG:
875		case SO_KEEPALIVE:
876		case SO_DONTROUTE:
877		case SO_USELOOPBACK:
878		case SO_BROADCAST:
879		case SO_REUSEADDR:
880		case SO_REUSEPORT:
881		case SO_OOBINLINE:
882		case SO_TIMESTAMP:
883			if (m == NULL || m->m_len < sizeof (int)) {
884				error = EINVAL;
885				goto bad;
886			}
887			if (*mtod(m, int *))
888				so->so_options |= optname;
889			else
890				so->so_options &= ~optname;
891			break;
892
893		case SO_SNDBUF:
894		case SO_RCVBUF:
895		case SO_SNDLOWAT:
896		case SO_RCVLOWAT:
897			if (m == NULL || m->m_len < sizeof (int)) {
898				error = EINVAL;
899				goto bad;
900			}
901			switch (optname) {
902
903			case SO_SNDBUF:
904			case SO_RCVBUF:
905				if (sbreserve(optname == SO_SNDBUF ?
906				    &so->so_snd : &so->so_rcv,
907				    (u_long) *mtod(m, int *)) == 0) {
908					error = ENOBUFS;
909					goto bad;
910				}
911				break;
912
913			case SO_SNDLOWAT:
914				so->so_snd.sb_lowat = *mtod(m, int *);
915				break;
916			case SO_RCVLOWAT:
917				so->so_rcv.sb_lowat = *mtod(m, int *);
918				break;
919			}
920			break;
921
922		case SO_SNDTIMEO:
923		case SO_RCVTIMEO:
924		    {
925			struct timeval *tv;
926			short val;
927
928			if (m == NULL || m->m_len < sizeof (*tv)) {
929				error = EINVAL;
930				goto bad;
931			}
932			tv = mtod(m, struct timeval *);
933			if (tv->tv_sec > SHRT_MAX / hz - hz) {
934				error = EDOM;
935				goto bad;
936			}
937			val = tv->tv_sec * hz + tv->tv_usec / tick;
938
939			switch (optname) {
940
941			case SO_SNDTIMEO:
942				so->so_snd.sb_timeo = val;
943				break;
944			case SO_RCVTIMEO:
945				so->so_rcv.sb_timeo = val;
946				break;
947			}
948			break;
949		    }
950
951		case SO_PRIVSTATE:
952			/* we don't care what the parameter is... */
953			so->so_state &= ~SS_PRIV;
954			break;
955
956		default:
957			error = ENOPROTOOPT;
958			break;
959		}
960		if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) {
961			(void) ((*so->so_proto->pr_ctloutput)
962				  (PRCO_SETOPT, so, level, optname, &m0));
963			m = NULL;	/* freed by protocol */
964		}
965	}
966bad:
967	if (m)
968		(void) m_free(m);
969	return (error);
970}
971
972int
973sogetopt(so, level, optname, mp)
974	register struct socket *so;
975	int level, optname;
976	struct mbuf **mp;
977{
978	register struct mbuf *m;
979
980	if (level != SOL_SOCKET) {
981		if (so->so_proto && so->so_proto->pr_ctloutput) {
982			return ((*so->so_proto->pr_ctloutput)
983				  (PRCO_GETOPT, so, level, optname, mp));
984		} else
985			return (ENOPROTOOPT);
986	} else {
987		m = m_get(M_WAIT, MT_SOOPTS);
988		m->m_len = sizeof (int);
989
990		switch (optname) {
991
992		case SO_LINGER:
993			m->m_len = sizeof (struct linger);
994			mtod(m, struct linger *)->l_onoff =
995				so->so_options & SO_LINGER;
996			mtod(m, struct linger *)->l_linger = so->so_linger;
997			break;
998
999		case SO_USELOOPBACK:
1000		case SO_DONTROUTE:
1001		case SO_DEBUG:
1002		case SO_KEEPALIVE:
1003		case SO_REUSEADDR:
1004		case SO_REUSEPORT:
1005		case SO_BROADCAST:
1006		case SO_OOBINLINE:
1007		case SO_TIMESTAMP:
1008			*mtod(m, int *) = so->so_options & optname;
1009			break;
1010
1011		case SO_PRIVSTATE:
1012			*mtod(m, int *) = so->so_state & SS_PRIV;
1013			break;
1014
1015		case SO_TYPE:
1016			*mtod(m, int *) = so->so_type;
1017			break;
1018
1019		case SO_ERROR:
1020			*mtod(m, int *) = so->so_error;
1021			so->so_error = 0;
1022			break;
1023
1024		case SO_SNDBUF:
1025			*mtod(m, int *) = so->so_snd.sb_hiwat;
1026			break;
1027
1028		case SO_RCVBUF:
1029			*mtod(m, int *) = so->so_rcv.sb_hiwat;
1030			break;
1031
1032		case SO_SNDLOWAT:
1033			*mtod(m, int *) = so->so_snd.sb_lowat;
1034			break;
1035
1036		case SO_RCVLOWAT:
1037			*mtod(m, int *) = so->so_rcv.sb_lowat;
1038			break;
1039
1040		case SO_SNDTIMEO:
1041		case SO_RCVTIMEO:
1042		    {
1043			int val = (optname == SO_SNDTIMEO ?
1044			     so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
1045
1046			m->m_len = sizeof(struct timeval);
1047			mtod(m, struct timeval *)->tv_sec = val / hz;
1048			mtod(m, struct timeval *)->tv_usec =
1049			    (val % hz) * tick;
1050			break;
1051		    }
1052
1053		default:
1054			(void)m_free(m);
1055			return (ENOPROTOOPT);
1056		}
1057		*mp = m;
1058		return (0);
1059	}
1060}
1061
1062void
1063sohasoutofband(so)
1064	register struct socket *so;
1065{
1066	struct proc *p;
1067
1068	if (so->so_pgid < 0)
1069		gsignal(-so->so_pgid, SIGURG);
1070	else if (so->so_pgid > 0 && (p = pfind(so->so_pgid)) != 0)
1071		psignal(p, SIGURG);
1072	selwakeup(&so->so_rcv.sb_sel);
1073}
1074