uipc_socket.c revision 6476
1/*
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 3. All advertising materials mentioning features or use of this software
14 *    must display the following acknowledgement:
15 *	This product includes software developed by the University of
16 *	California, Berkeley and its contributors.
17 * 4. Neither the name of the University nor the names of its contributors
18 *    may be used to endorse or promote products derived from this software
19 *    without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 *
33 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
34 * $Id: uipc_socket.c,v 1.8 1995/02/07 02:01:14 wollman Exp $
35 */
36
37#include <sys/param.h>
38#include <sys/systm.h>
39#include <sys/proc.h>
40#include <sys/file.h>
41#include <sys/malloc.h>
42#include <sys/mbuf.h>
43#include <sys/domain.h>
44#include <sys/kernel.h>
45#include <sys/protosw.h>
46#include <sys/socket.h>
47#include <sys/socketvar.h>
48#include <sys/resourcevar.h>
49#include <sys/signalvar.h>
50
51/*
52 * Socket operation routines.
53 * These routines are called by the routines in
54 * sys_socket.c or from a system process, and
55 * implement the semantics of socket operations by
56 * switching out to the protocol specific routines.
57 */
58/*ARGSUSED*/
59int
60socreate(dom, aso, type, proto)
61	int dom;
62	struct socket **aso;
63	register int type;
64	int proto;
65{
66	struct proc *p = curproc;		/* XXX */
67	register struct protosw *prp;
68	register struct socket *so;
69	register int error;
70
71	if (proto)
72		prp = pffindproto(dom, proto, type);
73	else
74		prp = pffindtype(dom, type);
75	if (prp == 0 || prp->pr_usrreq == 0)
76		return (EPROTONOSUPPORT);
77	if (prp->pr_type != type)
78		return (EPROTOTYPE);
79	MALLOC(so, struct socket *, sizeof(*so), M_SOCKET, M_WAIT);
80	bzero((caddr_t)so, sizeof(*so));
81	so->so_type = type;
82	if (p->p_ucred->cr_uid == 0)
83		so->so_state = SS_PRIV;
84	so->so_proto = prp;
85	error =
86	    (*prp->pr_usrreq)(so, PRU_ATTACH,
87		(struct mbuf *)0, (struct mbuf *)proto, (struct mbuf *)0);
88	if (error) {
89		so->so_state |= SS_NOFDREF;
90		sofree(so);
91		return (error);
92	}
93	*aso = so;
94	return (0);
95}
96
97int
98sobind(so, nam)
99	struct socket *so;
100	struct mbuf *nam;
101{
102	int s = splnet();
103	int error;
104
105	error =
106	    (*so->so_proto->pr_usrreq)(so, PRU_BIND,
107		(struct mbuf *)0, nam, (struct mbuf *)0);
108	splx(s);
109	return (error);
110}
111
112int
113solisten(so, backlog)
114	register struct socket *so;
115	int backlog;
116{
117	int s = splnet(), error;
118
119	error =
120	    (*so->so_proto->pr_usrreq)(so, PRU_LISTEN,
121		(struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0);
122	if (error) {
123		splx(s);
124		return (error);
125	}
126	if (so->so_q == 0)
127		so->so_options |= SO_ACCEPTCONN;
128	if (backlog < 0)
129		backlog = 0;
130	so->so_qlimit = min(backlog, SOMAXCONN);
131	splx(s);
132	return (0);
133}
134
135void
136sofree(so)
137	register struct socket *so;
138{
139
140	if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0)
141		return;
142	if (so->so_head) {
143		if (!soqremque(so, 0) && !soqremque(so, 1))
144			panic("sofree dq");
145		so->so_head = 0;
146	}
147	sbrelease(&so->so_snd);
148	sorflush(so);
149	FREE(so, M_SOCKET);
150}
151
152/*
153 * Close a socket on last file table reference removal.
154 * Initiate disconnect if connected.
155 * Free socket when disconnect complete.
156 */
157int
158soclose(so)
159	register struct socket *so;
160{
161	int s = splnet();		/* conservative */
162	int error = 0;
163
164	if (so->so_options & SO_ACCEPTCONN) {
165		while (so->so_q0)
166			(void) soabort(so->so_q0);
167		while (so->so_q)
168			(void) soabort(so->so_q);
169	}
170	if (so->so_pcb == 0)
171		goto discard;
172	if (so->so_state & SS_ISCONNECTED) {
173		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
174			error = sodisconnect(so);
175			if (error)
176				goto drop;
177		}
178		if (so->so_options & SO_LINGER) {
179			if ((so->so_state & SS_ISDISCONNECTING) &&
180			    (so->so_state & SS_NBIO))
181				goto drop;
182			while (so->so_state & SS_ISCONNECTED) {
183				error = tsleep((caddr_t)&so->so_timeo,
184				    PSOCK | PCATCH, netcls, so->so_linger);
185				if (error)
186					break;
187			}
188		}
189	}
190drop:
191	if (so->so_pcb) {
192		int error2 =
193		    (*so->so_proto->pr_usrreq)(so, PRU_DETACH,
194			(struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0);
195		if (error == 0)
196			error = error2;
197	}
198discard:
199	if (so->so_state & SS_NOFDREF)
200		panic("soclose: NOFDREF");
201	so->so_state |= SS_NOFDREF;
202	sofree(so);
203	splx(s);
204	return (error);
205}
206
207/*
208 * Must be called at splnet...
209 */
210int
211soabort(so)
212	struct socket *so;
213{
214
215	return (
216	    (*so->so_proto->pr_usrreq)(so, PRU_ABORT,
217		(struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0));
218}
219
220int
221soaccept(so, nam)
222	register struct socket *so;
223	struct mbuf *nam;
224{
225	int s = splnet();
226	int error;
227
228	if ((so->so_state & SS_NOFDREF) == 0)
229		panic("soaccept: !NOFDREF");
230	so->so_state &= ~SS_NOFDREF;
231	error = (*so->so_proto->pr_usrreq)(so, PRU_ACCEPT,
232	    (struct mbuf *)0, nam, (struct mbuf *)0);
233	splx(s);
234	return (error);
235}
236
237int
238soconnect(so, nam)
239	register struct socket *so;
240	struct mbuf *nam;
241{
242	int s;
243	int error;
244
245	if (so->so_options & SO_ACCEPTCONN)
246		return (EOPNOTSUPP);
247	s = splnet();
248	/*
249	 * If protocol is connection-based, can only connect once.
250	 * Otherwise, if connected, try to disconnect first.
251	 * This allows user to disconnect by connecting to, e.g.,
252	 * a null address.
253	 */
254	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
255	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
256	    (error = sodisconnect(so))))
257		error = EISCONN;
258	else
259		error = (*so->so_proto->pr_usrreq)(so, PRU_CONNECT,
260		    (struct mbuf *)0, nam, (struct mbuf *)0);
261	splx(s);
262	return (error);
263}
264
265int
266soconnect2(so1, so2)
267	register struct socket *so1;
268	struct socket *so2;
269{
270	int s = splnet();
271	int error;
272
273	error = (*so1->so_proto->pr_usrreq)(so1, PRU_CONNECT2,
274	    (struct mbuf *)0, (struct mbuf *)so2, (struct mbuf *)0);
275	splx(s);
276	return (error);
277}
278
279int
280sodisconnect(so)
281	register struct socket *so;
282{
283	int s = splnet();
284	int error;
285
286	if ((so->so_state & SS_ISCONNECTED) == 0) {
287		error = ENOTCONN;
288		goto bad;
289	}
290	if (so->so_state & SS_ISDISCONNECTING) {
291		error = EALREADY;
292		goto bad;
293	}
294	error = (*so->so_proto->pr_usrreq)(so, PRU_DISCONNECT,
295	    (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0);
296bad:
297	splx(s);
298	return (error);
299}
300
301#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
302/*
303 * Send on a socket.
304 * If send must go all at once and message is larger than
305 * send buffering, then hard error.
306 * Lock against other senders.
307 * If must go all at once and not enough room now, then
308 * inform user that this would block and do nothing.
309 * Otherwise, if nonblocking, send as much as possible.
310 * The data to be sent is described by "uio" if nonzero,
311 * otherwise by the mbuf chain "top" (which must be null
312 * if uio is not).  Data provided in mbuf chain must be small
313 * enough to send all at once.
314 *
315 * Returns nonzero on error, timeout or signal; callers
316 * must check for short counts if EINTR/ERESTART are returned.
317 * Data and control buffers are freed on return.
318 */
319int
320sosend(so, addr, uio, top, control, flags)
321	register struct socket *so;
322	struct mbuf *addr;
323	struct uio *uio;
324	struct mbuf *top;
325	struct mbuf *control;
326	int flags;
327{
328	struct proc *p = curproc;		/* XXX */
329	struct mbuf **mp;
330	register struct mbuf *m;
331	register long space, len, resid;
332	int clen = 0, error, s, dontroute, mlen;
333	int atomic = sosendallatonce(so) || top;
334
335	if (uio)
336		resid = uio->uio_resid;
337	else
338		resid = top->m_pkthdr.len;
339	/*
340	 * In theory resid should be unsigned.
341	 * However, space must be signed, as it might be less than 0
342	 * if we over-committed, and we must use a signed comparison
343	 * of space and resid.  On the other hand, a negative resid
344	 * causes us to loop sending 0-length segments to the protocol.
345	 */
346	if (resid < 0)
347		return (EINVAL);
348	dontroute =
349	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
350	    (so->so_proto->pr_flags & PR_ATOMIC);
351	p->p_stats->p_ru.ru_msgsnd++;
352	if (control)
353		clen = control->m_len;
354#define	snderr(errno)	{ error = errno; splx(s); goto release; }
355
356restart:
357	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
358	if (error)
359		goto out;
360	do {
361		s = splnet();
362		if (so->so_state & SS_CANTSENDMORE)
363			snderr(EPIPE);
364		if (so->so_error)
365			snderr(so->so_error);
366		if ((so->so_state & SS_ISCONNECTED) == 0) {
367			/*
368			 * `sendto' and `sendmsg' is allowed on a connection-
369			 * based socket if it supports implied connect.
370			 * Return ENOTCONN if not connected and no address is
371			 * supplied.
372			 */
373			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
374			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
375				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
376				    !(resid == 0 && clen != 0))
377					snderr(ENOTCONN);
378			} else if (addr == 0)
379			    snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
380				   ENOTCONN : EDESTADDRREQ);
381		}
382		space = sbspace(&so->so_snd);
383		if (flags & MSG_OOB)
384			space += 1024;
385		if ((atomic && resid > so->so_snd.sb_hiwat) ||
386		    clen > so->so_snd.sb_hiwat)
387			snderr(EMSGSIZE);
388		if (space < resid + clen && uio &&
389		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
390			if (so->so_state & SS_NBIO)
391				snderr(EWOULDBLOCK);
392			sbunlock(&so->so_snd);
393			error = sbwait(&so->so_snd);
394			splx(s);
395			if (error)
396				goto out;
397			goto restart;
398		}
399		splx(s);
400		mp = &top;
401		space -= clen;
402		do {
403		    if (uio == NULL) {
404			/*
405			 * Data is prepackaged in "top".
406			 */
407			resid = 0;
408			if (flags & MSG_EOR)
409				top->m_flags |= M_EOR;
410		    } else do {
411			if (top == 0) {
412				MGETHDR(m, M_WAIT, MT_DATA);
413				mlen = MHLEN;
414				m->m_pkthdr.len = 0;
415				m->m_pkthdr.rcvif = (struct ifnet *)0;
416			} else {
417				MGET(m, M_WAIT, MT_DATA);
418				mlen = MLEN;
419			}
420			if (resid >= MINCLSIZE) {
421				MCLGET(m, M_WAIT);
422				if ((m->m_flags & M_EXT) == 0)
423					goto nopages;
424				mlen = MCLBYTES;
425				len = min(min(mlen, resid), space);
426			} else {
427nopages:
428				len = min(min(mlen, resid), space);
429				/*
430				 * For datagram protocols, leave room
431				 * for protocol headers in first mbuf.
432				 */
433				if (atomic && top == 0 && len < mlen)
434					MH_ALIGN(m, len);
435			}
436			space -= len;
437			error = uiomove(mtod(m, caddr_t), (int)len, uio);
438			resid = uio->uio_resid;
439			m->m_len = len;
440			*mp = m;
441			top->m_pkthdr.len += len;
442			if (error)
443				goto release;
444			mp = &m->m_next;
445			if (resid <= 0) {
446				if (flags & MSG_EOR)
447					top->m_flags |= M_EOR;
448				break;
449			}
450		    } while (space > 0 && atomic);
451		    if (dontroute)
452			    so->so_options |= SO_DONTROUTE;
453		    s = splnet();				/* XXX */
454		    error = (*so->so_proto->pr_usrreq)(so,
455			(flags & MSG_OOB) ? PRU_SENDOOB :
456			/*
457			 * If the user set MSG_EOF, the protocol
458			 * understands this flag and nothing left to
459			 * send then use PRU_SEND_EOF instead of PRU_SEND.
460			 */
461			((flags & MSG_EOF) &&
462			 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
463			 (resid <= 0)) ?
464				PRU_SEND_EOF : PRU_SEND,
465			top, addr, control);
466		    splx(s);
467		    if (dontroute)
468			    so->so_options &= ~SO_DONTROUTE;
469		    clen = 0;
470		    control = 0;
471		    top = 0;
472		    mp = &top;
473		    if (error)
474			goto release;
475		} while (resid && space > 0);
476	} while (resid);
477
478release:
479	sbunlock(&so->so_snd);
480out:
481	if (top)
482		m_freem(top);
483	if (control)
484		m_freem(control);
485	return (error);
486}
487
488/*
489 * Implement receive operations on a socket.
490 * We depend on the way that records are added to the sockbuf
491 * by sbappend*.  In particular, each record (mbufs linked through m_next)
492 * must begin with an address if the protocol so specifies,
493 * followed by an optional mbuf or mbufs containing ancillary data,
494 * and then zero or more mbufs of data.
495 * In order to avoid blocking network interrupts for the entire time here,
496 * we splx() while doing the actual copy to user space.
497 * Although the sockbuf is locked, new data may still be appended,
498 * and thus we must maintain consistency of the sockbuf during that time.
499 *
500 * The caller may receive the data as a single mbuf chain by supplying
501 * an mbuf **mp0 for use in returning the chain.  The uio is then used
502 * only for the count in uio_resid.
503 */
504int
505soreceive(so, paddr, uio, mp0, controlp, flagsp)
506	register struct socket *so;
507	struct mbuf **paddr;
508	struct uio *uio;
509	struct mbuf **mp0;
510	struct mbuf **controlp;
511	int *flagsp;
512{
513	register struct mbuf *m, **mp;
514	register int flags, len, error, s, offset;
515	struct protosw *pr = so->so_proto;
516	struct mbuf *nextrecord;
517	int moff, type = 0;
518	int orig_resid = uio->uio_resid;
519
520	mp = mp0;
521	if (paddr)
522		*paddr = 0;
523	if (controlp)
524		*controlp = 0;
525	if (flagsp)
526		flags = *flagsp &~ MSG_EOR;
527	else
528		flags = 0;
529	if (flags & MSG_OOB) {
530		m = m_get(M_WAIT, MT_DATA);
531		error = (*pr->pr_usrreq)(so, PRU_RCVOOB,
532		    m, (struct mbuf *)(flags & MSG_PEEK), (struct mbuf *)0);
533		if (error)
534			goto bad;
535		do {
536			error = uiomove(mtod(m, caddr_t),
537			    (int) min(uio->uio_resid, m->m_len), uio);
538			m = m_free(m);
539		} while (uio->uio_resid && error == 0 && m);
540bad:
541		if (m)
542			m_freem(m);
543		return (error);
544	}
545	if (mp)
546		*mp = (struct mbuf *)0;
547	if (so->so_state & SS_ISCONFIRMING && uio->uio_resid)
548		(*pr->pr_usrreq)(so, PRU_RCVD, (struct mbuf *)0,
549		    (struct mbuf *)0, (struct mbuf *)0);
550
551restart:
552	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
553	if (error)
554		return (error);
555	s = splnet();
556
557	m = so->so_rcv.sb_mb;
558	/*
559	 * If we have less data than requested, block awaiting more
560	 * (subject to any timeout) if:
561	 *   1. the current count is less than the low water mark, or
562	 *   2. MSG_WAITALL is set, and it is possible to do the entire
563	 *	receive operation at once if we block (resid <= hiwat).
564	 *   3. MSG_DONTWAIT is not set
565	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
566	 * we have to do the receive in sections, and thus risk returning
567	 * a short count if a timeout or signal occurs after we start.
568	 */
569	if (m == 0 || (((flags & MSG_DONTWAIT) == 0 &&
570	    so->so_rcv.sb_cc < uio->uio_resid) &&
571	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
572	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
573	    m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) {
574#ifdef DIAGNOSTIC
575		if (m == 0 && so->so_rcv.sb_cc)
576			panic("receive 1");
577#endif
578		if (so->so_error) {
579			if (m)
580				goto dontblock;
581			error = so->so_error;
582			if ((flags & MSG_PEEK) == 0)
583				so->so_error = 0;
584			goto release;
585		}
586		if (so->so_state & SS_CANTRCVMORE) {
587			if (m)
588				goto dontblock;
589			else
590				goto release;
591		}
592		for (; m; m = m->m_next)
593			if (m->m_type == MT_OOBDATA  || (m->m_flags & M_EOR)) {
594				m = so->so_rcv.sb_mb;
595				goto dontblock;
596			}
597		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
598		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
599			error = ENOTCONN;
600			goto release;
601		}
602		if (uio->uio_resid == 0)
603			goto release;
604		if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT)) {
605			error = EWOULDBLOCK;
606			goto release;
607		}
608		sbunlock(&so->so_rcv);
609		error = sbwait(&so->so_rcv);
610		splx(s);
611		if (error)
612			return (error);
613		goto restart;
614	}
615dontblock:
616	if (uio->uio_procp)
617		uio->uio_procp->p_stats->p_ru.ru_msgrcv++;
618	nextrecord = m->m_nextpkt;
619	if (pr->pr_flags & PR_ADDR) {
620#ifdef DIAGNOSTIC
621		if (m->m_type != MT_SONAME)
622			panic("receive 1a");
623#endif
624		orig_resid = 0;
625		if (flags & MSG_PEEK) {
626			if (paddr)
627				*paddr = m_copy(m, 0, m->m_len);
628			m = m->m_next;
629		} else {
630			sbfree(&so->so_rcv, m);
631			if (paddr) {
632				*paddr = m;
633				so->so_rcv.sb_mb = m->m_next;
634				m->m_next = 0;
635				m = so->so_rcv.sb_mb;
636			} else {
637				MFREE(m, so->so_rcv.sb_mb);
638				m = so->so_rcv.sb_mb;
639			}
640		}
641	}
642	while (m && m->m_type == MT_CONTROL && error == 0) {
643		if (flags & MSG_PEEK) {
644			if (controlp)
645				*controlp = m_copy(m, 0, m->m_len);
646			m = m->m_next;
647		} else {
648			sbfree(&so->so_rcv, m);
649			if (controlp) {
650				if (pr->pr_domain->dom_externalize &&
651				    mtod(m, struct cmsghdr *)->cmsg_type ==
652				    SCM_RIGHTS)
653				   error = (*pr->pr_domain->dom_externalize)(m);
654				*controlp = m;
655				so->so_rcv.sb_mb = m->m_next;
656				m->m_next = 0;
657				m = so->so_rcv.sb_mb;
658			} else {
659				MFREE(m, so->so_rcv.sb_mb);
660				m = so->so_rcv.sb_mb;
661			}
662		}
663		if (controlp) {
664			orig_resid = 0;
665			controlp = &(*controlp)->m_next;
666		}
667	}
668	if (m) {
669		if ((flags & MSG_PEEK) == 0)
670			m->m_nextpkt = nextrecord;
671		type = m->m_type;
672		if (type == MT_OOBDATA)
673			flags |= MSG_OOB;
674	}
675	moff = 0;
676	offset = 0;
677	while (m && uio->uio_resid > 0 && error == 0) {
678		if (m->m_type == MT_OOBDATA) {
679			if (type != MT_OOBDATA)
680				break;
681		} else if (type == MT_OOBDATA)
682			break;
683#ifdef DIAGNOSTIC
684		else if (m->m_type != MT_DATA && m->m_type != MT_HEADER)
685			panic("receive 3");
686#endif
687		so->so_state &= ~SS_RCVATMARK;
688		len = uio->uio_resid;
689		if (so->so_oobmark && len > so->so_oobmark - offset)
690			len = so->so_oobmark - offset;
691		if (len > m->m_len - moff)
692			len = m->m_len - moff;
693		/*
694		 * If mp is set, just pass back the mbufs.
695		 * Otherwise copy them out via the uio, then free.
696		 * Sockbuf must be consistent here (points to current mbuf,
697		 * it points to next record) when we drop priority;
698		 * we must note any additions to the sockbuf when we
699		 * block interrupts again.
700		 */
701		if (mp == 0) {
702			splx(s);
703			error = uiomove(mtod(m, caddr_t) + moff, (int)len, uio);
704			s = splnet();
705		} else
706			uio->uio_resid -= len;
707		if (len == m->m_len - moff) {
708			if (m->m_flags & M_EOR)
709				flags |= MSG_EOR;
710			if (flags & MSG_PEEK) {
711				m = m->m_next;
712				moff = 0;
713			} else {
714				nextrecord = m->m_nextpkt;
715				sbfree(&so->so_rcv, m);
716				if (mp) {
717					*mp = m;
718					mp = &m->m_next;
719					so->so_rcv.sb_mb = m = m->m_next;
720					*mp = (struct mbuf *)0;
721				} else {
722					MFREE(m, so->so_rcv.sb_mb);
723					m = so->so_rcv.sb_mb;
724				}
725				if (m)
726					m->m_nextpkt = nextrecord;
727			}
728		} else {
729			if (flags & MSG_PEEK)
730				moff += len;
731			else {
732				if (mp)
733					*mp = m_copym(m, 0, len, M_WAIT);
734				m->m_data += len;
735				m->m_len -= len;
736				so->so_rcv.sb_cc -= len;
737			}
738		}
739		if (so->so_oobmark) {
740			if ((flags & MSG_PEEK) == 0) {
741				so->so_oobmark -= len;
742				if (so->so_oobmark == 0) {
743					so->so_state |= SS_RCVATMARK;
744					break;
745				}
746			} else {
747				offset += len;
748				if (offset == so->so_oobmark)
749					break;
750			}
751		}
752		if (flags & MSG_EOR)
753			break;
754		/*
755		 * If the MSG_WAITALL flag is set (for non-atomic socket),
756		 * we must not quit until "uio->uio_resid == 0" or an error
757		 * termination.  If a signal/timeout occurs, return
758		 * with a short count but without error.
759		 * Keep sockbuf locked against other readers.
760		 */
761		while (flags & MSG_WAITALL && m == 0 && uio->uio_resid > 0 &&
762		    !sosendallatonce(so) && !nextrecord) {
763			if (so->so_error || so->so_state & SS_CANTRCVMORE)
764				break;
765			error = sbwait(&so->so_rcv);
766			if (error) {
767				sbunlock(&so->so_rcv);
768				splx(s);
769				return (0);
770			}
771			m = so->so_rcv.sb_mb;
772			if (m)
773				nextrecord = m->m_nextpkt;
774		}
775	}
776
777	if (m && pr->pr_flags & PR_ATOMIC) {
778		flags |= MSG_TRUNC;
779		if ((flags & MSG_PEEK) == 0)
780			(void) sbdroprecord(&so->so_rcv);
781	}
782	if ((flags & MSG_PEEK) == 0) {
783		if (m == 0)
784			so->so_rcv.sb_mb = nextrecord;
785		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
786			(*pr->pr_usrreq)(so, PRU_RCVD, (struct mbuf *)0,
787			    (struct mbuf *)flags, (struct mbuf *)0,
788			    (struct mbuf *)0);
789	}
790	if (orig_resid == uio->uio_resid && orig_resid &&
791	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
792		sbunlock(&so->so_rcv);
793		splx(s);
794		goto restart;
795	}
796
797	if (flagsp)
798		*flagsp |= flags;
799release:
800	sbunlock(&so->so_rcv);
801	splx(s);
802	return (error);
803}
804
805int
806soshutdown(so, how)
807	register struct socket *so;
808	register int how;
809{
810	register struct protosw *pr = so->so_proto;
811
812	how++;
813	if (how & FREAD)
814		sorflush(so);
815	if (how & FWRITE)
816		return ((*pr->pr_usrreq)(so, PRU_SHUTDOWN,
817		    (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0));
818	return (0);
819}
820
821void
822sorflush(so)
823	register struct socket *so;
824{
825	register struct sockbuf *sb = &so->so_rcv;
826	register struct protosw *pr = so->so_proto;
827	register int s;
828	struct sockbuf asb;
829
830	sb->sb_flags |= SB_NOINTR;
831	(void) sblock(sb, M_WAITOK);
832	s = splimp();
833	socantrcvmore(so);
834	sbunlock(sb);
835	asb = *sb;
836	bzero((caddr_t)sb, sizeof (*sb));
837	splx(s);
838	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
839		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
840	sbrelease(&asb);
841}
842
843int
844sosetopt(so, level, optname, m0)
845	register struct socket *so;
846	int level, optname;
847	struct mbuf *m0;
848{
849	int error = 0;
850	register struct mbuf *m = m0;
851
852	if (level != SOL_SOCKET) {
853		if (so->so_proto && so->so_proto->pr_ctloutput)
854			return ((*so->so_proto->pr_ctloutput)
855				  (PRCO_SETOPT, so, level, optname, &m0));
856		error = ENOPROTOOPT;
857	} else {
858		switch (optname) {
859
860		case SO_LINGER:
861			if (m == NULL || m->m_len != sizeof (struct linger)) {
862				error = EINVAL;
863				goto bad;
864			}
865			so->so_linger = mtod(m, struct linger *)->l_linger;
866			/* fall thru... */
867
868		case SO_DEBUG:
869		case SO_KEEPALIVE:
870		case SO_DONTROUTE:
871		case SO_USELOOPBACK:
872		case SO_BROADCAST:
873		case SO_REUSEADDR:
874		case SO_REUSEPORT:
875		case SO_OOBINLINE:
876			if (m == NULL || m->m_len < sizeof (int)) {
877				error = EINVAL;
878				goto bad;
879			}
880			if (*mtod(m, int *))
881				so->so_options |= optname;
882			else
883				so->so_options &= ~optname;
884			break;
885
886		case SO_SNDBUF:
887		case SO_RCVBUF:
888		case SO_SNDLOWAT:
889		case SO_RCVLOWAT:
890			if (m == NULL || m->m_len < sizeof (int)) {
891				error = EINVAL;
892				goto bad;
893			}
894			switch (optname) {
895
896			case SO_SNDBUF:
897			case SO_RCVBUF:
898				if (sbreserve(optname == SO_SNDBUF ?
899				    &so->so_snd : &so->so_rcv,
900				    (u_long) *mtod(m, int *)) == 0) {
901					error = ENOBUFS;
902					goto bad;
903				}
904				break;
905
906			case SO_SNDLOWAT:
907				so->so_snd.sb_lowat = *mtod(m, int *);
908				break;
909			case SO_RCVLOWAT:
910				so->so_rcv.sb_lowat = *mtod(m, int *);
911				break;
912			}
913			break;
914
915		case SO_SNDTIMEO:
916		case SO_RCVTIMEO:
917		    {
918			struct timeval *tv;
919			short val;
920
921			if (m == NULL || m->m_len < sizeof (*tv)) {
922				error = EINVAL;
923				goto bad;
924			}
925			tv = mtod(m, struct timeval *);
926			if (tv->tv_sec > SHRT_MAX / hz - hz) {
927				error = EDOM;
928				goto bad;
929			}
930			val = tv->tv_sec * hz + tv->tv_usec / tick;
931
932			switch (optname) {
933
934			case SO_SNDTIMEO:
935				so->so_snd.sb_timeo = val;
936				break;
937			case SO_RCVTIMEO:
938				so->so_rcv.sb_timeo = val;
939				break;
940			}
941			break;
942		    }
943
944		default:
945			error = ENOPROTOOPT;
946			break;
947		}
948		if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) {
949			(void) ((*so->so_proto->pr_ctloutput)
950				  (PRCO_SETOPT, so, level, optname, &m0));
951			m = NULL;	/* freed by protocol */
952		}
953	}
954bad:
955	if (m)
956		(void) m_free(m);
957	return (error);
958}
959
960int
961sogetopt(so, level, optname, mp)
962	register struct socket *so;
963	int level, optname;
964	struct mbuf **mp;
965{
966	register struct mbuf *m;
967
968	if (level != SOL_SOCKET) {
969		if (so->so_proto && so->so_proto->pr_ctloutput) {
970			return ((*so->so_proto->pr_ctloutput)
971				  (PRCO_GETOPT, so, level, optname, mp));
972		} else
973			return (ENOPROTOOPT);
974	} else {
975		m = m_get(M_WAIT, MT_SOOPTS);
976		m->m_len = sizeof (int);
977
978		switch (optname) {
979
980		case SO_LINGER:
981			m->m_len = sizeof (struct linger);
982			mtod(m, struct linger *)->l_onoff =
983				so->so_options & SO_LINGER;
984			mtod(m, struct linger *)->l_linger = so->so_linger;
985			break;
986
987		case SO_USELOOPBACK:
988		case SO_DONTROUTE:
989		case SO_DEBUG:
990		case SO_KEEPALIVE:
991		case SO_REUSEADDR:
992		case SO_REUSEPORT:
993		case SO_BROADCAST:
994		case SO_OOBINLINE:
995			*mtod(m, int *) = so->so_options & optname;
996			break;
997
998		case SO_TYPE:
999			*mtod(m, int *) = so->so_type;
1000			break;
1001
1002		case SO_ERROR:
1003			*mtod(m, int *) = so->so_error;
1004			so->so_error = 0;
1005			break;
1006
1007		case SO_SNDBUF:
1008			*mtod(m, int *) = so->so_snd.sb_hiwat;
1009			break;
1010
1011		case SO_RCVBUF:
1012			*mtod(m, int *) = so->so_rcv.sb_hiwat;
1013			break;
1014
1015		case SO_SNDLOWAT:
1016			*mtod(m, int *) = so->so_snd.sb_lowat;
1017			break;
1018
1019		case SO_RCVLOWAT:
1020			*mtod(m, int *) = so->so_rcv.sb_lowat;
1021			break;
1022
1023		case SO_SNDTIMEO:
1024		case SO_RCVTIMEO:
1025		    {
1026			int val = (optname == SO_SNDTIMEO ?
1027			     so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
1028
1029			m->m_len = sizeof(struct timeval);
1030			mtod(m, struct timeval *)->tv_sec = val / hz;
1031			mtod(m, struct timeval *)->tv_usec =
1032			    (val % hz) * tick;
1033			break;
1034		    }
1035
1036		default:
1037			(void)m_free(m);
1038			return (ENOPROTOOPT);
1039		}
1040		*mp = m;
1041		return (0);
1042	}
1043}
1044
1045void
1046sohasoutofband(so)
1047	register struct socket *so;
1048{
1049	struct proc *p;
1050
1051	if (so->so_pgid < 0)
1052		gsignal(-so->so_pgid, SIGURG);
1053	else if (so->so_pgid > 0 && (p = pfind(so->so_pgid)) != 0)
1054		psignal(p, SIGURG);
1055	selwakeup(&so->so_rcv.sb_sel);
1056}
1057