uipc_socket.c revision 33955
1/*
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 3. All advertising materials mentioning features or use of this software
14 *    must display the following acknowledgement:
15 *	This product includes software developed by the University of
16 *	California, Berkeley and its contributors.
17 * 4. Neither the name of the University nor the names of its contributors
18 *    may be used to endorse or promote products derived from this software
19 *    without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 *
33 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
34 *	$Id: uipc_socket.c,v 1.37 1998/02/19 19:38:20 fenner Exp $
35 */
36
37#include <sys/param.h>
38#include <sys/systm.h>
39#include <sys/proc.h>
40#include <sys/fcntl.h>
41#include <sys/malloc.h>
42#include <sys/mbuf.h>
43#include <sys/domain.h>
44#include <sys/kernel.h>
45#include <sys/poll.h>
46#include <sys/protosw.h>
47#include <sys/socket.h>
48#include <sys/socketvar.h>
49#include <sys/resourcevar.h>
50#include <sys/signalvar.h>
51#include <sys/sysctl.h>
52
53#include <machine/limits.h>
54
55MALLOC_DEFINE(M_SOCKET, "socket", "socket structure");
56MALLOC_DEFINE(M_SONAME, "soname", "socket name");
57MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
58
59static int somaxconn = SOMAXCONN;
60SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW, &somaxconn,
61	   0, "");
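
/*
 * This makes the listen backlog ceiling tunable at run time, e.g.
 * "sysctl -w kern.ipc.somaxconn=256"; solisten() below clamps any
 * requested backlog to this value.
 */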
62
63/*
64 * Socket operation routines.
65 * These routines are called by the routines in
66 * sys_socket.c or from a system process, and
67 * implement the semantics of socket operations by
68 * switching out to the protocol specific routines.
69 */
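/*
 * For example, sobind() below is essentially just
 *
 *	error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
 *
 * bracketed by splnet()/splx(); every other entry point dispatches
 * through the same pr_usrreqs vector, keeping this file
 * protocol-independent.
 */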
70/*ARGSUSED*/
71int
72socreate(dom, aso, type, proto, p)
73	int dom;
74	struct socket **aso;
75	register int type;
76	int proto;
77	struct proc *p;
78{
79	register struct protosw *prp;
80	register struct socket *so;
81	register int error;
82
83	if (proto)
84		prp = pffindproto(dom, proto, type);
85	else
86		prp = pffindtype(dom, type);
87	if (prp == 0 || prp->pr_usrreqs->pru_attach == 0)
88		return (EPROTONOSUPPORT);
89	if (prp->pr_type != type)
90		return (EPROTOTYPE);
91	MALLOC(so, struct socket *, sizeof(*so), M_SOCKET, M_WAIT);
92	bzero((caddr_t)so, sizeof(*so));
93	TAILQ_INIT(&so->so_incomp);
94	TAILQ_INIT(&so->so_comp);
95	so->so_type = type;
96	so->so_uid = p->p_ucred->cr_uid;
97	so->so_proto = prp;
98	error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
99	if (error) {
100		so->so_state |= SS_NOFDREF;
101		sofree(so);
102		return (error);
103	}
104	*aso = so;
105	return (0);
106}
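
/*
 * Illustrative sketch only (hypothetical helper, not part of the original
 * interface): roughly how the socket(2) system-call layer drives
 * socreate() to create a UDP socket.
 */
static int
example_socreate_udp(p, asop)
	struct proc *p;
	struct socket **asop;
{

	/* With proto == 0, pffindtype() picks the protocol for the type. */
	return (socreate(AF_INET, asop, SOCK_DGRAM, 0, p));
}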
107
108int
109sobind(so, nam, p)
110	struct socket *so;
111	struct sockaddr *nam;
112	struct proc *p;
113{
114	int s = splnet();
115	int error;
116
117	error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
118	splx(s);
119	return (error);
120}
121
122int
123solisten(so, backlog, p)
124	register struct socket *so;
125	int backlog;
126	struct proc *p;
127{
128	int s = splnet(), error;
129
130	error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
131	if (error) {
132		splx(s);
133		return (error);
134	}
135	if (so->so_comp.tqh_first == NULL)
136		so->so_options |= SO_ACCEPTCONN;
137	if (backlog < 0 || backlog > somaxconn)
138		backlog = somaxconn;
139	so->so_qlimit = backlog;
140	splx(s);
141	return (0);
142}
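
/*
 * Illustrative sketch only (hypothetical helper): the usual pairing of
 * sobind() and solisten(), mirroring bind(2) followed by listen(2).
 */
static int
example_bind_listen(so, nam, backlog, p)
	struct socket *so;
	struct sockaddr *nam;
	int backlog;
	struct proc *p;
{
	int error;

	error = sobind(so, nam, p);
	if (error)
		return (error);
	/* solisten() clamps backlog to the somaxconn ceiling above. */
	return (solisten(so, backlog, p));
}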
143
144void
145sofree(so)
146	register struct socket *so;
147{
148	struct socket *head = so->so_head;
149
150	if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0)
151		return;
152	if (head != NULL) {
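		/*
		 * A socket still referenced by a listening socket sits on
		 * either so_incomp (connection not yet completed) or
		 * so_comp (completed, awaiting accept); unhook it from
		 * whichever queue holds it.
		 */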
153		if (so->so_state & SS_INCOMP) {
154			TAILQ_REMOVE(&head->so_incomp, so, so_list);
155			head->so_incqlen--;
156		} else if (so->so_state & SS_COMP) {
157			TAILQ_REMOVE(&head->so_comp, so, so_list);
158		} else {
159			panic("sofree: not queued");
160		}
161		head->so_qlen--;
162		so->so_state &= ~(SS_INCOMP|SS_COMP);
163		so->so_head = NULL;
164	}
165	sbrelease(&so->so_snd);
166	sorflush(so);
167	FREE(so, M_SOCKET);
168}
169
170/*
171 * Close a socket on last file table reference removal.
172 * Initiate disconnect if connected.
173 * Free socket when disconnect complete.
174 */
175int
176soclose(so)
177	register struct socket *so;
178{
179	int s = splnet();		/* conservative */
180	int error = 0;
181
182	if (so->so_options & SO_ACCEPTCONN) {
183		struct socket *sp, *sonext;
184
185		for (sp = so->so_incomp.tqh_first; sp != NULL; sp = sonext) {
186			sonext = sp->so_list.tqe_next;
187			(void) soabort(sp);
188		}
189		for (sp = so->so_comp.tqh_first; sp != NULL; sp = sonext) {
190			sonext = sp->so_list.tqe_next;
191			(void) soabort(sp);
192		}
193	}
194	if (so->so_pcb == 0)
195		goto discard;
196	if (so->so_state & SS_ISCONNECTED) {
197		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
198			error = sodisconnect(so);
199			if (error)
200				goto drop;
201		}
202		if (so->so_options & SO_LINGER) {
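			/*
			 * SO_LINGER: a non-blocking socket that is still
			 * disconnecting is dropped immediately; otherwise
			 * wait, up to the configured linger time, for the
			 * disconnect to complete.
			 */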
203			if ((so->so_state & SS_ISDISCONNECTING) &&
204			    (so->so_state & SS_NBIO))
205				goto drop;
206			while (so->so_state & SS_ISCONNECTED) {
207				error = tsleep((caddr_t)&so->so_timeo,
208				    PSOCK | PCATCH, "soclos", so->so_linger);
209				if (error)
210					break;
211			}
212		}
213	}
214drop:
215	if (so->so_pcb) {
216		int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
217		if (error == 0)
218			error = error2;
219	}
220discard:
221	if (so->so_state & SS_NOFDREF)
222		panic("soclose: NOFDREF");
223	so->so_state |= SS_NOFDREF;
224	sofree(so);
225	splx(s);
226	return (error);
227}
228
229/*
230 * Must be called at splnet...
231 */
232int
233soabort(so)
234	struct socket *so;
235{
236
237	return (*so->so_proto->pr_usrreqs->pru_abort)(so);
238}
239
240int
241soaccept(so, nam)
242	register struct socket *so;
243	struct sockaddr **nam;
244{
245	int s = splnet();
246	int error;
247
248	if ((so->so_state & SS_NOFDREF) == 0)
249		panic("soaccept: !NOFDREF");
250	so->so_state &= ~SS_NOFDREF;
251	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
252	splx(s);
253	return (error);
254}
255
256int
257soconnect(so, nam, p)
258	register struct socket *so;
259	struct sockaddr *nam;
260	struct proc *p;
261{
262	int s;
263	int error;
264
265	if (so->so_options & SO_ACCEPTCONN)
266		return (EOPNOTSUPP);
267	s = splnet();
268	/*
269	 * If protocol is connection-based, can only connect once.
270	 * Otherwise, if connected, try to disconnect first.
271	 * This allows user to disconnect by connecting to, e.g.,
272	 * a null address.
273	 */
274	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
275	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
276	    (error = sodisconnect(so))))
277		error = EISCONN;
278	else
279		error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, p);
280	splx(s);
281	return (error);
282}
283
284int
285soconnect2(so1, so2)
286	register struct socket *so1;
287	struct socket *so2;
288{
289	int s = splnet();
290	int error;
291
292	error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
293	splx(s);
294	return (error);
295}
296
297int
298sodisconnect(so)
299	register struct socket *so;
300{
301	int s = splnet();
302	int error;
303
304	if ((so->so_state & SS_ISCONNECTED) == 0) {
305		error = ENOTCONN;
306		goto bad;
307	}
308	if (so->so_state & SS_ISDISCONNECTING) {
309		error = EALREADY;
310		goto bad;
311	}
312	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
313bad:
314	splx(s);
315	return (error);
316}
317
318#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
319/*
320 * Send on a socket.
321 * If send must go all at once and message is larger than
322 * send buffering, then hard error.
323 * Lock against other senders.
324 * If must go all at once and not enough room now, then
325 * inform user that this would block and do nothing.
326 * Otherwise, if nonblocking, send as much as possible.
327 * The data to be sent is described by "uio" if nonzero,
328 * otherwise by the mbuf chain "top" (which must be null
329 * if uio is not).  Data provided in mbuf chain must be small
330 * enough to send all at once.
331 *
332 * Returns nonzero on error, timeout or signal; callers
333 * must check for short counts if EINTR/ERESTART are returned.
334 * Data and control buffers are freed on return.
335 */
336int
337sosend(so, addr, uio, top, control, flags, p)
338	register struct socket *so;
339	struct sockaddr *addr;
340	struct uio *uio;
341	struct mbuf *top;
342	struct mbuf *control;
343	int flags;
344	struct proc *p;
345{
346	struct mbuf **mp;
347	register struct mbuf *m;
348	register long space, len, resid;
349	int clen = 0, error, s, dontroute, mlen;
350	int atomic = sosendallatonce(so) || top;
351
352	if (uio)
353		resid = uio->uio_resid;
354	else
355		resid = top->m_pkthdr.len;
356	/*
357	 * In theory resid should be unsigned.
358	 * However, space must be signed, as it might be less than 0
359	 * if we over-committed, and we must use a signed comparison
360	 * of space and resid.  On the other hand, a negative resid
361	 * causes us to loop sending 0-length segments to the protocol.
362	 *
363	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
364	 * type sockets since that's an error.
365	 */
366	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
367		error = EINVAL;
368		goto out;
369	}
370
371	dontroute =
372	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
373	    (so->so_proto->pr_flags & PR_ATOMIC);
374	if (p)
375		p->p_stats->p_ru.ru_msgsnd++;
376	if (control)
377		clen = control->m_len;
378#define	snderr(errno)	{ error = errno; splx(s); goto release; }
379
380restart:
381	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
382	if (error)
383		goto out;
384	do {
385		s = splnet();
386		if (so->so_state & SS_CANTSENDMORE)
387			snderr(EPIPE);
388		if (so->so_error) {
389			error = so->so_error;
390			so->so_error = 0;
391			splx(s);
392			goto release;
393		}
394		if ((so->so_state & SS_ISCONNECTED) == 0) {
395			/*
396			 * `sendto' and `sendmsg' are allowed on a connection-
397			 * based socket if it supports implied connect.
398			 * Return ENOTCONN if not connected and no address is
399			 * supplied.
400			 */
401			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
402			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
403				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
404				    !(resid == 0 && clen != 0))
405					snderr(ENOTCONN);
406			} else if (addr == 0)
407			    snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
408				   ENOTCONN : EDESTADDRREQ);
409		}
410		space = sbspace(&so->so_snd);
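		/*
		 * Out-of-band data gets a little extra slack so it can
		 * still be queued when the send buffer is otherwise full.
		 */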
411		if (flags & MSG_OOB)
412			space += 1024;
413		if ((atomic && resid > so->so_snd.sb_hiwat) ||
414		    clen > so->so_snd.sb_hiwat)
415			snderr(EMSGSIZE);
416		if (space < resid + clen && uio &&
417		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
418			if (so->so_state & SS_NBIO)
419				snderr(EWOULDBLOCK);
420			sbunlock(&so->so_snd);
421			error = sbwait(&so->so_snd);
422			splx(s);
423			if (error)
424				goto out;
425			goto restart;
426		}
427		splx(s);
428		mp = &top;
429		space -= clen;
430		do {
431		    if (uio == NULL) {
432			/*
433			 * Data is prepackaged in "top".
434			 */
435			resid = 0;
436			if (flags & MSG_EOR)
437				top->m_flags |= M_EOR;
438		    } else do {
439			if (top == 0) {
440				MGETHDR(m, M_WAIT, MT_DATA);
441				mlen = MHLEN;
442				m->m_pkthdr.len = 0;
443				m->m_pkthdr.rcvif = (struct ifnet *)0;
444			} else {
445				MGET(m, M_WAIT, MT_DATA);
446				mlen = MLEN;
447			}
448			if (resid >= MINCLSIZE) {
449				MCLGET(m, M_WAIT);
450				if ((m->m_flags & M_EXT) == 0)
451					goto nopages;
452				mlen = MCLBYTES;
453				len = min(min(mlen, resid), space);
454			} else {
455nopages:
456				len = min(min(mlen, resid), space);
457				/*
458				 * For datagram protocols, leave room
459				 * for protocol headers in first mbuf.
460				 */
461				if (atomic && top == 0 && len < mlen)
462					MH_ALIGN(m, len);
463			}
464			space -= len;
465			error = uiomove(mtod(m, caddr_t), (int)len, uio);
466			resid = uio->uio_resid;
467			m->m_len = len;
468			*mp = m;
469			top->m_pkthdr.len += len;
470			if (error)
471				goto release;
472			mp = &m->m_next;
473			if (resid <= 0) {
474				if (flags & MSG_EOR)
475					top->m_flags |= M_EOR;
476				break;
477			}
478		    } while (space > 0 && atomic);
479		    if (dontroute)
480			    so->so_options |= SO_DONTROUTE;
481		    s = splnet();				/* XXX */
482		    error = (*so->so_proto->pr_usrreqs->pru_send)(so,
483			(flags & MSG_OOB) ? PRUS_OOB :
484			/*
485			 * If the user set MSG_EOF, the protocol
486			 * understands this flag, and there is nothing left
487			 * to send, then use PRU_SEND_EOF instead of PRU_SEND.
488			 */
489			((flags & MSG_EOF) &&
490			 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
491			 (resid <= 0)) ?
492				PRUS_EOF : 0,
493			top, addr, control, p);
494		    splx(s);
495		    if (dontroute)
496			    so->so_options &= ~SO_DONTROUTE;
497		    clen = 0;
498		    control = 0;
499		    top = 0;
500		    mp = &top;
501		    if (error)
502			goto release;
503		} while (resid && space > 0);
504	} while (resid);
505
506release:
507	sbunlock(&so->so_snd);
508out:
509	if (top)
510		m_freem(top);
511	if (control)
512		m_freem(control);
513	return (error);
514}
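
/*
 * Illustrative sketch only (hypothetical helper): roughly how a
 * sendit()-style caller hands an already-prepared uio to sosend() for a
 * plain write on a connected socket.
 */
static int
example_sosend(so, uio, p)
	struct socket *so;
	struct uio *uio;
	struct proc *p;
{

	/* No destination address, no control mbufs, no flags. */
	return (sosend(so, (struct sockaddr *)0, uio, (struct mbuf *)0,
	    (struct mbuf *)0, 0, p));
}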
515
516/*
517 * Implement receive operations on a socket.
518 * We depend on the way that records are added to the sockbuf
519 * by sbappend*.  In particular, each record (mbufs linked through m_next)
520 * must begin with an address if the protocol so specifies,
521 * followed by an optional mbuf or mbufs containing ancillary data,
522 * and then zero or more mbufs of data.
523 * In order to avoid blocking network interrupts for the entire time here,
524 * we splx() while doing the actual copy to user space.
525 * Although the sockbuf is locked, new data may still be appended,
526 * and thus we must maintain consistency of the sockbuf during that time.
527 *
528 * The caller may receive the data as a single mbuf chain by supplying
529 * an mbuf **mp0 for use in returning the chain.  The uio is then used
530 * only for the count in uio_resid.
531 */
532int
533soreceive(so, psa, uio, mp0, controlp, flagsp)
534	register struct socket *so;
535	struct sockaddr **psa;
536	struct uio *uio;
537	struct mbuf **mp0;
538	struct mbuf **controlp;
539	int *flagsp;
540{
541	register struct mbuf *m, **mp;
542	register int flags, len, error, s, offset;
543	struct protosw *pr = so->so_proto;
544	struct mbuf *nextrecord;
545	int moff, type = 0;
546	int orig_resid = uio->uio_resid;
547
548	mp = mp0;
549	if (psa)
550		*psa = 0;
551	if (controlp)
552		*controlp = 0;
553	if (flagsp)
554		flags = *flagsp &~ MSG_EOR;
555	else
556		flags = 0;
557	if (flags & MSG_OOB) {
558		m = m_get(M_WAIT, MT_DATA);
559		error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
560		if (error)
561			goto bad;
562		do {
563			error = uiomove(mtod(m, caddr_t),
564			    (int) min(uio->uio_resid, m->m_len), uio);
565			m = m_free(m);
566		} while (uio->uio_resid && error == 0 && m);
567bad:
568		if (m)
569			m_freem(m);
570		return (error);
571	}
572	if (mp)
573		*mp = (struct mbuf *)0;
574	if (so->so_state & SS_ISCONFIRMING && uio->uio_resid)
575		(*pr->pr_usrreqs->pru_rcvd)(so, 0);
576
577restart:
578	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
579	if (error)
580		return (error);
581	s = splnet();
582
583	m = so->so_rcv.sb_mb;
584	/*
585	 * If we have less data than requested, block awaiting more
586	 * (subject to any timeout) if:
587	 *   1. the current count is less than the low water mark, or
588	 *   2. MSG_WAITALL is set, and it is possible to do the entire
589	 *	receive operation at once if we block (resid <= hiwat), and
590	 *   3. MSG_DONTWAIT is not set.
591	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
592	 * we have to do the receive in sections, and thus risk returning
593	 * a short count if a timeout or signal occurs after we start.
594	 */
595	if (m == 0 || (((flags & MSG_DONTWAIT) == 0 &&
596	    so->so_rcv.sb_cc < uio->uio_resid) &&
597	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
598	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
599	    m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) {
600#ifdef DIAGNOSTIC
601		if (m == 0 && so->so_rcv.sb_cc)
602			panic("receive 1");
603#endif
604		if (so->so_error) {
605			if (m)
606				goto dontblock;
607			error = so->so_error;
608			if ((flags & MSG_PEEK) == 0)
609				so->so_error = 0;
610			goto release;
611		}
612		if (so->so_state & SS_CANTRCVMORE) {
613			if (m)
614				goto dontblock;
615			else
616				goto release;
617		}
618		for (; m; m = m->m_next)
619			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
620				m = so->so_rcv.sb_mb;
621				goto dontblock;
622			}
623		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
624		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
625			error = ENOTCONN;
626			goto release;
627		}
628		if (uio->uio_resid == 0)
629			goto release;
630		if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT)) {
631			error = EWOULDBLOCK;
632			goto release;
633		}
634		sbunlock(&so->so_rcv);
635		error = sbwait(&so->so_rcv);
636		splx(s);
637		if (error)
638			return (error);
639		goto restart;
640	}
641dontblock:
642	if (uio->uio_procp)
643		uio->uio_procp->p_stats->p_ru.ru_msgrcv++;
644	nextrecord = m->m_nextpkt;
645	if (pr->pr_flags & PR_ADDR) {
646#ifdef DIAGNOSTIC
647		if (m->m_type != MT_SONAME)
648			panic("receive 1a");
649#endif
650		orig_resid = 0;
651		if (psa)
652			*psa = dup_sockaddr(mtod(m, struct sockaddr *),
653					    mp0 == 0);
654		if (flags & MSG_PEEK) {
655			m = m->m_next;
656		} else {
657			sbfree(&so->so_rcv, m);
658			MFREE(m, so->so_rcv.sb_mb);
659			m = so->so_rcv.sb_mb;
660		}
661	}
662	while (m && m->m_type == MT_CONTROL && error == 0) {
663		if (flags & MSG_PEEK) {
664			if (controlp)
665				*controlp = m_copy(m, 0, m->m_len);
666			m = m->m_next;
667		} else {
668			sbfree(&so->so_rcv, m);
669			if (controlp) {
670				if (pr->pr_domain->dom_externalize &&
671				    mtod(m, struct cmsghdr *)->cmsg_type ==
672				    SCM_RIGHTS)
673				   error = (*pr->pr_domain->dom_externalize)(m);
674				*controlp = m;
675				so->so_rcv.sb_mb = m->m_next;
676				m->m_next = 0;
677				m = so->so_rcv.sb_mb;
678			} else {
679				MFREE(m, so->so_rcv.sb_mb);
680				m = so->so_rcv.sb_mb;
681			}
682		}
683		if (controlp) {
684			orig_resid = 0;
685			controlp = &(*controlp)->m_next;
686		}
687	}
688	if (m) {
689		if ((flags & MSG_PEEK) == 0)
690			m->m_nextpkt = nextrecord;
691		type = m->m_type;
692		if (type == MT_OOBDATA)
693			flags |= MSG_OOB;
694	}
695	moff = 0;
696	offset = 0;
697	while (m && uio->uio_resid > 0 && error == 0) {
698		if (m->m_type == MT_OOBDATA) {
699			if (type != MT_OOBDATA)
700				break;
701		} else if (type == MT_OOBDATA)
702			break;
703#ifdef DIAGNOSTIC
704		else if (m->m_type != MT_DATA && m->m_type != MT_HEADER)
705			panic("receive 3");
706#endif
707		so->so_state &= ~SS_RCVATMARK;
708		len = uio->uio_resid;
709		if (so->so_oobmark && len > so->so_oobmark - offset)
710			len = so->so_oobmark - offset;
711		if (len > m->m_len - moff)
712			len = m->m_len - moff;
713		/*
714		 * If mp is set, just pass back the mbufs.
715		 * Otherwise copy them out via the uio, then free.
716		 * The sockbuf must be consistent here (sb_mb points to the
717		 * current record, nextrecord to the next one) when we drop priority;
718		 * we must note any additions to the sockbuf when we
719		 * block interrupts again.
720		 */
721		if (mp == 0) {
722			splx(s);
723			error = uiomove(mtod(m, caddr_t) + moff, (int)len, uio);
724			s = splnet();
725			if (error)
726				goto release;
727		} else
728			uio->uio_resid -= len;
729		if (len == m->m_len - moff) {
730			if (m->m_flags & M_EOR)
731				flags |= MSG_EOR;
732			if (flags & MSG_PEEK) {
733				m = m->m_next;
734				moff = 0;
735			} else {
736				nextrecord = m->m_nextpkt;
737				sbfree(&so->so_rcv, m);
738				if (mp) {
739					*mp = m;
740					mp = &m->m_next;
741					so->so_rcv.sb_mb = m = m->m_next;
742					*mp = (struct mbuf *)0;
743				} else {
744					MFREE(m, so->so_rcv.sb_mb);
745					m = so->so_rcv.sb_mb;
746				}
747				if (m)
748					m->m_nextpkt = nextrecord;
749			}
750		} else {
751			if (flags & MSG_PEEK)
752				moff += len;
753			else {
754				if (mp)
755					*mp = m_copym(m, 0, len, M_WAIT);
756				m->m_data += len;
757				m->m_len -= len;
758				so->so_rcv.sb_cc -= len;
759			}
760		}
761		if (so->so_oobmark) {
762			if ((flags & MSG_PEEK) == 0) {
763				so->so_oobmark -= len;
764				if (so->so_oobmark == 0) {
765					so->so_state |= SS_RCVATMARK;
766					break;
767				}
768			} else {
769				offset += len;
770				if (offset == so->so_oobmark)
771					break;
772			}
773		}
774		if (flags & MSG_EOR)
775			break;
776		/*
777		 * If the MSG_WAITALL flag is set (for a non-atomic socket),
778		 * we must not quit until "uio->uio_resid == 0" or an error
779		 * termination.  If a signal/timeout occurs, return
780		 * with a short count but without error.
781		 * Keep sockbuf locked against other readers.
782		 */
783		while (flags & MSG_WAITALL && m == 0 && uio->uio_resid > 0 &&
784		    !sosendallatonce(so) && !nextrecord) {
785			if (so->so_error || so->so_state & SS_CANTRCVMORE)
786				break;
787			error = sbwait(&so->so_rcv);
788			if (error) {
789				sbunlock(&so->so_rcv);
790				splx(s);
791				return (0);
792			}
793			m = so->so_rcv.sb_mb;
794			if (m)
795				nextrecord = m->m_nextpkt;
796		}
797	}
798
799	if (m && pr->pr_flags & PR_ATOMIC) {
800		flags |= MSG_TRUNC;
801		if ((flags & MSG_PEEK) == 0)
802			(void) sbdroprecord(&so->so_rcv);
803	}
804	if ((flags & MSG_PEEK) == 0) {
805		if (m == 0)
806			so->so_rcv.sb_mb = nextrecord;
807		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
808			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
809	}
810	if (orig_resid == uio->uio_resid && orig_resid &&
811	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
812		sbunlock(&so->so_rcv);
813		splx(s);
814		goto restart;
815	}
816
817	if (flagsp)
818		*flagsp |= flags;
819release:
820	sbunlock(&so->so_rcv);
821	splx(s);
822	return (error);
823}
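
/*
 * Illustrative sketch only (hypothetical helper): a recvit()-style caller
 * that wants only the data, with no source address and no control
 * messages returned.
 */
static int
example_soreceive(so, uio)
	struct socket *so;
	struct uio *uio;
{
	int flags = 0;

	return (soreceive(so, (struct sockaddr **)0, uio, (struct mbuf **)0,
	    (struct mbuf **)0, &flags));
}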
824
825int
826soshutdown(so, how)
827	register struct socket *so;
828	register int how;
829{
830	register struct protosw *pr = so->so_proto;
831
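	/*
	 * shutdown(2) passes how as 0 (no more receives), 1 (no more
	 * sends) or 2 (both); adding one maps these onto FREAD (1),
	 * FWRITE (2) and FREAD|FWRITE (3) for the tests below.
	 */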
832	how++;
833	if (how & FREAD)
834		sorflush(so);
835	if (how & FWRITE)
836		return ((*pr->pr_usrreqs->pru_shutdown)(so));
837	return (0);
838}
839
840void
841sorflush(so)
842	register struct socket *so;
843{
844	register struct sockbuf *sb = &so->so_rcv;
845	register struct protosw *pr = so->so_proto;
846	register int s;
847	struct sockbuf asb;
848
849	sb->sb_flags |= SB_NOINTR;
850	(void) sblock(sb, M_WAITOK);
851	s = splimp();
852	socantrcvmore(so);
853	sbunlock(sb);
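	/*
	 * Snapshot the sockbuf and zero the original while interrupts
	 * are still blocked, so anything the protocol appends from now
	 * on goes to a clean buffer; the old contents (including any
	 * rights in transit) are disposed of below at normal priority.
	 */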
854	asb = *sb;
855	bzero((caddr_t)sb, sizeof (*sb));
856	splx(s);
857	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
858		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
859	sbrelease(&asb);
860}
861
862int
863sosetopt(so, level, optname, m0, p)
864	register struct socket *so;
865	int level, optname;
866	struct mbuf *m0;
867	struct proc *p;
868{
869	int error = 0;
870	register struct mbuf *m = m0;
871
872	if (level != SOL_SOCKET) {
873		if (so->so_proto && so->so_proto->pr_ctloutput)
874			return ((*so->so_proto->pr_ctloutput)
875				  (PRCO_SETOPT, so, level, optname, &m0, p));
876		error = ENOPROTOOPT;
877	} else {
878		switch (optname) {
879
880		case SO_LINGER:
881			if (m == NULL || m->m_len != sizeof (struct linger)) {
882				error = EINVAL;
883				goto bad;
884			}
885			so->so_linger = mtod(m, struct linger *)->l_linger;
886			/* fall thru... */
887
888		case SO_DEBUG:
889		case SO_KEEPALIVE:
890		case SO_DONTROUTE:
891		case SO_USELOOPBACK:
892		case SO_BROADCAST:
893		case SO_REUSEADDR:
894		case SO_REUSEPORT:
895		case SO_OOBINLINE:
896		case SO_TIMESTAMP:
897			if (m == NULL || m->m_len < sizeof (int)) {
898				error = EINVAL;
899				goto bad;
900			}
901			if (*mtod(m, int *))
902				so->so_options |= optname;
903			else
904				so->so_options &= ~optname;
905			break;
906
907		case SO_SNDBUF:
908		case SO_RCVBUF:
909		case SO_SNDLOWAT:
910		case SO_RCVLOWAT:
911		    {
912			int optval;
913
914			if (m == NULL || m->m_len < sizeof (int)) {
915				error = EINVAL;
916				goto bad;
917			}
918
919			/*
920			 * Values < 1 make no sense for any of these
921			 * options, so disallow them.
922			 */
923			optval = *mtod(m, int *);
924			if (optval < 1) {
925				error = EINVAL;
926				goto bad;
927			}
928
929			switch (optname) {
930
931			case SO_SNDBUF:
932			case SO_RCVBUF:
933				if (sbreserve(optname == SO_SNDBUF ?
934				    &so->so_snd : &so->so_rcv,
935				    (u_long) optval) == 0) {
936					error = ENOBUFS;
937					goto bad;
938				}
939				break;
940
941			/*
942			 * Make sure the low-water is never greater than
943			 * the high-water.
944			 */
945			case SO_SNDLOWAT:
946				so->so_snd.sb_lowat =
947				    (optval > so->so_snd.sb_hiwat) ?
948				    so->so_snd.sb_hiwat : optval;
949				break;
950			case SO_RCVLOWAT:
951				so->so_rcv.sb_lowat =
952				    (optval > so->so_rcv.sb_hiwat) ?
953				    so->so_rcv.sb_hiwat : optval;
954				break;
955			}
956			break;
957		    }
958
959		case SO_SNDTIMEO:
960		case SO_RCVTIMEO:
961		    {
962			struct timeval *tv;
963			short val;
964
965			if (m == NULL || m->m_len < sizeof (*tv)) {
966				error = EINVAL;
967				goto bad;
968			}
969			tv = mtod(m, struct timeval *);
970			if (tv->tv_sec > SHRT_MAX / hz - hz) {
971				error = EDOM;
972				goto bad;
973			}
974			val = tv->tv_sec * hz + tv->tv_usec / tick;
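			/*
			 * The timeout is stored in clock ticks and must fit
			 * in a short (hence the SHRT_MAX check above).  With
			 * hz = 100 (tick = 10000 us), for example, a request
			 * of 2.5 seconds becomes 2 * 100 + 500000 / 10000 =
			 * 250 ticks.
			 */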
975
976			switch (optname) {
977
978			case SO_SNDTIMEO:
979				so->so_snd.sb_timeo = val;
980				break;
981			case SO_RCVTIMEO:
982				so->so_rcv.sb_timeo = val;
983				break;
984			}
985			break;
986		    }
987
988		default:
989			error = ENOPROTOOPT;
990			break;
991		}
992		if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) {
993			(void) ((*so->so_proto->pr_ctloutput)
994				  (PRCO_SETOPT, so, level, optname, &m0, p));
995			m = NULL;	/* freed by protocol */
996		}
997	}
998bad:
999	if (m)
1000		(void) m_free(m);
1001	return (error);
1002}
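
/*
 * Illustrative sketch only (hypothetical helper): kernel callers hand
 * sosetopt() the option value in an mbuf, just as the setsockopt(2) path
 * does after copying it in from user space.
 */
static int
example_set_rcvbuf(so, bytes, p)
	struct socket *so;
	int bytes;
	struct proc *p;
{
	struct mbuf *m;

	m = m_get(M_WAIT, MT_SOOPTS);
	m->m_len = sizeof (int);
	*mtod(m, int *) = bytes;
	/* sosetopt() consumes the mbuf on both success and failure. */
	return (sosetopt(so, SOL_SOCKET, SO_RCVBUF, m, p));
}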
1003
1004int
1005sogetopt(so, level, optname, mp, p)
1006	register struct socket *so;
1007	int level, optname;
1008	struct mbuf **mp;
1009	struct proc *p;
1010{
1011	register struct mbuf *m;
1012
1013	if (level != SOL_SOCKET) {
1014		if (so->so_proto && so->so_proto->pr_ctloutput) {
1015			return ((*so->so_proto->pr_ctloutput)
1016				  (PRCO_GETOPT, so, level, optname, mp, p));
1017		} else
1018			return (ENOPROTOOPT);
1019	} else {
1020		m = m_get(M_WAIT, MT_SOOPTS);
1021		m->m_len = sizeof (int);
1022
1023		switch (optname) {
1024
1025		case SO_LINGER:
1026			m->m_len = sizeof (struct linger);
1027			mtod(m, struct linger *)->l_onoff =
1028				so->so_options & SO_LINGER;
1029			mtod(m, struct linger *)->l_linger = so->so_linger;
1030			break;
1031
1032		case SO_USELOOPBACK:
1033		case SO_DONTROUTE:
1034		case SO_DEBUG:
1035		case SO_KEEPALIVE:
1036		case SO_REUSEADDR:
1037		case SO_REUSEPORT:
1038		case SO_BROADCAST:
1039		case SO_OOBINLINE:
1040		case SO_TIMESTAMP:
1041			*mtod(m, int *) = so->so_options & optname;
1042			break;
1043
1044		case SO_TYPE:
1045			*mtod(m, int *) = so->so_type;
1046			break;
1047
1048		case SO_ERROR:
1049			*mtod(m, int *) = so->so_error;
1050			so->so_error = 0;
1051			break;
1052
1053		case SO_SNDBUF:
1054			*mtod(m, int *) = so->so_snd.sb_hiwat;
1055			break;
1056
1057		case SO_RCVBUF:
1058			*mtod(m, int *) = so->so_rcv.sb_hiwat;
1059			break;
1060
1061		case SO_SNDLOWAT:
1062			*mtod(m, int *) = so->so_snd.sb_lowat;
1063			break;
1064
1065		case SO_RCVLOWAT:
1066			*mtod(m, int *) = so->so_rcv.sb_lowat;
1067			break;
1068
1069		case SO_SNDTIMEO:
1070		case SO_RCVTIMEO:
1071		    {
1072			int val = (optname == SO_SNDTIMEO ?
1073			     so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
1074
1075			m->m_len = sizeof(struct timeval);
1076			mtod(m, struct timeval *)->tv_sec = val / hz;
1077			mtod(m, struct timeval *)->tv_usec =
1078			    (val % hz) * tick;
1079			break;
1080		    }
1081
1082		default:
1083			(void)m_free(m);
1084			return (ENOPROTOOPT);
1085		}
1086		*mp = m;
1087		return (0);
1088	}
1089}
1090
1091void
1092sohasoutofband(so)
1093	register struct socket *so;
1094{
1095	struct proc *p;
1096
1097	if (so->so_pgid < 0)
1098		gsignal(-so->so_pgid, SIGURG);
1099	else if (so->so_pgid > 0 && (p = pfind(so->so_pgid)) != 0)
1100		psignal(p, SIGURG);
1101	selwakeup(&so->so_rcv.sb_sel);
1102}
1103
1104int
1105sopoll(struct socket *so, int events, struct ucred *cred, struct proc *p)
1106{
1107	int revents = 0;
1108	int s = splnet();
1109
1110	if (events & (POLLIN | POLLRDNORM))
1111		if (soreadable(so))
1112			revents |= events & (POLLIN | POLLRDNORM);
1113
1114	if (events & (POLLOUT | POLLWRNORM))
1115		if (sowriteable(so))
1116			revents |= events & (POLLOUT | POLLWRNORM);
1117
1118	if (events & (POLLPRI | POLLRDBAND))
1119		if (so->so_oobmark || (so->so_state & SS_RCVATMARK))
1120			revents |= events & (POLLPRI | POLLRDBAND);
1121
1122	if (revents == 0) {
1123		if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
1124			selrecord(p, &so->so_rcv.sb_sel);
1125			so->so_rcv.sb_flags |= SB_SEL;
1126		}
1127
1128		if (events & (POLLOUT | POLLWRNORM)) {
1129			selrecord(p, &so->so_snd.sb_sel);
1130			so->so_snd.sb_flags |= SB_SEL;
1131		}
1132	}
1133
1134	splx(s);
1135	return (revents);
1136}
1137