uipc_socket.c revision 33628
1/*
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 3. All advertising materials mentioning features or use of this software
14 *    must display the following acknowledgement:
15 *	This product includes software developed by the University of
16 *	California, Berkeley and its contributors.
17 * 4. Neither the name of the University nor the names of its contributors
18 *    may be used to endorse or promote products derived from this software
19 *    without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 *
33 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
34 *	$Id: uipc_socket.c,v 1.36 1998/02/06 12:13:28 eivind Exp $
35 */
36
37#include <sys/param.h>
38#include <sys/systm.h>
39#include <sys/proc.h>
40#include <sys/fcntl.h>
41#include <sys/malloc.h>
42#include <sys/mbuf.h>
43#include <sys/domain.h>
44#include <sys/kernel.h>
45#include <sys/poll.h>
46#include <sys/protosw.h>
47#include <sys/socket.h>
48#include <sys/socketvar.h>
49#include <sys/resourcevar.h>
50#include <sys/signalvar.h>
51#include <sys/sysctl.h>
52
53#include <machine/limits.h>
54
/* Malloc types for socket structures, socket names, and protocol pcbs. */
MALLOC_DEFINE(M_SOCKET, "socket", "socket structure");
MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

/* System-wide upper bound for a listen(2) backlog; writable via sysctl. */
static int somaxconn = SOMAXCONN;
SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW, &somaxconn,
	   0, "");
62
63/*
64 * Socket operation routines.
65 * These routines are called by the routines in
66 * sys_socket.c or from a system process, and
67 * implement the semantics of socket operations by
68 * switching out to the protocol specific routines.
69 */
/*
 * Create a socket of the given type in domain dom.  A nonzero proto
 * selects an exact protocol; zero takes the domain's default for the
 * type.  On success *aso receives the new socket; on failure *aso is
 * left untouched and an errno value is returned.
 */
/*ARGSUSED*/
int
socreate(dom, aso, type, proto, p)
	int dom;
	struct socket **aso;
	register int type;
	int proto;
	struct proc *p;
{
	register struct protosw *prp;
	register struct socket *so;
	register int error;

	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);
	if (prp == 0 || prp->pr_usrreqs->pru_attach == 0)
		return (EPROTONOSUPPORT);
	if (prp->pr_type != type)
		return (EPROTOTYPE);
	/* M_WAIT: allocation sleeps rather than fails. */
	MALLOC(so, struct socket *, sizeof(*so), M_SOCKET, M_WAIT);
	bzero((caddr_t)so, sizeof(*so));
	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = type;
	so->so_proto = prp;
	/* Let the protocol allocate its control block and initialize. */
	error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
	if (error) {
		/*
		 * Mark the socket as having no file reference so that
		 * sofree() below will actually release it.
		 */
		so->so_state |= SS_NOFDREF;
		sofree(so);
		return (error);
	}
	*aso = so;
	return (0);
}
106
107int
108sobind(so, nam, p)
109	struct socket *so;
110	struct sockaddr *nam;
111	struct proc *p;
112{
113	int s = splnet();
114	int error;
115
116	error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
117	splx(s);
118	return (error);
119}
120
/*
 * Prepare a socket to accept connections.  The protocol validates and
 * sets up listening first; on success SO_ACCEPTCONN is set and the
 * queue limit is clamped to the somaxconn sysctl value.
 */
int
solisten(so, backlog, p)
	register struct socket *so;
	int backlog;
	struct proc *p;
{
	int s = splnet(), error;

	error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
	if (error) {
		splx(s);
		return (error);
	}
	/*
	 * Only set SO_ACCEPTCONN when no completed connections are
	 * already queued (i.e. this is not a re-listen).
	 */
	if (so->so_comp.tqh_first == NULL)
		so->so_options |= SO_ACCEPTCONN;
	/* Negative or oversized backlogs are clamped to the system cap. */
	if (backlog < 0 || backlog > somaxconn)
		backlog = somaxconn;
	so->so_qlimit = backlog;
	splx(s);
	return (0);
}
142
/*
 * Free a socket, but only once it has neither a protocol control
 * block nor a file descriptor referencing it; otherwise this is a
 * no-op.  If the socket is still queued on a listening socket,
 * unlink it from that queue first.
 */
void
sofree(so)
	register struct socket *so;
{
	struct socket *head = so->so_head;

	/* Still referenced by a pcb or an open file: not ours to free. */
	if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0)
		return;
	if (head != NULL) {
		/* Remove from the parent's incomplete or completed queue. */
		if (so->so_state & SS_INCOMP) {
			TAILQ_REMOVE(&head->so_incomp, so, so_list);
			head->so_incqlen--;
		} else if (so->so_state & SS_COMP) {
			TAILQ_REMOVE(&head->so_comp, so, so_list);
		} else {
			panic("sofree: not queued");
		}
		head->so_qlen--;
		so->so_state &= ~(SS_INCOMP|SS_COMP);
		so->so_head = NULL;
	}
	/* Drop buffered send data, flush the receive side, then free. */
	sbrelease(&so->so_snd);
	sorflush(so);
	FREE(so, M_SOCKET);
}
168
169/*
170 * Close a socket on last file table reference removal.
171 * Initiate disconnect if connected.
172 * Free socket when disconnect complete.
173 */
int
soclose(so)
	register struct socket *so;
{
	int s = splnet();		/* conservative */
	int error = 0;

	/*
	 * A listening socket first aborts every pending connection,
	 * both incomplete and completed-but-not-yet-accepted ones.
	 */
	if (so->so_options & SO_ACCEPTCONN) {
		struct socket *sp, *sonext;

		for (sp = so->so_incomp.tqh_first; sp != NULL; sp = sonext) {
			sonext = sp->so_list.tqe_next;
			(void) soabort(sp);
		}
		for (sp = so->so_comp.tqh_first; sp != NULL; sp = sonext) {
			sonext = sp->so_list.tqe_next;
			(void) soabort(sp);
		}
	}
	if (so->so_pcb == 0)
		goto discard;
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			/*
			 * Non-blocking socket already disconnecting:
			 * don't wait for the linger interval.
			 */
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO))
				goto drop;
			/*
			 * Sleep (timeout bounded by so_linger) until the
			 * disconnect completes; a caught signal aborts
			 * the wait.
			 */
			while (so->so_state & SS_ISCONNECTED) {
				error = tsleep((caddr_t)&so->so_timeo,
				    PSOCK | PCATCH, "soclos", so->so_linger);
				if (error)
					break;
			}
		}
	}
drop:
	if (so->so_pcb) {
		/* Detach the protocol; preserve the first error seen. */
		int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
		if (error == 0)
			error = error2;
	}
discard:
	if (so->so_state & SS_NOFDREF)
		panic("soclose: NOFDREF");
	/* Dropping the last file reference lets sofree() reclaim it. */
	so->so_state |= SS_NOFDREF;
	sofree(so);
	splx(s);
	return (error);
}
227
228/*
229 * Must be called at splnet...
230 */
231int
232soabort(so)
233	struct socket *so;
234{
235
236	return (*so->so_proto->pr_usrreqs->pru_abort)(so);
237}
238
239int
240soaccept(so, nam)
241	register struct socket *so;
242	struct sockaddr **nam;
243{
244	int s = splnet();
245	int error;
246
247	if ((so->so_state & SS_NOFDREF) == 0)
248		panic("soaccept: !NOFDREF");
249	so->so_state &= ~SS_NOFDREF;
250	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
251	splx(s);
252	return (error);
253}
254
int
soconnect(so, nam, p)
	register struct socket *so;
	struct sockaddr *nam;
	struct proc *p;
{
	int s;
	int error;

	/* Listening sockets cannot initiate connections. */
	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	s = splnet();
	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 *
	 * NOTE(review): a sodisconnect() failure is folded into EISCONN
	 * here (the assignment in the condition is overwritten below);
	 * the specific disconnect error is deliberately not returned.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnect(so))))
		error = EISCONN;
	else
		error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, p);
	splx(s);
	return (error);
}
282
283int
284soconnect2(so1, so2)
285	register struct socket *so1;
286	struct socket *so2;
287{
288	int s = splnet();
289	int error;
290
291	error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
292	splx(s);
293	return (error);
294}
295
296int
297sodisconnect(so)
298	register struct socket *so;
299{
300	int s = splnet();
301	int error;
302
303	if ((so->so_state & SS_ISCONNECTED) == 0) {
304		error = ENOTCONN;
305		goto bad;
306	}
307	if (so->so_state & SS_ISDISCONNECTING) {
308		error = EALREADY;
309		goto bad;
310	}
311	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
312bad:
313	splx(s);
314	return (error);
315}
316
/*
 * Sockbuf-lock wait mode for sblock(): sleep for the lock unless the
 * caller passed MSG_DONTWAIT.
 */
#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
318/*
319 * Send on a socket.
320 * If send must go all at once and message is larger than
321 * send buffering, then hard error.
322 * Lock against other senders.
323 * If must go all at once and not enough room now, then
324 * inform user that this would block and do nothing.
325 * Otherwise, if nonblocking, send as much as possible.
326 * The data to be sent is described by "uio" if nonzero,
327 * otherwise by the mbuf chain "top" (which must be null
328 * if uio is not).  Data provided in mbuf chain must be small
329 * enough to send all at once.
330 *
331 * Returns nonzero on error, timeout or signal; callers
332 * must check for short counts if EINTR/ERESTART are returned.
333 * Data and control buffers are freed on return.
334 */
int
sosend(so, addr, uio, top, control, flags, p)
	register struct socket *so;
	struct sockaddr *addr;
	struct uio *uio;
	struct mbuf *top;
	struct mbuf *control;
	int flags;
	struct proc *p;
{
	struct mbuf **mp;
	register struct mbuf *m;
	register long space, len, resid;
	int clen = 0, error, s, dontroute, mlen;
	int atomic = sosendallatonce(so) || top;

	if (uio)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned.
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid.  On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
	 *
	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
	 * type sockets since that's an error.
	 */
	if (resid < 0 || so->so_type == SOCK_STREAM && (flags & MSG_EOR)) {
		error = EINVAL;
		goto out;
	}

	/* MSG_DONTROUTE is applied per-send only for atomic protocols. */
	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	if (p)
		p->p_stats->p_ru.ru_msgsnd++;
	if (control)
		clen = control->m_len;
/* Bail out of the locked region with the given errno. */
#define	snderr(errno)	{ error = errno; splx(s); goto release; }

restart:
	/* Serialize senders on the send buffer. */
	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out;
	do {
		s = splnet();
		if (so->so_state & SS_CANTSENDMORE)
			snderr(EPIPE);
		if (so->so_error) {
			/* Report and clear any pending asynchronous error. */
			error = so->so_error;
			so->so_error = 0;
			splx(s);
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			/*
			 * `sendto' and `sendmsg' is allowed on a connection-
			 * based socket if it supports implied connect.
			 * Return ENOTCONN if not connected and no address is
			 * supplied.
			 */
			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
				    !(resid == 0 && clen != 0))
					snderr(ENOTCONN);
			} else if (addr == 0)
			    snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
				   ENOTCONN : EDESTADDRREQ);
		}
		space = sbspace(&so->so_snd);
		/* Out-of-band data may slightly exceed the send buffer. */
		if (flags & MSG_OOB)
			space += 1024;
		if ((atomic && resid > so->so_snd.sb_hiwat) ||
		    clen > so->so_snd.sb_hiwat)
			snderr(EMSGSIZE);
		/* Not enough room now: either fail (non-blocking) or wait. */
		if (space < resid + clen && uio &&
		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
			if (so->so_state & SS_NBIO)
				snderr(EWOULDBLOCK);
			sbunlock(&so->so_snd);
			error = sbwait(&so->so_snd);
			splx(s);
			if (error)
				goto out;
			goto restart;
		}
		splx(s);
		mp = &top;
		space -= clen;
		do {
		    if (uio == NULL) {
			/*
			 * Data is prepackaged in "top".
			 */
			resid = 0;
			if (flags & MSG_EOR)
				top->m_flags |= M_EOR;
		    } else do {
			/* Build the chain: first mbuf carries the pkthdr. */
			if (top == 0) {
				MGETHDR(m, M_WAIT, MT_DATA);
				mlen = MHLEN;
				m->m_pkthdr.len = 0;
				m->m_pkthdr.rcvif = (struct ifnet *)0;
			} else {
				MGET(m, M_WAIT, MT_DATA);
				mlen = MLEN;
			}
			if (resid >= MINCLSIZE) {
				/* Large remainder: try an mbuf cluster. */
				MCLGET(m, M_WAIT);
				if ((m->m_flags & M_EXT) == 0)
					goto nopages;
				mlen = MCLBYTES;
				len = min(min(mlen, resid), space);
			} else {
nopages:
				len = min(min(mlen, resid), space);
				/*
				 * For datagram protocols, leave room
				 * for protocol headers in first mbuf.
				 */
				if (atomic && top == 0 && len < mlen)
					MH_ALIGN(m, len);
			}
			space -= len;
			error = uiomove(mtod(m, caddr_t), (int)len, uio);
			resid = uio->uio_resid;
			m->m_len = len;
			*mp = m;
			top->m_pkthdr.len += len;
			if (error)
				goto release;
			mp = &m->m_next;
			if (resid <= 0) {
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
				break;
			}
		    } while (space > 0 && atomic);
		    if (dontroute)
			    so->so_options |= SO_DONTROUTE;
		    s = splnet();				/* XXX */
		    error = (*so->so_proto->pr_usrreqs->pru_send)(so,
			(flags & MSG_OOB) ? PRUS_OOB :
			/*
			 * If the user set MSG_EOF, the protocol
			 * understands this flag and nothing left to
			 * send then use PRU_SEND_EOF instead of PRU_SEND.
			 */
			((flags & MSG_EOF) &&
			 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
			 (resid <= 0)) ?
				PRUS_EOF : 0,
			top, addr, control, p);
		    splx(s);
		    if (dontroute)
			    so->so_options &= ~SO_DONTROUTE;
		    /* The protocol now owns top and control. */
		    clen = 0;
		    control = 0;
		    top = 0;
		    mp = &top;
		    if (error)
			goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	sbunlock(&so->so_snd);
out:
	/* On error, free whatever was not handed to the protocol. */
	if (top)
		m_freem(top);
	if (control)
		m_freem(control);
	return (error);
}
514
515/*
516 * Implement receive operations on a socket.
517 * We depend on the way that records are added to the sockbuf
518 * by sbappend*.  In particular, each record (mbufs linked through m_next)
519 * must begin with an address if the protocol so specifies,
520 * followed by an optional mbuf or mbufs containing ancillary data,
521 * and then zero or more mbufs of data.
522 * In order to avoid blocking network interrupts for the entire time here,
523 * we splx() while doing the actual copy to user space.
524 * Although the sockbuf is locked, new data may still be appended,
525 * and thus we must maintain consistency of the sockbuf during that time.
526 *
527 * The caller may receive the data as a single mbuf chain by supplying
528 * an mbuf **mp0 for use in returning the chain.  The uio is then used
529 * only for the count in uio_resid.
530 */
int
soreceive(so, psa, uio, mp0, controlp, flagsp)
	register struct socket *so;
	struct sockaddr **psa;
	struct uio *uio;
	struct mbuf **mp0;
	struct mbuf **controlp;
	int *flagsp;
{
	register struct mbuf *m, **mp;
	register int flags, len, error, s, offset;
	struct protosw *pr = so->so_proto;
	struct mbuf *nextrecord;
	int moff, type = 0;
	int orig_resid = uio->uio_resid;

	mp = mp0;
	if (psa)
		*psa = 0;
	if (controlp)
		*controlp = 0;
	if (flagsp)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	/*
	 * Out-of-band data is fetched straight from the protocol and
	 * copied out here; it never goes through the receive buffer.
	 */
	if (flags & MSG_OOB) {
		m = m_get(M_WAIT, MT_DATA);
		error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
		if (error)
			goto bad;
		do {
			error = uiomove(mtod(m, caddr_t),
			    (int) min(uio->uio_resid, m->m_len), uio);
			m = m_free(m);
		} while (uio->uio_resid && error == 0 && m);
bad:
		if (m)
			m_freem(m);
		return (error);
	}
	if (mp)
		*mp = (struct mbuf *)0;
	/* Confirm an implied connection before trying to receive data. */
	if (so->so_state & SS_ISCONFIRMING && uio->uio_resid)
		(*pr->pr_usrreqs->pru_rcvd)(so, 0);

restart:
	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
	if (error)
		return (error);
	s = splnet();

	m = so->so_rcv.sb_mb;
	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark, or
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat).
	 *   3. MSG_DONTWAIT is not set
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
	 */
	if (m == 0 || (((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < uio->uio_resid) &&
	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
	    m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) {
#ifdef DIAGNOSTIC
		if (m == 0 && so->so_rcv.sb_cc)
			panic("receive 1");
#endif
		if (so->so_error) {
			/* Deliver buffered data before reporting the error. */
			if (m)
				goto dontblock;
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
			if (m)
				goto dontblock;
			else
				goto release;
		}
		/* A record boundary or OOB mark means data is deliverable. */
		for (; m; m = m->m_next)
			if (m->m_type == MT_OOBDATA  || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio->uio_resid == 0)
			goto release;
		if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT)) {
			error = EWOULDBLOCK;
			goto release;
		}
		/* Drop the sockbuf lock while sleeping for data. */
		sbunlock(&so->so_rcv);
		error = sbwait(&so->so_rcv);
		splx(s);
		if (error)
			return (error);
		goto restart;
	}
dontblock:
	if (uio->uio_procp)
		uio->uio_procp->p_stats->p_ru.ru_msgrcv++;
	nextrecord = m->m_nextpkt;
	/* Address-bearing protocols put an MT_SONAME mbuf first. */
	if (pr->pr_flags & PR_ADDR) {
#ifdef DIAGNOSTIC
		if (m->m_type != MT_SONAME)
			panic("receive 1a");
#endif
		orig_resid = 0;
		if (psa)
			*psa = dup_sockaddr(mtod(m, struct sockaddr *),
					    mp0 == 0);
		if (flags & MSG_PEEK) {
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			MFREE(m, so->so_rcv.sb_mb);
			m = so->so_rcv.sb_mb;
		}
	}
	/* Then zero or more MT_CONTROL mbufs of ancillary data. */
	while (m && m->m_type == MT_CONTROL && error == 0) {
		if (flags & MSG_PEEK) {
			if (controlp)
				*controlp = m_copy(m, 0, m->m_len);
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			if (controlp) {
				/* Passed access rights must be converted to
				 * their external (descriptor) form. */
				if (pr->pr_domain->dom_externalize &&
				    mtod(m, struct cmsghdr *)->cmsg_type ==
				    SCM_RIGHTS)
				   error = (*pr->pr_domain->dom_externalize)(m);
				*controlp = m;
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = 0;
				m = so->so_rcv.sb_mb;
			} else {
				MFREE(m, so->so_rcv.sb_mb);
				m = so->so_rcv.sb_mb;
			}
		}
		if (controlp) {
			orig_resid = 0;
			controlp = &(*controlp)->m_next;
		}
	}
	if (m) {
		if ((flags & MSG_PEEK) == 0)
			m->m_nextpkt = nextrecord;
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
	}
	moff = 0;
	offset = 0;
	/* Main copy loop: walk the data mbufs of the current record. */
	while (m && uio->uio_resid > 0 && error == 0) {
		/* Don't mix in-band and out-of-band data in one call. */
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
				break;
		} else if (type == MT_OOBDATA)
			break;
#ifdef DIAGNOSTIC
		else if (m->m_type != MT_DATA && m->m_type != MT_HEADER)
			panic("receive 3");
#endif
		so->so_state &= ~SS_RCVATMARK;
		len = uio->uio_resid;
		/* Don't read past the out-of-band mark. */
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * Sockbuf must be consistent here (points to current mbuf,
		 * it points to next record) when we drop priority;
		 * we must note any additions to the sockbuf when we
		 * block interrupts again.
		 */
		if (mp == 0) {
			splx(s);
			error = uiomove(mtod(m, caddr_t) + moff, (int)len, uio);
			s = splnet();
			if (error)
				goto release;
		} else
			uio->uio_resid -= len;
		if (len == m->m_len - moff) {
			/* Consumed the whole mbuf: advance (or unlink it). */
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(&so->so_rcv, m);
				if (mp) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = (struct mbuf *)0;
				} else {
					MFREE(m, so->so_rcv.sb_mb);
					m = so->so_rcv.sb_mb;
				}
				if (m)
					m->m_nextpkt = nextrecord;
			}
		} else {
			/* Partial mbuf: trim the consumed prefix in place. */
			if (flags & MSG_PEEK)
				moff += len;
			else {
				if (mp)
					*mp = m_copym(m, 0, len, M_WAIT);
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
			}
		}
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_state |= SS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket),
		 * we must not quit until "uio->uio_resid == 0" or an error
		 * termination.  If a signal/timeout occurs, return
		 * with a short count but without error.
		 * Keep sockbuf locked against other readers.
		 */
		while (flags & MSG_WAITALL && m == 0 && uio->uio_resid > 0 &&
		    !sosendallatonce(so) && !nextrecord) {
			if (so->so_error || so->so_state & SS_CANTRCVMORE)
				break;
			error = sbwait(&so->so_rcv);
			if (error) {
				/* Per the comment above: short count, no
				 * error, when interrupted here. */
				sbunlock(&so->so_rcv);
				splx(s);
				return (0);
			}
			m = so->so_rcv.sb_mb;
			if (m)
				nextrecord = m->m_nextpkt;
		}
	}

	/* Atomic protocols drop the unread tail of a record. */
	if (m && pr->pr_flags & PR_ATOMIC) {
		flags |= MSG_TRUNC;
		if ((flags & MSG_PEEK) == 0)
			(void) sbdroprecord(&so->so_rcv);
	}
	if ((flags & MSG_PEEK) == 0) {
		if (m == 0)
			so->so_rcv.sb_mb = nextrecord;
		/* Tell the protocol we consumed data (e.g. to open a
		 * flow-control window). */
		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
	}
	/* Nothing was transferred at all: try again from the top. */
	if (orig_resid == uio->uio_resid && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
		sbunlock(&so->so_rcv);
		splx(s);
		goto restart;
	}

	if (flagsp)
		*flagsp |= flags;
release:
	sbunlock(&so->so_rcv);
	splx(s);
	return (error);
}
823
824int
825soshutdown(so, how)
826	register struct socket *so;
827	register int how;
828{
829	register struct protosw *pr = so->so_proto;
830
831	how++;
832	if (how & FREAD)
833		sorflush(so);
834	if (how & FWRITE)
835		return ((*pr->pr_usrreqs->pru_shutdown)(so));
836	return (0);
837}
838
/*
 * Discard everything in a socket's receive buffer.  A snapshot of the
 * sockbuf is taken and the live one zeroed while interrupts are
 * blocked, so that disposal (including dom_dispose for protocols that
 * pass access rights) can run safely on the private copy afterwards.
 */
void
sorflush(so)
	register struct socket *so;
{
	register struct sockbuf *sb = &so->so_rcv;
	register struct protosw *pr = so->so_proto;
	register int s;
	struct sockbuf asb;

	/* Take the sockbuf lock uninterruptibly; we cannot fail here. */
	sb->sb_flags |= SB_NOINTR;
	(void) sblock(sb, M_WAITOK);
	s = splimp();
	socantrcvmore(so);
	sbunlock(sb);
	/* Snapshot the sockbuf, then clear the live one atomically
	 * with respect to interrupts. */
	asb = *sb;
	bzero((caddr_t)sb, sizeof (*sb));
	splx(s);
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
	sbrelease(&asb);
}
860
/*
 * Set a socket option.  Non-SOL_SOCKET levels are passed straight to
 * the protocol's ctloutput routine.  The option mbuf m0 is consumed:
 * freed here on the socket-level paths, or by the protocol otherwise.
 */
int
sosetopt(so, level, optname, m0, p)
	register struct socket *so;
	int level, optname;
	struct mbuf *m0;
	struct proc *p;
{
	int error = 0;
	register struct mbuf *m = m0;

	if (level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput)
			return ((*so->so_proto->pr_ctloutput)
				  (PRCO_SETOPT, so, level, optname, &m0, p));
		error = ENOPROTOOPT;
	} else {
		switch (optname) {

		case SO_LINGER:
			if (m == NULL || m->m_len != sizeof (struct linger)) {
				error = EINVAL;
				goto bad;
			}
			so->so_linger = mtod(m, struct linger *)->l_linger;
			/* fall thru... */
			/* (the int read below picks up l_onoff, the first
			 * member of struct linger) */

		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_DONTROUTE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				goto bad;
			}
			/* Boolean options map directly to so_options bits. */
			if (*mtod(m, int *))
				so->so_options |= optname;
			else
				so->so_options &= ~optname;
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
		    {
			int optval;

			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				goto bad;
			}

			/*
			 * Values < 1 make no sense for any of these
			 * options, so disallow them.
			 */
			optval = *mtod(m, int *);
			if (optval < 1) {
				error = EINVAL;
				goto bad;
			}

			switch (optname) {

			case SO_SNDBUF:
			case SO_RCVBUF:
				if (sbreserve(optname == SO_SNDBUF ?
				    &so->so_snd : &so->so_rcv,
				    (u_long) optval) == 0) {
					error = ENOBUFS;
					goto bad;
				}
				break;

			/*
			 * Make sure the low-water is never greater than
			 * the high-water.
			 */
			case SO_SNDLOWAT:
				so->so_snd.sb_lowat =
				    (optval > so->so_snd.sb_hiwat) ?
				    so->so_snd.sb_hiwat : optval;
				break;
			case SO_RCVLOWAT:
				so->so_rcv.sb_lowat =
				    (optval > so->so_rcv.sb_hiwat) ?
				    so->so_rcv.sb_hiwat : optval;
				break;
			}
			break;
		    }

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			struct timeval *tv;
			short val;

			if (m == NULL || m->m_len < sizeof (*tv)) {
				error = EINVAL;
				goto bad;
			}
			tv = mtod(m, struct timeval *);
			/* sb_timeo is a short tick count; reject values
			 * that would overflow it. */
			if (tv->tv_sec > SHRT_MAX / hz - hz) {
				error = EDOM;
				goto bad;
			}
			val = tv->tv_sec * hz + tv->tv_usec / tick;

			switch (optname) {

			case SO_SNDTIMEO:
				so->so_snd.sb_timeo = val;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.sb_timeo = val;
				break;
			}
			break;
		    }

		default:
			error = ENOPROTOOPT;
			break;
		}
		/* Give the protocol a chance to see socket-level options
		 * too (e.g. to mirror buffer sizes). */
		if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) {
			(void) ((*so->so_proto->pr_ctloutput)
				  (PRCO_SETOPT, so, level, optname, &m0, p));
			m = NULL;	/* freed by protocol */
		}
	}
bad:
	if (m)
		(void) m_free(m);
	return (error);
}
1002
/*
 * Get a socket option.  Non-SOL_SOCKET levels are handled by the
 * protocol's ctloutput routine.  For socket-level options a fresh
 * mbuf is allocated and returned to the caller through *mp.
 */
int
sogetopt(so, level, optname, mp, p)
	register struct socket *so;
	int level, optname;
	struct mbuf **mp;
	struct proc *p;
{
	register struct mbuf *m;

	if (level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput) {
			return ((*so->so_proto->pr_ctloutput)
				  (PRCO_GETOPT, so, level, optname, mp, p));
		} else
			return (ENOPROTOOPT);
	} else {
		m = m_get(M_WAIT, MT_SOOPTS);
		/* Most options return a single int; the exceptions below
		 * overwrite m_len themselves. */
		m->m_len = sizeof (int);

		switch (optname) {

		case SO_LINGER:
			m->m_len = sizeof (struct linger);
			mtod(m, struct linger *)->l_onoff =
				so->so_options & SO_LINGER;
			mtod(m, struct linger *)->l_linger = so->so_linger;
			break;

		case SO_USELOOPBACK:
		case SO_DONTROUTE:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
			/* Boolean options live as bits in so_options. */
			*mtod(m, int *) = so->so_options & optname;
			break;

		case SO_TYPE:
			*mtod(m, int *) = so->so_type;
			break;

		case SO_ERROR:
			/* Reading the pending error clears it. */
			*mtod(m, int *) = so->so_error;
			so->so_error = 0;
			break;

		case SO_SNDBUF:
			*mtod(m, int *) = so->so_snd.sb_hiwat;
			break;

		case SO_RCVBUF:
			*mtod(m, int *) = so->so_rcv.sb_hiwat;
			break;

		case SO_SNDLOWAT:
			*mtod(m, int *) = so->so_snd.sb_lowat;
			break;

		case SO_RCVLOWAT:
			*mtod(m, int *) = so->so_rcv.sb_lowat;
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			/* Convert the stored tick count back to a timeval. */
			int val = (optname == SO_SNDTIMEO ?
			     so->so_snd.sb_timeo : so->so_rcv.sb_timeo);

			m->m_len = sizeof(struct timeval);
			mtod(m, struct timeval *)->tv_sec = val / hz;
			mtod(m, struct timeval *)->tv_usec =
			    (val % hz) * tick;
			break;
		    }

		default:
			(void)m_free(m);
			return (ENOPROTOOPT);
		}
		*mp = m;
		return (0);
	}
}
1089
1090void
1091sohasoutofband(so)
1092	register struct socket *so;
1093{
1094	struct proc *p;
1095
1096	if (so->so_pgid < 0)
1097		gsignal(-so->so_pgid, SIGURG);
1098	else if (so->so_pgid > 0 && (p = pfind(so->so_pgid)) != 0)
1099		psignal(p, SIGURG);
1100	selwakeup(&so->so_rcv.sb_sel);
1101}
1102
1103int
1104sopoll(struct socket *so, int events, struct ucred *cred, struct proc *p)
1105{
1106	int revents = 0;
1107	int s = splnet();
1108
1109	if (events & (POLLIN | POLLRDNORM))
1110		if (soreadable(so))
1111			revents |= events & (POLLIN | POLLRDNORM);
1112
1113	if (events & (POLLOUT | POLLWRNORM))
1114		if (sowriteable(so))
1115			revents |= events & (POLLOUT | POLLWRNORM);
1116
1117	if (events & (POLLPRI | POLLRDBAND))
1118		if (so->so_oobmark || (so->so_state & SS_RCVATMARK))
1119			revents |= events & (POLLPRI | POLLRDBAND);
1120
1121	if (revents == 0) {
1122		if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
1123			selrecord(p, &so->so_rcv.sb_sel);
1124			so->so_rcv.sb_flags |= SB_SEL;
1125		}
1126
1127		if (events & (POLLOUT | POLLWRNORM)) {
1128			selrecord(p, &so->so_snd.sb_sel);
1129			so->so_snd.sb_flags |= SB_SEL;
1130		}
1131	}
1132
1133	splx(s);
1134	return (revents);
1135}
1136