uipc_socket.c revision 31053
1/*
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 3. All advertising materials mentioning features or use of this software
14 *    must display the following acknowledgement:
15 *	This product includes software developed by the University of
16 *	California, Berkeley and its contributors.
17 * 4. Neither the name of the University nor the names of its contributors
18 *    may be used to endorse or promote products derived from this software
19 *    without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 *
33 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
34 *	$Id: uipc_socket.c,v 1.33 1997/10/12 20:24:12 phk Exp $
35 */
36
37#include <sys/param.h>
38#include <sys/systm.h>
39#include <sys/proc.h>
40#include <sys/fcntl.h>
41#include <sys/malloc.h>
42#include <sys/mbuf.h>
43#include <sys/domain.h>
44#include <sys/kernel.h>
45#include <sys/poll.h>
46#include <sys/protosw.h>
47#include <sys/socket.h>
48#include <sys/socketvar.h>
49#include <sys/resourcevar.h>
50#include <sys/signalvar.h>
51#include <sys/sysctl.h>
52
53#include <machine/limits.h>
54
/* Malloc types for socket structures, socket names, and protocol PCBs. */
MALLOC_DEFINE(M_SOCKET, "socket", "socket structure");
MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

/* System-wide cap on listen(2) backlogs; tunable as kern.ipc.somaxconn. */
static int somaxconn = SOMAXCONN;
SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW, &somaxconn,
	   0, "");
62
63/*
64 * Socket operation routines.
65 * These routines are called by the routines in
66 * sys_socket.c or from a system process, and
67 * implement the semantics of socket operations by
68 * switching out to the protocol specific routines.
69 */
70/*ARGSUSED*/
71int
72socreate(dom, aso, type, proto, p)
73	int dom;
74	struct socket **aso;
75	register int type;
76	int proto;
77	struct proc *p;
78{
79	register struct protosw *prp;
80	register struct socket *so;
81	register int error;
82
83	if (proto)
84		prp = pffindproto(dom, proto, type);
85	else
86		prp = pffindtype(dom, type);
87	if (prp == 0 || prp->pr_usrreqs->pru_attach == 0)
88		return (EPROTONOSUPPORT);
89	if (prp->pr_type != type)
90		return (EPROTOTYPE);
91	MALLOC(so, struct socket *, sizeof(*so), M_SOCKET, M_WAIT);
92	bzero((caddr_t)so, sizeof(*so));
93	TAILQ_INIT(&so->so_incomp);
94	TAILQ_INIT(&so->so_comp);
95	so->so_type = type;
96	so->so_proto = prp;
97	error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
98	if (error) {
99		so->so_state |= SS_NOFDREF;
100		sofree(so);
101		return (error);
102	}
103	*aso = so;
104	return (0);
105}
106
107int
108sobind(so, nam, p)
109	struct socket *so;
110	struct sockaddr *nam;
111	struct proc *p;
112{
113	int s = splnet();
114	int error;
115
116	error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
117	splx(s);
118	return (error);
119}
120
121int
122solisten(so, backlog, p)
123	register struct socket *so;
124	int backlog;
125	struct proc *p;
126{
127	int s = splnet(), error;
128
129	error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
130	if (error) {
131		splx(s);
132		return (error);
133	}
134	if (so->so_comp.tqh_first == NULL)
135		so->so_options |= SO_ACCEPTCONN;
136	if (backlog < 0 || backlog > somaxconn)
137		backlog = somaxconn;
138	so->so_qlimit = backlog;
139	splx(s);
140	return (0);
141}
142
/*
 * Free a socket once it has neither a protocol control block nor a
 * file descriptor reference.  Unlinks it from its listening parent's
 * accept queues if queued, then releases both socket buffers.
 */
void
sofree(so)
	register struct socket *so;
{
	struct socket *head = so->so_head;

	/* Not yet freeable: still attached to a pcb or still has an fd. */
	if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0)
		return;
	if (head != NULL) {
		/* Remove from the incomplete or completed accept queue. */
		if (so->so_state & SS_INCOMP) {
			TAILQ_REMOVE(&head->so_incomp, so, so_list);
			head->so_incqlen--;
		} else if (so->so_state & SS_COMP) {
			TAILQ_REMOVE(&head->so_comp, so, so_list);
		} else {
			panic("sofree: not queued");
		}
		head->so_qlen--;
		so->so_state &= ~(SS_INCOMP|SS_COMP);
		so->so_head = NULL;
	}
	/* Drop buffered send data, flush the receive side, free the socket. */
	sbrelease(&so->so_snd);
	sorflush(so);
	FREE(so, M_SOCKET);
}
168
169/*
170 * Close a socket on last file table reference removal.
171 * Initiate disconnect if connected.
172 * Free socket when disconnect complete.
173 */
174int
175soclose(so)
176	register struct socket *so;
177{
178	int s = splnet();		/* conservative */
179	int error = 0;
180
181	if (so->so_options & SO_ACCEPTCONN) {
182		struct socket *sp, *sonext;
183
184		for (sp = so->so_incomp.tqh_first; sp != NULL; sp = sonext) {
185			sonext = sp->so_list.tqe_next;
186			(void) soabort(sp);
187		}
188		for (sp = so->so_comp.tqh_first; sp != NULL; sp = sonext) {
189			sonext = sp->so_list.tqe_next;
190			(void) soabort(sp);
191		}
192	}
193	if (so->so_pcb == 0)
194		goto discard;
195	if (so->so_state & SS_ISCONNECTED) {
196		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
197			error = sodisconnect(so);
198			if (error)
199				goto drop;
200		}
201		if (so->so_options & SO_LINGER) {
202			if ((so->so_state & SS_ISDISCONNECTING) &&
203			    (so->so_state & SS_NBIO))
204				goto drop;
205			while (so->so_state & SS_ISCONNECTED) {
206				error = tsleep((caddr_t)&so->so_timeo,
207				    PSOCK | PCATCH, "soclos", so->so_linger);
208				if (error)
209					break;
210			}
211		}
212	}
213drop:
214	if (so->so_pcb) {
215		int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
216		if (error == 0)
217			error = error2;
218	}
219discard:
220	if (so->so_state & SS_NOFDREF)
221		panic("soclose: NOFDREF");
222	so->so_state |= SS_NOFDREF;
223	sofree(so);
224	splx(s);
225	return (error);
226}
227
228/*
229 * Must be called at splnet...
230 */
231int
232soabort(so)
233	struct socket *so;
234{
235
236	return (*so->so_proto->pr_usrreqs->pru_abort)(so);
237}
238
239int
240soaccept(so, nam)
241	register struct socket *so;
242	struct sockaddr **nam;
243{
244	int s = splnet();
245	int error;
246
247	if ((so->so_state & SS_NOFDREF) == 0)
248		panic("soaccept: !NOFDREF");
249	so->so_state &= ~SS_NOFDREF;
250	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
251	splx(s);
252	return (error);
253}
254
/*
 * Initiate a connection to the given address.  Listening sockets may
 * not connect; connection-based sockets may only connect once.
 */
int
soconnect(so, nam, p)
	register struct socket *so;
	struct sockaddr *nam;
	struct proc *p;
{
	int s;
	int error;

	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	s = splnet();
	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 *
	 * Note the embedded assignment: sodisconnect() only runs (and
	 * only if it fails do we report EISCONN) for connectionless
	 * protocols that are already connected.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnect(so))))
		error = EISCONN;
	else
		error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, p);
	splx(s);
	return (error);
}
282
283int
284soconnect2(so1, so2)
285	register struct socket *so1;
286	struct socket *so2;
287{
288	int s = splnet();
289	int error;
290
291	error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
292	splx(s);
293	return (error);
294}
295
296int
297sodisconnect(so)
298	register struct socket *so;
299{
300	int s = splnet();
301	int error;
302
303	if ((so->so_state & SS_ISCONNECTED) == 0) {
304		error = ENOTCONN;
305		goto bad;
306	}
307	if (so->so_state & SS_ISDISCONNECTING) {
308		error = EALREADY;
309		goto bad;
310	}
311	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
312bad:
313	splx(s);
314	return (error);
315}
316
317#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
318/*
319 * Send on a socket.
320 * If send must go all at once and message is larger than
321 * send buffering, then hard error.
322 * Lock against other senders.
323 * If must go all at once and not enough room now, then
324 * inform user that this would block and do nothing.
325 * Otherwise, if nonblocking, send as much as possible.
326 * The data to be sent is described by "uio" if nonzero,
327 * otherwise by the mbuf chain "top" (which must be null
328 * if uio is not).  Data provided in mbuf chain must be small
329 * enough to send all at once.
330 *
331 * Returns nonzero on error, timeout or signal; callers
332 * must check for short counts if EINTR/ERESTART are returned.
333 * Data and control buffers are freed on return.
334 */
335int
336sosend(so, addr, uio, top, control, flags, p)
337	register struct socket *so;
338	struct sockaddr *addr;
339	struct uio *uio;
340	struct mbuf *top;
341	struct mbuf *control;
342	int flags;
343	struct proc *p;
344{
345	struct mbuf **mp;
346	register struct mbuf *m;
347	register long space, len, resid;
348	int clen = 0, error, s, dontroute, mlen;
349	int atomic = sosendallatonce(so) || top;
350
351	if (uio)
352		resid = uio->uio_resid;
353	else
354		resid = top->m_pkthdr.len;
355	/*
356	 * In theory resid should be unsigned.
357	 * However, space must be signed, as it might be less than 0
358	 * if we over-committed, and we must use a signed comparison
359	 * of space and resid.  On the other hand, a negative resid
360	 * causes us to loop sending 0-length segments to the protocol.
361	 *
362	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
363	 * type sockets since that's an error.
364	 */
365	if (resid < 0 || so->so_type == SOCK_STREAM && (flags & MSG_EOR)) {
366		error = EINVAL;
367		goto out;
368	}
369
370	dontroute =
371	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
372	    (so->so_proto->pr_flags & PR_ATOMIC);
373	if (p)
374		p->p_stats->p_ru.ru_msgsnd++;
375	if (control)
376		clen = control->m_len;
377#define	snderr(errno)	{ error = errno; splx(s); goto release; }
378
379restart:
380	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
381	if (error)
382		goto out;
383	do {
384		s = splnet();
385		if (so->so_state & SS_CANTSENDMORE)
386			snderr(EPIPE);
387		if (so->so_error)
388			snderr(so->so_error);
389		if ((so->so_state & SS_ISCONNECTED) == 0) {
390			/*
391			 * `sendto' and `sendmsg' is allowed on a connection-
392			 * based socket if it supports implied connect.
393			 * Return ENOTCONN if not connected and no address is
394			 * supplied.
395			 */
396			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
397			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
398				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
399				    !(resid == 0 && clen != 0))
400					snderr(ENOTCONN);
401			} else if (addr == 0)
402			    snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
403				   ENOTCONN : EDESTADDRREQ);
404		}
405		space = sbspace(&so->so_snd);
406		if (flags & MSG_OOB)
407			space += 1024;
408		if ((atomic && resid > so->so_snd.sb_hiwat) ||
409		    clen > so->so_snd.sb_hiwat)
410			snderr(EMSGSIZE);
411		if (space < resid + clen && uio &&
412		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
413			if (so->so_state & SS_NBIO)
414				snderr(EWOULDBLOCK);
415			sbunlock(&so->so_snd);
416			error = sbwait(&so->so_snd);
417			splx(s);
418			if (error)
419				goto out;
420			goto restart;
421		}
422		splx(s);
423		mp = &top;
424		space -= clen;
425		do {
426		    if (uio == NULL) {
427			/*
428			 * Data is prepackaged in "top".
429			 */
430			resid = 0;
431			if (flags & MSG_EOR)
432				top->m_flags |= M_EOR;
433		    } else do {
434			if (top == 0) {
435				MGETHDR(m, M_WAIT, MT_DATA);
436				mlen = MHLEN;
437				m->m_pkthdr.len = 0;
438				m->m_pkthdr.rcvif = (struct ifnet *)0;
439			} else {
440				MGET(m, M_WAIT, MT_DATA);
441				mlen = MLEN;
442			}
443			if (resid >= MINCLSIZE) {
444				MCLGET(m, M_WAIT);
445				if ((m->m_flags & M_EXT) == 0)
446					goto nopages;
447				mlen = MCLBYTES;
448				len = min(min(mlen, resid), space);
449			} else {
450nopages:
451				len = min(min(mlen, resid), space);
452				/*
453				 * For datagram protocols, leave room
454				 * for protocol headers in first mbuf.
455				 */
456				if (atomic && top == 0 && len < mlen)
457					MH_ALIGN(m, len);
458			}
459			space -= len;
460			error = uiomove(mtod(m, caddr_t), (int)len, uio);
461			resid = uio->uio_resid;
462			m->m_len = len;
463			*mp = m;
464			top->m_pkthdr.len += len;
465			if (error)
466				goto release;
467			mp = &m->m_next;
468			if (resid <= 0) {
469				if (flags & MSG_EOR)
470					top->m_flags |= M_EOR;
471				break;
472			}
473		    } while (space > 0 && atomic);
474		    if (dontroute)
475			    so->so_options |= SO_DONTROUTE;
476		    s = splnet();				/* XXX */
477		    error = (*so->so_proto->pr_usrreqs->pru_send)(so,
478			(flags & MSG_OOB) ? PRUS_OOB :
479			/*
480			 * If the user set MSG_EOF, the protocol
481			 * understands this flag and nothing left to
482			 * send then use PRU_SEND_EOF instead of PRU_SEND.
483			 */
484			((flags & MSG_EOF) &&
485			 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
486			 (resid <= 0)) ?
487				PRUS_EOF : 0,
488			top, addr, control, p);
489		    splx(s);
490		    if (dontroute)
491			    so->so_options &= ~SO_DONTROUTE;
492		    clen = 0;
493		    control = 0;
494		    top = 0;
495		    mp = &top;
496		    if (error)
497			goto release;
498		} while (resid && space > 0);
499	} while (resid);
500
501release:
502	sbunlock(&so->so_snd);
503out:
504	if (top)
505		m_freem(top);
506	if (control)
507		m_freem(control);
508	return (error);
509}
510
511/*
512 * Implement receive operations on a socket.
513 * We depend on the way that records are added to the sockbuf
514 * by sbappend*.  In particular, each record (mbufs linked through m_next)
515 * must begin with an address if the protocol so specifies,
516 * followed by an optional mbuf or mbufs containing ancillary data,
517 * and then zero or more mbufs of data.
518 * In order to avoid blocking network interrupts for the entire time here,
519 * we splx() while doing the actual copy to user space.
520 * Although the sockbuf is locked, new data may still be appended,
521 * and thus we must maintain consistency of the sockbuf during that time.
522 *
523 * The caller may receive the data as a single mbuf chain by supplying
524 * an mbuf **mp0 for use in returning the chain.  The uio is then used
525 * only for the count in uio_resid.
526 */
527int
528soreceive(so, psa, uio, mp0, controlp, flagsp)
529	register struct socket *so;
530	struct sockaddr **psa;
531	struct uio *uio;
532	struct mbuf **mp0;
533	struct mbuf **controlp;
534	int *flagsp;
535{
536	register struct mbuf *m, **mp;
537	register int flags, len, error, s, offset;
538	struct protosw *pr = so->so_proto;
539	struct mbuf *nextrecord;
540	int moff, type = 0;
541	int orig_resid = uio->uio_resid;
542
543	mp = mp0;
544	if (psa)
545		*psa = 0;
546	if (controlp)
547		*controlp = 0;
548	if (flagsp)
549		flags = *flagsp &~ MSG_EOR;
550	else
551		flags = 0;
552	if (flags & MSG_OOB) {
553		m = m_get(M_WAIT, MT_DATA);
554		error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
555		if (error)
556			goto bad;
557		do {
558			error = uiomove(mtod(m, caddr_t),
559			    (int) min(uio->uio_resid, m->m_len), uio);
560			m = m_free(m);
561		} while (uio->uio_resid && error == 0 && m);
562bad:
563		if (m)
564			m_freem(m);
565		return (error);
566	}
567	if (mp)
568		*mp = (struct mbuf *)0;
569	if (so->so_state & SS_ISCONFIRMING && uio->uio_resid)
570		(*pr->pr_usrreqs->pru_rcvd)(so, 0);
571
572restart:
573	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
574	if (error)
575		return (error);
576	s = splnet();
577
578	m = so->so_rcv.sb_mb;
579	/*
580	 * If we have less data than requested, block awaiting more
581	 * (subject to any timeout) if:
582	 *   1. the current count is less than the low water mark, or
583	 *   2. MSG_WAITALL is set, and it is possible to do the entire
584	 *	receive operation at once if we block (resid <= hiwat).
585	 *   3. MSG_DONTWAIT is not set
586	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
587	 * we have to do the receive in sections, and thus risk returning
588	 * a short count if a timeout or signal occurs after we start.
589	 */
590	if (m == 0 || (((flags & MSG_DONTWAIT) == 0 &&
591	    so->so_rcv.sb_cc < uio->uio_resid) &&
592	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
593	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
594	    m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) {
595#ifdef DIAGNOSTIC
596		if (m == 0 && so->so_rcv.sb_cc)
597			panic("receive 1");
598#endif
599		if (so->so_error) {
600			if (m)
601				goto dontblock;
602			error = so->so_error;
603			if ((flags & MSG_PEEK) == 0)
604				so->so_error = 0;
605			goto release;
606		}
607		if (so->so_state & SS_CANTRCVMORE) {
608			if (m)
609				goto dontblock;
610			else
611				goto release;
612		}
613		for (; m; m = m->m_next)
614			if (m->m_type == MT_OOBDATA  || (m->m_flags & M_EOR)) {
615				m = so->so_rcv.sb_mb;
616				goto dontblock;
617			}
618		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
619		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
620			error = ENOTCONN;
621			goto release;
622		}
623		if (uio->uio_resid == 0)
624			goto release;
625		if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT)) {
626			error = EWOULDBLOCK;
627			goto release;
628		}
629		sbunlock(&so->so_rcv);
630		error = sbwait(&so->so_rcv);
631		splx(s);
632		if (error)
633			return (error);
634		goto restart;
635	}
636dontblock:
637	if (uio->uio_procp)
638		uio->uio_procp->p_stats->p_ru.ru_msgrcv++;
639	nextrecord = m->m_nextpkt;
640	if (pr->pr_flags & PR_ADDR) {
641#ifdef DIAGNOSTIC
642		if (m->m_type != MT_SONAME)
643			panic("receive 1a");
644#endif
645		orig_resid = 0;
646		if (psa)
647			*psa = dup_sockaddr(mtod(m, struct sockaddr *),
648					    mp0 == 0);
649		if (flags & MSG_PEEK) {
650			m = m->m_next;
651		} else {
652			sbfree(&so->so_rcv, m);
653			MFREE(m, so->so_rcv.sb_mb);
654			m = so->so_rcv.sb_mb;
655		}
656	}
657	while (m && m->m_type == MT_CONTROL && error == 0) {
658		if (flags & MSG_PEEK) {
659			if (controlp)
660				*controlp = m_copy(m, 0, m->m_len);
661			m = m->m_next;
662		} else {
663			sbfree(&so->so_rcv, m);
664			if (controlp) {
665				if (pr->pr_domain->dom_externalize &&
666				    mtod(m, struct cmsghdr *)->cmsg_type ==
667				    SCM_RIGHTS)
668				   error = (*pr->pr_domain->dom_externalize)(m);
669				*controlp = m;
670				so->so_rcv.sb_mb = m->m_next;
671				m->m_next = 0;
672				m = so->so_rcv.sb_mb;
673			} else {
674				MFREE(m, so->so_rcv.sb_mb);
675				m = so->so_rcv.sb_mb;
676			}
677		}
678		if (controlp) {
679			orig_resid = 0;
680			controlp = &(*controlp)->m_next;
681		}
682	}
683	if (m) {
684		if ((flags & MSG_PEEK) == 0)
685			m->m_nextpkt = nextrecord;
686		type = m->m_type;
687		if (type == MT_OOBDATA)
688			flags |= MSG_OOB;
689	}
690	moff = 0;
691	offset = 0;
692	while (m && uio->uio_resid > 0 && error == 0) {
693		if (m->m_type == MT_OOBDATA) {
694			if (type != MT_OOBDATA)
695				break;
696		} else if (type == MT_OOBDATA)
697			break;
698#ifdef DIAGNOSTIC
699		else if (m->m_type != MT_DATA && m->m_type != MT_HEADER)
700			panic("receive 3");
701#endif
702		so->so_state &= ~SS_RCVATMARK;
703		len = uio->uio_resid;
704		if (so->so_oobmark && len > so->so_oobmark - offset)
705			len = so->so_oobmark - offset;
706		if (len > m->m_len - moff)
707			len = m->m_len - moff;
708		/*
709		 * If mp is set, just pass back the mbufs.
710		 * Otherwise copy them out via the uio, then free.
711		 * Sockbuf must be consistent here (points to current mbuf,
712		 * it points to next record) when we drop priority;
713		 * we must note any additions to the sockbuf when we
714		 * block interrupts again.
715		 */
716		if (mp == 0) {
717			splx(s);
718			error = uiomove(mtod(m, caddr_t) + moff, (int)len, uio);
719			s = splnet();
720			if (error)
721				goto release;
722		} else
723			uio->uio_resid -= len;
724		if (len == m->m_len - moff) {
725			if (m->m_flags & M_EOR)
726				flags |= MSG_EOR;
727			if (flags & MSG_PEEK) {
728				m = m->m_next;
729				moff = 0;
730			} else {
731				nextrecord = m->m_nextpkt;
732				sbfree(&so->so_rcv, m);
733				if (mp) {
734					*mp = m;
735					mp = &m->m_next;
736					so->so_rcv.sb_mb = m = m->m_next;
737					*mp = (struct mbuf *)0;
738				} else {
739					MFREE(m, so->so_rcv.sb_mb);
740					m = so->so_rcv.sb_mb;
741				}
742				if (m)
743					m->m_nextpkt = nextrecord;
744			}
745		} else {
746			if (flags & MSG_PEEK)
747				moff += len;
748			else {
749				if (mp)
750					*mp = m_copym(m, 0, len, M_WAIT);
751				m->m_data += len;
752				m->m_len -= len;
753				so->so_rcv.sb_cc -= len;
754			}
755		}
756		if (so->so_oobmark) {
757			if ((flags & MSG_PEEK) == 0) {
758				so->so_oobmark -= len;
759				if (so->so_oobmark == 0) {
760					so->so_state |= SS_RCVATMARK;
761					break;
762				}
763			} else {
764				offset += len;
765				if (offset == so->so_oobmark)
766					break;
767			}
768		}
769		if (flags & MSG_EOR)
770			break;
771		/*
772		 * If the MSG_WAITALL flag is set (for non-atomic socket),
773		 * we must not quit until "uio->uio_resid == 0" or an error
774		 * termination.  If a signal/timeout occurs, return
775		 * with a short count but without error.
776		 * Keep sockbuf locked against other readers.
777		 */
778		while (flags & MSG_WAITALL && m == 0 && uio->uio_resid > 0 &&
779		    !sosendallatonce(so) && !nextrecord) {
780			if (so->so_error || so->so_state & SS_CANTRCVMORE)
781				break;
782			error = sbwait(&so->so_rcv);
783			if (error) {
784				sbunlock(&so->so_rcv);
785				splx(s);
786				return (0);
787			}
788			m = so->so_rcv.sb_mb;
789			if (m)
790				nextrecord = m->m_nextpkt;
791		}
792	}
793
794	if (m && pr->pr_flags & PR_ATOMIC) {
795		flags |= MSG_TRUNC;
796		if ((flags & MSG_PEEK) == 0)
797			(void) sbdroprecord(&so->so_rcv);
798	}
799	if ((flags & MSG_PEEK) == 0) {
800		if (m == 0)
801			so->so_rcv.sb_mb = nextrecord;
802		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
803			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
804	}
805	if (orig_resid == uio->uio_resid && orig_resid &&
806	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
807		sbunlock(&so->so_rcv);
808		splx(s);
809		goto restart;
810	}
811
812	if (flagsp)
813		*flagsp |= flags;
814release:
815	sbunlock(&so->so_rcv);
816	splx(s);
817	return (error);
818}
819
/*
 * Shut down part of a full-duplex connection.
 * Historic trick: incrementing `how' maps the 0/1/2 shutdown(2)
 * argument onto a bitmask tested against FREAD and FWRITE.
 */
int
soshutdown(so, how)
	register struct socket *so;
	register int how;
{
	register struct protosw *pr = so->so_proto;

	how++;
	if (how & FREAD)
		sorflush(so);
	if (how & FWRITE)
		return ((*pr->pr_usrreqs->pru_shutdown)(so));
	return (0);
}
834
/*
 * Flush and discard everything queued on a socket's receive side.
 */
void
sorflush(so)
	register struct socket *so;
{
	register struct sockbuf *sb = &so->so_rcv;
	register struct protosw *pr = so->so_proto;
	register int s;
	struct sockbuf asb;

	/* Take the buffer lock without allowing signal interruption. */
	sb->sb_flags |= SB_NOINTR;
	(void) sblock(sb, M_WAITOK);
	s = splimp();
	socantrcvmore(so);
	sbunlock(sb);
	/*
	 * Snapshot the sockbuf, then zero the original in place so any
	 * concurrent input sees an empty buffer; the snapshot is
	 * disposed of below after interrupts are re-enabled.
	 */
	asb = *sb;
	bzero((caddr_t)sb, sizeof (*sb));
	splx(s);
	/* Let the domain reclaim in-flight access rights (e.g. unix fds). */
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
	sbrelease(&asb);
}
856
/*
 * Set a socket option.  Options at a level other than SOL_SOCKET are
 * passed to the protocol; socket-level options are handled here and
 * then also offered to the protocol for its own bookkeeping.
 * Consumes m0 in all cases.
 */
int
sosetopt(so, level, optname, m0, p)
	register struct socket *so;
	int level, optname;
	struct mbuf *m0;
	struct proc *p;
{
	int error = 0;
	register struct mbuf *m = m0;

	if (level != SOL_SOCKET) {
		/* Non-socket-level options belong to the protocol. */
		if (so->so_proto && so->so_proto->pr_ctloutput)
			return ((*so->so_proto->pr_ctloutput)
				  (PRCO_SETOPT, so, level, optname, &m0, p));
		error = ENOPROTOOPT;
	} else {
		switch (optname) {

		case SO_LINGER:
			if (m == NULL || m->m_len != sizeof (struct linger)) {
				error = EINVAL;
				goto bad;
			}
			so->so_linger = mtod(m, struct linger *)->l_linger;
			/*
			 * fall thru... the boolean code below reads the int
			 * at the front of the mbuf, which for SO_LINGER is
			 * l_onoff (first member of struct linger).
			 */

		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_DONTROUTE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
			/* Simple boolean options stored in so_options. */
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				goto bad;
			}
			if (*mtod(m, int *))
				so->so_options |= optname;
			else
				so->so_options &= ~optname;
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
		    {
			int optval;

			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				goto bad;
			}

			/*
			 * Values < 1 make no sense for any of these
			 * options, so disallow them.
			 */
			optval = *mtod(m, int *);
			if (optval < 1) {
				error = EINVAL;
				goto bad;
			}

			switch (optname) {

			case SO_SNDBUF:
			case SO_RCVBUF:
				if (sbreserve(optname == SO_SNDBUF ?
				    &so->so_snd : &so->so_rcv,
				    (u_long) optval) == 0) {
					error = ENOBUFS;
					goto bad;
				}
				break;

			/*
			 * Make sure the low-water is never greater than
			 * the high-water.
			 */
			case SO_SNDLOWAT:
				so->so_snd.sb_lowat =
				    (optval > so->so_snd.sb_hiwat) ?
				    so->so_snd.sb_hiwat : optval;
				break;
			case SO_RCVLOWAT:
				so->so_rcv.sb_lowat =
				    (optval > so->so_rcv.sb_hiwat) ?
				    so->so_rcv.sb_hiwat : optval;
				break;
			}
			break;
		    }

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			struct timeval *tv;
			short val;

			if (m == NULL || m->m_len < sizeof (*tv)) {
				error = EINVAL;
				goto bad;
			}
			tv = mtod(m, struct timeval *);
			/* Reject timeouts that won't fit in a short of ticks. */
			if (tv->tv_sec > SHRT_MAX / hz - hz) {
				error = EDOM;
				goto bad;
			}
			/* Convert seconds/microseconds to clock ticks. */
			val = tv->tv_sec * hz + tv->tv_usec / tick;

			switch (optname) {

			case SO_SNDTIMEO:
				so->so_snd.sb_timeo = val;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.sb_timeo = val;
				break;
			}
			break;
		    }

		default:
			error = ENOPROTOOPT;
			break;
		}
		/* Give the protocol a look at successfully-set options too. */
		if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) {
			(void) ((*so->so_proto->pr_ctloutput)
				  (PRCO_SETOPT, so, level, optname, &m0, p));
			m = NULL;	/* freed by protocol */
		}
	}
bad:
	if (m)
		(void) m_free(m);
	return (error);
}
998
/*
 * Get a socket option.  Non-socket-level options are fetched from the
 * protocol; socket-level options are answered here from the socket
 * structure.  On success a freshly allocated mbuf holding the value
 * is returned in *mp (caller frees).
 */
int
sogetopt(so, level, optname, mp, p)
	register struct socket *so;
	int level, optname;
	struct mbuf **mp;
	struct proc *p;
{
	register struct mbuf *m;

	if (level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput) {
			return ((*so->so_proto->pr_ctloutput)
				  (PRCO_GETOPT, so, level, optname, mp, p));
		} else
			return (ENOPROTOOPT);
	} else {
		/* Most answers are a single int; adjust m_len otherwise. */
		m = m_get(M_WAIT, MT_SOOPTS);
		m->m_len = sizeof (int);

		switch (optname) {

		case SO_LINGER:
			m->m_len = sizeof (struct linger);
			mtod(m, struct linger *)->l_onoff =
				so->so_options & SO_LINGER;
			mtod(m, struct linger *)->l_linger = so->so_linger;
			break;

		case SO_USELOOPBACK:
		case SO_DONTROUTE:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
			/* Boolean options are just the bit from so_options. */
			*mtod(m, int *) = so->so_options & optname;
			break;

		case SO_TYPE:
			*mtod(m, int *) = so->so_type;
			break;

		case SO_ERROR:
			/* Reading the error clears it. */
			*mtod(m, int *) = so->so_error;
			so->so_error = 0;
			break;

		case SO_SNDBUF:
			*mtod(m, int *) = so->so_snd.sb_hiwat;
			break;

		case SO_RCVBUF:
			*mtod(m, int *) = so->so_rcv.sb_hiwat;
			break;

		case SO_SNDLOWAT:
			*mtod(m, int *) = so->so_snd.sb_lowat;
			break;

		case SO_RCVLOWAT:
			*mtod(m, int *) = so->so_rcv.sb_lowat;
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			int val = (optname == SO_SNDTIMEO ?
			     so->so_snd.sb_timeo : so->so_rcv.sb_timeo);

			/* Convert ticks back to a timeval for the caller. */
			m->m_len = sizeof(struct timeval);
			mtod(m, struct timeval *)->tv_sec = val / hz;
			mtod(m, struct timeval *)->tv_usec =
			    (val % hz) * tick;
			break;
		    }

		default:
			(void)m_free(m);
			return (ENOPROTOOPT);
		}
		*mp = m;
		return (0);
	}
}
1085
1086void
1087sohasoutofband(so)
1088	register struct socket *so;
1089{
1090	struct proc *p;
1091
1092	if (so->so_pgid < 0)
1093		gsignal(-so->so_pgid, SIGURG);
1094	else if (so->so_pgid > 0 && (p = pfind(so->so_pgid)) != 0)
1095		psignal(p, SIGURG);
1096	selwakeup(&so->so_rcv.sb_sel);
1097}
1098
1099int
1100sopoll(struct socket *so, int events, struct ucred *cred, struct proc *p)
1101{
1102	int revents = 0;
1103	int s = splnet();
1104
1105	if (events & (POLLIN | POLLRDNORM))
1106		if (soreadable(so))
1107			revents |= events & (POLLIN | POLLRDNORM);
1108
1109	if (events & (POLLOUT | POLLWRNORM))
1110		if (sowriteable(so))
1111			revents |= events & (POLLOUT | POLLWRNORM);
1112
1113	if (events & (POLLPRI | POLLRDBAND))
1114		if (so->so_oobmark || (so->so_state & SS_RCVATMARK))
1115			revents |= events & (POLLPRI | POLLRDBAND);
1116
1117	if (revents == 0) {
1118		if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
1119			selrecord(p, &so->so_rcv.sb_sel);
1120			so->so_rcv.sb_flags |= SB_SEL;
1121		}
1122
1123		if (events & (POLLOUT | POLLWRNORM)) {
1124			selrecord(p, &so->so_snd.sb_sel);
1125			so->so_snd.sb_flags |= SB_SEL;
1126		}
1127	}
1128
1129	splx(s);
1130	return (revents);
1131}
1132