uipc_socket.c revision 28551
1/*
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 3. All advertising materials mentioning features or use of this software
14 *    must display the following acknowledgement:
15 *	This product includes software developed by the University of
16 *	California, Berkeley and its contributors.
17 * 4. Neither the name of the University nor the names of its contributors
18 *    may be used to endorse or promote products derived from this software
19 *    without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 *
33 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
34 *	$Id: uipc_socket.c,v 1.28 1997/08/16 19:15:04 wollman Exp $
35 */
36
37#include <sys/param.h>
38#include <sys/queue.h>
39#include <sys/systm.h>
40#include <sys/proc.h>
41#include <sys/fcntl.h>
42#include <sys/malloc.h>
43#include <sys/mbuf.h>
44#include <sys/domain.h>
45#include <sys/kernel.h>
46#include <sys/protosw.h>
47#include <sys/socket.h>
48#include <sys/socketvar.h>
49#include <sys/resourcevar.h>
50#include <sys/signalvar.h>
51#include <sys/sysctl.h>
52
53#include <machine/limits.h>
54
/*
 * Upper bound on the listen(2) backlog a socket may request; solisten()
 * clamps negative or over-large backlogs to this value.  Exported as a
 * read-write sysctl so it can be tuned at runtime.
 */
static int somaxconn = SOMAXCONN;
SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW, &somaxconn,
	   0, "");
58
59/*
60 * Socket operation routines.
61 * These routines are called by the routines in
62 * sys_socket.c or from a system process, and
63 * implement the semantics of socket operations by
64 * switching out to the protocol specific routines.
65 */
/*
 * Create a new socket in domain `dom' of the given `type'.  The protocol
 * switch entry is located by protocol number when `proto' is non-zero,
 * otherwise by socket type.  On success the newly allocated, attached
 * socket is returned through *aso.
 *
 * Returns 0 on success, EPROTONOSUPPORT if no matching protocol with an
 * attach routine exists, EPROTOTYPE if the protocol's type disagrees with
 * the requested type, or the error returned by the protocol's attach.
 */
/*ARGSUSED*/
int
socreate(dom, aso, type, proto, p)
	int dom;
	struct socket **aso;
	register int type;
	int proto;
	struct proc *p;
{
	register struct protosw *prp;
	register struct socket *so;
	register int error;

	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);
	if (prp == 0 || prp->pr_usrreqs->pru_attach == 0)
		return (EPROTONOSUPPORT);
	if (prp->pr_type != type)
		return (EPROTOTYPE);
	MALLOC(so, struct socket *, sizeof(*so), M_SOCKET, M_WAIT);
	bzero((caddr_t)so, sizeof(*so));
	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = type;
	so->so_proto = prp;
	error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
	if (error) {
		/*
		 * Attach failed: mark the socket as having no file
		 * reference so that sofree() will actually release it.
		 */
		so->so_state |= SS_NOFDREF;
		sofree(so);
		return (error);
	}
	*aso = so;
	return (0);
}
102
103int
104sobind(so, nam, p)
105	struct socket *so;
106	struct sockaddr *nam;
107	struct proc *p;
108{
109	int s = splnet();
110	int error;
111
112	error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
113	splx(s);
114	return (error);
115}
116
117int
118solisten(so, backlog, p)
119	register struct socket *so;
120	int backlog;
121	struct proc *p;
122{
123	int s = splnet(), error;
124
125	error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
126	if (error) {
127		splx(s);
128		return (error);
129	}
130	if (so->so_comp.tqh_first == NULL)
131		so->so_options |= SO_ACCEPTCONN;
132	if (backlog < 0 || backlog > somaxconn)
133		backlog = somaxconn;
134	so->so_qlimit = backlog;
135	splx(s);
136	return (0);
137}
138
/*
 * Release a socket's storage once it is both detached from its protocol
 * (no PCB) and holds no file-descriptor reference.  If either condition
 * is not yet met this is a no-op; whoever drops the last reference will
 * call here again.
 */
void
sofree(so)
	register struct socket *so;
{
	struct socket *head = so->so_head;

	/* Still attached to a pcb, or still referenced by a file: not yet. */
	if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0)
		return;
	if (head != NULL) {
		/*
		 * Unlink from the listening socket's accept queue
		 * (incomplete or completed, per the state flag).
		 */
		if (so->so_state & SS_INCOMP) {
			TAILQ_REMOVE(&head->so_incomp, so, so_list);
			head->so_incqlen--;
		} else if (so->so_state & SS_COMP) {
			TAILQ_REMOVE(&head->so_comp, so, so_list);
		} else {
			panic("sofree: not queued");
		}
		head->so_qlen--;
		so->so_state &= ~(SS_INCOMP|SS_COMP);
		so->so_head = NULL;
	}
	/* Drop buffered send data, then flush/dispose the receive side. */
	sbrelease(&so->so_snd);
	sorflush(so);
	FREE(so, M_SOCKET);
}
164
165/*
166 * Close a socket on last file table reference removal.
167 * Initiate disconnect if connected.
168 * Free socket when disconnect complete.
169 */
170int
171soclose(so)
172	register struct socket *so;
173{
174	int s = splnet();		/* conservative */
175	int error = 0;
176
177	if (so->so_options & SO_ACCEPTCONN) {
178		struct socket *sp, *sonext;
179
180		for (sp = so->so_incomp.tqh_first; sp != NULL; sp = sonext) {
181			sonext = sp->so_list.tqe_next;
182			(void) soabort(sp);
183		}
184		for (sp = so->so_comp.tqh_first; sp != NULL; sp = sonext) {
185			sonext = sp->so_list.tqe_next;
186			(void) soabort(sp);
187		}
188	}
189	if (so->so_pcb == 0)
190		goto discard;
191	if (so->so_state & SS_ISCONNECTED) {
192		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
193			error = sodisconnect(so);
194			if (error)
195				goto drop;
196		}
197		if (so->so_options & SO_LINGER) {
198			if ((so->so_state & SS_ISDISCONNECTING) &&
199			    (so->so_state & SS_NBIO))
200				goto drop;
201			while (so->so_state & SS_ISCONNECTED) {
202				error = tsleep((caddr_t)&so->so_timeo,
203				    PSOCK | PCATCH, "soclos", so->so_linger);
204				if (error)
205					break;
206			}
207		}
208	}
209drop:
210	if (so->so_pcb) {
211		int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
212		if (error == 0)
213			error = error2;
214	}
215discard:
216	if (so->so_state & SS_NOFDREF)
217		panic("soclose: NOFDREF");
218	so->so_state |= SS_NOFDREF;
219	sofree(so);
220	splx(s);
221	return (error);
222}
223
224/*
225 * Must be called at splnet...
226 */
227int
228soabort(so)
229	struct socket *so;
230{
231
232	return (*so->so_proto->pr_usrreqs->pru_abort)(so);
233}
234
235int
236soaccept(so, nam)
237	register struct socket *so;
238	struct sockaddr **nam;
239{
240	int s = splnet();
241	int error;
242
243	if ((so->so_state & SS_NOFDREF) == 0)
244		panic("soaccept: !NOFDREF");
245	so->so_state &= ~SS_NOFDREF;
246	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
247	splx(s);
248	return (error);
249}
250
/*
 * Initiate a connection to the given address.  Refused outright on a
 * listening socket (EOPNOTSUPP).
 */
int
soconnect(so, nam, p)
	register struct socket *so;
	struct sockaddr *nam;
	struct proc *p;
{
	int s;
	int error;

	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	s = splnet();
	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	/*
	 * Note the embedded assignment: for a connectionless protocol the
	 * implicit sodisconnect() is attempted, and if it fails that
	 * failure is deliberately reported as EISCONN.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnect(so))))
		error = EISCONN;
	else
		error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, p);
	splx(s);
	return (error);
}
278
279int
280soconnect2(so1, so2)
281	register struct socket *so1;
282	struct socket *so2;
283{
284	int s = splnet();
285	int error;
286
287	error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
288	splx(s);
289	return (error);
290}
291
292int
293sodisconnect(so)
294	register struct socket *so;
295{
296	int s = splnet();
297	int error;
298
299	if ((so->so_state & SS_ISCONNECTED) == 0) {
300		error = ENOTCONN;
301		goto bad;
302	}
303	if (so->so_state & SS_ISDISCONNECTING) {
304		error = EALREADY;
305		goto bad;
306	}
307	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
308bad:
309	splx(s);
310	return (error);
311}
312
/* Block on the send buffer lock unless the caller asked not to wait. */
#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
/*
 * Send on a socket.
 * If send must go all at once and message is larger than
 * send buffering, then hard error.
 * Lock against other senders.
 * If must go all at once and not enough room now, then
 * inform user that this would block and do nothing.
 * Otherwise, if nonblocking, send as much as possible.
 * The data to be sent is described by "uio" if nonzero,
 * otherwise by the mbuf chain "top" (which must be null
 * if uio is not).  Data provided in mbuf chain must be small
 * enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers
 * must check for short counts if EINTR/ERESTART are returned.
 * Data and control buffers are freed on return.
 */
int
sosend(so, addr, uio, top, control, flags, p)
	register struct socket *so;
	struct sockaddr *addr;
	struct uio *uio;
	struct mbuf *top;
	struct mbuf *control;
	int flags;
	struct proc *p;
{
	struct mbuf **mp;
	register struct mbuf *m;
	register long space, len, resid;
	int clen = 0, error, s, dontroute, mlen;
	/* Atomic: the record must be handed to the protocol in one piece. */
	int atomic = sosendallatonce(so) || top;

	if (uio)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned.
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid.  On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
	 */
	if (resid < 0)
		return (EINVAL);
	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	p->p_stats->p_ru.ru_msgsnd++;
	if (control)
		clen = control->m_len;
/* Error exit from inside the splnet section: drop spl, free buffers. */
#define	snderr(errno)	{ error = errno; splx(s); goto release; }

restart:
	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out;
	do {
		s = splnet();
		if (so->so_state & SS_CANTSENDMORE)
			snderr(EPIPE);
		if (so->so_error)
			snderr(so->so_error);
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			/*
			 * `sendto' and `sendmsg' is allowed on a connection-
			 * based socket if it supports implied connect.
			 * Return ENOTCONN if not connected and no address is
			 * supplied.
			 */
			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
				    !(resid == 0 && clen != 0))
					snderr(ENOTCONN);
			} else if (addr == 0)
			    snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
				   ENOTCONN : EDESTADDRREQ);
		}
		space = sbspace(&so->so_snd);
		/* Allow a little slop for out-of-band data. */
		if (flags & MSG_OOB)
			space += 1024;
		if ((atomic && resid > so->so_snd.sb_hiwat) ||
		    clen > so->so_snd.sb_hiwat)
			snderr(EMSGSIZE);
		/* Not enough room now: fail or sleep until there is. */
		if (space < resid + clen && uio &&
		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
			if (so->so_state & SS_NBIO)
				snderr(EWOULDBLOCK);
			sbunlock(&so->so_snd);
			error = sbwait(&so->so_snd);
			splx(s);
			if (error)
				goto out;
			goto restart;
		}
		splx(s);
		mp = &top;
		space -= clen;
		do {
		    if (uio == NULL) {
			/*
			 * Data is prepackaged in "top".
			 */
			resid = 0;
			if (flags & MSG_EOR)
				top->m_flags |= M_EOR;
		    } else do {
			/* Copy user data into a fresh mbuf chain. */
			if (top == 0) {
				MGETHDR(m, M_WAIT, MT_DATA);
				mlen = MHLEN;
				m->m_pkthdr.len = 0;
				m->m_pkthdr.rcvif = (struct ifnet *)0;
			} else {
				MGET(m, M_WAIT, MT_DATA);
				mlen = MLEN;
			}
			if (resid >= MINCLSIZE) {
				/* Worth a cluster; fall back if none free. */
				MCLGET(m, M_WAIT);
				if ((m->m_flags & M_EXT) == 0)
					goto nopages;
				mlen = MCLBYTES;
				len = min(min(mlen, resid), space);
			} else {
nopages:
				len = min(min(mlen, resid), space);
				/*
				 * For datagram protocols, leave room
				 * for protocol headers in first mbuf.
				 */
				if (atomic && top == 0 && len < mlen)
					MH_ALIGN(m, len);
			}
			space -= len;
			error = uiomove(mtod(m, caddr_t), (int)len, uio);
			resid = uio->uio_resid;
			m->m_len = len;
			*mp = m;
			top->m_pkthdr.len += len;
			if (error)
				goto release;
			mp = &m->m_next;
			if (resid <= 0) {
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
				break;
			}
		    } while (space > 0 && atomic);
		    if (dontroute)
			    so->so_options |= SO_DONTROUTE;
		    s = splnet();				/* XXX */
		    error = (*so->so_proto->pr_usrreqs->pru_send)(so,
			(flags & MSG_OOB) ? PRUS_OOB :
			/*
			 * If the user set MSG_EOF, the protocol
			 * understands this flag and nothing left to
			 * send then use PRU_SEND_EOF instead of PRU_SEND.
			 */
			((flags & MSG_EOF) &&
			 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
			 (resid <= 0)) ?
				PRUS_EOF : 0,
			top, addr, control, p);
		    splx(s);
		    if (dontroute)
			    so->so_options &= ~SO_DONTROUTE;
		    /* Chain and control now belong to the protocol. */
		    clen = 0;
		    control = 0;
		    top = 0;
		    mp = &top;
		    if (error)
			goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	sbunlock(&so->so_snd);
out:
	/* Anything not handed to the protocol is freed here. */
	if (top)
		m_freem(top);
	if (control)
		m_freem(control);
	return (error);
}
499
500/*
501 * Implement receive operations on a socket.
502 * We depend on the way that records are added to the sockbuf
503 * by sbappend*.  In particular, each record (mbufs linked through m_next)
504 * must begin with an address if the protocol so specifies,
505 * followed by an optional mbuf or mbufs containing ancillary data,
506 * and then zero or more mbufs of data.
507 * In order to avoid blocking network interrupts for the entire time here,
508 * we splx() while doing the actual copy to user space.
509 * Although the sockbuf is locked, new data may still be appended,
510 * and thus we must maintain consistency of the sockbuf during that time.
511 *
512 * The caller may receive the data as a single mbuf chain by supplying
513 * an mbuf **mp0 for use in returning the chain.  The uio is then used
514 * only for the count in uio_resid.
515 */
int
soreceive(so, psa, uio, mp0, controlp, flagsp)
	register struct socket *so;
	struct sockaddr **psa;
	struct uio *uio;
	struct mbuf **mp0;
	struct mbuf **controlp;
	int *flagsp;
{
	register struct mbuf *m, **mp;
	register int flags, len, error, s, offset;
	struct protosw *pr = so->so_proto;
	struct mbuf *nextrecord;
	int moff, type = 0;
	/* Remember the starting count so we can detect a zero-byte pass. */
	int orig_resid = uio->uio_resid;

	mp = mp0;
	if (psa)
		*psa = 0;
	if (controlp)
		*controlp = 0;
	if (flagsp)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	/* Out-of-band data bypasses the receive buffer entirely. */
	if (flags & MSG_OOB) {
		m = m_get(M_WAIT, MT_DATA);
		error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
		if (error)
			goto bad;
		do {
			error = uiomove(mtod(m, caddr_t),
			    (int) min(uio->uio_resid, m->m_len), uio);
			m = m_free(m);
		} while (uio->uio_resid && error == 0 && m);
bad:
		if (m)
			m_freem(m);
		return (error);
	}
	if (mp)
		*mp = (struct mbuf *)0;
	if (so->so_state & SS_ISCONFIRMING && uio->uio_resid)
		(*pr->pr_usrreqs->pru_rcvd)(so, 0);

restart:
	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
	if (error)
		return (error);
	s = splnet();

	m = so->so_rcv.sb_mb;
	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark, or
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat).
	 *   3. MSG_DONTWAIT is not set
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
	 */
	if (m == 0 || (((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < uio->uio_resid) &&
	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
	    m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) {
#ifdef DIAGNOSTIC
		if (m == 0 && so->so_rcv.sb_cc)
			panic("receive 1");
#endif
		if (so->so_error) {
			if (m)
				goto dontblock;
			error = so->so_error;
			/* A peek leaves the pending error for later. */
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
			if (m)
				goto dontblock;
			else
				goto release;
		}
		/* An OOB mark or record boundary is deliverable now. */
		for (; m; m = m->m_next)
			if (m->m_type == MT_OOBDATA  || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio->uio_resid == 0)
			goto release;
		if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT)) {
			error = EWOULDBLOCK;
			goto release;
		}
		/* Sleep for data, then take it from the top. */
		sbunlock(&so->so_rcv);
		error = sbwait(&so->so_rcv);
		splx(s);
		if (error)
			return (error);
		goto restart;
	}
dontblock:
	if (uio->uio_procp)
		uio->uio_procp->p_stats->p_ru.ru_msgrcv++;
	nextrecord = m->m_nextpkt;
	/* Address-bearing protocols put an MT_SONAME mbuf first. */
	if (pr->pr_flags & PR_ADDR) {
#ifdef DIAGNOSTIC
		if (m->m_type != MT_SONAME)
			panic("receive 1a");
#endif
		orig_resid = 0;
		if (psa)
			*psa = dup_sockaddr(mtod(m, struct sockaddr *),
					    mp0 == 0);
		if (flags & MSG_PEEK) {
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			MFREE(m, so->so_rcv.sb_mb);
			m = so->so_rcv.sb_mb;
		}
	}
	/* Then any ancillary (control) mbufs, externalizing rights. */
	while (m && m->m_type == MT_CONTROL && error == 0) {
		if (flags & MSG_PEEK) {
			if (controlp)
				*controlp = m_copy(m, 0, m->m_len);
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			if (controlp) {
				if (pr->pr_domain->dom_externalize &&
				    mtod(m, struct cmsghdr *)->cmsg_type ==
				    SCM_RIGHTS)
				   error = (*pr->pr_domain->dom_externalize)(m);
				*controlp = m;
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = 0;
				m = so->so_rcv.sb_mb;
			} else {
				MFREE(m, so->so_rcv.sb_mb);
				m = so->so_rcv.sb_mb;
			}
		}
		if (controlp) {
			orig_resid = 0;
			controlp = &(*controlp)->m_next;
		}
	}
	if (m) {
		if ((flags & MSG_PEEK) == 0)
			m->m_nextpkt = nextrecord;
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
	}
	moff = 0;
	offset = 0;
	/* Main copy loop over the data mbufs of the current record. */
	while (m && uio->uio_resid > 0 && error == 0) {
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
				break;
		} else if (type == MT_OOBDATA)
			break;
#ifdef DIAGNOSTIC
		else if (m->m_type != MT_DATA && m->m_type != MT_HEADER)
			panic("receive 3");
#endif
		so->so_state &= ~SS_RCVATMARK;
		len = uio->uio_resid;
		/* Never read across the out-of-band mark. */
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * Sockbuf must be consistent here (points to current mbuf,
		 * it points to next record) when we drop priority;
		 * we must note any additions to the sockbuf when we
		 * block interrupts again.
		 */
		if (mp == 0) {
			splx(s);
			error = uiomove(mtod(m, caddr_t) + moff, (int)len, uio);
			s = splnet();
			if (error)
				goto release;
		} else
			uio->uio_resid -= len;
		if (len == m->m_len - moff) {
			/* Consumed this whole mbuf; advance or dequeue it. */
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(&so->so_rcv, m);
				if (mp) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = (struct mbuf *)0;
				} else {
					MFREE(m, so->so_rcv.sb_mb);
					m = so->so_rcv.sb_mb;
				}
				if (m)
					m->m_nextpkt = nextrecord;
			}
		} else {
			/* Partial mbuf: trim the consumed prefix in place. */
			if (flags & MSG_PEEK)
				moff += len;
			else {
				if (mp)
					*mp = m_copym(m, 0, len, M_WAIT);
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
			}
		}
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_state |= SS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket),
		 * we must not quit until "uio->uio_resid == 0" or an error
		 * termination.  If a signal/timeout occurs, return
		 * with a short count but without error.
		 * Keep sockbuf locked against other readers.
		 */
		while (flags & MSG_WAITALL && m == 0 && uio->uio_resid > 0 &&
		    !sosendallatonce(so) && !nextrecord) {
			if (so->so_error || so->so_state & SS_CANTRCVMORE)
				break;
			error = sbwait(&so->so_rcv);
			if (error) {
				/*
				 * Per the comment above: short count,
				 * no error, when interrupted mid-WAITALL.
				 */
				sbunlock(&so->so_rcv);
				splx(s);
				return (0);
			}
			m = so->so_rcv.sb_mb;
			if (m)
				nextrecord = m->m_nextpkt;
		}
	}

	/* Atomic protocols: discard the unread tail of the record. */
	if (m && pr->pr_flags & PR_ATOMIC) {
		flags |= MSG_TRUNC;
		if ((flags & MSG_PEEK) == 0)
			(void) sbdroprecord(&so->so_rcv);
	}
	if ((flags & MSG_PEEK) == 0) {
		if (m == 0)
			so->so_rcv.sb_mb = nextrecord;
		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
	}
	/* Nothing was transferred and nothing ended the record: retry. */
	if (orig_resid == uio->uio_resid && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
		sbunlock(&so->so_rcv);
		splx(s);
		goto restart;
	}

	if (flagsp)
		*flagsp |= flags;
release:
	sbunlock(&so->so_rcv);
	splx(s);
	return (error);
}
808
809int
810soshutdown(so, how)
811	register struct socket *so;
812	register int how;
813{
814	register struct protosw *pr = so->so_proto;
815
816	how++;
817	if (how & FREAD)
818		sorflush(so);
819	if (how & FWRITE)
820		return ((*pr->pr_usrreqs->pru_shutdown)(so));
821	return (0);
822}
823
/*
 * Flush a socket's receive buffer.  Marks the socket unable to receive
 * more, takes a private snapshot of the sockbuf and zeroes the live one
 * at splimp so the interrupt side sees a consistent (empty) buffer,
 * then disposes of any passed access rights and releases the snapshot's
 * mbufs outside the raised priority section.
 */
void
sorflush(so)
	register struct socket *so;
{
	register struct sockbuf *sb = &so->so_rcv;
	register struct protosw *pr = so->so_proto;
	register int s;
	struct sockbuf asb;

	/* The lock acquisition must not be interruptible here. */
	sb->sb_flags |= SB_NOINTR;
	(void) sblock(sb, M_WAITOK);
	s = splimp();
	socantrcvmore(so);
	sbunlock(sb);
	/* Snapshot and clear atomically with respect to interrupts. */
	asb = *sb;
	bzero((caddr_t)sb, sizeof (*sb));
	splx(s);
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
	sbrelease(&asb);
}
845
/*
 * Set a socket option.  Non-SOL_SOCKET levels are passed through to the
 * protocol's ctloutput routine.  The option mbuf m0 is consumed: it is
 * freed here unless handed off to the protocol.
 */
int
sosetopt(so, level, optname, m0, p)
	register struct socket *so;
	int level, optname;
	struct mbuf *m0;
	struct proc *p;
{
	int error = 0;
	register struct mbuf *m = m0;

	if (level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput)
			return ((*so->so_proto->pr_ctloutput)
				  (PRCO_SETOPT, so, level, optname, &m0, p));
		error = ENOPROTOOPT;
	} else {
		switch (optname) {

		case SO_LINGER:
			if (m == NULL || m->m_len != sizeof (struct linger)) {
				error = EINVAL;
				goto bad;
			}
			so->so_linger = mtod(m, struct linger *)->l_linger;
			/* fall thru... */
			/*
			 * The on/off test below reads the first int of the
			 * linger struct, i.e. l_onoff.
			 */

		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_DONTROUTE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				goto bad;
			}
			/* Boolean options map directly onto so_options bits. */
			if (*mtod(m, int *))
				so->so_options |= optname;
			else
				so->so_options &= ~optname;
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
		    {
			int optval;

			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				goto bad;
			}

			/*
			 * Values < 1 make no sense for any of these
			 * options, so disallow them.
			 */
			optval = *mtod(m, int *);
			if (optval < 1) {
				error = EINVAL;
				goto bad;
			}

			switch (optname) {

			case SO_SNDBUF:
			case SO_RCVBUF:
				if (sbreserve(optname == SO_SNDBUF ?
				    &so->so_snd : &so->so_rcv,
				    (u_long) optval) == 0) {
					error = ENOBUFS;
					goto bad;
				}
				break;

			/*
			 * Make sure the low-water is never greater than
			 * the high-water.
			 */
			case SO_SNDLOWAT:
				so->so_snd.sb_lowat =
				    (optval > so->so_snd.sb_hiwat) ?
				    so->so_snd.sb_hiwat : optval;
				break;
			case SO_RCVLOWAT:
				so->so_rcv.sb_lowat =
				    (optval > so->so_rcv.sb_hiwat) ?
				    so->so_rcv.sb_hiwat : optval;
				break;
			}
			break;
		    }

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			struct timeval *tv;
			short val;

			if (m == NULL || m->m_len < sizeof (*tv)) {
				error = EINVAL;
				goto bad;
			}
			tv = mtod(m, struct timeval *);
			/* sb_timeo is a short in ticks; reject overflow. */
			if (tv->tv_sec > SHRT_MAX / hz - hz) {
				error = EDOM;
				goto bad;
			}
			val = tv->tv_sec * hz + tv->tv_usec / tick;

			switch (optname) {

			case SO_SNDTIMEO:
				so->so_snd.sb_timeo = val;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.sb_timeo = val;
				break;
			}
			break;
		    }

		default:
			error = ENOPROTOOPT;
			break;
		}
		/* Give the protocol a chance to act on the option too. */
		if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) {
			(void) ((*so->so_proto->pr_ctloutput)
				  (PRCO_SETOPT, so, level, optname, &m0, p));
			m = NULL;	/* freed by protocol */
		}
	}
bad:
	if (m)
		(void) m_free(m);
	return (error);
}
987
/*
 * Get a socket option.  Non-SOL_SOCKET levels are passed through to the
 * protocol's ctloutput routine.  On success a freshly allocated mbuf
 * holding the option value is returned through *mp; the caller frees it.
 */
int
sogetopt(so, level, optname, mp, p)
	register struct socket *so;
	int level, optname;
	struct mbuf **mp;
	struct proc *p;
{
	register struct mbuf *m;

	if (level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput) {
			return ((*so->so_proto->pr_ctloutput)
				  (PRCO_GETOPT, so, level, optname, mp, p));
		} else
			return (ENOPROTOOPT);
	} else {
		m = m_get(M_WAIT, MT_SOOPTS);
		/* Most options are a single int; others override m_len. */
		m->m_len = sizeof (int);

		switch (optname) {

		case SO_LINGER:
			m->m_len = sizeof (struct linger);
			mtod(m, struct linger *)->l_onoff =
				so->so_options & SO_LINGER;
			mtod(m, struct linger *)->l_linger = so->so_linger;
			break;

		case SO_USELOOPBACK:
		case SO_DONTROUTE:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
			/* Boolean options are just so_options bits. */
			*mtod(m, int *) = so->so_options & optname;
			break;

		case SO_TYPE:
			*mtod(m, int *) = so->so_type;
			break;

		case SO_ERROR:
			/* Reading the pending error clears it. */
			*mtod(m, int *) = so->so_error;
			so->so_error = 0;
			break;

		case SO_SNDBUF:
			*mtod(m, int *) = so->so_snd.sb_hiwat;
			break;

		case SO_RCVBUF:
			*mtod(m, int *) = so->so_rcv.sb_hiwat;
			break;

		case SO_SNDLOWAT:
			*mtod(m, int *) = so->so_snd.sb_lowat;
			break;

		case SO_RCVLOWAT:
			*mtod(m, int *) = so->so_rcv.sb_lowat;
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			/* Convert the stored tick count back to a timeval. */
			int val = (optname == SO_SNDTIMEO ?
			     so->so_snd.sb_timeo : so->so_rcv.sb_timeo);

			m->m_len = sizeof(struct timeval);
			mtod(m, struct timeval *)->tv_sec = val / hz;
			mtod(m, struct timeval *)->tv_usec =
			    (val % hz) * tick;
			break;
		    }

		default:
			(void)m_free(m);
			return (ENOPROTOOPT);
		}
		*mp = m;
		return (0);
	}
}
1074
1075void
1076sohasoutofband(so)
1077	register struct socket *so;
1078{
1079	struct proc *p;
1080
1081	if (so->so_pgid < 0)
1082		gsignal(-so->so_pgid, SIGURG);
1083	else if (so->so_pgid > 0 && (p = pfind(so->so_pgid)) != 0)
1084		psignal(p, SIGURG);
1085	selwakeup(&so->so_rcv.sb_sel);
1086}
1087
1088int
1089soselect(struct socket *so, int which, struct proc *p)
1090{
1091	int s = splnet();
1092	switch (which) {
1093
1094	case FREAD:
1095		if (soreadable(so)) {
1096			splx(s);
1097			return (1);
1098		}
1099		selrecord(p, &so->so_rcv.sb_sel);
1100		so->so_rcv.sb_flags |= SB_SEL;
1101		break;
1102
1103	case FWRITE:
1104		if (sowriteable(so)) {
1105			splx(s);
1106			return (1);
1107		}
1108		selrecord(p, &so->so_snd.sb_sel);
1109		so->so_snd.sb_flags |= SB_SEL;
1110		break;
1111
1112	case 0:
1113		if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
1114			splx(s);
1115			return (1);
1116		}
1117		selrecord(p, &so->so_rcv.sb_sel);
1118		so->so_rcv.sb_flags |= SB_SEL;
1119		break;
1120	}
1121	splx(s);
1122	return (0);
1123}
1124