/* uipc_socket.c revision 30354 */
17011SN/A/*
212325Spsandoz * Copyright (c) 1982, 1986, 1988, 1990, 1993
37011SN/A *	The Regents of the University of California.  All rights reserved.
47011SN/A *
57011SN/A * Redistribution and use in source and binary forms, with or without
67011SN/A * modification, are permitted provided that the following conditions
77011SN/A * are met:
87011SN/A * 1. Redistributions of source code must retain the above copyright
97011SN/A *    notice, this list of conditions and the following disclaimer.
107011SN/A * 2. Redistributions in binary form must reproduce the above copyright
117011SN/A *    notice, this list of conditions and the following disclaimer in the
127011SN/A *    documentation and/or other materials provided with the distribution.
137011SN/A * 3. All advertising materials mentioning features or use of this software
147011SN/A *    must display the following acknowledgement:
157011SN/A *	This product includes software developed by the University of
167011SN/A *	California, Berkeley and its contributors.
177011SN/A * 4. Neither the name of the University nor the names of its contributors
187011SN/A *    may be used to endorse or promote products derived from this software
197011SN/A *    without specific prior written permission.
207011SN/A *
217011SN/A * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
227011SN/A * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
237011SN/A * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
247011SN/A * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
257011SN/A * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
267011SN/A * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
277011SN/A * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
287011SN/A * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
297011SN/A * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
307011SN/A * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
318051SN/A * SUCH DAMAGE.
327011SN/A *
337011SN/A *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
347011SN/A *	$Id: uipc_socket.c,v 1.32 1997/10/04 18:21:15 phk Exp $
357011SN/A */
367011SN/A
377011SN/A#include <sys/param.h>
387011SN/A#include <sys/systm.h>
397011SN/A#include <sys/proc.h>
407011SN/A#include <sys/fcntl.h>
417011SN/A#include <sys/malloc.h>
427011SN/A#include <sys/mbuf.h>
437011SN/A#include <sys/domain.h>
447011SN/A#include <sys/kernel.h>
457011SN/A#include <sys/poll.h>
467011SN/A#include <sys/protosw.h>
477570SN/A#include <sys/socket.h>
487011SN/A#include <sys/socketvar.h>
497011SN/A#include <sys/resourcevar.h>
507011SN/A#include <sys/signalvar.h>
517011SN/A#include <sys/sysctl.h>
527011SN/A
537011SN/A#include <machine/limits.h>
547011SN/A
/* Allocation tags used by the socket layer. */
MALLOC_DEFINE(M_SOCKET, "socket", "socket structure");
MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

/* Ceiling on a listen(2) backlog; run-time tunable via sysctl. */
static int somaxconn = SOMAXCONN;
SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW, &somaxconn,
	   0, "");
627570SN/A
637570SN/A/*
647570SN/A * Socket operation routines.
657011SN/A * These routines are called by the routines in
667011SN/A * sys_socket.c or from a system process, and
677011SN/A * implement the semantics of socket operations by
687011SN/A * switching out to the protocol specific routines.
697011SN/A */
707011SN/A/*ARGSUSED*/
717011SN/Aint
727011SN/Asocreate(dom, aso, type, proto, p)
737011SN/A	int dom;
747011SN/A	struct socket **aso;
757011SN/A	register int type;
767011SN/A	int proto;
7710298SN/A	struct proc *p;
787011SN/A{
797011SN/A	register struct protosw *prp;
807011SN/A	register struct socket *so;
817011SN/A	register int error;
827011SN/A
837011SN/A	if (proto)
847011SN/A		prp = pffindproto(dom, proto, type);
857011SN/A	else
867011SN/A		prp = pffindtype(dom, type);
877011SN/A	if (prp == 0 || prp->pr_usrreqs->pru_attach == 0)
887011SN/A		return (EPROTONOSUPPORT);
897011SN/A	if (prp->pr_type != type)
907011SN/A		return (EPROTOTYPE);
917011SN/A	MALLOC(so, struct socket *, sizeof(*so), M_SOCKET, M_WAIT);
927011SN/A	bzero((caddr_t)so, sizeof(*so));
937011SN/A	TAILQ_INIT(&so->so_incomp);
947011SN/A	TAILQ_INIT(&so->so_comp);
957011SN/A	so->so_type = type;
967011SN/A	so->so_proto = prp;
977011SN/A	error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
987011SN/A	if (error) {
997011SN/A		so->so_state |= SS_NOFDREF;
1007011SN/A		sofree(so);
10113916Samlu		return (error);
1027011SN/A	}
1037011SN/A	*aso = so;
1047011SN/A	return (0);
1057011SN/A}
1067011SN/A
1077011SN/Aint
1087011SN/Asobind(so, nam, p)
1097011SN/A	struct socket *so;
1107011SN/A	struct sockaddr *nam;
1117011SN/A	struct proc *p;
1127011SN/A{
1137011SN/A	int s = splnet();
1147011SN/A	int error;
1157011SN/A
1167011SN/A	error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
1177011SN/A	splx(s);
1187011SN/A	return (error);
1197011SN/A}
1207011SN/A
1217011SN/Aint
1227011SN/Asolisten(so, backlog, p)
1237011SN/A	register struct socket *so;
1247011SN/A	int backlog;
1257011SN/A	struct proc *p;
1267011SN/A{
1277011SN/A	int s = splnet(), error;
1287011SN/A
1297011SN/A	error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
1307011SN/A	if (error) {
1317011SN/A		splx(s);
1327011SN/A		return (error);
1337011SN/A	}
1347011SN/A	if (so->so_comp.tqh_first == NULL)
1357011SN/A		so->so_options |= SO_ACCEPTCONN;
1367011SN/A	if (backlog < 0 || backlog > somaxconn)
1377011SN/A		backlog = somaxconn;
1387011SN/A	so->so_qlimit = backlog;
13912325Spsandoz	splx(s);
1407011SN/A	return (0);
14112325Spsandoz}
1427011SN/A
void
sofree(so)
	register struct socket *so;
{
	struct socket *head = so->so_head;

	/*
	 * Free only when the socket is truly dead: the protocol control
	 * block is gone AND no file descriptor still references it.
	 */
	if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0)
		return;
	if (head != NULL) {
		/* Unhook from the listen socket's pending-connection queues. */
		if (so->so_state & SS_INCOMP) {
			TAILQ_REMOVE(&head->so_incomp, so, so_list);
			head->so_incqlen--;
		} else if (so->so_state & SS_COMP) {
			TAILQ_REMOVE(&head->so_comp, so, so_list);
		} else {
			panic("sofree: not queued");
		}
		head->so_qlen--;
		so->so_state &= ~(SS_INCOMP|SS_COMP);
		so->so_head = NULL;
	}
	/* Release buffered send data, flush the receive side, then free. */
	sbrelease(&so->so_snd);
	sorflush(so);
	FREE(so, M_SOCKET);
}
1687011SN/A
1697011SN/A/*
1707011SN/A * Close a socket on last file table reference removal.
1717011SN/A * Initiate disconnect if connected.
1727011SN/A * Free socket when disconnect complete.
1737011SN/A */
int
soclose(so)
	register struct socket *so;
{
	int s = splnet();		/* conservative */
	int error = 0;

	/* A listening socket first aborts every queued connection. */
	if (so->so_options & SO_ACCEPTCONN) {
		struct socket *sp, *sonext;

		for (sp = so->so_incomp.tqh_first; sp != NULL; sp = sonext) {
			sonext = sp->so_list.tqe_next;
			(void) soabort(sp);
		}
		for (sp = so->so_comp.tqh_first; sp != NULL; sp = sonext) {
			sonext = sp->so_list.tqe_next;
			(void) soabort(sp);
		}
	}
	if (so->so_pcb == 0)
		goto discard;
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			/* Non-blocking socket mid-disconnect: don't linger. */
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO))
				goto drop;
			/*
			 * Sleep (interruptibly, up to so_linger ticks per
			 * wakeup) until the disconnect completes.
			 */
			while (so->so_state & SS_ISCONNECTED) {
				error = tsleep((caddr_t)&so->so_timeo,
				    PSOCK | PCATCH, "soclos", so->so_linger);
				if (error)
					break;
			}
		}
	}
drop:
	if (so->so_pcb) {
		/* Detach the protocol; preserve any earlier error. */
		int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
		if (error == 0)
			error = error2;
	}
discard:
	if (so->so_state & SS_NOFDREF)
		panic("soclose: NOFDREF");
	so->so_state |= SS_NOFDREF;
	sofree(so);
	splx(s);
	return (error);
}
227
228/*
229 * Must be called at splnet...
230 */
231int
232soabort(so)
233	struct socket *so;
234{
235
236	return (*so->so_proto->pr_usrreqs->pru_abort)(so);
237}
238
239int
240soaccept(so, nam)
241	register struct socket *so;
242	struct sockaddr **nam;
243{
244	int s = splnet();
245	int error;
246
247	if ((so->so_state & SS_NOFDREF) == 0)
248		panic("soaccept: !NOFDREF");
249	so->so_state &= ~SS_NOFDREF;
250	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
251	splx(s);
252	return (error);
253}
254
255int
256soconnect(so, nam, p)
257	register struct socket *so;
258	struct sockaddr *nam;
259	struct proc *p;
260{
261	int s;
262	int error;
263
264	if (so->so_options & SO_ACCEPTCONN)
265		return (EOPNOTSUPP);
266	s = splnet();
267	/*
268	 * If protocol is connection-based, can only connect once.
269	 * Otherwise, if connected, try to disconnect first.
270	 * This allows user to disconnect by connecting to, e.g.,
271	 * a null address.
272	 */
273	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
274	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
275	    (error = sodisconnect(so))))
276		error = EISCONN;
277	else
278		error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, p);
279	splx(s);
280	return (error);
281}
282
283int
284soconnect2(so1, so2)
285	register struct socket *so1;
286	struct socket *so2;
287{
288	int s = splnet();
289	int error;
290
291	error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
292	splx(s);
293	return (error);
294}
295
296int
297sodisconnect(so)
298	register struct socket *so;
299{
300	int s = splnet();
301	int error;
302
303	if ((so->so_state & SS_ISCONNECTED) == 0) {
304		error = ENOTCONN;
305		goto bad;
306	}
307	if (so->so_state & SS_ISDISCONNECTING) {
308		error = EALREADY;
309		goto bad;
310	}
311	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
312bad:
313	splx(s);
314	return (error);
315}
316
/* sblock() wait flag: don't sleep for the lock if MSG_DONTWAIT was passed. */
#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
318/*
319 * Send on a socket.
320 * If send must go all at once and message is larger than
321 * send buffering, then hard error.
322 * Lock against other senders.
323 * If must go all at once and not enough room now, then
324 * inform user that this would block and do nothing.
325 * Otherwise, if nonblocking, send as much as possible.
326 * The data to be sent is described by "uio" if nonzero,
327 * otherwise by the mbuf chain "top" (which must be null
328 * if uio is not).  Data provided in mbuf chain must be small
329 * enough to send all at once.
330 *
331 * Returns nonzero on error, timeout or signal; callers
332 * must check for short counts if EINTR/ERESTART are returned.
333 * Data and control buffers are freed on return.
334 */
int
sosend(so, addr, uio, top, control, flags, p)
	register struct socket *so;
	struct sockaddr *addr;
	struct uio *uio;
	struct mbuf *top;
	struct mbuf *control;
	int flags;
	struct proc *p;
{
	struct mbuf **mp;
	register struct mbuf *m;
	register long space, len, resid;
	int clen = 0, error, s, dontroute, mlen;
	int atomic = sosendallatonce(so) || top;

	if (uio)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned.
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid.  On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
	 */
	if (resid < 0)
		return (EINVAL);
	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	if (p)
		p->p_stats->p_ru.ru_msgsnd++;
	if (control)
		clen = control->m_len;
/* Error exit from inside the splnet section: drop spl, free buffers. */
#define	snderr(errno)	{ error = errno; splx(s); goto release; }

restart:
	/* Serialize senders on the send buffer lock. */
	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out;
	do {
		s = splnet();
		if (so->so_state & SS_CANTSENDMORE)
			snderr(EPIPE);
		if (so->so_error)
			snderr(so->so_error);
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			/*
			 * `sendto' and `sendmsg' is allowed on a connection-
			 * based socket if it supports implied connect.
			 * Return ENOTCONN if not connected and no address is
			 * supplied.
			 */
			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
				    !(resid == 0 && clen != 0))
					snderr(ENOTCONN);
			} else if (addr == 0)
			    snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
				   ENOTCONN : EDESTADDRREQ);
		}
		space = sbspace(&so->so_snd);
		/* Allow OOB data to slightly overcommit the buffer. */
		if (flags & MSG_OOB)
			space += 1024;
		if ((atomic && resid > so->so_snd.sb_hiwat) ||
		    clen > so->so_snd.sb_hiwat)
			snderr(EMSGSIZE);
		/* Not enough room now: either fail (non-blocking) or wait. */
		if (space < resid + clen && uio &&
		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
			if (so->so_state & SS_NBIO)
				snderr(EWOULDBLOCK);
			sbunlock(&so->so_snd);
			error = sbwait(&so->so_snd);
			splx(s);
			if (error)
				goto out;
			goto restart;
		}
		splx(s);
		mp = &top;
		space -= clen;
		/* Fill mbufs from the uio, then pass the chain down. */
		do {
		    if (uio == NULL) {
			/*
			 * Data is prepackaged in "top".
			 */
			resid = 0;
			if (flags & MSG_EOR)
				top->m_flags |= M_EOR;
		    } else do {
			if (top == 0) {
				MGETHDR(m, M_WAIT, MT_DATA);
				mlen = MHLEN;
				m->m_pkthdr.len = 0;
				m->m_pkthdr.rcvif = (struct ifnet *)0;
			} else {
				MGET(m, M_WAIT, MT_DATA);
				mlen = MLEN;
			}
			/* Use a cluster when plenty of data remains. */
			if (resid >= MINCLSIZE) {
				MCLGET(m, M_WAIT);
				if ((m->m_flags & M_EXT) == 0)
					goto nopages;
				mlen = MCLBYTES;
				len = min(min(mlen, resid), space);
			} else {
nopages:
				len = min(min(mlen, resid), space);
				/*
				 * For datagram protocols, leave room
				 * for protocol headers in first mbuf.
				 */
				if (atomic && top == 0 && len < mlen)
					MH_ALIGN(m, len);
			}
			space -= len;
			error = uiomove(mtod(m, caddr_t), (int)len, uio);
			resid = uio->uio_resid;
			m->m_len = len;
			*mp = m;
			top->m_pkthdr.len += len;
			if (error)
				goto release;
			mp = &m->m_next;
			if (resid <= 0) {
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
				break;
			}
		    } while (space > 0 && atomic);
		    if (dontroute)
			    so->so_options |= SO_DONTROUTE;
		    s = splnet();				/* XXX */
		    error = (*so->so_proto->pr_usrreqs->pru_send)(so,
			(flags & MSG_OOB) ? PRUS_OOB :
			/*
			 * If the user set MSG_EOF, the protocol
			 * understands this flag and nothing left to
			 * send then use PRU_SEND_EOF instead of PRU_SEND.
			 */
			((flags & MSG_EOF) &&
			 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
			 (resid <= 0)) ?
				PRUS_EOF : 0,
			top, addr, control, p);
		    splx(s);
		    if (dontroute)
			    so->so_options &= ~SO_DONTROUTE;
		    /* Chain and control now owned by the protocol. */
		    clen = 0;
		    control = 0;
		    top = 0;
		    mp = &top;
		    if (error)
			goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	sbunlock(&so->so_snd);
out:
	if (top)
		m_freem(top);
	if (control)
		m_freem(control);
	return (error);
}
504
505/*
506 * Implement receive operations on a socket.
507 * We depend on the way that records are added to the sockbuf
508 * by sbappend*.  In particular, each record (mbufs linked through m_next)
509 * must begin with an address if the protocol so specifies,
510 * followed by an optional mbuf or mbufs containing ancillary data,
511 * and then zero or more mbufs of data.
512 * In order to avoid blocking network interrupts for the entire time here,
513 * we splx() while doing the actual copy to user space.
514 * Although the sockbuf is locked, new data may still be appended,
515 * and thus we must maintain consistency of the sockbuf during that time.
516 *
517 * The caller may receive the data as a single mbuf chain by supplying
518 * an mbuf **mp0 for use in returning the chain.  The uio is then used
519 * only for the count in uio_resid.
520 */
int
soreceive(so, psa, uio, mp0, controlp, flagsp)
	register struct socket *so;
	struct sockaddr **psa;
	struct uio *uio;
	struct mbuf **mp0;
	struct mbuf **controlp;
	int *flagsp;
{
	register struct mbuf *m, **mp;
	register int flags, len, error, s, offset;
	struct protosw *pr = so->so_proto;
	struct mbuf *nextrecord;
	int moff, type = 0;
	int orig_resid = uio->uio_resid;

	mp = mp0;
	if (psa)
		*psa = 0;
	if (controlp)
		*controlp = 0;
	if (flagsp)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	/* Out-of-band requests bypass the normal receive path entirely. */
	if (flags & MSG_OOB) {
		m = m_get(M_WAIT, MT_DATA);
		error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
		if (error)
			goto bad;
		do {
			error = uiomove(mtod(m, caddr_t),
			    (int) min(uio->uio_resid, m->m_len), uio);
			m = m_free(m);
		} while (uio->uio_resid && error == 0 && m);
bad:
		if (m)
			m_freem(m);
		return (error);
	}
	if (mp)
		*mp = (struct mbuf *)0;
	if (so->so_state & SS_ISCONFIRMING && uio->uio_resid)
		(*pr->pr_usrreqs->pru_rcvd)(so, 0);

restart:
	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
	if (error)
		return (error);
	s = splnet();

	m = so->so_rcv.sb_mb;
	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark, or
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat).
	 *   3. MSG_DONTWAIT is not set
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
	 */
	if (m == 0 || (((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < uio->uio_resid) &&
	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
	    m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) {
#ifdef DIAGNOSTIC
		if (m == 0 && so->so_rcv.sb_cc)
			panic("receive 1");
#endif
		if (so->so_error) {
			if (m)
				goto dontblock;
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
			if (m)
				goto dontblock;
			else
				goto release;
		}
		/* A record boundary or OOB mark lets us deliver what we have. */
		for (; m; m = m->m_next)
			if (m->m_type == MT_OOBDATA  || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio->uio_resid == 0)
			goto release;
		if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT)) {
			error = EWOULDBLOCK;
			goto release;
		}
		/* Release the buffer lock and sleep for arriving data. */
		sbunlock(&so->so_rcv);
		error = sbwait(&so->so_rcv);
		splx(s);
		if (error)
			return (error);
		goto restart;
	}
dontblock:
	if (uio->uio_procp)
		uio->uio_procp->p_stats->p_ru.ru_msgrcv++;
	nextrecord = m->m_nextpkt;
	/* Address-bearing protocols put the sender's name first in the record. */
	if (pr->pr_flags & PR_ADDR) {
#ifdef DIAGNOSTIC
		if (m->m_type != MT_SONAME)
			panic("receive 1a");
#endif
		orig_resid = 0;
		if (psa)
			*psa = dup_sockaddr(mtod(m, struct sockaddr *),
					    mp0 == 0);
		if (flags & MSG_PEEK) {
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			MFREE(m, so->so_rcv.sb_mb);
			m = so->so_rcv.sb_mb;
		}
	}
	/* Then any control mbufs (e.g. fd passing via SCM_RIGHTS). */
	while (m && m->m_type == MT_CONTROL && error == 0) {
		if (flags & MSG_PEEK) {
			if (controlp)
				*controlp = m_copy(m, 0, m->m_len);
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			if (controlp) {
				if (pr->pr_domain->dom_externalize &&
				    mtod(m, struct cmsghdr *)->cmsg_type ==
				    SCM_RIGHTS)
				   error = (*pr->pr_domain->dom_externalize)(m);
				*controlp = m;
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = 0;
				m = so->so_rcv.sb_mb;
			} else {
				MFREE(m, so->so_rcv.sb_mb);
				m = so->so_rcv.sb_mb;
			}
		}
		if (controlp) {
			orig_resid = 0;
			controlp = &(*controlp)->m_next;
		}
	}
	if (m) {
		if ((flags & MSG_PEEK) == 0)
			m->m_nextpkt = nextrecord;
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
	}
	moff = 0;
	offset = 0;
	/* Main copy loop: move data mbufs out to the uio (or to *mp). */
	while (m && uio->uio_resid > 0 && error == 0) {
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
				break;
		} else if (type == MT_OOBDATA)
			break;
#ifdef DIAGNOSTIC
		else if (m->m_type != MT_DATA && m->m_type != MT_HEADER)
			panic("receive 3");
#endif
		so->so_state &= ~SS_RCVATMARK;
		len = uio->uio_resid;
		/* Never read past the out-of-band mark in one gulp. */
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * Sockbuf must be consistent here (points to current mbuf,
		 * it points to next record) when we drop priority;
		 * we must note any additions to the sockbuf when we
		 * block interrupts again.
		 */
		if (mp == 0) {
			splx(s);
			error = uiomove(mtod(m, caddr_t) + moff, (int)len, uio);
			s = splnet();
			if (error)
				goto release;
		} else
			uio->uio_resid -= len;
		if (len == m->m_len - moff) {
			/* Consumed the whole mbuf. */
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(&so->so_rcv, m);
				if (mp) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = (struct mbuf *)0;
				} else {
					MFREE(m, so->so_rcv.sb_mb);
					m = so->so_rcv.sb_mb;
				}
				if (m)
					m->m_nextpkt = nextrecord;
			}
		} else {
			/* Partial mbuf: advance within it. */
			if (flags & MSG_PEEK)
				moff += len;
			else {
				if (mp)
					*mp = m_copym(m, 0, len, M_WAIT);
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
			}
		}
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_state |= SS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket),
		 * we must not quit until "uio->uio_resid == 0" or an error
		 * termination.  If a signal/timeout occurs, return
		 * with a short count but without error.
		 * Keep sockbuf locked against other readers.
		 */
		while (flags & MSG_WAITALL && m == 0 && uio->uio_resid > 0 &&
		    !sosendallatonce(so) && !nextrecord) {
			if (so->so_error || so->so_state & SS_CANTRCVMORE)
				break;
			error = sbwait(&so->so_rcv);
			if (error) {
				/* Short count, no error: see comment above. */
				sbunlock(&so->so_rcv);
				splx(s);
				return (0);
			}
			m = so->so_rcv.sb_mb;
			if (m)
				nextrecord = m->m_nextpkt;
		}
	}

	/* Atomic protocols drop any unread remainder of the record. */
	if (m && pr->pr_flags & PR_ATOMIC) {
		flags |= MSG_TRUNC;
		if ((flags & MSG_PEEK) == 0)
			(void) sbdroprecord(&so->so_rcv);
	}
	if ((flags & MSG_PEEK) == 0) {
		if (m == 0)
			so->so_rcv.sb_mb = nextrecord;
		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
	}
	/* Nothing was delivered and nothing forces a return: try again. */
	if (orig_resid == uio->uio_resid && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
		sbunlock(&so->so_rcv);
		splx(s);
		goto restart;
	}

	if (flagsp)
		*flagsp |= flags;
release:
	sbunlock(&so->so_rcv);
	splx(s);
	return (error);
}
813
814int
815soshutdown(so, how)
816	register struct socket *so;
817	register int how;
818{
819	register struct protosw *pr = so->so_proto;
820
821	how++;
822	if (how & FREAD)
823		sorflush(so);
824	if (how & FWRITE)
825		return ((*pr->pr_usrreqs->pru_shutdown)(so));
826	return (0);
827}
828
void
sorflush(so)
	register struct socket *so;
{
	register struct sockbuf *sb = &so->so_rcv;
	register struct protosw *pr = so->so_proto;
	register int s;
	struct sockbuf asb;

	/*
	 * Take the buffer lock uninterruptibly, mark the socket unable to
	 * receive, then snapshot and zero the sockbuf at high spl so that
	 * interrupt-level appends see an empty, consistent buffer.
	 */
	sb->sb_flags |= SB_NOINTR;
	(void) sblock(sb, M_WAITOK);
	s = splimp();
	socantrcvmore(so);
	sbunlock(sb);
	asb = *sb;
	bzero((caddr_t)sb, sizeof (*sb));
	splx(s);
	/* Dispose of rights (e.g. passed descriptors) held in the old chain. */
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
	sbrelease(&asb);
}
850
int
sosetopt(so, level, optname, m0, p)
	register struct socket *so;
	int level, optname;
	struct mbuf *m0;
	struct proc *p;
{
	int error = 0;
	register struct mbuf *m = m0;

	/* Non-socket-level options are handed straight to the protocol. */
	if (level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput)
			return ((*so->so_proto->pr_ctloutput)
				  (PRCO_SETOPT, so, level, optname, &m0, p));
		error = ENOPROTOOPT;
	} else {
		switch (optname) {

		case SO_LINGER:
			if (m == NULL || m->m_len != sizeof (struct linger)) {
				error = EINVAL;
				goto bad;
			}
			so->so_linger = mtod(m, struct linger *)->l_linger;
			/*
			 * fall thru...
			 * l_onoff is the first (int-sized) member of struct
			 * linger, so the generic on/off code below reads it.
			 */

		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_DONTROUTE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				goto bad;
			}
			/* Boolean options: set or clear the matching bit. */
			if (*mtod(m, int *))
				so->so_options |= optname;
			else
				so->so_options &= ~optname;
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
		    {
			int optval;

			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				goto bad;
			}

			/*
			 * Values < 1 make no sense for any of these
			 * options, so disallow them.
			 */
			optval = *mtod(m, int *);
			if (optval < 1) {
				error = EINVAL;
				goto bad;
			}

			switch (optname) {

			case SO_SNDBUF:
			case SO_RCVBUF:
				if (sbreserve(optname == SO_SNDBUF ?
				    &so->so_snd : &so->so_rcv,
				    (u_long) optval) == 0) {
					error = ENOBUFS;
					goto bad;
				}
				break;

			/*
			 * Make sure the low-water is never greater than
			 * the high-water.
			 */
			case SO_SNDLOWAT:
				so->so_snd.sb_lowat =
				    (optval > so->so_snd.sb_hiwat) ?
				    so->so_snd.sb_hiwat : optval;
				break;
			case SO_RCVLOWAT:
				so->so_rcv.sb_lowat =
				    (optval > so->so_rcv.sb_hiwat) ?
				    so->so_rcv.sb_hiwat : optval;
				break;
			}
			break;
		    }

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			struct timeval *tv;
			short val;

			if (m == NULL || m->m_len < sizeof (*tv)) {
				error = EINVAL;
				goto bad;
			}
			tv = mtod(m, struct timeval *);
			/* Timeout is stored in ticks in a short: bound it. */
			if (tv->tv_sec > SHRT_MAX / hz - hz) {
				error = EDOM;
				goto bad;
			}
			val = tv->tv_sec * hz + tv->tv_usec / tick;

			switch (optname) {

			case SO_SNDTIMEO:
				so->so_snd.sb_timeo = val;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.sb_timeo = val;
				break;
			}
			break;
		    }

		default:
			error = ENOPROTOOPT;
			break;
		}
		/* Give the protocol a look at socket-level options too. */
		if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) {
			(void) ((*so->so_proto->pr_ctloutput)
				  (PRCO_SETOPT, so, level, optname, &m0, p));
			m = NULL;	/* freed by protocol */
		}
	}
bad:
	if (m)
		(void) m_free(m);
	return (error);
}
992
int
sogetopt(so, level, optname, mp, p)
	register struct socket *so;
	int level, optname;
	struct mbuf **mp;
	struct proc *p;
{
	register struct mbuf *m;

	/* Non-socket-level options are answered by the protocol. */
	if (level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput) {
			return ((*so->so_proto->pr_ctloutput)
				  (PRCO_GETOPT, so, level, optname, mp, p));
		} else
			return (ENOPROTOOPT);
	} else {
		/* Result is returned in a freshly allocated mbuf. */
		m = m_get(M_WAIT, MT_SOOPTS);
		m->m_len = sizeof (int);

		switch (optname) {

		case SO_LINGER:
			m->m_len = sizeof (struct linger);
			/* l_onoff is the raw option bit (nonzero if set). */
			mtod(m, struct linger *)->l_onoff =
				so->so_options & SO_LINGER;
			mtod(m, struct linger *)->l_linger = so->so_linger;
			break;

		case SO_USELOOPBACK:
		case SO_DONTROUTE:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
			*mtod(m, int *) = so->so_options & optname;
			break;

		case SO_TYPE:
			*mtod(m, int *) = so->so_type;
			break;

		case SO_ERROR:
			/* Reading the error clears it. */
			*mtod(m, int *) = so->so_error;
			so->so_error = 0;
			break;

		case SO_SNDBUF:
			*mtod(m, int *) = so->so_snd.sb_hiwat;
			break;

		case SO_RCVBUF:
			*mtod(m, int *) = so->so_rcv.sb_hiwat;
			break;

		case SO_SNDLOWAT:
			*mtod(m, int *) = so->so_snd.sb_lowat;
			break;

		case SO_RCVLOWAT:
			*mtod(m, int *) = so->so_rcv.sb_lowat;
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			/* Convert the stored tick count back to a timeval. */
			int val = (optname == SO_SNDTIMEO ?
			     so->so_snd.sb_timeo : so->so_rcv.sb_timeo);

			m->m_len = sizeof(struct timeval);
			mtod(m, struct timeval *)->tv_sec = val / hz;
			mtod(m, struct timeval *)->tv_usec =
			    (val % hz) * tick;
			break;
		    }

		default:
			(void)m_free(m);
			return (ENOPROTOOPT);
		}
		*mp = m;
		return (0);
	}
}
1079
1080void
1081sohasoutofband(so)
1082	register struct socket *so;
1083{
1084	struct proc *p;
1085
1086	if (so->so_pgid < 0)
1087		gsignal(-so->so_pgid, SIGURG);
1088	else if (so->so_pgid > 0 && (p = pfind(so->so_pgid)) != 0)
1089		psignal(p, SIGURG);
1090	selwakeup(&so->so_rcv.sb_sel);
1091}
1092
1093int
1094sopoll(struct socket *so, int events, struct ucred *cred, struct proc *p)
1095{
1096	int revents = 0;
1097	int s = splnet();
1098
1099	if (events & (POLLIN | POLLRDNORM))
1100		if (soreadable(so))
1101			revents |= events & (POLLIN | POLLRDNORM);
1102
1103	if (events & (POLLOUT | POLLWRNORM))
1104		if (sowriteable(so))
1105			revents |= events & (POLLOUT | POLLWRNORM);
1106
1107	if (events & (POLLPRI | POLLRDBAND))
1108		if (so->so_oobmark || (so->so_state & SS_RCVATMARK))
1109			revents |= events & (POLLPRI | POLLRDBAND);
1110
1111	if (revents == 0) {
1112		if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
1113			selrecord(p, &so->so_rcv.sb_sel);
1114			so->so_rcv.sb_flags |= SB_SEL;
1115		}
1116
1117		if (events & (POLLOUT | POLLWRNORM)) {
1118			selrecord(p, &so->so_snd.sb_sel);
1119			so->so_snd.sb_flags |= SB_SEL;
1120		}
1121	}
1122
1123	splx(s);
1124	return (revents);
1125}
1126