uipc_usrreq.c revision 145492
1/*-
2 * Copyright 2004-2005 Robert N. M. Watson
3 * Copyright (c) 1982, 1986, 1989, 1991, 1993
4 *	The Regents of the University of California.  All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 * 4. Neither the name of the University nor the names of its contributors
15 *    may be used to endorse or promote products derived from this software
16 *    without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 * SUCH DAMAGE.
29 *
30 *	From: @(#)uipc_usrreq.c	8.3 (Berkeley) 1/4/94
31 */
32
33#include <sys/cdefs.h>
34__FBSDID("$FreeBSD: head/sys/kern/uipc_usrreq.c 145492 2005-04-25 00:48:04Z mdodd $");
35
36#include "opt_mac.h"
37
38#include <sys/param.h>
39#include <sys/domain.h>
40#include <sys/fcntl.h>
41#include <sys/malloc.h>		/* XXX must be before <sys/file.h> */
42#include <sys/file.h>
43#include <sys/filedesc.h>
44#include <sys/jail.h>
45#include <sys/kernel.h>
46#include <sys/lock.h>
47#include <sys/mac.h>
48#include <sys/mbuf.h>
49#include <sys/mutex.h>
50#include <sys/namei.h>
51#include <sys/proc.h>
52#include <sys/protosw.h>
53#include <sys/resourcevar.h>
54#include <sys/socket.h>
55#include <sys/socketvar.h>
56#include <sys/signalvar.h>
57#include <sys/stat.h>
58#include <sys/sx.h>
59#include <sys/sysctl.h>
60#include <sys/systm.h>
61#include <sys/un.h>
62#include <sys/unpcb.h>
63#include <sys/vnode.h>
64
65#include <vm/uma.h>
66
67static uma_zone_t unp_zone;
68static	unp_gen_t unp_gencnt;
69static	u_int unp_count;
70
71static	struct unp_head unp_shead, unp_dhead;
72
73/*
74 * Unix communications domain.
75 *
76 * TODO:
77 *	SEQPACKET, RDM
78 *	rethink name space problems
79 *	need a proper out-of-band
80 *	lock pushdown
81 */
82static const struct	sockaddr sun_noname = { sizeof(sun_noname), AF_LOCAL };
83static ino_t	unp_ino;		/* prototype for fake inode numbers */
84struct mbuf *unp_addsockcred(struct thread *, struct mbuf *);
85
86/*
87 * Currently, UNIX domain sockets are protected by a single subsystem lock,
88 * which covers global data structures and variables, the contents of each
89 * per-socket unpcb structure, and the so_pcb field in sockets attached to
 * the UNIX domain.  This provides for a moderate degree of parallelism, as
91 * receive operations on UNIX domain sockets do not need to acquire the
92 * subsystem lock.  Finer grained locking to permit send() without acquiring
93 * a global lock would be a logical next step.
94 *
 * The UNIX domain socket lock precedes all socket layer locks, including the
96 * socket lock and socket buffer lock, permitting UNIX domain socket code to
97 * call into socket support routines without releasing its locks.
98 *
99 * Some caution is required in areas where the UNIX domain socket code enters
100 * VFS in order to create or find rendezvous points.  This results in
101 * dropping of the UNIX domain socket subsystem lock, acquisition of the
102 * Giant lock, and potential sleeping.  This increases the chances of races,
103 * and exposes weaknesses in the socket->protocol API by offering poor
104 * failure modes.
105 */
106static struct mtx unp_mtx;
107#define	UNP_LOCK_INIT() \
108	mtx_init(&unp_mtx, "unp", NULL, MTX_DEF)
109#define	UNP_LOCK()		mtx_lock(&unp_mtx)
110#define	UNP_UNLOCK()		mtx_unlock(&unp_mtx)
111#define	UNP_LOCK_ASSERT()	mtx_assert(&unp_mtx, MA_OWNED)
112#define	UNP_UNLOCK_ASSERT()	mtx_assert(&unp_mtx, MA_NOTOWNED)
113
/* Forward declarations for the file-local helpers. */
static int	unp_attach(struct socket *so);
static void	unp_detach(struct unpcb *unp);
static int	unp_bind(struct unpcb *unp, struct sockaddr *nam,
		    struct thread *td);
static int	unp_connect(struct socket *so, struct sockaddr *nam,
		    struct thread *td);
static int	unp_connect2(struct socket *so, struct socket *so2, int req);
static void	unp_disconnect(struct unpcb *unp);
static void	unp_shutdown(struct unpcb *unp);
static void	unp_drop(struct unpcb *unp, int errno);
static void	unp_gc(void);
static void	unp_scan(struct mbuf *, void (*)(struct file *));
static void	unp_mark(struct file *);
static void	unp_discard(struct file *);
static void	unp_freerights(struct file **rp, int fdcount);
static int	unp_internalize(struct mbuf **, struct thread *);
static int	unp_listen(struct socket *, struct unpcb *, struct thread *);
129
130static int
131uipc_abort(struct socket *so)
132{
133	struct unpcb *unp;
134
135	UNP_LOCK();
136	unp = sotounpcb(so);
137	if (unp == NULL) {
138		UNP_UNLOCK();
139		return (EINVAL);
140	}
141	unp_drop(unp, ECONNABORTED);
142	unp_detach(unp);
143	UNP_UNLOCK_ASSERT();
144	ACCEPT_LOCK();
145	SOCK_LOCK(so);
146	sotryfree(so);
147	return (0);
148}
149
150static int
151uipc_accept(struct socket *so, struct sockaddr **nam)
152{
153	struct unpcb *unp;
154	const struct sockaddr *sa;
155
156	/*
157	 * Pass back name of connected socket,
158	 * if it was bound and we are still connected
159	 * (our peer may have closed already!).
160	 */
161	*nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
162	UNP_LOCK();
163	unp = sotounpcb(so);
164	if (unp == NULL) {
165		UNP_UNLOCK();
166		free(*nam, M_SONAME);
167		*nam = NULL;
168		return (EINVAL);
169	}
170	if (unp->unp_conn != NULL && unp->unp_conn->unp_addr != NULL)
171		sa = (struct sockaddr *) unp->unp_conn->unp_addr;
172	else
173		sa = &sun_noname;
174	bcopy(sa, *nam, sa->sa_len);
175	UNP_UNLOCK();
176	return (0);
177}
178
179static int
180uipc_attach(struct socket *so, int proto, struct thread *td)
181{
182	struct unpcb *unp = sotounpcb(so);
183
184	if (unp != NULL)
185		return (EISCONN);
186	return (unp_attach(so));
187}
188
189static int
190uipc_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
191{
192	struct unpcb *unp;
193	int error;
194
195	UNP_LOCK();
196	unp = sotounpcb(so);
197	if (unp == NULL) {
198		UNP_UNLOCK();
199		return (EINVAL);
200	}
201	error = unp_bind(unp, nam, td);
202	UNP_UNLOCK();
203	return (error);
204}
205
206static int
207uipc_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
208{
209	struct unpcb *unp;
210	int error;
211
212	KASSERT(td == curthread, ("uipc_connect: td != curthread"));
213
214	UNP_LOCK();
215	unp = sotounpcb(so);
216	if (unp == NULL) {
217		UNP_UNLOCK();
218		return (EINVAL);
219	}
220	error = unp_connect(so, nam, td);
221	UNP_UNLOCK();
222	return (error);
223}
224
225int
226uipc_connect2(struct socket *so1, struct socket *so2)
227{
228	struct unpcb *unp;
229	int error;
230
231	UNP_LOCK();
232	unp = sotounpcb(so1);
233	if (unp == NULL) {
234		UNP_UNLOCK();
235		return (EINVAL);
236	}
237	error = unp_connect2(so1, so2, PRU_CONNECT2);
238	UNP_UNLOCK();
239	return (error);
240}
241
242/* control is EOPNOTSUPP */
243
244static int
245uipc_detach(struct socket *so)
246{
247	struct unpcb *unp;
248
249	UNP_LOCK();
250	unp = sotounpcb(so);
251	if (unp == NULL) {
252		UNP_UNLOCK();
253		return (EINVAL);
254	}
255	unp_detach(unp);
256	UNP_UNLOCK_ASSERT();
257	return (0);
258}
259
260static int
261uipc_disconnect(struct socket *so)
262{
263	struct unpcb *unp;
264
265	UNP_LOCK();
266	unp = sotounpcb(so);
267	if (unp == NULL) {
268		UNP_UNLOCK();
269		return (EINVAL);
270	}
271	unp_disconnect(unp);
272	UNP_UNLOCK();
273	return (0);
274}
275
276static int
277uipc_listen(struct socket *so, struct thread *td)
278{
279	struct unpcb *unp;
280	int error;
281
282	UNP_LOCK();
283	unp = sotounpcb(so);
284	if (unp == NULL || unp->unp_vnode == NULL) {
285		UNP_UNLOCK();
286		return (EINVAL);
287	}
288	error = unp_listen(so, unp, td);
289	UNP_UNLOCK();
290	return (error);
291}
292
293static int
294uipc_peeraddr(struct socket *so, struct sockaddr **nam)
295{
296	struct unpcb *unp;
297	const struct sockaddr *sa;
298
299	*nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
300	UNP_LOCK();
301	unp = sotounpcb(so);
302	if (unp == NULL) {
303		UNP_UNLOCK();
304		free(*nam, M_SONAME);
305		*nam = NULL;
306		return (EINVAL);
307	}
308	if (unp->unp_conn != NULL && unp->unp_conn->unp_addr!= NULL)
309		sa = (struct sockaddr *) unp->unp_conn->unp_addr;
310	else {
311		/*
312		 * XXX: It seems that this test always fails even when
313		 * connection is established.  So, this else clause is
314		 * added as workaround to return PF_LOCAL sockaddr.
315		 */
316		sa = &sun_noname;
317	}
318	bcopy(sa, *nam, sa->sa_len);
319	UNP_UNLOCK();
320	return (0);
321}
322
323static int
324uipc_rcvd(struct socket *so, int flags)
325{
326	struct unpcb *unp;
327	struct socket *so2;
328	u_long newhiwat;
329
330	UNP_LOCK();
331	unp = sotounpcb(so);
332	if (unp == NULL) {
333		UNP_UNLOCK();
334		return (EINVAL);
335	}
336	switch (so->so_type) {
337	case SOCK_DGRAM:
338		panic("uipc_rcvd DGRAM?");
339		/*NOTREACHED*/
340
341	case SOCK_STREAM:
342		if (unp->unp_conn == NULL)
343			break;
344		so2 = unp->unp_conn->unp_socket;
345		SOCKBUF_LOCK(&so2->so_snd);
346		SOCKBUF_LOCK(&so->so_rcv);
347		/*
348		 * Adjust backpressure on sender
349		 * and wakeup any waiting to write.
350		 */
351		so2->so_snd.sb_mbmax += unp->unp_mbcnt - so->so_rcv.sb_mbcnt;
352		unp->unp_mbcnt = so->so_rcv.sb_mbcnt;
353		newhiwat = so2->so_snd.sb_hiwat + unp->unp_cc -
354		    so->so_rcv.sb_cc;
355		(void)chgsbsize(so2->so_cred->cr_uidinfo, &so2->so_snd.sb_hiwat,
356		    newhiwat, RLIM_INFINITY);
357		unp->unp_cc = so->so_rcv.sb_cc;
358		SOCKBUF_UNLOCK(&so->so_rcv);
359		sowwakeup_locked(so2);
360		break;
361
362	default:
363		panic("uipc_rcvd unknown socktype");
364	}
365	UNP_UNLOCK();
366	return (0);
367}
368
369/* pru_rcvoob is EOPNOTSUPP */
370
371static int
372uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
373    struct mbuf *control, struct thread *td)
374{
375	int error = 0;
376	struct unpcb *unp;
377	struct socket *so2;
378	u_long newhiwat;
379
380	unp = sotounpcb(so);
381	if (unp == NULL) {
382		error = EINVAL;
383		goto release;
384	}
385	if (flags & PRUS_OOB) {
386		error = EOPNOTSUPP;
387		goto release;
388	}
389
390	if (control != NULL && (error = unp_internalize(&control, td)))
391		goto release;
392
393	UNP_LOCK();
394	unp = sotounpcb(so);
395	if (unp == NULL) {
396		UNP_UNLOCK();
397		error = EINVAL;
398		goto dispose_release;
399	}
400
401	switch (so->so_type) {
402	case SOCK_DGRAM:
403	{
404		const struct sockaddr *from;
405
406		if (nam != NULL) {
407			if (unp->unp_conn != NULL) {
408				error = EISCONN;
409				break;
410			}
411			error = unp_connect(so, nam, td);
412			if (error)
413				break;
414		} else {
415			if (unp->unp_conn == NULL) {
416				error = ENOTCONN;
417				break;
418			}
419		}
420		so2 = unp->unp_conn->unp_socket;
421		if (unp->unp_addr != NULL)
422			from = (struct sockaddr *)unp->unp_addr;
423		else
424			from = &sun_noname;
425		if (unp->unp_conn->unp_flags & UNP_WANTCRED)
426			control = unp_addsockcred(td, control);
427		SOCKBUF_LOCK(&so2->so_rcv);
428		if (sbappendaddr_locked(&so2->so_rcv, from, m, control)) {
429			sorwakeup_locked(so2);
430			m = NULL;
431			control = NULL;
432		} else {
433			SOCKBUF_UNLOCK(&so2->so_rcv);
434			error = ENOBUFS;
435		}
436		if (nam != NULL)
437			unp_disconnect(unp);
438		break;
439	}
440
441	case SOCK_STREAM:
442		/* Connect if not connected yet. */
443		/*
444		 * Note: A better implementation would complain
445		 * if not equal to the peer's address.
446		 */
447		if ((so->so_state & SS_ISCONNECTED) == 0) {
448			if (nam != NULL) {
449				error = unp_connect(so, nam, td);
450				if (error)
451					break;	/* XXX */
452			} else {
453				error = ENOTCONN;
454				break;
455			}
456		}
457
458		SOCKBUF_LOCK(&so->so_snd);
459		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
460			SOCKBUF_UNLOCK(&so->so_snd);
461			error = EPIPE;
462			break;
463		}
464		if (unp->unp_conn == NULL)
465			panic("uipc_send connected but no connection?");
466		so2 = unp->unp_conn->unp_socket;
467		SOCKBUF_LOCK(&so2->so_rcv);
468		if (unp->unp_conn->unp_flags & UNP_WANTCRED) {
469			/*
470			 * Credentials are passed only once on
471			 * SOCK_STREAM.
472			 */
473			unp->unp_conn->unp_flags &= ~UNP_WANTCRED;
474			control = unp_addsockcred(td, control);
475		}
476		/*
477		 * Send to paired receive port, and then reduce
478		 * send buffer hiwater marks to maintain backpressure.
479		 * Wake up readers.
480		 */
481		if (control != NULL) {
482			if (sbappendcontrol_locked(&so2->so_rcv, m, control))
483				control = NULL;
484		} else {
485			sbappend_locked(&so2->so_rcv, m);
486		}
487		so->so_snd.sb_mbmax -=
488			so2->so_rcv.sb_mbcnt - unp->unp_conn->unp_mbcnt;
489		unp->unp_conn->unp_mbcnt = so2->so_rcv.sb_mbcnt;
490		newhiwat = so->so_snd.sb_hiwat -
491		    (so2->so_rcv.sb_cc - unp->unp_conn->unp_cc);
492		(void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_snd.sb_hiwat,
493		    newhiwat, RLIM_INFINITY);
494		SOCKBUF_UNLOCK(&so->so_snd);
495		unp->unp_conn->unp_cc = so2->so_rcv.sb_cc;
496		sorwakeup_locked(so2);
497		m = NULL;
498		break;
499
500	default:
501		panic("uipc_send unknown socktype");
502	}
503
504	/*
505	 * SEND_EOF is equivalent to a SEND followed by
506	 * a SHUTDOWN.
507	 */
508	if (flags & PRUS_EOF) {
509		socantsendmore(so);
510		unp_shutdown(unp);
511	}
512	UNP_UNLOCK();
513
514dispose_release:
515	if (control != NULL && error != 0)
516		unp_dispose(control);
517
518release:
519	if (control != NULL)
520		m_freem(control);
521	if (m != NULL)
522		m_freem(m);
523	return (error);
524}
525
526static int
527uipc_sense(struct socket *so, struct stat *sb)
528{
529	struct unpcb *unp;
530	struct socket *so2;
531
532	UNP_LOCK();
533	unp = sotounpcb(so);
534	if (unp == NULL) {
535		UNP_UNLOCK();
536		return (EINVAL);
537	}
538	sb->st_blksize = so->so_snd.sb_hiwat;
539	if (so->so_type == SOCK_STREAM && unp->unp_conn != NULL) {
540		so2 = unp->unp_conn->unp_socket;
541		sb->st_blksize += so2->so_rcv.sb_cc;
542	}
543	sb->st_dev = NODEV;
544	if (unp->unp_ino == 0)
545		unp->unp_ino = (++unp_ino == 0) ? ++unp_ino : unp_ino;
546	sb->st_ino = unp->unp_ino;
547	UNP_UNLOCK();
548	return (0);
549}
550
551static int
552uipc_shutdown(struct socket *so)
553{
554	struct unpcb *unp;
555
556	UNP_LOCK();
557	unp = sotounpcb(so);
558	if (unp == NULL) {
559		UNP_UNLOCK();
560		return (EINVAL);
561	}
562	socantsendmore(so);
563	unp_shutdown(unp);
564	UNP_UNLOCK();
565	return (0);
566}
567
568static int
569uipc_sockaddr(struct socket *so, struct sockaddr **nam)
570{
571	struct unpcb *unp;
572	const struct sockaddr *sa;
573
574	*nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
575	UNP_LOCK();
576	unp = sotounpcb(so);
577	if (unp == NULL) {
578		UNP_UNLOCK();
579		free(*nam, M_SONAME);
580		*nam = NULL;
581		return (EINVAL);
582	}
583	if (unp->unp_addr != NULL)
584		sa = (struct sockaddr *) unp->unp_addr;
585	else
586		sa = &sun_noname;
587	bcopy(sa, *nam, sa->sa_len);
588	UNP_UNLOCK();
589	return (0);
590}
591
592struct pr_usrreqs uipc_usrreqs = {
593	.pru_abort = 		uipc_abort,
594	.pru_accept =		uipc_accept,
595	.pru_attach =		uipc_attach,
596	.pru_bind =		uipc_bind,
597	.pru_connect =		uipc_connect,
598	.pru_connect2 =		uipc_connect2,
599	.pru_detach =		uipc_detach,
600	.pru_disconnect =	uipc_disconnect,
601	.pru_listen =		uipc_listen,
602	.pru_peeraddr =		uipc_peeraddr,
603	.pru_rcvd =		uipc_rcvd,
604	.pru_send =		uipc_send,
605	.pru_sense =		uipc_sense,
606	.pru_shutdown =		uipc_shutdown,
607	.pru_sockaddr =		uipc_sockaddr,
608	.pru_sosend =		sosend,
609	.pru_soreceive =	soreceive,
610	.pru_sopoll =		sopoll,
611};
612
613int
614uipc_ctloutput(struct socket *so, struct sockopt *sopt)
615{
616	struct unpcb *unp;
617	struct xucred xu;
618	int error, optval;
619
620	if (sopt->sopt_level != 0)
621		return (EINVAL);
622
623	UNP_LOCK();
624	unp = sotounpcb(so);
625	if (unp == NULL) {
626		UNP_UNLOCK();
627		return (EINVAL);
628	}
629	error = 0;
630
631	switch (sopt->sopt_dir) {
632	case SOPT_GET:
633		switch (sopt->sopt_name) {
634		case LOCAL_PEERCRED:
635			if (unp->unp_flags & UNP_HAVEPC)
636				xu = unp->unp_peercred;
637			else {
638				if (so->so_type == SOCK_STREAM)
639					error = ENOTCONN;
640				else
641					error = EINVAL;
642			}
643			if (error == 0)
644				error = sooptcopyout(sopt, &xu, sizeof(xu));
645			break;
646		case LOCAL_CREDS:
647			optval = unp->unp_flags & UNP_WANTCRED ? 1 : 0;
648			error = sooptcopyout(sopt, &optval, sizeof(optval));
649			break;
650		case LOCAL_CONNWAIT:
651			optval = unp->unp_flags & UNP_CONNWAIT ? 1 : 0;
652			error = sooptcopyout(sopt, &optval, sizeof(optval));
653			break;
654		default:
655			error = EOPNOTSUPP;
656			break;
657		}
658		break;
659	case SOPT_SET:
660		switch (sopt->sopt_name) {
661		case LOCAL_CREDS:
662		case LOCAL_CONNWAIT:
663			error = sooptcopyin(sopt, &optval, sizeof(optval),
664					    sizeof(optval));
665			if (error)
666				break;
667
668#define	OPTSET(bit) \
669	if (optval) \
670		unp->unp_flags |= bit; \
671	else \
672		unp->unp_flags &= ~bit;
673
674			switch (sopt->sopt_name) {
675			case LOCAL_CREDS:
676				OPTSET(UNP_WANTCRED);
677				break;
678			case LOCAL_CONNWAIT:
679				OPTSET(UNP_CONNWAIT);
680				break;
681			default:
682				break;
683			}
684			break;
685#undef	OPTSET
686		default:
687			error = ENOPROTOOPT;
688			break;
689		}
690		break;
691	default:
692		error = EOPNOTSUPP;
693		break;
694	}
695	UNP_UNLOCK();
696	return (error);
697}
698
699/*
700 * Both send and receive buffers are allocated PIPSIZ bytes of buffering
701 * for stream sockets, although the total for sender and receiver is
702 * actually only PIPSIZ.
703 * Datagram sockets really use the sendspace as the maximum datagram size,
704 * and don't really want to reserve the sendspace.  Their recvspace should
705 * be large enough for at least one max-size datagram plus address.
706 */
707#ifndef PIPSIZ
708#define	PIPSIZ	8192
709#endif
710static u_long	unpst_sendspace = PIPSIZ;
711static u_long	unpst_recvspace = PIPSIZ;
712static u_long	unpdg_sendspace = 2*1024;	/* really max datagram size */
713static u_long	unpdg_recvspace = 4*1024;
714
715static int	unp_rights;			/* file descriptors in flight */
716
717SYSCTL_DECL(_net_local_stream);
718SYSCTL_INT(_net_local_stream, OID_AUTO, sendspace, CTLFLAG_RW,
719	   &unpst_sendspace, 0, "");
720SYSCTL_INT(_net_local_stream, OID_AUTO, recvspace, CTLFLAG_RW,
721	   &unpst_recvspace, 0, "");
722SYSCTL_DECL(_net_local_dgram);
723SYSCTL_INT(_net_local_dgram, OID_AUTO, maxdgram, CTLFLAG_RW,
724	   &unpdg_sendspace, 0, "");
725SYSCTL_INT(_net_local_dgram, OID_AUTO, recvspace, CTLFLAG_RW,
726	   &unpdg_recvspace, 0, "");
727SYSCTL_DECL(_net_local);
728SYSCTL_INT(_net_local, OID_AUTO, inflight, CTLFLAG_RD, &unp_rights, 0, "");
729
730static int
731unp_attach(struct socket *so)
732{
733	struct unpcb *unp;
734	int error;
735
736	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
737		switch (so->so_type) {
738
739		case SOCK_STREAM:
740			error = soreserve(so, unpst_sendspace, unpst_recvspace);
741			break;
742
743		case SOCK_DGRAM:
744			error = soreserve(so, unpdg_sendspace, unpdg_recvspace);
745			break;
746
747		default:
748			panic("unp_attach");
749		}
750		if (error)
751			return (error);
752	}
753	unp = uma_zalloc(unp_zone, M_WAITOK | M_ZERO);
754	if (unp == NULL)
755		return (ENOBUFS);
756	LIST_INIT(&unp->unp_refs);
757	unp->unp_socket = so;
758	so->so_pcb = unp;
759
760	UNP_LOCK();
761	unp->unp_gencnt = ++unp_gencnt;
762	unp_count++;
763	LIST_INSERT_HEAD(so->so_type == SOCK_DGRAM ? &unp_dhead
764			 : &unp_shead, unp, unp_link);
765	UNP_UNLOCK();
766
767	return (0);
768}
769
770static void
771unp_detach(struct unpcb *unp)
772{
773	struct vnode *vp;
774
775	UNP_LOCK_ASSERT();
776
777	LIST_REMOVE(unp, unp_link);
778	unp->unp_gencnt = ++unp_gencnt;
779	--unp_count;
780	if ((vp = unp->unp_vnode) != NULL) {
781		/*
782		 * XXXRW: should v_socket be frobbed only while holding
783		 * Giant?
784		 */
785		unp->unp_vnode->v_socket = NULL;
786		unp->unp_vnode = NULL;
787	}
788	if (unp->unp_conn != NULL)
789		unp_disconnect(unp);
790	while (!LIST_EMPTY(&unp->unp_refs)) {
791		struct unpcb *ref = LIST_FIRST(&unp->unp_refs);
792		unp_drop(ref, ECONNRESET);
793	}
794	soisdisconnected(unp->unp_socket);
795	unp->unp_socket->so_pcb = NULL;
796	if (unp_rights) {
797		/*
798		 * Normally the receive buffer is flushed later,
799		 * in sofree, but if our receive buffer holds references
800		 * to descriptors that are now garbage, we will dispose
801		 * of those descriptor references after the garbage collector
802		 * gets them (resulting in a "panic: closef: count < 0").
803		 */
804		sorflush(unp->unp_socket);
805		unp_gc();	/* Will unlock UNP. */
806	} else
807		UNP_UNLOCK();
808	UNP_UNLOCK_ASSERT();
809	if (unp->unp_addr != NULL)
810		FREE(unp->unp_addr, M_SONAME);
811	uma_zfree(unp_zone, unp);
812	if (vp) {
813		mtx_lock(&Giant);
814		vrele(vp);
815		mtx_unlock(&Giant);
816	}
817}
818
819static int
820unp_bind(struct unpcb *unp, struct sockaddr *nam, struct thread *td)
821{
822	struct sockaddr_un *soun = (struct sockaddr_un *)nam;
823	struct vnode *vp;
824	struct mount *mp;
825	struct vattr vattr;
826	int error, namelen;
827	struct nameidata nd;
828	char *buf;
829
830	UNP_LOCK_ASSERT();
831
832	/*
833	 * XXXRW: This test-and-set of unp_vnode is non-atomic; the
834	 * unlocked read here is fine, but the value of unp_vnode needs
835	 * to be tested again after we do all the lookups to see if the
836	 * pcb is still unbound?
837	 */
838	if (unp->unp_vnode != NULL)
839		return (EINVAL);
840
841	namelen = soun->sun_len - offsetof(struct sockaddr_un, sun_path);
842	if (namelen <= 0)
843		return (EINVAL);
844
845	UNP_UNLOCK();
846
847	buf = malloc(namelen + 1, M_TEMP, M_WAITOK);
848	strlcpy(buf, soun->sun_path, namelen + 1);
849
850	mtx_lock(&Giant);
851restart:
852	mtx_assert(&Giant, MA_OWNED);
853	NDINIT(&nd, CREATE, NOFOLLOW | LOCKPARENT | SAVENAME, UIO_SYSSPACE,
854	    buf, td);
855/* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */
856	error = namei(&nd);
857	if (error)
858		goto done;
859	vp = nd.ni_vp;
860	if (vp != NULL || vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
861		NDFREE(&nd, NDF_ONLY_PNBUF);
862		if (nd.ni_dvp == vp)
863			vrele(nd.ni_dvp);
864		else
865			vput(nd.ni_dvp);
866		if (vp != NULL) {
867			vrele(vp);
868			error = EADDRINUSE;
869			goto done;
870		}
871		error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH);
872		if (error)
873			goto done;
874		goto restart;
875	}
876	VATTR_NULL(&vattr);
877	vattr.va_type = VSOCK;
878	vattr.va_mode = (ACCESSPERMS & ~td->td_proc->p_fd->fd_cmask);
879#ifdef MAC
880	error = mac_check_vnode_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
881	    &vattr);
882#endif
883	if (error == 0) {
884		VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE);
885		error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
886	}
887	NDFREE(&nd, NDF_ONLY_PNBUF);
888	vput(nd.ni_dvp);
889	if (error) {
890		vn_finished_write(mp);
891		goto done;
892	}
893	vp = nd.ni_vp;
894	ASSERT_VOP_LOCKED(vp, "unp_bind");
895	soun = (struct sockaddr_un *)sodupsockaddr(nam, M_WAITOK);
896	UNP_LOCK();
897	vp->v_socket = unp->unp_socket;
898	unp->unp_vnode = vp;
899	unp->unp_addr = soun;
900	UNP_UNLOCK();
901	VOP_UNLOCK(vp, 0, td);
902	vn_finished_write(mp);
903done:
904	mtx_unlock(&Giant);
905	free(buf, M_TEMP);
906	UNP_LOCK();
907	return (error);
908}
909
910static int
911unp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
912{
913	struct sockaddr_un *soun = (struct sockaddr_un *)nam;
914	struct vnode *vp;
915	struct socket *so2, *so3;
916	struct unpcb *unp, *unp2, *unp3;
917	int error, len;
918	struct nameidata nd;
919	char buf[SOCK_MAXADDRLEN];
920	struct sockaddr *sa;
921
922	UNP_LOCK_ASSERT();
923	unp = sotounpcb(so);
924
925	len = nam->sa_len - offsetof(struct sockaddr_un, sun_path);
926	if (len <= 0)
927		return (EINVAL);
928	strlcpy(buf, soun->sun_path, len + 1);
929	UNP_UNLOCK();
930	sa = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
931	mtx_lock(&Giant);
932	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, buf, td);
933	error = namei(&nd);
934	if (error)
935		vp = NULL;
936	else
937		vp = nd.ni_vp;
938	ASSERT_VOP_LOCKED(vp, "unp_connect");
939	NDFREE(&nd, NDF_ONLY_PNBUF);
940	if (error)
941		goto bad;
942
943	if (vp->v_type != VSOCK) {
944		error = ENOTSOCK;
945		goto bad;
946	}
947	error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td);
948	if (error)
949		goto bad;
950	mtx_unlock(&Giant);
951	UNP_LOCK();
952	unp = sotounpcb(so);
953	if (unp == NULL) {
954		error = EINVAL;
955		goto bad2;
956	}
957	so2 = vp->v_socket;
958	if (so2 == NULL) {
959		error = ECONNREFUSED;
960		goto bad2;
961	}
962	if (so->so_type != so2->so_type) {
963		error = EPROTOTYPE;
964		goto bad2;
965	}
966	if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
967		if (so2->so_options & SO_ACCEPTCONN) {
968			/*
969			 * NB: drop locks here so unp_attach is entered
970			 *     w/o locks; this avoids a recursive lock
971			 *     of the head and holding sleep locks across
972			 *     a (potentially) blocking malloc.
973			 */
974			UNP_UNLOCK();
975			so3 = sonewconn(so2, 0);
976			UNP_LOCK();
977		} else
978			so3 = NULL;
979		if (so3 == NULL) {
980			error = ECONNREFUSED;
981			goto bad2;
982		}
983		unp = sotounpcb(so);
984		unp2 = sotounpcb(so2);
985		unp3 = sotounpcb(so3);
986		if (unp2->unp_addr != NULL) {
987			bcopy(unp2->unp_addr, sa, unp2->unp_addr->sun_len);
988			unp3->unp_addr = (struct sockaddr_un *) sa;
989			sa = NULL;
990		}
991		/*
992		 * unp_peercred management:
993		 *
994		 * The connecter's (client's) credentials are copied
995		 * from its process structure at the time of connect()
996		 * (which is now).
997		 */
998		cru2x(td->td_ucred, &unp3->unp_peercred);
999		unp3->unp_flags |= UNP_HAVEPC;
1000		/*
1001		 * The receiver's (server's) credentials are copied
1002		 * from the unp_peercred member of socket on which the
1003		 * former called listen(); unp_listen() cached that
1004		 * process's credentials at that time so we can use
1005		 * them now.
1006		 */
1007		KASSERT(unp2->unp_flags & UNP_HAVEPCCACHED,
1008		    ("unp_connect: listener without cached peercred"));
1009		memcpy(&unp->unp_peercred, &unp2->unp_peercred,
1010		    sizeof(unp->unp_peercred));
1011		unp->unp_flags |= UNP_HAVEPC;
1012#ifdef MAC
1013		SOCK_LOCK(so);
1014		mac_set_socket_peer_from_socket(so, so3);
1015		mac_set_socket_peer_from_socket(so3, so);
1016		SOCK_UNLOCK(so);
1017#endif
1018
1019		so2 = so3;
1020	}
1021	error = unp_connect2(so, so2, PRU_CONNECT);
1022bad2:
1023	UNP_UNLOCK();
1024	mtx_lock(&Giant);
1025bad:
1026	mtx_assert(&Giant, MA_OWNED);
1027	if (vp != NULL)
1028		vput(vp);
1029	mtx_unlock(&Giant);
1030	free(sa, M_SONAME);
1031	UNP_LOCK();
1032	return (error);
1033}
1034
1035static int
1036unp_connect2(struct socket *so, struct socket *so2, int req)
1037{
1038	struct unpcb *unp = sotounpcb(so);
1039	struct unpcb *unp2;
1040
1041	UNP_LOCK_ASSERT();
1042
1043	if (so2->so_type != so->so_type)
1044		return (EPROTOTYPE);
1045	unp2 = sotounpcb(so2);
1046	unp->unp_conn = unp2;
1047	switch (so->so_type) {
1048
1049	case SOCK_DGRAM:
1050		LIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_reflink);
1051		soisconnected(so);
1052		break;
1053
1054	case SOCK_STREAM:
1055		unp2->unp_conn = unp;
1056		if (req == PRU_CONNECT &&
1057		    ((unp->unp_flags | unp2->unp_flags) & UNP_CONNWAIT))
1058			soisconnecting(so);
1059		else
1060			soisconnected(so);
1061		soisconnected(so2);
1062		break;
1063
1064	default:
1065		panic("unp_connect2");
1066	}
1067	return (0);
1068}
1069
1070static void
1071unp_disconnect(struct unpcb *unp)
1072{
1073	struct unpcb *unp2 = unp->unp_conn;
1074	struct socket *so;
1075
1076	UNP_LOCK_ASSERT();
1077
1078	if (unp2 == NULL)
1079		return;
1080	unp->unp_conn = NULL;
1081	switch (unp->unp_socket->so_type) {
1082
1083	case SOCK_DGRAM:
1084		LIST_REMOVE(unp, unp_reflink);
1085		so = unp->unp_socket;
1086		SOCK_LOCK(so);
1087		so->so_state &= ~SS_ISCONNECTED;
1088		SOCK_UNLOCK(so);
1089		break;
1090
1091	case SOCK_STREAM:
1092		soisdisconnected(unp->unp_socket);
1093		unp2->unp_conn = NULL;
1094		soisdisconnected(unp2->unp_socket);
1095		break;
1096	}
1097}
1098
#ifdef notdef
/*
 * Compiled out.  Detaches the pcb and asserts that the UNP lock is no
 * longer held afterwards.
 */
void
unp_abort(struct unpcb *unp)
{
	unp_detach(unp);
	UNP_UNLOCK_ASSERT();
}
#endif
1108
1109/*
1110 * unp_pcblist() assumes that UNIX domain socket memory is never reclaimed
1111 * by the zone (UMA_ZONE_NOFREE), and as such potentially stale pointers
1112 * are safe to reference.  It first scans the list of struct unpcb's to
1113 * generate a pointer list, then it rescans its list one entry at a time to
1114 * externalize and copyout.  It checks the generation number to see if a
1115 * struct unpcb has been reused, and will skip it if so.
1116 */
1117static int
1118unp_pcblist(SYSCTL_HANDLER_ARGS)
1119{
1120	int error, i, n;
1121	struct unpcb *unp, **unp_list;
1122	unp_gen_t gencnt;
1123	struct xunpgen *xug;
1124	struct unp_head *head;
1125	struct xunpcb *xu;
1126
1127	head = ((intptr_t)arg1 == SOCK_DGRAM ? &unp_dhead : &unp_shead);
1128
1129	/*
1130	 * The process of preparing the PCB list is too time-consuming and
1131	 * resource-intensive to repeat twice on every request.
1132	 */
1133	if (req->oldptr == NULL) {
1134		n = unp_count;
1135		req->oldidx = 2 * (sizeof *xug)
1136			+ (n + n/8) * sizeof(struct xunpcb);
1137		return (0);
1138	}
1139
1140	if (req->newptr != NULL)
1141		return (EPERM);
1142
1143	/*
1144	 * OK, now we're committed to doing something.
1145	 */
1146	xug = malloc(sizeof(*xug), M_TEMP, M_WAITOK);
1147	UNP_LOCK();
1148	gencnt = unp_gencnt;
1149	n = unp_count;
1150	UNP_UNLOCK();
1151
1152	xug->xug_len = sizeof *xug;
1153	xug->xug_count = n;
1154	xug->xug_gen = gencnt;
1155	xug->xug_sogen = so_gencnt;
1156	error = SYSCTL_OUT(req, xug, sizeof *xug);
1157	if (error) {
1158		free(xug, M_TEMP);
1159		return (error);
1160	}
1161
1162	unp_list = malloc(n * sizeof *unp_list, M_TEMP, M_WAITOK);
1163
1164	UNP_LOCK();
1165	for (unp = LIST_FIRST(head), i = 0; unp && i < n;
1166	     unp = LIST_NEXT(unp, unp_link)) {
1167		if (unp->unp_gencnt <= gencnt) {
1168			if (cr_cansee(req->td->td_ucred,
1169			    unp->unp_socket->so_cred))
1170				continue;
1171			unp_list[i++] = unp;
1172		}
1173	}
1174	UNP_UNLOCK();
1175	n = i;			/* in case we lost some during malloc */
1176
1177	error = 0;
1178	xu = malloc(sizeof(*xu), M_TEMP, M_WAITOK);
1179	for (i = 0; i < n; i++) {
1180		unp = unp_list[i];
1181		if (unp->unp_gencnt <= gencnt) {
1182			xu->xu_len = sizeof *xu;
1183			xu->xu_unpp = unp;
1184			/*
1185			 * XXX - need more locking here to protect against
1186			 * connect/disconnect races for SMP.
1187			 */
1188			if (unp->unp_addr != NULL)
1189				bcopy(unp->unp_addr, &xu->xu_addr,
1190				      unp->unp_addr->sun_len);
1191			if (unp->unp_conn != NULL &&
1192			    unp->unp_conn->unp_addr != NULL)
1193				bcopy(unp->unp_conn->unp_addr,
1194				      &xu->xu_caddr,
1195				      unp->unp_conn->unp_addr->sun_len);
1196			bcopy(unp, &xu->xu_unp, sizeof *unp);
1197			sotoxsocket(unp->unp_socket, &xu->xu_socket);
1198			error = SYSCTL_OUT(req, xu, sizeof *xu);
1199		}
1200	}
1201	free(xu, M_TEMP);
1202	if (!error) {
1203		/*
1204		 * Give the user an updated idea of our state.
1205		 * If the generation differs from what we told
1206		 * her before, she knows that something happened
1207		 * while we were processing this request, and it
1208		 * might be necessary to retry.
1209		 */
1210		xug->xug_gen = unp_gencnt;
1211		xug->xug_sogen = so_gencnt;
1212		xug->xug_count = unp_count;
1213		error = SYSCTL_OUT(req, xug, sizeof *xug);
1214	}
1215	free(unp_list, M_TEMP);
1216	free(xug, M_TEMP);
1217	return (error);
1218}
1219
/*
 * Register two read-only (CTLFLAG_RD) "pcblist" sysctl nodes, one under
 * _net_local_dgram and one under _net_local_stream.  Both are served by
 * unp_pcblist(); the socket type is passed as the handler's first argument
 * so it can tell the two nodes apart.
 */
SYSCTL_PROC(_net_local_dgram, OID_AUTO, pcblist, CTLFLAG_RD,
	    (caddr_t)(long)SOCK_DGRAM, 0, unp_pcblist, "S,xunpcb",
	    "List of active local datagram sockets");
SYSCTL_PROC(_net_local_stream, OID_AUTO, pcblist, CTLFLAG_RD,
	    (caddr_t)(long)SOCK_STREAM, 0, unp_pcblist, "S,xunpcb",
	    "List of active local stream sockets");
1226
1227static void
1228unp_shutdown(struct unpcb *unp)
1229{
1230	struct socket *so;
1231
1232	UNP_LOCK_ASSERT();
1233
1234	if (unp->unp_socket->so_type == SOCK_STREAM && unp->unp_conn &&
1235	    (so = unp->unp_conn->unp_socket))
1236		socantrcvmore(so);
1237}
1238
1239static void
1240unp_drop(struct unpcb *unp, int errno)
1241{
1242	struct socket *so = unp->unp_socket;
1243
1244	UNP_LOCK_ASSERT();
1245
1246	so->so_error = errno;
1247	unp_disconnect(unp);
1248}
1249
/*
 * Empty placeholder; only compiled when 'notdef' is defined.
 */
#ifdef notdef
void
unp_drain(void)
{

}
#endif
1257
1258static void
1259unp_freerights(struct file **rp, int fdcount)
1260{
1261	int i;
1262	struct file *fp;
1263
1264	for (i = 0; i < fdcount; i++) {
1265		fp = *rp;
1266		/*
1267		 * zero the pointer before calling
1268		 * unp_discard since it may end up
1269		 * in unp_gc()..
1270		 */
1271		*rp++ = 0;
1272		unp_discard(fp);
1273	}
1274}
1275
1276int
1277unp_externalize(struct mbuf *control, struct mbuf **controlp)
1278{
1279	struct thread *td = curthread;		/* XXX */
1280	struct cmsghdr *cm = mtod(control, struct cmsghdr *);
1281	int i;
1282	int *fdp;
1283	struct file **rp;
1284	struct file *fp;
1285	void *data;
1286	socklen_t clen = control->m_len, datalen;
1287	int error, newfds;
1288	int f;
1289	u_int newlen;
1290
1291	UNP_UNLOCK_ASSERT();
1292
1293	error = 0;
1294	if (controlp != NULL) /* controlp == NULL => free control messages */
1295		*controlp = NULL;
1296
1297	while (cm != NULL) {
1298		if (sizeof(*cm) > clen || cm->cmsg_len > clen) {
1299			error = EINVAL;
1300			break;
1301		}
1302
1303		data = CMSG_DATA(cm);
1304		datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data;
1305
1306		if (cm->cmsg_level == SOL_SOCKET
1307		    && cm->cmsg_type == SCM_RIGHTS) {
1308			newfds = datalen / sizeof(struct file *);
1309			rp = data;
1310
1311			/* If we're not outputting the descriptors free them. */
1312			if (error || controlp == NULL) {
1313				unp_freerights(rp, newfds);
1314				goto next;
1315			}
1316			FILEDESC_LOCK(td->td_proc->p_fd);
1317			/* if the new FD's will not fit free them.  */
1318			if (!fdavail(td, newfds)) {
1319				FILEDESC_UNLOCK(td->td_proc->p_fd);
1320				error = EMSGSIZE;
1321				unp_freerights(rp, newfds);
1322				goto next;
1323			}
1324			/*
1325			 * now change each pointer to an fd in the global
1326			 * table to an integer that is the index to the
1327			 * local fd table entry that we set up to point
1328			 * to the global one we are transferring.
1329			 */
1330			newlen = newfds * sizeof(int);
1331			*controlp = sbcreatecontrol(NULL, newlen,
1332			    SCM_RIGHTS, SOL_SOCKET);
1333			if (*controlp == NULL) {
1334				FILEDESC_UNLOCK(td->td_proc->p_fd);
1335				error = E2BIG;
1336				unp_freerights(rp, newfds);
1337				goto next;
1338			}
1339
1340			fdp = (int *)
1341			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
1342			for (i = 0; i < newfds; i++) {
1343				if (fdalloc(td, 0, &f))
1344					panic("unp_externalize fdalloc failed");
1345				fp = *rp++;
1346				td->td_proc->p_fd->fd_ofiles[f] = fp;
1347				FILE_LOCK(fp);
1348				fp->f_msgcount--;
1349				FILE_UNLOCK(fp);
1350				unp_rights--;
1351				*fdp++ = f;
1352			}
1353			FILEDESC_UNLOCK(td->td_proc->p_fd);
1354		} else { /* We can just copy anything else across */
1355			if (error || controlp == NULL)
1356				goto next;
1357			*controlp = sbcreatecontrol(NULL, datalen,
1358			    cm->cmsg_type, cm->cmsg_level);
1359			if (*controlp == NULL) {
1360				error = ENOBUFS;
1361				goto next;
1362			}
1363			bcopy(data,
1364			    CMSG_DATA(mtod(*controlp, struct cmsghdr *)),
1365			    datalen);
1366		}
1367
1368		controlp = &(*controlp)->m_next;
1369
1370next:
1371		if (CMSG_SPACE(datalen) < clen) {
1372			clen -= CMSG_SPACE(datalen);
1373			cm = (struct cmsghdr *)
1374			    ((caddr_t)cm + CMSG_SPACE(datalen));
1375		} else {
1376			clen = 0;
1377			cm = NULL;
1378		}
1379	}
1380
1381	m_freem(control);
1382
1383	return (error);
1384}
1385
1386void
1387unp_init(void)
1388{
1389	unp_zone = uma_zcreate("unpcb", sizeof(struct unpcb), NULL, NULL,
1390	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
1391	if (unp_zone == NULL)
1392		panic("unp_init");
1393	uma_zone_set_max(unp_zone, nmbclusters);
1394	LIST_INIT(&unp_dhead);
1395	LIST_INIT(&unp_shead);
1396
1397	UNP_LOCK_INIT();
1398}
1399
1400static int
1401unp_internalize(struct mbuf **controlp, struct thread *td)
1402{
1403	struct mbuf *control = *controlp;
1404	struct proc *p = td->td_proc;
1405	struct filedesc *fdescp = p->p_fd;
1406	struct cmsghdr *cm = mtod(control, struct cmsghdr *);
1407	struct cmsgcred *cmcred;
1408	struct file **rp;
1409	struct file *fp;
1410	struct timeval *tv;
1411	int i, fd, *fdp;
1412	void *data;
1413	socklen_t clen = control->m_len, datalen;
1414	int error, oldfds;
1415	u_int newlen;
1416
1417	UNP_UNLOCK_ASSERT();
1418
1419	error = 0;
1420	*controlp = NULL;
1421
1422	while (cm != NULL) {
1423		if (sizeof(*cm) > clen || cm->cmsg_level != SOL_SOCKET
1424		    || cm->cmsg_len > clen) {
1425			error = EINVAL;
1426			goto out;
1427		}
1428
1429		data = CMSG_DATA(cm);
1430		datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data;
1431
1432		switch (cm->cmsg_type) {
1433		/*
1434		 * Fill in credential information.
1435		 */
1436		case SCM_CREDS:
1437			*controlp = sbcreatecontrol(NULL, sizeof(*cmcred),
1438			    SCM_CREDS, SOL_SOCKET);
1439			if (*controlp == NULL) {
1440				error = ENOBUFS;
1441				goto out;
1442			}
1443
1444			cmcred = (struct cmsgcred *)
1445			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
1446			cmcred->cmcred_pid = p->p_pid;
1447			cmcred->cmcred_uid = td->td_ucred->cr_ruid;
1448			cmcred->cmcred_gid = td->td_ucred->cr_rgid;
1449			cmcred->cmcred_euid = td->td_ucred->cr_uid;
1450			cmcred->cmcred_ngroups = MIN(td->td_ucred->cr_ngroups,
1451							CMGROUP_MAX);
1452			for (i = 0; i < cmcred->cmcred_ngroups; i++)
1453				cmcred->cmcred_groups[i] =
1454				    td->td_ucred->cr_groups[i];
1455			break;
1456
1457		case SCM_RIGHTS:
1458			oldfds = datalen / sizeof (int);
1459			/*
1460			 * check that all the FDs passed in refer to legal files
1461			 * If not, reject the entire operation.
1462			 */
1463			fdp = data;
1464			FILEDESC_LOCK(fdescp);
1465			for (i = 0; i < oldfds; i++) {
1466				fd = *fdp++;
1467				if ((unsigned)fd >= fdescp->fd_nfiles ||
1468				    fdescp->fd_ofiles[fd] == NULL) {
1469					FILEDESC_UNLOCK(fdescp);
1470					error = EBADF;
1471					goto out;
1472				}
1473				fp = fdescp->fd_ofiles[fd];
1474				if (!(fp->f_ops->fo_flags & DFLAG_PASSABLE)) {
1475					FILEDESC_UNLOCK(fdescp);
1476					error = EOPNOTSUPP;
1477					goto out;
1478				}
1479
1480			}
1481			/*
1482			 * Now replace the integer FDs with pointers to
1483			 * the associated global file table entry..
1484			 */
1485			newlen = oldfds * sizeof(struct file *);
1486			*controlp = sbcreatecontrol(NULL, newlen,
1487			    SCM_RIGHTS, SOL_SOCKET);
1488			if (*controlp == NULL) {
1489				FILEDESC_UNLOCK(fdescp);
1490				error = E2BIG;
1491				goto out;
1492			}
1493
1494			fdp = data;
1495			rp = (struct file **)
1496			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
1497			for (i = 0; i < oldfds; i++) {
1498				fp = fdescp->fd_ofiles[*fdp++];
1499				*rp++ = fp;
1500				FILE_LOCK(fp);
1501				fp->f_count++;
1502				fp->f_msgcount++;
1503				FILE_UNLOCK(fp);
1504				unp_rights++;
1505			}
1506			FILEDESC_UNLOCK(fdescp);
1507			break;
1508
1509		case SCM_TIMESTAMP:
1510			*controlp = sbcreatecontrol(NULL, sizeof(*tv),
1511			    SCM_TIMESTAMP, SOL_SOCKET);
1512			if (*controlp == NULL) {
1513				error = ENOBUFS;
1514				goto out;
1515			}
1516			tv = (struct timeval *)
1517			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
1518			microtime(tv);
1519			break;
1520
1521		default:
1522			error = EINVAL;
1523			goto out;
1524		}
1525
1526		controlp = &(*controlp)->m_next;
1527
1528		if (CMSG_SPACE(datalen) < clen) {
1529			clen -= CMSG_SPACE(datalen);
1530			cm = (struct cmsghdr *)
1531			    ((caddr_t)cm + CMSG_SPACE(datalen));
1532		} else {
1533			clen = 0;
1534			cm = NULL;
1535		}
1536	}
1537
1538out:
1539	m_freem(control);
1540
1541	return (error);
1542}
1543
1544struct mbuf *
1545unp_addsockcred(struct thread *td, struct mbuf *control)
1546{
1547	struct mbuf *m, *n;
1548	struct sockcred *sc;
1549	int ngroups;
1550	int i;
1551
1552	ngroups = MIN(td->td_ucred->cr_ngroups, CMGROUP_MAX);
1553
1554	m = sbcreatecontrol(NULL, SOCKCREDSIZE(ngroups), SCM_CREDS, SOL_SOCKET);
1555	if (m == NULL)
1556		return (control);
1557	m->m_next = NULL;
1558
1559	sc = (struct sockcred *) CMSG_DATA(mtod(m, struct cmsghdr *));
1560	sc->sc_uid = td->td_ucred->cr_ruid;
1561	sc->sc_euid = td->td_ucred->cr_uid;
1562	sc->sc_gid = td->td_ucred->cr_rgid;
1563	sc->sc_egid = td->td_ucred->cr_gid;
1564	sc->sc_ngroups = ngroups;
1565	for (i = 0; i < sc->sc_ngroups; i++)
1566		sc->sc_groups[i] = td->td_ucred->cr_groups[i];
1567
1568	/*
1569	 * If a control message already exists, append us to the end.
1570	 */
1571	if (control != NULL) {
1572		for (n = control; n->m_next != NULL; n = n->m_next)
1573			;
1574		n->m_next = m;
1575	} else
1576		control = m;
1577
1578	return (control);
1579}
1580
1581/*
1582 * unp_defer is thread-local during garbage collection, and does not require
1583 * explicit synchronization.  unp_gcing prevents other threads from entering
1584 * garbage collection, and perhaps should be an sx lock instead.
1585 */
1586static int	unp_defer, unp_gcing;
1587
/*
 * Garbage-collect files that are referenced only by SCM_RIGHTS messages
 * sitting in socket buffers.
 *
 * Called with the UNP lock held (asserted); the lock is dropped on every
 * path and is asserted to be released on the normal return.  If another
 * collection is already running (unp_gcing is set) this returns right away.
 *
 * Mark phase: under the shared filelist_lock, FMARK and FDEFER are cleared
 * on every file, then the file list is walked repeatedly.  A file with
 * references other than in-message ones (f_count != f_msgcount) is marked
 * FMARK; for each such local-domain socket with PR_RIGHTS, the files held
 * in its receive buffer are marked through unp_scan()/unp_mark(), which
 * also flags them FDEFER so that they are examined on a later pass.  The
 * walk repeats until unp_defer drops to zero.
 *
 * Sweep phase: every open file whose references all come from messages and
 * that was not marked gets an extra reference and is recorded in a
 * temporary array.  Sockets among them are flushed with sorflush(), then
 * every recorded file is closed with closef().
 */
static void
unp_gc(void)
{
	struct file *fp, *nextfp;
	struct socket *so;
	struct file **extra_ref, **fpp;
	int nunref, i;
	int nfiles_snap;
	int nfiles_slack = 20;

	UNP_LOCK_ASSERT();

	if (unp_gcing) {
		UNP_UNLOCK();
		return;
	}
	unp_gcing = 1;
	unp_defer = 0;
	UNP_UNLOCK();
	/*
	 * before going through all this, set all FDs to
	 * be NOT deferred and NOT externally accessible
	 */
	sx_slock(&filelist_lock);
	LIST_FOREACH(fp, &filehead, f_list)
		fp->f_gcflag &= ~(FMARK|FDEFER);
	do {
		LIST_FOREACH(fp, &filehead, f_list) {
			FILE_LOCK(fp);
			/*
			 * If the file is not open, skip it
			 */
			if (fp->f_count == 0) {
				FILE_UNLOCK(fp);
				continue;
			}
			/*
			 * If we already marked it as 'defer' in a
			 * previous pass, then try to process it this time
			 * and un-mark it
			 */
			if (fp->f_gcflag & FDEFER) {
				fp->f_gcflag &= ~FDEFER;
				unp_defer--;
			} else {
				/*
				 * if it's not deferred, then check if it's
				 * already marked.. if so skip it
				 */
				if (fp->f_gcflag & FMARK) {
					FILE_UNLOCK(fp);
					continue;
				}
				/*
				 * If all references are from messages
				 * in transit, then skip it. it's not
				 * externally accessible.
				 */
				if (fp->f_count == fp->f_msgcount) {
					FILE_UNLOCK(fp);
					continue;
				}
				/*
				 * If it got this far then it must be
				 * externally accessible.
				 */
				fp->f_gcflag |= FMARK;
			}
			/*
			 * either it was deferred, or it is externally
			 * accessible and not already marked so.
			 * Now check if it is possibly one of OUR sockets.
			 */
			if (fp->f_type != DTYPE_SOCKET ||
			    (so = fp->f_data) == NULL) {
				FILE_UNLOCK(fp);
				continue;
			}
			FILE_UNLOCK(fp);
			if (so->so_proto->pr_domain != &localdomain ||
			    (so->so_proto->pr_flags&PR_RIGHTS) == 0)
				continue;
#ifdef notdef
			if (so->so_rcv.sb_flags & SB_LOCK) {
				/*
				 * This is problematical; it's not clear
				 * we need to wait for the sockbuf to be
				 * unlocked (on a uniprocessor, at least),
				 * and it's also not clear what to do
				 * if sbwait returns an error due to receipt
				 * of a signal.  If sbwait does return
				 * an error, we'll go into an infinite
				 * loop.  Delete all of this for now.
				 */
				(void) sbwait(&so->so_rcv);
				goto restart;
			}
#endif
			/*
			 * So, Ok, it's one of our sockets and it IS externally
			 * accessible (or was deferred). Now we look
			 * to see if we hold any file descriptors in its
			 * message buffers. Follow those links and mark them
			 * as accessible too.
			 */
			SOCKBUF_LOCK(&so->so_rcv);
			unp_scan(so->so_rcv.sb_mb, unp_mark);
			SOCKBUF_UNLOCK(&so->so_rcv);
		}
	} while (unp_defer);
	sx_sunlock(&filelist_lock);
	/*
	 * We grab an extra reference to each of the file table entries
	 * that are not otherwise accessible and then free the rights
	 * that are stored in messages on them.
	 *
	 * The bug in the original code is a little tricky, so I'll describe
	 * what's wrong with it here.
	 *
	 * It is incorrect to simply unp_discard each entry for f_msgcount
	 * times -- consider the case of sockets A and B that contain
	 * references to each other.  On a last close of some other socket,
	 * we trigger a gc since the number of outstanding rights (unp_rights)
	 * is non-zero.  If during the sweep phase the gc code unp_discards,
	 * we end up doing a (full) closef on the descriptor.  A closef on A
	 * results in the following chain.  Closef calls soo_close, which
	 * calls soclose.   Soclose calls first (through the switch
	 * uipc_usrreq) unp_detach, which re-invokes unp_gc.  Unp_gc simply
	 * returns because the previous instance had set unp_gcing, and
	 * we return all the way back to soclose, which marks the socket
	 * with SS_NOFDREF, and then calls sofree.  Sofree calls sorflush
	 * to free up the rights that are queued in messages on the socket A,
	 * i.e., the reference on B.  The sorflush calls via the dom_dispose
	 * switch unp_dispose, which unp_scans with unp_discard.  This second
	 * instance of unp_discard just calls closef on B.
	 *
	 * Well, a similar chain occurs on B, resulting in a sorflush on B,
	 * which results in another closef on A.  Unfortunately, A is already
	 * being closed, and the descriptor has already been marked with
	 * SS_NOFDREF, and soclose panics at this point.
	 *
	 * Here, we first take an extra reference to each inaccessible
	 * descriptor.  Then, we call sorflush ourself, since we know
	 * it is a Unix domain socket anyhow.  After we destroy all the
	 * rights carried in messages, we do a last closef to get rid
	 * of our extra reference.  This is the last close, and the
	 * unp_detach etc will shut down the socket.
	 *
	 * 91/09/19, bsy@cs.cmu.edu
	 */
again:
	/*
	 * Size the array from a snapshot of openfiles plus slack; the
	 * allocation is done before taking filelist_lock, so re-check the
	 * count under the lock and retry with more slack if it outgrew
	 * the snapshot.
	 */
	nfiles_snap = openfiles + nfiles_slack;	/* some slack */
	extra_ref = malloc(nfiles_snap * sizeof(struct file *), M_TEMP,
	    M_WAITOK);
	sx_slock(&filelist_lock);
	if (nfiles_snap < openfiles) {
		sx_sunlock(&filelist_lock);
		free(extra_ref, M_TEMP);
		nfiles_slack += 20;
		goto again;
	}
	for (nunref = 0, fp = LIST_FIRST(&filehead), fpp = extra_ref;
	    fp != NULL; fp = nextfp) {
		nextfp = LIST_NEXT(fp, f_list);
		FILE_LOCK(fp);
		/*
		 * If it's not open, skip it
		 */
		if (fp->f_count == 0) {
			FILE_UNLOCK(fp);
			continue;
		}
		/*
		 * If all refs are from msgs, and it's not marked accessible
		 * then it must be referenced from some unreachable cycle
		 * of (shut-down) FDs, so include it in our
		 * list of FDs to remove
		 */
		if (fp->f_count == fp->f_msgcount && !(fp->f_gcflag & FMARK)) {
			*fpp++ = fp;
			nunref++;
			fp->f_count++;
		}
		FILE_UNLOCK(fp);
	}
	sx_sunlock(&filelist_lock);
	/*
	 * for each FD on our hit list, do the following two things:
	 * flush its socket (if it is one), then drop our extra reference.
	 */
	for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) {
		struct file *tfp = *fpp;
		FILE_LOCK(tfp);
		if (tfp->f_type == DTYPE_SOCKET &&
		    tfp->f_data != NULL) {
			FILE_UNLOCK(tfp);
			sorflush(tfp->f_data);
		} else {
			FILE_UNLOCK(tfp);
		}
	}
	for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp)
		closef(*fpp, (struct thread *) NULL);
	free(extra_ref, M_TEMP);
	unp_gcing = 0;

	UNP_UNLOCK_ASSERT();
}
1795
1796void
1797unp_dispose(struct mbuf *m)
1798{
1799
1800	if (m)
1801		unp_scan(m, unp_discard);
1802}
1803
1804static int
1805unp_listen(struct socket *so, struct unpcb *unp, struct thread *td)
1806{
1807	int error;
1808
1809	UNP_LOCK_ASSERT();
1810
1811	SOCK_LOCK(so);
1812	error = solisten_proto_check(so);
1813	if (error == 0) {
1814		cru2x(td->td_ucred, &unp->unp_peercred);
1815		unp->unp_flags |= UNP_HAVEPCCACHED;
1816		solisten_proto(so);
1817	}
1818	SOCK_UNLOCK(so);
1819	return (error);
1820}
1821
1822static void
1823unp_scan(struct mbuf *m0, void (*op)(struct file *))
1824{
1825	struct mbuf *m;
1826	struct file **rp;
1827	struct cmsghdr *cm;
1828	void *data;
1829	int i;
1830	socklen_t clen, datalen;
1831	int qfds;
1832
1833	while (m0 != NULL) {
1834		for (m = m0; m; m = m->m_next) {
1835			if (m->m_type != MT_CONTROL)
1836				continue;
1837
1838			cm = mtod(m, struct cmsghdr *);
1839			clen = m->m_len;
1840
1841			while (cm != NULL) {
1842				if (sizeof(*cm) > clen || cm->cmsg_len > clen)
1843					break;
1844
1845				data = CMSG_DATA(cm);
1846				datalen = (caddr_t)cm + cm->cmsg_len
1847				    - (caddr_t)data;
1848
1849				if (cm->cmsg_level == SOL_SOCKET &&
1850				    cm->cmsg_type == SCM_RIGHTS) {
1851					qfds = datalen / sizeof (struct file *);
1852					rp = data;
1853					for (i = 0; i < qfds; i++)
1854						(*op)(*rp++);
1855				}
1856
1857				if (CMSG_SPACE(datalen) < clen) {
1858					clen -= CMSG_SPACE(datalen);
1859					cm = (struct cmsghdr *)
1860					    ((caddr_t)cm + CMSG_SPACE(datalen));
1861				} else {
1862					clen = 0;
1863					cm = NULL;
1864				}
1865			}
1866		}
1867		m0 = m0->m_act;
1868	}
1869}
1870
1871static void
1872unp_mark(struct file *fp)
1873{
1874	if (fp->f_gcflag & FMARK)
1875		return;
1876	unp_defer++;
1877	fp->f_gcflag |= (FMARK|FDEFER);
1878}
1879
1880static void
1881unp_discard(struct file *fp)
1882{
1883	FILE_LOCK(fp);
1884	fp->f_msgcount--;
1885	unp_rights--;
1886	FILE_UNLOCK(fp);
1887	(void) closef(fp, (struct thread *)NULL);
1888}
1889