uipc_usrreq.c revision 130316
1/*
2 * Copyright (c) 1982, 1986, 1989, 1991, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 4. Neither the name of the University nor the names of its contributors
14 *    may be used to endorse or promote products derived from this software
15 *    without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 *
29 *	From: @(#)uipc_usrreq.c	8.3 (Berkeley) 1/4/94
30 */
31
32#include <sys/cdefs.h>
33__FBSDID("$FreeBSD: head/sys/kern/uipc_usrreq.c 130316 2004-06-10 21:34:38Z rwatson $");
34
35#include "opt_mac.h"
36
37#include <sys/param.h>
38#include <sys/domain.h>
39#include <sys/fcntl.h>
40#include <sys/malloc.h>		/* XXX must be before <sys/file.h> */
41#include <sys/file.h>
42#include <sys/filedesc.h>
43#include <sys/jail.h>
44#include <sys/kernel.h>
45#include <sys/lock.h>
46#include <sys/mac.h>
47#include <sys/mbuf.h>
48#include <sys/mutex.h>
49#include <sys/namei.h>
50#include <sys/proc.h>
51#include <sys/protosw.h>
52#include <sys/resourcevar.h>
53#include <sys/socket.h>
54#include <sys/socketvar.h>
55#include <sys/signalvar.h>
56#include <sys/stat.h>
57#include <sys/sx.h>
58#include <sys/sysctl.h>
59#include <sys/systm.h>
60#include <sys/un.h>
61#include <sys/unpcb.h>
62#include <sys/vnode.h>
63
64#include <vm/uma.h>
65
66static uma_zone_t unp_zone;
67static	unp_gen_t unp_gencnt;
68static	u_int unp_count;
69
70static	struct unp_head unp_shead, unp_dhead;
71
72/*
73 * Unix communications domain.
74 *
75 * TODO:
76 *	SEQPACKET, RDM
77 *	rethink name space problems
78 *	need a proper out-of-band
79 *	lock pushdown
80 */
81static const struct	sockaddr sun_noname = { sizeof(sun_noname), AF_LOCAL };
82static ino_t	unp_ino;		/* prototype for fake inode numbers */
83
84static struct mtx unp_mtx;
85#define	UNP_LOCK_INIT() \
86	mtx_init(&unp_mtx, "unp", NULL, MTX_DEF)
87#define	UNP_LOCK()		mtx_lock(&unp_mtx)
88#define	UNP_UNLOCK()		mtx_unlock(&unp_mtx)
89#define	UNP_LOCK_ASSERT()	mtx_assert(&unp_mtx, MA_OWNED)
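/*
 * In this revision all UNIX domain socket state is serialized by the
 * single global unp_mtx above; the "lock pushdown" entry in the TODO
 * list presumably refers to replacing it with finer-grained locking.
 */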
90
91static int     unp_attach(struct socket *);
92static void    unp_detach(struct unpcb *);
93static int     unp_bind(struct unpcb *,struct sockaddr *, struct thread *);
94static int     unp_connect(struct socket *,struct sockaddr *, struct thread *);
95static int     unp_connect2(struct socket *so, struct socket *so2);
96static void    unp_disconnect(struct unpcb *);
97static void    unp_shutdown(struct unpcb *);
98static void    unp_drop(struct unpcb *, int);
99static void    unp_gc(void);
100static void    unp_scan(struct mbuf *, void (*)(struct file *));
101static void    unp_mark(struct file *);
102static void    unp_discard(struct file *);
103static void    unp_freerights(struct file **, int);
104static int     unp_internalize(struct mbuf **, struct thread *);
105static int     unp_listen(struct unpcb *, struct thread *);
106
107static int
108uipc_abort(struct socket *so)
109{
110	struct unpcb *unp = sotounpcb(so);
111
112	if (unp == NULL)
113		return (EINVAL);
114	UNP_LOCK();
115	unp_drop(unp, ECONNABORTED);
116	unp_detach(unp);	/* NB: unlocks */
117	sotryfree(so);
118	return (0);
119}
120
121static int
122uipc_accept(struct socket *so, struct sockaddr **nam)
123{
124	struct unpcb *unp = sotounpcb(so);
125	const struct sockaddr *sa;
126
127	if (unp == NULL)
128		return (EINVAL);
129
130	/*
131	 * Pass back name of connected socket,
132	 * if it was bound and we are still connected
133	 * (our peer may have closed already!).
134	 */
135	*nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
136	UNP_LOCK();
137	if (unp->unp_conn != NULL && unp->unp_conn->unp_addr != NULL)
138		sa = (struct sockaddr *) unp->unp_conn->unp_addr;
139	else
140		sa = &sun_noname;
141	bcopy(sa, *nam, sa->sa_len);
142	UNP_UNLOCK();
143	return (0);
144}
145
146static int
147uipc_attach(struct socket *so, int proto, struct thread *td)
148{
149	struct unpcb *unp = sotounpcb(so);
150
151	if (unp != NULL)
152		return (EISCONN);
153	return (unp_attach(so));
154}
155
156static int
157uipc_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
158{
159	struct unpcb *unp = sotounpcb(so);
160
161	if (unp == NULL)
162		return (EINVAL);
163
164	return (unp_bind(unp, nam, td));
165}
166
167static int
168uipc_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
169{
170	struct unpcb *unp = sotounpcb(so);
171	int error;
172
173	if (unp == NULL)
174		return (EINVAL);
175	UNP_LOCK();
176	error = unp_connect(so, nam, curthread);
177	UNP_UNLOCK();
178	return (error);
179}
180
181int
182uipc_connect2(struct socket *so1, struct socket *so2)
183{
184	struct unpcb *unp = sotounpcb(so1);
185	int error;
186
187	if (unp == NULL)
188		return (EINVAL);
189
190	UNP_LOCK();
191	error = unp_connect2(so1, so2);
192	UNP_UNLOCK();
193	return (error);
194}
195
196/* control is EOPNOTSUPP */
197
198static int
199uipc_detach(struct socket *so)
200{
201	struct unpcb *unp = sotounpcb(so);
202
203	if (unp == NULL)
204		return (EINVAL);
205
206	UNP_LOCK();
207	unp_detach(unp);	/* NB: unlocks unp */
208	return (0);
209}
210
211static int
212uipc_disconnect(struct socket *so)
213{
214	struct unpcb *unp = sotounpcb(so);
215
216	if (unp == NULL)
217		return (EINVAL);
218	UNP_LOCK();
219	unp_disconnect(unp);
220	UNP_UNLOCK();
221	return (0);
222}
223
224static int
225uipc_listen(struct socket *so, struct thread *td)
226{
227	struct unpcb *unp = sotounpcb(so);
228	int error;
229
230	if (unp == NULL || unp->unp_vnode == NULL)
231		return (EINVAL);
232	UNP_LOCK();
233	error = unp_listen(unp, td);
234	UNP_UNLOCK();
235	return (error);
236}
237
238static int
239uipc_peeraddr(struct socket *so, struct sockaddr **nam)
240{
241	struct unpcb *unp = sotounpcb(so);
242	const struct sockaddr *sa;
243
244	if (unp == NULL)
245		return (EINVAL);
246	*nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
247	UNP_LOCK();
248	if (unp->unp_conn != NULL && unp->unp_conn->unp_addr != NULL)
249		sa = (struct sockaddr *) unp->unp_conn->unp_addr;
250	else {
251		/*
252		 * XXX: It seems that this test always fails even when the
253		 * connection is established.  So, this else clause is
254		 * added as a workaround to return a PF_LOCAL sockaddr.
255		 */
256		sa = &sun_noname;
257	}
258	bcopy(sa, *nam, sa->sa_len);
259	UNP_UNLOCK();
260	return (0);
261}
262
263static int
264uipc_rcvd(struct socket *so, int flags)
265{
266	struct unpcb *unp = sotounpcb(so);
267	struct socket *so2;
268	u_long newhiwat;
269
270	if (unp == NULL)
271		return (EINVAL);
272	UNP_LOCK();
273	switch (so->so_type) {
274	case SOCK_DGRAM:
275		panic("uipc_rcvd DGRAM?");
276		/*NOTREACHED*/
277
278	case SOCK_STREAM:
279		if (unp->unp_conn == NULL)
280			break;
281		so2 = unp->unp_conn->unp_socket;
282		/*
283		 * Adjust backpressure on sender
284		 * and wakeup any waiting to write.
285		 */
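		/*
		 * unp_mbcnt and unp_cc cache the receive buffer usage as
		 * last observed by uipc_send()/uipc_rcvd(), so the deltas
		 * below credit the sender's sb_mbmax and sb_hiwat by
		 * however much the receiver has drained since then.
		 */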
286		so2->so_snd.sb_mbmax += unp->unp_mbcnt - so->so_rcv.sb_mbcnt;
287		unp->unp_mbcnt = so->so_rcv.sb_mbcnt;
288		newhiwat = so2->so_snd.sb_hiwat + unp->unp_cc -
289		    so->so_rcv.sb_cc;
290		(void)chgsbsize(so2->so_cred->cr_uidinfo, &so2->so_snd.sb_hiwat,
291		    newhiwat, RLIM_INFINITY);
292		unp->unp_cc = so->so_rcv.sb_cc;
293		sowwakeup(so2);
294		break;
295
296	default:
297		panic("uipc_rcvd unknown socktype");
298	}
299	UNP_UNLOCK();
300	return (0);
301}
302
303/* pru_rcvoob is EOPNOTSUPP */
304
305static int
306uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
307	  struct mbuf *control, struct thread *td)
308{
309	int error = 0;
310	struct unpcb *unp = sotounpcb(so);
311	struct socket *so2;
312	u_long newhiwat;
313
314	if (unp == NULL) {
315		error = EINVAL;
316		goto release;
317	}
318	if (flags & PRUS_OOB) {
319		error = EOPNOTSUPP;
320		goto release;
321	}
322
323	if (control != NULL && (error = unp_internalize(&control, td)))
324		goto release;
325
326	UNP_LOCK();
327	switch (so->so_type) {
328	case SOCK_DGRAM:
329	{
330		const struct sockaddr *from;
331
332		if (nam != NULL) {
333			if (unp->unp_conn != NULL) {
334				error = EISCONN;
335				break;
336			}
337			error = unp_connect(so, nam, td);
338			if (error)
339				break;
340		} else {
341			if (unp->unp_conn == NULL) {
342				error = ENOTCONN;
343				break;
344			}
345		}
346		so2 = unp->unp_conn->unp_socket;
347		if (unp->unp_addr != NULL)
348			from = (struct sockaddr *)unp->unp_addr;
349		else
350			from = &sun_noname;
351		if (sbappendaddr(&so2->so_rcv, from, m, control)) {
352			sorwakeup(so2);
353			m = NULL;
354			control = NULL;
355		} else {
356			error = ENOBUFS;
357		}
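		/*
		 * If the destination was supplied with sendto(), the
		 * implicit connection set up above is torn down again now
		 * that the datagram has been queued (or has failed).
		 */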
358		if (nam != NULL)
359			unp_disconnect(unp);
360		break;
361	}
362
363	case SOCK_STREAM:
364		/* Connect if not connected yet. */
365		/*
366		 * Note: A better implementation would complain
367		 * if not equal to the peer's address.
368		 */
369		if ((so->so_state & SS_ISCONNECTED) == 0) {
370			if (nam != NULL) {
371				error = unp_connect(so, nam, td);
372				if (error)
373					break;	/* XXX */
374			} else {
375				error = ENOTCONN;
376				break;
377			}
378		}
379
380		if (so->so_state & SS_CANTSENDMORE) {
381			error = EPIPE;
382			break;
383		}
384		if (unp->unp_conn == NULL)
385			panic("uipc_send connected but no connection?");
386		so2 = unp->unp_conn->unp_socket;
387		/*
388		 * Send to paired receive port, and then reduce
389		 * send buffer hiwater marks to maintain backpressure.
390		 * Wake up readers.
391		 */
392		if (control != NULL) {
393			if (sbappendcontrol(&so2->so_rcv, m, control))
394				control = NULL;
395		} else {
396			sbappend(&so2->so_rcv, m);
397		}
398		so->so_snd.sb_mbmax -=
399			so2->so_rcv.sb_mbcnt - unp->unp_conn->unp_mbcnt;
400		unp->unp_conn->unp_mbcnt = so2->so_rcv.sb_mbcnt;
401		newhiwat = so->so_snd.sb_hiwat -
402		    (so2->so_rcv.sb_cc - unp->unp_conn->unp_cc);
403		(void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_snd.sb_hiwat,
404		    newhiwat, RLIM_INFINITY);
405		unp->unp_conn->unp_cc = so2->so_rcv.sb_cc;
406		sorwakeup(so2);
407		m = NULL;
408		break;
409
410	default:
411		panic("uipc_send unknown socktype");
412	}
413
414	/*
415	 * SEND_EOF is equivalent to a SEND followed by
416	 * a SHUTDOWN.
417	 */
418	if (flags & PRUS_EOF) {
419		socantsendmore(so);
420		unp_shutdown(unp);
421	}
422	UNP_UNLOCK();
423
424	if (control != NULL && error != 0)
425		unp_dispose(control);
426
427release:
428	if (control != NULL)
429		m_freem(control);
430	if (m != NULL)
431		m_freem(m);
432	return (error);
433}
434
435static int
436uipc_sense(struct socket *so, struct stat *sb)
437{
438	struct unpcb *unp = sotounpcb(so);
439	struct socket *so2;
440
441	if (unp == NULL)
442		return (EINVAL);
443	UNP_LOCK();
444	sb->st_blksize = so->so_snd.sb_hiwat;
445	if (so->so_type == SOCK_STREAM && unp->unp_conn != NULL) {
446		so2 = unp->unp_conn->unp_socket;
447		sb->st_blksize += so2->so_rcv.sb_cc;
448	}
449	sb->st_dev = NOUDEV;
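	/*
	 * Synthesize a stable, non-zero fake inode number from the global
	 * unp_ino counter; the ternary below skips 0 on wrap so that a
	 * zero unp->unp_ino can keep meaning "not assigned yet".
	 */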
450	if (unp->unp_ino == 0)
451		unp->unp_ino = (++unp_ino == 0) ? ++unp_ino : unp_ino;
452	sb->st_ino = unp->unp_ino;
453	UNP_UNLOCK();
454	return (0);
455}
456
457static int
458uipc_shutdown(struct socket *so)
459{
460	struct unpcb *unp = sotounpcb(so);
461
462	if (unp == NULL)
463		return (EINVAL);
464	UNP_LOCK();
465	socantsendmore(so);
466	unp_shutdown(unp);
467	UNP_UNLOCK();
468	return (0);
469}
470
471static int
472uipc_sockaddr(struct socket *so, struct sockaddr **nam)
473{
474	struct unpcb *unp = sotounpcb(so);
475	const struct sockaddr *sa;
476
477	if (unp == NULL)
478		return (EINVAL);
479	*nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
480	UNP_LOCK();
481	if (unp->unp_addr != NULL)
482		sa = (struct sockaddr *) unp->unp_addr;
483	else
484		sa = &sun_noname;
485	bcopy(sa, *nam, sa->sa_len);
486	UNP_UNLOCK();
487	return (0);
488}
489
490struct pr_usrreqs uipc_usrreqs = {
491	uipc_abort, uipc_accept, uipc_attach, uipc_bind, uipc_connect,
492	uipc_connect2, pru_control_notsupp, uipc_detach, uipc_disconnect,
493	uipc_listen, uipc_peeraddr, uipc_rcvd, pru_rcvoob_notsupp,
494	uipc_send, uipc_sense, uipc_shutdown, uipc_sockaddr,
495	sosend, soreceive, sopoll, pru_sosetlabel_null
496};
497
498int
499uipc_ctloutput(so, sopt)
500	struct socket *so;
501	struct sockopt *sopt;
502{
503	struct unpcb *unp = sotounpcb(so);
504	struct xucred xu;
505	int error;
506
507	switch (sopt->sopt_dir) {
508	case SOPT_GET:
509		switch (sopt->sopt_name) {
510		case LOCAL_PEERCRED:
511			error = 0;
512			UNP_LOCK();
513			if (unp->unp_flags & UNP_HAVEPC)
514				xu = unp->unp_peercred;
515			else {
516				if (so->so_type == SOCK_STREAM)
517					error = ENOTCONN;
518				else
519					error = EINVAL;
520			}
521			UNP_UNLOCK();
522			if (error == 0)
523				error = sooptcopyout(sopt, &xu, sizeof(xu));
524			break;
525		default:
526			error = EOPNOTSUPP;
527			break;
528		}
529		break;
530	case SOPT_SET:
531	default:
532		error = EOPNOTSUPP;
533		break;
534	}
535	return (error);
536}
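
/*
 * Illustrative (userland) sketch, not part of the kernel interface
 * definition: the LOCAL_PEERCRED option handled above is typically
 * queried on a connected SOCK_STREAM socket with getsockopt() at
 * protocol level 0 and yields a struct xucred, e.g.:
 *
 *	struct xucred xu;
 *	socklen_t len = sizeof(xu);
 *
 *	if (getsockopt(s, 0, LOCAL_PEERCRED, &xu, &len) == 0)
 *		printf("peer euid %d\n", (int)xu.cr_uid);
 *
 * where "s" is a hypothetical connected stream socket.
 */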
537
538/*
539 * Both send and receive buffers are allocated PIPSIZ bytes of buffering
540 * for stream sockets, although the total for sender and receiver is
541 * actually only PIPSIZ.
542 * Datagram sockets really use the sendspace as the maximum datagram size,
543 * and don't really want to reserve the sendspace.  Their recvspace should
544 * be large enough for at least one max-size datagram plus address.
545 */
546#ifndef PIPSIZ
547#define	PIPSIZ	8192
548#endif
549static u_long	unpst_sendspace = PIPSIZ;
550static u_long	unpst_recvspace = PIPSIZ;
551static u_long	unpdg_sendspace = 2*1024;	/* really max datagram size */
552static u_long	unpdg_recvspace = 4*1024;
553
554static int	unp_rights;			/* file descriptors in flight */
555
556SYSCTL_DECL(_net_local_stream);
557SYSCTL_INT(_net_local_stream, OID_AUTO, sendspace, CTLFLAG_RW,
558	   &unpst_sendspace, 0, "");
559SYSCTL_INT(_net_local_stream, OID_AUTO, recvspace, CTLFLAG_RW,
560	   &unpst_recvspace, 0, "");
561SYSCTL_DECL(_net_local_dgram);
562SYSCTL_INT(_net_local_dgram, OID_AUTO, maxdgram, CTLFLAG_RW,
563	   &unpdg_sendspace, 0, "");
564SYSCTL_INT(_net_local_dgram, OID_AUTO, recvspace, CTLFLAG_RW,
565	   &unpdg_recvspace, 0, "");
566SYSCTL_DECL(_net_local);
567SYSCTL_INT(_net_local, OID_AUTO, inflight, CTLFLAG_RD, &unp_rights, 0, "");
568
569static int
570unp_attach(so)
571	struct socket *so;
572{
573	register struct unpcb *unp;
574	int error;
575
576	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
577		switch (so->so_type) {
578
579		case SOCK_STREAM:
580			error = soreserve(so, unpst_sendspace, unpst_recvspace);
581			break;
582
583		case SOCK_DGRAM:
584			error = soreserve(so, unpdg_sendspace, unpdg_recvspace);
585			break;
586
587		default:
588			panic("unp_attach");
589		}
590		if (error)
591			return (error);
592	}
593	unp = uma_zalloc(unp_zone, M_WAITOK);
594	if (unp == NULL)
595		return (ENOBUFS);
596	bzero(unp, sizeof *unp);
597	LIST_INIT(&unp->unp_refs);
598	unp->unp_socket = so;
599
600	UNP_LOCK();
601	unp->unp_gencnt = ++unp_gencnt;
602	unp_count++;
603	LIST_INSERT_HEAD(so->so_type == SOCK_DGRAM ? &unp_dhead
604			 : &unp_shead, unp, unp_link);
605	UNP_UNLOCK();
606
607	so->so_pcb = unp;
608	return (0);
609}
610
611static void
612unp_detach(unp)
613	register struct unpcb *unp;
614{
615	struct vnode *vp;
616
617	UNP_LOCK_ASSERT();
618
619	LIST_REMOVE(unp, unp_link);
620	unp->unp_gencnt = ++unp_gencnt;
621	--unp_count;
622	if ((vp = unp->unp_vnode) != NULL) {
623		/*
624		 * XXXRW: should v_socket be frobbed only while holding
625		 * Giant?
626		 */
627		unp->unp_vnode->v_socket = NULL;
628		unp->unp_vnode = NULL;
629	}
630	if (unp->unp_conn != NULL)
631		unp_disconnect(unp);
632	while (!LIST_EMPTY(&unp->unp_refs)) {
633		struct unpcb *ref = LIST_FIRST(&unp->unp_refs);
634		unp_drop(ref, ECONNRESET);
635	}
636	soisdisconnected(unp->unp_socket);
637	unp->unp_socket->so_pcb = NULL;
638	if (unp_rights) {
639		/*
640		 * Normally the receive buffer is flushed later,
641		 * in sofree, but if our receive buffer holds references
642		 * to descriptors that are now garbage, we will dispose
643		 * of those descriptor references after the garbage collector
644		 * gets them (resulting in a "panic: closef: count < 0").
645		 */
646		sorflush(unp->unp_socket);
647		unp_gc();
648	}
649	if (unp->unp_addr != NULL)
650		FREE(unp->unp_addr, M_SONAME);
651	UNP_UNLOCK();
652	uma_zfree(unp_zone, unp);
653	if (vp) {
654		mtx_lock(&Giant);
655		vrele(vp);
656		mtx_unlock(&Giant);
657	}
658}
659
660static int
661unp_bind(unp, nam, td)
662	struct unpcb *unp;
663	struct sockaddr *nam;
664	struct thread *td;
665{
666	struct sockaddr_un *soun = (struct sockaddr_un *)nam;
667	struct vnode *vp;
668	struct mount *mp;
669	struct vattr vattr;
670	int error, namelen;
671	struct nameidata nd;
672	char *buf;
673
674	/*
675	 * XXXRW: This test-and-set of unp_vnode is non-atomic; the
676	 * unlocked read here is fine, but the value of unp_vnode needs
677	 * to be tested again after we do all the lookups to see if the
678	 * pcb is still unbound?
679	 */
680	if (unp->unp_vnode != NULL)
681		return (EINVAL);
682
683	namelen = soun->sun_len - offsetof(struct sockaddr_un, sun_path);
684	if (namelen <= 0)
685		return (EINVAL);
686
687	buf = malloc(namelen + 1, M_TEMP, M_WAITOK);
688	strlcpy(buf, soun->sun_path, namelen + 1);
689
690	mtx_lock(&Giant);
691restart:
692	mtx_assert(&Giant, MA_OWNED);
693	NDINIT(&nd, CREATE, NOFOLLOW | LOCKPARENT | SAVENAME, UIO_SYSSPACE,
694	    buf, td);
695/* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */
696	error = namei(&nd);
697	if (error)
698		goto done;
699	vp = nd.ni_vp;
700	if (vp != NULL || vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
701		NDFREE(&nd, NDF_ONLY_PNBUF);
702		if (nd.ni_dvp == vp)
703			vrele(nd.ni_dvp);
704		else
705			vput(nd.ni_dvp);
706		if (vp != NULL) {
707			vrele(vp);
708			error = EADDRINUSE;
709			goto done;
710		}
711		error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH);
712		if (error)
713			goto done;
714		goto restart;
715	}
716	VATTR_NULL(&vattr);
717	vattr.va_type = VSOCK;
718	vattr.va_mode = (ACCESSPERMS & ~td->td_proc->p_fd->fd_cmask);
719#ifdef MAC
720	error = mac_check_vnode_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
721	    &vattr);
722#endif
723	if (error == 0) {
724		VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE);
725		error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
726	}
727	NDFREE(&nd, NDF_ONLY_PNBUF);
728	vput(nd.ni_dvp);
729	if (error)
730		goto done;
731	vp = nd.ni_vp;
732	ASSERT_VOP_LOCKED(vp, "unp_bind");
733	soun = (struct sockaddr_un *)sodupsockaddr(nam, M_WAITOK);
734	UNP_LOCK();
735	vp->v_socket = unp->unp_socket;
736	unp->unp_vnode = vp;
737	unp->unp_addr = soun;
738	UNP_UNLOCK();
739	VOP_UNLOCK(vp, 0, td);
740	vn_finished_write(mp);
741done:
742	mtx_unlock(&Giant);
743	free(buf, M_TEMP);
744	return (error);
745}
746
747static int
748unp_connect(so, nam, td)
749	struct socket *so;
750	struct sockaddr *nam;
751	struct thread *td;
752{
753	register struct sockaddr_un *soun = (struct sockaddr_un *)nam;
754	register struct vnode *vp;
755	register struct socket *so2, *so3;
756	struct unpcb *unp = sotounpcb(so);
757	struct unpcb *unp2, *unp3;
758	int error, len;
759	struct nameidata nd;
760	char buf[SOCK_MAXADDRLEN];
761	struct sockaddr *sa;
762
763	UNP_LOCK_ASSERT();
764
765	len = nam->sa_len - offsetof(struct sockaddr_un, sun_path);
766	if (len <= 0)
767		return (EINVAL);
768	strlcpy(buf, soun->sun_path, len + 1);
769	UNP_UNLOCK();
770	sa = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
771	mtx_lock(&Giant);
772	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, buf, td);
773	error = namei(&nd);
774	if (error)
775		vp = NULL;
776	else
777		vp = nd.ni_vp;
778	ASSERT_VOP_LOCKED(vp, "unp_connect");
779	NDFREE(&nd, NDF_ONLY_PNBUF);
780	if (error)
781		goto bad;
782
783	if (vp->v_type != VSOCK) {
784		error = ENOTSOCK;
785		goto bad;
786	}
787	error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td);
788	if (error)
789		goto bad;
790	so2 = vp->v_socket;
791	if (so2 == NULL) {
792		error = ECONNREFUSED;
793		goto bad;
794	}
795	if (so->so_type != so2->so_type) {
796		error = EPROTOTYPE;
797		goto bad;
798	}
799	mtx_unlock(&Giant);
800	UNP_LOCK();
801	if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
802		if (so2->so_options & SO_ACCEPTCONN) {
803			/*
804			 * NB: drop locks here so unp_attach is entered
805			 *     w/o locks; this avoids a recursive lock
806			 *     of the head and holding sleep locks across
807			 *     a (potentially) blocking malloc.
808			 */
809			UNP_UNLOCK();
810			so3 = sonewconn(so2, 0);
811			UNP_LOCK();
812		} else
813			so3 = NULL;
814		if (so3 == NULL) {
815			error = ECONNREFUSED;
816			goto bad2;
817		}
818		unp = sotounpcb(so);
819		unp2 = sotounpcb(so2);
820		unp3 = sotounpcb(so3);
821		if (unp2->unp_addr != NULL) {
822			bcopy(unp2->unp_addr, sa, unp2->unp_addr->sun_len);
823			unp3->unp_addr = (struct sockaddr_un *) sa;
824			sa = NULL;
825		}
826		/*
827		 * unp_peercred management:
828		 *
829		 * The connecter's (client's) credentials are copied
830		 * from its process structure at the time of connect()
831		 * (which is now).
832		 */
833		cru2x(td->td_ucred, &unp3->unp_peercred);
834		unp3->unp_flags |= UNP_HAVEPC;
835		/*
836		 * The receiver's (server's) credentials are copied
837		 * from the unp_peercred member of socket on which the
838		 * former called listen(); unp_listen() cached that
839		 * process's credentials at that time so we can use
840		 * them now.
841		 */
842		KASSERT(unp2->unp_flags & UNP_HAVEPCCACHED,
843		    ("unp_connect: listener without cached peercred"));
844		memcpy(&unp->unp_peercred, &unp2->unp_peercred,
845		    sizeof(unp->unp_peercred));
846		unp->unp_flags |= UNP_HAVEPC;
847#ifdef MAC
848		mac_set_socket_peer_from_socket(so, so3);
849		mac_set_socket_peer_from_socket(so3, so);
850#endif
851
852		so2 = so3;
853	}
854	error = unp_connect2(so, so2);
855bad2:
856	UNP_UNLOCK();
857	mtx_lock(&Giant);
858bad:
859	mtx_assert(&Giant, MA_OWNED);
860	if (vp != NULL)
861		vput(vp);
862	mtx_unlock(&Giant);
863	free(sa, M_SONAME);
864	UNP_LOCK();
865	return (error);
866}
867
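/*
 * Wire two sockets together.  For SOCK_DGRAM the link is asymmetric:
 * the connecting pcb is added to the peer's unp_refs list so that it
 * can be dropped (ECONNRESET) if the peer goes away, but the peer is
 * not itself marked connected.  For SOCK_STREAM both pcbs point at
 * each other and both sockets are marked connected.
 */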
868static int
869unp_connect2(so, so2)
870	register struct socket *so;
871	register struct socket *so2;
872{
873	register struct unpcb *unp = sotounpcb(so);
874	register struct unpcb *unp2;
875
876	UNP_LOCK_ASSERT();
877
878	if (so2->so_type != so->so_type)
879		return (EPROTOTYPE);
880	unp2 = sotounpcb(so2);
881	unp->unp_conn = unp2;
882	switch (so->so_type) {
883
884	case SOCK_DGRAM:
885		LIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_reflink);
886		soisconnected(so);
887		break;
888
889	case SOCK_STREAM:
890		unp2->unp_conn = unp;
891		soisconnected(so);
892		soisconnected(so2);
893		break;
894
895	default:
896		panic("unp_connect2");
897	}
898	return (0);
899}
900
901static void
902unp_disconnect(unp)
903	struct unpcb *unp;
904{
905	register struct unpcb *unp2 = unp->unp_conn;
906
907	UNP_LOCK_ASSERT();
908
909	if (unp2 == NULL)
910		return;
911	unp->unp_conn = NULL;
912	switch (unp->unp_socket->so_type) {
913
914	case SOCK_DGRAM:
915		LIST_REMOVE(unp, unp_reflink);
916		unp->unp_socket->so_state &= ~SS_ISCONNECTED;
917		break;
918
919	case SOCK_STREAM:
920		soisdisconnected(unp->unp_socket);
921		unp2->unp_conn = NULL;
922		soisdisconnected(unp2->unp_socket);
923		break;
924	}
925}
926
927#ifdef notdef
928void
929unp_abort(unp)
930	struct unpcb *unp;
931{
932
933	unp_detach(unp);
934}
935#endif
936
937/*
938 * unp_pcblist() assumes that UNIX domain socket memory is never reclaimed
939 * by the zone (UMA_ZONE_NOFREE), and as such potentially stale pointers
940 * are safe to reference.  It first scans the list of struct unpcb's to
941 * generate a pointer list, then it rescans its list one entry at a time to
942 * externalize and copyout.  It checks the generation number to see if a
943 * struct unpcb has been reused, and will skip it if so.
944 */
945static int
946unp_pcblist(SYSCTL_HANDLER_ARGS)
947{
948	int error, i, n;
949	struct unpcb *unp, **unp_list;
950	unp_gen_t gencnt;
951	struct xunpgen *xug;
952	struct unp_head *head;
953	struct xunpcb *xu;
954
955	head = ((intptr_t)arg1 == SOCK_DGRAM ? &unp_dhead : &unp_shead);
956
957	/*
958	 * The process of preparing the PCB list is too time-consuming and
959	 * resource-intensive to repeat twice on every request.
960	 */
961	if (req->oldptr == NULL) {
962		n = unp_count;
963		req->oldidx = 2 * (sizeof *xug)
964			+ (n + n/8) * sizeof(struct xunpcb);
965		return (0);
966	}
967
968	if (req->newptr != NULL)
969		return (EPERM);
970
971	/*
972	 * OK, now we're committed to doing something.
973	 */
974	xug = malloc(sizeof(*xug), M_TEMP, M_WAITOK);
975	UNP_LOCK();
976	gencnt = unp_gencnt;
977	n = unp_count;
978	UNP_UNLOCK();
979
980	xug->xug_len = sizeof *xug;
981	xug->xug_count = n;
982	xug->xug_gen = gencnt;
983	xug->xug_sogen = so_gencnt;
984	error = SYSCTL_OUT(req, xug, sizeof *xug);
985	if (error) {
986		free(xug, M_TEMP);
987		return (error);
988	}
989
990	unp_list = malloc(n * sizeof *unp_list, M_TEMP, M_WAITOK);
991
992	UNP_LOCK();
993	for (unp = LIST_FIRST(head), i = 0; unp && i < n;
994	     unp = LIST_NEXT(unp, unp_link)) {
995		if (unp->unp_gencnt <= gencnt) {
996			if (cr_cansee(req->td->td_ucred,
997			    unp->unp_socket->so_cred))
998				continue;
999			unp_list[i++] = unp;
1000		}
1001	}
1002	UNP_UNLOCK();
1003	n = i;			/* in case we lost some during malloc */
1004
1005	error = 0;
1006	xu = malloc(sizeof(*xu), M_TEMP, M_WAITOK);
1007	for (i = 0; i < n; i++) {
1008		unp = unp_list[i];
1009		if (unp->unp_gencnt <= gencnt) {
1010			xu->xu_len = sizeof *xu;
1011			xu->xu_unpp = unp;
1012			/*
1013			 * XXX - need more locking here to protect against
1014			 * connect/disconnect races for SMP.
1015			 */
1016			if (unp->unp_addr != NULL)
1017				bcopy(unp->unp_addr, &xu->xu_addr,
1018				      unp->unp_addr->sun_len);
1019			if (unp->unp_conn != NULL &&
1020			    unp->unp_conn->unp_addr != NULL)
1021				bcopy(unp->unp_conn->unp_addr,
1022				      &xu->xu_caddr,
1023				      unp->unp_conn->unp_addr->sun_len);
1024			bcopy(unp, &xu->xu_unp, sizeof *unp);
1025			sotoxsocket(unp->unp_socket, &xu->xu_socket);
1026			error = SYSCTL_OUT(req, xu, sizeof *xu);
1027		}
1028	}
1029	free(xu, M_TEMP);
1030	if (!error) {
1031		/*
1032		 * Give the user an updated idea of our state.
1033		 * If the generation differs from what we told
1034		 * her before, she knows that something happened
1035		 * while we were processing this request, and it
1036		 * might be necessary to retry.
1037		 */
1038		xug->xug_gen = unp_gencnt;
1039		xug->xug_sogen = so_gencnt;
1040		xug->xug_count = unp_count;
1041		error = SYSCTL_OUT(req, xug, sizeof *xug);
1042	}
1043	free(unp_list, M_TEMP);
1044	free(xug, M_TEMP);
1045	return (error);
1046}
1047
1048SYSCTL_PROC(_net_local_dgram, OID_AUTO, pcblist, CTLFLAG_RD,
1049	    (caddr_t)(long)SOCK_DGRAM, 0, unp_pcblist, "S,xunpcb",
1050	    "List of active local datagram sockets");
1051SYSCTL_PROC(_net_local_stream, OID_AUTO, pcblist, CTLFLAG_RD,
1052	    (caddr_t)(long)SOCK_STREAM, 0, unp_pcblist, "S,xunpcb",
1053	    "List of active local stream sockets");
1054
1055static void
1056unp_shutdown(unp)
1057	struct unpcb *unp;
1058{
1059	struct socket *so;
1060
1061	UNP_LOCK_ASSERT();
1062
1063	if (unp->unp_socket->so_type == SOCK_STREAM && unp->unp_conn &&
1064	    (so = unp->unp_conn->unp_socket))
1065		socantrcvmore(so);
1066}
1067
1068static void
1069unp_drop(unp, errno)
1070	struct unpcb *unp;
1071	int errno;
1072{
1073	struct socket *so = unp->unp_socket;
1074
1075	UNP_LOCK_ASSERT();
1076
1077	so->so_error = errno;
1078	unp_disconnect(unp);
1079}
1080
1081#ifdef notdef
1082void
1083unp_drain()
1084{
1085
1086}
1087#endif
1088
1089static void
1090unp_freerights(rp, fdcount)
1091	struct file **rp;
1092	int fdcount;
1093{
1094	int i;
1095	struct file *fp;
1096
1097	for (i = 0; i < fdcount; i++) {
1098		fp = *rp;
1099		/*
1100		 * Zero the pointer before calling unp_discard(),
1101		 * since the call may end up back in unp_gc(),
1102		 * which must not see a stale reference here.
1103		 */
1104		*rp++ = 0;
1105		unp_discard(fp);
1106	}
1107}
1108
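/*
 * unp_externalize() runs in the context of the receiving process: the
 * struct file pointers carried in SCM_RIGHTS control data are turned
 * back into descriptor numbers allocated in the receiver's file table,
 * while all other control messages are copied across unchanged.
 */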
1109int
1110unp_externalize(control, controlp)
1111	struct mbuf *control, **controlp;
1112{
1113	struct thread *td = curthread;		/* XXX */
1114	struct cmsghdr *cm = mtod(control, struct cmsghdr *);
1115	int i;
1116	int *fdp;
1117	struct file **rp;
1118	struct file *fp;
1119	void *data;
1120	socklen_t clen = control->m_len, datalen;
1121	int error, newfds;
1122	int f;
1123	u_int newlen;
1124
1125	error = 0;
1126	if (controlp != NULL) /* controlp == NULL => free control messages */
1127		*controlp = NULL;
1128
1129	while (cm != NULL) {
1130		if (sizeof(*cm) > clen || cm->cmsg_len > clen) {
1131			error = EINVAL;
1132			break;
1133		}
1134
1135		data = CMSG_DATA(cm);
1136		datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data;
1137
1138		if (cm->cmsg_level == SOL_SOCKET
1139		    && cm->cmsg_type == SCM_RIGHTS) {
1140			newfds = datalen / sizeof(struct file *);
1141			rp = data;
1142
1143			/* If we're not outputting the descriptors, free them. */
1144			if (error || controlp == NULL) {
1145				unp_freerights(rp, newfds);
1146				goto next;
1147			}
1148			FILEDESC_LOCK(td->td_proc->p_fd);
1149			/* If the new FDs will not fit, free them. */
1150			if (!fdavail(td, newfds)) {
1151				FILEDESC_UNLOCK(td->td_proc->p_fd);
1152				error = EMSGSIZE;
1153				unp_freerights(rp, newfds);
1154				goto next;
1155			}
1156			/*
1157			 * Now change each pointer to an fd in the global
1158			 * table into an integer that is the index of the
1159			 * local fd table entry that we set up to point
1160			 * at the global one we are transferring.
1161			 */
1162			newlen = newfds * sizeof(int);
1163			*controlp = sbcreatecontrol(NULL, newlen,
1164			    SCM_RIGHTS, SOL_SOCKET);
1165			if (*controlp == NULL) {
1166				FILEDESC_UNLOCK(td->td_proc->p_fd);
1167				error = E2BIG;
1168				unp_freerights(rp, newfds);
1169				goto next;
1170			}
1171
1172			fdp = (int *)
1173			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
1174			for (i = 0; i < newfds; i++) {
1175				if (fdalloc(td, 0, &f))
1176					panic("unp_externalize fdalloc failed");
1177				fp = *rp++;
1178				td->td_proc->p_fd->fd_ofiles[f] = fp;
1179				FILE_LOCK(fp);
1180				fp->f_msgcount--;
1181				FILE_UNLOCK(fp);
1182				unp_rights--;
1183				*fdp++ = f;
1184			}
1185			FILEDESC_UNLOCK(td->td_proc->p_fd);
1186		} else { /* We can just copy anything else across */
1187			if (error || controlp == NULL)
1188				goto next;
1189			*controlp = sbcreatecontrol(NULL, datalen,
1190			    cm->cmsg_type, cm->cmsg_level);
1191			if (*controlp == NULL) {
1192				error = ENOBUFS;
1193				goto next;
1194			}
1195			bcopy(data,
1196			    CMSG_DATA(mtod(*controlp, struct cmsghdr *)),
1197			    datalen);
1198		}
1199
1200		controlp = &(*controlp)->m_next;
1201
1202next:
1203		if (CMSG_SPACE(datalen) < clen) {
1204			clen -= CMSG_SPACE(datalen);
1205			cm = (struct cmsghdr *)
1206			    ((caddr_t)cm + CMSG_SPACE(datalen));
1207		} else {
1208			clen = 0;
1209			cm = NULL;
1210		}
1211	}
1212
1213	m_freem(control);
1214
1215	return (error);
1216}
1217
1218void
1219unp_init(void)
1220{
1221	unp_zone = uma_zcreate("unpcb", sizeof(struct unpcb), NULL, NULL,
1222	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
1223	if (unp_zone == NULL)
1224		panic("unp_init");
1225	uma_zone_set_max(unp_zone, nmbclusters);
1226	LIST_INIT(&unp_dhead);
1227	LIST_INIT(&unp_shead);
1228
1229	UNP_LOCK_INIT();
1230}
1231
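/*
 * unp_internalize() is the sending-side counterpart: SCM_RIGHTS
 * descriptor numbers are translated into referenced struct file
 * pointers, SCM_CREDS is filled in from the sender's credentials and
 * SCM_TIMESTAMP from the current time, so that the resulting control
 * data remains valid no matter what the sender does afterwards.
 */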
1232static int
1233unp_internalize(controlp, td)
1234	struct mbuf **controlp;
1235	struct thread *td;
1236{
1237	struct mbuf *control = *controlp;
1238	struct proc *p = td->td_proc;
1239	struct filedesc *fdescp = p->p_fd;
1240	struct cmsghdr *cm = mtod(control, struct cmsghdr *);
1241	struct cmsgcred *cmcred;
1242	struct file **rp;
1243	struct file *fp;
1244	struct timeval *tv;
1245	int i, fd, *fdp;
1246	void *data;
1247	socklen_t clen = control->m_len, datalen;
1248	int error, oldfds;
1249	u_int newlen;
1250
1251	error = 0;
1252	*controlp = NULL;
1253
1254	while (cm != NULL) {
1255		if (sizeof(*cm) > clen || cm->cmsg_level != SOL_SOCKET
1256		    || cm->cmsg_len > clen) {
1257			error = EINVAL;
1258			goto out;
1259		}
1260
1261		data = CMSG_DATA(cm);
1262		datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data;
1263
1264		switch (cm->cmsg_type) {
1265		/*
1266		 * Fill in credential information.
1267		 */
1268		case SCM_CREDS:
1269			*controlp = sbcreatecontrol(NULL, sizeof(*cmcred),
1270			    SCM_CREDS, SOL_SOCKET);
1271			if (*controlp == NULL) {
1272				error = ENOBUFS;
1273				goto out;
1274			}
1275
1276			cmcred = (struct cmsgcred *)
1277			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
1278			cmcred->cmcred_pid = p->p_pid;
1279			cmcred->cmcred_uid = td->td_ucred->cr_ruid;
1280			cmcred->cmcred_gid = td->td_ucred->cr_rgid;
1281			cmcred->cmcred_euid = td->td_ucred->cr_uid;
1282			cmcred->cmcred_ngroups = MIN(td->td_ucred->cr_ngroups,
1283							CMGROUP_MAX);
1284			for (i = 0; i < cmcred->cmcred_ngroups; i++)
1285				cmcred->cmcred_groups[i] =
1286				    td->td_ucred->cr_groups[i];
1287			break;
1288
1289		case SCM_RIGHTS:
1290			oldfds = datalen / sizeof (int);
1291			/*
1292			 * Check that all the FDs passed in refer to legal
1293			 * files.  If not, reject the entire operation.
1294			 */
1295			fdp = data;
1296			FILEDESC_LOCK(fdescp);
1297			for (i = 0; i < oldfds; i++) {
1298				fd = *fdp++;
1299				if ((unsigned)fd >= fdescp->fd_nfiles ||
1300				    fdescp->fd_ofiles[fd] == NULL) {
1301					FILEDESC_UNLOCK(fdescp);
1302					error = EBADF;
1303					goto out;
1304				}
1305				fp = fdescp->fd_ofiles[fd];
1306				if (!(fp->f_ops->fo_flags & DFLAG_PASSABLE)) {
1307					FILEDESC_UNLOCK(fdescp);
1308					error = EOPNOTSUPP;
1309					goto out;
1310				}
1311
1312			}
1313			/*
1314			 * Now replace the integer FDs with pointers to
1315			 * the associated global file table entries.
1316			 */
1317			newlen = oldfds * sizeof(struct file *);
1318			*controlp = sbcreatecontrol(NULL, newlen,
1319			    SCM_RIGHTS, SOL_SOCKET);
1320			if (*controlp == NULL) {
1321				FILEDESC_UNLOCK(fdescp);
1322				error = E2BIG;
1323				goto out;
1324			}
1325
1326			fdp = data;
1327			rp = (struct file **)
1328			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
1329			for (i = 0; i < oldfds; i++) {
1330				fp = fdescp->fd_ofiles[*fdp++];
1331				*rp++ = fp;
1332				FILE_LOCK(fp);
1333				fp->f_count++;
1334				fp->f_msgcount++;
1335				FILE_UNLOCK(fp);
1336				unp_rights++;
1337			}
1338			FILEDESC_UNLOCK(fdescp);
1339			break;
1340
1341		case SCM_TIMESTAMP:
1342			*controlp = sbcreatecontrol(NULL, sizeof(*tv),
1343			    SCM_TIMESTAMP, SOL_SOCKET);
1344			if (*controlp == NULL) {
1345				error = ENOBUFS;
1346				goto out;
1347			}
1348			tv = (struct timeval *)
1349			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
1350			microtime(tv);
1351			break;
1352
1353		default:
1354			error = EINVAL;
1355			goto out;
1356		}
1357
1358		controlp = &(*controlp)->m_next;
1359
1360		if (CMSG_SPACE(datalen) < clen) {
1361			clen -= CMSG_SPACE(datalen);
1362			cm = (struct cmsghdr *)
1363			    ((caddr_t)cm + CMSG_SPACE(datalen));
1364		} else {
1365			clen = 0;
1366			cm = NULL;
1367		}
1368	}
1369
1370out:
1371	m_freem(control);
1372
1373	return (error);
1374}
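
/*
 * Illustrative (userland) sketch, not part of this file: descriptors
 * arrive at unp_internalize() above as an SCM_RIGHTS control message
 * built with the CMSG_*() macros, e.g.:
 *
 *	char c = 0;
 *	struct iovec iov = { &c, 1 };
 *	char buf[CMSG_SPACE(sizeof(int))];
 *	struct msghdr msg;
 *	struct cmsghdr *cm;
 *
 *	memset(&msg, 0, sizeof(msg));
 *	msg.msg_iov = &iov;
 *	msg.msg_iovlen = 1;
 *	msg.msg_control = buf;
 *	msg.msg_controllen = sizeof(buf);
 *	cm = CMSG_FIRSTHDR(&msg);
 *	cm->cmsg_level = SOL_SOCKET;
 *	cm->cmsg_type = SCM_RIGHTS;
 *	cm->cmsg_len = CMSG_LEN(sizeof(int));
 *	memcpy(CMSG_DATA(cm), &fd_to_pass, sizeof(int));
 *	(void)sendmsg(s, &msg, 0);
 *
 * where "s" and "fd_to_pass" are the hypothetical local socket and the
 * descriptor being handed over.
 */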
1375
1376static int	unp_defer, unp_gcing;
1377
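/*
 * Garbage collection of in-flight descriptors: a mark phase walks the
 * global file list, marking every file that is still reachable from
 * outside a socket buffer and (via unp_scan()/unp_mark()) everything
 * reachable from the receive buffers of reachable sockets; the sweep
 * that follows then flushes and closes files whose only remaining
 * references sit in in-flight messages.
 */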
1378static void
1379unp_gc()
1380{
1381	register struct file *fp, *nextfp;
1382	register struct socket *so;
1383	struct file **extra_ref, **fpp;
1384	int nunref, i;
1385
1386	UNP_LOCK_ASSERT();
1387
1388	if (unp_gcing)
1389		return;
1390	unp_gcing = 1;
1391	unp_defer = 0;
1392	/*
1393	 * Before going through all this, set all FDs to
1394	 * be NOT deferred and NOT externally accessible.
1395	 */
1396	/*
1397	 * XXXRW: Acquiring a sleep lock while holding UNP
1398	 * mutex cannot be a good thing.
1399	 */
1400	sx_slock(&filelist_lock);
1401	LIST_FOREACH(fp, &filehead, f_list)
1402		fp->f_gcflag &= ~(FMARK|FDEFER);
1403	do {
1404		LIST_FOREACH(fp, &filehead, f_list) {
1405			FILE_LOCK(fp);
1406			/*
1407			 * If the file is not open, skip it
1408			 */
1409			if (fp->f_count == 0) {
1410				FILE_UNLOCK(fp);
1411				continue;
1412			}
1413			/*
1414			 * If we already marked it as 'defer' in a
1415			 * previous pass, then try to process it this
1416			 * time and un-mark it.
1417			 */
1418			if (fp->f_gcflag & FDEFER) {
1419				fp->f_gcflag &= ~FDEFER;
1420				unp_defer--;
1421			} else {
1422				/*
1423				 * If it's not deferred, then check if it's
1424				 * already marked; if so, skip it.
1425				 */
1426				if (fp->f_gcflag & FMARK) {
1427					FILE_UNLOCK(fp);
1428					continue;
1429				}
1430				/*
1431				 * If all references are from messages
1432				 * in transit, then skip it.  It's not
1433				 * externally accessible.
1434				 */
1435				if (fp->f_count == fp->f_msgcount) {
1436					FILE_UNLOCK(fp);
1437					continue;
1438				}
1439				/*
1440				 * If it got this far then it must be
1441				 * externally accessible.
1442				 */
1443				fp->f_gcflag |= FMARK;
1444			}
1445			/*
1446			 * Either it was deferred, or it is externally
1447			 * accessible and not already marked so.
1448			 * Now check if it is possibly one of OUR sockets.
1449			 */
1450			if (fp->f_type != DTYPE_SOCKET ||
1451			    (so = fp->f_data) == NULL) {
1452				FILE_UNLOCK(fp);
1453				continue;
1454			}
1455			FILE_UNLOCK(fp);
1456			if (so->so_proto->pr_domain != &localdomain ||
1457			    (so->so_proto->pr_flags&PR_RIGHTS) == 0)
1458				continue;
1459#ifdef notdef
1460			if (so->so_rcv.sb_flags & SB_LOCK) {
1461				/*
1462				 * This is problematic; it's not clear
1463				 * we need to wait for the sockbuf to be
1464				 * unlocked (on a uniprocessor, at least),
1465				 * and it's also not clear what to do
1466				 * if sbwait returns an error due to receipt
1467				 * of a signal.  If sbwait does return
1468				 * an error, we'll go into an infinite
1469				 * loop.  Delete all of this for now.
1470				 */
1471				(void) sbwait(&so->so_rcv);
1472				goto restart;
1473			}
1474#endif
1475			/*
1476			 * OK, it's one of our sockets and it IS externally
1477			 * accessible (or was deferred).  Now we look
1478			 * to see if we hold any file descriptors in its
1479			 * message buffers. Follow those links and mark them
1480			 * as accessible too.
1481			 */
1482			unp_scan(so->so_rcv.sb_mb, unp_mark);
1483		}
1484	} while (unp_defer);
1485	sx_sunlock(&filelist_lock);
1486	/*
1487	 * We grab an extra reference to each of the file table entries
1488	 * that are not otherwise accessible and then free the rights
1489	 * that are stored in messages on them.
1490	 *
1491	 * The bug in the original code is a little tricky, so I'll describe
1492	 * what's wrong with it here.
1493	 *
1494	 * It is incorrect to simply unp_discard each entry for f_msgcount
1495	 * times -- consider the case of sockets A and B that contain
1496	 * references to each other.  On a last close of some other socket,
1497	 * we trigger a gc since the number of outstanding rights (unp_rights)
1498	 * is non-zero.  If during the sweep phase the gc code calls unp_discard(),
1499	 * we end up doing a (full) closef on the descriptor.  A closef on A
1500	 * results in the following chain.  Closef calls soo_close, which
1501	 * calls soclose.   Soclose calls first (through the switch
1502	 * uipc_usrreq) unp_detach, which re-invokes unp_gc.  Unp_gc simply
1503	 * returns because the previous instance had set unp_gcing, and
1504	 * we return all the way back to soclose, which marks the socket
1505	 * with SS_NOFDREF, and then calls sofree.  Sofree calls sorflush
1506	 * to free up the rights that are queued in messages on the socket A,
1507	 * i.e., the reference on B.  The sorflush calls via the dom_dispose
1508	 * switch unp_dispose, which unp_scans with unp_discard.  This second
1509	 * instance of unp_discard just calls closef on B.
1510	 *
1511	 * Well, a similar chain occurs on B, resulting in a sorflush on B,
1512	 * which results in another closef on A.  Unfortunately, A is already
1513	 * being closed, and the descriptor has already been marked with
1514	 * SS_NOFDREF, and soclose panics at this point.
1515	 *
1516	 * Here, we first take an extra reference to each inaccessible
1517	 * descriptor.  Then, we call sorflush ourself, since we know
1518	 * it is a Unix domain socket anyhow.  After we destroy all the
1519	 * rights carried in messages, we do a last closef to get rid
1520	 * of our extra reference.  This is the last close, and the
1521	 * unp_detach etc will shut down the socket.
1522	 *
1523	 * 91/09/19, bsy@cs.cmu.edu
1524	 */
1525	extra_ref = malloc(nfiles * sizeof(struct file *), M_TEMP, M_WAITOK);
1526	sx_slock(&filelist_lock);
1527	for (nunref = 0, fp = LIST_FIRST(&filehead), fpp = extra_ref;
1528	    fp != NULL; fp = nextfp) {
1529		nextfp = LIST_NEXT(fp, f_list);
1530		FILE_LOCK(fp);
1531		/*
1532		 * If it's not open, skip it
1533		 */
1534		if (fp->f_count == 0) {
1535			FILE_UNLOCK(fp);
1536			continue;
1537		}
1538		/*
1539		 * If all refs are from msgs, and it's not marked accessible,
1540		 * then it must be referenced from some unreachable cycle
1541		 * of (shut-down) FDs, so include it in our
1542		 * list of FDs to remove.
1543		 */
1544		if (fp->f_count == fp->f_msgcount && !(fp->f_gcflag & FMARK)) {
1545			*fpp++ = fp;
1546			nunref++;
1547			fp->f_count++;
1548		}
1549		FILE_UNLOCK(fp);
1550	}
1551	sx_sunlock(&filelist_lock);
1552	/*
1553	 * For each FD on our hit list, do the following two things:
1554	 */
1555	for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) {
1556		struct file *tfp = *fpp;
1557		FILE_LOCK(tfp);
1558		if (tfp->f_type == DTYPE_SOCKET &&
1559		    tfp->f_data != NULL) {
1560			FILE_UNLOCK(tfp);
1561			sorflush(tfp->f_data);
1562		} else {
1563			FILE_UNLOCK(tfp);
1564		}
1565	}
1566	for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp)
1567		closef(*fpp, (struct thread *) NULL);
1568	free(extra_ref, M_TEMP);
1569	unp_gcing = 0;
1570}
1571
1572void
1573unp_dispose(m)
1574	struct mbuf *m;
1575{
1576
1577	if (m)
1578		unp_scan(m, unp_discard);
1579}
1580
1581static int
1582unp_listen(unp, td)
1583	struct unpcb *unp;
1584	struct thread *td;
1585{
1586	UNP_LOCK_ASSERT();
1587
1588	/*
1589	 * XXXRW: Why populate the local peer cred with our own credential?
1590	 */
1591	cru2x(td->td_ucred, &unp->unp_peercred);
1592	unp->unp_flags |= UNP_HAVEPCCACHED;
1593	return (0);
1594}
1595
1596static void
1597unp_scan(m0, op)
1598	register struct mbuf *m0;
1599	void (*op)(struct file *);
1600{
1601	struct mbuf *m;
1602	struct file **rp;
1603	struct cmsghdr *cm;
1604	void *data;
1605	int i;
1606	socklen_t clen, datalen;
1607	int qfds;
1608
1609	while (m0 != NULL) {
1610		for (m = m0; m; m = m->m_next) {
1611			if (m->m_type != MT_CONTROL)
1612				continue;
1613
1614			cm = mtod(m, struct cmsghdr *);
1615			clen = m->m_len;
1616
1617			while (cm != NULL) {
1618				if (sizeof(*cm) > clen || cm->cmsg_len > clen)
1619					break;
1620
1621				data = CMSG_DATA(cm);
1622				datalen = (caddr_t)cm + cm->cmsg_len
1623				    - (caddr_t)data;
1624
1625				if (cm->cmsg_level == SOL_SOCKET &&
1626				    cm->cmsg_type == SCM_RIGHTS) {
1627					qfds = datalen / sizeof (struct file *);
1628					rp = data;
1629					for (i = 0; i < qfds; i++)
1630						(*op)(*rp++);
1631				}
1632
1633				if (CMSG_SPACE(datalen) < clen) {
1634					clen -= CMSG_SPACE(datalen);
1635					cm = (struct cmsghdr *)
1636					    ((caddr_t)cm + CMSG_SPACE(datalen));
1637				} else {
1638					clen = 0;
1639					cm = NULL;
1640				}
1641			}
1642		}
1643		m0 = m0->m_act;
1644	}
1645}
1646
1647static void
1648unp_mark(fp)
1649	struct file *fp;
1650{
1651	if (fp->f_gcflag & FMARK)
1652		return;
1653	unp_defer++;
1654	fp->f_gcflag |= (FMARK|FDEFER);
1655}
1656
1657static void
1658unp_discard(fp)
1659	struct file *fp;
1660{
1661	FILE_LOCK(fp);
1662	fp->f_msgcount--;
1663	unp_rights--;
1664	FILE_UNLOCK(fp);
1665	(void) closef(fp, (struct thread *)NULL);
1666}
1667