uipc_usrreq.c revision 131170
1/*
2 * Copyright (c) 1982, 1986, 1989, 1991, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 4. Neither the name of the University nor the names of its contributors
14 *    may be used to endorse or promote products derived from this software
15 *    without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 *
29 *	From: @(#)uipc_usrreq.c	8.3 (Berkeley) 1/4/94
30 */
31
32#include <sys/cdefs.h>
33__FBSDID("$FreeBSD: head/sys/kern/uipc_usrreq.c 131170 2004-06-27 03:29:25Z rwatson $");
34
35#include "opt_mac.h"
36
37#include <sys/param.h>
38#include <sys/domain.h>
39#include <sys/fcntl.h>
40#include <sys/malloc.h>		/* XXX must be before <sys/file.h> */
41#include <sys/file.h>
42#include <sys/filedesc.h>
43#include <sys/jail.h>
44#include <sys/kernel.h>
45#include <sys/lock.h>
46#include <sys/mac.h>
47#include <sys/mbuf.h>
48#include <sys/mutex.h>
49#include <sys/namei.h>
50#include <sys/proc.h>
51#include <sys/protosw.h>
52#include <sys/resourcevar.h>
53#include <sys/socket.h>
54#include <sys/socketvar.h>
55#include <sys/signalvar.h>
56#include <sys/stat.h>
57#include <sys/sx.h>
58#include <sys/sysctl.h>
59#include <sys/systm.h>
60#include <sys/un.h>
61#include <sys/unpcb.h>
62#include <sys/vnode.h>
63
64#include <vm/uma.h>
65
66static uma_zone_t unp_zone;
67static	unp_gen_t unp_gencnt;
68static	u_int unp_count;
69
70static	struct unp_head unp_shead, unp_dhead;
71
72/*
73 * Unix communications domain.
74 *
75 * TODO:
76 *	SEQPACKET, RDM
77 *	rethink name space problems
78 *	need a proper out-of-band
79 *	lock pushdown
80 */
81static const struct	sockaddr sun_noname = { sizeof(sun_noname), AF_LOCAL };
82static ino_t	unp_ino;		/* prototype for fake inode numbers */
83
84static struct mtx unp_mtx;
85#define	UNP_LOCK_INIT() \
86	mtx_init(&unp_mtx, "unp", NULL, MTX_DEF)
87#define	UNP_LOCK()		mtx_lock(&unp_mtx)
88#define	UNP_UNLOCK()		mtx_unlock(&unp_mtx)
89#define	UNP_LOCK_ASSERT()	mtx_assert(&unp_mtx, MA_OWNED)
90
91static int     unp_attach(struct socket *);
92static void    unp_detach(struct unpcb *);
93static int     unp_bind(struct unpcb *,struct sockaddr *, struct thread *);
94static int     unp_connect(struct socket *,struct sockaddr *, struct thread *);
95static int     unp_connect2(struct socket *so, struct socket *so2);
96static void    unp_disconnect(struct unpcb *);
97static void    unp_shutdown(struct unpcb *);
98static void    unp_drop(struct unpcb *, int);
99static void    unp_gc(void);
100static void    unp_scan(struct mbuf *, void (*)(struct file *));
101static void    unp_mark(struct file *);
102static void    unp_discard(struct file *);
103static void    unp_freerights(struct file **, int);
104static int     unp_internalize(struct mbuf **, struct thread *);
105static int     unp_listen(struct unpcb *, struct thread *);
106
107static int
108uipc_abort(struct socket *so)
109{
110	struct unpcb *unp = sotounpcb(so);
111
112	if (unp == NULL)
113		return (EINVAL);
114	UNP_LOCK();
115	unp_drop(unp, ECONNABORTED);
116	unp_detach(unp);	/* NB: unlocks */
117	SOCK_LOCK(so);
118	sotryfree(so);
119	return (0);
120}
121
122static int
123uipc_accept(struct socket *so, struct sockaddr **nam)
124{
125	struct unpcb *unp = sotounpcb(so);
126	const struct sockaddr *sa;
127
128	if (unp == NULL)
129		return (EINVAL);
130
131	/*
132	 * Pass back name of connected socket,
133	 * if it was bound and we are still connected
134	 * (our peer may have closed already!).
135	 */
136	*nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
137	UNP_LOCK();
138	if (unp->unp_conn != NULL && unp->unp_conn->unp_addr != NULL)
139		sa = (struct sockaddr *) unp->unp_conn->unp_addr;
140	else
141		sa = &sun_noname;
142	bcopy(sa, *nam, sa->sa_len);
143	UNP_UNLOCK();
144	return (0);
145}
146
147static int
148uipc_attach(struct socket *so, int proto, struct thread *td)
149{
150	struct unpcb *unp = sotounpcb(so);
151
152	if (unp != NULL)
153		return (EISCONN);
154	return (unp_attach(so));
155}
156
157static int
158uipc_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
159{
160	struct unpcb *unp = sotounpcb(so);
161
162	if (unp == NULL)
163		return (EINVAL);
164
165	return (unp_bind(unp, nam, td));
166}
167
168static int
169uipc_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
170{
171	struct unpcb *unp = sotounpcb(so);
172	int error;
173
174	if (unp == NULL)
175		return (EINVAL);
176	UNP_LOCK();
177	error = unp_connect(so, nam, curthread);
178	UNP_UNLOCK();
179	return (error);
180}
181
182int
183uipc_connect2(struct socket *so1, struct socket *so2)
184{
185	struct unpcb *unp = sotounpcb(so1);
186	int error;
187
188	if (unp == NULL)
189		return (EINVAL);
190
191	UNP_LOCK();
192	error = unp_connect2(so1, so2);
193	UNP_UNLOCK();
194	return (error);
195}
196
197/* control is EOPNOTSUPP */
198
199static int
200uipc_detach(struct socket *so)
201{
202	struct unpcb *unp = sotounpcb(so);
203
204	if (unp == NULL)
205		return (EINVAL);
206
207	UNP_LOCK();
208	unp_detach(unp);	/* NB: unlocks unp */
209	return (0);
210}
211
212static int
213uipc_disconnect(struct socket *so)
214{
215	struct unpcb *unp = sotounpcb(so);
216
217	if (unp == NULL)
218		return (EINVAL);
219	UNP_LOCK();
220	unp_disconnect(unp);
221	UNP_UNLOCK();
222	return (0);
223}
224
225static int
226uipc_listen(struct socket *so, struct thread *td)
227{
228	struct unpcb *unp = sotounpcb(so);
229	int error;
230
231	if (unp == NULL || unp->unp_vnode == NULL)
232		return (EINVAL);
233	UNP_LOCK();
234	error = unp_listen(unp, td);
235	UNP_UNLOCK();
236	return (error);
237}
238
239static int
240uipc_peeraddr(struct socket *so, struct sockaddr **nam)
241{
242	struct unpcb *unp = sotounpcb(so);
243	const struct sockaddr *sa;
244
245	if (unp == NULL)
246		return (EINVAL);
247	*nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
248	UNP_LOCK();
249	if (unp->unp_conn != NULL && unp->unp_conn->unp_addr!= NULL)
250		sa = (struct sockaddr *) unp->unp_conn->unp_addr;
251	else {
252		/*
253		 * XXX: It seems that this test always fails even when
254		 * connection is established.  So, this else clause is
255		 * added as workaround to return PF_LOCAL sockaddr.
256		 */
257		sa = &sun_noname;
258	}
259	bcopy(sa, *nam, sa->sa_len);
260	UNP_UNLOCK();
261	return (0);
262}
263
264static int
265uipc_rcvd(struct socket *so, int flags)
266{
267	struct unpcb *unp = sotounpcb(so);
268	struct socket *so2;
269	u_long newhiwat;
270
271	if (unp == NULL)
272		return (EINVAL);
273	UNP_LOCK();
274	switch (so->so_type) {
275	case SOCK_DGRAM:
276		panic("uipc_rcvd DGRAM?");
277		/*NOTREACHED*/
278
279	case SOCK_STREAM:
280		if (unp->unp_conn == NULL)
281			break;
282		so2 = unp->unp_conn->unp_socket;
283		SOCKBUF_LOCK(&so2->so_snd);
284		SOCKBUF_LOCK(&so->so_rcv);
285		/*
286		 * Adjust backpressure on sender
287		 * and wakeup any waiting to write.
288		 */
289		so2->so_snd.sb_mbmax += unp->unp_mbcnt - so->so_rcv.sb_mbcnt;
290		unp->unp_mbcnt = so->so_rcv.sb_mbcnt;
291		newhiwat = so2->so_snd.sb_hiwat + unp->unp_cc -
292		    so->so_rcv.sb_cc;
293		(void)chgsbsize(so2->so_cred->cr_uidinfo, &so2->so_snd.sb_hiwat,
294		    newhiwat, RLIM_INFINITY);
295		unp->unp_cc = so->so_rcv.sb_cc;
296		SOCKBUF_UNLOCK(&so->so_rcv);
297		sowwakeup_locked(so2);
298		break;
299
300	default:
301		panic("uipc_rcvd unknown socktype");
302	}
303	UNP_UNLOCK();
304	return (0);
305}
306
307/* pru_rcvoob is EOPNOTSUPP */
308
309static int
310uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
311	  struct mbuf *control, struct thread *td)
312{
313	int error = 0;
314	struct unpcb *unp = sotounpcb(so);
315	struct socket *so2;
316	u_long newhiwat;
317
318	if (unp == NULL) {
319		error = EINVAL;
320		goto release;
321	}
322	if (flags & PRUS_OOB) {
323		error = EOPNOTSUPP;
324		goto release;
325	}
326
327	if (control != NULL && (error = unp_internalize(&control, td)))
328		goto release;
329
330	UNP_LOCK();
331	switch (so->so_type) {
332	case SOCK_DGRAM:
333	{
334		const struct sockaddr *from;
335
336		if (nam != NULL) {
337			if (unp->unp_conn != NULL) {
338				error = EISCONN;
339				break;
340			}
341			error = unp_connect(so, nam, td);
342			if (error)
343				break;
344		} else {
345			if (unp->unp_conn == NULL) {
346				error = ENOTCONN;
347				break;
348			}
349		}
350		so2 = unp->unp_conn->unp_socket;
351		if (unp->unp_addr != NULL)
352			from = (struct sockaddr *)unp->unp_addr;
353		else
354			from = &sun_noname;
355		SOCKBUF_LOCK(&so2->so_rcv);
356		if (sbappendaddr_locked(&so2->so_rcv, from, m, control)) {
357			sorwakeup_locked(so2);
358			m = NULL;
359			control = NULL;
360		} else {
361			SOCKBUF_UNLOCK(&so2->so_rcv);
362			error = ENOBUFS;
363		}
364		if (nam != NULL)
365			unp_disconnect(unp);
366		break;
367	}
368
369	case SOCK_STREAM:
370		/* Connect if not connected yet. */
371		/*
372		 * Note: A better implementation would complain
373		 * if not equal to the peer's address.
374		 */
375		if ((so->so_state & SS_ISCONNECTED) == 0) {
376			if (nam != NULL) {
377				error = unp_connect(so, nam, td);
378				if (error)
379					break;	/* XXX */
380			} else {
381				error = ENOTCONN;
382				break;
383			}
384		}
385
386		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
387			error = EPIPE;
388			break;
389		}
390		if (unp->unp_conn == NULL)
391			panic("uipc_send connected but no connection?");
392		so2 = unp->unp_conn->unp_socket;
393		SOCKBUF_LOCK(&so2->so_rcv);
394		/*
395		 * Send to paired receive port, and then reduce
396		 * send buffer hiwater marks to maintain backpressure.
397		 * Wake up readers.
398		 */
399		if (control != NULL) {
400			if (sbappendcontrol_locked(&so2->so_rcv, m, control))
401				control = NULL;
402		} else {
403			sbappend_locked(&so2->so_rcv, m);
404		}
405		so->so_snd.sb_mbmax -=
406			so2->so_rcv.sb_mbcnt - unp->unp_conn->unp_mbcnt;
407		unp->unp_conn->unp_mbcnt = so2->so_rcv.sb_mbcnt;
408		newhiwat = so->so_snd.sb_hiwat -
409		    (so2->so_rcv.sb_cc - unp->unp_conn->unp_cc);
410		(void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_snd.sb_hiwat,
411		    newhiwat, RLIM_INFINITY);
412		unp->unp_conn->unp_cc = so2->so_rcv.sb_cc;
413		sorwakeup_locked(so2);
414		m = NULL;
415		break;
416
417	default:
418		panic("uipc_send unknown socktype");
419	}
420
421	/*
422	 * SEND_EOF is equivalent to a SEND followed by
423	 * a SHUTDOWN.
424	 */
425	if (flags & PRUS_EOF) {
426		socantsendmore(so);
427		unp_shutdown(unp);
428	}
429	UNP_UNLOCK();
430
431	if (control != NULL && error != 0)
432		unp_dispose(control);
433
434release:
435	if (control != NULL)
436		m_freem(control);
437	if (m != NULL)
438		m_freem(m);
439	return (error);
440}
441
442static int
443uipc_sense(struct socket *so, struct stat *sb)
444{
445	struct unpcb *unp = sotounpcb(so);
446	struct socket *so2;
447
448	if (unp == NULL)
449		return (EINVAL);
450	UNP_LOCK();
451	sb->st_blksize = so->so_snd.sb_hiwat;
452	if (so->so_type == SOCK_STREAM && unp->unp_conn != NULL) {
453		so2 = unp->unp_conn->unp_socket;
454		sb->st_blksize += so2->so_rcv.sb_cc;
455	}
456	sb->st_dev = NODEV;
457	if (unp->unp_ino == 0)
458		unp->unp_ino = (++unp_ino == 0) ? ++unp_ino : unp_ino;
459	sb->st_ino = unp->unp_ino;
460	UNP_UNLOCK();
461	return (0);
462}
463
464static int
465uipc_shutdown(struct socket *so)
466{
467	struct unpcb *unp = sotounpcb(so);
468
469	if (unp == NULL)
470		return (EINVAL);
471	UNP_LOCK();
472	socantsendmore(so);
473	unp_shutdown(unp);
474	UNP_UNLOCK();
475	return (0);
476}
477
478static int
479uipc_sockaddr(struct socket *so, struct sockaddr **nam)
480{
481	struct unpcb *unp = sotounpcb(so);
482	const struct sockaddr *sa;
483
484	if (unp == NULL)
485		return (EINVAL);
486	*nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
487	UNP_LOCK();
488	if (unp->unp_addr != NULL)
489		sa = (struct sockaddr *) unp->unp_addr;
490	else
491		sa = &sun_noname;
492	bcopy(sa, *nam, sa->sa_len);
493	UNP_UNLOCK();
494	return (0);
495}
496
497struct pr_usrreqs uipc_usrreqs = {
498	uipc_abort, uipc_accept, uipc_attach, uipc_bind, uipc_connect,
499	uipc_connect2, pru_control_notsupp, uipc_detach, uipc_disconnect,
500	uipc_listen, uipc_peeraddr, uipc_rcvd, pru_rcvoob_notsupp,
501	uipc_send, uipc_sense, uipc_shutdown, uipc_sockaddr,
502	sosend, soreceive, sopoll, pru_sosetlabel_null
503};
504
505int
506uipc_ctloutput(so, sopt)
507	struct socket *so;
508	struct sockopt *sopt;
509{
510	struct unpcb *unp = sotounpcb(so);
511	struct xucred xu;
512	int error;
513
514	switch (sopt->sopt_dir) {
515	case SOPT_GET:
516		switch (sopt->sopt_name) {
517		case LOCAL_PEERCRED:
518			error = 0;
519			UNP_LOCK();
520			if (unp->unp_flags & UNP_HAVEPC)
521				xu = unp->unp_peercred;
522			else {
523				if (so->so_type == SOCK_STREAM)
524					error = ENOTCONN;
525				else
526					error = EINVAL;
527			}
528			UNP_UNLOCK();
529			if (error == 0)
530				error = sooptcopyout(sopt, &xu, sizeof(xu));
531			break;
532		default:
533			error = EOPNOTSUPP;
534			break;
535		}
536		break;
537	case SOPT_SET:
538	default:
539		error = EOPNOTSUPP;
540		break;
541	}
542	return (error);
543}
544
545/*
546 * Both send and receive buffers are allocated PIPSIZ bytes of buffering
547 * for stream sockets, although the total for sender and receiver is
548 * actually only PIPSIZ.
549 * Datagram sockets really use the sendspace as the maximum datagram size,
550 * and don't really want to reserve the sendspace.  Their recvspace should
551 * be large enough for at least one max-size datagram plus address.
552 */
553#ifndef PIPSIZ
554#define	PIPSIZ	8192
555#endif
556static u_long	unpst_sendspace = PIPSIZ;
557static u_long	unpst_recvspace = PIPSIZ;
558static u_long	unpdg_sendspace = 2*1024;	/* really max datagram size */
559static u_long	unpdg_recvspace = 4*1024;
560
561static int	unp_rights;			/* file descriptors in flight */
562
563SYSCTL_DECL(_net_local_stream);
564SYSCTL_INT(_net_local_stream, OID_AUTO, sendspace, CTLFLAG_RW,
565	   &unpst_sendspace, 0, "");
566SYSCTL_INT(_net_local_stream, OID_AUTO, recvspace, CTLFLAG_RW,
567	   &unpst_recvspace, 0, "");
568SYSCTL_DECL(_net_local_dgram);
569SYSCTL_INT(_net_local_dgram, OID_AUTO, maxdgram, CTLFLAG_RW,
570	   &unpdg_sendspace, 0, "");
571SYSCTL_INT(_net_local_dgram, OID_AUTO, recvspace, CTLFLAG_RW,
572	   &unpdg_recvspace, 0, "");
573SYSCTL_DECL(_net_local);
574SYSCTL_INT(_net_local, OID_AUTO, inflight, CTLFLAG_RD, &unp_rights, 0, "");
575
576static int
577unp_attach(so)
578	struct socket *so;
579{
580	register struct unpcb *unp;
581	int error;
582
583	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
584		switch (so->so_type) {
585
586		case SOCK_STREAM:
587			error = soreserve(so, unpst_sendspace, unpst_recvspace);
588			break;
589
590		case SOCK_DGRAM:
591			error = soreserve(so, unpdg_sendspace, unpdg_recvspace);
592			break;
593
594		default:
595			panic("unp_attach");
596		}
597		if (error)
598			return (error);
599	}
600	unp = uma_zalloc(unp_zone, M_WAITOK);
601	if (unp == NULL)
602		return (ENOBUFS);
603	bzero(unp, sizeof *unp);
604	LIST_INIT(&unp->unp_refs);
605	unp->unp_socket = so;
606
607	UNP_LOCK();
608	unp->unp_gencnt = ++unp_gencnt;
609	unp_count++;
610	LIST_INSERT_HEAD(so->so_type == SOCK_DGRAM ? &unp_dhead
611			 : &unp_shead, unp, unp_link);
612	UNP_UNLOCK();
613
614	so->so_pcb = unp;
615	return (0);
616}
617
618static void
619unp_detach(unp)
620	register struct unpcb *unp;
621{
622	struct vnode *vp;
623
624	UNP_LOCK_ASSERT();
625
626	LIST_REMOVE(unp, unp_link);
627	unp->unp_gencnt = ++unp_gencnt;
628	--unp_count;
629	if ((vp = unp->unp_vnode) != NULL) {
630		/*
631		 * XXXRW: should v_socket be frobbed only while holding
632		 * Giant?
633		 */
634		unp->unp_vnode->v_socket = NULL;
635		unp->unp_vnode = NULL;
636	}
637	if (unp->unp_conn != NULL)
638		unp_disconnect(unp);
639	while (!LIST_EMPTY(&unp->unp_refs)) {
640		struct unpcb *ref = LIST_FIRST(&unp->unp_refs);
641		unp_drop(ref, ECONNRESET);
642	}
643	soisdisconnected(unp->unp_socket);
644	unp->unp_socket->so_pcb = NULL;
645	if (unp_rights) {
646		/*
647		 * Normally the receive buffer is flushed later,
648		 * in sofree, but if our receive buffer holds references
649		 * to descriptors that are now garbage, we will dispose
650		 * of those descriptor references after the garbage collector
651		 * gets them (resulting in a "panic: closef: count < 0").
652		 */
653		sorflush(unp->unp_socket);
654		unp_gc();
655	}
656	UNP_UNLOCK();
657	if (unp->unp_addr != NULL)
658		FREE(unp->unp_addr, M_SONAME);
659	uma_zfree(unp_zone, unp);
660	if (vp) {
661		mtx_lock(&Giant);
662		vrele(vp);
663		mtx_unlock(&Giant);
664	}
665}
666
667static int
668unp_bind(unp, nam, td)
669	struct unpcb *unp;
670	struct sockaddr *nam;
671	struct thread *td;
672{
673	struct sockaddr_un *soun = (struct sockaddr_un *)nam;
674	struct vnode *vp;
675	struct mount *mp;
676	struct vattr vattr;
677	int error, namelen;
678	struct nameidata nd;
679	char *buf;
680
681	/*
682	 * XXXRW: This test-and-set of unp_vnode is non-atomic; the
683	 * unlocked read here is fine, but the value of unp_vnode needs
684	 * to be tested again after we do all the lookups to see if the
685	 * pcb is still unbound?
686	 */
687	if (unp->unp_vnode != NULL)
688		return (EINVAL);
689
690	namelen = soun->sun_len - offsetof(struct sockaddr_un, sun_path);
691	if (namelen <= 0)
692		return (EINVAL);
693
694	buf = malloc(namelen + 1, M_TEMP, M_WAITOK);
695	strlcpy(buf, soun->sun_path, namelen + 1);
696
697	mtx_lock(&Giant);
698restart:
699	mtx_assert(&Giant, MA_OWNED);
700	NDINIT(&nd, CREATE, NOFOLLOW | LOCKPARENT | SAVENAME, UIO_SYSSPACE,
701	    buf, td);
702/* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */
703	error = namei(&nd);
704	if (error)
705		goto done;
706	vp = nd.ni_vp;
707	if (vp != NULL || vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
708		NDFREE(&nd, NDF_ONLY_PNBUF);
709		if (nd.ni_dvp == vp)
710			vrele(nd.ni_dvp);
711		else
712			vput(nd.ni_dvp);
713		if (vp != NULL) {
714			vrele(vp);
715			error = EADDRINUSE;
716			goto done;
717		}
718		error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH);
719		if (error)
720			goto done;
721		goto restart;
722	}
723	VATTR_NULL(&vattr);
724	vattr.va_type = VSOCK;
725	vattr.va_mode = (ACCESSPERMS & ~td->td_proc->p_fd->fd_cmask);
726#ifdef MAC
727	error = mac_check_vnode_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
728	    &vattr);
729#endif
730	if (error == 0) {
731		VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE);
732		error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
733	}
734	NDFREE(&nd, NDF_ONLY_PNBUF);
735	vput(nd.ni_dvp);
736	if (error)
737		goto done;
738	vp = nd.ni_vp;
739	ASSERT_VOP_LOCKED(vp, "unp_bind");
740	soun = (struct sockaddr_un *)sodupsockaddr(nam, M_WAITOK);
741	UNP_LOCK();
742	vp->v_socket = unp->unp_socket;
743	unp->unp_vnode = vp;
744	unp->unp_addr = soun;
745	UNP_UNLOCK();
746	VOP_UNLOCK(vp, 0, td);
747	vn_finished_write(mp);
748done:
749	mtx_unlock(&Giant);
750	free(buf, M_TEMP);
751	return (error);
752}
753
754static int
755unp_connect(so, nam, td)
756	struct socket *so;
757	struct sockaddr *nam;
758	struct thread *td;
759{
760	register struct sockaddr_un *soun = (struct sockaddr_un *)nam;
761	register struct vnode *vp;
762	register struct socket *so2, *so3;
763	struct unpcb *unp = sotounpcb(so);
764	struct unpcb *unp2, *unp3;
765	int error, len;
766	struct nameidata nd;
767	char buf[SOCK_MAXADDRLEN];
768	struct sockaddr *sa;
769
770	UNP_LOCK_ASSERT();
771
772	len = nam->sa_len - offsetof(struct sockaddr_un, sun_path);
773	if (len <= 0)
774		return (EINVAL);
775	strlcpy(buf, soun->sun_path, len + 1);
776	UNP_UNLOCK();
777	sa = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
778	mtx_lock(&Giant);
779	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, buf, td);
780	error = namei(&nd);
781	if (error)
782		vp = NULL;
783	else
784		vp = nd.ni_vp;
785	ASSERT_VOP_LOCKED(vp, "unp_connect");
786	NDFREE(&nd, NDF_ONLY_PNBUF);
787	if (error)
788		goto bad;
789
790	if (vp->v_type != VSOCK) {
791		error = ENOTSOCK;
792		goto bad;
793	}
794	error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td);
795	if (error)
796		goto bad;
797	so2 = vp->v_socket;
798	if (so2 == NULL) {
799		error = ECONNREFUSED;
800		goto bad;
801	}
802	if (so->so_type != so2->so_type) {
803		error = EPROTOTYPE;
804		goto bad;
805	}
806	mtx_unlock(&Giant);
807	UNP_LOCK();
808	if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
809		if (so2->so_options & SO_ACCEPTCONN) {
810			/*
811			 * NB: drop locks here so unp_attach is entered
812			 *     w/o locks; this avoids a recursive lock
813			 *     of the head and holding sleep locks across
814			 *     a (potentially) blocking malloc.
815			 */
816			UNP_UNLOCK();
817			so3 = sonewconn(so2, 0);
818			UNP_LOCK();
819		} else
820			so3 = NULL;
821		if (so3 == NULL) {
822			error = ECONNREFUSED;
823			goto bad2;
824		}
825		unp = sotounpcb(so);
826		unp2 = sotounpcb(so2);
827		unp3 = sotounpcb(so3);
828		if (unp2->unp_addr != NULL) {
829			bcopy(unp2->unp_addr, sa, unp2->unp_addr->sun_len);
830			unp3->unp_addr = (struct sockaddr_un *) sa;
831			sa = NULL;
832		}
833		/*
834		 * unp_peercred management:
835		 *
836		 * The connecter's (client's) credentials are copied
837		 * from its process structure at the time of connect()
838		 * (which is now).
839		 */
840		cru2x(td->td_ucred, &unp3->unp_peercred);
841		unp3->unp_flags |= UNP_HAVEPC;
842		/*
843		 * The receiver's (server's) credentials are copied
844		 * from the unp_peercred member of socket on which the
845		 * former called listen(); unp_listen() cached that
846		 * process's credentials at that time so we can use
847		 * them now.
848		 */
849		KASSERT(unp2->unp_flags & UNP_HAVEPCCACHED,
850		    ("unp_connect: listener without cached peercred"));
851		memcpy(&unp->unp_peercred, &unp2->unp_peercred,
852		    sizeof(unp->unp_peercred));
853		unp->unp_flags |= UNP_HAVEPC;
854#ifdef MAC
855		SOCK_LOCK(so);
856		mac_set_socket_peer_from_socket(so, so3);
857		mac_set_socket_peer_from_socket(so3, so);
858		SOCK_UNLOCK(so);
859#endif
860
861		so2 = so3;
862	}
863	error = unp_connect2(so, so2);
864bad2:
865	UNP_UNLOCK();
866	mtx_lock(&Giant);
867bad:
868	mtx_assert(&Giant, MA_OWNED);
869	if (vp != NULL)
870		vput(vp);
871	mtx_unlock(&Giant);
872	free(sa, M_SONAME);
873	UNP_LOCK();
874	return (error);
875}
876
877static int
878unp_connect2(so, so2)
879	register struct socket *so;
880	register struct socket *so2;
881{
882	register struct unpcb *unp = sotounpcb(so);
883	register struct unpcb *unp2;
884
885	UNP_LOCK_ASSERT();
886
887	if (so2->so_type != so->so_type)
888		return (EPROTOTYPE);
889	unp2 = sotounpcb(so2);
890	unp->unp_conn = unp2;
891	switch (so->so_type) {
892
893	case SOCK_DGRAM:
894		LIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_reflink);
895		soisconnected(so);
896		break;
897
898	case SOCK_STREAM:
899		unp2->unp_conn = unp;
900		soisconnected(so);
901		soisconnected(so2);
902		break;
903
904	default:
905		panic("unp_connect2");
906	}
907	return (0);
908}
909
910static void
911unp_disconnect(unp)
912	struct unpcb *unp;
913{
914	register struct unpcb *unp2 = unp->unp_conn;
915	struct socket *so;
916
917	UNP_LOCK_ASSERT();
918
919	if (unp2 == NULL)
920		return;
921	unp->unp_conn = NULL;
922	switch (unp->unp_socket->so_type) {
923
924	case SOCK_DGRAM:
925		LIST_REMOVE(unp, unp_reflink);
926		so = unp->unp_socket;
927		SOCK_LOCK(so);
928		so->so_state &= ~SS_ISCONNECTED;
929		SOCK_UNLOCK(so);
930		break;
931
932	case SOCK_STREAM:
933		soisdisconnected(unp->unp_socket);
934		unp2->unp_conn = NULL;
935		soisdisconnected(unp2->unp_socket);
936		break;
937	}
938}
939
940#ifdef notdef
941void
942unp_abort(unp)
943	struct unpcb *unp;
944{
945
946	unp_detach(unp);
947}
948#endif
949
950/*
951 * unp_pcblist() assumes that UNIX domain socket memory is never reclaimed
952 * by the zone (UMA_ZONE_NOFREE), and as such potentially stale pointers
953 * are safe to reference.  It first scans the list of struct unpcb's to
954 * generate a pointer list, then it rescans its list one entry at a time to
955 * externalize and copyout.  It checks the generation number to see if a
956 * struct unpcb has been reused, and will skip it if so.
957 */
958static int
959unp_pcblist(SYSCTL_HANDLER_ARGS)
960{
961	int error, i, n;
962	struct unpcb *unp, **unp_list;
963	unp_gen_t gencnt;
964	struct xunpgen *xug;
965	struct unp_head *head;
966	struct xunpcb *xu;
967
968	head = ((intptr_t)arg1 == SOCK_DGRAM ? &unp_dhead : &unp_shead);
969
970	/*
971	 * The process of preparing the PCB list is too time-consuming and
972	 * resource-intensive to repeat twice on every request.
973	 */
974	if (req->oldptr == NULL) {
975		n = unp_count;
976		req->oldidx = 2 * (sizeof *xug)
977			+ (n + n/8) * sizeof(struct xunpcb);
978		return (0);
979	}
980
981	if (req->newptr != NULL)
982		return (EPERM);
983
984	/*
985	 * OK, now we're committed to doing something.
986	 */
987	xug = malloc(sizeof(*xug), M_TEMP, M_WAITOK);
988	UNP_LOCK();
989	gencnt = unp_gencnt;
990	n = unp_count;
991	UNP_UNLOCK();
992
993	xug->xug_len = sizeof *xug;
994	xug->xug_count = n;
995	xug->xug_gen = gencnt;
996	xug->xug_sogen = so_gencnt;
997	error = SYSCTL_OUT(req, xug, sizeof *xug);
998	if (error) {
999		free(xug, M_TEMP);
1000		return (error);
1001	}
1002
1003	unp_list = malloc(n * sizeof *unp_list, M_TEMP, M_WAITOK);
1004
1005	UNP_LOCK();
1006	for (unp = LIST_FIRST(head), i = 0; unp && i < n;
1007	     unp = LIST_NEXT(unp, unp_link)) {
1008		if (unp->unp_gencnt <= gencnt) {
1009			if (cr_cansee(req->td->td_ucred,
1010			    unp->unp_socket->so_cred))
1011				continue;
1012			unp_list[i++] = unp;
1013		}
1014	}
1015	UNP_UNLOCK();
1016	n = i;			/* in case we lost some during malloc */
1017
1018	error = 0;
1019	xu = malloc(sizeof(*xu), M_TEMP, M_WAITOK);
1020	for (i = 0; i < n; i++) {
1021		unp = unp_list[i];
1022		if (unp->unp_gencnt <= gencnt) {
1023			xu->xu_len = sizeof *xu;
1024			xu->xu_unpp = unp;
1025			/*
1026			 * XXX - need more locking here to protect against
1027			 * connect/disconnect races for SMP.
1028			 */
1029			if (unp->unp_addr != NULL)
1030				bcopy(unp->unp_addr, &xu->xu_addr,
1031				      unp->unp_addr->sun_len);
1032			if (unp->unp_conn != NULL &&
1033			    unp->unp_conn->unp_addr != NULL)
1034				bcopy(unp->unp_conn->unp_addr,
1035				      &xu->xu_caddr,
1036				      unp->unp_conn->unp_addr->sun_len);
1037			bcopy(unp, &xu->xu_unp, sizeof *unp);
1038			sotoxsocket(unp->unp_socket, &xu->xu_socket);
1039			error = SYSCTL_OUT(req, xu, sizeof *xu);
1040		}
1041	}
1042	free(xu, M_TEMP);
1043	if (!error) {
1044		/*
1045		 * Give the user an updated idea of our state.
1046		 * If the generation differs from what we told
1047		 * her before, she knows that something happened
1048		 * while we were processing this request, and it
1049		 * might be necessary to retry.
1050		 */
1051		xug->xug_gen = unp_gencnt;
1052		xug->xug_sogen = so_gencnt;
1053		xug->xug_count = unp_count;
1054		error = SYSCTL_OUT(req, xug, sizeof *xug);
1055	}
1056	free(unp_list, M_TEMP);
1057	free(xug, M_TEMP);
1058	return (error);
1059}
1060
1061SYSCTL_PROC(_net_local_dgram, OID_AUTO, pcblist, CTLFLAG_RD,
1062	    (caddr_t)(long)SOCK_DGRAM, 0, unp_pcblist, "S,xunpcb",
1063	    "List of active local datagram sockets");
1064SYSCTL_PROC(_net_local_stream, OID_AUTO, pcblist, CTLFLAG_RD,
1065	    (caddr_t)(long)SOCK_STREAM, 0, unp_pcblist, "S,xunpcb",
1066	    "List of active local stream sockets");
1067
1068static void
1069unp_shutdown(unp)
1070	struct unpcb *unp;
1071{
1072	struct socket *so;
1073
1074	UNP_LOCK_ASSERT();
1075
1076	if (unp->unp_socket->so_type == SOCK_STREAM && unp->unp_conn &&
1077	    (so = unp->unp_conn->unp_socket))
1078		socantrcvmore(so);
1079}
1080
1081static void
1082unp_drop(unp, errno)
1083	struct unpcb *unp;
1084	int errno;
1085{
1086	struct socket *so = unp->unp_socket;
1087
1088	UNP_LOCK_ASSERT();
1089
1090	so->so_error = errno;
1091	unp_disconnect(unp);
1092}
1093
1094#ifdef notdef
1095void
1096unp_drain()
1097{
1098
1099}
1100#endif
1101
1102static void
1103unp_freerights(rp, fdcount)
1104	struct file **rp;
1105	int fdcount;
1106{
1107	int i;
1108	struct file *fp;
1109
1110	for (i = 0; i < fdcount; i++) {
1111		fp = *rp;
1112		/*
1113		 * zero the pointer before calling
1114		 * unp_discard since it may end up
1115		 * in unp_gc()..
1116		 */
1117		*rp++ = 0;
1118		unp_discard(fp);
1119	}
1120}
1121
1122int
1123unp_externalize(control, controlp)
1124	struct mbuf *control, **controlp;
1125{
1126	struct thread *td = curthread;		/* XXX */
1127	struct cmsghdr *cm = mtod(control, struct cmsghdr *);
1128	int i;
1129	int *fdp;
1130	struct file **rp;
1131	struct file *fp;
1132	void *data;
1133	socklen_t clen = control->m_len, datalen;
1134	int error, newfds;
1135	int f;
1136	u_int newlen;
1137
1138	error = 0;
1139	if (controlp != NULL) /* controlp == NULL => free control messages */
1140		*controlp = NULL;
1141
1142	while (cm != NULL) {
1143		if (sizeof(*cm) > clen || cm->cmsg_len > clen) {
1144			error = EINVAL;
1145			break;
1146		}
1147
1148		data = CMSG_DATA(cm);
1149		datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data;
1150
1151		if (cm->cmsg_level == SOL_SOCKET
1152		    && cm->cmsg_type == SCM_RIGHTS) {
1153			newfds = datalen / sizeof(struct file *);
1154			rp = data;
1155
1156			/* If we're not outputting the descriptors free them. */
1157			if (error || controlp == NULL) {
1158				unp_freerights(rp, newfds);
1159				goto next;
1160			}
1161			FILEDESC_LOCK(td->td_proc->p_fd);
1162			/* if the new FD's will not fit free them.  */
1163			if (!fdavail(td, newfds)) {
1164				FILEDESC_UNLOCK(td->td_proc->p_fd);
1165				error = EMSGSIZE;
1166				unp_freerights(rp, newfds);
1167				goto next;
1168			}
1169			/*
1170			 * now change each pointer to an fd in the global
1171			 * table to an integer that is the index to the
1172			 * local fd table entry that we set up to point
1173			 * to the global one we are transferring.
1174			 */
1175			newlen = newfds * sizeof(int);
1176			*controlp = sbcreatecontrol(NULL, newlen,
1177			    SCM_RIGHTS, SOL_SOCKET);
1178			if (*controlp == NULL) {
1179				FILEDESC_UNLOCK(td->td_proc->p_fd);
1180				error = E2BIG;
1181				unp_freerights(rp, newfds);
1182				goto next;
1183			}
1184
1185			fdp = (int *)
1186			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
1187			for (i = 0; i < newfds; i++) {
1188				if (fdalloc(td, 0, &f))
1189					panic("unp_externalize fdalloc failed");
1190				fp = *rp++;
1191				td->td_proc->p_fd->fd_ofiles[f] = fp;
1192				FILE_LOCK(fp);
1193				fp->f_msgcount--;
1194				FILE_UNLOCK(fp);
1195				unp_rights--;
1196				*fdp++ = f;
1197			}
1198			FILEDESC_UNLOCK(td->td_proc->p_fd);
1199		} else { /* We can just copy anything else across */
1200			if (error || controlp == NULL)
1201				goto next;
1202			*controlp = sbcreatecontrol(NULL, datalen,
1203			    cm->cmsg_type, cm->cmsg_level);
1204			if (*controlp == NULL) {
1205				error = ENOBUFS;
1206				goto next;
1207			}
1208			bcopy(data,
1209			    CMSG_DATA(mtod(*controlp, struct cmsghdr *)),
1210			    datalen);
1211		}
1212
1213		controlp = &(*controlp)->m_next;
1214
1215next:
1216		if (CMSG_SPACE(datalen) < clen) {
1217			clen -= CMSG_SPACE(datalen);
1218			cm = (struct cmsghdr *)
1219			    ((caddr_t)cm + CMSG_SPACE(datalen));
1220		} else {
1221			clen = 0;
1222			cm = NULL;
1223		}
1224	}
1225
1226	m_freem(control);
1227
1228	return (error);
1229}
1230
1231void
1232unp_init(void)
1233{
1234	unp_zone = uma_zcreate("unpcb", sizeof(struct unpcb), NULL, NULL,
1235	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
1236	if (unp_zone == NULL)
1237		panic("unp_init");
1238	uma_zone_set_max(unp_zone, nmbclusters);
1239	LIST_INIT(&unp_dhead);
1240	LIST_INIT(&unp_shead);
1241
1242	UNP_LOCK_INIT();
1243}
1244
1245static int
1246unp_internalize(controlp, td)
1247	struct mbuf **controlp;
1248	struct thread *td;
1249{
1250	struct mbuf *control = *controlp;
1251	struct proc *p = td->td_proc;
1252	struct filedesc *fdescp = p->p_fd;
1253	struct cmsghdr *cm = mtod(control, struct cmsghdr *);
1254	struct cmsgcred *cmcred;
1255	struct file **rp;
1256	struct file *fp;
1257	struct timeval *tv;
1258	int i, fd, *fdp;
1259	void *data;
1260	socklen_t clen = control->m_len, datalen;
1261	int error, oldfds;
1262	u_int newlen;
1263
1264	error = 0;
1265	*controlp = NULL;
1266
1267	while (cm != NULL) {
1268		if (sizeof(*cm) > clen || cm->cmsg_level != SOL_SOCKET
1269		    || cm->cmsg_len > clen) {
1270			error = EINVAL;
1271			goto out;
1272		}
1273
1274		data = CMSG_DATA(cm);
1275		datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data;
1276
1277		switch (cm->cmsg_type) {
1278		/*
1279		 * Fill in credential information.
1280		 */
1281		case SCM_CREDS:
1282			*controlp = sbcreatecontrol(NULL, sizeof(*cmcred),
1283			    SCM_CREDS, SOL_SOCKET);
1284			if (*controlp == NULL) {
1285				error = ENOBUFS;
1286				goto out;
1287			}
1288
1289			cmcred = (struct cmsgcred *)
1290			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
1291			cmcred->cmcred_pid = p->p_pid;
1292			cmcred->cmcred_uid = td->td_ucred->cr_ruid;
1293			cmcred->cmcred_gid = td->td_ucred->cr_rgid;
1294			cmcred->cmcred_euid = td->td_ucred->cr_uid;
1295			cmcred->cmcred_ngroups = MIN(td->td_ucred->cr_ngroups,
1296							CMGROUP_MAX);
1297			for (i = 0; i < cmcred->cmcred_ngroups; i++)
1298				cmcred->cmcred_groups[i] =
1299				    td->td_ucred->cr_groups[i];
1300			break;
1301
1302		case SCM_RIGHTS:
1303			oldfds = datalen / sizeof (int);
1304			/*
1305			 * check that all the FDs passed in refer to legal files
1306			 * If not, reject the entire operation.
1307			 */
1308			fdp = data;
1309			FILEDESC_LOCK(fdescp);
1310			for (i = 0; i < oldfds; i++) {
1311				fd = *fdp++;
1312				if ((unsigned)fd >= fdescp->fd_nfiles ||
1313				    fdescp->fd_ofiles[fd] == NULL) {
1314					FILEDESC_UNLOCK(fdescp);
1315					error = EBADF;
1316					goto out;
1317				}
1318				fp = fdescp->fd_ofiles[fd];
1319				if (!(fp->f_ops->fo_flags & DFLAG_PASSABLE)) {
1320					FILEDESC_UNLOCK(fdescp);
1321					error = EOPNOTSUPP;
1322					goto out;
1323				}
1324
1325			}
1326			/*
1327			 * Now replace the integer FDs with pointers to
1328			 * the associated global file table entry..
1329			 */
1330			newlen = oldfds * sizeof(struct file *);
1331			*controlp = sbcreatecontrol(NULL, newlen,
1332			    SCM_RIGHTS, SOL_SOCKET);
1333			if (*controlp == NULL) {
1334				FILEDESC_UNLOCK(fdescp);
1335				error = E2BIG;
1336				goto out;
1337			}
1338
1339			fdp = data;
1340			rp = (struct file **)
1341			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
1342			for (i = 0; i < oldfds; i++) {
1343				fp = fdescp->fd_ofiles[*fdp++];
1344				*rp++ = fp;
1345				FILE_LOCK(fp);
1346				fp->f_count++;
1347				fp->f_msgcount++;
1348				FILE_UNLOCK(fp);
1349				unp_rights++;
1350			}
1351			FILEDESC_UNLOCK(fdescp);
1352			break;
1353
1354		case SCM_TIMESTAMP:
1355			*controlp = sbcreatecontrol(NULL, sizeof(*tv),
1356			    SCM_TIMESTAMP, SOL_SOCKET);
1357			if (*controlp == NULL) {
1358				error = ENOBUFS;
1359				goto out;
1360			}
1361			tv = (struct timeval *)
1362			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
1363			microtime(tv);
1364			break;
1365
1366		default:
1367			error = EINVAL;
1368			goto out;
1369		}
1370
1371		controlp = &(*controlp)->m_next;
1372
1373		if (CMSG_SPACE(datalen) < clen) {
1374			clen -= CMSG_SPACE(datalen);
1375			cm = (struct cmsghdr *)
1376			    ((caddr_t)cm + CMSG_SPACE(datalen));
1377		} else {
1378			clen = 0;
1379			cm = NULL;
1380		}
1381	}
1382
1383out:
1384	m_freem(control);
1385
1386	return (error);
1387}
1388
1389static int	unp_defer, unp_gcing;
1390
1391static void
1392unp_gc()
1393{
1394	register struct file *fp, *nextfp;
1395	register struct socket *so;
1396	struct file **extra_ref, **fpp;
1397	int nunref, i;
1398
1399	UNP_LOCK_ASSERT();
1400
1401	if (unp_gcing)
1402		return;
1403	unp_gcing = 1;
1404	unp_defer = 0;
1405	/*
1406	 * before going through all this, set all FDs to
1407	 * be NOT defered and NOT externally accessible
1408	 */
1409	/*
1410	 * XXXRW: Acquiring a sleep lock while holding UNP
1411	 * mutex cannot be a good thing.
1412	 */
1413	sx_slock(&filelist_lock);
1414	LIST_FOREACH(fp, &filehead, f_list)
1415		fp->f_gcflag &= ~(FMARK|FDEFER);
1416	do {
1417		LIST_FOREACH(fp, &filehead, f_list) {
1418			FILE_LOCK(fp);
1419			/*
1420			 * If the file is not open, skip it
1421			 */
1422			if (fp->f_count == 0) {
1423				FILE_UNLOCK(fp);
1424				continue;
1425			}
1426			/*
1427			 * If we already marked it as 'defer'  in a
1428			 * previous pass, then try process it this time
1429			 * and un-mark it
1430			 */
1431			if (fp->f_gcflag & FDEFER) {
1432				fp->f_gcflag &= ~FDEFER;
1433				unp_defer--;
1434			} else {
1435				/*
1436				 * if it's not defered, then check if it's
1437				 * already marked.. if so skip it
1438				 */
1439				if (fp->f_gcflag & FMARK) {
1440					FILE_UNLOCK(fp);
1441					continue;
1442				}
1443				/*
1444				 * If all references are from messages
1445				 * in transit, then skip it. it's not
1446				 * externally accessible.
1447				 */
1448				if (fp->f_count == fp->f_msgcount) {
1449					FILE_UNLOCK(fp);
1450					continue;
1451				}
1452				/*
1453				 * If it got this far then it must be
1454				 * externally accessible.
1455				 */
1456				fp->f_gcflag |= FMARK;
1457			}
1458			/*
1459			 * either it was defered, or it is externally
1460			 * accessible and not already marked so.
1461			 * Now check if it is possibly one of OUR sockets.
1462			 */
1463			if (fp->f_type != DTYPE_SOCKET ||
1464			    (so = fp->f_data) == NULL) {
1465				FILE_UNLOCK(fp);
1466				continue;
1467			}
1468			FILE_UNLOCK(fp);
1469			if (so->so_proto->pr_domain != &localdomain ||
1470			    (so->so_proto->pr_flags&PR_RIGHTS) == 0)
1471				continue;
1472#ifdef notdef
1473			if (so->so_rcv.sb_flags & SB_LOCK) {
1474				/*
1475				 * This is problematical; it's not clear
1476				 * we need to wait for the sockbuf to be
1477				 * unlocked (on a uniprocessor, at least),
1478				 * and it's also not clear what to do
1479				 * if sbwait returns an error due to receipt
1480				 * of a signal.  If sbwait does return
1481				 * an error, we'll go into an infinite
1482				 * loop.  Delete all of this for now.
1483				 */
1484				(void) sbwait(&so->so_rcv);
1485				goto restart;
1486			}
1487#endif
1488			/*
1489			 * So, Ok, it's one of our sockets and it IS externally
1490			 * accessible (or was defered). Now we look
1491			 * to see if we hold any file descriptors in its
1492			 * message buffers. Follow those links and mark them
1493			 * as accessible too.
1494			 */
1495			SOCKBUF_LOCK(&so->so_rcv);
1496			unp_scan(so->so_rcv.sb_mb, unp_mark);
1497			SOCKBUF_UNLOCK(&so->so_rcv);
1498		}
1499	} while (unp_defer);
1500	sx_sunlock(&filelist_lock);
1501	/*
1502	 * We grab an extra reference to each of the file table entries
1503	 * that are not otherwise accessible and then free the rights
1504	 * that are stored in messages on them.
1505	 *
1506	 * The bug in the orginal code is a little tricky, so I'll describe
1507	 * what's wrong with it here.
1508	 *
1509	 * It is incorrect to simply unp_discard each entry for f_msgcount
1510	 * times -- consider the case of sockets A and B that contain
1511	 * references to each other.  On a last close of some other socket,
1512	 * we trigger a gc since the number of outstanding rights (unp_rights)
1513	 * is non-zero.  If during the sweep phase the gc code un_discards,
1514	 * we end up doing a (full) closef on the descriptor.  A closef on A
1515	 * results in the following chain.  Closef calls soo_close, which
1516	 * calls soclose.   Soclose calls first (through the switch
1517	 * uipc_usrreq) unp_detach, which re-invokes unp_gc.  Unp_gc simply
1518	 * returns because the previous instance had set unp_gcing, and
1519	 * we return all the way back to soclose, which marks the socket
1520	 * with SS_NOFDREF, and then calls sofree.  Sofree calls sorflush
1521	 * to free up the rights that are queued in messages on the socket A,
1522	 * i.e., the reference on B.  The sorflush calls via the dom_dispose
1523	 * switch unp_dispose, which unp_scans with unp_discard.  This second
1524	 * instance of unp_discard just calls closef on B.
1525	 *
1526	 * Well, a similar chain occurs on B, resulting in a sorflush on B,
1527	 * which results in another closef on A.  Unfortunately, A is already
1528	 * being closed, and the descriptor has already been marked with
1529	 * SS_NOFDREF, and soclose panics at this point.
1530	 *
1531	 * Here, we first take an extra reference to each inaccessible
1532	 * descriptor.  Then, we call sorflush ourself, since we know
1533	 * it is a Unix domain socket anyhow.  After we destroy all the
1534	 * rights carried in messages, we do a last closef to get rid
1535	 * of our extra reference.  This is the last close, and the
1536	 * unp_detach etc will shut down the socket.
1537	 *
1538	 * 91/09/19, bsy@cs.cmu.edu
1539	 */
1540	extra_ref = malloc(nfiles * sizeof(struct file *), M_TEMP, M_WAITOK);
1541	sx_slock(&filelist_lock);
1542	for (nunref = 0, fp = LIST_FIRST(&filehead), fpp = extra_ref;
1543	    fp != NULL; fp = nextfp) {
1544		nextfp = LIST_NEXT(fp, f_list);
1545		FILE_LOCK(fp);
1546		/*
1547		 * If it's not open, skip it
1548		 */
1549		if (fp->f_count == 0) {
1550			FILE_UNLOCK(fp);
1551			continue;
1552		}
1553		/*
1554		 * If all refs are from msgs, and it's not marked accessible
1555		 * then it must be referenced from some unreachable cycle
1556		 * of (shut-down) FDs, so include it in our
1557		 * list of FDs to remove
1558		 */
1559		if (fp->f_count == fp->f_msgcount && !(fp->f_gcflag & FMARK)) {
1560			*fpp++ = fp;
1561			nunref++;
1562			fp->f_count++;
1563		}
1564		FILE_UNLOCK(fp);
1565	}
1566	sx_sunlock(&filelist_lock);
1567	/*
1568	 * for each FD on our hit list, do the following two things
1569	 */
1570	for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) {
1571		struct file *tfp = *fpp;
1572		FILE_LOCK(tfp);
1573		if (tfp->f_type == DTYPE_SOCKET &&
1574		    tfp->f_data != NULL) {
1575			FILE_UNLOCK(tfp);
1576			sorflush(tfp->f_data);
1577		} else {
1578			FILE_UNLOCK(tfp);
1579		}
1580	}
1581	for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp)
1582		closef(*fpp, (struct thread *) NULL);
1583	free(extra_ref, M_TEMP);
1584	unp_gcing = 0;
1585}
1586
1587void
1588unp_dispose(m)
1589	struct mbuf *m;
1590{
1591
1592	if (m)
1593		unp_scan(m, unp_discard);
1594}
1595
1596static int
1597unp_listen(unp, td)
1598	struct unpcb *unp;
1599	struct thread *td;
1600{
1601	UNP_LOCK_ASSERT();
1602
1603	/*
1604	 * XXXRW: Why populate the local peer cred with our own credential?
1605	 */
1606	cru2x(td->td_ucred, &unp->unp_peercred);
1607	unp->unp_flags |= UNP_HAVEPCCACHED;
1608	return (0);
1609}
1610
1611static void
1612unp_scan(m0, op)
1613	register struct mbuf *m0;
1614	void (*op)(struct file *);
1615{
1616	struct mbuf *m;
1617	struct file **rp;
1618	struct cmsghdr *cm;
1619	void *data;
1620	int i;
1621	socklen_t clen, datalen;
1622	int qfds;
1623
1624	while (m0 != NULL) {
1625		for (m = m0; m; m = m->m_next) {
1626			if (m->m_type != MT_CONTROL)
1627				continue;
1628
1629			cm = mtod(m, struct cmsghdr *);
1630			clen = m->m_len;
1631
1632			while (cm != NULL) {
1633				if (sizeof(*cm) > clen || cm->cmsg_len > clen)
1634					break;
1635
1636				data = CMSG_DATA(cm);
1637				datalen = (caddr_t)cm + cm->cmsg_len
1638				    - (caddr_t)data;
1639
1640				if (cm->cmsg_level == SOL_SOCKET &&
1641				    cm->cmsg_type == SCM_RIGHTS) {
1642					qfds = datalen / sizeof (struct file *);
1643					rp = data;
1644					for (i = 0; i < qfds; i++)
1645						(*op)(*rp++);
1646				}
1647
1648				if (CMSG_SPACE(datalen) < clen) {
1649					clen -= CMSG_SPACE(datalen);
1650					cm = (struct cmsghdr *)
1651					    ((caddr_t)cm + CMSG_SPACE(datalen));
1652				} else {
1653					clen = 0;
1654					cm = NULL;
1655				}
1656			}
1657		}
1658		m0 = m0->m_act;
1659	}
1660}
1661
1662static void
1663unp_mark(fp)
1664	struct file *fp;
1665{
1666	if (fp->f_gcflag & FMARK)
1667		return;
1668	unp_defer++;
1669	fp->f_gcflag |= (FMARK|FDEFER);
1670}
1671
1672static void
1673unp_discard(fp)
1674	struct file *fp;
1675{
1676	FILE_LOCK(fp);
1677	fp->f_msgcount--;
1678	unp_rights--;
1679	FILE_UNLOCK(fp);
1680	(void) closef(fp, (struct thread *)NULL);
1681}
1682