uipc_usrreq.c revision 133709
1/*
2 * Copyright (c) 1982, 1986, 1989, 1991, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 4. Neither the name of the University nor the names of its contributors
14 *    may be used to endorse or promote products derived from this software
15 *    without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 *
29 *	From: @(#)uipc_usrreq.c	8.3 (Berkeley) 1/4/94
30 */
31
32#include <sys/cdefs.h>
33__FBSDID("$FreeBSD: head/sys/kern/uipc_usrreq.c 133709 2004-08-14 03:43:49Z rwatson $");
34
35#include "opt_mac.h"
36
37#include <sys/param.h>
38#include <sys/domain.h>
39#include <sys/fcntl.h>
40#include <sys/malloc.h>		/* XXX must be before <sys/file.h> */
41#include <sys/file.h>
42#include <sys/filedesc.h>
43#include <sys/jail.h>
44#include <sys/kernel.h>
45#include <sys/lock.h>
46#include <sys/mac.h>
47#include <sys/mbuf.h>
48#include <sys/mutex.h>
49#include <sys/namei.h>
50#include <sys/proc.h>
51#include <sys/protosw.h>
52#include <sys/resourcevar.h>
53#include <sys/socket.h>
54#include <sys/socketvar.h>
55#include <sys/signalvar.h>
56#include <sys/stat.h>
57#include <sys/sx.h>
58#include <sys/sysctl.h>
59#include <sys/systm.h>
60#include <sys/un.h>
61#include <sys/unpcb.h>
62#include <sys/vnode.h>
63
64#include <vm/uma.h>
65
66static uma_zone_t unp_zone;
67static	unp_gen_t unp_gencnt;
68static	u_int unp_count;
69
70static	struct unp_head unp_shead, unp_dhead;
71
72/*
73 * Unix communications domain.
74 *
75 * TODO:
76 *	SEQPACKET, RDM
77 *	rethink name space problems
78 *	need a proper out-of-band
79 *	lock pushdown
80 */
81static const struct	sockaddr sun_noname = { sizeof(sun_noname), AF_LOCAL };
82static ino_t	unp_ino;		/* prototype for fake inode numbers */
83
84static struct mtx unp_mtx;
85#define	UNP_LOCK_INIT() \
86	mtx_init(&unp_mtx, "unp", NULL, MTX_DEF)
87#define	UNP_LOCK()		mtx_lock(&unp_mtx)
88#define	UNP_UNLOCK()		mtx_unlock(&unp_mtx)
89#define	UNP_LOCK_ASSERT()	mtx_assert(&unp_mtx, MA_OWNED)
90
91static int     unp_attach(struct socket *);
92static void    unp_detach(struct unpcb *);
93static int     unp_bind(struct unpcb *,struct sockaddr *, struct thread *);
94static int     unp_connect(struct socket *,struct sockaddr *, struct thread *);
95static int     unp_connect2(struct socket *so, struct socket *so2);
96static void    unp_disconnect(struct unpcb *);
97static void    unp_shutdown(struct unpcb *);
98static void    unp_drop(struct unpcb *, int);
99static void    unp_gc(void);
100static void    unp_scan(struct mbuf *, void (*)(struct file *));
101static void    unp_mark(struct file *);
102static void    unp_discard(struct file *);
103static void    unp_freerights(struct file **, int);
104static int     unp_internalize(struct mbuf **, struct thread *);
105static int     unp_listen(struct unpcb *, struct thread *);
106
107static int
108uipc_abort(struct socket *so)
109{
110	struct unpcb *unp = sotounpcb(so);
111
112	if (unp == NULL)
113		return (EINVAL);
114	UNP_LOCK();
115	unp_drop(unp, ECONNABORTED);
116	unp_detach(unp);	/* NB: unlocks */
117	SOCK_LOCK(so);
118	sotryfree(so);
119	return (0);
120}
121
122static int
123uipc_accept(struct socket *so, struct sockaddr **nam)
124{
125	struct unpcb *unp = sotounpcb(so);
126	const struct sockaddr *sa;
127
128	if (unp == NULL)
129		return (EINVAL);
130
131	/*
132	 * Pass back name of connected socket,
133	 * if it was bound and we are still connected
134	 * (our peer may have closed already!).
135	 */
136	*nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
137	UNP_LOCK();
138	if (unp->unp_conn != NULL && unp->unp_conn->unp_addr != NULL)
139		sa = (struct sockaddr *) unp->unp_conn->unp_addr;
140	else
141		sa = &sun_noname;
142	bcopy(sa, *nam, sa->sa_len);
143	UNP_UNLOCK();
144	return (0);
145}
146
147static int
148uipc_attach(struct socket *so, int proto, struct thread *td)
149{
150	struct unpcb *unp = sotounpcb(so);
151
152	if (unp != NULL)
153		return (EISCONN);
154	return (unp_attach(so));
155}
156
157static int
158uipc_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
159{
160	struct unpcb *unp = sotounpcb(so);
161
162	if (unp == NULL)
163		return (EINVAL);
164
165	return (unp_bind(unp, nam, td));
166}
167
168static int
169uipc_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
170{
171	struct unpcb *unp;
172	int error;
173
174	KASSERT(td == curthread, ("uipc_connect: td != curthread"));
175
176	UNP_LOCK();
177	unp = sotounpcb(so);
178	if (unp == NULL) {
179		error = EINVAL;
180		goto out;
181	}
182	error = unp_connect(so, nam, td);
183out:
184	UNP_UNLOCK();
185	return (error);
186}
187
188int
189uipc_connect2(struct socket *so1, struct socket *so2)
190{
191	struct unpcb *unp = sotounpcb(so1);
192	int error;
193
194	if (unp == NULL)
195		return (EINVAL);
196
197	UNP_LOCK();
198	error = unp_connect2(so1, so2);
199	UNP_UNLOCK();
200	return (error);
201}
202
203/* control is EOPNOTSUPP */
204
205static int
206uipc_detach(struct socket *so)
207{
208	struct unpcb *unp = sotounpcb(so);
209
210	if (unp == NULL)
211		return (EINVAL);
212
213	UNP_LOCK();
214	unp_detach(unp);	/* NB: unlocks unp */
215	return (0);
216}
217
218static int
219uipc_disconnect(struct socket *so)
220{
221	struct unpcb *unp = sotounpcb(so);
222
223	if (unp == NULL)
224		return (EINVAL);
225	UNP_LOCK();
226	unp_disconnect(unp);
227	UNP_UNLOCK();
228	return (0);
229}
230
231static int
232uipc_listen(struct socket *so, struct thread *td)
233{
234	struct unpcb *unp = sotounpcb(so);
235	int error;
236
237	if (unp == NULL || unp->unp_vnode == NULL)
238		return (EINVAL);
239	UNP_LOCK();
240	error = unp_listen(unp, td);
241	UNP_UNLOCK();
242	return (error);
243}
244
245static int
246uipc_peeraddr(struct socket *so, struct sockaddr **nam)
247{
248	struct unpcb *unp = sotounpcb(so);
249	const struct sockaddr *sa;
250
251	if (unp == NULL)
252		return (EINVAL);
253	*nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
254	UNP_LOCK();
255	if (unp->unp_conn != NULL && unp->unp_conn->unp_addr!= NULL)
256		sa = (struct sockaddr *) unp->unp_conn->unp_addr;
257	else {
258		/*
259		 * XXX: It seems that this test always fails even when
260		 * connection is established.  So, this else clause is
261		 * added as workaround to return PF_LOCAL sockaddr.
262		 */
263		sa = &sun_noname;
264	}
265	bcopy(sa, *nam, sa->sa_len);
266	UNP_UNLOCK();
267	return (0);
268}
269
/*
 * Called after the owner has removed data from the receive buffer:
 * credit the stream peer's send buffer for the space we drained and
 * wake up writers blocked on it.  Datagram sockets never call here.
 * Returns 0, or EINVAL if the pcb is gone.
 */
static int
uipc_rcvd(struct socket *so, int flags)
{
	struct unpcb *unp = sotounpcb(so);
	struct socket *so2;
	u_long newhiwat;

	if (unp == NULL)
		return (EINVAL);
	UNP_LOCK();
	switch (so->so_type) {
	case SOCK_DGRAM:
		panic("uipc_rcvd DGRAM?");
		/*NOTREACHED*/

	case SOCK_STREAM:
		if (unp->unp_conn == NULL)
			break;
		so2 = unp->unp_conn->unp_socket;
		SOCKBUF_LOCK(&so2->so_snd);
		SOCKBUF_LOCK(&so->so_rcv);
		/*
		 * Adjust backpressure on sender
		 * and wakeup any waiting to write.
		 */
		so2->so_snd.sb_mbmax += unp->unp_mbcnt - so->so_rcv.sb_mbcnt;
		unp->unp_mbcnt = so->so_rcv.sb_mbcnt;
		newhiwat = so2->so_snd.sb_hiwat + unp->unp_cc -
		    so->so_rcv.sb_cc;
		(void)chgsbsize(so2->so_cred->cr_uidinfo, &so2->so_snd.sb_hiwat,
		    newhiwat, RLIM_INFINITY);
		unp->unp_cc = so->so_rcv.sb_cc;
		SOCKBUF_UNLOCK(&so->so_rcv);
		/*
		 * NOTE(review): so2->so_snd's sockbuf lock is never
		 * explicitly released here, so sowwakeup_locked() must
		 * drop it -- confirm against sowwakeup_locked().
		 */
		sowwakeup_locked(so2);
		break;

	default:
		panic("uipc_rcvd unknown socktype");
	}
	UNP_UNLOCK();
	return (0);
}
312
313/* pru_rcvoob is EOPNOTSUPP */
314
/*
 * Transmit data (and optionally control messages) by appending
 * directly to the connected peer's receive buffer.  For SOCK_DGRAM
 * an explicit destination may be supplied in 'nam', in which case a
 * temporary connect/disconnect brackets the send.  On success the
 * mbufs are owned by the peer; on failure they are freed here.
 */
static int
uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
	  struct mbuf *control, struct thread *td)
{
	int error = 0;
	struct unpcb *unp = sotounpcb(so);
	struct socket *so2;
	u_long newhiwat;

	if (unp == NULL) {
		error = EINVAL;
		goto release;
	}
	if (flags & PRUS_OOB) {
		error = EOPNOTSUPP;
		goto release;
	}

	/* Convert descriptor numbers in 'control' into file references. */
	if (control != NULL && (error = unp_internalize(&control, td)))
		goto release;

	UNP_LOCK();
	switch (so->so_type) {
	case SOCK_DGRAM:
	{
		const struct sockaddr *from;

		if (nam != NULL) {
			/* An explicit destination needs an unconnected socket. */
			if (unp->unp_conn != NULL) {
				error = EISCONN;
				break;
			}
			error = unp_connect(so, nam, td);
			if (error)
				break;
		} else {
			if (unp->unp_conn == NULL) {
				error = ENOTCONN;
				break;
			}
		}
		so2 = unp->unp_conn->unp_socket;
		if (unp->unp_addr != NULL)
			from = (struct sockaddr *)unp->unp_addr;
		else
			from = &sun_noname;
		SOCKBUF_LOCK(&so2->so_rcv);
		if (sbappendaddr_locked(&so2->so_rcv, from, m, control)) {
			/* sorwakeup_locked() drops so2's receive buffer lock. */
			sorwakeup_locked(so2);
			m = NULL;
			control = NULL;
		} else {
			SOCKBUF_UNLOCK(&so2->so_rcv);
			error = ENOBUFS;
		}
		/* Undo the temporary connect performed above. */
		if (nam != NULL)
			unp_disconnect(unp);
		break;
	}

	case SOCK_STREAM:
		/* Connect if not connected yet. */
		/*
		 * Note: A better implementation would complain
		 * if not equal to the peer's address.
		 */
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			if (nam != NULL) {
				error = unp_connect(so, nam, td);
				if (error)
					break;	/* XXX */
			} else {
				error = ENOTCONN;
				break;
			}
		}

		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
			error = EPIPE;
			break;
		}
		if (unp->unp_conn == NULL)
			panic("uipc_send connected but no connection?");
		so2 = unp->unp_conn->unp_socket;
		SOCKBUF_LOCK(&so2->so_rcv);
		/*
		 * Send to paired receive port, and then reduce
		 * send buffer hiwater marks to maintain backpressure.
		 * Wake up readers.
		 */
		if (control != NULL) {
			if (sbappendcontrol_locked(&so2->so_rcv, m, control))
				control = NULL;
		} else {
			sbappend_locked(&so2->so_rcv, m);
		}
		so->so_snd.sb_mbmax -=
			so2->so_rcv.sb_mbcnt - unp->unp_conn->unp_mbcnt;
		unp->unp_conn->unp_mbcnt = so2->so_rcv.sb_mbcnt;
		newhiwat = so->so_snd.sb_hiwat -
		    (so2->so_rcv.sb_cc - unp->unp_conn->unp_cc);
		(void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_snd.sb_hiwat,
		    newhiwat, RLIM_INFINITY);
		unp->unp_conn->unp_cc = so2->so_rcv.sb_cc;
		/* sorwakeup_locked() drops so2's receive buffer lock. */
		sorwakeup_locked(so2);
		m = NULL;
		break;

	default:
		panic("uipc_send unknown socktype");
	}

	/*
	 * SEND_EOF is equivalent to a SEND followed by
	 * a SHUTDOWN.
	 */
	if (flags & PRUS_EOF) {
		socantsendmore(so);
		unp_shutdown(unp);
	}
	UNP_UNLOCK();

	/* Release file references held by an undelivered control message. */
	if (control != NULL && error != 0)
		unp_dispose(control);

release:
	if (control != NULL)
		m_freem(control);
	if (m != NULL)
		m_freem(m);
	return (error);
}
447
/*
 * fstat(2) support: report the send high-water mark (plus, for a
 * connected stream, the bytes queued at the peer) as st_blksize,
 * and synthesize a stable fake inode number on first use.
 */
static int
uipc_sense(struct socket *so, struct stat *sb)
{
	struct unpcb *unp = sotounpcb(so);
	struct socket *so2;

	if (unp == NULL)
		return (EINVAL);
	UNP_LOCK();
	sb->st_blksize = so->so_snd.sb_hiwat;
	if (so->so_type == SOCK_STREAM && unp->unp_conn != NULL) {
		so2 = unp->unp_conn->unp_socket;
		sb->st_blksize += so2->so_rcv.sb_cc;
	}
	sb->st_dev = NODEV;
	/* Allocate the next fake inode number, skipping the reserved 0. */
	if (unp->unp_ino == 0)
		unp->unp_ino = (++unp_ino == 0) ? ++unp_ino : unp_ino;
	sb->st_ino = unp->unp_ino;
	UNP_UNLOCK();
	return (0);
}
469
470static int
471uipc_shutdown(struct socket *so)
472{
473	struct unpcb *unp = sotounpcb(so);
474
475	if (unp == NULL)
476		return (EINVAL);
477	UNP_LOCK();
478	socantsendmore(so);
479	unp_shutdown(unp);
480	UNP_UNLOCK();
481	return (0);
482}
483
484static int
485uipc_sockaddr(struct socket *so, struct sockaddr **nam)
486{
487	struct unpcb *unp = sotounpcb(so);
488	const struct sockaddr *sa;
489
490	if (unp == NULL)
491		return (EINVAL);
492	*nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
493	UNP_LOCK();
494	if (unp->unp_addr != NULL)
495		sa = (struct sockaddr *) unp->unp_addr;
496	else
497		sa = &sun_noname;
498	bcopy(sa, *nam, sa->sa_len);
499	UNP_UNLOCK();
500	return (0);
501}
502
503struct pr_usrreqs uipc_usrreqs = {
504	uipc_abort, uipc_accept, uipc_attach, uipc_bind, uipc_connect,
505	uipc_connect2, pru_control_notsupp, uipc_detach, uipc_disconnect,
506	uipc_listen, uipc_peeraddr, uipc_rcvd, pru_rcvoob_notsupp,
507	uipc_send, uipc_sense, uipc_shutdown, uipc_sockaddr,
508	sosend, soreceive, sopoll, pru_sosetlabel_null
509};
510
511int
512uipc_ctloutput(so, sopt)
513	struct socket *so;
514	struct sockopt *sopt;
515{
516	struct unpcb *unp = sotounpcb(so);
517	struct xucred xu;
518	int error;
519
520	switch (sopt->sopt_dir) {
521	case SOPT_GET:
522		switch (sopt->sopt_name) {
523		case LOCAL_PEERCRED:
524			error = 0;
525			UNP_LOCK();
526			if (unp->unp_flags & UNP_HAVEPC)
527				xu = unp->unp_peercred;
528			else {
529				if (so->so_type == SOCK_STREAM)
530					error = ENOTCONN;
531				else
532					error = EINVAL;
533			}
534			UNP_UNLOCK();
535			if (error == 0)
536				error = sooptcopyout(sopt, &xu, sizeof(xu));
537			break;
538		default:
539			error = EOPNOTSUPP;
540			break;
541		}
542		break;
543	case SOPT_SET:
544	default:
545		error = EOPNOTSUPP;
546		break;
547	}
548	return (error);
549}
550
551/*
552 * Both send and receive buffers are allocated PIPSIZ bytes of buffering
553 * for stream sockets, although the total for sender and receiver is
554 * actually only PIPSIZ.
555 * Datagram sockets really use the sendspace as the maximum datagram size,
556 * and don't really want to reserve the sendspace.  Their recvspace should
557 * be large enough for at least one max-size datagram plus address.
558 */
#ifndef PIPSIZ
#define	PIPSIZ	8192
#endif
/* Default buffer reservations, tunable via the sysctls below. */
static u_long	unpst_sendspace = PIPSIZ;
static u_long	unpst_recvspace = PIPSIZ;
static u_long	unpdg_sendspace = 2*1024;	/* really max datagram size */
static u_long	unpdg_recvspace = 4*1024;

static int	unp_rights;			/* file descriptors in flight */

/*
 * NOTE(review): the tunables above are u_long but are exported with
 * SYSCTL_INT; SYSCTL_ULONG would match the storage type -- verify.
 */
SYSCTL_DECL(_net_local_stream);
SYSCTL_INT(_net_local_stream, OID_AUTO, sendspace, CTLFLAG_RW,
	   &unpst_sendspace, 0, "");
SYSCTL_INT(_net_local_stream, OID_AUTO, recvspace, CTLFLAG_RW,
	   &unpst_recvspace, 0, "");
SYSCTL_DECL(_net_local_dgram);
SYSCTL_INT(_net_local_dgram, OID_AUTO, maxdgram, CTLFLAG_RW,
	   &unpdg_sendspace, 0, "");
SYSCTL_INT(_net_local_dgram, OID_AUTO, recvspace, CTLFLAG_RW,
	   &unpdg_recvspace, 0, "");
SYSCTL_DECL(_net_local);
SYSCTL_INT(_net_local, OID_AUTO, inflight, CTLFLAG_RD, &unp_rights, 0, "");
581
/*
 * Allocate and initialize a new unpcb for 'so', reserving default
 * buffer space if the caller has not, and place the pcb on the global
 * list for its socket type.  Returns 0 or an errno value.
 */
static int
unp_attach(so)
	struct socket *so;
{
	register struct unpcb *unp;
	int error;

	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
		switch (so->so_type) {

		case SOCK_STREAM:
			error = soreserve(so, unpst_sendspace, unpst_recvspace);
			break;

		case SOCK_DGRAM:
			error = soreserve(so, unpdg_sendspace, unpdg_recvspace);
			break;

		default:
			panic("unp_attach");
		}
		if (error)
			return (error);
	}
	unp = uma_zalloc(unp_zone, M_WAITOK);
	if (unp == NULL)
		return (ENOBUFS);
	bzero(unp, sizeof *unp);
	LIST_INIT(&unp->unp_refs);
	unp->unp_socket = so;

	/* Assign a generation number and link onto the global pcb list. */
	UNP_LOCK();
	unp->unp_gencnt = ++unp_gencnt;
	unp_count++;
	LIST_INSERT_HEAD(so->so_type == SOCK_DGRAM ? &unp_dhead
			 : &unp_shead, unp, unp_link);
	UNP_UNLOCK();

	so->so_pcb = unp;
	return (0);
}
623
/*
 * Tear down a pcb: unlink it from the global list, break any vnode
 * binding and connection, reset datagram peers still referencing us,
 * and free the pcb.  Called with the UNP lock held; drops the lock
 * before returning.
 */
static void
unp_detach(unp)
	register struct unpcb *unp;
{
	struct vnode *vp;

	UNP_LOCK_ASSERT();

	LIST_REMOVE(unp, unp_link);
	unp->unp_gencnt = ++unp_gencnt;
	--unp_count;
	if ((vp = unp->unp_vnode) != NULL) {
		/*
		 * XXXRW: should v_socket be frobbed only while holding
		 * Giant?
		 */
		unp->unp_vnode->v_socket = NULL;
		unp->unp_vnode = NULL;
	}
	if (unp->unp_conn != NULL)
		unp_disconnect(unp);
	/* Reset every datagram socket still connected to us. */
	while (!LIST_EMPTY(&unp->unp_refs)) {
		struct unpcb *ref = LIST_FIRST(&unp->unp_refs);
		unp_drop(ref, ECONNRESET);
	}
	soisdisconnected(unp->unp_socket);
	unp->unp_socket->so_pcb = NULL;
	if (unp_rights) {
		/*
		 * Normally the receive buffer is flushed later,
		 * in sofree, but if our receive buffer holds references
		 * to descriptors that are now garbage, we will dispose
		 * of those descriptor references after the garbage collector
		 * gets them (resulting in a "panic: closef: count < 0").
		 */
		sorflush(unp->unp_socket);
		unp_gc();
	}
	UNP_UNLOCK();
	if (unp->unp_addr != NULL)
		FREE(unp->unp_addr, M_SONAME);
	uma_zfree(unp_zone, unp);
	/* Drop the vnode reference taken at bind time (needs Giant). */
	if (vp) {
		mtx_lock(&Giant);
		vrele(vp);
		mtx_unlock(&Giant);
	}
}
672
/*
 * Bind 'unp' to a pathname: create a VSOCK vnode at the requested
 * path and cross-link it with the socket.  Returns EADDRINUSE if the
 * path already exists and EINVAL if the pcb is already bound or the
 * name is empty.  Acquires Giant around the VFS operations.
 */
static int
unp_bind(unp, nam, td)
	struct unpcb *unp;
	struct sockaddr *nam;
	struct thread *td;
{
	struct sockaddr_un *soun = (struct sockaddr_un *)nam;
	struct vnode *vp;
	struct mount *mp;
	struct vattr vattr;
	int error, namelen;
	struct nameidata nd;
	char *buf;

	/*
	 * XXXRW: This test-and-set of unp_vnode is non-atomic; the
	 * unlocked read here is fine, but the value of unp_vnode needs
	 * to be tested again after we do all the lookups to see if the
	 * pcb is still unbound?
	 */
	if (unp->unp_vnode != NULL)
		return (EINVAL);

	namelen = soun->sun_len - offsetof(struct sockaddr_un, sun_path);
	if (namelen <= 0)
		return (EINVAL);

	/* NUL-terminate the path for the namei() lookup below. */
	buf = malloc(namelen + 1, M_TEMP, M_WAITOK);
	strlcpy(buf, soun->sun_path, namelen + 1);

	mtx_lock(&Giant);
restart:
	mtx_assert(&Giant, MA_OWNED);
	NDINIT(&nd, CREATE, NOFOLLOW | LOCKPARENT | SAVENAME, UIO_SYSSPACE,
	    buf, td);
/* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */
	error = namei(&nd);
	if (error)
		goto done;
	vp = nd.ni_vp;
	if (vp != NULL || vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
		/* Either the name exists or writes are suspended: clean up. */
		NDFREE(&nd, NDF_ONLY_PNBUF);
		if (nd.ni_dvp == vp)
			vrele(nd.ni_dvp);
		else
			vput(nd.ni_dvp);
		if (vp != NULL) {
			vrele(vp);
			error = EADDRINUSE;
			goto done;
		}
		/* Wait for the write suspension to end, then retry. */
		error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH);
		if (error)
			goto done;
		goto restart;
	}
	VATTR_NULL(&vattr);
	vattr.va_type = VSOCK;
	vattr.va_mode = (ACCESSPERMS & ~td->td_proc->p_fd->fd_cmask);
#ifdef MAC
	error = mac_check_vnode_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
	    &vattr);
#endif
	if (error == 0) {
		VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE);
		error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
	}
	NDFREE(&nd, NDF_ONLY_PNBUF);
	vput(nd.ni_dvp);
	if (error)
		goto done;
	vp = nd.ni_vp;
	ASSERT_VOP_LOCKED(vp, "unp_bind");
	soun = (struct sockaddr_un *)sodupsockaddr(nam, M_WAITOK);
	/* Cross-link the new vnode and the socket under the UNP lock. */
	UNP_LOCK();
	vp->v_socket = unp->unp_socket;
	unp->unp_vnode = vp;
	unp->unp_addr = soun;
	UNP_UNLOCK();
	VOP_UNLOCK(vp, 0, td);
	vn_finished_write(mp);
done:
	mtx_unlock(&Giant);
	free(buf, M_TEMP);
	return (error);
}
759
/*
 * Connect 'so' to the socket bound at the pathname in 'nam'.  For
 * connection-oriented sockets a new server-side socket is spawned
 * via sonewconn() and peer credentials are cached on both endpoints.
 * Entered and exited with the UNP lock held, but the lock is dropped
 * around the VFS lookup and the sonewconn() call.
 */
static int
unp_connect(so, nam, td)
	struct socket *so;
	struct sockaddr *nam;
	struct thread *td;
{
	register struct sockaddr_un *soun = (struct sockaddr_un *)nam;
	register struct vnode *vp;
	register struct socket *so2, *so3;
	struct unpcb *unp, *unp2, *unp3;
	int error, len;
	struct nameidata nd;
	char buf[SOCK_MAXADDRLEN];
	struct sockaddr *sa;

	UNP_LOCK_ASSERT();
	unp = sotounpcb(so);

	len = nam->sa_len - offsetof(struct sockaddr_un, sun_path);
	if (len <= 0)
		return (EINVAL);
	strlcpy(buf, soun->sun_path, len + 1);
	/* Drop the UNP lock across the (potentially sleeping) lookup. */
	UNP_UNLOCK();
	sa = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
	mtx_lock(&Giant);
	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, buf, td);
	error = namei(&nd);
	if (error)
		vp = NULL;
	else
		vp = nd.ni_vp;
	ASSERT_VOP_LOCKED(vp, "unp_connect");
	NDFREE(&nd, NDF_ONLY_PNBUF);
	if (error)
		goto bad;

	if (vp->v_type != VSOCK) {
		error = ENOTSOCK;
		goto bad;
	}
	/* Connecting requires write permission on the socket node. */
	error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td);
	if (error)
		goto bad;
	mtx_unlock(&Giant);
	UNP_LOCK();
	/* Re-validate the pcb; it may have been detached while unlocked. */
	unp = sotounpcb(so);
	if (unp == NULL) {
		/*
		 * XXXRW: Temporary debugging printf.
		 */
		printf("unp_connect(): lost race to another thread\n");
		error = EINVAL;
		goto bad2;
	}
	so2 = vp->v_socket;
	if (so2 == NULL) {
		error = ECONNREFUSED;
		goto bad2;
	}
	if (so->so_type != so2->so_type) {
		error = EPROTOTYPE;
		goto bad2;
	}
	if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
		if (so2->so_options & SO_ACCEPTCONN) {
			/*
			 * NB: drop locks here so unp_attach is entered
			 *     w/o locks; this avoids a recursive lock
			 *     of the head and holding sleep locks across
			 *     a (potentially) blocking malloc.
			 */
			UNP_UNLOCK();
			so3 = sonewconn(so2, 0);
			UNP_LOCK();
		} else
			so3 = NULL;
		if (so3 == NULL) {
			error = ECONNREFUSED;
			goto bad2;
		}
		unp = sotounpcb(so);
		unp2 = sotounpcb(so2);
		unp3 = sotounpcb(so3);
		/* Give the new server socket a copy of the listener's name. */
		if (unp2->unp_addr != NULL) {
			bcopy(unp2->unp_addr, sa, unp2->unp_addr->sun_len);
			unp3->unp_addr = (struct sockaddr_un *) sa;
			sa = NULL;
		}
		/*
		 * unp_peercred management:
		 *
		 * The connecter's (client's) credentials are copied
		 * from its process structure at the time of connect()
		 * (which is now).
		 */
		cru2x(td->td_ucred, &unp3->unp_peercred);
		unp3->unp_flags |= UNP_HAVEPC;
		/*
		 * The receiver's (server's) credentials are copied
		 * from the unp_peercred member of socket on which the
		 * former called listen(); unp_listen() cached that
		 * process's credentials at that time so we can use
		 * them now.
		 */
		KASSERT(unp2->unp_flags & UNP_HAVEPCCACHED,
		    ("unp_connect: listener without cached peercred"));
		memcpy(&unp->unp_peercred, &unp2->unp_peercred,
		    sizeof(unp->unp_peercred));
		unp->unp_flags |= UNP_HAVEPC;
#ifdef MAC
		SOCK_LOCK(so);
		mac_set_socket_peer_from_socket(so, so3);
		mac_set_socket_peer_from_socket(so3, so);
		SOCK_UNLOCK(so);
#endif

		/* Complete the hookup against the new server socket. */
		so2 = so3;
	}
	error = unp_connect2(so, so2);
bad2:
	UNP_UNLOCK();
	mtx_lock(&Giant);
bad:
	mtx_assert(&Giant, MA_OWNED);
	if (vp != NULL)
		vput(vp);
	mtx_unlock(&Giant);
	free(sa, M_SONAME);
	UNP_LOCK();
	return (error);
}
891
892static int
893unp_connect2(so, so2)
894	register struct socket *so;
895	register struct socket *so2;
896{
897	register struct unpcb *unp = sotounpcb(so);
898	register struct unpcb *unp2;
899
900	UNP_LOCK_ASSERT();
901
902	if (so2->so_type != so->so_type)
903		return (EPROTOTYPE);
904	unp2 = sotounpcb(so2);
905	unp->unp_conn = unp2;
906	switch (so->so_type) {
907
908	case SOCK_DGRAM:
909		LIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_reflink);
910		soisconnected(so);
911		break;
912
913	case SOCK_STREAM:
914		unp2->unp_conn = unp;
915		soisconnected(so);
916		soisconnected(so2);
917		break;
918
919	default:
920		panic("unp_connect2");
921	}
922	return (0);
923}
924
/*
 * Break the connection between 'unp' and its peer, if any.  Caller
 * holds the UNP lock.
 */
static void
unp_disconnect(unp)
	struct unpcb *unp;
{
	register struct unpcb *unp2 = unp->unp_conn;
	struct socket *so;

	UNP_LOCK_ASSERT();

	if (unp2 == NULL)
		return;
	unp->unp_conn = NULL;
	switch (unp->unp_socket->so_type) {

	case SOCK_DGRAM:
		/* Remove ourselves from the peer's reference list. */
		LIST_REMOVE(unp, unp_reflink);
		so = unp->unp_socket;
		SOCK_LOCK(so);
		so->so_state &= ~SS_ISCONNECTED;
		SOCK_UNLOCK(so);
		break;

	case SOCK_STREAM:
		/* Streams are symmetric: disconnect both endpoints. */
		soisdisconnected(unp->unp_socket);
		unp2->unp_conn = NULL;
		soisdisconnected(unp2->unp_socket);
		break;
	}
}
954
#ifdef notdef
/*
 * Currently compiled out (dead code): abort helper that simply tears
 * down the pcb.
 */
void
unp_abort(unp)
	struct unpcb *unp;
{

	unp_detach(unp);
}
#endif
964
/*
 * unp_pcblist() assumes that UNIX domain socket memory is never reclaimed
 * by the zone (UMA_ZONE_NOFREE), and as such potentially stale pointers
 * are safe to reference.  It first scans the list of struct unpcb's to
 * generate a pointer list, then it rescans its list one entry at a time to
 * externalize and copyout.  It checks the generation number to see if a
 * struct unpcb has been reused, and will skip it if so.
 */
static int
unp_pcblist(SYSCTL_HANDLER_ARGS)
{
	int error, i, n;
	struct unpcb *unp, **unp_list;
	unp_gen_t gencnt;
	struct xunpgen *xug;
	struct unp_head *head;
	struct xunpcb *xu;

	/* arg1 encodes the socket type whose pcb list we report. */
	head = ((intptr_t)arg1 == SOCK_DGRAM ? &unp_dhead : &unp_shead);

	/*
	 * The process of preparing the PCB list is too time-consuming and
	 * resource-intensive to repeat twice on every request.
	 */
	if (req->oldptr == NULL) {
		n = unp_count;
		req->oldidx = 2 * (sizeof *xug)
			+ (n + n/8) * sizeof(struct xunpcb);
		return (0);
	}

	if (req->newptr != NULL)
		return (EPERM);

	/*
	 * OK, now we're committed to doing something.
	 */
	xug = malloc(sizeof(*xug), M_TEMP, M_WAITOK);
	UNP_LOCK();
	gencnt = unp_gencnt;
	n = unp_count;
	UNP_UNLOCK();

	xug->xug_len = sizeof *xug;
	xug->xug_count = n;
	xug->xug_gen = gencnt;
	xug->xug_sogen = so_gencnt;
	error = SYSCTL_OUT(req, xug, sizeof *xug);
	if (error) {
		free(xug, M_TEMP);
		return (error);
	}

	unp_list = malloc(n * sizeof *unp_list, M_TEMP, M_WAITOK);

	/* First pass: collect pcbs visible to the caller's credentials. */
	UNP_LOCK();
	for (unp = LIST_FIRST(head), i = 0; unp && i < n;
	     unp = LIST_NEXT(unp, unp_link)) {
		if (unp->unp_gencnt <= gencnt) {
			if (cr_cansee(req->td->td_ucred,
			    unp->unp_socket->so_cred))
				continue;
			unp_list[i++] = unp;
		}
	}
	UNP_UNLOCK();
	n = i;			/* in case we lost some during malloc */

	/* Second pass: externalize each pcb that has not been reused. */
	error = 0;
	xu = malloc(sizeof(*xu), M_TEMP, M_WAITOK);
	for (i = 0; i < n; i++) {
		unp = unp_list[i];
		if (unp->unp_gencnt <= gencnt) {
			xu->xu_len = sizeof *xu;
			xu->xu_unpp = unp;
			/*
			 * XXX - need more locking here to protect against
			 * connect/disconnect races for SMP.
			 */
			if (unp->unp_addr != NULL)
				bcopy(unp->unp_addr, &xu->xu_addr,
				      unp->unp_addr->sun_len);
			if (unp->unp_conn != NULL &&
			    unp->unp_conn->unp_addr != NULL)
				bcopy(unp->unp_conn->unp_addr,
				      &xu->xu_caddr,
				      unp->unp_conn->unp_addr->sun_len);
			bcopy(unp, &xu->xu_unp, sizeof *unp);
			sotoxsocket(unp->unp_socket, &xu->xu_socket);
			error = SYSCTL_OUT(req, xu, sizeof *xu);
		}
	}
	free(xu, M_TEMP);
	if (!error) {
		/*
		 * Give the user an updated idea of our state.
		 * If the generation differs from what we told
		 * her before, she knows that something happened
		 * while we were processing this request, and it
		 * might be necessary to retry.
		 */
		xug->xug_gen = unp_gencnt;
		xug->xug_sogen = so_gencnt;
		xug->xug_count = unp_count;
		error = SYSCTL_OUT(req, xug, sizeof *xug);
	}
	free(unp_list, M_TEMP);
	free(xug, M_TEMP);
	return (error);
}
1075
1076SYSCTL_PROC(_net_local_dgram, OID_AUTO, pcblist, CTLFLAG_RD,
1077	    (caddr_t)(long)SOCK_DGRAM, 0, unp_pcblist, "S,xunpcb",
1078	    "List of active local datagram sockets");
1079SYSCTL_PROC(_net_local_stream, OID_AUTO, pcblist, CTLFLAG_RD,
1080	    (caddr_t)(long)SOCK_STREAM, 0, unp_pcblist, "S,xunpcb",
1081	    "List of active local stream sockets");
1082
1083static void
1084unp_shutdown(unp)
1085	struct unpcb *unp;
1086{
1087	struct socket *so;
1088
1089	UNP_LOCK_ASSERT();
1090
1091	if (unp->unp_socket->so_type == SOCK_STREAM && unp->unp_conn &&
1092	    (so = unp->unp_conn->unp_socket))
1093		socantrcvmore(so);
1094}
1095
1096static void
1097unp_drop(unp, errno)
1098	struct unpcb *unp;
1099	int errno;
1100{
1101	struct socket *so = unp->unp_socket;
1102
1103	UNP_LOCK_ASSERT();
1104
1105	so->so_error = errno;
1106	unp_disconnect(unp);
1107}
1108
#ifdef notdef
/* Currently compiled out (dead code): mbuf drain hook, a no-op. */
void
unp_drain()
{

}
#endif
1116
1117static void
1118unp_freerights(rp, fdcount)
1119	struct file **rp;
1120	int fdcount;
1121{
1122	int i;
1123	struct file *fp;
1124
1125	for (i = 0; i < fdcount; i++) {
1126		fp = *rp;
1127		/*
1128		 * zero the pointer before calling
1129		 * unp_discard since it may end up
1130		 * in unp_gc()..
1131		 */
1132		*rp++ = 0;
1133		unp_discard(fp);
1134	}
1135}
1136
/*
 * Externalize a control message chain arriving from a socket:
 * SCM_RIGHTS payloads are converted from in-kernel 'struct file *'
 * references into descriptors installed in the receiving process,
 * while all other control messages are copied through unchanged.
 * A NULL 'controlp' means the messages are being discarded, so any
 * rights are released instead.  The 'control' chain is always
 * consumed.  Returns 0 or an errno value.
 */
int
unp_externalize(control, controlp)
	struct mbuf *control, **controlp;
{
	struct thread *td = curthread;		/* XXX */
	struct cmsghdr *cm = mtod(control, struct cmsghdr *);
	int i;
	int *fdp;
	struct file **rp;
	struct file *fp;
	void *data;
	socklen_t clen = control->m_len, datalen;
	int error, newfds;
	int f;
	u_int newlen;

	error = 0;
	if (controlp != NULL) /* controlp == NULL => free control messages */
		*controlp = NULL;

	while (cm != NULL) {
		/* Reject truncated or inconsistent control headers. */
		if (sizeof(*cm) > clen || cm->cmsg_len > clen) {
			error = EINVAL;
			break;
		}

		data = CMSG_DATA(cm);
		datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data;

		if (cm->cmsg_level == SOL_SOCKET
		    && cm->cmsg_type == SCM_RIGHTS) {
			newfds = datalen / sizeof(struct file *);
			rp = data;

			/* If we're not outputting the descriptors free them. */
			if (error || controlp == NULL) {
				unp_freerights(rp, newfds);
				goto next;
			}
			FILEDESC_LOCK(td->td_proc->p_fd);
			/* if the new FD's will not fit free them.  */
			if (!fdavail(td, newfds)) {
				FILEDESC_UNLOCK(td->td_proc->p_fd);
				error = EMSGSIZE;
				unp_freerights(rp, newfds);
				goto next;
			}
			/*
			 * now change each pointer to an fd in the global
			 * table to an integer that is the index to the
			 * local fd table entry that we set up to point
			 * to the global one we are transferring.
			 */
			newlen = newfds * sizeof(int);
			*controlp = sbcreatecontrol(NULL, newlen,
			    SCM_RIGHTS, SOL_SOCKET);
			if (*controlp == NULL) {
				FILEDESC_UNLOCK(td->td_proc->p_fd);
				error = E2BIG;
				unp_freerights(rp, newfds);
				goto next;
			}

			fdp = (int *)
			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
			for (i = 0; i < newfds; i++) {
				if (fdalloc(td, 0, &f))
					panic("unp_externalize fdalloc failed");
				fp = *rp++;
				td->td_proc->p_fd->fd_ofiles[f] = fp;
				/* The file is no longer "in flight". */
				FILE_LOCK(fp);
				fp->f_msgcount--;
				FILE_UNLOCK(fp);
				unp_rights--;
				*fdp++ = f;
			}
			FILEDESC_UNLOCK(td->td_proc->p_fd);
		} else { /* We can just copy anything else across */
			if (error || controlp == NULL)
				goto next;
			*controlp = sbcreatecontrol(NULL, datalen,
			    cm->cmsg_type, cm->cmsg_level);
			if (*controlp == NULL) {
				error = ENOBUFS;
				goto next;
			}
			bcopy(data,
			    CMSG_DATA(mtod(*controlp, struct cmsghdr *)),
			    datalen);
		}

		controlp = &(*controlp)->m_next;

next:
		/* Advance to the next control message, if any remains. */
		if (CMSG_SPACE(datalen) < clen) {
			clen -= CMSG_SPACE(datalen);
			cm = (struct cmsghdr *)
			    ((caddr_t)cm + CMSG_SPACE(datalen));
		} else {
			clen = 0;
			cm = NULL;
		}
	}

	m_freem(control);

	return (error);
}
1245
1246void
1247unp_init(void)
1248{
1249	unp_zone = uma_zcreate("unpcb", sizeof(struct unpcb), NULL, NULL,
1250	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
1251	if (unp_zone == NULL)
1252		panic("unp_init");
1253	uma_zone_set_max(unp_zone, nmbclusters);
1254	LIST_INIT(&unp_dhead);
1255	LIST_INIT(&unp_shead);
1256
1257	UNP_LOCK_INIT();
1258}
1259
/*
 * Convert a chain of control messages from the user-visible layout to
 * the in-kernel representation before the mbuf is queued on the
 * receiving socket:
 *
 *   SCM_CREDS:     rewritten as a struct cmsgcred carrying the sending
 *                  thread's pid, real/effective uids, real gid, and up
 *                  to CMGROUP_MAX supplementary groups.
 *   SCM_RIGHTS:    the array of descriptor numbers is rewritten as an
 *                  array of struct file pointers; each file gains a
 *                  reference (f_count) and an in-flight count
 *                  (f_msgcount), and the global unp_rights counter is
 *                  incremented.
 *   SCM_TIMESTAMP: rewritten as a struct timeval holding the current
 *                  time.
 *
 * The input mbuf chain is always consumed (m_freem at "out"), whether
 * or not an error occurs.  On error the partially-built output chain
 * is left in *controlp -- presumably the caller disposes of it; TODO
 * confirm against callers.
 */
static int
unp_internalize(controlp, td)
	struct mbuf **controlp;
	struct thread *td;
{
	struct mbuf *control = *controlp;
	struct proc *p = td->td_proc;
	struct filedesc *fdescp = p->p_fd;
	struct cmsghdr *cm = mtod(control, struct cmsghdr *);
	struct cmsgcred *cmcred;
	struct file **rp;
	struct file *fp;
	struct timeval *tv;
	int i, fd, *fdp;
	void *data;
	socklen_t clen = control->m_len, datalen;	/* bytes left to parse */
	int error, oldfds;
	u_int newlen;

	error = 0;
	*controlp = NULL;

	/* Walk every cmsghdr packed into the (single) control mbuf. */
	while (cm != NULL) {
		/*
		 * Reject truncated headers, non-socket-level messages,
		 * and lengths that would overrun the mbuf data.
		 */
		if (sizeof(*cm) > clen || cm->cmsg_level != SOL_SOCKET
		    || cm->cmsg_len > clen) {
			error = EINVAL;
			goto out;
		}

		/* Payload pointer and length for this message. */
		data = CMSG_DATA(cm);
		datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data;

		switch (cm->cmsg_type) {
		/*
		 * Fill in credential information.
		 */
		case SCM_CREDS:
			*controlp = sbcreatecontrol(NULL, sizeof(*cmcred),
			    SCM_CREDS, SOL_SOCKET);
			if (*controlp == NULL) {
				error = ENOBUFS;
				goto out;
			}

			cmcred = (struct cmsgcred *)
			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
			cmcred->cmcred_pid = p->p_pid;
			cmcred->cmcred_uid = td->td_ucred->cr_ruid;
			cmcred->cmcred_gid = td->td_ucred->cr_rgid;
			cmcred->cmcred_euid = td->td_ucred->cr_uid;
			/* Silently truncates to CMGROUP_MAX groups. */
			cmcred->cmcred_ngroups = MIN(td->td_ucred->cr_ngroups,
							CMGROUP_MAX);
			for (i = 0; i < cmcred->cmcred_ngroups; i++)
				cmcred->cmcred_groups[i] =
				    td->td_ucred->cr_groups[i];
			break;

		case SCM_RIGHTS:
			oldfds = datalen / sizeof (int);
			/*
			 * Check that all the FDs passed in refer to legal
			 * and passable files.  If not, reject the entire
			 * operation.
			 */
			fdp = data;
			FILEDESC_LOCK(fdescp);
			for (i = 0; i < oldfds; i++) {
				fd = *fdp++;
				/* unsigned cast also rejects negative fds */
				if ((unsigned)fd >= fdescp->fd_nfiles ||
				    fdescp->fd_ofiles[fd] == NULL) {
					FILEDESC_UNLOCK(fdescp);
					error = EBADF;
					goto out;
				}
				fp = fdescp->fd_ofiles[fd];
				if (!(fp->f_ops->fo_flags & DFLAG_PASSABLE)) {
					FILEDESC_UNLOCK(fdescp);
					error = EOPNOTSUPP;
					goto out;
				}

			}
			/*
			 * Now replace the integer FDs with pointers to
			 * the associated global file table entry..
			 */
			newlen = oldfds * sizeof(struct file *);
			*controlp = sbcreatecontrol(NULL, newlen,
			    SCM_RIGHTS, SOL_SOCKET);
			if (*controlp == NULL) {
				FILEDESC_UNLOCK(fdescp);
				error = E2BIG;
				goto out;
			}

			fdp = data;
			rp = (struct file **)
			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
			for (i = 0; i < oldfds; i++) {
				fp = fdescp->fd_ofiles[*fdp++];
				*rp++ = fp;
				/*
				 * Hold the file and account for it being
				 * in flight; unp_gc() relies on these
				 * counts to find unreachable descriptors.
				 */
				FILE_LOCK(fp);
				fp->f_count++;
				fp->f_msgcount++;
				FILE_UNLOCK(fp);
				unp_rights++;
			}
			FILEDESC_UNLOCK(fdescp);
			break;

		case SCM_TIMESTAMP:
			*controlp = sbcreatecontrol(NULL, sizeof(*tv),
			    SCM_TIMESTAMP, SOL_SOCKET);
			if (*controlp == NULL) {
				error = ENOBUFS;
				goto out;
			}
			tv = (struct timeval *)
			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
			microtime(tv);
			break;

		default:
			error = EINVAL;
			goto out;
		}

		/* Link the converted message and advance the output tail. */
		controlp = &(*controlp)->m_next;

		/* Step to the next (padded) cmsghdr, if any remains. */
		if (CMSG_SPACE(datalen) < clen) {
			clen -= CMSG_SPACE(datalen);
			cm = (struct cmsghdr *)
			    ((caddr_t)cm + CMSG_SPACE(datalen));
		} else {
			clen = 0;
			cm = NULL;
		}
	}

out:
	m_freem(control);

	return (error);
}
1403
1404static int	unp_defer, unp_gcing;
1405
/*
 * Garbage-collect file descriptors that are referenced only by
 * in-flight SCM_RIGHTS messages.  Descriptors passed over sockets that
 * were themselves closed can form reference cycles that ordinary
 * reference counting never frees; this mark-and-sweep pass finds and
 * destroys them.
 */
static void
unp_gc()
{
	register struct file *fp, *nextfp;
	register struct socket *so;
	struct file **extra_ref, **fpp;
	int nunref, i;
	int nfiles_snap;
	int nfiles_slack = 20;

	UNP_LOCK_ASSERT();

	/* Only one collection at a time; nested invocations just return. */
	if (unp_gcing)
		return;
	unp_gcing = 1;
	unp_defer = 0;
	/*
	 * Before going through all this, set all FDs to
	 * be NOT deferred and NOT externally accessible
	 */
	/*
	 * XXXRW: Acquiring a sleep lock while holding UNP
	 * mutex cannot be a good thing.
	 */
	sx_slock(&filelist_lock);
	LIST_FOREACH(fp, &filehead, f_list)
		fp->f_gcflag &= ~(FMARK|FDEFER);
	do {
		LIST_FOREACH(fp, &filehead, f_list) {
			FILE_LOCK(fp);
			/*
			 * If the file is not open, skip it
			 */
			if (fp->f_count == 0) {
				FILE_UNLOCK(fp);
				continue;
			}
			/*
			 * If we already marked it as 'defer'  in a
			 * previous pass, then try process it this time
			 * and un-mark it
			 */
			if (fp->f_gcflag & FDEFER) {
				fp->f_gcflag &= ~FDEFER;
				unp_defer--;
			} else {
				/*
				 * if it's not deferred, then check if it's
				 * already marked.. if so skip it
				 */
				if (fp->f_gcflag & FMARK) {
					FILE_UNLOCK(fp);
					continue;
				}
				/*
				 * If all references are from messages
				 * in transit, then skip it. it's not
				 * externally accessible.
				 */
				if (fp->f_count == fp->f_msgcount) {
					FILE_UNLOCK(fp);
					continue;
				}
				/*
				 * If it got this far then it must be
				 * externally accessible.
				 */
				fp->f_gcflag |= FMARK;
			}
			/*
			 * either it was deferred, or it is externally
			 * accessible and not already marked so.
			 * Now check if it is possibly one of OUR sockets.
			 */
			if (fp->f_type != DTYPE_SOCKET ||
			    (so = fp->f_data) == NULL) {
				FILE_UNLOCK(fp);
				continue;
			}
			FILE_UNLOCK(fp);
			if (so->so_proto->pr_domain != &localdomain ||
			    (so->so_proto->pr_flags&PR_RIGHTS) == 0)
				continue;
#ifdef notdef
			if (so->so_rcv.sb_flags & SB_LOCK) {
				/*
				 * This is problematical; it's not clear
				 * we need to wait for the sockbuf to be
				 * unlocked (on a uniprocessor, at least),
				 * and it's also not clear what to do
				 * if sbwait returns an error due to receipt
				 * of a signal.  If sbwait does return
				 * an error, we'll go into an infinite
				 * loop.  Delete all of this for now.
				 */
				(void) sbwait(&so->so_rcv);
				goto restart;
			}
#endif
			/*
			 * So, Ok, it's one of our sockets and it IS externally
			 * accessible (or was deferred). Now we look
			 * to see if we hold any file descriptors in its
			 * message buffers. Follow those links and mark them
			 * as accessible too.
			 */
			SOCKBUF_LOCK(&so->so_rcv);
			unp_scan(so->so_rcv.sb_mb, unp_mark);
			SOCKBUF_UNLOCK(&so->so_rcv);
		}
	} while (unp_defer);
	sx_sunlock(&filelist_lock);
	/*
	 * We grab an extra reference to each of the file table entries
	 * that are not otherwise accessible and then free the rights
	 * that are stored in messages on them.
	 *
	 * The bug in the original code is a little tricky, so I'll describe
	 * what's wrong with it here.
	 *
	 * It is incorrect to simply unp_discard each entry for f_msgcount
	 * times -- consider the case of sockets A and B that contain
	 * references to each other.  On a last close of some other socket,
	 * we trigger a gc since the number of outstanding rights (unp_rights)
	 * is non-zero.  If during the sweep phase the gc code unp_discards,
	 * we end up doing a (full) closef on the descriptor.  A closef on A
	 * results in the following chain.  Closef calls soo_close, which
	 * calls soclose.   Soclose calls first (through the switch
	 * uipc_usrreq) unp_detach, which re-invokes unp_gc.  Unp_gc simply
	 * returns because the previous instance had set unp_gcing, and
	 * we return all the way back to soclose, which marks the socket
	 * with SS_NOFDREF, and then calls sofree.  Sofree calls sorflush
	 * to free up the rights that are queued in messages on the socket A,
	 * i.e., the reference on B.  The sorflush calls via the dom_dispose
	 * switch unp_dispose, which unp_scans with unp_discard.  This second
	 * instance of unp_discard just calls closef on B.
	 *
	 * Well, a similar chain occurs on B, resulting in a sorflush on B,
	 * which results in another closef on A.  Unfortunately, A is already
	 * being closed, and the descriptor has already been marked with
	 * SS_NOFDREF, and soclose panics at this point.
	 *
	 * Here, we first take an extra reference to each inaccessible
	 * descriptor.  Then, we call sorflush ourself, since we know
	 * it is a Unix domain socket anyhow.  After we destroy all the
	 * rights carried in messages, we do a last closef to get rid
	 * of our extra reference.  This is the last close, and the
	 * unp_detach etc will shut down the socket.
	 *
	 * 91/09/19, bsy@cs.cmu.edu
	 */
again:
	/*
	 * Size a snapshot array from the current open-file count plus
	 * some slack; if files are opened while we drop the lock the
	 * recheck below grows the slack and retries.
	 */
	nfiles_snap = nfiles + nfiles_slack;	/* some slack */
	extra_ref = malloc(nfiles_snap * sizeof(struct file *), M_TEMP,
	    M_WAITOK);
	sx_slock(&filelist_lock);
	if (nfiles_snap < nfiles) {
		sx_sunlock(&filelist_lock);
		free(extra_ref, M_TEMP);
		nfiles_slack += 20;
		goto again;
	}
	for (nunref = 0, fp = LIST_FIRST(&filehead), fpp = extra_ref;
	    fp != NULL; fp = nextfp) {
		nextfp = LIST_NEXT(fp, f_list);
		FILE_LOCK(fp);
		/*
		 * If it's not open, skip it
		 */
		if (fp->f_count == 0) {
			FILE_UNLOCK(fp);
			continue;
		}
		/*
		 * If all refs are from msgs, and it's not marked accessible
		 * then it must be referenced from some unreachable cycle
		 * of (shut-down) FDs, so include it in our
		 * list of FDs to remove
		 */
		if (fp->f_count == fp->f_msgcount && !(fp->f_gcflag & FMARK)) {
			*fpp++ = fp;
			nunref++;
			fp->f_count++;
		}
		FILE_UNLOCK(fp);
	}
	sx_sunlock(&filelist_lock);
	/*
	 * for each FD on our hit list, do the following two things:
	 * first flush the queued rights off every unreachable socket,
	 * then drop our extra reference (the true last close).
	 */
	for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) {
		struct file *tfp = *fpp;
		FILE_LOCK(tfp);
		if (tfp->f_type == DTYPE_SOCKET &&
		    tfp->f_data != NULL) {
			FILE_UNLOCK(tfp);
			sorflush(tfp->f_data);
		} else {
			FILE_UNLOCK(tfp);
		}
	}
	for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp)
		closef(*fpp, (struct thread *) NULL);
	free(extra_ref, M_TEMP);
	unp_gcing = 0;
}
1612
1613void
1614unp_dispose(m)
1615	struct mbuf *m;
1616{
1617
1618	if (m)
1619		unp_scan(m, unp_discard);
1620}
1621
/*
 * Prepare a unpcb for listening: cache the listener's credentials in
 * unp_peercred so later accepted connections can report a peer cred,
 * and flag the cache as valid (UNP_HAVEPCCACHED).  Always succeeds.
 */
static int
unp_listen(unp, td)
	struct unpcb *unp;
	struct thread *td;
{
	UNP_LOCK_ASSERT();

	/*
	 * XXXRW: Why populate the local peer cred with our own credential?
	 */
	cru2x(td->td_ucred, &unp->unp_peercred);
	unp->unp_flags |= UNP_HAVEPCCACHED;
	return (0);
}
1636
1637static void
1638unp_scan(m0, op)
1639	register struct mbuf *m0;
1640	void (*op)(struct file *);
1641{
1642	struct mbuf *m;
1643	struct file **rp;
1644	struct cmsghdr *cm;
1645	void *data;
1646	int i;
1647	socklen_t clen, datalen;
1648	int qfds;
1649
1650	while (m0 != NULL) {
1651		for (m = m0; m; m = m->m_next) {
1652			if (m->m_type != MT_CONTROL)
1653				continue;
1654
1655			cm = mtod(m, struct cmsghdr *);
1656			clen = m->m_len;
1657
1658			while (cm != NULL) {
1659				if (sizeof(*cm) > clen || cm->cmsg_len > clen)
1660					break;
1661
1662				data = CMSG_DATA(cm);
1663				datalen = (caddr_t)cm + cm->cmsg_len
1664				    - (caddr_t)data;
1665
1666				if (cm->cmsg_level == SOL_SOCKET &&
1667				    cm->cmsg_type == SCM_RIGHTS) {
1668					qfds = datalen / sizeof (struct file *);
1669					rp = data;
1670					for (i = 0; i < qfds; i++)
1671						(*op)(*rp++);
1672				}
1673
1674				if (CMSG_SPACE(datalen) < clen) {
1675					clen -= CMSG_SPACE(datalen);
1676					cm = (struct cmsghdr *)
1677					    ((caddr_t)cm + CMSG_SPACE(datalen));
1678				} else {
1679					clen = 0;
1680					cm = NULL;
1681				}
1682			}
1683		}
1684		m0 = m0->m_act;
1685	}
1686}
1687
1688static void
1689unp_mark(fp)
1690	struct file *fp;
1691{
1692	if (fp->f_gcflag & FMARK)
1693		return;
1694	unp_defer++;
1695	fp->f_gcflag |= (FMARK|FDEFER);
1696}
1697
1698static void
1699unp_discard(fp)
1700	struct file *fp;
1701{
1702	FILE_LOCK(fp);
1703	fp->f_msgcount--;
1704	unp_rights--;
1705	FILE_UNLOCK(fp);
1706	(void) closef(fp, (struct thread *)NULL);
1707}
1708