uipc_usrreq.c revision 21673
1/*
2 * Copyright (c) 1982, 1986, 1989, 1991, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 3. All advertising materials mentioning features or use of this software
14 *    must display the following acknowledgement:
15 *	This product includes software developed by the University of
16 *	California, Berkeley and its contributors.
17 * 4. Neither the name of the University nor the names of its contributors
18 *    may be used to endorse or promote products derived from this software
19 *    without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 *
33 *	From: @(#)uipc_usrreq.c	8.3 (Berkeley) 1/4/94
34 *	$FreeBSD: head/sys/kern/uipc_usrreq.c 21673 1997-01-14 07:20:47Z jkh $
35 */
36
37#include <sys/param.h>
38#include <sys/queue.h>
39#include <sys/systm.h>
40#include <sys/proc.h>
41#include <sys/filedesc.h>
42#include <sys/domain.h>
43#include <sys/protosw.h>
44#include <sys/stat.h>
45#include <sys/socket.h>
46#include <sys/socketvar.h>
47#include <sys/unpcb.h>
48#include <sys/un.h>
49#include <sys/namei.h>
50#include <sys/vnode.h>
51#include <sys/file.h>
52#include <sys/stat.h>
53#include <sys/mbuf.h>
54
55/*
56 * Unix communications domain.
57 *
58 * TODO:
59 *	SEQPACKET, RDM
60 *	rethink name space problems
61 *	need a proper out-of-band
62 */
/*
 * Placeholder address: only a length and the AF_LOCAL family.  It is
 * used by uipc_usrreq() wherever a socket has no bound name to report.
 */
static struct	sockaddr sun_noname = { sizeof(sun_noname), AF_LOCAL };
static ino_t	unp_ino;		/* prototype for fake inode numbers */

/*
 * Forward declarations of the file-local helpers defined below.
 */
static int     unp_attach __P((struct socket *));
static void    unp_detach __P((struct unpcb *));
static int     unp_bind __P((struct unpcb *,struct mbuf *, struct proc *));
static int     unp_connect __P((struct socket *,struct mbuf *, struct proc *));
static void    unp_disconnect __P((struct unpcb *));
static void    unp_shutdown __P((struct unpcb *));
static void    unp_drop __P((struct unpcb *, int));
static void    unp_gc __P((void));
static void    unp_scan __P((struct mbuf *, void (*)(struct file *)));
static void    unp_mark __P((struct file *));
static void    unp_discard __P((struct file *));
static int     unp_internalize __P((struct mbuf *, struct proc *));
78
79
80/*ARGSUSED*/
81int
82uipc_usrreq(so, req, m, nam, control)
83	struct socket *so;
84	int req;
85	struct mbuf *m, *nam, *control;
86{
87	struct unpcb *unp = sotounpcb(so);
88	register struct socket *so2;
89	register int error = 0;
90	struct proc *p = curproc;	/* XXX */
91
92	if (req == PRU_CONTROL)
93		return (EOPNOTSUPP);
94	if (req != PRU_SEND && control && control->m_len) {
95		error = EOPNOTSUPP;
96		goto release;
97	}
98	if (unp == 0 && req != PRU_ATTACH) {
99		error = EINVAL;
100		goto release;
101	}
102	switch (req) {
103
104	case PRU_ATTACH:
105		if (unp) {
106			error = EISCONN;
107			break;
108		}
109		error = unp_attach(so);
110		break;
111
112	case PRU_DETACH:
113		unp_detach(unp);
114		break;
115
116	case PRU_BIND:
117		error = unp_bind(unp, nam, p);
118		break;
119
120	case PRU_LISTEN:
121		if (unp->unp_vnode == 0)
122			error = EINVAL;
123		break;
124
125	case PRU_CONNECT:
126		error = unp_connect(so, nam, p);
127		break;
128
129	case PRU_CONNECT2:
130		error = unp_connect2(so, (struct socket *)nam);
131		break;
132
133	case PRU_DISCONNECT:
134		unp_disconnect(unp);
135		break;
136
137	case PRU_ACCEPT:
138		/*
139		 * Pass back name of connected socket,
140		 * if it was bound and we are still connected
141		 * (our peer may have closed already!).
142		 */
143		if (unp->unp_conn && unp->unp_conn->unp_addr) {
144			nam->m_len = unp->unp_conn->unp_addr->m_len;
145			bcopy(mtod(unp->unp_conn->unp_addr, caddr_t),
146			    mtod(nam, caddr_t), (unsigned)nam->m_len);
147		} else {
148			nam->m_len = sizeof(sun_noname);
149			*(mtod(nam, struct sockaddr *)) = sun_noname;
150		}
151		break;
152
153	case PRU_SHUTDOWN:
154		socantsendmore(so);
155		unp_shutdown(unp);
156		break;
157
158	case PRU_RCVD:
159		switch (so->so_type) {
160
161		case SOCK_DGRAM:
162			panic("uipc 1");
163			/*NOTREACHED*/
164
165		case SOCK_STREAM:
166#define	rcv (&so->so_rcv)
167#define snd (&so2->so_snd)
168			if (unp->unp_conn == 0)
169				break;
170			so2 = unp->unp_conn->unp_socket;
171			/*
172			 * Adjust backpressure on sender
173			 * and wakeup any waiting to write.
174			 */
175			snd->sb_mbmax += unp->unp_mbcnt - rcv->sb_mbcnt;
176			unp->unp_mbcnt = rcv->sb_mbcnt;
177			snd->sb_hiwat += unp->unp_cc - rcv->sb_cc;
178			unp->unp_cc = rcv->sb_cc;
179			sowwakeup(so2);
180#undef snd
181#undef rcv
182			break;
183
184		default:
185			panic("uipc 2");
186		}
187		break;
188
189	case PRU_SEND:
190	case PRU_SEND_EOF:
191		if (control && (error = unp_internalize(control, p)))
192			break;
193		switch (so->so_type) {
194
195		case SOCK_DGRAM: {
196			struct sockaddr *from;
197
198			if (nam) {
199				if (unp->unp_conn) {
200					error = EISCONN;
201					break;
202				}
203				error = unp_connect(so, nam, p);
204				if (error)
205					break;
206			} else {
207				if (unp->unp_conn == 0) {
208					error = ENOTCONN;
209					break;
210				}
211			}
212			so2 = unp->unp_conn->unp_socket;
213			if (unp->unp_addr)
214				from = mtod(unp->unp_addr, struct sockaddr *);
215			else
216				from = &sun_noname;
217			if (sbappendaddr(&so2->so_rcv, from, m, control)) {
218				sorwakeup(so2);
219				m = 0;
220				control = 0;
221			} else
222				error = ENOBUFS;
223			if (nam)
224				unp_disconnect(unp);
225			break;
226		}
227
228		case SOCK_STREAM:
229#define	rcv (&so2->so_rcv)
230#define	snd (&so->so_snd)
231			/* Connect if not connected yet. */
232			/*
233			 * Note: A better implementation would complain
234			 * if not equal to the peer's address.
235			 */
236			if ((so->so_state & SS_ISCONNECTED) == 0) {
237				if (nam) {
238		    			error = unp_connect(so, nam, p);
239					if (error)
240						break;	/* XXX */
241				} else {
242					error = ENOTCONN;
243					break;
244				}
245			}
246
247			if (so->so_state & SS_CANTSENDMORE) {
248				error = EPIPE;
249				break;
250			}
251			if (unp->unp_conn == 0)
252				panic("uipc 3");
253			so2 = unp->unp_conn->unp_socket;
254			/*
255			 * Send to paired receive port, and then reduce
256			 * send buffer hiwater marks to maintain backpressure.
257			 * Wake up readers.
258			 */
259			if (control) {
260				if (sbappendcontrol(rcv, m, control))
261					control = 0;
262			} else
263				sbappend(rcv, m);
264			snd->sb_mbmax -=
265			    rcv->sb_mbcnt - unp->unp_conn->unp_mbcnt;
266			unp->unp_conn->unp_mbcnt = rcv->sb_mbcnt;
267			snd->sb_hiwat -= rcv->sb_cc - unp->unp_conn->unp_cc;
268			unp->unp_conn->unp_cc = rcv->sb_cc;
269			sorwakeup(so2);
270			m = 0;
271#undef snd
272#undef rcv
273			break;
274
275		default:
276			panic("uipc 4");
277		}
278		/*
279		 * SEND_EOF is equivalent to a SEND followed by
280		 * a SHUTDOWN.
281		 */
282		if (req == PRU_SEND_EOF) {
283			socantsendmore(so);
284			unp_shutdown(unp);
285		}
286		break;
287
288	case PRU_ABORT:
289		unp_drop(unp, ECONNABORTED);
290		break;
291
292	case PRU_SENSE:
293		((struct stat *) m)->st_blksize = so->so_snd.sb_hiwat;
294		if (so->so_type == SOCK_STREAM && unp->unp_conn != 0) {
295			so2 = unp->unp_conn->unp_socket;
296			((struct stat *) m)->st_blksize += so2->so_rcv.sb_cc;
297		}
298		((struct stat *) m)->st_dev = NODEV;
299		if (unp->unp_ino == 0)
300			unp->unp_ino = unp_ino++;
301		((struct stat *) m)->st_ino = unp->unp_ino;
302		return (0);
303
304	case PRU_RCVOOB:
305		return (EOPNOTSUPP);
306
307	case PRU_SENDOOB:
308		error = EOPNOTSUPP;
309		break;
310
311	case PRU_SOCKADDR:
312		if (unp->unp_addr) {
313			nam->m_len = unp->unp_addr->m_len;
314			bcopy(mtod(unp->unp_addr, caddr_t),
315			    mtod(nam, caddr_t), (unsigned)nam->m_len);
316		} else
317			nam->m_len = 0;
318		break;
319
320	case PRU_PEERADDR:
321		if (unp->unp_conn && unp->unp_conn->unp_addr) {
322			nam->m_len = unp->unp_conn->unp_addr->m_len;
323			bcopy(mtod(unp->unp_conn->unp_addr, caddr_t),
324			    mtod(nam, caddr_t), (unsigned)nam->m_len);
325		} else
326			nam->m_len = 0;
327		break;
328
329	case PRU_SLOWTIMO:
330		break;
331
332	default:
333		panic("piusrreq");
334	}
335release:
336	if (control)
337		m_freem(control);
338	if (m)
339		m_freem(m);
340	return (error);
341}
342
/*
 * Both send and receive buffers are allocated PIPSIZ bytes of buffering
 * for stream sockets, although the total for sender and receiver is
 * actually only PIPSIZ.
 * Datagram sockets really use the sendspace as the maximum datagram size,
 * and don't really want to reserve the sendspace.  Their recvspace should
 * be large enough for at least one max-size datagram plus address.
 *
 * The four values below are the defaults unp_attach() passes to
 * soreserve().  PIPSIZ is only defined here if nothing defined it earlier.
 */
#ifndef PIPSIZ
#define	PIPSIZ	8192
#endif
static u_long	unpst_sendspace = PIPSIZ;
static u_long	unpst_recvspace = PIPSIZ;
static u_long	unpdg_sendspace = 2*1024;	/* really max datagram size */
static u_long	unpdg_recvspace = 4*1024;

/*
 * Incremented by unp_internalize() for each descriptor placed in a
 * message and decremented by unp_externalize()/unp_discard().
 */
static int	unp_rights;			/* file descriptors in flight */
360
361static int
362unp_attach(so)
363	struct socket *so;
364{
365	register struct mbuf *m;
366	register struct unpcb *unp;
367	int error;
368
369	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
370		switch (so->so_type) {
371
372		case SOCK_STREAM:
373			error = soreserve(so, unpst_sendspace, unpst_recvspace);
374			break;
375
376		case SOCK_DGRAM:
377			error = soreserve(so, unpdg_sendspace, unpdg_recvspace);
378			break;
379
380		default:
381			panic("unp_attach");
382		}
383		if (error)
384			return (error);
385	}
386	m = m_getclr(M_DONTWAIT, MT_PCB);
387	if (m == NULL)
388		return (ENOBUFS);
389	unp = mtod(m, struct unpcb *);
390	so->so_pcb = (caddr_t)unp;
391	unp->unp_socket = so;
392	return (0);
393}
394
395static void
396unp_detach(unp)
397	register struct unpcb *unp;
398{
399
400	if (unp->unp_vnode) {
401		unp->unp_vnode->v_socket = 0;
402		vrele(unp->unp_vnode);
403		unp->unp_vnode = 0;
404	}
405	if (unp->unp_conn)
406		unp_disconnect(unp);
407	while (unp->unp_refs)
408		unp_drop(unp->unp_refs, ECONNRESET);
409	soisdisconnected(unp->unp_socket);
410	unp->unp_socket->so_pcb = 0;
411	if (unp_rights) {
412		/*
413		 * Normally the receive buffer is flushed later,
414		 * in sofree, but if our receive buffer holds references
415		 * to descriptors that are now garbage, we will dispose
416		 * of those descriptor references after the garbage collector
417		 * gets them (resulting in a "panic: closef: count < 0").
418		 */
419		sorflush(unp->unp_socket);
420		unp_gc();
421	}
422	m_freem(unp->unp_addr);
423	(void) m_free(dtom(unp));
424}
425
426static int
427unp_bind(unp, nam, p)
428	struct unpcb *unp;
429	struct mbuf *nam;
430	struct proc *p;
431{
432	struct sockaddr_un *soun = mtod(nam, struct sockaddr_un *);
433	register struct vnode *vp;
434	struct vattr vattr;
435	int error;
436	struct nameidata nd;
437
438	NDINIT(&nd, CREATE, FOLLOW | LOCKPARENT, UIO_SYSSPACE,
439		soun->sun_path, p);
440	if (unp->unp_vnode != NULL)
441		return (EINVAL);
442	if (nam->m_len == MLEN) {
443		if (*(mtod(nam, caddr_t) + nam->m_len - 1) != 0)
444			return (EINVAL);
445	} else
446		*(mtod(nam, caddr_t) + nam->m_len) = 0;
447/* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */
448	error = namei(&nd);
449	if (error)
450		return (error);
451	vp = nd.ni_vp;
452	if (vp != NULL) {
453		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
454		if (nd.ni_dvp == vp)
455			vrele(nd.ni_dvp);
456		else
457			vput(nd.ni_dvp);
458		vrele(vp);
459		return (EADDRINUSE);
460	}
461	VATTR_NULL(&vattr);
462	vattr.va_type = VSOCK;
463	vattr.va_mode = ACCESSPERMS;
464	LEASE_CHECK(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE);
465	error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
466	if (error)
467		return (error);
468	vp = nd.ni_vp;
469	vp->v_socket = unp->unp_socket;
470	unp->unp_vnode = vp;
471	unp->unp_addr = m_copy(nam, 0, (int)M_COPYALL);
472	VOP_UNLOCK(vp);
473	return (0);
474}
475
/*
 * Connect socket `so' to the socket bound to the path held in `nam'.
 *
 * The path is NUL-terminated in place and looked up; the target must
 * be a VSOCK vnode writable by `p', with a socket of the same type
 * attached.  For protocols with PR_CONNREQUIRED the target has to be
 * accepting connections; a fresh socket is spawned from it with
 * sonewconn(), inherits a copy of the listener's address, and becomes
 * the actual peer.
 *
 * Returns 0 or an errno value: EMSGSIZE for an unterminated name that
 * fills the mbuf, ENOTSOCK, ECONNREFUSED, EPROTOTYPE, or whatever
 * namei(), VOP_ACCESS() or unp_connect2() report.
 */
static int
unp_connect(so, nam, p)
	struct socket *so;
	struct mbuf *nam;
	struct proc *p;
{
	register struct sockaddr_un *soun = mtod(nam, struct sockaddr_un *);
	register struct vnode *vp;
	register struct socket *peer, *newso;
	struct unpcb *listen_unp, *new_unp;
	caddr_t name;
	int error;
	struct nameidata nd;

	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, soun->sun_path, p);

	/* Terminate the path, unless it already fills the buffer. */
	name = mtod(nam, caddr_t);
	if (nam->m_data + nam->m_len == &nam->m_dat[MLEN]) {	/* XXX */
		if (name[nam->m_len - 1] != 0)
			return (EMSGSIZE);
	} else
		name[nam->m_len] = 0;

	error = namei(&nd);
	if (error != 0)
		return (error);
	vp = nd.ni_vp;

	if (vp->v_type != VSOCK) {
		error = ENOTSOCK;
		goto out;
	}
	error = VOP_ACCESS(vp, VWRITE, p->p_ucred, p);
	if (error != 0)
		goto out;

	peer = vp->v_socket;
	if (peer == NULL) {
		error = ECONNREFUSED;
		goto out;
	}
	if (so->so_type != peer->so_type) {
		error = EPROTOTYPE;
		goto out;
	}

	if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
		/* The target must be listening ... */
		if ((peer->so_options & SO_ACCEPTCONN) == 0) {
			error = ECONNREFUSED;
			goto out;
		}
		/* ... and able to produce a socket for this connection. */
		newso = sonewconn(peer, 0);
		if (newso == NULL) {
			error = ECONNREFUSED;
			goto out;
		}
		listen_unp = sotounpcb(peer);
		new_unp = sotounpcb(newso);
		if (listen_unp->unp_addr != NULL)
			new_unp->unp_addr =
			    m_copy(listen_unp->unp_addr, 0, (int)M_COPYALL);
		peer = newso;
	}
	error = unp_connect2(so, peer);
out:
	vput(vp);
	return (error);
}
533
534int
535unp_connect2(so, so2)
536	register struct socket *so;
537	register struct socket *so2;
538{
539	register struct unpcb *unp = sotounpcb(so);
540	register struct unpcb *unp2;
541
542	if (so2->so_type != so->so_type)
543		return (EPROTOTYPE);
544	unp2 = sotounpcb(so2);
545	unp->unp_conn = unp2;
546	switch (so->so_type) {
547
548	case SOCK_DGRAM:
549		unp->unp_nextref = unp2->unp_refs;
550		unp2->unp_refs = unp;
551		soisconnected(so);
552		break;
553
554	case SOCK_STREAM:
555		unp2->unp_conn = unp;
556		soisconnected(so);
557		soisconnected(so2);
558		break;
559
560	default:
561		panic("unp_connect2");
562	}
563	return (0);
564}
565
566static void
567unp_disconnect(unp)
568	struct unpcb *unp;
569{
570	register struct unpcb *unp2 = unp->unp_conn;
571
572	if (unp2 == 0)
573		return;
574	unp->unp_conn = 0;
575	switch (unp->unp_socket->so_type) {
576
577	case SOCK_DGRAM:
578		if (unp2->unp_refs == unp)
579			unp2->unp_refs = unp->unp_nextref;
580		else {
581			unp2 = unp2->unp_refs;
582			for (;;) {
583				if (unp2 == 0)
584					panic("unp_disconnect");
585				if (unp2->unp_nextref == unp)
586					break;
587				unp2 = unp2->unp_nextref;
588			}
589			unp2->unp_nextref = unp->unp_nextref;
590		}
591		unp->unp_nextref = 0;
592		unp->unp_socket->so_state &= ~SS_ISCONNECTED;
593		break;
594
595	case SOCK_STREAM:
596		soisdisconnected(unp->unp_socket);
597		unp2->unp_conn = 0;
598		soisdisconnected(unp2->unp_socket);
599		break;
600	}
601}
602
/*
 * Disabled code: only compiled if `notdef' is defined.
 * Aborting would simply detach the control block.
 */
#ifdef notdef
void
unp_abort(unp)
	struct unpcb *unp;
{

	unp_detach(unp);
}
#endif
612
613static void
614unp_shutdown(unp)
615	struct unpcb *unp;
616{
617	struct socket *so;
618
619	if (unp->unp_socket->so_type == SOCK_STREAM && unp->unp_conn &&
620	    (so = unp->unp_conn->unp_socket))
621		socantrcvmore(so);
622}
623
624static void
625unp_drop(unp, errno)
626	struct unpcb *unp;
627	int errno;
628{
629	struct socket *so = unp->unp_socket;
630
631	so->so_error = errno;
632	unp_disconnect(unp);
633	if (so->so_head) {
634		so->so_pcb = (caddr_t) 0;
635		m_freem(unp->unp_addr);
636		(void) m_free(dtom(unp));
637		sofree(so);
638	}
639}
640
/*
 * Disabled code: only compiled if `notdef' is defined.
 * The function body is empty.
 */
#ifdef notdef
void
unp_drain()
{

}
#endif
648
649int
650unp_externalize(rights)
651	struct mbuf *rights;
652{
653	struct proc *p = curproc;		/* XXX */
654	register int i;
655	register struct cmsghdr *cm = mtod(rights, struct cmsghdr *);
656	register struct file **rp = (struct file **)(cm + 1);
657	register struct file *fp;
658	int newfds = (cm->cmsg_len - sizeof(*cm)) / sizeof (int);
659	int f;
660
661	/*
662	 * if the new FD's will not fit, then we free them all
663	 */
664	if (!fdavail(p, newfds)) {
665		for (i = 0; i < newfds; i++) {
666			fp = *rp;
667			unp_discard(fp);
668			*rp++ = 0;
669		}
670		return (EMSGSIZE);
671	}
672	/*
673	 * now change each pointer to an fd in the global table to
674	 * an integer that is the index to the local fd table entry
675	 * that we set up to point to the global one we are transferring.
676	 * XXX this assumes a pointer and int are the same size...!
677	 */
678	for (i = 0; i < newfds; i++) {
679		if (fdalloc(p, 0, &f))
680			panic("unp_externalize");
681		fp = *rp;
682		p->p_fd->fd_ofiles[f] = fp;
683		fp->f_msgcount--;
684		unp_rights--;
685		*(int *)rp++ = f;
686	}
687	return (0);
688}
689
690static int
691unp_internalize(control, p)
692	struct mbuf *control;
693	struct proc *p;
694{
695	struct filedesc *fdp = p->p_fd;
696	register struct cmsghdr *cm = mtod(control, struct cmsghdr *);
697	register struct file **rp;
698	register struct file *fp;
699	register int i, fd;
700	int oldfds;
701
702	if (cm->cmsg_type != SCM_RIGHTS || cm->cmsg_level != SOL_SOCKET ||
703	    cm->cmsg_len != control->m_len)
704		return (EINVAL);
705	oldfds = (cm->cmsg_len - sizeof (*cm)) / sizeof (int);
706	/*
707	 * check that all the FDs passed in refer to legal OPEN files
708	 * If not, reject the entire operation.
709	 */
710	rp = (struct file **)(cm + 1);
711	for (i = 0; i < oldfds; i++) {
712		fd = *(int *)rp++;
713		if ((unsigned)fd >= fdp->fd_nfiles ||
714		    fdp->fd_ofiles[fd] == NULL)
715			return (EBADF);
716	}
717	/*
718	 * Now replace the integer FDs with pointers to
719	 * the associated global file table entry..
720	 * XXX this assumes a pointer and an int are the same size!
721	 */
722	rp = (struct file **)(cm + 1);
723	for (i = 0; i < oldfds; i++) {
724		fp = fdp->fd_ofiles[*(int *)rp];
725		*rp++ = fp;
726		fp->f_count++;
727		fp->f_msgcount++;
728		unp_rights++;
729	}
730	return (0);
731}
732
/*
 * Garbage collector state:
 *	unp_defer	incremented by unp_mark() for each file it flags
 *			FDEFER, decremented by unp_gc() as it processes
 *			them; unp_gc() keeps making passes while non-zero.
 *	unp_gcing	set while unp_gc() is running so that a nested
 *			call returns immediately.
 */
static int	unp_defer, unp_gcing;
734
735static void
736unp_gc()
737{
738	register struct file *fp, *nextfp;
739	register struct socket *so;
740	struct file **extra_ref, **fpp;
741	int nunref, i;
742
743	if (unp_gcing)
744		return;
745	unp_gcing = 1;
746	unp_defer = 0;
747	/*
748	 * before going through all this, set all FDs to
749	 * be NOT defered and NOT externally accessible
750	 */
751	for (fp = filehead.lh_first; fp != 0; fp = fp->f_list.le_next)
752		fp->f_flag &= ~(FMARK|FDEFER);
753	do {
754		for (fp = filehead.lh_first; fp != 0; fp = fp->f_list.le_next) {
755			/*
756			 * If the file is not open, skip it
757			 */
758			if (fp->f_count == 0)
759				continue;
760			/*
761			 * If we already marked it as 'defer'  in a
762			 * previous pass, then try process it this time
763			 * and un-mark it
764			 */
765			if (fp->f_flag & FDEFER) {
766				fp->f_flag &= ~FDEFER;
767				unp_defer--;
768			} else {
769				/*
770				 * if it's not defered, then check if it's
771				 * already marked.. if so skip it
772				 */
773				if (fp->f_flag & FMARK)
774					continue;
775				/*
776				 * If all references are from messages
777				 * in transit, then skip it. it's not
778				 * externally accessible.
779				 */
780				if (fp->f_count == fp->f_msgcount)
781					continue;
782				/*
783				 * If it got this far then it must be
784				 * externally accessible.
785				 */
786				fp->f_flag |= FMARK;
787			}
788			/*
789			 * either it was defered, or it is externally
790			 * accessible and not already marked so.
791			 * Now check if it is possibly one of OUR sockets.
792			 */
793			if (fp->f_type != DTYPE_SOCKET ||
794			    (so = (struct socket *)fp->f_data) == 0)
795				continue;
796			if (so->so_proto->pr_domain != &localdomain ||
797			    (so->so_proto->pr_flags&PR_RIGHTS) == 0)
798				continue;
799#ifdef notdef
800			if (so->so_rcv.sb_flags & SB_LOCK) {
801				/*
802				 * This is problematical; it's not clear
803				 * we need to wait for the sockbuf to be
804				 * unlocked (on a uniprocessor, at least),
805				 * and it's also not clear what to do
806				 * if sbwait returns an error due to receipt
807				 * of a signal.  If sbwait does return
808				 * an error, we'll go into an infinite
809				 * loop.  Delete all of this for now.
810				 */
811				(void) sbwait(&so->so_rcv);
812				goto restart;
813			}
814#endif
815			/*
816			 * So, Ok, it's one of our sockets and it IS externally
817			 * accessible (or was defered). Now we look
818			 * to see if we hold any file descriptors in it's
819			 * message buffers. Follow those links and mark them
820			 * as accessible too.
821			 */
822			unp_scan(so->so_rcv.sb_mb, unp_mark);
823		}
824	} while (unp_defer);
825	/*
826	 * We grab an extra reference to each of the file table entries
827	 * that are not otherwise accessible and then free the rights
828	 * that are stored in messages on them.
829	 *
830	 * The bug in the orginal code is a little tricky, so I'll describe
831	 * what's wrong with it here.
832	 *
833	 * It is incorrect to simply unp_discard each entry for f_msgcount
834	 * times -- consider the case of sockets A and B that contain
835	 * references to each other.  On a last close of some other socket,
836	 * we trigger a gc since the number of outstanding rights (unp_rights)
837	 * is non-zero.  If during the sweep phase the gc code un_discards,
838	 * we end up doing a (full) closef on the descriptor.  A closef on A
839	 * results in the following chain.  Closef calls soo_close, which
840	 * calls soclose.   Soclose calls first (through the switch
841	 * uipc_usrreq) unp_detach, which re-invokes unp_gc.  Unp_gc simply
842	 * returns because the previous instance had set unp_gcing, and
843	 * we return all the way back to soclose, which marks the socket
844	 * with SS_NOFDREF, and then calls sofree.  Sofree calls sorflush
845	 * to free up the rights that are queued in messages on the socket A,
846	 * i.e., the reference on B.  The sorflush calls via the dom_dispose
847	 * switch unp_dispose, which unp_scans with unp_discard.  This second
848	 * instance of unp_discard just calls closef on B.
849	 *
850	 * Well, a similar chain occurs on B, resulting in a sorflush on B,
851	 * which results in another closef on A.  Unfortunately, A is already
852	 * being closed, and the descriptor has already been marked with
853	 * SS_NOFDREF, and soclose panics at this point.
854	 *
855	 * Here, we first take an extra reference to each inaccessible
856	 * descriptor.  Then, we call sorflush ourself, since we know
857	 * it is a Unix domain socket anyhow.  After we destroy all the
858	 * rights carried in messages, we do a last closef to get rid
859	 * of our extra reference.  This is the last close, and the
860	 * unp_detach etc will shut down the socket.
861	 *
862	 * 91/09/19, bsy@cs.cmu.edu
863	 */
864	extra_ref = malloc(nfiles * sizeof(struct file *), M_FILE, M_WAITOK);
865	for (nunref = 0, fp = filehead.lh_first, fpp = extra_ref; fp != 0;
866	    fp = nextfp) {
867		nextfp = fp->f_list.le_next;
868		/*
869		 * If it's not open, skip it
870		 */
871		if (fp->f_count == 0)
872			continue;
873		/*
874		 * If all refs are from msgs, and it's not marked accessible
875		 * then it must be referenced from some unreachable cycle
876		 * of (shut-down) FDs, so include it in our
877		 * list of FDs to remove
878		 */
879		if (fp->f_count == fp->f_msgcount && !(fp->f_flag & FMARK)) {
880			*fpp++ = fp;
881			nunref++;
882			fp->f_count++;
883		}
884	}
885	/*
886	 * for each FD on our hit list, do the following two things
887	 */
888	for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp)
889		sorflush((struct socket *)(*fpp)->f_data);
890	for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp)
891		closef(*fpp,(struct proc*) NULL);
892	free((caddr_t)extra_ref, M_FILE);
893	unp_gcing = 0;
894}
895
896void
897unp_dispose(m)
898	struct mbuf *m;
899{
900	if (m)
901		unp_scan(m, unp_discard);
902}
903
904static void
905unp_scan(m0, op)
906	register struct mbuf *m0;
907	void (*op)(struct file *);
908{
909	register struct mbuf *m;
910	register struct file **rp;
911	register struct cmsghdr *cm;
912	register int i;
913	int qfds;
914
915	while (m0) {
916		for (m = m0; m; m = m->m_next)
917			if (m->m_type == MT_CONTROL &&
918			    m->m_len >= sizeof(*cm)) {
919				cm = mtod(m, struct cmsghdr *);
920				if (cm->cmsg_level != SOL_SOCKET ||
921				    cm->cmsg_type != SCM_RIGHTS)
922					continue;
923				qfds = (cm->cmsg_len - sizeof *cm)
924						/ sizeof (struct file *);
925				rp = (struct file **)(cm + 1);
926				for (i = 0; i < qfds; i++)
927					(*op)(*rp++);
928				break;		/* XXX, but saves time */
929			}
930		m0 = m0->m_act;
931	}
932}
933
934static void
935unp_mark(fp)
936	struct file *fp;
937{
938
939	if (fp->f_flag & FMARK)
940		return;
941	unp_defer++;
942	fp->f_flag |= (FMARK|FDEFER);
943}
944
945static void
946unp_discard(fp)
947	struct file *fp;
948{
949
950	fp->f_msgcount--;
951	unp_rights--;
952	(void) closef(fp, (struct proc *)NULL);
953}
954