uipc_usrreq.c revision 133792
1192904Sbms/*
2192904Sbms * Copyright 2004 Robert N. M. Watson
3192904Sbms * Copyright (c) 1982, 1986, 1989, 1991, 1993
4192904Sbms *	The Regents of the University of California.  All rights reserved.
5192904Sbms *
6192904Sbms * Redistribution and use in source and binary forms, with or without
7192904Sbms * modification, are permitted provided that the following conditions
8192904Sbms * are met:
9192904Sbms * 1. Redistributions of source code must retain the above copyright
10192904Sbms *    notice, this list of conditions and the following disclaimer.
11192904Sbms * 2. Redistributions in binary form must reproduce the above copyright
12192904Sbms *    notice, this list of conditions and the following disclaimer in the
13192904Sbms *    documentation and/or other materials provided with the distribution.
14192904Sbms * 4. Neither the name of the University nor the names of its contributors
15192904Sbms *    may be used to endorse or promote products derived from this software
16192904Sbms *    without specific prior written permission.
17192904Sbms *
18192904Sbms * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
19192904Sbms * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20192904Sbms * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21192904Sbms * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
22192904Sbms * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23192904Sbms * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24192904Sbms * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25192904Sbms * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26192904Sbms * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27192904Sbms * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28192904Sbms * SUCH DAMAGE.
29192904Sbms *
30249252Sae *	From: @(#)uipc_usrreq.c	8.3 (Berkeley) 1/4/94
31192904Sbms */
32192904Sbms
33192904Sbms#include <sys/cdefs.h>
34192904Sbms__FBSDID("$FreeBSD: head/sys/kern/uipc_usrreq.c 133792 2004-08-16 01:52:04Z rwatson $");
35192904Sbms
36192904Sbms#include "opt_mac.h"
37192904Sbms
38192904Sbms#include <sys/param.h>
39192904Sbms#include <sys/domain.h>
40192904Sbms#include <sys/fcntl.h>
41192904Sbms#include <sys/malloc.h>		/* XXX must be before <sys/file.h> */
42192904Sbms#include <sys/file.h>
43192904Sbms#include <sys/filedesc.h>
44192904Sbms#include <sys/jail.h>
45192904Sbms#include <sys/kernel.h>
46192904Sbms#include <sys/lock.h>
47192904Sbms#include <sys/mac.h>
48192904Sbms#include <sys/mbuf.h>
49192904Sbms#include <sys/mutex.h>
50192904Sbms#include <sys/namei.h>
51192904Sbms#include <sys/proc.h>
52192904Sbms#include <sys/protosw.h>
53192904Sbms#include <sys/resourcevar.h>
54192904Sbms#include <sys/socket.h>
55233648Seadler#include <sys/socketvar.h>
56192904Sbms#include <sys/signalvar.h>
57192904Sbms#include <sys/stat.h>
58192904Sbms#include <sys/sx.h>
59192904Sbms#include <sys/sysctl.h>
60192904Sbms#include <sys/systm.h>
61192904Sbms#include <sys/un.h>
62192904Sbms#include <sys/unpcb.h>
63192904Sbms#include <sys/vnode.h>
64192904Sbms
65192904Sbms#include <vm/uma.h>
66192904Sbms
67192904Sbmsstatic uma_zone_t unp_zone;
68192904Sbmsstatic	unp_gen_t unp_gencnt;
69192904Sbmsstatic	u_int unp_count;
70192904Sbms
71192904Sbmsstatic	struct unp_head unp_shead, unp_dhead;
72192904Sbms
73192904Sbms/*
74192904Sbms * Unix communications domain.
75192904Sbms *
76192904Sbms * TODO:
77192904Sbms *	SEQPACKET, RDM
78192904Sbms *	rethink name space problems
79192904Sbms *	need a proper out-of-band
80192904Sbms *	lock pushdown
81192904Sbms */
82192904Sbmsstatic const struct	sockaddr sun_noname = { sizeof(sun_noname), AF_LOCAL };
83192904Sbmsstatic ino_t	unp_ino;		/* prototype for fake inode numbers */
84192904Sbms
85192904Sbms/*
86192904Sbms * Currently, UNIX domain sockets are protected by a single subsystem lock,
87192904Sbms * which covers global data structures and variables, the contents of each
88192904Sbms * per-socket unpcb structure, and the so_pcb field in sockets attached to
 * the UNIX domain.  This provides for a moderate degree of parallelism, as
90192904Sbms * receive operations on UNIX domain sockets do not need to acquire the
91192904Sbms * subsystem lock.  Finer grained locking to permit send() without acquiring
92249253Sjoel * a global lock would be a logical next step.
93249253Sjoel *
 * The UNIX domain socket lock precedes all socket layer locks, including the
95192904Sbms * socket lock and socket buffer lock, permitting UNIX domain socket code to
96192904Sbms * call into socket support routines without releasing its locks.
97249253Sjoel *
98192904Sbms * Some caution is required in areas where the UNIX domain socket code enters
99192904Sbms * VFS in order to create or find rendezvous points.  This results in
100192904Sbms * dropping of the UNIX domain socket subsystem lock, acquisition of the
101192904Sbms * Giant lock, and potential sleeping.  This increases the chances of races,
102192904Sbms * and exposes weaknesses in the socket->protocol API by offering poor
103 * failure modes.
104 */
105static struct mtx unp_mtx;
106#define	UNP_LOCK_INIT() \
107	mtx_init(&unp_mtx, "unp", NULL, MTX_DEF)
108#define	UNP_LOCK()		mtx_lock(&unp_mtx)
109#define	UNP_UNLOCK()		mtx_unlock(&unp_mtx)
110#define	UNP_LOCK_ASSERT()	mtx_assert(&unp_mtx, MA_OWNED)
111
112static int     unp_attach(struct socket *);
113static void    unp_detach(struct unpcb *);
114static int     unp_bind(struct unpcb *,struct sockaddr *, struct thread *);
115static int     unp_connect(struct socket *,struct sockaddr *, struct thread *);
116static int     unp_connect2(struct socket *so, struct socket *so2);
117static void    unp_disconnect(struct unpcb *);
118static void    unp_shutdown(struct unpcb *);
119static void    unp_drop(struct unpcb *, int);
120static void    unp_gc(void);
121static void    unp_scan(struct mbuf *, void (*)(struct file *));
122static void    unp_mark(struct file *);
123static void    unp_discard(struct file *);
124static void    unp_freerights(struct file **, int);
125static int     unp_internalize(struct mbuf **, struct thread *);
126static int     unp_listen(struct unpcb *, struct thread *);
127
/*
 * pru_abort: forcibly tear down a UNIX domain socket, dropping any peer
 * with ECONNABORTED and freeing the socket if no other references remain.
 */
static int
uipc_abort(struct socket *so)
{
	struct unpcb *unp = sotounpcb(so);

	if (unp == NULL)
		return (EINVAL);
	UNP_LOCK();
	unp_drop(unp, ECONNABORTED);
	unp_detach(unp);	/* NB: unlocks */
	SOCK_LOCK(so);
	sotryfree(so);		/* frees so (and drops SOCK_LOCK) if refcount allows */
	return (0);
}
142
/*
 * pru_accept: return the address of the peer of an accepted connection
 * in *nam.  The caller owns the returned sockaddr (M_SONAME).
 */
static int
uipc_accept(struct socket *so, struct sockaddr **nam)
{
	struct unpcb *unp = sotounpcb(so);
	const struct sockaddr *sa;

	if (unp == NULL)
		return (EINVAL);

	/*
	 * Pass back name of connected socket,
	 * if it was bound and we are still connected
	 * (our peer may have closed already!).
	 */
	/* Allocate before taking the subsystem lock; M_WAITOK may sleep. */
	*nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
	UNP_LOCK();
	if (unp->unp_conn != NULL && unp->unp_conn->unp_addr != NULL)
		sa = (struct sockaddr *) unp->unp_conn->unp_addr;
	else
		sa = &sun_noname;	/* peer unbound or already gone */
	bcopy(sa, *nam, sa->sa_len);
	UNP_UNLOCK();
	return (0);
}
167
168static int
169uipc_attach(struct socket *so, int proto, struct thread *td)
170{
171	struct unpcb *unp = sotounpcb(so);
172
173	if (unp != NULL)
174		return (EISCONN);
175	return (unp_attach(so));
176}
177
178static int
179uipc_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
180{
181	struct unpcb *unp = sotounpcb(so);
182
183	if (unp == NULL)
184		return (EINVAL);
185
186	return (unp_bind(unp, nam, td));
187}
188
189static int
190uipc_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
191{
192	struct unpcb *unp;
193	int error;
194
195	KASSERT(td == curthread, ("uipc_connect: td != curthread"));
196
197	UNP_LOCK();
198	unp = sotounpcb(so);
199	if (unp == NULL) {
200		error = EINVAL;
201		goto out;
202	}
203	error = unp_connect(so, nam, td);
204out:
205	UNP_UNLOCK();
206	return (error);
207}
208
209int
210uipc_connect2(struct socket *so1, struct socket *so2)
211{
212	struct unpcb *unp = sotounpcb(so1);
213	int error;
214
215	if (unp == NULL)
216		return (EINVAL);
217
218	UNP_LOCK();
219	error = unp_connect2(so1, so2);
220	UNP_UNLOCK();
221	return (error);
222}
223
224/* control is EOPNOTSUPP */
225
/*
 * pru_detach: tear down the pcb attached to a socket being closed.
 * unp_detach() is entered with the UNP lock held and releases it.
 */
static int
uipc_detach(struct socket *so)
{
	struct unpcb *unp = sotounpcb(so);

	if (unp == NULL)
		return (EINVAL);

	UNP_LOCK();
	unp_detach(unp);	/* NB: unlocks unp */
	return (0);
}
238
239static int
240uipc_disconnect(struct socket *so)
241{
242	struct unpcb *unp = sotounpcb(so);
243
244	if (unp == NULL)
245		return (EINVAL);
246	UNP_LOCK();
247	unp_disconnect(unp);
248	UNP_UNLOCK();
249	return (0);
250}
251
/*
 * pru_listen: mark a bound socket as willing to accept connections and
 * cache the listener's credentials for LOCAL_PEERCRED.
 */
static int
uipc_listen(struct socket *so, struct thread *td)
{
	struct unpcb *unp = sotounpcb(so);
	int error;

	/*
	 * NOTE(review): unp_vnode is read here before taking UNP_LOCK();
	 * presumably benign, but looks racy against a concurrent detach —
	 * confirm against the locking annotations in unpcb.h.
	 */
	if (unp == NULL || unp->unp_vnode == NULL)
		return (EINVAL);
	UNP_LOCK();
	error = unp_listen(unp, td);
	UNP_UNLOCK();
	return (error);
}
265
/*
 * pru_peeraddr: return the address of the connected peer, or the
 * unnamed PF_LOCAL address if the peer is unbound or gone.  The caller
 * owns the returned sockaddr (M_SONAME).
 */
static int
uipc_peeraddr(struct socket *so, struct sockaddr **nam)
{
	struct unpcb *unp = sotounpcb(so);
	const struct sockaddr *sa;

	if (unp == NULL)
		return (EINVAL);
	/* Allocate before taking the subsystem lock; M_WAITOK may sleep. */
	*nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
	UNP_LOCK();
	if (unp->unp_conn != NULL && unp->unp_conn->unp_addr!= NULL)
		sa = (struct sockaddr *) unp->unp_conn->unp_addr;
	else {
		/*
		 * XXX: It seems that this test always fails even when
		 * connection is established.  So, this else clause is
		 * added as workaround to return PF_LOCAL sockaddr.
		 */
		sa = &sun_noname;
	}
	bcopy(sa, *nam, sa->sa_len);
	UNP_UNLOCK();
	return (0);
}
290
/*
 * pru_rcvd: the receiver has consumed data; for stream sockets, credit
 * the freed space back to the sender's send buffer limits (flow control)
 * and wake up any writers blocked on it.  Datagram sockets never use
 * this entry point.
 */
static int
uipc_rcvd(struct socket *so, int flags)
{
	struct unpcb *unp = sotounpcb(so);
	struct socket *so2;
	u_long newhiwat;

	if (unp == NULL)
		return (EINVAL);
	UNP_LOCK();
	switch (so->so_type) {
	case SOCK_DGRAM:
		panic("uipc_rcvd DGRAM?");
		/*NOTREACHED*/

	case SOCK_STREAM:
		if (unp->unp_conn == NULL)
			break;
		so2 = unp->unp_conn->unp_socket;
		/* Lock order: peer's send buffer before our receive buffer. */
		SOCKBUF_LOCK(&so2->so_snd);
		SOCKBUF_LOCK(&so->so_rcv);
		/*
		 * Adjust backpressure on sender
		 * and wakeup any waiting to write.
		 */
		/*
		 * unp_mbcnt/unp_cc cache the receive buffer usage seen at
		 * the last adjustment; the deltas below return exactly the
		 * consumed amounts to the sender's mbmax/hiwat.
		 */
		so2->so_snd.sb_mbmax += unp->unp_mbcnt - so->so_rcv.sb_mbcnt;
		unp->unp_mbcnt = so->so_rcv.sb_mbcnt;
		newhiwat = so2->so_snd.sb_hiwat + unp->unp_cc -
		    so->so_rcv.sb_cc;
		(void)chgsbsize(so2->so_cred->cr_uidinfo, &so2->so_snd.sb_hiwat,
		    newhiwat, RLIM_INFINITY);
		unp->unp_cc = so->so_rcv.sb_cc;
		SOCKBUF_UNLOCK(&so->so_rcv);
		sowwakeup_locked(so2);	/* drops so2->so_snd lock */
		break;

	default:
		panic("uipc_rcvd unknown socktype");
	}
	UNP_UNLOCK();
	return (0);
}
333
334/* pru_rcvoob is EOPNOTSUPP */
335
/*
 * pru_send: transmit data (and optionally file-descriptor control
 * messages) to the peer.  Datagram sends may carry an explicit
 * destination in nam (temporary connect/disconnect around the send);
 * stream sends append directly to the connected peer's receive buffer
 * and charge the space against our own send-buffer limits.
 *
 * On success the mbufs are consumed; on failure both m and control are
 * freed here (the "release" path).
 */
static int
uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
	  struct mbuf *control, struct thread *td)
{
	int error = 0;
	struct unpcb *unp = sotounpcb(so);
	struct socket *so2;
	u_long newhiwat;

	if (unp == NULL) {
		error = EINVAL;
		goto release;
	}
	if (flags & PRUS_OOB) {
		error = EOPNOTSUPP;
		goto release;
	}

	/* Convert user fds in control to in-kernel struct file pointers. */
	if (control != NULL && (error = unp_internalize(&control, td)))
		goto release;

	UNP_LOCK();
	switch (so->so_type) {
	case SOCK_DGRAM:
	{
		const struct sockaddr *from;

		if (nam != NULL) {
			/* Explicit destination: temporarily connect. */
			if (unp->unp_conn != NULL) {
				error = EISCONN;
				break;
			}
			error = unp_connect(so, nam, td);
			if (error)
				break;
		} else {
			if (unp->unp_conn == NULL) {
				error = ENOTCONN;
				break;
			}
		}
		so2 = unp->unp_conn->unp_socket;
		if (unp->unp_addr != NULL)
			from = (struct sockaddr *)unp->unp_addr;
		else
			from = &sun_noname;
		SOCKBUF_LOCK(&so2->so_rcv);
		if (sbappendaddr_locked(&so2->so_rcv, from, m, control)) {
			sorwakeup_locked(so2);	/* drops so2->so_rcv lock */
			m = NULL;		/* consumed by the sockbuf */
			control = NULL;
		} else {
			SOCKBUF_UNLOCK(&so2->so_rcv);
			error = ENOBUFS;
		}
		/* Undo the temporary connect done above. */
		if (nam != NULL)
			unp_disconnect(unp);
		break;
	}

	case SOCK_STREAM:
		/* Connect if not connected yet. */
		/*
		 * Note: A better implementation would complain
		 * if not equal to the peer's address.
		 */
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			if (nam != NULL) {
				error = unp_connect(so, nam, td);
				if (error)
					break;	/* XXX */
			} else {
				error = ENOTCONN;
				break;
			}
		}

		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
			error = EPIPE;
			break;
		}
		if (unp->unp_conn == NULL)
			panic("uipc_send connected but no connection?");
		so2 = unp->unp_conn->unp_socket;
		SOCKBUF_LOCK(&so2->so_rcv);
		/*
		 * Send to paired receive port, and then reduce
		 * send buffer hiwater marks to maintain backpressure.
		 * Wake up readers.
		 */
		if (control != NULL) {
			if (sbappendcontrol_locked(&so2->so_rcv, m, control))
				control = NULL;	/* consumed */
		} else {
			sbappend_locked(&so2->so_rcv, m);
		}
		/*
		 * Charge the growth of the peer's receive buffer against
		 * our send-buffer accounting; unp_mbcnt/unp_cc cache the
		 * values observed at the last adjustment.
		 */
		so->so_snd.sb_mbmax -=
			so2->so_rcv.sb_mbcnt - unp->unp_conn->unp_mbcnt;
		unp->unp_conn->unp_mbcnt = so2->so_rcv.sb_mbcnt;
		newhiwat = so->so_snd.sb_hiwat -
		    (so2->so_rcv.sb_cc - unp->unp_conn->unp_cc);
		(void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_snd.sb_hiwat,
		    newhiwat, RLIM_INFINITY);
		unp->unp_conn->unp_cc = so2->so_rcv.sb_cc;
		sorwakeup_locked(so2);	/* drops so2->so_rcv lock */
		m = NULL;
		break;

	default:
		panic("uipc_send unknown socktype");
	}

	/*
	 * SEND_EOF is equivalent to a SEND followed by
	 * a SHUTDOWN.
	 */
	if (flags & PRUS_EOF) {
		socantsendmore(so);
		unp_shutdown(unp);
	}
	UNP_UNLOCK();

	/* On error, close any fds internalized into control first. */
	if (control != NULL && error != 0)
		unp_dispose(control);

release:
	if (control != NULL)
		m_freem(control);
	if (m != NULL)
		m_freem(m);
	return (error);
}
468
/*
 * pru_sense: fill in fstat(2) information for a UNIX domain socket.
 * Fakes up a block size from buffer sizes and a stable per-pcb fake
 * inode number drawn from the global unp_ino counter.
 */
static int
uipc_sense(struct socket *so, struct stat *sb)
{
	struct unpcb *unp = sotounpcb(so);
	struct socket *so2;

	if (unp == NULL)
		return (EINVAL);
	UNP_LOCK();
	sb->st_blksize = so->so_snd.sb_hiwat;
	if (so->so_type == SOCK_STREAM && unp->unp_conn != NULL) {
		so2 = unp->unp_conn->unp_socket;
		sb->st_blksize += so2->so_rcv.sb_cc;
	}
	sb->st_dev = NODEV;
	/* Assign a fake inode number once, skipping 0 on counter wrap. */
	if (unp->unp_ino == 0)
		unp->unp_ino = (++unp_ino == 0) ? ++unp_ino : unp_ino;
	sb->st_ino = unp->unp_ino;
	UNP_UNLOCK();
	return (0);
}
490
491static int
492uipc_shutdown(struct socket *so)
493{
494	struct unpcb *unp = sotounpcb(so);
495
496	if (unp == NULL)
497		return (EINVAL);
498	UNP_LOCK();
499	socantsendmore(so);
500	unp_shutdown(unp);
501	UNP_UNLOCK();
502	return (0);
503}
504
/*
 * pru_sockaddr: return this socket's own bound address, or the unnamed
 * PF_LOCAL address if unbound.  The caller owns the returned sockaddr
 * (M_SONAME).
 */
static int
uipc_sockaddr(struct socket *so, struct sockaddr **nam)
{
	struct unpcb *unp = sotounpcb(so);
	const struct sockaddr *sa;

	if (unp == NULL)
		return (EINVAL);
	/* Allocate before taking the subsystem lock; M_WAITOK may sleep. */
	*nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
	UNP_LOCK();
	if (unp->unp_addr != NULL)
		sa = (struct sockaddr *) unp->unp_addr;
	else
		sa = &sun_noname;
	bcopy(sa, *nam, sa->sa_len);
	UNP_UNLOCK();
	return (0);
}
523
/*
 * Protocol user-request switch for the local (UNIX) domain; entries are
 * in struct pr_usrreqs declaration order.  Control and receive-OOB are
 * unsupported, and the generic socket-layer send/receive/poll are used.
 */
struct pr_usrreqs uipc_usrreqs = {
	uipc_abort, uipc_accept, uipc_attach, uipc_bind, uipc_connect,
	uipc_connect2, pru_control_notsupp, uipc_detach, uipc_disconnect,
	uipc_listen, uipc_peeraddr, uipc_rcvd, pru_rcvoob_notsupp,
	uipc_send, uipc_sense, uipc_shutdown, uipc_sockaddr,
	sosend, soreceive, sopoll, pru_sosetlabel_null
};
531
532int
533uipc_ctloutput(so, sopt)
534	struct socket *so;
535	struct sockopt *sopt;
536{
537	struct unpcb *unp = sotounpcb(so);
538	struct xucred xu;
539	int error;
540
541	switch (sopt->sopt_dir) {
542	case SOPT_GET:
543		switch (sopt->sopt_name) {
544		case LOCAL_PEERCRED:
545			error = 0;
546			UNP_LOCK();
547			if (unp->unp_flags & UNP_HAVEPC)
548				xu = unp->unp_peercred;
549			else {
550				if (so->so_type == SOCK_STREAM)
551					error = ENOTCONN;
552				else
553					error = EINVAL;
554			}
555			UNP_UNLOCK();
556			if (error == 0)
557				error = sooptcopyout(sopt, &xu, sizeof(xu));
558			break;
559		default:
560			error = EOPNOTSUPP;
561			break;
562		}
563		break;
564	case SOPT_SET:
565	default:
566		error = EOPNOTSUPP;
567		break;
568	}
569	return (error);
570}
571
572/*
573 * Both send and receive buffers are allocated PIPSIZ bytes of buffering
574 * for stream sockets, although the total for sender and receiver is
575 * actually only PIPSIZ.
576 * Datagram sockets really use the sendspace as the maximum datagram size,
577 * and don't really want to reserve the sendspace.  Their recvspace should
578 * be large enough for at least one max-size datagram plus address.
579 */
580#ifndef PIPSIZ
581#define	PIPSIZ	8192
582#endif
583static u_long	unpst_sendspace = PIPSIZ;
584static u_long	unpst_recvspace = PIPSIZ;
585static u_long	unpdg_sendspace = 2*1024;	/* really max datagram size */
586static u_long	unpdg_recvspace = 4*1024;
587
588static int	unp_rights;			/* file descriptors in flight */
589
590SYSCTL_DECL(_net_local_stream);
591SYSCTL_INT(_net_local_stream, OID_AUTO, sendspace, CTLFLAG_RW,
592	   &unpst_sendspace, 0, "");
593SYSCTL_INT(_net_local_stream, OID_AUTO, recvspace, CTLFLAG_RW,
594	   &unpst_recvspace, 0, "");
595SYSCTL_DECL(_net_local_dgram);
596SYSCTL_INT(_net_local_dgram, OID_AUTO, maxdgram, CTLFLAG_RW,
597	   &unpdg_sendspace, 0, "");
598SYSCTL_INT(_net_local_dgram, OID_AUTO, recvspace, CTLFLAG_RW,
599	   &unpdg_recvspace, 0, "");
600SYSCTL_DECL(_net_local);
601SYSCTL_INT(_net_local, OID_AUTO, inflight, CTLFLAG_RD, &unp_rights, 0, "");
602
603static int
604unp_attach(so)
605	struct socket *so;
606{
607	register struct unpcb *unp;
608	int error;
609
610	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
611		switch (so->so_type) {
612
613		case SOCK_STREAM:
614			error = soreserve(so, unpst_sendspace, unpst_recvspace);
615			break;
616
617		case SOCK_DGRAM:
618			error = soreserve(so, unpdg_sendspace, unpdg_recvspace);
619			break;
620
621		default:
622			panic("unp_attach");
623		}
624		if (error)
625			return (error);
626	}
627	unp = uma_zalloc(unp_zone, M_WAITOK);
628	if (unp == NULL)
629		return (ENOBUFS);
630	bzero(unp, sizeof *unp);
631	LIST_INIT(&unp->unp_refs);
632	unp->unp_socket = so;
633
634	UNP_LOCK();
635	unp->unp_gencnt = ++unp_gencnt;
636	unp_count++;
637	LIST_INSERT_HEAD(so->so_type == SOCK_DGRAM ? &unp_dhead
638			 : &unp_shead, unp, unp_link);
639	UNP_UNLOCK();
640
641	so->so_pcb = unp;
642	return (0);
643}
644
/*
 * Tear down a unpcb: unlink it from the global list, break any vnode
 * binding and peer connection, reset dangling datagram senders, and
 * free the pcb.  Called with the UNP lock held; drops it before the
 * final frees (which may sleep or need Giant for vrele()).
 */
static void
unp_detach(unp)
	register struct unpcb *unp;
{
	struct vnode *vp;

	UNP_LOCK_ASSERT();

	LIST_REMOVE(unp, unp_link);
	unp->unp_gencnt = ++unp_gencnt;
	--unp_count;
	if ((vp = unp->unp_vnode) != NULL) {
		/*
		 * XXXRW: should v_socket be frobbed only while holding
		 * Giant?
		 */
		unp->unp_vnode->v_socket = NULL;
		unp->unp_vnode = NULL;
	}
	if (unp->unp_conn != NULL)
		unp_disconnect(unp);
	/* Reset every datagram sender still connected to us. */
	while (!LIST_EMPTY(&unp->unp_refs)) {
		struct unpcb *ref = LIST_FIRST(&unp->unp_refs);
		unp_drop(ref, ECONNRESET);
	}
	soisdisconnected(unp->unp_socket);
	unp->unp_socket->so_pcb = NULL;
	if (unp_rights) {
		/*
		 * Normally the receive buffer is flushed later,
		 * in sofree, but if our receive buffer holds references
		 * to descriptors that are now garbage, we will dispose
		 * of those descriptor references after the garbage collector
		 * gets them (resulting in a "panic: closef: count < 0").
		 */
		sorflush(unp->unp_socket);
		unp_gc();
	}
	UNP_UNLOCK();
	if (unp->unp_addr != NULL)
		FREE(unp->unp_addr, M_SONAME);
	uma_zfree(unp_zone, unp);
	/* vrele() may enter VFS; take Giant around it. */
	if (vp) {
		mtx_lock(&Giant);
		vrele(vp);
		mtx_unlock(&Giant);
	}
}
693
/*
 * Bind a unpcb to a pathname: create a VSOCK vnode at the given path
 * and cross-link it with the socket.  Enters VFS (under Giant), so the
 * UNP lock is only taken briefly at the end to publish the binding.
 * Fails with EADDRINUSE if the path already exists.
 */
static int
unp_bind(unp, nam, td)
	struct unpcb *unp;
	struct sockaddr *nam;
	struct thread *td;
{
	struct sockaddr_un *soun = (struct sockaddr_un *)nam;
	struct vnode *vp;
	struct mount *mp;
	struct vattr vattr;
	int error, namelen;
	struct nameidata nd;
	char *buf;

	/*
	 * XXXRW: This test-and-set of unp_vnode is non-atomic; the
	 * unlocked read here is fine, but the value of unp_vnode needs
	 * to be tested again after we do all the lookups to see if the
	 * pcb is still unbound?
	 */
	if (unp->unp_vnode != NULL)
		return (EINVAL);

	namelen = soun->sun_len - offsetof(struct sockaddr_un, sun_path);
	if (namelen <= 0)
		return (EINVAL);

	/* NUL-terminate the (possibly unterminated) sun_path copy. */
	buf = malloc(namelen + 1, M_TEMP, M_WAITOK);
	strlcpy(buf, soun->sun_path, namelen + 1);

	mtx_lock(&Giant);
restart:
	mtx_assert(&Giant, MA_OWNED);
	NDINIT(&nd, CREATE, NOFOLLOW | LOCKPARENT | SAVENAME, UIO_SYSSPACE,
	    buf, td);
/* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */
	error = namei(&nd);
	if (error)
		goto done;
	vp = nd.ni_vp;
	if (vp != NULL || vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
		/*
		 * Either the path already exists (EADDRINUSE) or the
		 * filesystem is suspended: clean up the lookup, wait for
		 * writes to be re-enabled if needed, and retry.
		 */
		NDFREE(&nd, NDF_ONLY_PNBUF);
		if (nd.ni_dvp == vp)
			vrele(nd.ni_dvp);
		else
			vput(nd.ni_dvp);
		if (vp != NULL) {
			vrele(vp);
			error = EADDRINUSE;
			goto done;
		}
		error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH);
		if (error)
			goto done;
		goto restart;
	}
	VATTR_NULL(&vattr);
	vattr.va_type = VSOCK;
	vattr.va_mode = (ACCESSPERMS & ~td->td_proc->p_fd->fd_cmask);
#ifdef MAC
	error = mac_check_vnode_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
	    &vattr);
#endif
	if (error == 0) {
		VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE);
		error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
	}
	NDFREE(&nd, NDF_ONLY_PNBUF);
	vput(nd.ni_dvp);
	if (error)
		goto done;
	vp = nd.ni_vp;
	ASSERT_VOP_LOCKED(vp, "unp_bind");
	/* Duplicate the address before locking; sodupsockaddr may sleep. */
	soun = (struct sockaddr_un *)sodupsockaddr(nam, M_WAITOK);
	UNP_LOCK();
	vp->v_socket = unp->unp_socket;
	unp->unp_vnode = vp;
	unp->unp_addr = soun;
	UNP_UNLOCK();
	VOP_UNLOCK(vp, 0, td);
	vn_finished_write(mp);
done:
	mtx_unlock(&Giant);
	free(buf, M_TEMP);
	return (error);
}
780
/*
 * Connect socket so to the socket bound at the pathname in nam.  Called
 * and returns with the UNP lock held, but drops it (and takes Giant)
 * for the VFS lookup and for sonewconn(), so the pcb must be
 * re-validated after each reacquisition.  For connection-oriented
 * sockets this spawns a new server-side socket via sonewconn() and
 * exchanges cached credentials for LOCAL_PEERCRED.
 */
static int
unp_connect(so, nam, td)
	struct socket *so;
	struct sockaddr *nam;
	struct thread *td;
{
	register struct sockaddr_un *soun = (struct sockaddr_un *)nam;
	register struct vnode *vp;
	register struct socket *so2, *so3;
	struct unpcb *unp, *unp2, *unp3;
	int error, len;
	struct nameidata nd;
	char buf[SOCK_MAXADDRLEN];
	struct sockaddr *sa;

	UNP_LOCK_ASSERT();
	unp = sotounpcb(so);

	len = nam->sa_len - offsetof(struct sockaddr_un, sun_path);
	if (len <= 0)
		return (EINVAL);
	strlcpy(buf, soun->sun_path, len + 1);
	/* Drop the subsystem lock across the (sleeping) VFS lookup. */
	UNP_UNLOCK();
	sa = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
	mtx_lock(&Giant);
	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, buf, td);
	error = namei(&nd);
	if (error)
		vp = NULL;
	else
		vp = nd.ni_vp;
	ASSERT_VOP_LOCKED(vp, "unp_connect");
	NDFREE(&nd, NDF_ONLY_PNBUF);
	if (error)
		goto bad;

	if (vp->v_type != VSOCK) {
		error = ENOTSOCK;
		goto bad;
	}
	error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td);
	if (error)
		goto bad;
	mtx_unlock(&Giant);
	UNP_LOCK();
	/* Re-validate: the pcb may have been detached while unlocked. */
	unp = sotounpcb(so);
	if (unp == NULL) {
		/*
		 * XXXRW: Temporary debugging printf.
		 */
		printf("unp_connect(): lost race to another thread\n");
		error = EINVAL;
		goto bad2;
	}
	so2 = vp->v_socket;
	if (so2 == NULL) {
		error = ECONNREFUSED;
		goto bad2;
	}
	if (so->so_type != so2->so_type) {
		error = EPROTOTYPE;
		goto bad2;
	}
	if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
		if (so2->so_options & SO_ACCEPTCONN) {
			/*
			 * NB: drop locks here so unp_attach is entered
			 *     w/o locks; this avoids a recursive lock
			 *     of the head and holding sleep locks across
			 *     a (potentially) blocking malloc.
			 */
			UNP_UNLOCK();
			so3 = sonewconn(so2, 0);
			UNP_LOCK();
		} else
			so3 = NULL;
		if (so3 == NULL) {
			error = ECONNREFUSED;
			goto bad2;
		}
		unp = sotounpcb(so);
		unp2 = sotounpcb(so2);
		unp3 = sotounpcb(so3);
		/* Give the new server-side socket the listener's address. */
		if (unp2->unp_addr != NULL) {
			bcopy(unp2->unp_addr, sa, unp2->unp_addr->sun_len);
			unp3->unp_addr = (struct sockaddr_un *) sa;
			sa = NULL;	/* ownership transferred to unp3 */
		}
		/*
		 * unp_peercred management:
		 *
		 * The connecter's (client's) credentials are copied
		 * from its process structure at the time of connect()
		 * (which is now).
		 */
		cru2x(td->td_ucred, &unp3->unp_peercred);
		unp3->unp_flags |= UNP_HAVEPC;
		/*
		 * The receiver's (server's) credentials are copied
		 * from the unp_peercred member of socket on which the
		 * former called listen(); unp_listen() cached that
		 * process's credentials at that time so we can use
		 * them now.
		 */
		KASSERT(unp2->unp_flags & UNP_HAVEPCCACHED,
		    ("unp_connect: listener without cached peercred"));
		memcpy(&unp->unp_peercred, &unp2->unp_peercred,
		    sizeof(unp->unp_peercred));
		unp->unp_flags |= UNP_HAVEPC;
#ifdef MAC
		SOCK_LOCK(so);
		mac_set_socket_peer_from_socket(so, so3);
		mac_set_socket_peer_from_socket(so3, so);
		SOCK_UNLOCK(so);
#endif

		/* Connect to the new child socket, not the listener. */
		so2 = so3;
	}
	error = unp_connect2(so, so2);
bad2:
	UNP_UNLOCK();
	mtx_lock(&Giant);
bad:
	mtx_assert(&Giant, MA_OWNED);
	if (vp != NULL)
		vput(vp);
	mtx_unlock(&Giant);
	free(sa, M_SONAME);
	/* Caller expects the UNP lock to be held on return. */
	UNP_LOCK();
	return (error);
}
912
913static int
914unp_connect2(so, so2)
915	register struct socket *so;
916	register struct socket *so2;
917{
918	register struct unpcb *unp = sotounpcb(so);
919	register struct unpcb *unp2;
920
921	UNP_LOCK_ASSERT();
922
923	if (so2->so_type != so->so_type)
924		return (EPROTOTYPE);
925	unp2 = sotounpcb(so2);
926	unp->unp_conn = unp2;
927	switch (so->so_type) {
928
929	case SOCK_DGRAM:
930		LIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_reflink);
931		soisconnected(so);
932		break;
933
934	case SOCK_STREAM:
935		unp2->unp_conn = unp;
936		soisconnected(so);
937		soisconnected(so2);
938		break;
939
940	default:
941		panic("unp_connect2");
942	}
943	return (0);
944}
945
/*
 * Break the connection between unp and its peer, reversing the linkage
 * established by unp_connect2().  Called with the UNP lock held; a
 * no-op if the pcb is not connected.
 */
static void
unp_disconnect(unp)
	struct unpcb *unp;
{
	register struct unpcb *unp2 = unp->unp_conn;
	struct socket *so;

	UNP_LOCK_ASSERT();

	if (unp2 == NULL)
		return;
	unp->unp_conn = NULL;
	switch (unp->unp_socket->so_type) {

	case SOCK_DGRAM:
		/* One-way link: remove ourselves from the peer's ref list. */
		LIST_REMOVE(unp, unp_reflink);
		so = unp->unp_socket;
		SOCK_LOCK(so);
		so->so_state &= ~SS_ISCONNECTED;
		SOCK_UNLOCK(so);
		break;

	case SOCK_STREAM:
		/* Two-way link: disconnect both endpoints. */
		soisdisconnected(unp->unp_socket);
		unp2->unp_conn = NULL;
		soisdisconnected(unp2->unp_socket);
		break;
	}
}
975
#ifdef notdef
/*
 * Unused: compiled out.  Would abort a pcb by detaching it outright.
 */
void
unp_abort(unp)
	struct unpcb *unp;
{

	unp_detach(unp);
}
#endif
985
986/*
987 * unp_pcblist() assumes that UNIX domain socket memory is never reclaimed
988 * by the zone (UMA_ZONE_NOFREE), and as such potentially stale pointers
989 * are safe to reference.  It first scans the list of struct unpcb's to
990 * generate a pointer list, then it rescans its list one entry at a time to
991 * externalize and copyout.  It checks the generation number to see if a
992 * struct unpcb has been reused, and will skip it if so.
993 */
static int
unp_pcblist(SYSCTL_HANDLER_ARGS)
{
	int error, i, n;
	struct unpcb *unp, **unp_list;
	unp_gen_t gencnt;
	struct xunpgen *xug;
	struct unp_head *head;
	struct xunpcb *xu;

	/* arg1 selects the list: SOCK_DGRAM or SOCK_STREAM. */
	head = ((intptr_t)arg1 == SOCK_DGRAM ? &unp_dhead : &unp_shead);

	/*
	 * The process of preparing the PCB list is too time-consuming and
	 * resource-intensive to repeat twice on every request.
	 */
	if (req->oldptr == NULL) {
		/* Size estimate only, with slack for concurrent growth. */
		n = unp_count;
		req->oldidx = 2 * (sizeof *xug)
			+ (n + n/8) * sizeof(struct xunpcb);
		return (0);
	}

	if (req->newptr != NULL)
		return (EPERM);

	/*
	 * OK, now we're committed to doing something.
	 */
	xug = malloc(sizeof(*xug), M_TEMP, M_WAITOK);
	UNP_LOCK();
	/* Snapshot the generation count to detect reused pcbs later. */
	gencnt = unp_gencnt;
	n = unp_count;
	UNP_UNLOCK();

	xug->xug_len = sizeof *xug;
	xug->xug_count = n;
	xug->xug_gen = gencnt;
	xug->xug_sogen = so_gencnt;
	error = SYSCTL_OUT(req, xug, sizeof *xug);
	if (error) {
		free(xug, M_TEMP);
		return (error);
	}

	unp_list = malloc(n * sizeof *unp_list, M_TEMP, M_WAITOK);

	/* Pass 1: collect pointers under the lock, filtered by cr_cansee(). */
	UNP_LOCK();
	for (unp = LIST_FIRST(head), i = 0; unp && i < n;
	     unp = LIST_NEXT(unp, unp_link)) {
		if (unp->unp_gencnt <= gencnt) {
			if (cr_cansee(req->td->td_ucred,
			    unp->unp_socket->so_cred))
				continue;	/* caller may not see this pcb */
			unp_list[i++] = unp;
		}
	}
	UNP_UNLOCK();
	n = i;			/* in case we lost some during malloc */

	/*
	 * Pass 2: externalize and copy out, unlocked.  Pointers stay safe
	 * because the zone is UMA_ZONE_NOFREE; the generation check skips
	 * pcbs recycled since the snapshot.
	 */
	error = 0;
	xu = malloc(sizeof(*xu), M_TEMP, M_WAITOK);
	for (i = 0; i < n; i++) {
		unp = unp_list[i];
		if (unp->unp_gencnt <= gencnt) {
			xu->xu_len = sizeof *xu;
			xu->xu_unpp = unp;
			/*
			 * XXX - need more locking here to protect against
			 * connect/disconnect races for SMP.
			 */
			if (unp->unp_addr != NULL)
				bcopy(unp->unp_addr, &xu->xu_addr,
				      unp->unp_addr->sun_len);
			if (unp->unp_conn != NULL &&
			    unp->unp_conn->unp_addr != NULL)
				bcopy(unp->unp_conn->unp_addr,
				      &xu->xu_caddr,
				      unp->unp_conn->unp_addr->sun_len);
			bcopy(unp, &xu->xu_unp, sizeof *unp);
			sotoxsocket(unp->unp_socket, &xu->xu_socket);
			error = SYSCTL_OUT(req, xu, sizeof *xu);
		}
	}
	free(xu, M_TEMP);
	if (!error) {
		/*
		 * Give the user an updated idea of our state.
		 * If the generation differs from what we told
		 * her before, she knows that something happened
		 * while we were processing this request, and it
		 * might be necessary to retry.
		 */
		xug->xug_gen = unp_gencnt;
		xug->xug_sogen = so_gencnt;
		xug->xug_count = unp_count;
		error = SYSCTL_OUT(req, xug, sizeof *xug);
	}
	free(unp_list, M_TEMP);
	free(xug, M_TEMP);
	return (error);
}
1096
/*
 * Export the datagram and stream pcb lists via sysctl, both backed by
 * unp_pcblist() with the socket type passed as arg1.
 */
SYSCTL_PROC(_net_local_dgram, OID_AUTO, pcblist, CTLFLAG_RD,
	    (caddr_t)(long)SOCK_DGRAM, 0, unp_pcblist, "S,xunpcb",
	    "List of active local datagram sockets");
SYSCTL_PROC(_net_local_stream, OID_AUTO, pcblist, CTLFLAG_RD,
	    (caddr_t)(long)SOCK_STREAM, 0, unp_pcblist, "S,xunpcb",
	    "List of active local stream sockets");
1103
1104static void
1105unp_shutdown(unp)
1106	struct unpcb *unp;
1107{
1108	struct socket *so;
1109
1110	UNP_LOCK_ASSERT();
1111
1112	if (unp->unp_socket->so_type == SOCK_STREAM && unp->unp_conn &&
1113	    (so = unp->unp_conn->unp_socket))
1114		socantrcvmore(so);
1115}
1116
1117static void
1118unp_drop(unp, errno)
1119	struct unpcb *unp;
1120	int errno;
1121{
1122	struct socket *so = unp->unp_socket;
1123
1124	UNP_LOCK_ASSERT();
1125
1126	so->so_error = errno;
1127	unp_disconnect(unp);
1128}
1129
#ifdef notdef
/*
 * Unused: compiled out.  Placeholder protocol drain routine.
 */
void
unp_drain()
{

}
#endif
1137
1138static void
1139unp_freerights(rp, fdcount)
1140	struct file **rp;
1141	int fdcount;
1142{
1143	int i;
1144	struct file *fp;
1145
1146	for (i = 0; i < fdcount; i++) {
1147		fp = *rp;
1148		/*
1149		 * zero the pointer before calling
1150		 * unp_discard since it may end up
1151		 * in unp_gc()..
1152		 */
1153		*rp++ = 0;
1154		unp_discard(fp);
1155	}
1156}
1157
1158int
1159unp_externalize(control, controlp)
1160	struct mbuf *control, **controlp;
1161{
1162	struct thread *td = curthread;		/* XXX */
1163	struct cmsghdr *cm = mtod(control, struct cmsghdr *);
1164	int i;
1165	int *fdp;
1166	struct file **rp;
1167	struct file *fp;
1168	void *data;
1169	socklen_t clen = control->m_len, datalen;
1170	int error, newfds;
1171	int f;
1172	u_int newlen;
1173
1174	error = 0;
1175	if (controlp != NULL) /* controlp == NULL => free control messages */
1176		*controlp = NULL;
1177
1178	while (cm != NULL) {
1179		if (sizeof(*cm) > clen || cm->cmsg_len > clen) {
1180			error = EINVAL;
1181			break;
1182		}
1183
1184		data = CMSG_DATA(cm);
1185		datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data;
1186
1187		if (cm->cmsg_level == SOL_SOCKET
1188		    && cm->cmsg_type == SCM_RIGHTS) {
1189			newfds = datalen / sizeof(struct file *);
1190			rp = data;
1191
1192			/* If we're not outputting the descriptors free them. */
1193			if (error || controlp == NULL) {
1194				unp_freerights(rp, newfds);
1195				goto next;
1196			}
1197			FILEDESC_LOCK(td->td_proc->p_fd);
1198			/* if the new FD's will not fit free them.  */
1199			if (!fdavail(td, newfds)) {
1200				FILEDESC_UNLOCK(td->td_proc->p_fd);
1201				error = EMSGSIZE;
1202				unp_freerights(rp, newfds);
1203				goto next;
1204			}
1205			/*
1206			 * now change each pointer to an fd in the global
1207			 * table to an integer that is the index to the
1208			 * local fd table entry that we set up to point
1209			 * to the global one we are transferring.
1210			 */
1211			newlen = newfds * sizeof(int);
1212			*controlp = sbcreatecontrol(NULL, newlen,
1213			    SCM_RIGHTS, SOL_SOCKET);
1214			if (*controlp == NULL) {
1215				FILEDESC_UNLOCK(td->td_proc->p_fd);
1216				error = E2BIG;
1217				unp_freerights(rp, newfds);
1218				goto next;
1219			}
1220
1221			fdp = (int *)
1222			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
1223			for (i = 0; i < newfds; i++) {
1224				if (fdalloc(td, 0, &f))
1225					panic("unp_externalize fdalloc failed");
1226				fp = *rp++;
1227				td->td_proc->p_fd->fd_ofiles[f] = fp;
1228				FILE_LOCK(fp);
1229				fp->f_msgcount--;
1230				FILE_UNLOCK(fp);
1231				unp_rights--;
1232				*fdp++ = f;
1233			}
1234			FILEDESC_UNLOCK(td->td_proc->p_fd);
1235		} else { /* We can just copy anything else across */
1236			if (error || controlp == NULL)
1237				goto next;
1238			*controlp = sbcreatecontrol(NULL, datalen,
1239			    cm->cmsg_type, cm->cmsg_level);
1240			if (*controlp == NULL) {
1241				error = ENOBUFS;
1242				goto next;
1243			}
1244			bcopy(data,
1245			    CMSG_DATA(mtod(*controlp, struct cmsghdr *)),
1246			    datalen);
1247		}
1248
1249		controlp = &(*controlp)->m_next;
1250
1251next:
1252		if (CMSG_SPACE(datalen) < clen) {
1253			clen -= CMSG_SPACE(datalen);
1254			cm = (struct cmsghdr *)
1255			    ((caddr_t)cm + CMSG_SPACE(datalen));
1256		} else {
1257			clen = 0;
1258			cm = NULL;
1259		}
1260	}
1261
1262	m_freem(control);
1263
1264	return (error);
1265}
1266
1267void
1268unp_init(void)
1269{
1270	unp_zone = uma_zcreate("unpcb", sizeof(struct unpcb), NULL, NULL,
1271	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
1272	if (unp_zone == NULL)
1273		panic("unp_init");
1274	uma_zone_set_max(unp_zone, nmbclusters);
1275	LIST_INIT(&unp_dhead);
1276	LIST_INIT(&unp_shead);
1277
1278	UNP_LOCK_INIT();
1279}
1280
1281static int
1282unp_internalize(controlp, td)
1283	struct mbuf **controlp;
1284	struct thread *td;
1285{
1286	struct mbuf *control = *controlp;
1287	struct proc *p = td->td_proc;
1288	struct filedesc *fdescp = p->p_fd;
1289	struct cmsghdr *cm = mtod(control, struct cmsghdr *);
1290	struct cmsgcred *cmcred;
1291	struct file **rp;
1292	struct file *fp;
1293	struct timeval *tv;
1294	int i, fd, *fdp;
1295	void *data;
1296	socklen_t clen = control->m_len, datalen;
1297	int error, oldfds;
1298	u_int newlen;
1299
1300	error = 0;
1301	*controlp = NULL;
1302
1303	while (cm != NULL) {
1304		if (sizeof(*cm) > clen || cm->cmsg_level != SOL_SOCKET
1305		    || cm->cmsg_len > clen) {
1306			error = EINVAL;
1307			goto out;
1308		}
1309
1310		data = CMSG_DATA(cm);
1311		datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data;
1312
1313		switch (cm->cmsg_type) {
1314		/*
1315		 * Fill in credential information.
1316		 */
1317		case SCM_CREDS:
1318			*controlp = sbcreatecontrol(NULL, sizeof(*cmcred),
1319			    SCM_CREDS, SOL_SOCKET);
1320			if (*controlp == NULL) {
1321				error = ENOBUFS;
1322				goto out;
1323			}
1324
1325			cmcred = (struct cmsgcred *)
1326			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
1327			cmcred->cmcred_pid = p->p_pid;
1328			cmcred->cmcred_uid = td->td_ucred->cr_ruid;
1329			cmcred->cmcred_gid = td->td_ucred->cr_rgid;
1330			cmcred->cmcred_euid = td->td_ucred->cr_uid;
1331			cmcred->cmcred_ngroups = MIN(td->td_ucred->cr_ngroups,
1332							CMGROUP_MAX);
1333			for (i = 0; i < cmcred->cmcred_ngroups; i++)
1334				cmcred->cmcred_groups[i] =
1335				    td->td_ucred->cr_groups[i];
1336			break;
1337
1338		case SCM_RIGHTS:
1339			oldfds = datalen / sizeof (int);
1340			/*
1341			 * check that all the FDs passed in refer to legal files
1342			 * If not, reject the entire operation.
1343			 */
1344			fdp = data;
1345			FILEDESC_LOCK(fdescp);
1346			for (i = 0; i < oldfds; i++) {
1347				fd = *fdp++;
1348				if ((unsigned)fd >= fdescp->fd_nfiles ||
1349				    fdescp->fd_ofiles[fd] == NULL) {
1350					FILEDESC_UNLOCK(fdescp);
1351					error = EBADF;
1352					goto out;
1353				}
1354				fp = fdescp->fd_ofiles[fd];
1355				if (!(fp->f_ops->fo_flags & DFLAG_PASSABLE)) {
1356					FILEDESC_UNLOCK(fdescp);
1357					error = EOPNOTSUPP;
1358					goto out;
1359				}
1360
1361			}
1362			/*
1363			 * Now replace the integer FDs with pointers to
1364			 * the associated global file table entry..
1365			 */
1366			newlen = oldfds * sizeof(struct file *);
1367			*controlp = sbcreatecontrol(NULL, newlen,
1368			    SCM_RIGHTS, SOL_SOCKET);
1369			if (*controlp == NULL) {
1370				FILEDESC_UNLOCK(fdescp);
1371				error = E2BIG;
1372				goto out;
1373			}
1374
1375			fdp = data;
1376			rp = (struct file **)
1377			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
1378			for (i = 0; i < oldfds; i++) {
1379				fp = fdescp->fd_ofiles[*fdp++];
1380				*rp++ = fp;
1381				FILE_LOCK(fp);
1382				fp->f_count++;
1383				fp->f_msgcount++;
1384				FILE_UNLOCK(fp);
1385				unp_rights++;
1386			}
1387			FILEDESC_UNLOCK(fdescp);
1388			break;
1389
1390		case SCM_TIMESTAMP:
1391			*controlp = sbcreatecontrol(NULL, sizeof(*tv),
1392			    SCM_TIMESTAMP, SOL_SOCKET);
1393			if (*controlp == NULL) {
1394				error = ENOBUFS;
1395				goto out;
1396			}
1397			tv = (struct timeval *)
1398			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
1399			microtime(tv);
1400			break;
1401
1402		default:
1403			error = EINVAL;
1404			goto out;
1405		}
1406
1407		controlp = &(*controlp)->m_next;
1408
1409		if (CMSG_SPACE(datalen) < clen) {
1410			clen -= CMSG_SPACE(datalen);
1411			cm = (struct cmsghdr *)
1412			    ((caddr_t)cm + CMSG_SPACE(datalen));
1413		} else {
1414			clen = 0;
1415			cm = NULL;
1416		}
1417	}
1418
1419out:
1420	m_freem(control);
1421
1422	return (error);
1423}
1424
/* GC state: count of files deferred to the next mark pass, and a re-entrancy guard. */
static int	unp_defer, unp_gcing;
1426
/*
 * Garbage-collect file descriptors that are unreachable except through
 * rights messages queued on (possibly cyclic) in-flight UNIX domain
 * sockets.  Classic mark-and-sweep: the mark phase flags every file
 * reachable from user space, directly or via the receive buffers of
 * marked sockets; the sweep phase takes an extra reference to each
 * unmarked file, flushes any rights its socket holds, and closes it.
 */
static void
unp_gc()
{
	register struct file *fp, *nextfp;
	register struct socket *so;
	struct file **extra_ref, **fpp;
	int nunref, i;
	int nfiles_snap;
	int nfiles_slack = 20;

	UNP_LOCK_ASSERT();

	/* Re-entrancy guard: a collection is already in progress. */
	if (unp_gcing)
		return;
	unp_gcing = 1;
	unp_defer = 0;
	/*
	 * before going through all this, set all FDs to
	 * be NOT defered and NOT externally accessible
	 */
	/*
	 * XXXRW: Acquiring a sleep lock while holding UNP
	 * mutex cannot be a good thing.
	 */
	sx_slock(&filelist_lock);
	LIST_FOREACH(fp, &filehead, f_list)
		fp->f_gcflag &= ~(FMARK|FDEFER);
	/*
	 * Mark phase: iterate until unp_mark() stops deferring new
	 * files for examination (i.e. a fixed point is reached).
	 */
	do {
		LIST_FOREACH(fp, &filehead, f_list) {
			FILE_LOCK(fp);
			/*
			 * If the file is not open, skip it
			 */
			if (fp->f_count == 0) {
				FILE_UNLOCK(fp);
				continue;
			}
			/*
			 * If we already marked it as 'defer'  in a
			 * previous pass, then try process it this time
			 * and un-mark it
			 */
			if (fp->f_gcflag & FDEFER) {
				fp->f_gcflag &= ~FDEFER;
				unp_defer--;
			} else {
				/*
				 * if it's not defered, then check if it's
				 * already marked.. if so skip it
				 */
				if (fp->f_gcflag & FMARK) {
					FILE_UNLOCK(fp);
					continue;
				}
				/*
				 * If all references are from messages
				 * in transit, then skip it. it's not
				 * externally accessible.
				 */
				if (fp->f_count == fp->f_msgcount) {
					FILE_UNLOCK(fp);
					continue;
				}
				/*
				 * If it got this far then it must be
				 * externally accessible.
				 */
				fp->f_gcflag |= FMARK;
			}
			/*
			 * either it was defered, or it is externally
			 * accessible and not already marked so.
			 * Now check if it is possibly one of OUR sockets.
			 */
			if (fp->f_type != DTYPE_SOCKET ||
			    (so = fp->f_data) == NULL) {
				FILE_UNLOCK(fp);
				continue;
			}
			FILE_UNLOCK(fp);
			/* Only sockets that can carry rights matter here. */
			if (so->so_proto->pr_domain != &localdomain ||
			    (so->so_proto->pr_flags&PR_RIGHTS) == 0)
				continue;
#ifdef notdef
			if (so->so_rcv.sb_flags & SB_LOCK) {
				/*
				 * This is problematical; it's not clear
				 * we need to wait for the sockbuf to be
				 * unlocked (on a uniprocessor, at least),
				 * and it's also not clear what to do
				 * if sbwait returns an error due to receipt
				 * of a signal.  If sbwait does return
				 * an error, we'll go into an infinite
				 * loop.  Delete all of this for now.
				 */
				(void) sbwait(&so->so_rcv);
				goto restart;
			}
#endif
			/*
			 * So, Ok, it's one of our sockets and it IS externally
			 * accessible (or was defered). Now we look
			 * to see if we hold any file descriptors in its
			 * message buffers. Follow those links and mark them
			 * as accessible too.
			 */
			SOCKBUF_LOCK(&so->so_rcv);
			unp_scan(so->so_rcv.sb_mb, unp_mark);
			SOCKBUF_UNLOCK(&so->so_rcv);
		}
	} while (unp_defer);
	sx_sunlock(&filelist_lock);
	/*
	 * We grab an extra reference to each of the file table entries
	 * that are not otherwise accessible and then free the rights
	 * that are stored in messages on them.
	 *
	 * The bug in the orginal code is a little tricky, so I'll describe
	 * what's wrong with it here.
	 *
	 * It is incorrect to simply unp_discard each entry for f_msgcount
	 * times -- consider the case of sockets A and B that contain
	 * references to each other.  On a last close of some other socket,
	 * we trigger a gc since the number of outstanding rights (unp_rights)
	 * is non-zero.  If during the sweep phase the gc code un_discards,
	 * we end up doing a (full) closef on the descriptor.  A closef on A
	 * results in the following chain.  Closef calls soo_close, which
	 * calls soclose.   Soclose calls first (through the switch
	 * uipc_usrreq) unp_detach, which re-invokes unp_gc.  Unp_gc simply
	 * returns because the previous instance had set unp_gcing, and
	 * we return all the way back to soclose, which marks the socket
	 * with SS_NOFDREF, and then calls sofree.  Sofree calls sorflush
	 * to free up the rights that are queued in messages on the socket A,
	 * i.e., the reference on B.  The sorflush calls via the dom_dispose
	 * switch unp_dispose, which unp_scans with unp_discard.  This second
	 * instance of unp_discard just calls closef on B.
	 *
	 * Well, a similar chain occurs on B, resulting in a sorflush on B,
	 * which results in another closef on A.  Unfortunately, A is already
	 * being closed, and the descriptor has already been marked with
	 * SS_NOFDREF, and soclose panics at this point.
	 *
	 * Here, we first take an extra reference to each inaccessible
	 * descriptor.  Then, we call sorflush ourself, since we know
	 * it is a Unix domain socket anyhow.  After we destroy all the
	 * rights carried in messages, we do a last closef to get rid
	 * of our extra reference.  This is the last close, and the
	 * unp_detach etc will shut down the socket.
	 *
	 * 91/09/19, bsy@cs.cmu.edu
	 */
again:
	/*
	 * Size the snapshot array with some slack; if files are opened
	 * while we allocate, grow the slack and retry.
	 */
	nfiles_snap = nfiles + nfiles_slack;	/* some slack */
	extra_ref = malloc(nfiles_snap * sizeof(struct file *), M_TEMP,
	    M_WAITOK);
	sx_slock(&filelist_lock);
	if (nfiles_snap < nfiles) {
		sx_sunlock(&filelist_lock);
		free(extra_ref, M_TEMP);
		nfiles_slack += 20;
		goto again;
	}
	for (nunref = 0, fp = LIST_FIRST(&filehead), fpp = extra_ref;
	    fp != NULL; fp = nextfp) {
		nextfp = LIST_NEXT(fp, f_list);
		FILE_LOCK(fp);
		/*
		 * If it's not open, skip it
		 */
		if (fp->f_count == 0) {
			FILE_UNLOCK(fp);
			continue;
		}
		/*
		 * If all refs are from msgs, and it's not marked accessible
		 * then it must be referenced from some unreachable cycle
		 * of (shut-down) FDs, so include it in our
		 * list of FDs to remove
		 */
		if (fp->f_count == fp->f_msgcount && !(fp->f_gcflag & FMARK)) {
			*fpp++ = fp;
			nunref++;
			fp->f_count++;
		}
		FILE_UNLOCK(fp);
	}
	sx_sunlock(&filelist_lock);
	/*
	 * for each FD on our hit list, do the following two things
	 */
	for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) {
		struct file *tfp = *fpp;
		FILE_LOCK(tfp);
		if (tfp->f_type == DTYPE_SOCKET &&
		    tfp->f_data != NULL) {
			FILE_UNLOCK(tfp);
			sorflush(tfp->f_data);
		} else {
			FILE_UNLOCK(tfp);
		}
	}
	/* Drop our extra references; the last close tears the sockets down. */
	for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp)
		closef(*fpp, (struct thread *) NULL);
	free(extra_ref, M_TEMP);
	unp_gcing = 0;
}
1633
1634void
1635unp_dispose(m)
1636	struct mbuf *m;
1637{
1638
1639	if (m)
1640		unp_scan(m, unp_discard);
1641}
1642
1643static int
1644unp_listen(unp, td)
1645	struct unpcb *unp;
1646	struct thread *td;
1647{
1648	UNP_LOCK_ASSERT();
1649
1650	/*
1651	 * XXXRW: Why populate the local peer cred with our own credential?
1652	 */
1653	cru2x(td->td_ucred, &unp->unp_peercred);
1654	unp->unp_flags |= UNP_HAVEPCCACHED;
1655	return (0);
1656}
1657
/*
 * Walk every record in the mbuf chain m0, find SCM_RIGHTS control
 * messages in each record's control mbufs, and apply 'op' (unp_mark or
 * unp_discard) to each struct file pointer carried in them.
 */
static void
unp_scan(m0, op)
	register struct mbuf *m0;
	void (*op)(struct file *);
{
	struct mbuf *m;
	struct file **rp;
	struct cmsghdr *cm;
	void *data;
	int i;
	socklen_t clen, datalen;
	int qfds;

	while (m0 != NULL) {
		for (m = m0; m; m = m->m_next) {
			/* Rights only ever appear in control mbufs. */
			if (m->m_type != MT_CONTROL)
				continue;

			cm = mtod(m, struct cmsghdr *);
			clen = m->m_len;

			while (cm != NULL) {
				if (sizeof(*cm) > clen || cm->cmsg_len > clen)
					break;

				data = CMSG_DATA(cm);
				datalen = (caddr_t)cm + cm->cmsg_len
				    - (caddr_t)data;

				if (cm->cmsg_level == SOL_SOCKET &&
				    cm->cmsg_type == SCM_RIGHTS) {
					qfds = datalen / sizeof (struct file *);
					rp = data;
					for (i = 0; i < qfds; i++)
						(*op)(*rp++);
				}

				/* Step to the next (aligned) cmsg, if any. */
				if (CMSG_SPACE(datalen) < clen) {
					clen -= CMSG_SPACE(datalen);
					cm = (struct cmsghdr *)
					    ((caddr_t)cm + CMSG_SPACE(datalen));
				} else {
					clen = 0;
					cm = NULL;
				}
			}
		}
		/* Next record in the chain. */
		m0 = m0->m_act;
	}
}
1708
1709static void
1710unp_mark(fp)
1711	struct file *fp;
1712{
1713	if (fp->f_gcflag & FMARK)
1714		return;
1715	unp_defer++;
1716	fp->f_gcflag |= (FMARK|FDEFER);
1717}
1718
/*
 * Drop one in-transit message reference on a file: decrement its
 * message count and the global rights count, then closef() to release
 * the f_count reference taken by unp_internalize().
 */
static void
unp_discard(fp)
	struct file *fp;
{
	FILE_LOCK(fp);
	fp->f_msgcount--;
	unp_rights--;
	FILE_UNLOCK(fp);
	(void) closef(fp, (struct thread *)NULL);
}
1729