uipc_usrreq.c revision 145312
1/*-
2 * Copyright 2004-2005 Robert N. M. Watson
3 * Copyright (c) 1982, 1986, 1989, 1991, 1993
4 *	The Regents of the University of California.  All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 * 4. Neither the name of the University nor the names of its contributors
15 *    may be used to endorse or promote products derived from this software
16 *    without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 * SUCH DAMAGE.
29 *
30 *	From: @(#)uipc_usrreq.c	8.3 (Berkeley) 1/4/94
31 */
32
33#include <sys/cdefs.h>
34__FBSDID("$FreeBSD: head/sys/kern/uipc_usrreq.c 145312 2005-04-20 02:57:56Z mdodd $");
35
36#include "opt_mac.h"
37
38#include <sys/param.h>
39#include <sys/domain.h>
40#include <sys/fcntl.h>
41#include <sys/malloc.h>		/* XXX must be before <sys/file.h> */
42#include <sys/file.h>
43#include <sys/filedesc.h>
44#include <sys/jail.h>
45#include <sys/kernel.h>
46#include <sys/lock.h>
47#include <sys/mac.h>
48#include <sys/mbuf.h>
49#include <sys/mutex.h>
50#include <sys/namei.h>
51#include <sys/proc.h>
52#include <sys/protosw.h>
53#include <sys/resourcevar.h>
54#include <sys/socket.h>
55#include <sys/socketvar.h>
56#include <sys/signalvar.h>
57#include <sys/stat.h>
58#include <sys/sx.h>
59#include <sys/sysctl.h>
60#include <sys/systm.h>
61#include <sys/un.h>
62#include <sys/unpcb.h>
63#include <sys/vnode.h>
64
65#include <vm/uma.h>
66
static uma_zone_t unp_zone;		/* UMA zone unpcbs are allocated from */
static	unp_gen_t unp_gencnt;		/* bumped on every attach and detach */
static	u_int unp_count;		/* number of currently attached unpcbs */

/* Lists of attached stream (unp_shead) and datagram (unp_dhead) unpcbs. */
static	struct unp_head unp_shead, unp_dhead;
72
73/*
74 * Unix communications domain.
75 *
76 * TODO:
77 *	SEQPACKET, RDM
78 *	rethink name space problems
79 *	need a proper out-of-band
80 *	lock pushdown
81 */
/* Placeholder address handed back when a socket has no bound name. */
static const struct	sockaddr sun_noname = { sizeof(sun_noname), AF_LOCAL };
static ino_t	unp_ino;		/* prototype for fake inode numbers */
/*
 * NOTE(review): defined outside this view; used by uipc_send() to obtain
 * the control mbuf chain to deliver when the peer has UNP_WANTCRED set.
 */
struct mbuf *unp_addsockcred(struct thread *, struct mbuf *);
85
86/*
87 * Currently, UNIX domain sockets are protected by a single subsystem lock,
88 * which covers global data structures and variables, the contents of each
89 * per-socket unpcb structure, and the so_pcb field in sockets attached to
 * the UNIX domain.  This provides for a moderate degree of parallelism, as
91 * receive operations on UNIX domain sockets do not need to acquire the
92 * subsystem lock.  Finer grained locking to permit send() without acquiring
93 * a global lock would be a logical next step.
94 *
 * The UNIX domain socket lock precedes all socket layer locks, including the
96 * socket lock and socket buffer lock, permitting UNIX domain socket code to
97 * call into socket support routines without releasing its locks.
98 *
99 * Some caution is required in areas where the UNIX domain socket code enters
100 * VFS in order to create or find rendezvous points.  This results in
101 * dropping of the UNIX domain socket subsystem lock, acquisition of the
102 * Giant lock, and potential sleeping.  This increases the chances of races,
103 * and exposes weaknesses in the socket->protocol API by offering poor
104 * failure modes.
105 */
/* The single UNIX domain socket subsystem mutex and its wrapper macros. */
static struct mtx unp_mtx;
/* Initialize the subsystem mutex as a default (MTX_DEF) mutex named "unp". */
#define	UNP_LOCK_INIT() \
	mtx_init(&unp_mtx, "unp", NULL, MTX_DEF)
#define	UNP_LOCK()		mtx_lock(&unp_mtx)
#define	UNP_UNLOCK()		mtx_unlock(&unp_mtx)
/* Assert that the current thread does / does not own the subsystem mutex. */
#define	UNP_LOCK_ASSERT()	mtx_assert(&unp_mtx, MA_OWNED)
#define	UNP_UNLOCK_ASSERT()	mtx_assert(&unp_mtx, MA_NOTOWNED)
113
/* Forward declarations of the file-local unpcb helper routines. */
static int     unp_attach(struct socket *);
static void    unp_detach(struct unpcb *);
static int     unp_bind(struct unpcb *,struct sockaddr *, struct thread *);
static int     unp_connect(struct socket *,struct sockaddr *, struct thread *);
static int     unp_connect2(struct socket *so, struct socket *so2, int);
static void    unp_disconnect(struct unpcb *);
static void    unp_shutdown(struct unpcb *);
static void    unp_drop(struct unpcb *, int);
static void    unp_gc(void);
static void    unp_scan(struct mbuf *, void (*)(struct file *));
static void    unp_mark(struct file *);
static void    unp_discard(struct file *);
static void    unp_freerights(struct file **, int);
static int     unp_internalize(struct mbuf **, struct thread *);
static int     unp_listen(struct socket *, struct unpcb *, struct thread *);
129
130static int
131uipc_abort(struct socket *so)
132{
133	struct unpcb *unp;
134
135	UNP_LOCK();
136	unp = sotounpcb(so);
137	if (unp == NULL) {
138		UNP_UNLOCK();
139		return (EINVAL);
140	}
141	unp_drop(unp, ECONNABORTED);
142	unp_detach(unp);
143	UNP_UNLOCK_ASSERT();
144	ACCEPT_LOCK();
145	SOCK_LOCK(so);
146	sotryfree(so);
147	return (0);
148}
149
150static int
151uipc_accept(struct socket *so, struct sockaddr **nam)
152{
153	struct unpcb *unp;
154	const struct sockaddr *sa;
155
156	/*
157	 * Pass back name of connected socket,
158	 * if it was bound and we are still connected
159	 * (our peer may have closed already!).
160	 */
161	*nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
162	UNP_LOCK();
163	unp = sotounpcb(so);
164	if (unp == NULL) {
165		UNP_UNLOCK();
166		free(*nam, M_SONAME);
167		*nam = NULL;
168		return (EINVAL);
169	}
170	if (unp->unp_conn != NULL && unp->unp_conn->unp_addr != NULL)
171		sa = (struct sockaddr *) unp->unp_conn->unp_addr;
172	else
173		sa = &sun_noname;
174	bcopy(sa, *nam, sa->sa_len);
175	UNP_UNLOCK();
176	return (0);
177}
178
179static int
180uipc_attach(struct socket *so, int proto, struct thread *td)
181{
182	struct unpcb *unp = sotounpcb(so);
183
184	if (unp != NULL)
185		return (EISCONN);
186	return (unp_attach(so));
187}
188
189static int
190uipc_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
191{
192	struct unpcb *unp;
193	int error;
194
195	UNP_LOCK();
196	unp = sotounpcb(so);
197	if (unp == NULL) {
198		UNP_UNLOCK();
199		return (EINVAL);
200	}
201	error = unp_bind(unp, nam, td);
202	UNP_UNLOCK();
203	return (error);
204}
205
206static int
207uipc_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
208{
209	struct unpcb *unp;
210	int error;
211
212	KASSERT(td == curthread, ("uipc_connect: td != curthread"));
213
214	UNP_LOCK();
215	unp = sotounpcb(so);
216	if (unp == NULL) {
217		UNP_UNLOCK();
218		return (EINVAL);
219	}
220	error = unp_connect(so, nam, td);
221	UNP_UNLOCK();
222	return (error);
223}
224
225int
226uipc_connect2(struct socket *so1, struct socket *so2)
227{
228	struct unpcb *unp;
229	int error;
230
231	UNP_LOCK();
232	unp = sotounpcb(so1);
233	if (unp == NULL) {
234		UNP_UNLOCK();
235		return (EINVAL);
236	}
237	error = unp_connect2(so1, so2, PRU_CONNECT2);
238	UNP_UNLOCK();
239	return (error);
240}
241
242/* control is EOPNOTSUPP */
243
244static int
245uipc_detach(struct socket *so)
246{
247	struct unpcb *unp;
248
249	UNP_LOCK();
250	unp = sotounpcb(so);
251	if (unp == NULL) {
252		UNP_UNLOCK();
253		return (EINVAL);
254	}
255	unp_detach(unp);
256	UNP_UNLOCK_ASSERT();
257	return (0);
258}
259
260static int
261uipc_disconnect(struct socket *so)
262{
263	struct unpcb *unp;
264
265	UNP_LOCK();
266	unp = sotounpcb(so);
267	if (unp == NULL) {
268		UNP_UNLOCK();
269		return (EINVAL);
270	}
271	unp_disconnect(unp);
272	UNP_UNLOCK();
273	return (0);
274}
275
276static int
277uipc_listen(struct socket *so, struct thread *td)
278{
279	struct unpcb *unp;
280	int error;
281
282	UNP_LOCK();
283	unp = sotounpcb(so);
284	if (unp == NULL || unp->unp_vnode == NULL) {
285		UNP_UNLOCK();
286		return (EINVAL);
287	}
288	error = unp_listen(so, unp, td);
289	UNP_UNLOCK();
290	return (error);
291}
292
293static int
294uipc_peeraddr(struct socket *so, struct sockaddr **nam)
295{
296	struct unpcb *unp;
297	const struct sockaddr *sa;
298
299	*nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
300	UNP_LOCK();
301	unp = sotounpcb(so);
302	if (unp == NULL) {
303		UNP_UNLOCK();
304		free(*nam, M_SONAME);
305		*nam = NULL;
306		return (EINVAL);
307	}
308	if (unp->unp_conn != NULL && unp->unp_conn->unp_addr!= NULL)
309		sa = (struct sockaddr *) unp->unp_conn->unp_addr;
310	else {
311		/*
312		 * XXX: It seems that this test always fails even when
313		 * connection is established.  So, this else clause is
314		 * added as workaround to return PF_LOCAL sockaddr.
315		 */
316		sa = &sun_noname;
317	}
318	bcopy(sa, *nam, sa->sa_len);
319	UNP_UNLOCK();
320	return (0);
321}
322
323static int
324uipc_rcvd(struct socket *so, int flags)
325{
326	struct unpcb *unp;
327	struct socket *so2;
328	u_long newhiwat;
329
330	UNP_LOCK();
331	unp = sotounpcb(so);
332	if (unp == NULL) {
333		UNP_UNLOCK();
334		return (EINVAL);
335	}
336	switch (so->so_type) {
337	case SOCK_DGRAM:
338		panic("uipc_rcvd DGRAM?");
339		/*NOTREACHED*/
340
341	case SOCK_STREAM:
342		if (unp->unp_conn == NULL)
343			break;
344		so2 = unp->unp_conn->unp_socket;
345		SOCKBUF_LOCK(&so2->so_snd);
346		SOCKBUF_LOCK(&so->so_rcv);
347		/*
348		 * Adjust backpressure on sender
349		 * and wakeup any waiting to write.
350		 */
351		so2->so_snd.sb_mbmax += unp->unp_mbcnt - so->so_rcv.sb_mbcnt;
352		unp->unp_mbcnt = so->so_rcv.sb_mbcnt;
353		newhiwat = so2->so_snd.sb_hiwat + unp->unp_cc -
354		    so->so_rcv.sb_cc;
355		(void)chgsbsize(so2->so_cred->cr_uidinfo, &so2->so_snd.sb_hiwat,
356		    newhiwat, RLIM_INFINITY);
357		unp->unp_cc = so->so_rcv.sb_cc;
358		SOCKBUF_UNLOCK(&so->so_rcv);
359		sowwakeup_locked(so2);
360		break;
361
362	default:
363		panic("uipc_rcvd unknown socktype");
364	}
365	UNP_UNLOCK();
366	return (0);
367}
368
369/* pru_rcvoob is EOPNOTSUPP */
370
/*
 * pru_send: deliver the data chain m (and optional control chain) to the
 * peer's receive buffer.
 *
 * - PRUS_OOB is rejected with EOPNOTSUPP.
 * - Control data is internalized by unp_internalize() before the UNP lock
 *   is taken.
 * - SOCK_DGRAM: when nam is given the socket must be unconnected; it is
 *   connected for the duration of this send and disconnected afterwards.
 *   The datagram is appended together with the sender's address (or
 *   sun_noname); ENOBUFS is returned if the append fails.
 * - SOCK_STREAM: connects first if needed, appends to the peer's receive
 *   buffer and shrinks this socket's send-buffer limits by the amount now
 *   queued at the peer.
 * - PRUS_EOF additionally shuts down the sending side.
 *
 * Whatever is left of m and control at exit is freed here; control is also
 * passed to unp_dispose() first when an error occurred after it was
 * internalized.
 */
static int
uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
    struct mbuf *control, struct thread *td)
{
	int error = 0;
	struct unpcb *unp;
	struct socket *so2;
	u_long newhiwat;

	/* Unlocked early check; repeated below once the UNP lock is held. */
	unp = sotounpcb(so);
	if (unp == NULL) {
		error = EINVAL;
		goto release;
	}
	if (flags & PRUS_OOB) {
		error = EOPNOTSUPP;
		goto release;
	}

	if (control != NULL && (error = unp_internalize(&control, td)))
		goto release;

	UNP_LOCK();
	unp = sotounpcb(so);
	if (unp == NULL) {
		UNP_UNLOCK();
		error = EINVAL;
		goto dispose_release;
	}

	switch (so->so_type) {
	case SOCK_DGRAM:
	{
		const struct sockaddr *from;

		if (nam != NULL) {
			if (unp->unp_conn != NULL) {
				error = EISCONN;
				break;
			}
			/* Temporary connection, undone after the append. */
			error = unp_connect(so, nam, td);
			if (error)
				break;
		} else {
			if (unp->unp_conn == NULL) {
				error = ENOTCONN;
				break;
			}
		}
		so2 = unp->unp_conn->unp_socket;
		if (unp->unp_addr != NULL)
			from = (struct sockaddr *)unp->unp_addr;
		else
			from = &sun_noname;
		if (unp->unp_conn->unp_flags & UNP_WANTCRED)
			control = unp_addsockcred(td, control);
		SOCKBUF_LOCK(&so2->so_rcv);
		if (sbappendaddr_locked(&so2->so_rcv, from, m, control)) {
			sorwakeup_locked(so2);
			/* Both chains now belong to the receive buffer. */
			m = NULL;
			control = NULL;
		} else {
			SOCKBUF_UNLOCK(&so2->so_rcv);
			error = ENOBUFS;
		}
		if (nam != NULL)
			unp_disconnect(unp);
		break;
	}

	case SOCK_STREAM:
		/* Connect if not connected yet. */
		/*
		 * Note: A better implementation would complain
		 * if not equal to the peer's address.
		 */
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			if (nam != NULL) {
				error = unp_connect(so, nam, td);
				if (error)
					break;	/* XXX */
			} else {
				error = ENOTCONN;
				break;
			}
		}

		SOCKBUF_LOCK(&so->so_snd);
		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = EPIPE;
			break;
		}
		if (unp->unp_conn == NULL)
			panic("uipc_send connected but no connection?");
		so2 = unp->unp_conn->unp_socket;
		SOCKBUF_LOCK(&so2->so_rcv);
		if (unp->unp_conn->unp_flags & UNP_WANTCRED) {
			/*
			 * Credentials are passed only once on
			 * SOCK_STREAM.
			 */
			unp->unp_conn->unp_flags &= ~UNP_WANTCRED;
			control = unp_addsockcred(td, control);
		}
		/*
		 * Send to paired receive port, and then reduce
		 * send buffer hiwater marks to maintain backpressure.
		 * Wake up readers.
		 */
		if (control != NULL) {
			/*
			 * NOTE(review): if this append fails, control stays
			 * set (and is freed at "release"), but m is still
			 * cleared below and error stays 0 -- confirm whether
			 * m is leaked on that path.
			 */
			if (sbappendcontrol_locked(&so2->so_rcv, m, control))
				control = NULL;
		} else {
			sbappend_locked(&so2->so_rcv, m);
		}
		so->so_snd.sb_mbmax -=
			so2->so_rcv.sb_mbcnt - unp->unp_conn->unp_mbcnt;
		unp->unp_conn->unp_mbcnt = so2->so_rcv.sb_mbcnt;
		newhiwat = so->so_snd.sb_hiwat -
		    (so2->so_rcv.sb_cc - unp->unp_conn->unp_cc);
		(void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_snd.sb_hiwat,
		    newhiwat, RLIM_INFINITY);
		SOCKBUF_UNLOCK(&so->so_snd);
		unp->unp_conn->unp_cc = so2->so_rcv.sb_cc;
		sorwakeup_locked(so2);
		m = NULL;
		break;

	default:
		panic("uipc_send unknown socktype");
	}

	/*
	 * SEND_EOF is equivalent to a SEND followed by
	 * a SHUTDOWN.
	 */
	if (flags & PRUS_EOF) {
		socantsendmore(so);
		unp_shutdown(unp);
	}
	UNP_UNLOCK();

dispose_release:
	/* Control was internalized but not delivered: dispose of it first. */
	if (control != NULL && error != 0)
		unp_dispose(control);

release:
	if (control != NULL)
		m_freem(control);
	if (m != NULL)
		m_freem(m);
	return (error);
}
525
526static int
527uipc_sense(struct socket *so, struct stat *sb)
528{
529	struct unpcb *unp;
530	struct socket *so2;
531
532	UNP_LOCK();
533	unp = sotounpcb(so);
534	if (unp == NULL) {
535		UNP_UNLOCK();
536		return (EINVAL);
537	}
538	sb->st_blksize = so->so_snd.sb_hiwat;
539	if (so->so_type == SOCK_STREAM && unp->unp_conn != NULL) {
540		so2 = unp->unp_conn->unp_socket;
541		sb->st_blksize += so2->so_rcv.sb_cc;
542	}
543	sb->st_dev = NODEV;
544	if (unp->unp_ino == 0)
545		unp->unp_ino = (++unp_ino == 0) ? ++unp_ino : unp_ino;
546	sb->st_ino = unp->unp_ino;
547	UNP_UNLOCK();
548	return (0);
549}
550
551static int
552uipc_shutdown(struct socket *so)
553{
554	struct unpcb *unp;
555
556	UNP_LOCK();
557	unp = sotounpcb(so);
558	if (unp == NULL) {
559		UNP_UNLOCK();
560		return (EINVAL);
561	}
562	socantsendmore(so);
563	unp_shutdown(unp);
564	UNP_UNLOCK();
565	return (0);
566}
567
568static int
569uipc_sockaddr(struct socket *so, struct sockaddr **nam)
570{
571	struct unpcb *unp;
572	const struct sockaddr *sa;
573
574	*nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
575	UNP_LOCK();
576	unp = sotounpcb(so);
577	if (unp == NULL) {
578		UNP_UNLOCK();
579		free(*nam, M_SONAME);
580		*nam = NULL;
581		return (EINVAL);
582	}
583	if (unp->unp_addr != NULL)
584		sa = (struct sockaddr *) unp->unp_addr;
585	else
586		sa = &sun_noname;
587	bcopy(sa, *nam, sa->sa_len);
588	UNP_UNLOCK();
589	return (0);
590}
591
/*
 * User-request dispatch table for UNIX domain sockets.  Each entry points
 * at the uipc_* handler defined in this file, except sosend, soreceive and
 * sopoll, which are not defined in this file.  Requests with no entry here
 * (e.g. control and rcvoob, per the comments in this file) are EOPNOTSUPP.
 */
struct pr_usrreqs uipc_usrreqs = {
	.pru_abort = 		uipc_abort,
	.pru_accept =		uipc_accept,
	.pru_attach =		uipc_attach,
	.pru_bind =		uipc_bind,
	.pru_connect =		uipc_connect,
	.pru_connect2 =		uipc_connect2,
	.pru_detach =		uipc_detach,
	.pru_disconnect =	uipc_disconnect,
	.pru_listen =		uipc_listen,
	.pru_peeraddr =		uipc_peeraddr,
	.pru_rcvd =		uipc_rcvd,
	.pru_send =		uipc_send,
	.pru_sense =		uipc_sense,
	.pru_shutdown =		uipc_shutdown,
	.pru_sockaddr =		uipc_sockaddr,
	.pru_sosend =		sosend,
	.pru_soreceive =	soreceive,
	.pru_sopoll =		sopoll,
};
612
613int
614uipc_ctloutput(struct socket *so, struct sockopt *sopt)
615{
616	struct unpcb *unp;
617	struct xucred xu;
618	int error, optval;
619
620	if (sopt->sopt_level != 0)
621		return (EINVAL);
622
623	UNP_LOCK();
624	unp = sotounpcb(so);
625	if (unp == NULL) {
626		UNP_UNLOCK();
627		return (EINVAL);
628	}
629	error = 0;
630
631	switch (sopt->sopt_dir) {
632	case SOPT_GET:
633		switch (sopt->sopt_name) {
634		case LOCAL_PEERCRED:
635			if (unp->unp_flags & UNP_HAVEPC)
636				xu = unp->unp_peercred;
637			else {
638				if (so->so_type == SOCK_STREAM)
639					error = ENOTCONN;
640				else
641					error = EINVAL;
642			}
643			if (error == 0)
644				error = sooptcopyout(sopt, &xu, sizeof(xu));
645			break;
646		case LOCAL_CREDS:
647			optval = unp->unp_flags & UNP_WANTCRED ? 1 : 0;
648			error = sooptcopyout(sopt, &optval, sizeof(optval));
649			break;
650		case LOCAL_CONNWAIT:
651			optval = unp->unp_flags & UNP_CONNWAIT ? 1 : 0;
652			error = sooptcopyout(sopt, &optval, sizeof(optval));
653			break;
654		default:
655			error = EOPNOTSUPP;
656			break;
657		}
658		break;
659	case SOPT_SET:
660		switch (sopt->sopt_name) {
661		case LOCAL_CREDS:
662		case LOCAL_CONNWAIT:
663			error = sooptcopyin(sopt, &optval, sizeof(optval),
664					    sizeof(optval));
665			if (error)
666				break;
667
668#define	OPTSET(bit) \
669	if (optval) \
670		unp->unp_flags |= bit; \
671	else \
672		unp->unp_flags &= ~bit;
673
674			switch (sopt->sopt_name) {
675			case LOCAL_CREDS:
676				OPTSET(UNP_WANTCRED);
677				break;
678			case LOCAL_CONNWAIT:
679				OPTSET(UNP_CONNWAIT);
680				break;
681			default:
682				break;
683			}
684			break;
685#undef	OPTSET
686		default:
687			error = ENOPROTOOPT;
688			break;
689		}
690	default:
691		error = EOPNOTSUPP;
692		break;
693	}
694	UNP_UNLOCK();
695	return (error);
696}
697
698/*
699 * Both send and receive buffers are allocated PIPSIZ bytes of buffering
700 * for stream sockets, although the total for sender and receiver is
701 * actually only PIPSIZ.
702 * Datagram sockets really use the sendspace as the maximum datagram size,
703 * and don't really want to reserve the sendspace.  Their recvspace should
704 * be large enough for at least one max-size datagram plus address.
705 */
706#ifndef PIPSIZ
707#define	PIPSIZ	8192
708#endif
709static u_long	unpst_sendspace = PIPSIZ;
710static u_long	unpst_recvspace = PIPSIZ;
711static u_long	unpdg_sendspace = 2*1024;	/* really max datagram size */
712static u_long	unpdg_recvspace = 4*1024;
713
714static int	unp_rights;			/* file descriptors in flight */
715
716SYSCTL_DECL(_net_local_stream);
717SYSCTL_INT(_net_local_stream, OID_AUTO, sendspace, CTLFLAG_RW,
718	   &unpst_sendspace, 0, "");
719SYSCTL_INT(_net_local_stream, OID_AUTO, recvspace, CTLFLAG_RW,
720	   &unpst_recvspace, 0, "");
721SYSCTL_DECL(_net_local_dgram);
722SYSCTL_INT(_net_local_dgram, OID_AUTO, maxdgram, CTLFLAG_RW,
723	   &unpdg_sendspace, 0, "");
724SYSCTL_INT(_net_local_dgram, OID_AUTO, recvspace, CTLFLAG_RW,
725	   &unpdg_recvspace, 0, "");
726SYSCTL_DECL(_net_local);
727SYSCTL_INT(_net_local, OID_AUTO, inflight, CTLFLAG_RD, &unp_rights, 0, "");
728
729static int
730unp_attach(struct socket *so)
731{
732	struct unpcb *unp;
733	int error;
734
735	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
736		switch (so->so_type) {
737
738		case SOCK_STREAM:
739			error = soreserve(so, unpst_sendspace, unpst_recvspace);
740			break;
741
742		case SOCK_DGRAM:
743			error = soreserve(so, unpdg_sendspace, unpdg_recvspace);
744			break;
745
746		default:
747			panic("unp_attach");
748		}
749		if (error)
750			return (error);
751	}
752	unp = uma_zalloc(unp_zone, M_WAITOK | M_ZERO);
753	if (unp == NULL)
754		return (ENOBUFS);
755	LIST_INIT(&unp->unp_refs);
756	unp->unp_socket = so;
757	so->so_pcb = unp;
758
759	UNP_LOCK();
760	unp->unp_gencnt = ++unp_gencnt;
761	unp_count++;
762	LIST_INSERT_HEAD(so->so_type == SOCK_DGRAM ? &unp_dhead
763			 : &unp_shead, unp, unp_link);
764	UNP_UNLOCK();
765
766	return (0);
767}
768
/*
 * Tear down a unpcb: unlink it from the global list, detach it from its
 * vnode, disconnect it from its peer, drop every pcb that still refers to
 * it, clear the socket's so_pcb and free the pcb and its bound address.
 *
 * Must be called with the UNP lock held; returns with the lock RELEASED
 * (either directly or by unp_gc()).
 */
static void
unp_detach(struct unpcb *unp)
{
	struct vnode *vp;

	UNP_LOCK_ASSERT();

	LIST_REMOVE(unp, unp_link);
	unp->unp_gencnt = ++unp_gencnt;
	--unp_count;
	/* Remember the vnode so it can be released after the pcb is freed. */
	if ((vp = unp->unp_vnode) != NULL) {
		/*
		 * XXXRW: should v_socket be frobbed only while holding
		 * Giant?
		 */
		unp->unp_vnode->v_socket = NULL;
		unp->unp_vnode = NULL;
	}
	if (unp->unp_conn != NULL)
		unp_disconnect(unp);
	/* Each unp_drop() disconnects ref, removing it from unp_refs. */
	while (!LIST_EMPTY(&unp->unp_refs)) {
		struct unpcb *ref = LIST_FIRST(&unp->unp_refs);
		unp_drop(ref, ECONNRESET);
	}
	soisdisconnected(unp->unp_socket);
	unp->unp_socket->so_pcb = NULL;
	if (unp_rights) {
		/*
		 * Normally the receive buffer is flushed later,
		 * in sofree, but if our receive buffer holds references
		 * to descriptors that are now garbage, we will dispose
		 * of those descriptor references after the garbage collector
		 * gets them (resulting in a "panic: closef: count < 0").
		 */
		sorflush(unp->unp_socket);
		unp_gc();	/* Will unlock UNP. */
	} else
		UNP_UNLOCK();
	UNP_UNLOCK_ASSERT();
	if (unp->unp_addr != NULL)
		FREE(unp->unp_addr, M_SONAME);
	uma_zfree(unp_zone, unp);
	/* The vnode reference is released under Giant, after the UNP lock. */
	if (vp) {
		mtx_lock(&Giant);
		vrele(vp);
		mtx_unlock(&Giant);
	}
}
817
/*
 * Bind a unpcb to a filesystem pathname by creating a VSOCK vnode there.
 *
 * Called with the UNP lock held and returns with it held on every path,
 * but the lock is dropped while the name is copied, looked up and created
 * under Giant.
 *
 * Returns EINVAL if the pcb is already bound or the path is empty,
 * EADDRINUSE if the name already exists, or the error from namei(),
 * vn_start_write(), the MAC check or VOP_CREATE().
 */
static int
unp_bind(struct unpcb *unp, struct sockaddr *nam, struct thread *td)
{
	struct sockaddr_un *soun = (struct sockaddr_un *)nam;
	struct vnode *vp;
	struct mount *mp;
	struct vattr vattr;
	int error, namelen;
	struct nameidata nd;
	char *buf;

	UNP_LOCK_ASSERT();

	/*
	 * XXXRW: This test-and-set of unp_vnode is non-atomic; the
	 * unlocked read here is fine, but the value of unp_vnode needs
	 * to be tested again after we do all the lookups to see if the
	 * pcb is still unbound?
	 */
	if (unp->unp_vnode != NULL)
		return (EINVAL);

	/* Length of the path portion of the supplied address. */
	namelen = soun->sun_len - offsetof(struct sockaddr_un, sun_path);
	if (namelen <= 0)
		return (EINVAL);

	UNP_UNLOCK();

	/* Private NUL-terminated copy of the path for namei(). */
	buf = malloc(namelen + 1, M_TEMP, M_WAITOK);
	strlcpy(buf, soun->sun_path, namelen + 1);

	mtx_lock(&Giant);
restart:
	mtx_assert(&Giant, MA_OWNED);
	NDINIT(&nd, CREATE, NOFOLLOW | LOCKPARENT | SAVENAME, UIO_SYSSPACE,
	    buf, td);
/* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */
	error = namei(&nd);
	if (error)
		goto done;
	vp = nd.ni_vp;
	/*
	 * Either the name already exists, or the non-blocking
	 * vn_start_write() failed: release the lookup results, then fail
	 * with EADDRINUSE or wait for write access and retry the lookup.
	 */
	if (vp != NULL || vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
		NDFREE(&nd, NDF_ONLY_PNBUF);
		if (nd.ni_dvp == vp)
			vrele(nd.ni_dvp);
		else
			vput(nd.ni_dvp);
		if (vp != NULL) {
			vrele(vp);
			error = EADDRINUSE;
			goto done;
		}
		error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH);
		if (error)
			goto done;
		goto restart;
	}
	VATTR_NULL(&vattr);
	vattr.va_type = VSOCK;
	/* Creation mode is ACCESSPERMS masked by the process's umask. */
	vattr.va_mode = (ACCESSPERMS & ~td->td_proc->p_fd->fd_cmask);
#ifdef MAC
	error = mac_check_vnode_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
	    &vattr);
#endif
	if (error == 0) {
		VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE);
		error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
	}
	NDFREE(&nd, NDF_ONLY_PNBUF);
	vput(nd.ni_dvp);
	if (error) {
		vn_finished_write(mp);
		goto done;
	}
	vp = nd.ni_vp;
	ASSERT_VOP_LOCKED(vp, "unp_bind");
	/* Duplicate the address before retaking the UNP lock. */
	soun = (struct sockaddr_un *)sodupsockaddr(nam, M_WAITOK);
	UNP_LOCK();
	/* Cross-link the vnode and the socket and record the bound name. */
	vp->v_socket = unp->unp_socket;
	unp->unp_vnode = vp;
	unp->unp_addr = soun;
	UNP_UNLOCK();
	VOP_UNLOCK(vp, 0, td);
	vn_finished_write(mp);
done:
	mtx_unlock(&Giant);
	free(buf, M_TEMP);
	/* Restore the caller's expectation that the UNP lock is held. */
	UNP_LOCK();
	return (error);
}
908
/*
 * Connect a socket to the UNIX domain socket bound at the pathname in nam.
 *
 * Called with the UNP lock held and returns with it held on every path,
 * but the lock is dropped around the VFS lookup (done under Giant) and
 * around sonewconn().
 *
 * For connection-oriented sockets a new socket (so3) is spawned from the
 * listening socket; it inherits the listener's bound address, peer
 * credentials are recorded on both ends, and so is then connected to so3
 * rather than to the listener itself.
 *
 * Returns EINVAL for an empty path or a vanished pcb, ENOTSOCK if the
 * name is not a socket, ECONNREFUSED if nothing is listening there,
 * EPROTOTYPE on a socket type mismatch, or the error from namei(),
 * VOP_ACCESS() or unp_connect2().
 */
static int
unp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	struct sockaddr_un *soun = (struct sockaddr_un *)nam;
	struct vnode *vp;
	struct socket *so2, *so3;
	struct unpcb *unp, *unp2, *unp3;
	int error, len;
	struct nameidata nd;
	char buf[SOCK_MAXADDRLEN];
	struct sockaddr *sa;

	UNP_LOCK_ASSERT();
	unp = sotounpcb(so);

	/* Length of the path portion of the supplied address. */
	len = nam->sa_len - offsetof(struct sockaddr_un, sun_path);
	if (len <= 0)
		return (EINVAL);
	strlcpy(buf, soun->sun_path, len + 1);
	UNP_UNLOCK();
	/* Preallocated now; used below for the accepted socket's address. */
	sa = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
	mtx_lock(&Giant);
	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, buf, td);
	error = namei(&nd);
	if (error)
		vp = NULL;
	else
		vp = nd.ni_vp;
	ASSERT_VOP_LOCKED(vp, "unp_connect");
	NDFREE(&nd, NDF_ONLY_PNBUF);
	if (error)
		goto bad;

	if (vp->v_type != VSOCK) {
		error = ENOTSOCK;
		goto bad;
	}
	/* The connecting thread needs write permission on the socket node. */
	error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td);
	if (error)
		goto bad;
	mtx_unlock(&Giant);
	UNP_LOCK();
	/* Re-fetch the pcb: the UNP lock was dropped during the lookup. */
	unp = sotounpcb(so);
	if (unp == NULL) {
		error = EINVAL;
		goto bad2;
	}
	so2 = vp->v_socket;
	if (so2 == NULL) {
		error = ECONNREFUSED;
		goto bad2;
	}
	if (so->so_type != so2->so_type) {
		error = EPROTOTYPE;
		goto bad2;
	}
	if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
		if (so2->so_options & SO_ACCEPTCONN) {
			/*
			 * NB: drop locks here so unp_attach is entered
			 *     w/o locks; this avoids a recursive lock
			 *     of the head and holding sleep locks across
			 *     a (potentially) blocking malloc.
			 */
			UNP_UNLOCK();
			so3 = sonewconn(so2, 0);
			UNP_LOCK();
		} else
			so3 = NULL;
		if (so3 == NULL) {
			error = ECONNREFUSED;
			goto bad2;
		}
		unp = sotounpcb(so);
		unp2 = sotounpcb(so2);
		unp3 = sotounpcb(so3);
		if (unp2->unp_addr != NULL) {
			bcopy(unp2->unp_addr, sa, unp2->unp_addr->sun_len);
			unp3->unp_addr = (struct sockaddr_un *) sa;
			/* Ownership moved to unp3; free(NULL) below is a no-op. */
			sa = NULL;
		}
		/*
		 * unp_peercred management:
		 *
		 * The connecter's (client's) credentials are copied
		 * from its process structure at the time of connect()
		 * (which is now).
		 */
		cru2x(td->td_ucred, &unp3->unp_peercred);
		unp3->unp_flags |= UNP_HAVEPC;
		/*
		 * The receiver's (server's) credentials are copied
		 * from the unp_peercred member of socket on which the
		 * former called listen(); unp_listen() cached that
		 * process's credentials at that time so we can use
		 * them now.
		 */
		KASSERT(unp2->unp_flags & UNP_HAVEPCCACHED,
		    ("unp_connect: listener without cached peercred"));
		memcpy(&unp->unp_peercred, &unp2->unp_peercred,
		    sizeof(unp->unp_peercred));
		unp->unp_flags |= UNP_HAVEPC;
#ifdef MAC
		SOCK_LOCK(so);
		mac_set_socket_peer_from_socket(so, so3);
		mac_set_socket_peer_from_socket(so3, so);
		SOCK_UNLOCK(so);
#endif

		/* From here on the peer is the newly created socket. */
		so2 = so3;
	}
	error = unp_connect2(so, so2, PRU_CONNECT);
bad2:
	UNP_UNLOCK();
	mtx_lock(&Giant);
bad:
	mtx_assert(&Giant, MA_OWNED);
	if (vp != NULL)
		vput(vp);
	mtx_unlock(&Giant);
	free(sa, M_SONAME);
	/* Restore the caller's expectation that the UNP lock is held. */
	UNP_LOCK();
	return (error);
}
1033
1034static int
1035unp_connect2(struct socket *so, struct socket *so2, int req)
1036{
1037	struct unpcb *unp = sotounpcb(so);
1038	struct unpcb *unp2;
1039
1040	UNP_LOCK_ASSERT();
1041
1042	if (so2->so_type != so->so_type)
1043		return (EPROTOTYPE);
1044	unp2 = sotounpcb(so2);
1045	unp->unp_conn = unp2;
1046	switch (so->so_type) {
1047
1048	case SOCK_DGRAM:
1049		LIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_reflink);
1050		soisconnected(so);
1051		break;
1052
1053	case SOCK_STREAM:
1054		unp2->unp_conn = unp;
1055		if (req == PRU_CONNECT &&
1056		    ((unp->unp_flags | unp2->unp_flags) & UNP_CONNWAIT))
1057			soisconnecting(so);
1058		else
1059			soisconnected(so);
1060		soisconnected(so2);
1061		break;
1062
1063	default:
1064		panic("unp_connect2");
1065	}
1066	return (0);
1067}
1068
1069static void
1070unp_disconnect(struct unpcb *unp)
1071{
1072	struct unpcb *unp2 = unp->unp_conn;
1073	struct socket *so;
1074
1075	UNP_LOCK_ASSERT();
1076
1077	if (unp2 == NULL)
1078		return;
1079	unp->unp_conn = NULL;
1080	switch (unp->unp_socket->so_type) {
1081
1082	case SOCK_DGRAM:
1083		LIST_REMOVE(unp, unp_reflink);
1084		so = unp->unp_socket;
1085		SOCK_LOCK(so);
1086		so->so_state &= ~SS_ISCONNECTED;
1087		SOCK_UNLOCK(so);
1088		break;
1089
1090	case SOCK_STREAM:
1091		soisdisconnected(unp->unp_socket);
1092		unp2->unp_conn = NULL;
1093		soisdisconnected(unp2->unp_socket);
1094		break;
1095	}
1096}
1097
#ifdef notdef
/*
 * Compiled only when "notdef" is defined.  Detaches the pcb; like
 * unp_detach(), it returns with the UNP lock released.
 */
void
unp_abort(struct unpcb *unp)
{

	unp_detach(unp);
	UNP_UNLOCK_ASSERT();
}
#endif
1107
1108/*
1109 * unp_pcblist() assumes that UNIX domain socket memory is never reclaimed
1110 * by the zone (UMA_ZONE_NOFREE), and as such potentially stale pointers
1111 * are safe to reference.  It first scans the list of struct unpcb's to
1112 * generate a pointer list, then it rescans its list one entry at a time to
1113 * externalize and copyout.  It checks the generation number to see if a
1114 * struct unpcb has been reused, and will skip it if so.
1115 */
1116static int
1117unp_pcblist(SYSCTL_HANDLER_ARGS)
1118{
1119	int error, i, n;
1120	struct unpcb *unp, **unp_list;
1121	unp_gen_t gencnt;
1122	struct xunpgen *xug;
1123	struct unp_head *head;
1124	struct xunpcb *xu;
1125
1126	head = ((intptr_t)arg1 == SOCK_DGRAM ? &unp_dhead : &unp_shead);
1127
1128	/*
1129	 * The process of preparing the PCB list is too time-consuming and
1130	 * resource-intensive to repeat twice on every request.
1131	 */
1132	if (req->oldptr == NULL) {
1133		n = unp_count;
1134		req->oldidx = 2 * (sizeof *xug)
1135			+ (n + n/8) * sizeof(struct xunpcb);
1136		return (0);
1137	}
1138
1139	if (req->newptr != NULL)
1140		return (EPERM);
1141
1142	/*
1143	 * OK, now we're committed to doing something.
1144	 */
1145	xug = malloc(sizeof(*xug), M_TEMP, M_WAITOK);
1146	UNP_LOCK();
1147	gencnt = unp_gencnt;
1148	n = unp_count;
1149	UNP_UNLOCK();
1150
1151	xug->xug_len = sizeof *xug;
1152	xug->xug_count = n;
1153	xug->xug_gen = gencnt;
1154	xug->xug_sogen = so_gencnt;
1155	error = SYSCTL_OUT(req, xug, sizeof *xug);
1156	if (error) {
1157		free(xug, M_TEMP);
1158		return (error);
1159	}
1160
1161	unp_list = malloc(n * sizeof *unp_list, M_TEMP, M_WAITOK);
1162
1163	UNP_LOCK();
1164	for (unp = LIST_FIRST(head), i = 0; unp && i < n;
1165	     unp = LIST_NEXT(unp, unp_link)) {
1166		if (unp->unp_gencnt <= gencnt) {
1167			if (cr_cansee(req->td->td_ucred,
1168			    unp->unp_socket->so_cred))
1169				continue;
1170			unp_list[i++] = unp;
1171		}
1172	}
1173	UNP_UNLOCK();
1174	n = i;			/* in case we lost some during malloc */
1175
1176	error = 0;
1177	xu = malloc(sizeof(*xu), M_TEMP, M_WAITOK);
1178	for (i = 0; i < n; i++) {
1179		unp = unp_list[i];
1180		if (unp->unp_gencnt <= gencnt) {
1181			xu->xu_len = sizeof *xu;
1182			xu->xu_unpp = unp;
1183			/*
1184			 * XXX - need more locking here to protect against
1185			 * connect/disconnect races for SMP.
1186			 */
1187			if (unp->unp_addr != NULL)
1188				bcopy(unp->unp_addr, &xu->xu_addr,
1189				      unp->unp_addr->sun_len);
1190			if (unp->unp_conn != NULL &&
1191			    unp->unp_conn->unp_addr != NULL)
1192				bcopy(unp->unp_conn->unp_addr,
1193				      &xu->xu_caddr,
1194				      unp->unp_conn->unp_addr->sun_len);
1195			bcopy(unp, &xu->xu_unp, sizeof *unp);
1196			sotoxsocket(unp->unp_socket, &xu->xu_socket);
1197			error = SYSCTL_OUT(req, xu, sizeof *xu);
1198		}
1199	}
1200	free(xu, M_TEMP);
1201	if (!error) {
1202		/*
1203		 * Give the user an updated idea of our state.
1204		 * If the generation differs from what we told
1205		 * her before, she knows that something happened
1206		 * while we were processing this request, and it
1207		 * might be necessary to retry.
1208		 */
1209		xug->xug_gen = unp_gencnt;
1210		xug->xug_sogen = so_gencnt;
1211		xug->xug_count = unp_count;
1212		error = SYSCTL_OUT(req, xug, sizeof *xug);
1213	}
1214	free(unp_list, M_TEMP);
1215	free(xug, M_TEMP);
1216	return (error);
1217}
1218
1219SYSCTL_PROC(_net_local_dgram, OID_AUTO, pcblist, CTLFLAG_RD,
1220	    (caddr_t)(long)SOCK_DGRAM, 0, unp_pcblist, "S,xunpcb",
1221	    "List of active local datagram sockets");
1222SYSCTL_PROC(_net_local_stream, OID_AUTO, pcblist, CTLFLAG_RD,
1223	    (caddr_t)(long)SOCK_STREAM, 0, unp_pcblist, "S,xunpcb",
1224	    "List of active local stream sockets");
1225
1226static void
1227unp_shutdown(struct unpcb *unp)
1228{
1229	struct socket *so;
1230
1231	UNP_LOCK_ASSERT();
1232
1233	if (unp->unp_socket->so_type == SOCK_STREAM && unp->unp_conn &&
1234	    (so = unp->unp_conn->unp_socket))
1235		socantrcvmore(so);
1236}
1237
1238static void
1239unp_drop(struct unpcb *unp, int errno)
1240{
1241	struct socket *so = unp->unp_socket;
1242
1243	UNP_LOCK_ASSERT();
1244
1245	so->so_error = errno;
1246	unp_disconnect(unp);
1247}
1248
#ifdef notdef
/*
 * Empty drain stub.  It is only compiled when "notdef" is defined and
 * has no body.
 */
void
unp_drain(void)
{
}
#endif
1256
1257static void
1258unp_freerights(struct file **rp, int fdcount)
1259{
1260	int i;
1261	struct file *fp;
1262
1263	for (i = 0; i < fdcount; i++) {
1264		fp = *rp;
1265		/*
1266		 * zero the pointer before calling
1267		 * unp_discard since it may end up
1268		 * in unp_gc()..
1269		 */
1270		*rp++ = 0;
1271		unp_discard(fp);
1272	}
1273}
1274
1275int
1276unp_externalize(struct mbuf *control, struct mbuf **controlp)
1277{
1278	struct thread *td = curthread;		/* XXX */
1279	struct cmsghdr *cm = mtod(control, struct cmsghdr *);
1280	int i;
1281	int *fdp;
1282	struct file **rp;
1283	struct file *fp;
1284	void *data;
1285	socklen_t clen = control->m_len, datalen;
1286	int error, newfds;
1287	int f;
1288	u_int newlen;
1289
1290	UNP_UNLOCK_ASSERT();
1291
1292	error = 0;
1293	if (controlp != NULL) /* controlp == NULL => free control messages */
1294		*controlp = NULL;
1295
1296	while (cm != NULL) {
1297		if (sizeof(*cm) > clen || cm->cmsg_len > clen) {
1298			error = EINVAL;
1299			break;
1300		}
1301
1302		data = CMSG_DATA(cm);
1303		datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data;
1304
1305		if (cm->cmsg_level == SOL_SOCKET
1306		    && cm->cmsg_type == SCM_RIGHTS) {
1307			newfds = datalen / sizeof(struct file *);
1308			rp = data;
1309
1310			/* If we're not outputting the descriptors free them. */
1311			if (error || controlp == NULL) {
1312				unp_freerights(rp, newfds);
1313				goto next;
1314			}
1315			FILEDESC_LOCK(td->td_proc->p_fd);
1316			/* if the new FD's will not fit free them.  */
1317			if (!fdavail(td, newfds)) {
1318				FILEDESC_UNLOCK(td->td_proc->p_fd);
1319				error = EMSGSIZE;
1320				unp_freerights(rp, newfds);
1321				goto next;
1322			}
1323			/*
1324			 * now change each pointer to an fd in the global
1325			 * table to an integer that is the index to the
1326			 * local fd table entry that we set up to point
1327			 * to the global one we are transferring.
1328			 */
1329			newlen = newfds * sizeof(int);
1330			*controlp = sbcreatecontrol(NULL, newlen,
1331			    SCM_RIGHTS, SOL_SOCKET);
1332			if (*controlp == NULL) {
1333				FILEDESC_UNLOCK(td->td_proc->p_fd);
1334				error = E2BIG;
1335				unp_freerights(rp, newfds);
1336				goto next;
1337			}
1338
1339			fdp = (int *)
1340			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
1341			for (i = 0; i < newfds; i++) {
1342				if (fdalloc(td, 0, &f))
1343					panic("unp_externalize fdalloc failed");
1344				fp = *rp++;
1345				td->td_proc->p_fd->fd_ofiles[f] = fp;
1346				FILE_LOCK(fp);
1347				fp->f_msgcount--;
1348				FILE_UNLOCK(fp);
1349				unp_rights--;
1350				*fdp++ = f;
1351			}
1352			FILEDESC_UNLOCK(td->td_proc->p_fd);
1353		} else { /* We can just copy anything else across */
1354			if (error || controlp == NULL)
1355				goto next;
1356			*controlp = sbcreatecontrol(NULL, datalen,
1357			    cm->cmsg_type, cm->cmsg_level);
1358			if (*controlp == NULL) {
1359				error = ENOBUFS;
1360				goto next;
1361			}
1362			bcopy(data,
1363			    CMSG_DATA(mtod(*controlp, struct cmsghdr *)),
1364			    datalen);
1365		}
1366
1367		controlp = &(*controlp)->m_next;
1368
1369next:
1370		if (CMSG_SPACE(datalen) < clen) {
1371			clen -= CMSG_SPACE(datalen);
1372			cm = (struct cmsghdr *)
1373			    ((caddr_t)cm + CMSG_SPACE(datalen));
1374		} else {
1375			clen = 0;
1376			cm = NULL;
1377		}
1378	}
1379
1380	m_freem(control);
1381
1382	return (error);
1383}
1384
1385void
1386unp_init(void)
1387{
1388	unp_zone = uma_zcreate("unpcb", sizeof(struct unpcb), NULL, NULL,
1389	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
1390	if (unp_zone == NULL)
1391		panic("unp_init");
1392	uma_zone_set_max(unp_zone, nmbclusters);
1393	LIST_INIT(&unp_dhead);
1394	LIST_INIT(&unp_shead);
1395
1396	UNP_LOCK_INIT();
1397}
1398
1399static int
1400unp_internalize(struct mbuf **controlp, struct thread *td)
1401{
1402	struct mbuf *control = *controlp;
1403	struct proc *p = td->td_proc;
1404	struct filedesc *fdescp = p->p_fd;
1405	struct cmsghdr *cm = mtod(control, struct cmsghdr *);
1406	struct cmsgcred *cmcred;
1407	struct file **rp;
1408	struct file *fp;
1409	struct timeval *tv;
1410	int i, fd, *fdp;
1411	void *data;
1412	socklen_t clen = control->m_len, datalen;
1413	int error, oldfds;
1414	u_int newlen;
1415
1416	UNP_UNLOCK_ASSERT();
1417
1418	error = 0;
1419	*controlp = NULL;
1420
1421	while (cm != NULL) {
1422		if (sizeof(*cm) > clen || cm->cmsg_level != SOL_SOCKET
1423		    || cm->cmsg_len > clen) {
1424			error = EINVAL;
1425			goto out;
1426		}
1427
1428		data = CMSG_DATA(cm);
1429		datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data;
1430
1431		switch (cm->cmsg_type) {
1432		/*
1433		 * Fill in credential information.
1434		 */
1435		case SCM_CREDS:
1436			*controlp = sbcreatecontrol(NULL, sizeof(*cmcred),
1437			    SCM_CREDS, SOL_SOCKET);
1438			if (*controlp == NULL) {
1439				error = ENOBUFS;
1440				goto out;
1441			}
1442
1443			cmcred = (struct cmsgcred *)
1444			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
1445			cmcred->cmcred_pid = p->p_pid;
1446			cmcred->cmcred_uid = td->td_ucred->cr_ruid;
1447			cmcred->cmcred_gid = td->td_ucred->cr_rgid;
1448			cmcred->cmcred_euid = td->td_ucred->cr_uid;
1449			cmcred->cmcred_ngroups = MIN(td->td_ucred->cr_ngroups,
1450							CMGROUP_MAX);
1451			for (i = 0; i < cmcred->cmcred_ngroups; i++)
1452				cmcred->cmcred_groups[i] =
1453				    td->td_ucred->cr_groups[i];
1454			break;
1455
1456		case SCM_RIGHTS:
1457			oldfds = datalen / sizeof (int);
1458			/*
1459			 * check that all the FDs passed in refer to legal files
1460			 * If not, reject the entire operation.
1461			 */
1462			fdp = data;
1463			FILEDESC_LOCK(fdescp);
1464			for (i = 0; i < oldfds; i++) {
1465				fd = *fdp++;
1466				if ((unsigned)fd >= fdescp->fd_nfiles ||
1467				    fdescp->fd_ofiles[fd] == NULL) {
1468					FILEDESC_UNLOCK(fdescp);
1469					error = EBADF;
1470					goto out;
1471				}
1472				fp = fdescp->fd_ofiles[fd];
1473				if (!(fp->f_ops->fo_flags & DFLAG_PASSABLE)) {
1474					FILEDESC_UNLOCK(fdescp);
1475					error = EOPNOTSUPP;
1476					goto out;
1477				}
1478
1479			}
1480			/*
1481			 * Now replace the integer FDs with pointers to
1482			 * the associated global file table entry..
1483			 */
1484			newlen = oldfds * sizeof(struct file *);
1485			*controlp = sbcreatecontrol(NULL, newlen,
1486			    SCM_RIGHTS, SOL_SOCKET);
1487			if (*controlp == NULL) {
1488				FILEDESC_UNLOCK(fdescp);
1489				error = E2BIG;
1490				goto out;
1491			}
1492
1493			fdp = data;
1494			rp = (struct file **)
1495			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
1496			for (i = 0; i < oldfds; i++) {
1497				fp = fdescp->fd_ofiles[*fdp++];
1498				*rp++ = fp;
1499				FILE_LOCK(fp);
1500				fp->f_count++;
1501				fp->f_msgcount++;
1502				FILE_UNLOCK(fp);
1503				unp_rights++;
1504			}
1505			FILEDESC_UNLOCK(fdescp);
1506			break;
1507
1508		case SCM_TIMESTAMP:
1509			*controlp = sbcreatecontrol(NULL, sizeof(*tv),
1510			    SCM_TIMESTAMP, SOL_SOCKET);
1511			if (*controlp == NULL) {
1512				error = ENOBUFS;
1513				goto out;
1514			}
1515			tv = (struct timeval *)
1516			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
1517			microtime(tv);
1518			break;
1519
1520		default:
1521			error = EINVAL;
1522			goto out;
1523		}
1524
1525		controlp = &(*controlp)->m_next;
1526
1527		if (CMSG_SPACE(datalen) < clen) {
1528			clen -= CMSG_SPACE(datalen);
1529			cm = (struct cmsghdr *)
1530			    ((caddr_t)cm + CMSG_SPACE(datalen));
1531		} else {
1532			clen = 0;
1533			cm = NULL;
1534		}
1535	}
1536
1537out:
1538	m_freem(control);
1539
1540	return (error);
1541}
1542
1543struct mbuf *
1544unp_addsockcred(struct thread *td, struct mbuf *control)
1545{
1546	struct mbuf *m, *n;
1547	struct sockcred *sc;
1548	int ngroups;
1549	int i;
1550
1551	ngroups = MIN(td->td_ucred->cr_ngroups, CMGROUP_MAX);
1552
1553	m = sbcreatecontrol(NULL, SOCKCREDSIZE(ngroups), SCM_CREDS, SOL_SOCKET);
1554	if (m == NULL)
1555		return (control);
1556	m->m_next = NULL;
1557
1558	sc = (struct sockcred *) CMSG_DATA(mtod(m, struct cmsghdr *));
1559	sc->sc_uid = td->td_ucred->cr_ruid;
1560	sc->sc_euid = td->td_ucred->cr_uid;
1561	sc->sc_gid = td->td_ucred->cr_rgid;
1562	sc->sc_egid = td->td_ucred->cr_gid;
1563	sc->sc_ngroups = ngroups;
1564	for (i = 0; i < sc->sc_ngroups; i++)
1565		sc->sc_groups[i] = td->td_ucred->cr_groups[i];
1566
1567	/*
1568	 * If a control message already exists, append us to the end.
1569	 */
1570	if (control != NULL) {
1571		for (n = control; n->m_next != NULL; n = n->m_next)
1572			;
1573		n->m_next = m;
1574	} else
1575		control = m;
1576
1577	return (control);
1578}
1579
/*
 * Garbage-collection state.
 *
 * unp_defer is thread-local during garbage collection, and does not
 * require explicit synchronization.
 */
static int	unp_defer;

/*
 * unp_gcing prevents other threads from entering garbage collection, and
 * perhaps should be an sx lock instead.
 */
static int	unp_gcing;
1586
1587static void
1588unp_gc(void)
1589{
1590	struct file *fp, *nextfp;
1591	struct socket *so;
1592	struct file **extra_ref, **fpp;
1593	int nunref, i;
1594	int nfiles_snap;
1595	int nfiles_slack = 20;
1596
1597	UNP_LOCK_ASSERT();
1598
1599	if (unp_gcing) {
1600		UNP_UNLOCK();
1601		return;
1602	}
1603	unp_gcing = 1;
1604	unp_defer = 0;
1605	UNP_UNLOCK();
1606	/*
1607	 * before going through all this, set all FDs to
1608	 * be NOT defered and NOT externally accessible
1609	 */
1610	sx_slock(&filelist_lock);
1611	LIST_FOREACH(fp, &filehead, f_list)
1612		fp->f_gcflag &= ~(FMARK|FDEFER);
1613	do {
1614		LIST_FOREACH(fp, &filehead, f_list) {
1615			FILE_LOCK(fp);
1616			/*
1617			 * If the file is not open, skip it
1618			 */
1619			if (fp->f_count == 0) {
1620				FILE_UNLOCK(fp);
1621				continue;
1622			}
1623			/*
1624			 * If we already marked it as 'defer'  in a
1625			 * previous pass, then try process it this time
1626			 * and un-mark it
1627			 */
1628			if (fp->f_gcflag & FDEFER) {
1629				fp->f_gcflag &= ~FDEFER;
1630				unp_defer--;
1631			} else {
1632				/*
1633				 * if it's not defered, then check if it's
1634				 * already marked.. if so skip it
1635				 */
1636				if (fp->f_gcflag & FMARK) {
1637					FILE_UNLOCK(fp);
1638					continue;
1639				}
1640				/*
1641				 * If all references are from messages
1642				 * in transit, then skip it. it's not
1643				 * externally accessible.
1644				 */
1645				if (fp->f_count == fp->f_msgcount) {
1646					FILE_UNLOCK(fp);
1647					continue;
1648				}
1649				/*
1650				 * If it got this far then it must be
1651				 * externally accessible.
1652				 */
1653				fp->f_gcflag |= FMARK;
1654			}
1655			/*
1656			 * either it was defered, or it is externally
1657			 * accessible and not already marked so.
1658			 * Now check if it is possibly one of OUR sockets.
1659			 */
1660			if (fp->f_type != DTYPE_SOCKET ||
1661			    (so = fp->f_data) == NULL) {
1662				FILE_UNLOCK(fp);
1663				continue;
1664			}
1665			FILE_UNLOCK(fp);
1666			if (so->so_proto->pr_domain != &localdomain ||
1667			    (so->so_proto->pr_flags&PR_RIGHTS) == 0)
1668				continue;
1669#ifdef notdef
1670			if (so->so_rcv.sb_flags & SB_LOCK) {
1671				/*
1672				 * This is problematical; it's not clear
1673				 * we need to wait for the sockbuf to be
1674				 * unlocked (on a uniprocessor, at least),
1675				 * and it's also not clear what to do
1676				 * if sbwait returns an error due to receipt
1677				 * of a signal.  If sbwait does return
1678				 * an error, we'll go into an infinite
1679				 * loop.  Delete all of this for now.
1680				 */
1681				(void) sbwait(&so->so_rcv);
1682				goto restart;
1683			}
1684#endif
1685			/*
1686			 * So, Ok, it's one of our sockets and it IS externally
1687			 * accessible (or was defered). Now we look
1688			 * to see if we hold any file descriptors in its
1689			 * message buffers. Follow those links and mark them
1690			 * as accessible too.
1691			 */
1692			SOCKBUF_LOCK(&so->so_rcv);
1693			unp_scan(so->so_rcv.sb_mb, unp_mark);
1694			SOCKBUF_UNLOCK(&so->so_rcv);
1695		}
1696	} while (unp_defer);
1697	sx_sunlock(&filelist_lock);
1698	/*
1699	 * We grab an extra reference to each of the file table entries
1700	 * that are not otherwise accessible and then free the rights
1701	 * that are stored in messages on them.
1702	 *
1703	 * The bug in the orginal code is a little tricky, so I'll describe
1704	 * what's wrong with it here.
1705	 *
1706	 * It is incorrect to simply unp_discard each entry for f_msgcount
1707	 * times -- consider the case of sockets A and B that contain
1708	 * references to each other.  On a last close of some other socket,
1709	 * we trigger a gc since the number of outstanding rights (unp_rights)
1710	 * is non-zero.  If during the sweep phase the gc code un_discards,
1711	 * we end up doing a (full) closef on the descriptor.  A closef on A
1712	 * results in the following chain.  Closef calls soo_close, which
1713	 * calls soclose.   Soclose calls first (through the switch
1714	 * uipc_usrreq) unp_detach, which re-invokes unp_gc.  Unp_gc simply
1715	 * returns because the previous instance had set unp_gcing, and
1716	 * we return all the way back to soclose, which marks the socket
1717	 * with SS_NOFDREF, and then calls sofree.  Sofree calls sorflush
1718	 * to free up the rights that are queued in messages on the socket A,
1719	 * i.e., the reference on B.  The sorflush calls via the dom_dispose
1720	 * switch unp_dispose, which unp_scans with unp_discard.  This second
1721	 * instance of unp_discard just calls closef on B.
1722	 *
1723	 * Well, a similar chain occurs on B, resulting in a sorflush on B,
1724	 * which results in another closef on A.  Unfortunately, A is already
1725	 * being closed, and the descriptor has already been marked with
1726	 * SS_NOFDREF, and soclose panics at this point.
1727	 *
1728	 * Here, we first take an extra reference to each inaccessible
1729	 * descriptor.  Then, we call sorflush ourself, since we know
1730	 * it is a Unix domain socket anyhow.  After we destroy all the
1731	 * rights carried in messages, we do a last closef to get rid
1732	 * of our extra reference.  This is the last close, and the
1733	 * unp_detach etc will shut down the socket.
1734	 *
1735	 * 91/09/19, bsy@cs.cmu.edu
1736	 */
1737again:
1738	nfiles_snap = openfiles + nfiles_slack;	/* some slack */
1739	extra_ref = malloc(nfiles_snap * sizeof(struct file *), M_TEMP,
1740	    M_WAITOK);
1741	sx_slock(&filelist_lock);
1742	if (nfiles_snap < openfiles) {
1743		sx_sunlock(&filelist_lock);
1744		free(extra_ref, M_TEMP);
1745		nfiles_slack += 20;
1746		goto again;
1747	}
1748	for (nunref = 0, fp = LIST_FIRST(&filehead), fpp = extra_ref;
1749	    fp != NULL; fp = nextfp) {
1750		nextfp = LIST_NEXT(fp, f_list);
1751		FILE_LOCK(fp);
1752		/*
1753		 * If it's not open, skip it
1754		 */
1755		if (fp->f_count == 0) {
1756			FILE_UNLOCK(fp);
1757			continue;
1758		}
1759		/*
1760		 * If all refs are from msgs, and it's not marked accessible
1761		 * then it must be referenced from some unreachable cycle
1762		 * of (shut-down) FDs, so include it in our
1763		 * list of FDs to remove
1764		 */
1765		if (fp->f_count == fp->f_msgcount && !(fp->f_gcflag & FMARK)) {
1766			*fpp++ = fp;
1767			nunref++;
1768			fp->f_count++;
1769		}
1770		FILE_UNLOCK(fp);
1771	}
1772	sx_sunlock(&filelist_lock);
1773	/*
1774	 * for each FD on our hit list, do the following two things
1775	 */
1776	for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) {
1777		struct file *tfp = *fpp;
1778		FILE_LOCK(tfp);
1779		if (tfp->f_type == DTYPE_SOCKET &&
1780		    tfp->f_data != NULL) {
1781			FILE_UNLOCK(tfp);
1782			sorflush(tfp->f_data);
1783		} else {
1784			FILE_UNLOCK(tfp);
1785		}
1786	}
1787	for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp)
1788		closef(*fpp, (struct thread *) NULL);
1789	free(extra_ref, M_TEMP);
1790	unp_gcing = 0;
1791
1792	UNP_UNLOCK_ASSERT();
1793}
1794
1795void
1796unp_dispose(struct mbuf *m)
1797{
1798
1799	if (m)
1800		unp_scan(m, unp_discard);
1801}
1802
1803static int
1804unp_listen(struct socket *so, struct unpcb *unp, struct thread *td)
1805{
1806	int error;
1807
1808	UNP_LOCK_ASSERT();
1809
1810	SOCK_LOCK(so);
1811	error = solisten_proto_check(so);
1812	if (error == 0) {
1813		cru2x(td->td_ucred, &unp->unp_peercred);
1814		unp->unp_flags |= UNP_HAVEPCCACHED;
1815		solisten_proto(so);
1816	}
1817	SOCK_UNLOCK(so);
1818	return (error);
1819}
1820
1821static void
1822unp_scan(struct mbuf *m0, void (*op)(struct file *))
1823{
1824	struct mbuf *m;
1825	struct file **rp;
1826	struct cmsghdr *cm;
1827	void *data;
1828	int i;
1829	socklen_t clen, datalen;
1830	int qfds;
1831
1832	while (m0 != NULL) {
1833		for (m = m0; m; m = m->m_next) {
1834			if (m->m_type != MT_CONTROL)
1835				continue;
1836
1837			cm = mtod(m, struct cmsghdr *);
1838			clen = m->m_len;
1839
1840			while (cm != NULL) {
1841				if (sizeof(*cm) > clen || cm->cmsg_len > clen)
1842					break;
1843
1844				data = CMSG_DATA(cm);
1845				datalen = (caddr_t)cm + cm->cmsg_len
1846				    - (caddr_t)data;
1847
1848				if (cm->cmsg_level == SOL_SOCKET &&
1849				    cm->cmsg_type == SCM_RIGHTS) {
1850					qfds = datalen / sizeof (struct file *);
1851					rp = data;
1852					for (i = 0; i < qfds; i++)
1853						(*op)(*rp++);
1854				}
1855
1856				if (CMSG_SPACE(datalen) < clen) {
1857					clen -= CMSG_SPACE(datalen);
1858					cm = (struct cmsghdr *)
1859					    ((caddr_t)cm + CMSG_SPACE(datalen));
1860				} else {
1861					clen = 0;
1862					cm = NULL;
1863				}
1864			}
1865		}
1866		m0 = m0->m_act;
1867	}
1868}
1869
1870static void
1871unp_mark(struct file *fp)
1872{
1873	if (fp->f_gcflag & FMARK)
1874		return;
1875	unp_defer++;
1876	fp->f_gcflag |= (FMARK|FDEFER);
1877}
1878
1879static void
1880unp_discard(struct file *fp)
1881{
1882	FILE_LOCK(fp);
1883	fp->f_msgcount--;
1884	unp_rights--;
1885	FILE_UNLOCK(fp);
1886	(void) closef(fp, (struct thread *)NULL);
1887}
1888