/* uipc_usrreq.c revision 183563 */
1/*-
2 * Copyright (c) 1982, 1986, 1989, 1991, 1993
3 *	The Regents of the University of California.
4 * Copyright (c) 2004-2008 Robert N. M. Watson
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 * 4. Neither the name of the University nor the names of its contributors
16 *    may be used to endorse or promote products derived from this software
17 *    without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE.
30 *
31 *	From: @(#)uipc_usrreq.c	8.3 (Berkeley) 1/4/94
32 */
33
34/*
35 * UNIX Domain (Local) Sockets
36 *
37 * This is an implementation of UNIX (local) domain sockets.  Each socket has
38 * an associated struct unpcb (UNIX protocol control block).  Stream sockets
39 * may be connected to 0 or 1 other socket.  Datagram sockets may be
40 * connected to 0, 1, or many other sockets.  Sockets may be created and
41 * connected in pairs (socketpair(2)), or bound/connected to using the file
42 * system name space.  For most purposes, only the receive socket buffer is
43 * used, as sending on one socket delivers directly to the receive socket
44 * buffer of a second socket.
45 *
46 * The implementation is substantially complicated by the fact that
47 * "ancillary data", such as file descriptors or credentials, may be passed
48 * across UNIX domain sockets.  The potential for passing UNIX domain sockets
49 * over other UNIX domain sockets requires the implementation of a simple
50 * garbage collector to find and tear down cycles of disconnected sockets.
51 *
52 * TODO:
53 *	SEQPACKET, RDM
54 *	rethink name space problems
55 *	need a proper out-of-band
56 */
57
58#include <sys/cdefs.h>
59__FBSDID("$FreeBSD: head/sys/kern/uipc_usrreq.c 183563 2008-10-03 09:01:55Z rwatson $");
60
61#include "opt_ddb.h"
62#include "opt_mac.h"
63
64#include <sys/param.h>
65#include <sys/domain.h>
66#include <sys/fcntl.h>
67#include <sys/malloc.h>		/* XXX must be before <sys/file.h> */
68#include <sys/eventhandler.h>
69#include <sys/file.h>
70#include <sys/filedesc.h>
71#include <sys/jail.h>
72#include <sys/kernel.h>
73#include <sys/lock.h>
74#include <sys/mbuf.h>
75#include <sys/mount.h>
76#include <sys/mutex.h>
77#include <sys/namei.h>
78#include <sys/proc.h>
79#include <sys/protosw.h>
80#include <sys/resourcevar.h>
81#include <sys/rwlock.h>
82#include <sys/socket.h>
83#include <sys/socketvar.h>
84#include <sys/signalvar.h>
85#include <sys/stat.h>
86#include <sys/sx.h>
87#include <sys/sysctl.h>
88#include <sys/systm.h>
89#include <sys/taskqueue.h>
90#include <sys/un.h>
91#include <sys/unpcb.h>
92#include <sys/vnode.h>
93
94#ifdef DDB
95#include <ddb/ddb.h>
96#endif
97
98#include <security/mac/mac_framework.h>
99
100#include <vm/uma.h>
101
102static uma_zone_t	unp_zone;
103static unp_gen_t	unp_gencnt;
104static u_int		unp_count;	/* Count of local sockets. */
105static ino_t		unp_ino;	/* Prototype for fake inode numbers. */
106static int		unp_rights;	/* File descriptors in flight. */
107static struct unp_head	unp_shead;	/* List of local stream sockets. */
108static struct unp_head	unp_dhead;	/* List of local datagram sockets. */
109
110static const struct sockaddr	sun_noname = { sizeof(sun_noname), AF_LOCAL };
111
112/*
113 * Garbage collection of cyclic file descriptor/socket references occurs
114 * asynchronously in a taskqueue context in order to avoid recursion and
115 * reentrance in the UNIX domain socket, file descriptor, and socket layer
116 * code.  See unp_gc() for a full description.
117 */
118static struct task	unp_gc_task;
119
120/*
121 * Both send and receive buffers are allocated PIPSIZ bytes of buffering for
122 * stream sockets, although the total for sender and receiver is actually
123 * only PIPSIZ.
124 *
125 * Datagram sockets really use the sendspace as the maximum datagram size,
126 * and don't really want to reserve the sendspace.  Their recvspace should be
127 * large enough for at least one max-size datagram plus address.
128 */
129#ifndef PIPSIZ
130#define	PIPSIZ	8192
131#endif
132static u_long	unpst_sendspace = PIPSIZ;
133static u_long	unpst_recvspace = PIPSIZ;
134static u_long	unpdg_sendspace = 2*1024;	/* really max datagram size */
135static u_long	unpdg_recvspace = 4*1024;
136
137SYSCTL_NODE(_net, PF_LOCAL, local, CTLFLAG_RW, 0, "Local domain");
138SYSCTL_NODE(_net_local, SOCK_STREAM, stream, CTLFLAG_RW, 0, "SOCK_STREAM");
139SYSCTL_NODE(_net_local, SOCK_DGRAM, dgram, CTLFLAG_RW, 0, "SOCK_DGRAM");
140
141SYSCTL_ULONG(_net_local_stream, OID_AUTO, sendspace, CTLFLAG_RW,
142	   &unpst_sendspace, 0, "Default stream send space.");
143SYSCTL_ULONG(_net_local_stream, OID_AUTO, recvspace, CTLFLAG_RW,
144	   &unpst_recvspace, 0, "Default stream receive space.");
145SYSCTL_ULONG(_net_local_dgram, OID_AUTO, maxdgram, CTLFLAG_RW,
146	   &unpdg_sendspace, 0, "Default datagram send space.");
147SYSCTL_ULONG(_net_local_dgram, OID_AUTO, recvspace, CTLFLAG_RW,
148	   &unpdg_recvspace, 0, "Default datagram receive space.");
149SYSCTL_INT(_net_local, OID_AUTO, inflight, CTLFLAG_RD, &unp_rights, 0,
150    "File descriptors in flight.");
151
152/*-
153 * Locking and synchronization:
154 *
155 * The global UNIX domain socket rwlock (unp_global_rwlock) protects all
156 * global variables, including the linked lists tracking the set of allocated
157 * UNIX domain sockets.  The global rwlock also serves to prevent deadlock
158 * when more than one PCB lock is acquired at a time (i.e., during
159 * connect()).  Finally, the global rwlock protects uncounted references from
160 * vnodes to sockets bound to those vnodes: to safely dereference the
161 * v_socket pointer, the global rwlock must be held while a full reference is
162 * acquired.
163 *
164 * UNIX domain sockets each have an unpcb hung off of their so_pcb pointer,
165 * allocated in pru_attach() and freed in pru_detach().  The validity of that
166 * pointer is an invariant, so no lock is required to dereference the so_pcb
167 * pointer if a valid socket reference is held by the caller.  In practice,
168 * this is always true during operations performed on a socket.  Each unpcb
169 * has a back-pointer to its socket, unp_socket, which will be stable under
170 * the same circumstances.
171 *
172 * This pointer may only be safely dereferenced as long as a valid reference
173 * to the unpcb is held.  Typically, this reference will be from the socket,
174 * or from another unpcb when the referring unpcb's lock is held (in order
175 * that the reference not be invalidated during use).  For example, to follow
 * unp->unp_conn->unp_socket, you need to hold the lock on unp, not unp_conn,
177 * as unp_socket remains valid as long as the reference to unp_conn is valid.
178 *
 * Fields of unpcbs are locked using a per-unpcb lock, unp_mtx.  Individual
180 * atomic reads without the lock may be performed "lockless", but more
181 * complex reads and read-modify-writes require the mutex to be held.  No
182 * lock order is defined between unpcb locks -- multiple unpcb locks may be
183 * acquired at the same time only when holding the global UNIX domain socket
184 * rwlock exclusively, which prevents deadlocks.
185 *
186 * Blocking with UNIX domain sockets is a tricky issue: unlike most network
187 * protocols, bind() is a non-atomic operation, and connect() requires
188 * potential sleeping in the protocol, due to potentially waiting on local or
189 * distributed file systems.  We try to separate "lookup" operations, which
190 * may sleep, and the IPC operations themselves, which typically can occur
191 * with relative atomicity as locks can be held over the entire operation.
192 *
193 * Another tricky issue is simultaneous multi-threaded or multi-process
194 * access to a single UNIX domain socket.  These are handled by the flags
195 * UNP_CONNECTING and UNP_BINDING, which prevent concurrent connecting or
196 * binding, both of which involve dropping UNIX domain socket locks in order
197 * to perform namei() and other file system operations.
198 */
199static struct rwlock	unp_global_rwlock;
200
201#define	UNP_GLOBAL_LOCK_INIT()		rw_init(&unp_global_rwlock,	\
202					    "unp_global_rwlock")
203
204#define	UNP_GLOBAL_LOCK_ASSERT()	rw_assert(&unp_global_rwlock,	\
205					    RA_LOCKED)
206#define	UNP_GLOBAL_UNLOCK_ASSERT()	rw_assert(&unp_global_rwlock,	\
207					    RA_UNLOCKED)
208
209#define	UNP_GLOBAL_WLOCK()		rw_wlock(&unp_global_rwlock)
210#define	UNP_GLOBAL_WUNLOCK()		rw_wunlock(&unp_global_rwlock)
211#define	UNP_GLOBAL_WLOCK_ASSERT()	rw_assert(&unp_global_rwlock,	\
212					    RA_WLOCKED)
213#define	UNP_GLOBAL_WOWNED()		rw_wowned(&unp_global_rwlock)
214
215#define	UNP_GLOBAL_RLOCK()		rw_rlock(&unp_global_rwlock)
216#define	UNP_GLOBAL_RUNLOCK()		rw_runlock(&unp_global_rwlock)
217#define	UNP_GLOBAL_RLOCK_ASSERT()	rw_assert(&unp_global_rwlock,	\
218					    RA_RLOCKED)
219
220#define UNP_PCB_LOCK_INIT(unp)		mtx_init(&(unp)->unp_mtx,	\
221					    "unp_mtx", "unp_mtx",	\
222					    MTX_DUPOK|MTX_DEF|MTX_RECURSE)
223#define	UNP_PCB_LOCK_DESTROY(unp)	mtx_destroy(&(unp)->unp_mtx)
224#define	UNP_PCB_LOCK(unp)		mtx_lock(&(unp)->unp_mtx)
225#define	UNP_PCB_UNLOCK(unp)		mtx_unlock(&(unp)->unp_mtx)
226#define	UNP_PCB_LOCK_ASSERT(unp)	mtx_assert(&(unp)->unp_mtx, MA_OWNED)
227
228static int	unp_connect(struct socket *, struct sockaddr *,
229		    struct thread *);
230static int	unp_connect2(struct socket *so, struct socket *so2, int);
231static void	unp_disconnect(struct unpcb *unp, struct unpcb *unp2);
232static void	unp_shutdown(struct unpcb *);
233static void	unp_drop(struct unpcb *, int);
234static void	unp_gc(__unused void *, int);
235static void	unp_scan(struct mbuf *, void (*)(struct file *));
236static void	unp_discard(struct file *);
237static void	unp_freerights(struct file **, int);
238static int	unp_internalize(struct mbuf **, struct thread *);
239static void	unp_internalize_fp(struct file *);
240static void	unp_externalize_fp(struct file *);
241static struct mbuf	*unp_addsockcred(struct thread *, struct mbuf *);
242
243/*
244 * Definitions of protocols supported in the LOCAL domain.
245 */
246static struct domain localdomain;
247static struct protosw localsw[] = {
248{
249	.pr_type =		SOCK_STREAM,
250	.pr_domain =		&localdomain,
251	.pr_flags =		PR_CONNREQUIRED|PR_WANTRCVD|PR_RIGHTS,
252	.pr_ctloutput =		&uipc_ctloutput,
253	.pr_usrreqs =		&uipc_usrreqs
254},
255{
256	.pr_type =		SOCK_DGRAM,
257	.pr_domain =		&localdomain,
258	.pr_flags =		PR_ATOMIC|PR_ADDR|PR_RIGHTS,
259	.pr_usrreqs =		&uipc_usrreqs
260},
261};
262
263static struct domain localdomain = {
264	.dom_family =		AF_LOCAL,
265	.dom_name =		"local",
266	.dom_init =		unp_init,
267	.dom_externalize =	unp_externalize,
268	.dom_dispose =		unp_dispose,
269	.dom_protosw =		localsw,
270	.dom_protoswNPROTOSW =	&localsw[sizeof(localsw)/sizeof(localsw[0])]
271};
272DOMAIN_SET(local);
273
274static void
275uipc_abort(struct socket *so)
276{
277	struct unpcb *unp, *unp2;
278
279	unp = sotounpcb(so);
280	KASSERT(unp != NULL, ("uipc_abort: unp == NULL"));
281
282	UNP_GLOBAL_WLOCK();
283	UNP_PCB_LOCK(unp);
284	unp2 = unp->unp_conn;
285	if (unp2 != NULL) {
286		UNP_PCB_LOCK(unp2);
287		unp_drop(unp2, ECONNABORTED);
288		UNP_PCB_UNLOCK(unp2);
289	}
290	UNP_PCB_UNLOCK(unp);
291	UNP_GLOBAL_WUNLOCK();
292}
293
294static int
295uipc_accept(struct socket *so, struct sockaddr **nam)
296{
297	struct unpcb *unp, *unp2;
298	const struct sockaddr *sa;
299
300	/*
301	 * Pass back name of connected socket, if it was bound and we are
302	 * still connected (our peer may have closed already!).
303	 */
304	unp = sotounpcb(so);
305	KASSERT(unp != NULL, ("uipc_accept: unp == NULL"));
306
307	*nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
308	UNP_GLOBAL_RLOCK();
309	unp2 = unp->unp_conn;
310	if (unp2 != NULL && unp2->unp_addr != NULL) {
311		UNP_PCB_LOCK(unp2);
312		sa = (struct sockaddr *) unp2->unp_addr;
313		bcopy(sa, *nam, sa->sa_len);
314		UNP_PCB_UNLOCK(unp2);
315	} else {
316		sa = &sun_noname;
317		bcopy(sa, *nam, sa->sa_len);
318	}
319	UNP_GLOBAL_RUNLOCK();
320	return (0);
321}
322
323static int
324uipc_attach(struct socket *so, int proto, struct thread *td)
325{
326	u_long sendspace, recvspace;
327	struct unpcb *unp;
328	int error, locked;
329
330	KASSERT(so->so_pcb == NULL, ("uipc_attach: so_pcb != NULL"));
331	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
332		switch (so->so_type) {
333		case SOCK_STREAM:
334			sendspace = unpst_sendspace;
335			recvspace = unpst_recvspace;
336			break;
337
338		case SOCK_DGRAM:
339			sendspace = unpdg_sendspace;
340			recvspace = unpdg_recvspace;
341			break;
342
343		default:
344			panic("uipc_attach");
345		}
346		error = soreserve(so, sendspace, recvspace);
347		if (error)
348			return (error);
349	}
350	unp = uma_zalloc(unp_zone, M_NOWAIT | M_ZERO);
351	if (unp == NULL)
352		return (ENOBUFS);
353	LIST_INIT(&unp->unp_refs);
354	UNP_PCB_LOCK_INIT(unp);
355	unp->unp_socket = so;
356	so->so_pcb = unp;
357	unp->unp_refcount = 1;
358
359	/*
360	 * uipc_attach() may be called indirectly from within the UNIX domain
361	 * socket code via sonewconn() in unp_connect().  Since rwlocks can
362	 * not be recursed, we do the closest thing.
363	 */
364	locked = 0;
365	if (!UNP_GLOBAL_WOWNED()) {
366		UNP_GLOBAL_WLOCK();
367		locked = 1;
368	}
369	unp->unp_gencnt = ++unp_gencnt;
370	unp_count++;
371	LIST_INSERT_HEAD(so->so_type == SOCK_DGRAM ? &unp_dhead : &unp_shead,
372	    unp, unp_link);
373	if (locked)
374		UNP_GLOBAL_WUNLOCK();
375
376	return (0);
377}
378
379static int
380uipc_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
381{
382	struct sockaddr_un *soun = (struct sockaddr_un *)nam;
383	struct vattr vattr;
384	int error, namelen, vfslocked;
385	struct nameidata nd;
386	struct unpcb *unp;
387	struct vnode *vp;
388	struct mount *mp;
389	char *buf;
390
391	unp = sotounpcb(so);
392	KASSERT(unp != NULL, ("uipc_bind: unp == NULL"));
393
394	namelen = soun->sun_len - offsetof(struct sockaddr_un, sun_path);
395	if (namelen <= 0)
396		return (EINVAL);
397
398	/*
399	 * We don't allow simultaneous bind() calls on a single UNIX domain
400	 * socket, so flag in-progress operations, and return an error if an
401	 * operation is already in progress.
402	 *
403	 * Historically, we have not allowed a socket to be rebound, so this
404	 * also returns an error.  Not allowing re-binding simplifies the
405	 * implementation and avoids a great many possible failure modes.
406	 */
407	UNP_PCB_LOCK(unp);
408	if (unp->unp_vnode != NULL) {
409		UNP_PCB_UNLOCK(unp);
410		return (EINVAL);
411	}
412	if (unp->unp_flags & UNP_BINDING) {
413		UNP_PCB_UNLOCK(unp);
414		return (EALREADY);
415	}
416	unp->unp_flags |= UNP_BINDING;
417	UNP_PCB_UNLOCK(unp);
418
419	buf = malloc(namelen + 1, M_TEMP, M_WAITOK);
420	bcopy(soun->sun_path, buf, namelen);
421	buf[namelen] = 0;
422
423restart:
424	vfslocked = 0;
425	NDINIT(&nd, CREATE, MPSAFE | NOFOLLOW | LOCKPARENT | SAVENAME,
426	    UIO_SYSSPACE, buf, td);
427/* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */
428	error = namei(&nd);
429	if (error)
430		goto error;
431	vp = nd.ni_vp;
432	vfslocked = NDHASGIANT(&nd);
433	if (vp != NULL || vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
434		NDFREE(&nd, NDF_ONLY_PNBUF);
435		if (nd.ni_dvp == vp)
436			vrele(nd.ni_dvp);
437		else
438			vput(nd.ni_dvp);
439		if (vp != NULL) {
440			vrele(vp);
441			error = EADDRINUSE;
442			goto error;
443		}
444		error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH);
445		if (error)
446			goto error;
447		VFS_UNLOCK_GIANT(vfslocked);
448		goto restart;
449	}
450	VATTR_NULL(&vattr);
451	vattr.va_type = VSOCK;
452	vattr.va_mode = (ACCESSPERMS & ~td->td_proc->p_fd->fd_cmask);
453#ifdef MAC
454	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
455	    &vattr);
456#endif
457	if (error == 0) {
458		VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE);
459		error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
460	}
461	NDFREE(&nd, NDF_ONLY_PNBUF);
462	vput(nd.ni_dvp);
463	if (error) {
464		vn_finished_write(mp);
465		goto error;
466	}
467	vp = nd.ni_vp;
468	ASSERT_VOP_ELOCKED(vp, "uipc_bind");
469	soun = (struct sockaddr_un *)sodupsockaddr(nam, M_WAITOK);
470
471	UNP_GLOBAL_WLOCK();
472	UNP_PCB_LOCK(unp);
473	vp->v_socket = unp->unp_socket;
474	unp->unp_vnode = vp;
475	unp->unp_addr = soun;
476	unp->unp_flags &= ~UNP_BINDING;
477	UNP_PCB_UNLOCK(unp);
478	UNP_GLOBAL_WUNLOCK();
479	VOP_UNLOCK(vp, 0);
480	vn_finished_write(mp);
481	VFS_UNLOCK_GIANT(vfslocked);
482	free(buf, M_TEMP);
483	return (0);
484
485error:
486	VFS_UNLOCK_GIANT(vfslocked);
487	UNP_PCB_LOCK(unp);
488	unp->unp_flags &= ~UNP_BINDING;
489	UNP_PCB_UNLOCK(unp);
490	free(buf, M_TEMP);
491	return (error);
492}
493
494static int
495uipc_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
496{
497	int error;
498
499	KASSERT(td == curthread, ("uipc_connect: td != curthread"));
500	UNP_GLOBAL_WLOCK();
501	error = unp_connect(so, nam, td);
502	UNP_GLOBAL_WUNLOCK();
503	return (error);
504}
505
506static void
507uipc_close(struct socket *so)
508{
509	struct unpcb *unp, *unp2;
510
511	unp = sotounpcb(so);
512	KASSERT(unp != NULL, ("uipc_close: unp == NULL"));
513
514	UNP_GLOBAL_WLOCK();
515	UNP_PCB_LOCK(unp);
516	unp2 = unp->unp_conn;
517	if (unp2 != NULL) {
518		UNP_PCB_LOCK(unp2);
519		unp_disconnect(unp, unp2);
520		UNP_PCB_UNLOCK(unp2);
521	}
522	UNP_PCB_UNLOCK(unp);
523	UNP_GLOBAL_WUNLOCK();
524}
525
526int
527uipc_connect2(struct socket *so1, struct socket *so2)
528{
529	struct unpcb *unp, *unp2;
530	int error;
531
532	UNP_GLOBAL_WLOCK();
533	unp = so1->so_pcb;
534	KASSERT(unp != NULL, ("uipc_connect2: unp == NULL"));
535	UNP_PCB_LOCK(unp);
536	unp2 = so2->so_pcb;
537	KASSERT(unp2 != NULL, ("uipc_connect2: unp2 == NULL"));
538	UNP_PCB_LOCK(unp2);
539	error = unp_connect2(so1, so2, PRU_CONNECT2);
540	UNP_PCB_UNLOCK(unp2);
541	UNP_PCB_UNLOCK(unp);
542	UNP_GLOBAL_WUNLOCK();
543	return (error);
544}
545
546/* control is EOPNOTSUPP */
547
548static void
549uipc_detach(struct socket *so)
550{
551	struct unpcb *unp, *unp2;
552	struct sockaddr_un *saved_unp_addr;
553	struct vnode *vp;
554	int freeunp, local_unp_rights;
555
556	unp = sotounpcb(so);
557	KASSERT(unp != NULL, ("uipc_detach: unp == NULL"));
558
559	UNP_GLOBAL_WLOCK();
560	UNP_PCB_LOCK(unp);
561
562	LIST_REMOVE(unp, unp_link);
563	unp->unp_gencnt = ++unp_gencnt;
564	--unp_count;
565
566	/*
567	 * XXXRW: Should assert vp->v_socket == so.
568	 */
569	if ((vp = unp->unp_vnode) != NULL) {
570		unp->unp_vnode->v_socket = NULL;
571		unp->unp_vnode = NULL;
572	}
573	unp2 = unp->unp_conn;
574	if (unp2 != NULL) {
575		UNP_PCB_LOCK(unp2);
576		unp_disconnect(unp, unp2);
577		UNP_PCB_UNLOCK(unp2);
578	}
579
580	/*
581	 * We hold the global lock exclusively, so it's OK to acquire
582	 * multiple pcb locks at a time.
583	 */
584	while (!LIST_EMPTY(&unp->unp_refs)) {
585		struct unpcb *ref = LIST_FIRST(&unp->unp_refs);
586
587		UNP_PCB_LOCK(ref);
588		unp_drop(ref, ECONNRESET);
589		UNP_PCB_UNLOCK(ref);
590	}
591	local_unp_rights = unp_rights;
592	UNP_GLOBAL_WUNLOCK();
593	unp->unp_socket->so_pcb = NULL;
594	saved_unp_addr = unp->unp_addr;
595	unp->unp_addr = NULL;
596	unp->unp_refcount--;
597	freeunp = (unp->unp_refcount == 0);
598	if (saved_unp_addr != NULL)
599		FREE(saved_unp_addr, M_SONAME);
600	if (freeunp) {
601		UNP_PCB_LOCK_DESTROY(unp);
602		uma_zfree(unp_zone, unp);
603	} else
604		UNP_PCB_UNLOCK(unp);
605	if (vp) {
606		int vfslocked;
607
608		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
609		vrele(vp);
610		VFS_UNLOCK_GIANT(vfslocked);
611	}
612	if (local_unp_rights)
613		taskqueue_enqueue(taskqueue_thread, &unp_gc_task);
614}
615
616static int
617uipc_disconnect(struct socket *so)
618{
619	struct unpcb *unp, *unp2;
620
621	unp = sotounpcb(so);
622	KASSERT(unp != NULL, ("uipc_disconnect: unp == NULL"));
623
624	UNP_GLOBAL_WLOCK();
625	UNP_PCB_LOCK(unp);
626	unp2 = unp->unp_conn;
627	if (unp2 != NULL) {
628		UNP_PCB_LOCK(unp2);
629		unp_disconnect(unp, unp2);
630		UNP_PCB_UNLOCK(unp2);
631	}
632	UNP_PCB_UNLOCK(unp);
633	UNP_GLOBAL_WUNLOCK();
634	return (0);
635}
636
637static int
638uipc_listen(struct socket *so, int backlog, struct thread *td)
639{
640	struct unpcb *unp;
641	int error;
642
643	unp = sotounpcb(so);
644	KASSERT(unp != NULL, ("uipc_listen: unp == NULL"));
645
646	UNP_PCB_LOCK(unp);
647	if (unp->unp_vnode == NULL) {
648		UNP_PCB_UNLOCK(unp);
649		return (EINVAL);
650	}
651
652	SOCK_LOCK(so);
653	error = solisten_proto_check(so);
654	if (error == 0) {
655		cru2x(td->td_ucred, &unp->unp_peercred);
656		unp->unp_flags |= UNP_HAVEPCCACHED;
657		solisten_proto(so, backlog);
658	}
659	SOCK_UNLOCK(so);
660	UNP_PCB_UNLOCK(unp);
661	return (error);
662}
663
664static int
665uipc_peeraddr(struct socket *so, struct sockaddr **nam)
666{
667	struct unpcb *unp, *unp2;
668	const struct sockaddr *sa;
669
670	unp = sotounpcb(so);
671	KASSERT(unp != NULL, ("uipc_peeraddr: unp == NULL"));
672
673	*nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
674	UNP_PCB_LOCK(unp);
675	/*
676	 * XXX: It seems that this test always fails even when connection is
677	 * established.  So, this else clause is added as workaround to
678	 * return PF_LOCAL sockaddr.
679	 */
680	unp2 = unp->unp_conn;
681	if (unp2 != NULL) {
682		UNP_PCB_LOCK(unp2);
683		if (unp2->unp_addr != NULL)
684			sa = (struct sockaddr *) unp->unp_conn->unp_addr;
685		else
686			sa = &sun_noname;
687		bcopy(sa, *nam, sa->sa_len);
688		UNP_PCB_UNLOCK(unp2);
689	} else {
690		sa = &sun_noname;
691		bcopy(sa, *nam, sa->sa_len);
692	}
693	UNP_PCB_UNLOCK(unp);
694	return (0);
695}
696
697static int
698uipc_rcvd(struct socket *so, int flags)
699{
700	struct unpcb *unp, *unp2;
701	struct socket *so2;
702	u_int mbcnt, sbcc;
703	u_long newhiwat;
704
705	unp = sotounpcb(so);
706	KASSERT(unp != NULL, ("uipc_rcvd: unp == NULL"));
707
708	if (so->so_type == SOCK_DGRAM)
709		panic("uipc_rcvd DGRAM?");
710
711	if (so->so_type != SOCK_STREAM)
712		panic("uipc_rcvd unknown socktype");
713
714	/*
715	 * Adjust backpressure on sender and wakeup any waiting to write.
716	 *
717	 * The unp lock is acquired to maintain the validity of the unp_conn
718	 * pointer; no lock on unp2 is required as unp2->unp_socket will be
719	 * static as long as we don't permit unp2 to disconnect from unp,
720	 * which is prevented by the lock on unp.  We cache values from
721	 * so_rcv to avoid holding the so_rcv lock over the entire
722	 * transaction on the remote so_snd.
723	 */
724	SOCKBUF_LOCK(&so->so_rcv);
725	mbcnt = so->so_rcv.sb_mbcnt;
726	sbcc = so->so_rcv.sb_cc;
727	SOCKBUF_UNLOCK(&so->so_rcv);
728	UNP_PCB_LOCK(unp);
729	unp2 = unp->unp_conn;
730	if (unp2 == NULL) {
731		UNP_PCB_UNLOCK(unp);
732		return (0);
733	}
734	so2 = unp2->unp_socket;
735	SOCKBUF_LOCK(&so2->so_snd);
736	so2->so_snd.sb_mbmax += unp->unp_mbcnt - mbcnt;
737	newhiwat = so2->so_snd.sb_hiwat + unp->unp_cc - sbcc;
738	(void)chgsbsize(so2->so_cred->cr_uidinfo, &so2->so_snd.sb_hiwat,
739	    newhiwat, RLIM_INFINITY);
740	sowwakeup_locked(so2);
741	unp->unp_mbcnt = mbcnt;
742	unp->unp_cc = sbcc;
743	UNP_PCB_UNLOCK(unp);
744	return (0);
745}
746
747static int
748uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
749    struct mbuf *control, struct thread *td)
750{
751	struct unpcb *unp, *unp2;
752	struct socket *so2;
753	u_int mbcnt, sbcc;
754	u_long newhiwat;
755	int error = 0;
756
757	unp = sotounpcb(so);
758	KASSERT(unp != NULL, ("uipc_send: unp == NULL"));
759
760	if (flags & PRUS_OOB) {
761		error = EOPNOTSUPP;
762		goto release;
763	}
764	if (control != NULL && (error = unp_internalize(&control, td)))
765		goto release;
766	if ((nam != NULL) || (flags & PRUS_EOF))
767		UNP_GLOBAL_WLOCK();
768	else
769		UNP_GLOBAL_RLOCK();
770	switch (so->so_type) {
771	case SOCK_DGRAM:
772	{
773		const struct sockaddr *from;
774
775		unp2 = unp->unp_conn;
776		if (nam != NULL) {
777			UNP_GLOBAL_WLOCK_ASSERT();
778			if (unp2 != NULL) {
779				error = EISCONN;
780				break;
781			}
782			error = unp_connect(so, nam, td);
783			if (error)
784				break;
785			unp2 = unp->unp_conn;
786		}
787
788		/*
789		 * Because connect() and send() are non-atomic in a sendto()
790		 * with a target address, it's possible that the socket will
791		 * have disconnected before the send() can run.  In that case
792		 * return the slightly counter-intuitive but otherwise
793		 * correct error that the socket is not connected.
794		 */
795		if (unp2 == NULL) {
796			error = ENOTCONN;
797			break;
798		}
799		/* Lockless read. */
800		if (unp2->unp_flags & UNP_WANTCRED)
801			control = unp_addsockcred(td, control);
802		UNP_PCB_LOCK(unp);
803		if (unp->unp_addr != NULL)
804			from = (struct sockaddr *)unp->unp_addr;
805		else
806			from = &sun_noname;
807		so2 = unp2->unp_socket;
808		SOCKBUF_LOCK(&so2->so_rcv);
809		if (sbappendaddr_locked(&so2->so_rcv, from, m, control)) {
810			sorwakeup_locked(so2);
811			m = NULL;
812			control = NULL;
813		} else {
814			SOCKBUF_UNLOCK(&so2->so_rcv);
815			error = ENOBUFS;
816		}
817		if (nam != NULL) {
818			UNP_GLOBAL_WLOCK_ASSERT();
819			UNP_PCB_LOCK(unp2);
820			unp_disconnect(unp, unp2);
821			UNP_PCB_UNLOCK(unp2);
822		}
823		UNP_PCB_UNLOCK(unp);
824		break;
825	}
826
827	case SOCK_STREAM:
828		if ((so->so_state & SS_ISCONNECTED) == 0) {
829			if (nam != NULL) {
830				UNP_GLOBAL_WLOCK_ASSERT();
831				error = unp_connect(so, nam, td);
832				if (error)
833					break;	/* XXX */
834			} else {
835				error = ENOTCONN;
836				break;
837			}
838		}
839
840		/* Lockless read. */
841		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
842			error = EPIPE;
843			break;
844		}
845
846		/*
847		 * Because connect() and send() are non-atomic in a sendto()
848		 * with a target address, it's possible that the socket will
849		 * have disconnected before the send() can run.  In that case
850		 * return the slightly counter-intuitive but otherwise
851		 * correct error that the socket is not connected.
852		 *
853		 * Locking here must be done carefully: the global lock
854		 * prevents interconnections between unpcbs from changing, so
855		 * we can traverse from unp to unp2 without acquiring unp's
856		 * lock.  Socket buffer locks follow unpcb locks, so we can
857		 * acquire both remote and lock socket buffer locks.
858		 */
859		unp2 = unp->unp_conn;
860		if (unp2 == NULL) {
861			error = ENOTCONN;
862			break;
863		}
864		so2 = unp2->unp_socket;
865		UNP_PCB_LOCK(unp2);
866		SOCKBUF_LOCK(&so2->so_rcv);
867		if (unp2->unp_flags & UNP_WANTCRED) {
868			/*
869			 * Credentials are passed only once on SOCK_STREAM.
870			 */
871			unp2->unp_flags &= ~UNP_WANTCRED;
872			control = unp_addsockcred(td, control);
873		}
874		/*
875		 * Send to paired receive port, and then reduce send buffer
876		 * hiwater marks to maintain backpressure.  Wake up readers.
877		 */
878		if (control != NULL) {
879			if (sbappendcontrol_locked(&so2->so_rcv, m, control))
880				control = NULL;
881		} else
882			sbappend_locked(&so2->so_rcv, m);
883		mbcnt = so2->so_rcv.sb_mbcnt - unp2->unp_mbcnt;
884		unp2->unp_mbcnt = so2->so_rcv.sb_mbcnt;
885		sbcc = so2->so_rcv.sb_cc;
886		sorwakeup_locked(so2);
887
888		SOCKBUF_LOCK(&so->so_snd);
889		newhiwat = so->so_snd.sb_hiwat - (sbcc - unp2->unp_cc);
890		(void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_snd.sb_hiwat,
891		    newhiwat, RLIM_INFINITY);
892		so->so_snd.sb_mbmax -= mbcnt;
893		SOCKBUF_UNLOCK(&so->so_snd);
894		unp2->unp_cc = sbcc;
895		UNP_PCB_UNLOCK(unp2);
896		m = NULL;
897		break;
898
899	default:
900		panic("uipc_send unknown socktype");
901	}
902
903	/*
904	 * PRUS_EOF is equivalent to pru_send followed by pru_shutdown.
905	 */
906	if (flags & PRUS_EOF) {
907		UNP_PCB_LOCK(unp);
908		socantsendmore(so);
909		unp_shutdown(unp);
910		UNP_PCB_UNLOCK(unp);
911	}
912
913	if ((nam != NULL) || (flags & PRUS_EOF))
914		UNP_GLOBAL_WUNLOCK();
915	else
916		UNP_GLOBAL_RUNLOCK();
917
918	if (control != NULL && error != 0)
919		unp_dispose(control);
920
921release:
922	if (control != NULL)
923		m_freem(control);
924	if (m != NULL)
925		m_freem(m);
926	return (error);
927}
928
929static int
930uipc_sense(struct socket *so, struct stat *sb)
931{
932	struct unpcb *unp, *unp2;
933	struct socket *so2;
934
935	unp = sotounpcb(so);
936	KASSERT(unp != NULL, ("uipc_sense: unp == NULL"));
937
938	sb->st_blksize = so->so_snd.sb_hiwat;
939	UNP_GLOBAL_RLOCK();
940	UNP_PCB_LOCK(unp);
941	unp2 = unp->unp_conn;
942	if (so->so_type == SOCK_STREAM && unp2 != NULL) {
943		so2 = unp2->unp_socket;
944		sb->st_blksize += so2->so_rcv.sb_cc;
945	}
946	sb->st_dev = NODEV;
947	if (unp->unp_ino == 0)
948		unp->unp_ino = (++unp_ino == 0) ? ++unp_ino : unp_ino;
949	sb->st_ino = unp->unp_ino;
950	UNP_PCB_UNLOCK(unp);
951	UNP_GLOBAL_RUNLOCK();
952	return (0);
953}
954
955static int
956uipc_shutdown(struct socket *so)
957{
958	struct unpcb *unp;
959
960	unp = sotounpcb(so);
961	KASSERT(unp != NULL, ("uipc_shutdown: unp == NULL"));
962
963	UNP_GLOBAL_WLOCK();
964	UNP_PCB_LOCK(unp);
965	socantsendmore(so);
966	unp_shutdown(unp);
967	UNP_PCB_UNLOCK(unp);
968	UNP_GLOBAL_WUNLOCK();
969	return (0);
970}
971
972static int
973uipc_sockaddr(struct socket *so, struct sockaddr **nam)
974{
975	struct unpcb *unp;
976	const struct sockaddr *sa;
977
978	unp = sotounpcb(so);
979	KASSERT(unp != NULL, ("uipc_sockaddr: unp == NULL"));
980
981	*nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
982	UNP_PCB_LOCK(unp);
983	if (unp->unp_addr != NULL)
984		sa = (struct sockaddr *) unp->unp_addr;
985	else
986		sa = &sun_noname;
987	bcopy(sa, *nam, sa->sa_len);
988	UNP_PCB_UNLOCK(unp);
989	return (0);
990}
991
992struct pr_usrreqs uipc_usrreqs = {
993	.pru_abort = 		uipc_abort,
994	.pru_accept =		uipc_accept,
995	.pru_attach =		uipc_attach,
996	.pru_bind =		uipc_bind,
997	.pru_connect =		uipc_connect,
998	.pru_connect2 =		uipc_connect2,
999	.pru_detach =		uipc_detach,
1000	.pru_disconnect =	uipc_disconnect,
1001	.pru_listen =		uipc_listen,
1002	.pru_peeraddr =		uipc_peeraddr,
1003	.pru_rcvd =		uipc_rcvd,
1004	.pru_send =		uipc_send,
1005	.pru_sense =		uipc_sense,
1006	.pru_shutdown =		uipc_shutdown,
1007	.pru_sockaddr =		uipc_sockaddr,
1008	.pru_close =		uipc_close,
1009};
1010
/*
 * Get/set UNIX-domain-specific socket options: LOCAL_PEERCRED (get only),
 * LOCAL_CREDS, and LOCAL_CONNWAIT.  Returns 0 or an errno value.
 */
int
uipc_ctloutput(struct socket *so, struct sockopt *sopt)
{
	struct unpcb *unp;
	struct xucred xu;
	int error, optval;

	/* Only level 0 (local) options are handled here. */
	if (sopt->sopt_level != 0)
		return (EINVAL);

	unp = sotounpcb(so);
	KASSERT(unp != NULL, ("uipc_ctloutput: unp == NULL"));
	error = 0;
	switch (sopt->sopt_dir) {
	case SOPT_GET:
		switch (sopt->sopt_name) {
		case LOCAL_PEERCRED:
			/*
			 * Snapshot the cached peer credential under the pcb
			 * lock; sooptcopyout() may sleep, so the copyout is
			 * performed only after the lock is dropped.
			 */
			UNP_PCB_LOCK(unp);
			if (unp->unp_flags & UNP_HAVEPC)
				xu = unp->unp_peercred;
			else {
				if (so->so_type == SOCK_STREAM)
					error = ENOTCONN;
				else
					error = EINVAL;
			}
			UNP_PCB_UNLOCK(unp);
			if (error == 0)
				error = sooptcopyout(sopt, &xu, sizeof(xu));
			break;

		case LOCAL_CREDS:
			/* Unlocked read. */
			optval = unp->unp_flags & UNP_WANTCRED ? 1 : 0;
			error = sooptcopyout(sopt, &optval, sizeof(optval));
			break;

		case LOCAL_CONNWAIT:
			/* Unlocked read. */
			optval = unp->unp_flags & UNP_CONNWAIT ? 1 : 0;
			error = sooptcopyout(sopt, &optval, sizeof(optval));
			break;

		default:
			error = EOPNOTSUPP;
			break;
		}
		break;

	case SOPT_SET:
		switch (sopt->sopt_name) {
		case LOCAL_CREDS:
		case LOCAL_CONNWAIT:
			error = sooptcopyin(sopt, &optval, sizeof(optval),
					    sizeof(optval));
			if (error)
				break;

/*
 * Set or clear 'bit' in unp_flags according to 'optval', under the pcb
 * lock.  Defined locally because it captures 'unp' and 'optval'.
 */
#define	OPTSET(bit) do {						\
	UNP_PCB_LOCK(unp);						\
	if (optval)							\
		unp->unp_flags |= bit;					\
	else								\
		unp->unp_flags &= ~bit;					\
	UNP_PCB_UNLOCK(unp);						\
} while (0)

			switch (sopt->sopt_name) {
			case LOCAL_CREDS:
				OPTSET(UNP_WANTCRED);
				break;

			case LOCAL_CONNWAIT:
				OPTSET(UNP_CONNWAIT);
				break;

			default:
				break;
			}
			break;
#undef	OPTSET
		default:
			error = ENOPROTOOPT;
			break;
		}
		break;

	default:
		error = EOPNOTSUPP;
		break;
	}
	return (error);
}
1104
/*
 * Connect socket 'so' to the filesystem path carried in 'nam'.  Entered
 * with the global write lock held; the lock is dropped across the
 * sleepable namei() lookup and reacquired before socket linkage is
 * touched.  The UNP_CONNECTING flag rejects concurrent connection
 * attempts on the same pcb while the lock is dropped.
 */
static int
unp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	struct sockaddr_un *soun = (struct sockaddr_un *)nam;
	struct vnode *vp;
	struct socket *so2, *so3;
	struct unpcb *unp, *unp2, *unp3;
	int error, len, vfslocked;
	struct nameidata nd;
	char buf[SOCK_MAXADDRLEN];
	struct sockaddr *sa;

	UNP_GLOBAL_WLOCK_ASSERT();

	unp = sotounpcb(so);
	KASSERT(unp != NULL, ("unp_connect: unp == NULL"));

	/* Copy the path into a NUL-terminated local buffer for namei(). */
	len = nam->sa_len - offsetof(struct sockaddr_un, sun_path);
	if (len <= 0)
		return (EINVAL);
	bcopy(soun->sun_path, buf, len);
	buf[len] = 0;

	UNP_PCB_LOCK(unp);
	if (unp->unp_flags & UNP_CONNECTING) {
		UNP_PCB_UNLOCK(unp);
		return (EALREADY);
	}
	UNP_GLOBAL_WUNLOCK();
	unp->unp_flags |= UNP_CONNECTING;
	UNP_PCB_UNLOCK(unp);

	sa = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
	NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | LOCKLEAF, UIO_SYSSPACE, buf,
	    td);
	error = namei(&nd);
	if (error)
		vp = NULL;
	else
		vp = nd.ni_vp;
	/*
	 * NOTE(review): vp may be NULL here when namei() failed; this
	 * assumes ASSERT_VOP_LOCKED tolerates a NULL vnode -- confirm.
	 */
	ASSERT_VOP_LOCKED(vp, "unp_connect");
	vfslocked = NDHASGIANT(&nd);
	NDFREE(&nd, NDF_ONLY_PNBUF);
	if (error)
		goto bad;

	if (vp->v_type != VSOCK) {
		error = ENOTSOCK;
		goto bad;
	}
#ifdef MAC
	error = mac_vnode_check_open(td->td_ucred, vp, VWRITE | VREAD);
	if (error)
		goto bad;
#endif
	error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td);
	if (error)
		goto bad;
	VFS_UNLOCK_GIANT(vfslocked);

	unp = sotounpcb(so);
	KASSERT(unp != NULL, ("unp_connect: unp == NULL"));

	/*
	 * Lock global lock for two reasons: make sure v_socket is stable,
	 * and to protect simultaneous locking of multiple pcbs.
	 */
	UNP_GLOBAL_WLOCK();
	so2 = vp->v_socket;
	if (so2 == NULL) {
		error = ECONNREFUSED;
		goto bad2;
	}
	if (so->so_type != so2->so_type) {
		error = EPROTOTYPE;
		goto bad2;
	}
	if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
		if (so2->so_options & SO_ACCEPTCONN) {
			/*
			 * We can't drop the global lock here or 'so2' may
			 * become invalid.  As a result, we need to handle
			 * possibly lock recursion in uipc_attach.
			 */
			so3 = sonewconn(so2, 0);
		} else
			so3 = NULL;
		if (so3 == NULL) {
			error = ECONNREFUSED;
			goto bad2;
		}
		unp = sotounpcb(so);
		unp2 = sotounpcb(so2);
		unp3 = sotounpcb(so3);
		UNP_PCB_LOCK(unp);
		UNP_PCB_LOCK(unp2);
		UNP_PCB_LOCK(unp3);
		/* Give the new child socket a copy of the listener's name. */
		if (unp2->unp_addr != NULL) {
			bcopy(unp2->unp_addr, sa, unp2->unp_addr->sun_len);
			unp3->unp_addr = (struct sockaddr_un *) sa;
			sa = NULL;
		}
		/*
		 * unp_peercred management:
		 *
		 * The connecter's (client's) credentials are copied from its
		 * process structure at the time of connect() (which is now).
		 */
		cru2x(td->td_ucred, &unp3->unp_peercred);
		unp3->unp_flags |= UNP_HAVEPC;
		/*
		 * The receiver's (server's) credentials are copied from the
		 * unp_peercred member of socket on which the former called
		 * listen(); uipc_listen() cached that process's credentials
		 * at that time so we can use them now.
		 */
		KASSERT(unp2->unp_flags & UNP_HAVEPCCACHED,
		    ("unp_connect: listener without cached peercred"));
		memcpy(&unp->unp_peercred, &unp2->unp_peercred,
		    sizeof(unp->unp_peercred));
		unp->unp_flags |= UNP_HAVEPC;
		if (unp2->unp_flags & UNP_WANTCRED)
			unp3->unp_flags |= UNP_WANTCRED;
		UNP_PCB_UNLOCK(unp3);
		UNP_PCB_UNLOCK(unp2);
		UNP_PCB_UNLOCK(unp);
#ifdef MAC
		SOCK_LOCK(so);
		mac_socketpeer_set_from_socket(so, so3);
		mac_socketpeer_set_from_socket(so3, so);
		SOCK_UNLOCK(so);
#endif

		/* From here on, connect to the child, not the listener. */
		so2 = so3;
	}
	unp = sotounpcb(so);
	KASSERT(unp != NULL, ("unp_connect: unp == NULL"));
	unp2 = sotounpcb(so2);
	KASSERT(unp2 != NULL, ("unp_connect: unp2 == NULL"));
	UNP_PCB_LOCK(unp);
	UNP_PCB_LOCK(unp2);
	error = unp_connect2(so, so2, PRU_CONNECT);
	UNP_PCB_UNLOCK(unp2);
	UNP_PCB_UNLOCK(unp);
bad2:
	UNP_GLOBAL_WUNLOCK();
	if (vfslocked)
		/*
		 * Giant has been previously acquired. This means filesystem
		 * isn't MPSAFE.  Do it once again.
		 */
		mtx_lock(&Giant);
bad:
	if (vp != NULL)
		vput(vp);
	VFS_UNLOCK_GIANT(vfslocked);
	free(sa, M_SONAME);
	/* Reacquire locks to clear the in-progress flag before returning. */
	UNP_GLOBAL_WLOCK();
	UNP_PCB_LOCK(unp);
	unp->unp_flags &= ~UNP_CONNECTING;
	UNP_PCB_UNLOCK(unp);
	return (error);
}
1268
/*
 * Splice two pcbs together as a connected pair.  The caller holds the
 * global write lock and both pcb locks.  Datagram sockets get a one-way
 * unp->unp_conn link plus an entry on the peer's reference list; stream
 * sockets are cross-linked.  For PRU_CONNECT with UNP_CONNWAIT on either
 * end, the caller is marked connecting rather than connected.
 */
static int
unp_connect2(struct socket *so, struct socket *so2, int req)
{
	struct unpcb *unp;
	struct unpcb *unp2;

	unp = sotounpcb(so);
	KASSERT(unp != NULL, ("unp_connect2: unp == NULL"));
	unp2 = sotounpcb(so2);
	KASSERT(unp2 != NULL, ("unp_connect2: unp2 == NULL"));

	UNP_GLOBAL_WLOCK_ASSERT();
	UNP_PCB_LOCK_ASSERT(unp);
	UNP_PCB_LOCK_ASSERT(unp2);

	if (so2->so_type != so->so_type)
		return (EPROTOTYPE);
	unp->unp_conn = unp2;

	switch (so->so_type) {
	case SOCK_DGRAM:
		LIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_reflink);
		soisconnected(so);
		break;

	case SOCK_STREAM:
		unp2->unp_conn = unp;
		if (req == PRU_CONNECT &&
		    ((unp->unp_flags | unp2->unp_flags) & UNP_CONNWAIT))
			soisconnecting(so);
		else
			soisconnected(so);
		soisconnected(so2);
		break;

	default:
		panic("unp_connect2");
	}
	return (0);
}
1309
/*
 * Tear down the link between a connected pcb pair.  The caller holds the
 * global write lock and both pcb locks.  Datagram sockets are removed
 * from the peer's reference list and quietly lose SS_ISCONNECTED; stream
 * sockets disconnect both ends symmetrically.
 */
static void
unp_disconnect(struct unpcb *unp, struct unpcb *unp2)
{
	struct socket *so;

	KASSERT(unp2 != NULL, ("unp_disconnect: unp2 == NULL"));

	UNP_GLOBAL_WLOCK_ASSERT();
	UNP_PCB_LOCK_ASSERT(unp);
	UNP_PCB_LOCK_ASSERT(unp2);

	unp->unp_conn = NULL;
	switch (unp->unp_socket->so_type) {
	case SOCK_DGRAM:
		LIST_REMOVE(unp, unp_reflink);
		so = unp->unp_socket;
		SOCK_LOCK(so);
		so->so_state &= ~SS_ISCONNECTED;
		SOCK_UNLOCK(so);
		break;

	case SOCK_STREAM:
		soisdisconnected(unp->unp_socket);
		unp2->unp_conn = NULL;
		soisdisconnected(unp2->unp_socket);
		break;
	}
}
1338
1339/*
1340 * unp_pcblist() walks the global list of struct unpcb's to generate a
1341 * pointer list, bumping the refcount on each unpcb.  It then copies them out
1342 * sequentially, validating the generation number on each to see if it has
1343 * been detached.  All of this is necessary because copyout() may sleep on
1344 * disk I/O.
1345 */
static int
unp_pcblist(SYSCTL_HANDLER_ARGS)
{
	int error, i, n;
	int freeunp;
	struct unpcb *unp, **unp_list;
	unp_gen_t gencnt;
	struct xunpgen *xug;
	struct unp_head *head;
	struct xunpcb *xu;

	/* arg1 carries the socket type selecting which pcb list to walk. */
	head = ((intptr_t)arg1 == SOCK_DGRAM ? &unp_dhead : &unp_shead);

	/*
	 * The process of preparing the PCB list is too time-consuming and
	 * resource-intensive to repeat twice on every request.
	 */
	if (req->oldptr == NULL) {
		n = unp_count;
		/* Overestimate (n + n/8) to allow for growth before copyout. */
		req->oldidx = 2 * (sizeof *xug)
			+ (n + n/8) * sizeof(struct xunpcb);
		return (0);
	}

	if (req->newptr != NULL)
		return (EPERM);

	/*
	 * OK, now we're committed to doing something.
	 */
	xug = malloc(sizeof(*xug), M_TEMP, M_WAITOK);
	UNP_GLOBAL_RLOCK();
	gencnt = unp_gencnt;
	n = unp_count;
	UNP_GLOBAL_RUNLOCK();

	xug->xug_len = sizeof *xug;
	xug->xug_count = n;
	xug->xug_gen = gencnt;
	xug->xug_sogen = so_gencnt;
	error = SYSCTL_OUT(req, xug, sizeof *xug);
	if (error) {
		free(xug, M_TEMP);
		return (error);
	}

	unp_list = malloc(n * sizeof *unp_list, M_TEMP, M_WAITOK);

	/*
	 * Pass 1: collect visible, current-generation pcbs, bumping each
	 * refcount so the entries survive the unlocked copyout below.
	 */
	UNP_GLOBAL_RLOCK();
	for (unp = LIST_FIRST(head), i = 0; unp && i < n;
	     unp = LIST_NEXT(unp, unp_link)) {
		UNP_PCB_LOCK(unp);
		if (unp->unp_gencnt <= gencnt) {
			if (cr_cansee(req->td->td_ucred,
			    unp->unp_socket->so_cred)) {
				UNP_PCB_UNLOCK(unp);
				continue;
			}
			unp_list[i++] = unp;
			unp->unp_refcount++;
		}
		UNP_PCB_UNLOCK(unp);
	}
	UNP_GLOBAL_RUNLOCK();
	n = i;			/* In case we lost some during malloc. */

	/*
	 * Pass 2: drop each reference and either export the entry (still
	 * valid) or free the pcb if we held the last reference to one
	 * that was detached in the meantime.
	 */
	error = 0;
	xu = malloc(sizeof(*xu), M_TEMP, M_WAITOK | M_ZERO);
	for (i = 0; i < n; i++) {
		unp = unp_list[i];
		UNP_PCB_LOCK(unp);
		unp->unp_refcount--;
	        if (unp->unp_refcount != 0 && unp->unp_gencnt <= gencnt) {
			xu->xu_len = sizeof *xu;
			xu->xu_unpp = unp;
			/*
			 * XXX - need more locking here to protect against
			 * connect/disconnect races for SMP.
			 */
			if (unp->unp_addr != NULL)
				bcopy(unp->unp_addr, &xu->xu_addr,
				      unp->unp_addr->sun_len);
			if (unp->unp_conn != NULL &&
			    unp->unp_conn->unp_addr != NULL)
				bcopy(unp->unp_conn->unp_addr,
				      &xu->xu_caddr,
				      unp->unp_conn->unp_addr->sun_len);
			bcopy(unp, &xu->xu_unp, sizeof *unp);
			sotoxsocket(unp->unp_socket, &xu->xu_socket);
			UNP_PCB_UNLOCK(unp);
			error = SYSCTL_OUT(req, xu, sizeof *xu);
		} else {
			freeunp = (unp->unp_refcount == 0);
			UNP_PCB_UNLOCK(unp);
			if (freeunp) {
				UNP_PCB_LOCK_DESTROY(unp);
				uma_zfree(unp_zone, unp);
			}
		}
	}
	free(xu, M_TEMP);
	if (!error) {
		/*
		 * Give the user an updated idea of our state.  If the
		 * generation differs from what we told her before, she knows
		 * that something happened while we were processing this
		 * request, and it might be necessary to retry.
		 */
		xug->xug_gen = unp_gencnt;
		xug->xug_sogen = so_gencnt;
		xug->xug_count = unp_count;
		error = SYSCTL_OUT(req, xug, sizeof *xug);
	}
	free(unp_list, M_TEMP);
	free(xug, M_TEMP);
	return (error);
}
1463
/*
 * Export the datagram and stream pcb lists via sysctl; the socket type
 * is passed to unp_pcblist() through arg1.
 */
SYSCTL_PROC(_net_local_dgram, OID_AUTO, pcblist, CTLFLAG_RD,
	    (caddr_t)(long)SOCK_DGRAM, 0, unp_pcblist, "S,xunpcb",
	    "List of active local datagram sockets");
SYSCTL_PROC(_net_local_stream, OID_AUTO, pcblist, CTLFLAG_RD,
	    (caddr_t)(long)SOCK_STREAM, 0, unp_pcblist, "S,xunpcb",
	    "List of active local stream sockets");
1470
1471static void
1472unp_shutdown(struct unpcb *unp)
1473{
1474	struct unpcb *unp2;
1475	struct socket *so;
1476
1477	UNP_GLOBAL_WLOCK_ASSERT();
1478	UNP_PCB_LOCK_ASSERT(unp);
1479
1480	unp2 = unp->unp_conn;
1481	if (unp->unp_socket->so_type == SOCK_STREAM && unp2 != NULL) {
1482		so = unp2->unp_socket;
1483		if (so != NULL)
1484			socantrcvmore(so);
1485	}
1486}
1487
/*
 * Report 'errno' on the socket and disconnect from the peer, if any.
 * Caller holds the global write lock and unp's pcb lock; the peer's lock
 * is acquired here, which is safe because the global write lock
 * serializes locking of multiple pcbs (see comment in unp_connect()).
 */
static void
unp_drop(struct unpcb *unp, int errno)
{
	struct socket *so = unp->unp_socket;
	struct unpcb *unp2;

	UNP_GLOBAL_WLOCK_ASSERT();
	UNP_PCB_LOCK_ASSERT(unp);

	so->so_error = errno;
	unp2 = unp->unp_conn;
	if (unp2 == NULL)
		return;

	UNP_PCB_LOCK(unp2);
	unp_disconnect(unp, unp2);
	UNP_PCB_UNLOCK(unp2);
}
1506
1507static void
1508unp_freerights(struct file **rp, int fdcount)
1509{
1510	int i;
1511	struct file *fp;
1512
1513	for (i = 0; i < fdcount; i++) {
1514		/*
1515		 * Zero the pointer before calling unp_discard since it may
1516		 * end up in unp_gc()..
1517		 *
1518		 * XXXRW: This is less true than it used to be.
1519		 */
1520		fp = *rp;
1521		*rp++ = NULL;
1522		unp_discard(fp);
1523	}
1524}
1525
1526int
1527unp_externalize(struct mbuf *control, struct mbuf **controlp)
1528{
1529	struct thread *td = curthread;		/* XXX */
1530	struct cmsghdr *cm = mtod(control, struct cmsghdr *);
1531	int i;
1532	int *fdp;
1533	struct file **rp;
1534	struct file *fp;
1535	void *data;
1536	socklen_t clen = control->m_len, datalen;
1537	int error, newfds;
1538	int f;
1539	u_int newlen;
1540
1541	UNP_GLOBAL_UNLOCK_ASSERT();
1542
1543	error = 0;
1544	if (controlp != NULL) /* controlp == NULL => free control messages */
1545		*controlp = NULL;
1546
1547	while (cm != NULL) {
1548		if (sizeof(*cm) > clen || cm->cmsg_len > clen) {
1549			error = EINVAL;
1550			break;
1551		}
1552
1553		data = CMSG_DATA(cm);
1554		datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data;
1555
1556		if (cm->cmsg_level == SOL_SOCKET
1557		    && cm->cmsg_type == SCM_RIGHTS) {
1558			newfds = datalen / sizeof(struct file *);
1559			rp = data;
1560
1561			/* If we're not outputting the descriptors free them. */
1562			if (error || controlp == NULL) {
1563				unp_freerights(rp, newfds);
1564				goto next;
1565			}
1566			FILEDESC_XLOCK(td->td_proc->p_fd);
1567			/* if the new FD's will not fit free them.  */
1568			if (!fdavail(td, newfds)) {
1569				FILEDESC_XUNLOCK(td->td_proc->p_fd);
1570				error = EMSGSIZE;
1571				unp_freerights(rp, newfds);
1572				goto next;
1573			}
1574
1575			/*
1576			 * Now change each pointer to an fd in the global
1577			 * table to an integer that is the index to the local
1578			 * fd table entry that we set up to point to the
1579			 * global one we are transferring.
1580			 */
1581			newlen = newfds * sizeof(int);
1582			*controlp = sbcreatecontrol(NULL, newlen,
1583			    SCM_RIGHTS, SOL_SOCKET);
1584			if (*controlp == NULL) {
1585				FILEDESC_XUNLOCK(td->td_proc->p_fd);
1586				error = E2BIG;
1587				unp_freerights(rp, newfds);
1588				goto next;
1589			}
1590
1591			fdp = (int *)
1592			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
1593			for (i = 0; i < newfds; i++) {
1594				if (fdalloc(td, 0, &f))
1595					panic("unp_externalize fdalloc failed");
1596				fp = *rp++;
1597				td->td_proc->p_fd->fd_ofiles[f] = fp;
1598				unp_externalize_fp(fp);
1599				*fdp++ = f;
1600			}
1601			FILEDESC_XUNLOCK(td->td_proc->p_fd);
1602		} else {
1603			/* We can just copy anything else across. */
1604			if (error || controlp == NULL)
1605				goto next;
1606			*controlp = sbcreatecontrol(NULL, datalen,
1607			    cm->cmsg_type, cm->cmsg_level);
1608			if (*controlp == NULL) {
1609				error = ENOBUFS;
1610				goto next;
1611			}
1612			bcopy(data,
1613			    CMSG_DATA(mtod(*controlp, struct cmsghdr *)),
1614			    datalen);
1615		}
1616
1617		controlp = &(*controlp)->m_next;
1618
1619next:
1620		if (CMSG_SPACE(datalen) < clen) {
1621			clen -= CMSG_SPACE(datalen);
1622			cm = (struct cmsghdr *)
1623			    ((caddr_t)cm + CMSG_SPACE(datalen));
1624		} else {
1625			clen = 0;
1626			cm = NULL;
1627		}
1628	}
1629
1630	m_freem(control);
1631
1632	return (error);
1633}
1634
/*
 * Eventhandler: keep the unpcb zone limit in sync with changes to
 * maxsockets.
 */
static void
unp_zone_change(void *tag)
{

	uma_zone_set_max(unp_zone, maxsockets);
}
1641
/*
 * One-time subsystem initialization: create the unpcb zone (capped at
 * maxsockets, tracking future changes via eventhandler), and set up the
 * pcb lists, the garbage-collection task, and the global lock.
 */
void
unp_init(void)
{

	unp_zone = uma_zcreate("unpcb", sizeof(struct unpcb), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, 0);
	if (unp_zone == NULL)
		panic("unp_init");
	uma_zone_set_max(unp_zone, maxsockets);
	EVENTHANDLER_REGISTER(maxsockets_change, unp_zone_change,
	    NULL, EVENTHANDLER_PRI_ANY);
	LIST_INIT(&unp_dhead);
	LIST_INIT(&unp_shead);
	TASK_INIT(&unp_gc_task, 0, unp_gc, NULL);
	UNP_GLOBAL_LOCK_INIT();
}
1658
1659static int
1660unp_internalize(struct mbuf **controlp, struct thread *td)
1661{
1662	struct mbuf *control = *controlp;
1663	struct proc *p = td->td_proc;
1664	struct filedesc *fdescp = p->p_fd;
1665	struct cmsghdr *cm = mtod(control, struct cmsghdr *);
1666	struct cmsgcred *cmcred;
1667	struct file **rp;
1668	struct file *fp;
1669	struct timeval *tv;
1670	int i, fd, *fdp;
1671	void *data;
1672	socklen_t clen = control->m_len, datalen;
1673	int error, oldfds;
1674	u_int newlen;
1675
1676	UNP_GLOBAL_UNLOCK_ASSERT();
1677
1678	error = 0;
1679	*controlp = NULL;
1680
1681	while (cm != NULL) {
1682		if (sizeof(*cm) > clen || cm->cmsg_level != SOL_SOCKET
1683		    || cm->cmsg_len > clen) {
1684			error = EINVAL;
1685			goto out;
1686		}
1687
1688		data = CMSG_DATA(cm);
1689		datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data;
1690
1691		switch (cm->cmsg_type) {
1692		/*
1693		 * Fill in credential information.
1694		 */
1695		case SCM_CREDS:
1696			*controlp = sbcreatecontrol(NULL, sizeof(*cmcred),
1697			    SCM_CREDS, SOL_SOCKET);
1698			if (*controlp == NULL) {
1699				error = ENOBUFS;
1700				goto out;
1701			}
1702
1703			cmcred = (struct cmsgcred *)
1704			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
1705			cmcred->cmcred_pid = p->p_pid;
1706			cmcred->cmcred_uid = td->td_ucred->cr_ruid;
1707			cmcred->cmcred_gid = td->td_ucred->cr_rgid;
1708			cmcred->cmcred_euid = td->td_ucred->cr_uid;
1709			cmcred->cmcred_ngroups = MIN(td->td_ucred->cr_ngroups,
1710							CMGROUP_MAX);
1711			for (i = 0; i < cmcred->cmcred_ngroups; i++)
1712				cmcred->cmcred_groups[i] =
1713				    td->td_ucred->cr_groups[i];
1714			break;
1715
1716		case SCM_RIGHTS:
1717			oldfds = datalen / sizeof (int);
1718			/*
1719			 * Check that all the FDs passed in refer to legal
1720			 * files.  If not, reject the entire operation.
1721			 */
1722			fdp = data;
1723			FILEDESC_SLOCK(fdescp);
1724			for (i = 0; i < oldfds; i++) {
1725				fd = *fdp++;
1726				if ((unsigned)fd >= fdescp->fd_nfiles ||
1727				    fdescp->fd_ofiles[fd] == NULL) {
1728					FILEDESC_SUNLOCK(fdescp);
1729					error = EBADF;
1730					goto out;
1731				}
1732				fp = fdescp->fd_ofiles[fd];
1733				if (!(fp->f_ops->fo_flags & DFLAG_PASSABLE)) {
1734					FILEDESC_SUNLOCK(fdescp);
1735					error = EOPNOTSUPP;
1736					goto out;
1737				}
1738
1739			}
1740
1741			/*
1742			 * Now replace the integer FDs with pointers to
1743			 * the associated global file table entry..
1744			 */
1745			newlen = oldfds * sizeof(struct file *);
1746			*controlp = sbcreatecontrol(NULL, newlen,
1747			    SCM_RIGHTS, SOL_SOCKET);
1748			if (*controlp == NULL) {
1749				FILEDESC_SUNLOCK(fdescp);
1750				error = E2BIG;
1751				goto out;
1752			}
1753
1754			fdp = data;
1755			rp = (struct file **)
1756			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
1757			for (i = 0; i < oldfds; i++) {
1758				fp = fdescp->fd_ofiles[*fdp++];
1759				*rp++ = fp;
1760				unp_internalize_fp(fp);
1761			}
1762			FILEDESC_SUNLOCK(fdescp);
1763			break;
1764
1765		case SCM_TIMESTAMP:
1766			*controlp = sbcreatecontrol(NULL, sizeof(*tv),
1767			    SCM_TIMESTAMP, SOL_SOCKET);
1768			if (*controlp == NULL) {
1769				error = ENOBUFS;
1770				goto out;
1771			}
1772			tv = (struct timeval *)
1773			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
1774			microtime(tv);
1775			break;
1776
1777		default:
1778			error = EINVAL;
1779			goto out;
1780		}
1781
1782		controlp = &(*controlp)->m_next;
1783
1784		if (CMSG_SPACE(datalen) < clen) {
1785			clen -= CMSG_SPACE(datalen);
1786			cm = (struct cmsghdr *)
1787			    ((caddr_t)cm + CMSG_SPACE(datalen));
1788		} else {
1789			clen = 0;
1790			cm = NULL;
1791		}
1792	}
1793
1794out:
1795	m_freem(control);
1796
1797	return (error);
1798}
1799
/*
 * Prepend a SCM_CREDS (struct sockcred) control message carrying the
 * sender's credentials to 'control', removing any existing SCM_CREDS
 * messages first.  Returns the new chain head; on allocation failure the
 * original chain is returned unchanged.
 */
static struct mbuf *
unp_addsockcred(struct thread *td, struct mbuf *control)
{
	struct mbuf *m, *n, *n_prev;
	struct sockcred *sc;
	const struct cmsghdr *cm;
	int ngroups;
	int i;

	ngroups = MIN(td->td_ucred->cr_ngroups, CMGROUP_MAX);

	m = sbcreatecontrol(NULL, SOCKCREDSIZE(ngroups), SCM_CREDS, SOL_SOCKET);
	if (m == NULL)
		return (control);

	sc = (struct sockcred *) CMSG_DATA(mtod(m, struct cmsghdr *));
	sc->sc_uid = td->td_ucred->cr_ruid;
	sc->sc_euid = td->td_ucred->cr_uid;
	sc->sc_gid = td->td_ucred->cr_rgid;
	sc->sc_egid = td->td_ucred->cr_gid;
	sc->sc_ngroups = ngroups;
	for (i = 0; i < sc->sc_ngroups; i++)
		sc->sc_groups[i] = td->td_ucred->cr_groups[i];

	/*
	 * Unlink SCM_CREDS control messages (struct cmsgcred), since just
	 * created SCM_CREDS control message (struct sockcred) has another
	 * format.
	 */
	if (control != NULL)
		for (n = control, n_prev = NULL; n != NULL;) {
			cm = mtod(n, struct cmsghdr *);
    			if (cm->cmsg_level == SOL_SOCKET &&
			    cm->cmsg_type == SCM_CREDS) {
    				if (n_prev == NULL)
					control = n->m_next;
				else
					n_prev->m_next = n->m_next;
				/* m_free() returns the next mbuf in chain. */
				n = m_free(n);
			} else {
				n_prev = n;
				n = n->m_next;
			}
		}

	/* Prepend it to the head. */
	m->m_next = control;

	return (m);
}
1850
1851static struct unpcb *
1852fptounp(struct file *fp)
1853{
1854	struct socket *so;
1855
1856	if (fp->f_type != DTYPE_SOCKET)
1857		return (NULL);
1858	if ((so = fp->f_data) == NULL)
1859		return (NULL);
1860	if (so->so_proto->pr_domain != &localdomain)
1861		return (NULL);
1862	return sotounpcb(so);
1863}
1864
/*
 * Drop one in-flight file reference: remove it from the rights
 * accounting and close the file.
 */
static void
unp_discard(struct file *fp)
{

	unp_externalize_fp(fp);
	(void) closef(fp, (struct thread *)NULL);
}
1872
/*
 * Account for a file reference entering flight in a control message:
 * take a hold on the file, bump the global rights count, and -- if the
 * file is itself a UNIX domain socket -- record the file pointer and
 * increment its in-flight message count for unp_gc().
 */
static void
unp_internalize_fp(struct file *fp)
{
	struct unpcb *unp;

	UNP_GLOBAL_WLOCK();
	if ((unp = fptounp(fp)) != NULL) {
		unp->unp_file = fp;
		unp->unp_msgcount++;
	}
	fhold(fp);
	unp_rights++;
	UNP_GLOBAL_WUNLOCK();
}
1887
/*
 * Account for a file reference leaving flight: decrement the global
 * rights count and, for a UNIX domain socket, its in-flight message
 * count.  The matching fdrop/closef happens in the caller.
 */
static void
unp_externalize_fp(struct file *fp)
{
	struct unpcb *unp;

	UNP_GLOBAL_WLOCK();
	if ((unp = fptounp(fp)) != NULL)
		unp->unp_msgcount--;
	unp_rights--;
	UNP_GLOBAL_WUNLOCK();
}
1899
1900/*
1901 * unp_defer indicates whether additional work has been defered for a future
1902 * pass through unp_gc().  It is thread local and does not require explicit
1903 * synchronization.
1904 */
1905static int	unp_marked;
1906static int	unp_unreachable;
1907
1908static void
1909unp_accessable(struct file *fp)
1910{
1911	struct unpcb *unp;
1912
1913	if ((unp = fptounp(fp)) == NULL)
1914		return;
1915	if (unp->unp_gcflag & UNPGC_REF)
1916		return;
1917	unp->unp_gcflag &= ~UNPGC_DEAD;
1918	unp->unp_gcflag |= UNPGC_REF;
1919	unp_marked++;
1920}
1921
/*
 * Examine one pcb during a mark pass of unp_gc().  A socket whose file
 * reference count is fully accounted for by in-flight rights (and which
 * no reachable socket has marked) is provisionally dead; otherwise its
 * receive buffer -- and those of any sockets pending on its accept
 * queue -- is scanned to mark every right it holds as reachable.
 */
static void
unp_gc_process(struct unpcb *unp)
{
	struct socket *soa;
	struct socket *so;
	struct file *fp;

	/* Already processed. */
	if (unp->unp_gcflag & UNPGC_SCANNED)
		return;
	fp = unp->unp_file;

	/*
	 * Check for a socket potentially in a cycle.  It must be in a
	 * queue as indicated by msgcount, and this must equal the file
	 * reference count.  Note that when msgcount is 0 the file is NULL.
	 */
	if ((unp->unp_gcflag & UNPGC_REF) == 0 && fp &&
	    unp->unp_msgcount != 0 && fp->f_count == unp->unp_msgcount) {
		unp->unp_gcflag |= UNPGC_DEAD;
		unp_unreachable++;
		return;
	}

	/*
	 * Mark all sockets we reference with RIGHTS.
	 */
	so = unp->unp_socket;
	SOCKBUF_LOCK(&so->so_rcv);
	unp_scan(so->so_rcv.sb_mb, unp_accessable);
	SOCKBUF_UNLOCK(&so->so_rcv);

	/*
	 * Mark all sockets in our accept queue.
	 */
	ACCEPT_LOCK();
	TAILQ_FOREACH(soa, &so->so_comp, so_list) {
		SOCKBUF_LOCK(&soa->so_rcv);
		unp_scan(soa->so_rcv.sb_mb, unp_accessable);
		SOCKBUF_UNLOCK(&soa->so_rcv);
	}
	ACCEPT_UNLOCK();
	unp->unp_gcflag |= UNPGC_SCANNED;
}
1966
/* Collector statistics, exported read-only under net.local. */
static int unp_recycled;
SYSCTL_INT(_net_local, OID_AUTO, recycled, CTLFLAG_RD, &unp_recycled, 0,
    "Number of unreachable sockets claimed by the garbage collector.");

static int unp_taskcount;
SYSCTL_INT(_net_local, OID_AUTO, taskcount, CTLFLAG_RD, &unp_taskcount, 0,
    "Number of times the garbage collector has run.");
1974
1975static void
1976unp_gc(__unused void *arg, int pending)
1977{
1978	struct unp_head *heads[] = { &unp_dhead, &unp_shead, NULL };
1979	struct unp_head **head;
1980	struct file **unref;
1981	struct unpcb *unp;
1982	int i;
1983
1984	unp_taskcount++;
1985	UNP_GLOBAL_RLOCK();
1986	/*
1987	 * First clear all gc flags from previous runs.
1988	 */
1989	for (head = heads; *head != NULL; head++)
1990		LIST_FOREACH(unp, *head, unp_link)
1991			unp->unp_gcflag = 0;
1992
1993	/*
1994	 * Scan marking all reachable sockets with UNPGC_REF.  Once a socket
1995	 * is reachable all of the sockets it references are reachable.
1996	 * Stop the scan once we do a complete loop without discovering
1997	 * a new reachable socket.
1998	 */
1999	do {
2000		unp_unreachable = 0;
2001		unp_marked = 0;
2002		for (head = heads; *head != NULL; head++)
2003			LIST_FOREACH(unp, *head, unp_link)
2004				unp_gc_process(unp);
2005	} while (unp_marked);
2006	UNP_GLOBAL_RUNLOCK();
2007	if (unp_unreachable == 0)
2008		return;
2009
2010	/*
2011	 * Allocate space for a local list of dead unpcbs.
2012	 */
2013	unref = malloc(unp_unreachable * sizeof(struct file *),
2014	    M_TEMP, M_WAITOK);
2015
2016	/*
2017	 * Iterate looking for sockets which have been specifically marked
2018	 * as as unreachable and store them locally.
2019	 */
2020	UNP_GLOBAL_RLOCK();
2021	for (i = 0, head = heads; *head != NULL; head++)
2022		LIST_FOREACH(unp, *head, unp_link)
2023			if (unp->unp_gcflag & UNPGC_DEAD) {
2024				unref[i++] = unp->unp_file;
2025				fhold(unp->unp_file);
2026				KASSERT(unp->unp_file != NULL,
2027				    ("unp_gc: Invalid unpcb."));
2028				KASSERT(i <= unp_unreachable,
2029				    ("unp_gc: incorrect unreachable count."));
2030			}
2031	UNP_GLOBAL_RUNLOCK();
2032
2033	/*
2034	 * Now flush all sockets, free'ing rights.  This will free the
2035	 * struct files associated with these sockets but leave each socket
2036	 * with one remaining ref.
2037	 */
2038	for (i = 0; i < unp_unreachable; i++)
2039		sorflush(unref[i]->f_data);
2040
2041	/*
2042	 * And finally release the sockets so they can be reclaimed.
2043	 */
2044	for (i = 0; i < unp_unreachable; i++)
2045		fdrop(unref[i], NULL);
2046	unp_recycled += unp_unreachable;
2047	free(unref, M_TEMP);
2048}
2049
/*
 * Dispose of rights in a socket buffer being torn down: discard every
 * in-flight file reference found in the mbuf chain.
 */
void
unp_dispose(struct mbuf *m)
{

	if (m)
		unp_scan(m, unp_discard);
}
2057
/*
 * Walk a chain of records (linked via m_act) and apply 'op' to every
 * struct file pointer found in SCM_RIGHTS control messages.
 *
 * NOTE(review): assumes the control mbufs were built in-kernel (by
 * unp_internalize()) and are well-formed; a cmsg_len below sizeof(*cm)
 * would underflow datalen here -- confirm no such chain can reach this
 * function.
 */
static void
unp_scan(struct mbuf *m0, void (*op)(struct file *))
{
	struct mbuf *m;
	struct file **rp;
	struct cmsghdr *cm;
	void *data;
	int i;
	socklen_t clen, datalen;
	int qfds;

	while (m0 != NULL) {
		for (m = m0; m; m = m->m_next) {
			if (m->m_type != MT_CONTROL)
				continue;

			cm = mtod(m, struct cmsghdr *);
			clen = m->m_len;

			while (cm != NULL) {
				if (sizeof(*cm) > clen || cm->cmsg_len > clen)
					break;

				data = CMSG_DATA(cm);
				datalen = (caddr_t)cm + cm->cmsg_len
				    - (caddr_t)data;

				if (cm->cmsg_level == SOL_SOCKET &&
				    cm->cmsg_type == SCM_RIGHTS) {
					qfds = datalen / sizeof (struct file *);
					rp = data;
					for (i = 0; i < qfds; i++)
						(*op)(*rp++);
				}

				if (CMSG_SPACE(datalen) < clen) {
					clen -= CMSG_SPACE(datalen);
					cm = (struct cmsghdr *)
					    ((caddr_t)cm + CMSG_SPACE(datalen));
				} else {
					clen = 0;
					cm = NULL;
				}
			}
		}
		m0 = m0->m_act;
	}
}
2106
2107#ifdef DDB
2108static void
2109db_print_indent(int indent)
2110{
2111	int i;
2112
2113	for (i = 0; i < indent; i++)
2114		db_printf(" ");
2115}
2116
2117static void
2118db_print_unpflags(int unp_flags)
2119{
2120	int comma;
2121
2122	comma = 0;
2123	if (unp_flags & UNP_HAVEPC) {
2124		db_printf("%sUNP_HAVEPC", comma ? ", " : "");
2125		comma = 1;
2126	}
2127	if (unp_flags & UNP_HAVEPCCACHED) {
2128		db_printf("%sUNP_HAVEPCCACHED", comma ? ", " : "");
2129		comma = 1;
2130	}
2131	if (unp_flags & UNP_WANTCRED) {
2132		db_printf("%sUNP_WANTCRED", comma ? ", " : "");
2133		comma = 1;
2134	}
2135	if (unp_flags & UNP_CONNWAIT) {
2136		db_printf("%sUNP_CONNWAIT", comma ? ", " : "");
2137		comma = 1;
2138	}
2139	if (unp_flags & UNP_CONNECTING) {
2140		db_printf("%sUNP_CONNECTING", comma ? ", " : "");
2141		comma = 1;
2142	}
2143	if (unp_flags & UNP_BINDING) {
2144		db_printf("%sUNP_BINDING", comma ? ", " : "");
2145		comma = 1;
2146	}
2147}
2148
/* Dump an exported credential (struct xucred) at the given indent. */
static void
db_print_xucred(int indent, struct xucred *xu)
{
	int comma, i;

	db_print_indent(indent);
	db_printf("cr_version: %u   cr_uid: %u   cr_ngroups: %d\n",
	    xu->cr_version, xu->cr_uid, xu->cr_ngroups);
	db_print_indent(indent);
	db_printf("cr_groups: ");
	comma = 0;
	for (i = 0; i < xu->cr_ngroups; i++) {
		db_printf("%s%u", comma ? ", " : "", xu->cr_groups[i]);
		comma = 1;
	}
	db_printf("\n");
}
2166
/*
 * Dump the pcb pointers on a datagram socket's reference list, four per
 * line, at the given indent.
 */
static void
db_print_unprefs(int indent, struct unp_head *uh)
{
	struct unpcb *unp;
	int counter;

	counter = 0;
	LIST_FOREACH(unp, uh, unp_reflink) {
		if (counter % 4 == 0)
			db_print_indent(indent);
		db_printf("%p  ", unp);
		if (counter % 4 == 3)
			db_printf("\n");
		counter++;
	}
	/* Terminate a partially-filled final line. */
	if (counter != 0 && counter % 4 != 0)
		db_printf("\n");
}
2185
2186DB_SHOW_COMMAND(unpcb, db_show_unpcb)
2187{
2188	struct unpcb *unp;
2189
2190        if (!have_addr) {
2191                db_printf("usage: show unpcb <addr>\n");
2192                return;
2193        }
2194        unp = (struct unpcb *)addr;
2195
2196	db_printf("unp_socket: %p   unp_vnode: %p\n", unp->unp_socket,
2197	    unp->unp_vnode);
2198
2199	db_printf("unp_ino: %d   unp_conn: %p\n", unp->unp_ino,
2200	    unp->unp_conn);
2201
2202	db_printf("unp_refs:\n");
2203	db_print_unprefs(2, &unp->unp_refs);
2204
2205	/* XXXRW: Would be nice to print the full address, if any. */
2206	db_printf("unp_addr: %p\n", unp->unp_addr);
2207
2208	db_printf("unp_cc: %d   unp_mbcnt: %d   unp_gencnt: %llu\n",
2209	    unp->unp_cc, unp->unp_mbcnt,
2210	    (unsigned long long)unp->unp_gencnt);
2211
2212	db_printf("unp_flags: %x (", unp->unp_flags);
2213	db_print_unpflags(unp->unp_flags);
2214	db_printf(")\n");
2215
2216	db_printf("unp_peercred:\n");
2217	db_print_xucred(2, &unp->unp_peercred);
2218
2219	db_printf("unp_refcount: %u\n", unp->unp_refcount);
2220}
2221#endif
2222