uipc_usrreq.c revision 193332
1/*-
2 * Copyright (c) 1982, 1986, 1989, 1991, 1993
3 *	The Regents of the University of California.
4 * Copyright (c) 2004-2009 Robert N. M. Watson
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 * 4. Neither the name of the University nor the names of its contributors
16 *    may be used to endorse or promote products derived from this software
17 *    without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE.
30 *
31 *	From: @(#)uipc_usrreq.c	8.3 (Berkeley) 1/4/94
32 */
33
34/*
35 * UNIX Domain (Local) Sockets
36 *
37 * This is an implementation of UNIX (local) domain sockets.  Each socket has
38 * an associated struct unpcb (UNIX protocol control block).  Stream sockets
39 * may be connected to 0 or 1 other socket.  Datagram sockets may be
40 * connected to 0, 1, or many other sockets.  Sockets may be created and
41 * connected in pairs (socketpair(2)), or bound/connected to using the file
42 * system name space.  For most purposes, only the receive socket buffer is
43 * used, as sending on one socket delivers directly to the receive socket
44 * buffer of a second socket.
45 *
46 * The implementation is substantially complicated by the fact that
47 * "ancillary data", such as file descriptors or credentials, may be passed
48 * across UNIX domain sockets.  The potential for passing UNIX domain sockets
49 * over other UNIX domain sockets requires the implementation of a simple
50 * garbage collector to find and tear down cycles of disconnected sockets.
51 *
52 * TODO:
53 *	SEQPACKET, RDM
54 *	rethink name space problems
55 *	need a proper out-of-band
56 */
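
/*
 * Illustrative userland sketch (not part of this revision): passing a file
 * descriptor over a socketpair(2) using SCM_RIGHTS ancillary data, the
 * facility whose kernel side (unp_internalize()/unp_externalize()) lives
 * below.  Error handling is omitted and "fd_to_send" is a placeholder.
 *
 *	#include <sys/types.h>
 *	#include <sys/socket.h>
 *	#include <sys/uio.h>
 *	#include <string.h>
 *
 *	int sv[2], fd_to_send;
 *	char byte = 0;
 *	union {
 *		struct cmsghdr hdr;
 *		char buf[CMSG_SPACE(sizeof(int))];
 *	} cmsgbuf;
 *	struct iovec iov = { .iov_base = &byte, .iov_len = 1 };
 *	struct msghdr msg;
 *	struct cmsghdr *cm;
 *
 *	(void)socketpair(PF_LOCAL, SOCK_STREAM, 0, sv);
 *	memset(&msg, 0, sizeof(msg));
 *	msg.msg_iov = &iov;
 *	msg.msg_iovlen = 1;
 *	msg.msg_control = cmsgbuf.buf;
 *	msg.msg_controllen = sizeof(cmsgbuf.buf);
 *	cm = CMSG_FIRSTHDR(&msg);
 *	cm->cmsg_level = SOL_SOCKET;
 *	cm->cmsg_type = SCM_RIGHTS;
 *	cm->cmsg_len = CMSG_LEN(sizeof(int));
 *	memcpy(CMSG_DATA(cm), &fd_to_send, sizeof(int));
 *	(void)sendmsg(sv[0], &msg, 0);
 */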
57
58#include <sys/cdefs.h>
59__FBSDID("$FreeBSD: head/sys/kern/uipc_usrreq.c 193332 2009-06-02 18:26:17Z rwatson $");
60
61#include "opt_ddb.h"
62#include "opt_mac.h"
63
64#include <sys/param.h>
65#include <sys/domain.h>
66#include <sys/fcntl.h>
67#include <sys/malloc.h>		/* XXX must be before <sys/file.h> */
68#include <sys/eventhandler.h>
69#include <sys/file.h>
70#include <sys/filedesc.h>
71#include <sys/jail.h>
72#include <sys/kernel.h>
73#include <sys/lock.h>
74#include <sys/mbuf.h>
75#include <sys/mount.h>
76#include <sys/mutex.h>
77#include <sys/namei.h>
78#include <sys/proc.h>
79#include <sys/protosw.h>
80#include <sys/resourcevar.h>
81#include <sys/rwlock.h>
82#include <sys/socket.h>
83#include <sys/socketvar.h>
84#include <sys/signalvar.h>
85#include <sys/stat.h>
86#include <sys/sx.h>
87#include <sys/sysctl.h>
88#include <sys/systm.h>
89#include <sys/taskqueue.h>
90#include <sys/un.h>
91#include <sys/unpcb.h>
92#include <sys/vnode.h>
93#include <sys/vimage.h>
94
95#ifdef DDB
96#include <ddb/ddb.h>
97#endif
98
99#include <security/mac/mac_framework.h>
100
101#include <vm/uma.h>
102
103/*
104 * Locking key:
105 * (l)	Locked using list lock
106 * (g)	Locked using linkage lock
107 */
108
109static uma_zone_t	unp_zone;
110static unp_gen_t	unp_gencnt;	/* (l) */
111static u_int		unp_count;	/* (l) Count of local sockets. */
112static ino_t		unp_ino;	/* Prototype for fake inode numbers. */
113static int		unp_rights;	/* (g) File descriptors in flight. */
114static struct unp_head	unp_shead;	/* (l) List of stream sockets. */
115static struct unp_head	unp_dhead;	/* (l) List of datagram sockets. */
116
117static const struct sockaddr	sun_noname = { sizeof(sun_noname), AF_LOCAL };
118
119/*
120 * Garbage collection of cyclic file descriptor/socket references occurs
121 * asynchronously in a taskqueue context in order to avoid recursion and
122 * reentrance in the UNIX domain socket, file descriptor, and socket layer
123 * code.  See unp_gc() for a full description.
124 */
125static struct task	unp_gc_task;
126
127/*
128 * Both send and receive buffers are allocated PIPSIZ bytes of buffering
129 * for stream sockets; since data is delivered directly into the peer's
130 * receive buffer, only PIPSIZ bytes are ever buffered between the two.
131 *
132 * Datagram sockets really use the sendspace as the maximum datagram size,
133 * and don't really want to reserve the sendspace.  Their recvspace should be
134 * large enough for at least one max-size datagram plus address.
135 */
136#ifndef PIPSIZ
137#define	PIPSIZ	8192
138#endif
139static u_long	unpst_sendspace = PIPSIZ;
140static u_long	unpst_recvspace = PIPSIZ;
141static u_long	unpdg_sendspace = 2*1024;	/* really max datagram size */
142static u_long	unpdg_recvspace = 4*1024;
143
144SYSCTL_NODE(_net, PF_LOCAL, local, CTLFLAG_RW, 0, "Local domain");
145SYSCTL_NODE(_net_local, SOCK_STREAM, stream, CTLFLAG_RW, 0, "SOCK_STREAM");
146SYSCTL_NODE(_net_local, SOCK_DGRAM, dgram, CTLFLAG_RW, 0, "SOCK_DGRAM");
147
148SYSCTL_ULONG(_net_local_stream, OID_AUTO, sendspace, CTLFLAG_RW,
149	   &unpst_sendspace, 0, "Default stream send space.");
150SYSCTL_ULONG(_net_local_stream, OID_AUTO, recvspace, CTLFLAG_RW,
151	   &unpst_recvspace, 0, "Default stream receive space.");
152SYSCTL_ULONG(_net_local_dgram, OID_AUTO, maxdgram, CTLFLAG_RW,
153	   &unpdg_sendspace, 0, "Default datagram send space.");
154SYSCTL_ULONG(_net_local_dgram, OID_AUTO, recvspace, CTLFLAG_RW,
155	   &unpdg_recvspace, 0, "Default datagram receive space.");
156SYSCTL_INT(_net_local, OID_AUTO, inflight, CTLFLAG_RD, &unp_rights, 0,
157    "File descriptors in flight.");
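
/*
 * The declarations above expose the defaults as sysctl(8) knobs such as
 * net.local.stream.sendspace.  A minimal userland sketch of reading one of
 * them with sysctlbyname(3) (illustrative only, not part of this revision):
 *
 *	#include <sys/types.h>
 *	#include <sys/sysctl.h>
 *	#include <stdio.h>
 *
 *	u_long space;
 *	size_t len = sizeof(space);
 *
 *	if (sysctlbyname("net.local.stream.sendspace", &space, &len,
 *	    NULL, 0) == 0)
 *		printf("default stream send space: %lu\n", space);
 */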
158
159/*-
160 * Locking and synchronization:
161 *
162 * Three types of locks exist in the local domain socket implementation: a
163 * global list mutex, a global linkage rwlock, and per-unpcb mutexes.  Of the
164 * global locks, the list lock protects the socket count, global generation
165 * number, and stream/datagram global lists.  The linkage lock protects the
166 * interconnection of unpcbs, the v_socket and unp_vnode pointers, and can be
167 * held exclusively over the acquisition of multiple unpcb locks to prevent
168 * deadlock.
169 *
170 * UNIX domain sockets each have an unpcb hung off of their so_pcb pointer,
171 * allocated in pru_attach() and freed in pru_detach().  The validity of that
172 * pointer is an invariant, so no lock is required to dereference the so_pcb
173 * pointer if a valid socket reference is held by the caller.  In practice,
174 * this is always true during operations performed on a socket.  Each unpcb
175 * has a back-pointer to its socket, unp_socket, which will be stable under
176 * the same circumstances.
177 *
178 * This pointer may only be safely dereferenced as long as a valid reference
179 * to the unpcb is held.  Typically, this reference will be from the socket,
180 * or from another unpcb when the referring unpcb's lock is held (in order
181 * that the reference not be invalidated during use).  For example, to follow
182 * unp->unp_conn->unp_socket, you need to hold the lock on unp, not unp_conn,
183 * as unp_socket remains valid as long as the reference to unp_conn is valid.
184 *
185 * Fields of unpcbs are locked using a per-unpcb lock, unp_mtx.  Individual
186 * atomic reads without the lock may be performed "lockless", but more
187 * complex reads and read-modify-writes require the mutex to be held.  No
188 * lock order is defined between unpcb locks -- multiple unpcb locks may be
189 * acquired at the same time only when holding the linkage rwlock
190 * exclusively, which prevents deadlocks.
191 *
192 * Blocking with UNIX domain sockets is a tricky issue: unlike most network
193 * protocols, bind() is a non-atomic operation, and connect() may need to
194 * sleep in the protocol while waiting on local or distributed file systems.
195 * We try to separate "lookup" operations, which
196 * may sleep, and the IPC operations themselves, which typically can occur
197 * with relative atomicity as locks can be held over the entire operation.
198 *
199 * Another tricky issue is simultaneous multi-threaded or multi-process
200 * access to a single UNIX domain socket.  This is handled by the flags
201 * UNP_CONNECTING and UNP_BINDING, which prevent concurrent connecting or
202 * binding, both of which involve dropping UNIX domain socket locks in order
203 * to perform namei() and other file system operations.
204 */
205static struct rwlock	unp_link_rwlock;
206static struct mtx	unp_list_lock;
207
208#define	UNP_LINK_LOCK_INIT()		rw_init(&unp_link_rwlock,	\
209					    "unp_link_rwlock")
210
211#define	UNP_LINK_LOCK_ASSERT()	rw_assert(&unp_link_rwlock,	\
212					    RA_LOCKED)
213#define	UNP_LINK_UNLOCK_ASSERT()	rw_assert(&unp_link_rwlock,	\
214					    RA_UNLOCKED)
215
216#define	UNP_LINK_RLOCK()		rw_rlock(&unp_link_rwlock)
217#define	UNP_LINK_RUNLOCK()		rw_runlock(&unp_link_rwlock)
218#define	UNP_LINK_WLOCK()		rw_wlock(&unp_link_rwlock)
219#define	UNP_LINK_WUNLOCK()		rw_wunlock(&unp_link_rwlock)
220#define	UNP_LINK_WLOCK_ASSERT()		rw_assert(&unp_link_rwlock,	\
221					    RA_WLOCKED)
222
223#define	UNP_LIST_LOCK_INIT()		mtx_init(&unp_list_lock,	\
224					    "unp_list_lock", NULL, MTX_DEF)
225#define	UNP_LIST_LOCK()			mtx_lock(&unp_list_lock)
226#define	UNP_LIST_UNLOCK()		mtx_unlock(&unp_list_lock)
227
228#define UNP_PCB_LOCK_INIT(unp)		mtx_init(&(unp)->unp_mtx,	\
229					    "unp_mtx", "unp_mtx",	\
230					    MTX_DUPOK|MTX_DEF|MTX_RECURSE)
231#define	UNP_PCB_LOCK_DESTROY(unp)	mtx_destroy(&(unp)->unp_mtx)
232#define	UNP_PCB_LOCK(unp)		mtx_lock(&(unp)->unp_mtx)
233#define	UNP_PCB_UNLOCK(unp)		mtx_unlock(&(unp)->unp_mtx)
234#define	UNP_PCB_LOCK_ASSERT(unp)	mtx_assert(&(unp)->unp_mtx, MA_OWNED)
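
/*
 * A sketch of the canonical pattern for operations that must lock two
 * unpcbs at once (see uipc_connect2() and uipc_close() below): the linkage
 * lock is taken exclusively first, which makes holding multiple pcb locks
 * safe:
 *
 *	UNP_LINK_WLOCK();
 *	UNP_PCB_LOCK(unp);
 *	UNP_PCB_LOCK(unp2);
 *	... operate on both pcbs ...
 *	UNP_PCB_UNLOCK(unp2);
 *	UNP_PCB_UNLOCK(unp);
 *	UNP_LINK_WUNLOCK();
 */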
235
236static int	uipc_connect2(struct socket *, struct socket *);
237static int	uipc_ctloutput(struct socket *, struct sockopt *);
238static int	unp_connect(struct socket *, struct sockaddr *,
239		    struct thread *);
240static int	unp_connect2(struct socket *so, struct socket *so2, int);
241static void	unp_disconnect(struct unpcb *unp, struct unpcb *unp2);
242static void	unp_dispose(struct mbuf *);
243static void	unp_shutdown(struct unpcb *);
244static void	unp_drop(struct unpcb *, int);
245static void	unp_gc(__unused void *, int);
246static void	unp_scan(struct mbuf *, void (*)(struct file *));
247static void	unp_discard(struct file *);
248static void	unp_freerights(struct file **, int);
249static void	unp_init(void);
250static int	unp_internalize(struct mbuf **, struct thread *);
251static void	unp_internalize_fp(struct file *);
252static int	unp_externalize(struct mbuf *, struct mbuf **);
253static void	unp_externalize_fp(struct file *);
254static struct mbuf	*unp_addsockcred(struct thread *, struct mbuf *);
255
256/*
257 * Definitions of protocols supported in the LOCAL domain.
258 */
259static struct domain localdomain;
260static struct pr_usrreqs uipc_usrreqs_dgram, uipc_usrreqs_stream;
261static struct protosw localsw[] = {
262{
263	.pr_type =		SOCK_STREAM,
264	.pr_domain =		&localdomain,
265	.pr_flags =		PR_CONNREQUIRED|PR_WANTRCVD|PR_RIGHTS,
266	.pr_ctloutput =		&uipc_ctloutput,
267	.pr_usrreqs =		&uipc_usrreqs_stream
268},
269{
270	.pr_type =		SOCK_DGRAM,
271	.pr_domain =		&localdomain,
272	.pr_flags =		PR_ATOMIC|PR_ADDR|PR_RIGHTS,
273	.pr_usrreqs =		&uipc_usrreqs_dgram
274},
275};
276
277static struct domain localdomain = {
278	.dom_family =		AF_LOCAL,
279	.dom_name =		"local",
280	.dom_init =		unp_init,
281	.dom_externalize =	unp_externalize,
282	.dom_dispose =		unp_dispose,
283	.dom_protosw =		localsw,
284	.dom_protoswNPROTOSW =	&localsw[sizeof(localsw)/sizeof(localsw[0])]
285};
286DOMAIN_SET(local);
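
/*
 * The two protosw entries above are selected by the type argument of
 * socket(2); both use protocol 0.  For example (userland, illustrative
 * only):
 *
 *	int stream_fd = socket(PF_LOCAL, SOCK_STREAM, 0);
 *	int dgram_fd = socket(PF_LOCAL, SOCK_DGRAM, 0);
 */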
287
288static void
289uipc_abort(struct socket *so)
290{
291	struct unpcb *unp, *unp2;
292
293	unp = sotounpcb(so);
294	KASSERT(unp != NULL, ("uipc_abort: unp == NULL"));
295
296	UNP_LINK_WLOCK();
297	UNP_PCB_LOCK(unp);
298	unp2 = unp->unp_conn;
299	if (unp2 != NULL) {
300		UNP_PCB_LOCK(unp2);
301		unp_drop(unp2, ECONNABORTED);
302		UNP_PCB_UNLOCK(unp2);
303	}
304	UNP_PCB_UNLOCK(unp);
305	UNP_LINK_WUNLOCK();
306}
307
308static int
309uipc_accept(struct socket *so, struct sockaddr **nam)
310{
311	struct unpcb *unp, *unp2;
312	const struct sockaddr *sa;
313
314	/*
315	 * Pass back name of connected socket, if it was bound and we are
316	 * still connected (our peer may have closed already!).
317	 */
318	unp = sotounpcb(so);
319	KASSERT(unp != NULL, ("uipc_accept: unp == NULL"));
320
321	*nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
322	UNP_LINK_RLOCK();
323	unp2 = unp->unp_conn;
324	if (unp2 != NULL && unp2->unp_addr != NULL) {
325		UNP_PCB_LOCK(unp2);
326		sa = (struct sockaddr *) unp2->unp_addr;
327		bcopy(sa, *nam, sa->sa_len);
328		UNP_PCB_UNLOCK(unp2);
329	} else {
330		sa = &sun_noname;
331		bcopy(sa, *nam, sa->sa_len);
332	}
333	UNP_LINK_RUNLOCK();
334	return (0);
335}
336
337static int
338uipc_attach(struct socket *so, int proto, struct thread *td)
339{
340	u_long sendspace, recvspace;
341	struct unpcb *unp;
342	int error;
343
344	KASSERT(so->so_pcb == NULL, ("uipc_attach: so_pcb != NULL"));
345	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
346		switch (so->so_type) {
347		case SOCK_STREAM:
348			sendspace = unpst_sendspace;
349			recvspace = unpst_recvspace;
350			break;
351
352		case SOCK_DGRAM:
353			sendspace = unpdg_sendspace;
354			recvspace = unpdg_recvspace;
355			break;
356
357		default:
358			panic("uipc_attach");
359		}
360		error = soreserve(so, sendspace, recvspace);
361		if (error)
362			return (error);
363	}
364	unp = uma_zalloc(unp_zone, M_NOWAIT | M_ZERO);
365	if (unp == NULL)
366		return (ENOBUFS);
367	LIST_INIT(&unp->unp_refs);
368	UNP_PCB_LOCK_INIT(unp);
369	unp->unp_socket = so;
370	so->so_pcb = unp;
371	unp->unp_refcount = 1;
372
373	UNP_LIST_LOCK();
374	unp->unp_gencnt = ++unp_gencnt;
375	unp_count++;
376	LIST_INSERT_HEAD(so->so_type == SOCK_DGRAM ? &unp_dhead : &unp_shead,
377	    unp, unp_link);
378	UNP_LIST_UNLOCK();
379
380	return (0);
381}
382
383static int
384uipc_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
385{
386	struct sockaddr_un *soun = (struct sockaddr_un *)nam;
387	struct vattr vattr;
388	int error, namelen, vfslocked;
389	struct nameidata nd;
390	struct unpcb *unp;
391	struct vnode *vp;
392	struct mount *mp;
393	char *buf;
394
395	unp = sotounpcb(so);
396	KASSERT(unp != NULL, ("uipc_bind: unp == NULL"));
397
398	namelen = soun->sun_len - offsetof(struct sockaddr_un, sun_path);
399	if (namelen <= 0)
400		return (EINVAL);
401
402	/*
403	 * We don't allow simultaneous bind() calls on a single UNIX domain
404	 * socket, so flag in-progress operations, and return an error if an
405	 * operation is already in progress.
406	 *
407	 * Historically, we have not allowed a socket to be rebound, so this
408	 * also returns an error.  Not allowing re-binding simplifies the
409	 * implementation and avoids a great many possible failure modes.
410	 */
411	UNP_PCB_LOCK(unp);
412	if (unp->unp_vnode != NULL) {
413		UNP_PCB_UNLOCK(unp);
414		return (EINVAL);
415	}
416	if (unp->unp_flags & UNP_BINDING) {
417		UNP_PCB_UNLOCK(unp);
418		return (EALREADY);
419	}
420	unp->unp_flags |= UNP_BINDING;
421	UNP_PCB_UNLOCK(unp);
422
423	buf = malloc(namelen + 1, M_TEMP, M_WAITOK);
424	bcopy(soun->sun_path, buf, namelen);
425	buf[namelen] = 0;
426
427restart:
428	vfslocked = 0;
429	NDINIT(&nd, CREATE, MPSAFE | NOFOLLOW | LOCKPARENT | SAVENAME,
430	    UIO_SYSSPACE, buf, td);
431/* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */
432	error = namei(&nd);
433	if (error)
434		goto error;
435	vp = nd.ni_vp;
436	vfslocked = NDHASGIANT(&nd);
437	if (vp != NULL || vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
438		NDFREE(&nd, NDF_ONLY_PNBUF);
439		if (nd.ni_dvp == vp)
440			vrele(nd.ni_dvp);
441		else
442			vput(nd.ni_dvp);
443		if (vp != NULL) {
444			vrele(vp);
445			error = EADDRINUSE;
446			goto error;
447		}
448		error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH);
449		if (error)
450			goto error;
451		VFS_UNLOCK_GIANT(vfslocked);
452		goto restart;
453	}
454	VATTR_NULL(&vattr);
455	vattr.va_type = VSOCK;
456	vattr.va_mode = (ACCESSPERMS & ~td->td_proc->p_fd->fd_cmask);
457#ifdef MAC
458	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
459	    &vattr);
460#endif
461	if (error == 0)
462		error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
463	NDFREE(&nd, NDF_ONLY_PNBUF);
464	vput(nd.ni_dvp);
465	if (error) {
466		vn_finished_write(mp);
467		goto error;
468	}
469	vp = nd.ni_vp;
470	ASSERT_VOP_ELOCKED(vp, "uipc_bind");
471	soun = (struct sockaddr_un *)sodupsockaddr(nam, M_WAITOK);
472
473	UNP_LINK_WLOCK();
474	UNP_PCB_LOCK(unp);
475	vp->v_socket = unp->unp_socket;
476	unp->unp_vnode = vp;
477	unp->unp_addr = soun;
478	unp->unp_flags &= ~UNP_BINDING;
479	UNP_PCB_UNLOCK(unp);
480	UNP_LINK_WUNLOCK();
481	VOP_UNLOCK(vp, 0);
482	vn_finished_write(mp);
483	VFS_UNLOCK_GIANT(vfslocked);
484	free(buf, M_TEMP);
485	return (0);
486
487error:
488	VFS_UNLOCK_GIANT(vfslocked);
489	UNP_PCB_LOCK(unp);
490	unp->unp_flags &= ~UNP_BINDING;
491	UNP_PCB_UNLOCK(unp);
492	free(buf, M_TEMP);
493	return (error);
494}
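
/*
 * Illustrative userland counterpart of uipc_bind() (not part of this
 * revision): binding a local socket to a path.  As noted above, the path
 * must not already exist (EADDRINUSE) and a socket may only be bound once
 * (EINVAL).  "s" is assumed to be a freshly created PF_LOCAL socket and
 * the path is a placeholder.
 *
 *	#include <sys/socket.h>
 *	#include <sys/un.h>
 *	#include <err.h>
 *	#include <string.h>
 *
 *	struct sockaddr_un sun;
 *
 *	memset(&sun, 0, sizeof(sun));
 *	sun.sun_family = AF_LOCAL;
 *	strlcpy(sun.sun_path, "/tmp/example.sock", sizeof(sun.sun_path));
 *	sun.sun_len = SUN_LEN(&sun);
 *	if (bind(s, (struct sockaddr *)&sun, SUN_LEN(&sun)) == -1)
 *		err(1, "bind");
 */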
495
496static int
497uipc_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
498{
499	int error;
500
501	KASSERT(td == curthread, ("uipc_connect: td != curthread"));
502	UNP_LINK_WLOCK();
503	error = unp_connect(so, nam, td);
504	UNP_LINK_WUNLOCK();
505	return (error);
506}
507
508static void
509uipc_close(struct socket *so)
510{
511	struct unpcb *unp, *unp2;
512
513	unp = sotounpcb(so);
514	KASSERT(unp != NULL, ("uipc_close: unp == NULL"));
515
516	UNP_LINK_WLOCK();
517	UNP_PCB_LOCK(unp);
518	unp2 = unp->unp_conn;
519	if (unp2 != NULL) {
520		UNP_PCB_LOCK(unp2);
521		unp_disconnect(unp, unp2);
522		UNP_PCB_UNLOCK(unp2);
523	}
524	UNP_PCB_UNLOCK(unp);
525	UNP_LINK_WUNLOCK();
526}
527
528static int
529uipc_connect2(struct socket *so1, struct socket *so2)
530{
531	struct unpcb *unp, *unp2;
532	int error;
533
534	UNP_LINK_WLOCK();
535	unp = so1->so_pcb;
536	KASSERT(unp != NULL, ("uipc_connect2: unp == NULL"));
537	UNP_PCB_LOCK(unp);
538	unp2 = so2->so_pcb;
539	KASSERT(unp2 != NULL, ("uipc_connect2: unp2 == NULL"));
540	UNP_PCB_LOCK(unp2);
541	error = unp_connect2(so1, so2, PRU_CONNECT2);
542	UNP_PCB_UNLOCK(unp2);
543	UNP_PCB_UNLOCK(unp);
544	UNP_LINK_WUNLOCK();
545	return (error);
546}
547
548static void
549uipc_detach(struct socket *so)
550{
551	struct unpcb *unp, *unp2;
552	struct sockaddr_un *saved_unp_addr;
553	struct vnode *vp;
554	int freeunp, local_unp_rights;
555
556	unp = sotounpcb(so);
557	KASSERT(unp != NULL, ("uipc_detach: unp == NULL"));
558
559	UNP_LINK_WLOCK();
560	UNP_LIST_LOCK();
561	UNP_PCB_LOCK(unp);
562	LIST_REMOVE(unp, unp_link);
563	unp->unp_gencnt = ++unp_gencnt;
564	--unp_count;
565	UNP_LIST_UNLOCK();
566
567	/*
568	 * XXXRW: Should assert vp->v_socket == so.
569	 */
570	if ((vp = unp->unp_vnode) != NULL) {
571		unp->unp_vnode->v_socket = NULL;
572		unp->unp_vnode = NULL;
573	}
574	unp2 = unp->unp_conn;
575	if (unp2 != NULL) {
576		UNP_PCB_LOCK(unp2);
577		unp_disconnect(unp, unp2);
578		UNP_PCB_UNLOCK(unp2);
579	}
580
581	/*
582	 * We hold the linkage lock exclusively, so it's OK to acquire
583	 * multiple pcb locks at a time.
584	 */
585	while (!LIST_EMPTY(&unp->unp_refs)) {
586		struct unpcb *ref = LIST_FIRST(&unp->unp_refs);
587
588		UNP_PCB_LOCK(ref);
589		unp_drop(ref, ECONNRESET);
590		UNP_PCB_UNLOCK(ref);
591	}
592	local_unp_rights = unp_rights;
593	UNP_LINK_WUNLOCK();
594	unp->unp_socket->so_pcb = NULL;
595	saved_unp_addr = unp->unp_addr;
596	unp->unp_addr = NULL;
597	unp->unp_refcount--;
598	freeunp = (unp->unp_refcount == 0);
599	if (saved_unp_addr != NULL)
600		free(saved_unp_addr, M_SONAME);
601	if (freeunp) {
602		UNP_PCB_LOCK_DESTROY(unp);
603		uma_zfree(unp_zone, unp);
604	} else
605		UNP_PCB_UNLOCK(unp);
606	if (vp) {
607		int vfslocked;
608
609		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
610		vrele(vp);
611		VFS_UNLOCK_GIANT(vfslocked);
612	}
613	if (local_unp_rights)
614		taskqueue_enqueue(taskqueue_thread, &unp_gc_task);
615}
616
617static int
618uipc_disconnect(struct socket *so)
619{
620	struct unpcb *unp, *unp2;
621
622	unp = sotounpcb(so);
623	KASSERT(unp != NULL, ("uipc_disconnect: unp == NULL"));
624
625	UNP_LINK_WLOCK();
626	UNP_PCB_LOCK(unp);
627	unp2 = unp->unp_conn;
628	if (unp2 != NULL) {
629		UNP_PCB_LOCK(unp2);
630		unp_disconnect(unp, unp2);
631		UNP_PCB_UNLOCK(unp2);
632	}
633	UNP_PCB_UNLOCK(unp);
634	UNP_LINK_WUNLOCK();
635	return (0);
636}
637
638static int
639uipc_listen(struct socket *so, int backlog, struct thread *td)
640{
641	struct unpcb *unp;
642	int error;
643
644	unp = sotounpcb(so);
645	KASSERT(unp != NULL, ("uipc_listen: unp == NULL"));
646
647	UNP_PCB_LOCK(unp);
648	if (unp->unp_vnode == NULL) {
649		UNP_PCB_UNLOCK(unp);
650		return (EINVAL);
651	}
652
653	SOCK_LOCK(so);
654	error = solisten_proto_check(so);
655	if (error == 0) {
656		cru2x(td->td_ucred, &unp->unp_peercred);
657		unp->unp_flags |= UNP_HAVEPCCACHED;
658		solisten_proto(so, backlog);
659	}
660	SOCK_UNLOCK(so);
661	UNP_PCB_UNLOCK(unp);
662	return (error);
663}
664
665static int
666uipc_peeraddr(struct socket *so, struct sockaddr **nam)
667{
668	struct unpcb *unp, *unp2;
669	const struct sockaddr *sa;
670
671	unp = sotounpcb(so);
672	KASSERT(unp != NULL, ("uipc_peeraddr: unp == NULL"));
673
674	*nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
675	UNP_PCB_LOCK(unp);
676	/*
677	 * XXX: It seems that this test always fails even when the connection
678	 * is established, so the else clause below was added as a workaround
679	 * to return a PF_LOCAL sockaddr.
680	 */
681	unp2 = unp->unp_conn;
682	if (unp2 != NULL) {
683		UNP_PCB_LOCK(unp2);
684		if (unp2->unp_addr != NULL)
685			sa = (struct sockaddr *) unp->unp_conn->unp_addr;
686		else
687			sa = &sun_noname;
688		bcopy(sa, *nam, sa->sa_len);
689		UNP_PCB_UNLOCK(unp2);
690	} else {
691		sa = &sun_noname;
692		bcopy(sa, *nam, sa->sa_len);
693	}
694	UNP_PCB_UNLOCK(unp);
695	return (0);
696}
697
698static int
699uipc_rcvd(struct socket *so, int flags)
700{
701	struct unpcb *unp, *unp2;
702	struct socket *so2;
703	u_int mbcnt, sbcc;
704	u_long newhiwat;
705
706	unp = sotounpcb(so);
707	KASSERT(unp != NULL, ("uipc_rcvd: unp == NULL"));
708
709	if (so->so_type == SOCK_DGRAM)
710		panic("uipc_rcvd DGRAM?");
711
712	if (so->so_type != SOCK_STREAM)
713		panic("uipc_rcvd unknown socktype");
714
715	/*
716	 * Adjust backpressure on the sender and wake up anyone waiting to write.
717	 *
718	 * The unp lock is acquired to maintain the validity of the unp_conn
719	 * pointer; no lock on unp2 is required as unp2->unp_socket will be
720	 * static as long as we don't permit unp2 to disconnect from unp,
721	 * which is prevented by the lock on unp.  We cache values from
722	 * so_rcv to avoid holding the so_rcv lock over the entire
723	 * transaction on the remote so_snd.
724	 */
725	SOCKBUF_LOCK(&so->so_rcv);
726	mbcnt = so->so_rcv.sb_mbcnt;
727	sbcc = so->so_rcv.sb_cc;
728	SOCKBUF_UNLOCK(&so->so_rcv);
729	UNP_PCB_LOCK(unp);
730	unp2 = unp->unp_conn;
731	if (unp2 == NULL) {
732		UNP_PCB_UNLOCK(unp);
733		return (0);
734	}
735	so2 = unp2->unp_socket;
736	SOCKBUF_LOCK(&so2->so_snd);
737	so2->so_snd.sb_mbmax += unp->unp_mbcnt - mbcnt;
738	newhiwat = so2->so_snd.sb_hiwat + unp->unp_cc - sbcc;
739	(void)chgsbsize(so2->so_cred->cr_uidinfo, &so2->so_snd.sb_hiwat,
740	    newhiwat, RLIM_INFINITY);
741	sowwakeup_locked(so2);
742	unp->unp_mbcnt = mbcnt;
743	unp->unp_cc = sbcc;
744	UNP_PCB_UNLOCK(unp);
745	return (0);
746}
747
748static int
749uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
750    struct mbuf *control, struct thread *td)
751{
752	struct unpcb *unp, *unp2;
753	struct socket *so2;
754	u_int mbcnt_delta, sbcc;
755	u_long newhiwat;
756	int error = 0;
757
758	unp = sotounpcb(so);
759	KASSERT(unp != NULL, ("uipc_send: unp == NULL"));
760
761	if (flags & PRUS_OOB) {
762		error = EOPNOTSUPP;
763		goto release;
764	}
765	if (control != NULL && (error = unp_internalize(&control, td)))
766		goto release;
767	if ((nam != NULL) || (flags & PRUS_EOF))
768		UNP_LINK_WLOCK();
769	else
770		UNP_LINK_RLOCK();
771	switch (so->so_type) {
772	case SOCK_DGRAM:
773	{
774		const struct sockaddr *from;
775
776		unp2 = unp->unp_conn;
777		if (nam != NULL) {
778			UNP_LINK_WLOCK_ASSERT();
779			if (unp2 != NULL) {
780				error = EISCONN;
781				break;
782			}
783			error = unp_connect(so, nam, td);
784			if (error)
785				break;
786			unp2 = unp->unp_conn;
787		}
788
789		/*
790		 * Because connect() and send() are non-atomic in a sendto()
791		 * with a target address, it's possible that the socket will
792		 * have disconnected before the send() can run.  In that case
793		 * return the slightly counter-intuitive but otherwise
794		 * correct error that the socket is not connected.
795		 */
796		if (unp2 == NULL) {
797			error = ENOTCONN;
798			break;
799		}
800		/* Lockless read. */
801		if (unp2->unp_flags & UNP_WANTCRED)
802			control = unp_addsockcred(td, control);
803		UNP_PCB_LOCK(unp);
804		if (unp->unp_addr != NULL)
805			from = (struct sockaddr *)unp->unp_addr;
806		else
807			from = &sun_noname;
808		so2 = unp2->unp_socket;
809		SOCKBUF_LOCK(&so2->so_rcv);
810		if (sbappendaddr_locked(&so2->so_rcv, from, m, control)) {
811			sorwakeup_locked(so2);
812			m = NULL;
813			control = NULL;
814		} else {
815			SOCKBUF_UNLOCK(&so2->so_rcv);
816			error = ENOBUFS;
817		}
818		if (nam != NULL) {
819			UNP_LINK_WLOCK_ASSERT();
820			UNP_PCB_LOCK(unp2);
821			unp_disconnect(unp, unp2);
822			UNP_PCB_UNLOCK(unp2);
823		}
824		UNP_PCB_UNLOCK(unp);
825		break;
826	}
827
828	case SOCK_STREAM:
829		if ((so->so_state & SS_ISCONNECTED) == 0) {
830			if (nam != NULL) {
831				UNP_LINK_WLOCK_ASSERT();
832				error = unp_connect(so, nam, td);
833				if (error)
834					break;	/* XXX */
835			} else {
836				error = ENOTCONN;
837				break;
838			}
839		}
840
841		/* Lockless read. */
842		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
843			error = EPIPE;
844			break;
845		}
846
847		/*
848		 * Because connect() and send() are non-atomic in a sendto()
849		 * with a target address, it's possible that the socket will
850		 * have disconnected before the send() can run.  In that case
851		 * return the slightly counter-intuitive but otherwise
852		 * correct error that the socket is not connected.
853		 *
854		 * Locking here must be done carefully: the linkage lock
855		 * prevents interconnections between unpcbs from changing, so
856		 * we can traverse from unp to unp2 without acquiring unp's
857		 * lock.  Socket buffer locks follow unpcb locks, so we can
858		 * acquire both remote and local socket buffer locks.
859		 */
860		unp2 = unp->unp_conn;
861		if (unp2 == NULL) {
862			error = ENOTCONN;
863			break;
864		}
865		so2 = unp2->unp_socket;
866		UNP_PCB_LOCK(unp2);
867		SOCKBUF_LOCK(&so2->so_rcv);
868		if (unp2->unp_flags & UNP_WANTCRED) {
869			/*
870			 * Credentials are passed only once on SOCK_STREAM.
871			 */
872			unp2->unp_flags &= ~UNP_WANTCRED;
873			control = unp_addsockcred(td, control);
874		}
875		/*
876		 * Send to paired receive port, and then reduce send buffer
877		 * hiwater marks to maintain backpressure.  Wake up readers.
878		 */
879		if (control != NULL) {
880			if (sbappendcontrol_locked(&so2->so_rcv, m, control))
881				control = NULL;
882		} else
883			sbappend_locked(&so2->so_rcv, m);
884		mbcnt_delta = so2->so_rcv.sb_mbcnt - unp2->unp_mbcnt;
885		unp2->unp_mbcnt = so2->so_rcv.sb_mbcnt;
886		sbcc = so2->so_rcv.sb_cc;
887		sorwakeup_locked(so2);
888
889		SOCKBUF_LOCK(&so->so_snd);
890		newhiwat = so->so_snd.sb_hiwat - (sbcc - unp2->unp_cc);
891		(void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_snd.sb_hiwat,
892		    newhiwat, RLIM_INFINITY);
893		so->so_snd.sb_mbmax -= mbcnt_delta;
894		SOCKBUF_UNLOCK(&so->so_snd);
895		unp2->unp_cc = sbcc;
896		UNP_PCB_UNLOCK(unp2);
897		m = NULL;
898		break;
899
900	default:
901		panic("uipc_send unknown socktype");
902	}
903
904	/*
905	 * PRUS_EOF is equivalent to pru_send followed by pru_shutdown.
906	 */
907	if (flags & PRUS_EOF) {
908		UNP_PCB_LOCK(unp);
909		socantsendmore(so);
910		unp_shutdown(unp);
911		UNP_PCB_UNLOCK(unp);
912	}
913
914	if ((nam != NULL) || (flags & PRUS_EOF))
915		UNP_LINK_WUNLOCK();
916	else
917		UNP_LINK_RUNLOCK();
918
919	if (control != NULL && error != 0)
920		unp_dispose(control);
921
922release:
923	if (control != NULL)
924		m_freem(control);
925	if (m != NULL)
926		m_freem(m);
927	return (error);
928}
929
930static int
931uipc_sense(struct socket *so, struct stat *sb)
932{
933	struct unpcb *unp, *unp2;
934	struct socket *so2;
935
936	unp = sotounpcb(so);
937	KASSERT(unp != NULL, ("uipc_sense: unp == NULL"));
938
939	sb->st_blksize = so->so_snd.sb_hiwat;
940	UNP_LINK_RLOCK();
941	UNP_PCB_LOCK(unp);
942	unp2 = unp->unp_conn;
943	if (so->so_type == SOCK_STREAM && unp2 != NULL) {
944		so2 = unp2->unp_socket;
945		sb->st_blksize += so2->so_rcv.sb_cc;
946	}
947	sb->st_dev = NODEV;
948	if (unp->unp_ino == 0)
949		unp->unp_ino = (++unp_ino == 0) ? ++unp_ino : unp_ino;
950	sb->st_ino = unp->unp_ino;
951	UNP_PCB_UNLOCK(unp);
952	UNP_LINK_RUNLOCK();
953	return (0);
954}
955
956static int
957uipc_shutdown(struct socket *so)
958{
959	struct unpcb *unp;
960
961	unp = sotounpcb(so);
962	KASSERT(unp != NULL, ("uipc_shutdown: unp == NULL"));
963
964	UNP_LINK_WLOCK();
965	UNP_PCB_LOCK(unp);
966	socantsendmore(so);
967	unp_shutdown(unp);
968	UNP_PCB_UNLOCK(unp);
969	UNP_LINK_WUNLOCK();
970	return (0);
971}
972
973static int
974uipc_sockaddr(struct socket *so, struct sockaddr **nam)
975{
976	struct unpcb *unp;
977	const struct sockaddr *sa;
978
979	unp = sotounpcb(so);
980	KASSERT(unp != NULL, ("uipc_sockaddr: unp == NULL"));
981
982	*nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
983	UNP_PCB_LOCK(unp);
984	if (unp->unp_addr != NULL)
985		sa = (struct sockaddr *) unp->unp_addr;
986	else
987		sa = &sun_noname;
988	bcopy(sa, *nam, sa->sa_len);
989	UNP_PCB_UNLOCK(unp);
990	return (0);
991}
992
993static struct pr_usrreqs uipc_usrreqs_dgram = {
994	.pru_abort = 		uipc_abort,
995	.pru_accept =		uipc_accept,
996	.pru_attach =		uipc_attach,
997	.pru_bind =		uipc_bind,
998	.pru_connect =		uipc_connect,
999	.pru_connect2 =		uipc_connect2,
1000	.pru_detach =		uipc_detach,
1001	.pru_disconnect =	uipc_disconnect,
1002	.pru_listen =		uipc_listen,
1003	.pru_peeraddr =		uipc_peeraddr,
1004	.pru_rcvd =		uipc_rcvd,
1005	.pru_send =		uipc_send,
1006	.pru_sense =		uipc_sense,
1007	.pru_shutdown =		uipc_shutdown,
1008	.pru_sockaddr =		uipc_sockaddr,
1009	.pru_soreceive =	soreceive_dgram,
1010	.pru_close =		uipc_close,
1011};
1012
1013static struct pr_usrreqs uipc_usrreqs_stream = {
1014	.pru_abort = 		uipc_abort,
1015	.pru_accept =		uipc_accept,
1016	.pru_attach =		uipc_attach,
1017	.pru_bind =		uipc_bind,
1018	.pru_connect =		uipc_connect,
1019	.pru_connect2 =		uipc_connect2,
1020	.pru_detach =		uipc_detach,
1021	.pru_disconnect =	uipc_disconnect,
1022	.pru_listen =		uipc_listen,
1023	.pru_peeraddr =		uipc_peeraddr,
1024	.pru_rcvd =		uipc_rcvd,
1025	.pru_send =		uipc_send,
1026	.pru_sense =		uipc_sense,
1027	.pru_shutdown =		uipc_shutdown,
1028	.pru_sockaddr =		uipc_sockaddr,
1029	.pru_soreceive =	soreceive_generic,
1030	.pru_close =		uipc_close,
1031};
1032
1033static int
1034uipc_ctloutput(struct socket *so, struct sockopt *sopt)
1035{
1036	struct unpcb *unp;
1037	struct xucred xu;
1038	int error, optval;
1039
1040	if (sopt->sopt_level != 0)
1041		return (EINVAL);
1042
1043	unp = sotounpcb(so);
1044	KASSERT(unp != NULL, ("uipc_ctloutput: unp == NULL"));
1045	error = 0;
1046	switch (sopt->sopt_dir) {
1047	case SOPT_GET:
1048		switch (sopt->sopt_name) {
1049		case LOCAL_PEERCRED:
1050			UNP_PCB_LOCK(unp);
1051			if (unp->unp_flags & UNP_HAVEPC)
1052				xu = unp->unp_peercred;
1053			else {
1054				if (so->so_type == SOCK_STREAM)
1055					error = ENOTCONN;
1056				else
1057					error = EINVAL;
1058			}
1059			UNP_PCB_UNLOCK(unp);
1060			if (error == 0)
1061				error = sooptcopyout(sopt, &xu, sizeof(xu));
1062			break;
1063
1064		case LOCAL_CREDS:
1065			/* Unlocked read. */
1066			optval = unp->unp_flags & UNP_WANTCRED ? 1 : 0;
1067			error = sooptcopyout(sopt, &optval, sizeof(optval));
1068			break;
1069
1070		case LOCAL_CONNWAIT:
1071			/* Unlocked read. */
1072			optval = unp->unp_flags & UNP_CONNWAIT ? 1 : 0;
1073			error = sooptcopyout(sopt, &optval, sizeof(optval));
1074			break;
1075
1076		default:
1077			error = EOPNOTSUPP;
1078			break;
1079		}
1080		break;
1081
1082	case SOPT_SET:
1083		switch (sopt->sopt_name) {
1084		case LOCAL_CREDS:
1085		case LOCAL_CONNWAIT:
1086			error = sooptcopyin(sopt, &optval, sizeof(optval),
1087					    sizeof(optval));
1088			if (error)
1089				break;
1090
1091#define	OPTSET(bit) do {						\
1092	UNP_PCB_LOCK(unp);						\
1093	if (optval)							\
1094		unp->unp_flags |= bit;					\
1095	else								\
1096		unp->unp_flags &= ~bit;					\
1097	UNP_PCB_UNLOCK(unp);						\
1098} while (0)
1099
1100			switch (sopt->sopt_name) {
1101			case LOCAL_CREDS:
1102				OPTSET(UNP_WANTCRED);
1103				break;
1104
1105			case LOCAL_CONNWAIT:
1106				OPTSET(UNP_CONNWAIT);
1107				break;
1108
1109			default:
1110				break;
1111			}
1112			break;
1113#undef	OPTSET
1114		default:
1115			error = ENOPROTOOPT;
1116			break;
1117		}
1118		break;
1119
1120	default:
1121		error = EOPNOTSUPP;
1122		break;
1123	}
1124	return (error);
1125}
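
/*
 * Illustrative userland use of the options handled above (not part of this
 * revision): fetching the connected peer's credentials with LOCAL_PEERCRED.
 * The option level is 0, matching the sopt_level check at the top of
 * uipc_ctloutput(); "s" is assumed to be a connected stream socket.
 *
 *	#include <sys/socket.h>
 *	#include <sys/ucred.h>
 *	#include <sys/un.h>
 *	#include <stdio.h>
 *
 *	struct xucred xu;
 *	socklen_t len = sizeof(xu);
 *
 *	if (getsockopt(s, 0, LOCAL_PEERCRED, &xu, &len) == 0)
 *		printf("peer euid: %u\n", (unsigned)xu.cr_uid);
 */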
1126
1127static int
1128unp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
1129{
1130	struct sockaddr_un *soun = (struct sockaddr_un *)nam;
1131	struct vnode *vp;
1132	struct socket *so2, *so3;
1133	struct unpcb *unp, *unp2, *unp3;
1134	int error, len, vfslocked;
1135	struct nameidata nd;
1136	char buf[SOCK_MAXADDRLEN];
1137	struct sockaddr *sa;
1138
1139	UNP_LINK_WLOCK_ASSERT();
1140
1141	unp = sotounpcb(so);
1142	KASSERT(unp != NULL, ("unp_connect: unp == NULL"));
1143
1144	len = nam->sa_len - offsetof(struct sockaddr_un, sun_path);
1145	if (len <= 0)
1146		return (EINVAL);
1147	bcopy(soun->sun_path, buf, len);
1148	buf[len] = 0;
1149
1150	UNP_PCB_LOCK(unp);
1151	if (unp->unp_flags & UNP_CONNECTING) {
1152		UNP_PCB_UNLOCK(unp);
1153		return (EALREADY);
1154	}
1155	UNP_LINK_WUNLOCK();
1156	unp->unp_flags |= UNP_CONNECTING;
1157	UNP_PCB_UNLOCK(unp);
1158
1159	sa = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
1160	NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | LOCKLEAF, UIO_SYSSPACE, buf,
1161	    td);
1162	error = namei(&nd);
1163	if (error)
1164		vp = NULL;
1165	else
1166		vp = nd.ni_vp;
1167	ASSERT_VOP_LOCKED(vp, "unp_connect");
1168	vfslocked = NDHASGIANT(&nd);
1169	NDFREE(&nd, NDF_ONLY_PNBUF);
1170	if (error)
1171		goto bad;
1172
1173	if (vp->v_type != VSOCK) {
1174		error = ENOTSOCK;
1175		goto bad;
1176	}
1177#ifdef MAC
1178	error = mac_vnode_check_open(td->td_ucred, vp, VWRITE | VREAD);
1179	if (error)
1180		goto bad;
1181#endif
1182	error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td);
1183	if (error)
1184		goto bad;
1185	VFS_UNLOCK_GIANT(vfslocked);
1186
1187	unp = sotounpcb(so);
1188	KASSERT(unp != NULL, ("unp_connect: unp == NULL"));
1189
1190	/*
1191	 * Take the linkage lock for two reasons: to make sure v_socket is
1192	 * stable, and to protect the simultaneous locking of multiple pcbs.
1193	 */
1194	UNP_LINK_WLOCK();
1195	so2 = vp->v_socket;
1196	if (so2 == NULL) {
1197		error = ECONNREFUSED;
1198		goto bad2;
1199	}
1200	if (so->so_type != so2->so_type) {
1201		error = EPROTOTYPE;
1202		goto bad2;
1203	}
1204	if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
1205		if (so2->so_options & SO_ACCEPTCONN) {
1206			so3 = sonewconn(so2, 0);
1207		} else
1208			so3 = NULL;
1209		if (so3 == NULL) {
1210			error = ECONNREFUSED;
1211			goto bad2;
1212		}
1213		unp = sotounpcb(so);
1214		unp2 = sotounpcb(so2);
1215		unp3 = sotounpcb(so3);
1216		UNP_PCB_LOCK(unp);
1217		UNP_PCB_LOCK(unp2);
1218		UNP_PCB_LOCK(unp3);
1219		if (unp2->unp_addr != NULL) {
1220			bcopy(unp2->unp_addr, sa, unp2->unp_addr->sun_len);
1221			unp3->unp_addr = (struct sockaddr_un *) sa;
1222			sa = NULL;
1223		}
1224
1225		/*
1226		 * The connecter's (client's) credentials are copied from its
1227		 * process structure at the time of connect() (which is now).
1228		 */
1229		cru2x(td->td_ucred, &unp3->unp_peercred);
1230		unp3->unp_flags |= UNP_HAVEPC;
1231
1232		/*
1233		 * The receiver's (server's) credentials are copied from the
1234		 * unp_peercred member of the socket on which the server called
1235		 * listen(); uipc_listen() cached that process's credentials
1236		 * at that time so we can use them now.
1237		 */
1238		KASSERT(unp2->unp_flags & UNP_HAVEPCCACHED,
1239		    ("unp_connect: listener without cached peercred"));
1240		memcpy(&unp->unp_peercred, &unp2->unp_peercred,
1241		    sizeof(unp->unp_peercred));
1242		unp->unp_flags |= UNP_HAVEPC;
1243		if (unp2->unp_flags & UNP_WANTCRED)
1244			unp3->unp_flags |= UNP_WANTCRED;
1245		UNP_PCB_UNLOCK(unp3);
1246		UNP_PCB_UNLOCK(unp2);
1247		UNP_PCB_UNLOCK(unp);
1248#ifdef MAC
1249		mac_socketpeer_set_from_socket(so, so3);
1250		mac_socketpeer_set_from_socket(so3, so);
1251#endif
1252
1253		so2 = so3;
1254	}
1255	unp = sotounpcb(so);
1256	KASSERT(unp != NULL, ("unp_connect: unp == NULL"));
1257	unp2 = sotounpcb(so2);
1258	KASSERT(unp2 != NULL, ("unp_connect: unp2 == NULL"));
1259	UNP_PCB_LOCK(unp);
1260	UNP_PCB_LOCK(unp2);
1261	error = unp_connect2(so, so2, PRU_CONNECT);
1262	UNP_PCB_UNLOCK(unp2);
1263	UNP_PCB_UNLOCK(unp);
1264bad2:
1265	UNP_LINK_WUNLOCK();
1266	if (vfslocked)
1267		/*
1268		 * Giant has been previously acquired.  This means the filesystem
1269		 * isn't MPSAFE, so acquire it once again.
1270		 */
1271		mtx_lock(&Giant);
1272bad:
1273	if (vp != NULL)
1274		vput(vp);
1275	VFS_UNLOCK_GIANT(vfslocked);
1276	free(sa, M_SONAME);
1277	UNP_LINK_WLOCK();
1278	UNP_PCB_LOCK(unp);
1279	unp->unp_flags &= ~UNP_CONNECTING;
1280	UNP_PCB_UNLOCK(unp);
1281	return (error);
1282}
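
/*
 * Illustrative userland counterpart of unp_connect() (not part of this
 * revision): connecting to a previously bound path.  unp_connect() resolves
 * the path with namei() and requires write access to the socket vnode.
 * "s" and the path are placeholders; headers as in the uipc_bind() sketch
 * above.
 *
 *	struct sockaddr_un sun;
 *
 *	memset(&sun, 0, sizeof(sun));
 *	sun.sun_family = AF_LOCAL;
 *	strlcpy(sun.sun_path, "/tmp/example.sock", sizeof(sun.sun_path));
 *	sun.sun_len = SUN_LEN(&sun);
 *	if (connect(s, (struct sockaddr *)&sun, SUN_LEN(&sun)) == -1)
 *		err(1, "connect");
 */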
1283
1284static int
1285unp_connect2(struct socket *so, struct socket *so2, int req)
1286{
1287	struct unpcb *unp;
1288	struct unpcb *unp2;
1289
1290	unp = sotounpcb(so);
1291	KASSERT(unp != NULL, ("unp_connect2: unp == NULL"));
1292	unp2 = sotounpcb(so2);
1293	KASSERT(unp2 != NULL, ("unp_connect2: unp2 == NULL"));
1294
1295	UNP_LINK_WLOCK_ASSERT();
1296	UNP_PCB_LOCK_ASSERT(unp);
1297	UNP_PCB_LOCK_ASSERT(unp2);
1298
1299	if (so2->so_type != so->so_type)
1300		return (EPROTOTYPE);
1301	unp->unp_conn = unp2;
1302
1303	switch (so->so_type) {
1304	case SOCK_DGRAM:
1305		LIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_reflink);
1306		soisconnected(so);
1307		break;
1308
1309	case SOCK_STREAM:
1310		unp2->unp_conn = unp;
1311		if (req == PRU_CONNECT &&
1312		    ((unp->unp_flags | unp2->unp_flags) & UNP_CONNWAIT))
1313			soisconnecting(so);
1314		else
1315			soisconnected(so);
1316		soisconnected(so2);
1317		break;
1318
1319	default:
1320		panic("unp_connect2");
1321	}
1322	return (0);
1323}
1324
1325static void
1326unp_disconnect(struct unpcb *unp, struct unpcb *unp2)
1327{
1328	struct socket *so;
1329
1330	KASSERT(unp2 != NULL, ("unp_disconnect: unp2 == NULL"));
1331
1332	UNP_LINK_WLOCK_ASSERT();
1333	UNP_PCB_LOCK_ASSERT(unp);
1334	UNP_PCB_LOCK_ASSERT(unp2);
1335
1336	unp->unp_conn = NULL;
1337	switch (unp->unp_socket->so_type) {
1338	case SOCK_DGRAM:
1339		LIST_REMOVE(unp, unp_reflink);
1340		so = unp->unp_socket;
1341		SOCK_LOCK(so);
1342		so->so_state &= ~SS_ISCONNECTED;
1343		SOCK_UNLOCK(so);
1344		break;
1345
1346	case SOCK_STREAM:
1347		soisdisconnected(unp->unp_socket);
1348		unp2->unp_conn = NULL;
1349		soisdisconnected(unp2->unp_socket);
1350		break;
1351	}
1352}
1353
1354/*
1355 * unp_pcblist() walks the global list of struct unpcb's to generate a
1356 * pointer list, bumping the refcount on each unpcb.  It then copies them out
1357 * sequentially, validating the generation number on each to see if it has
1358 * been detached.  All of this is necessary because copyout() may sleep on
1359 * disk I/O.
1360 */
1361static int
1362unp_pcblist(SYSCTL_HANDLER_ARGS)
1363{
1364	int error, i, n;
1365	int freeunp;
1366	struct unpcb *unp, **unp_list;
1367	unp_gen_t gencnt;
1368	struct xunpgen *xug;
1369	struct unp_head *head;
1370	struct xunpcb *xu;
1371
1372	head = ((intptr_t)arg1 == SOCK_DGRAM ? &unp_dhead : &unp_shead);
1373
1374	/*
1375	 * The process of preparing the PCB list is too time-consuming and
1376	 * resource-intensive to repeat twice on every request.
1377	 */
1378	if (req->oldptr == NULL) {
1379		n = unp_count;
1380		req->oldidx = 2 * (sizeof *xug)
1381			+ (n + n/8) * sizeof(struct xunpcb);
1382		return (0);
1383	}
1384
1385	if (req->newptr != NULL)
1386		return (EPERM);
1387
1388	/*
1389	 * OK, now we're committed to doing something.
1390	 */
1391	xug = malloc(sizeof(*xug), M_TEMP, M_WAITOK);
1392	UNP_LIST_LOCK();
1393	gencnt = unp_gencnt;
1394	n = unp_count;
1395	UNP_LIST_UNLOCK();
1396
1397	xug->xug_len = sizeof *xug;
1398	xug->xug_count = n;
1399	xug->xug_gen = gencnt;
1400	xug->xug_sogen = so_gencnt;
1401	error = SYSCTL_OUT(req, xug, sizeof *xug);
1402	if (error) {
1403		free(xug, M_TEMP);
1404		return (error);
1405	}
1406
1407	unp_list = malloc(n * sizeof *unp_list, M_TEMP, M_WAITOK);
1408
1409	UNP_LIST_LOCK();
1410	for (unp = LIST_FIRST(head), i = 0; unp && i < n;
1411	     unp = LIST_NEXT(unp, unp_link)) {
1412		UNP_PCB_LOCK(unp);
1413		if (unp->unp_gencnt <= gencnt) {
1414			if (cr_cansee(req->td->td_ucred,
1415			    unp->unp_socket->so_cred)) {
1416				UNP_PCB_UNLOCK(unp);
1417				continue;
1418			}
1419			unp_list[i++] = unp;
1420			unp->unp_refcount++;
1421		}
1422		UNP_PCB_UNLOCK(unp);
1423	}
1424	UNP_LIST_UNLOCK();
1425	n = i;			/* In case we lost some during malloc. */
1426
1427	error = 0;
1428	xu = malloc(sizeof(*xu), M_TEMP, M_WAITOK | M_ZERO);
1429	for (i = 0; i < n; i++) {
1430		unp = unp_list[i];
1431		UNP_PCB_LOCK(unp);
1432		unp->unp_refcount--;
1433	        if (unp->unp_refcount != 0 && unp->unp_gencnt <= gencnt) {
1434			xu->xu_len = sizeof *xu;
1435			xu->xu_unpp = unp;
1436			/*
1437			 * XXX - need more locking here to protect against
1438			 * connect/disconnect races for SMP.
1439			 */
1440			if (unp->unp_addr != NULL)
1441				bcopy(unp->unp_addr, &xu->xu_addr,
1442				      unp->unp_addr->sun_len);
1443			if (unp->unp_conn != NULL &&
1444			    unp->unp_conn->unp_addr != NULL)
1445				bcopy(unp->unp_conn->unp_addr,
1446				      &xu->xu_caddr,
1447				      unp->unp_conn->unp_addr->sun_len);
1448			bcopy(unp, &xu->xu_unp, sizeof *unp);
1449			sotoxsocket(unp->unp_socket, &xu->xu_socket);
1450			UNP_PCB_UNLOCK(unp);
1451			error = SYSCTL_OUT(req, xu, sizeof *xu);
1452		} else {
1453			freeunp = (unp->unp_refcount == 0);
1454			UNP_PCB_UNLOCK(unp);
1455			if (freeunp) {
1456				UNP_PCB_LOCK_DESTROY(unp);
1457				uma_zfree(unp_zone, unp);
1458			}
1459		}
1460	}
1461	free(xu, M_TEMP);
1462	if (!error) {
1463		/*
1464		 * Give the user an updated idea of our state.  If the
1465		 * generation differs from what we told her before, she knows
1466		 * that something happened while we were processing this
1467		 * request, and it might be necessary to retry.
1468		 */
1469		xug->xug_gen = unp_gencnt;
1470		xug->xug_sogen = so_gencnt;
1471		xug->xug_count = unp_count;
1472		error = SYSCTL_OUT(req, xug, sizeof *xug);
1473	}
1474	free(unp_list, M_TEMP);
1475	free(xug, M_TEMP);
1476	return (error);
1477}
1478
1479SYSCTL_PROC(_net_local_dgram, OID_AUTO, pcblist, CTLFLAG_RD,
1480	    (caddr_t)(long)SOCK_DGRAM, 0, unp_pcblist, "S,xunpcb",
1481	    "List of active local datagram sockets");
1482SYSCTL_PROC(_net_local_stream, OID_AUTO, pcblist, CTLFLAG_RD,
1483	    (caddr_t)(long)SOCK_STREAM, 0, unp_pcblist, "S,xunpcb",
1484	    "List of active local stream sockets");
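
/*
 * Illustrative userland consumer of the pcblist sysctls above (not part of
 * this revision); tools such as netstat(1) read these.  The raw buffer
 * begins and ends with a struct xunpgen and carries struct xunpcb records
 * in between (see unp_pcblist() above).
 *
 *	#include <sys/types.h>
 *	#include <sys/sysctl.h>
 *	#include <stdlib.h>
 *
 *	size_t len = 0;
 *	char *buf = NULL;
 *
 *	if (sysctlbyname("net.local.stream.pcblist", NULL, &len,
 *	    NULL, 0) == 0 && (buf = malloc(len)) != NULL)
 *		(void)sysctlbyname("net.local.stream.pcblist", buf, &len,
 *		    NULL, 0);
 */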
1485
1486static void
1487unp_shutdown(struct unpcb *unp)
1488{
1489	struct unpcb *unp2;
1490	struct socket *so;
1491
1492	UNP_LINK_WLOCK_ASSERT();
1493	UNP_PCB_LOCK_ASSERT(unp);
1494
1495	unp2 = unp->unp_conn;
1496	if (unp->unp_socket->so_type == SOCK_STREAM && unp2 != NULL) {
1497		so = unp2->unp_socket;
1498		if (so != NULL)
1499			socantrcvmore(so);
1500	}
1501}
1502
1503static void
1504unp_drop(struct unpcb *unp, int errno)
1505{
1506	struct socket *so = unp->unp_socket;
1507	struct unpcb *unp2;
1508
1509	UNP_LINK_WLOCK_ASSERT();
1510	UNP_PCB_LOCK_ASSERT(unp);
1511
1512	so->so_error = errno;
1513	unp2 = unp->unp_conn;
1514	if (unp2 == NULL)
1515		return;
1516	UNP_PCB_LOCK(unp2);
1517	unp_disconnect(unp, unp2);
1518	UNP_PCB_UNLOCK(unp2);
1519}
1520
1521static void
1522unp_freerights(struct file **rp, int fdcount)
1523{
1524	int i;
1525	struct file *fp;
1526
1527	for (i = 0; i < fdcount; i++) {
1528		fp = *rp;
1529		*rp++ = NULL;
1530		unp_discard(fp);
1531	}
1532}
1533
1534static int
1535unp_externalize(struct mbuf *control, struct mbuf **controlp)
1536{
1537	struct thread *td = curthread;		/* XXX */
1538	struct cmsghdr *cm = mtod(control, struct cmsghdr *);
1539	int i;
1540	int *fdp;
1541	struct file **rp;
1542	struct file *fp;
1543	void *data;
1544	socklen_t clen = control->m_len, datalen;
1545	int error, newfds;
1546	int f;
1547	u_int newlen;
1548
1549	UNP_LINK_UNLOCK_ASSERT();
1550
1551	error = 0;
1552	if (controlp != NULL) /* controlp == NULL => free control messages */
1553		*controlp = NULL;
1554	while (cm != NULL) {
1555		if (sizeof(*cm) > clen || cm->cmsg_len > clen) {
1556			error = EINVAL;
1557			break;
1558		}
1559		data = CMSG_DATA(cm);
1560		datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data;
1561		if (cm->cmsg_level == SOL_SOCKET
1562		    && cm->cmsg_type == SCM_RIGHTS) {
1563			newfds = datalen / sizeof(struct file *);
1564			rp = data;
1565
1566			/* If we're not outputting the descriptors, free them. */
1567			if (error || controlp == NULL) {
1568				unp_freerights(rp, newfds);
1569				goto next;
1570			}
1571			FILEDESC_XLOCK(td->td_proc->p_fd);
1572			/* If the new FDs will not fit, free them. */
1573			if (!fdavail(td, newfds)) {
1574				FILEDESC_XUNLOCK(td->td_proc->p_fd);
1575				error = EMSGSIZE;
1576				unp_freerights(rp, newfds);
1577				goto next;
1578			}
1579
1580			/*
1581			 * Now change each pointer to an fd in the global
1582			 * table to an integer that is the index to the local
1583			 * fd table entry that we set up to point to the
1584			 * global one we are transferring.
1585			 */
1586			newlen = newfds * sizeof(int);
1587			*controlp = sbcreatecontrol(NULL, newlen,
1588			    SCM_RIGHTS, SOL_SOCKET);
1589			if (*controlp == NULL) {
1590				FILEDESC_XUNLOCK(td->td_proc->p_fd);
1591				error = E2BIG;
1592				unp_freerights(rp, newfds);
1593				goto next;
1594			}
1595
1596			fdp = (int *)
1597			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
1598			for (i = 0; i < newfds; i++) {
1599				if (fdalloc(td, 0, &f))
1600					panic("unp_externalize fdalloc failed");
1601				fp = *rp++;
1602				td->td_proc->p_fd->fd_ofiles[f] = fp;
1603				unp_externalize_fp(fp);
1604				*fdp++ = f;
1605			}
1606			FILEDESC_XUNLOCK(td->td_proc->p_fd);
1607		} else {
1608			/* We can just copy anything else across. */
1609			if (error || controlp == NULL)
1610				goto next;
1611			*controlp = sbcreatecontrol(NULL, datalen,
1612			    cm->cmsg_type, cm->cmsg_level);
1613			if (*controlp == NULL) {
1614				error = ENOBUFS;
1615				goto next;
1616			}
1617			bcopy(data,
1618			    CMSG_DATA(mtod(*controlp, struct cmsghdr *)),
1619			    datalen);
1620		}
1621		controlp = &(*controlp)->m_next;
1622
1623next:
1624		if (CMSG_SPACE(datalen) < clen) {
1625			clen -= CMSG_SPACE(datalen);
1626			cm = (struct cmsghdr *)
1627			    ((caddr_t)cm + CMSG_SPACE(datalen));
1628		} else {
1629			clen = 0;
1630			cm = NULL;
1631		}
1632	}
1633
1634	m_freem(control);
1635	return (error);
1636}
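
/*
 * Illustrative userland receive side (not part of this revision): after
 * unp_externalize() has turned in-flight struct file pointers back into
 * descriptors in the receiver's table, they arrive as SCM_RIGHTS ancillary
 * data on recvmsg(2).  This pairs with the send sketch near the top of the
 * file; "sv[1]" is the other end of that socketpair.
 *
 *	char byte;
 *	union {
 *		struct cmsghdr hdr;
 *		char buf[CMSG_SPACE(sizeof(int))];
 *	} cmsgbuf;
 *	struct iovec iov = { .iov_base = &byte, .iov_len = 1 };
 *	struct msghdr msg;
 *	struct cmsghdr *cm;
 *	int received_fd = -1;
 *
 *	memset(&msg, 0, sizeof(msg));
 *	msg.msg_iov = &iov;
 *	msg.msg_iovlen = 1;
 *	msg.msg_control = cmsgbuf.buf;
 *	msg.msg_controllen = sizeof(cmsgbuf.buf);
 *	if (recvmsg(sv[1], &msg, 0) >= 0 &&
 *	    (cm = CMSG_FIRSTHDR(&msg)) != NULL &&
 *	    cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_RIGHTS)
 *		memcpy(&received_fd, CMSG_DATA(cm), sizeof(int));
 */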
1637
1638static void
1639unp_zone_change(void *tag)
1640{
1641
1642	uma_zone_set_max(unp_zone, maxsockets);
1643}
1644
1645static void
1646unp_init(void)
1647{
1648
1649#ifdef VIMAGE
1650	if (!IS_DEFAULT_VNET(curvnet))
1651		return;
1652#endif
1653	unp_zone = uma_zcreate("unpcb", sizeof(struct unpcb), NULL, NULL,
1654	    NULL, NULL, UMA_ALIGN_PTR, 0);
1655	if (unp_zone == NULL)
1656		panic("unp_init");
1657	uma_zone_set_max(unp_zone, maxsockets);
1658	EVENTHANDLER_REGISTER(maxsockets_change, unp_zone_change,
1659	    NULL, EVENTHANDLER_PRI_ANY);
1660	LIST_INIT(&unp_dhead);
1661	LIST_INIT(&unp_shead);
1662	TASK_INIT(&unp_gc_task, 0, unp_gc, NULL);
1663	UNP_LINK_LOCK_INIT();
1664	UNP_LIST_LOCK_INIT();
1665}
1666
1667static int
1668unp_internalize(struct mbuf **controlp, struct thread *td)
1669{
1670	struct mbuf *control = *controlp;
1671	struct proc *p = td->td_proc;
1672	struct filedesc *fdescp = p->p_fd;
1673	struct cmsghdr *cm = mtod(control, struct cmsghdr *);
1674	struct cmsgcred *cmcred;
1675	struct file **rp;
1676	struct file *fp;
1677	struct timeval *tv;
1678	int i, fd, *fdp;
1679	void *data;
1680	socklen_t clen = control->m_len, datalen;
1681	int error, oldfds;
1682	u_int newlen;
1683
1684	UNP_LINK_UNLOCK_ASSERT();
1685
1686	error = 0;
1687	*controlp = NULL;
1688	while (cm != NULL) {
1689		if (sizeof(*cm) > clen || cm->cmsg_level != SOL_SOCKET
1690		    || cm->cmsg_len > clen) {
1691			error = EINVAL;
1692			goto out;
1693		}
1694		data = CMSG_DATA(cm);
1695		datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data;
1696
1697		switch (cm->cmsg_type) {
1698		/*
1699		 * Fill in credential information.
1700		 */
1701		case SCM_CREDS:
1702			*controlp = sbcreatecontrol(NULL, sizeof(*cmcred),
1703			    SCM_CREDS, SOL_SOCKET);
1704			if (*controlp == NULL) {
1705				error = ENOBUFS;
1706				goto out;
1707			}
1708			cmcred = (struct cmsgcred *)
1709			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
1710			cmcred->cmcred_pid = p->p_pid;
1711			cmcred->cmcred_uid = td->td_ucred->cr_ruid;
1712			cmcred->cmcred_gid = td->td_ucred->cr_rgid;
1713			cmcred->cmcred_euid = td->td_ucred->cr_uid;
1714			cmcred->cmcred_ngroups = MIN(td->td_ucred->cr_ngroups,
1715			    CMGROUP_MAX);
1716			for (i = 0; i < cmcred->cmcred_ngroups; i++)
1717				cmcred->cmcred_groups[i] =
1718				    td->td_ucred->cr_groups[i];
1719			break;
1720
1721		case SCM_RIGHTS:
1722			oldfds = datalen / sizeof (int);
1723			/*
1724			 * Check that all the FDs passed in refer to legal
1725			 * files.  If not, reject the entire operation.
1726			 */
1727			fdp = data;
1728			FILEDESC_SLOCK(fdescp);
1729			for (i = 0; i < oldfds; i++) {
1730				fd = *fdp++;
1731				if ((unsigned)fd >= fdescp->fd_nfiles ||
1732				    fdescp->fd_ofiles[fd] == NULL) {
1733					FILEDESC_SUNLOCK(fdescp);
1734					error = EBADF;
1735					goto out;
1736				}
1737				fp = fdescp->fd_ofiles[fd];
1738				if (!(fp->f_ops->fo_flags & DFLAG_PASSABLE)) {
1739					FILEDESC_SUNLOCK(fdescp);
1740					error = EOPNOTSUPP;
1741					goto out;
1742				}
1743
1744			}
1745
1746			/*
1747			 * Now replace the integer FDs with pointers to the
1748			 * associated global file table entries.
1749			 */
1750			newlen = oldfds * sizeof(struct file *);
1751			*controlp = sbcreatecontrol(NULL, newlen,
1752			    SCM_RIGHTS, SOL_SOCKET);
1753			if (*controlp == NULL) {
1754				FILEDESC_SUNLOCK(fdescp);
1755				error = E2BIG;
1756				goto out;
1757			}
1758			fdp = data;
1759			rp = (struct file **)
1760			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
1761			for (i = 0; i < oldfds; i++) {
1762				fp = fdescp->fd_ofiles[*fdp++];
1763				*rp++ = fp;
1764				unp_internalize_fp(fp);
1765			}
1766			FILEDESC_SUNLOCK(fdescp);
1767			break;
1768
1769		case SCM_TIMESTAMP:
1770			*controlp = sbcreatecontrol(NULL, sizeof(*tv),
1771			    SCM_TIMESTAMP, SOL_SOCKET);
1772			if (*controlp == NULL) {
1773				error = ENOBUFS;
1774				goto out;
1775			}
1776			tv = (struct timeval *)
1777			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
1778			microtime(tv);
1779			break;
1780
1781		default:
1782			error = EINVAL;
1783			goto out;
1784		}
1785
1786		controlp = &(*controlp)->m_next;
1787		if (CMSG_SPACE(datalen) < clen) {
1788			clen -= CMSG_SPACE(datalen);
1789			cm = (struct cmsghdr *)
1790			    ((caddr_t)cm + CMSG_SPACE(datalen));
1791		} else {
1792			clen = 0;
1793			cm = NULL;
1794		}
1795	}
1796
1797out:
1798	m_freem(control);
1799	return (error);
1800}
1801
1802static struct mbuf *
1803unp_addsockcred(struct thread *td, struct mbuf *control)
1804{
1805	struct mbuf *m, *n, *n_prev;
1806	struct sockcred *sc;
1807	const struct cmsghdr *cm;
1808	int ngroups;
1809	int i;
1810
1811	ngroups = MIN(td->td_ucred->cr_ngroups, CMGROUP_MAX);
1812	m = sbcreatecontrol(NULL, SOCKCREDSIZE(ngroups), SCM_CREDS, SOL_SOCKET);
1813	if (m == NULL)
1814		return (control);
1815
1816	sc = (struct sockcred *) CMSG_DATA(mtod(m, struct cmsghdr *));
1817	sc->sc_uid = td->td_ucred->cr_ruid;
1818	sc->sc_euid = td->td_ucred->cr_uid;
1819	sc->sc_gid = td->td_ucred->cr_rgid;
1820	sc->sc_egid = td->td_ucred->cr_gid;
1821	sc->sc_ngroups = ngroups;
1822	for (i = 0; i < sc->sc_ngroups; i++)
1823		sc->sc_groups[i] = td->td_ucred->cr_groups[i];
1824
1825	/*
1826	 * Unlink SCM_CREDS control messages (struct cmsgcred), since just
1827	 * created SCM_CREDS control message (struct sockcred) has another
1828	 * format.
1829	 */
1830	if (control != NULL)
1831		for (n = control, n_prev = NULL; n != NULL;) {
1832			cm = mtod(n, struct cmsghdr *);
1833    			if (cm->cmsg_level == SOL_SOCKET &&
1834			    cm->cmsg_type == SCM_CREDS) {
1835    				if (n_prev == NULL)
1836					control = n->m_next;
1837				else
1838					n_prev->m_next = n->m_next;
1839				n = m_free(n);
1840			} else {
1841				n_prev = n;
1842				n = n->m_next;
1843			}
1844		}
1845
1846	/* Prepend it to the head. */
1847	m->m_next = control;
1848	return (m);
1849}
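
/*
 * Illustrative userland view of the message built above (not part of this
 * revision): with the LOCAL_CREDS option set, a receiver finds an SCM_CREDS
 * control message whose payload is a struct sockcred (not a struct
 * cmsgcred), as constructed by unp_addsockcred().  "msg" is a struct msghdr
 * filled in by recvmsg(2).
 *
 *	struct cmsghdr *cm = CMSG_FIRSTHDR(&msg);
 *	struct sockcred *sc;
 *
 *	if (cm != NULL && cm->cmsg_level == SOL_SOCKET &&
 *	    cm->cmsg_type == SCM_CREDS) {
 *		sc = (struct sockcred *)CMSG_DATA(cm);
 *		printf("sender uid %u euid %u\n",
 *		    (unsigned)sc->sc_uid, (unsigned)sc->sc_euid);
 *	}
 */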
1850
1851static struct unpcb *
1852fptounp(struct file *fp)
1853{
1854	struct socket *so;
1855
1856	if (fp->f_type != DTYPE_SOCKET)
1857		return (NULL);
1858	if ((so = fp->f_data) == NULL)
1859		return (NULL);
1860	if (so->so_proto->pr_domain != &localdomain)
1861		return (NULL);
1862	return sotounpcb(so);
1863}
1864
1865static void
1866unp_discard(struct file *fp)
1867{
1868
1869	unp_externalize_fp(fp);
1870	(void) closef(fp, (struct thread *)NULL);
1871}
1872
1873static void
1874unp_internalize_fp(struct file *fp)
1875{
1876	struct unpcb *unp;
1877
1878	UNP_LINK_WLOCK();
1879	if ((unp = fptounp(fp)) != NULL) {
1880		unp->unp_file = fp;
1881		unp->unp_msgcount++;
1882	}
1883	fhold(fp);
1884	unp_rights++;
1885	UNP_LINK_WUNLOCK();
1886}
1887
1888static void
1889unp_externalize_fp(struct file *fp)
1890{
1891	struct unpcb *unp;
1892
1893	UNP_LINK_WLOCK();
1894	if ((unp = fptounp(fp)) != NULL)
1895		unp->unp_msgcount--;
1896	unp_rights--;
1897	UNP_LINK_WUNLOCK();
1898}
1899
1900/*
1901 * unp_marked and unp_unreachable carry state between passes of unp_gc().
1902 * They are used only from the garbage collector's taskqueue context and do
1903 * not require explicit synchronization.
1904 */
1905static int	unp_marked;
1906static int	unp_unreachable;
1907
1908static void
1909unp_accessable(struct file *fp)
1910{
1911	struct unpcb *unp;
1912
1913	if ((unp = fptounp(fp)) == NULL)
1914		return;
1915	if (unp->unp_gcflag & UNPGC_REF)
1916		return;
1917	unp->unp_gcflag &= ~UNPGC_DEAD;
1918	unp->unp_gcflag |= UNPGC_REF;
1919	unp_marked++;
1920}
1921
1922static void
1923unp_gc_process(struct unpcb *unp)
1924{
1925	struct socket *soa;
1926	struct socket *so;
1927	struct file *fp;
1928
1929	/* Already processed. */
1930	if (unp->unp_gcflag & UNPGC_SCANNED)
1931		return;
1932	fp = unp->unp_file;
1933
1934	/*
1935	 * Check for a socket potentially in a cycle.  It must be in a
1936	 * queue as indicated by msgcount, and this must equal the file
1937	 * reference count.  Note that when msgcount is 0 the file is NULL.
1938	 */
1939	if ((unp->unp_gcflag & UNPGC_REF) == 0 && fp &&
1940	    unp->unp_msgcount != 0 && fp->f_count == unp->unp_msgcount) {
1941		unp->unp_gcflag |= UNPGC_DEAD;
1942		unp_unreachable++;
1943		return;
1944	}
1945
1946	/*
1947	 * Mark all sockets we reference with RIGHTS.
1948	 */
1949	so = unp->unp_socket;
1950	SOCKBUF_LOCK(&so->so_rcv);
1951	unp_scan(so->so_rcv.sb_mb, unp_accessable);
1952	SOCKBUF_UNLOCK(&so->so_rcv);
1953
1954	/*
1955	 * Mark all sockets in our accept queue.
1956	 */
1957	ACCEPT_LOCK();
1958	TAILQ_FOREACH(soa, &so->so_comp, so_list) {
1959		SOCKBUF_LOCK(&soa->so_rcv);
1960		unp_scan(soa->so_rcv.sb_mb, unp_accessable);
1961		SOCKBUF_UNLOCK(&soa->so_rcv);
1962	}
1963	ACCEPT_UNLOCK();
1964	unp->unp_gcflag |= UNPGC_SCANNED;
1965}
1966
1967static int unp_recycled;
1968SYSCTL_INT(_net_local, OID_AUTO, recycled, CTLFLAG_RD, &unp_recycled, 0,
1969    "Number of unreachable sockets claimed by the garbage collector.");
1970
1971static int unp_taskcount;
1972SYSCTL_INT(_net_local, OID_AUTO, taskcount, CTLFLAG_RD, &unp_taskcount, 0,
1973    "Number of times the garbage collector has run.");
1974
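/*
 * A note on why a collector is needed at all: a descriptor can end up
 * referenced only by an SCM_RIGHTS message queued on a socket that is
 * itself unreachable, so no close() will ever release it.  Purely as an
 * illustration (not part of the kernel source; a userland sketch with
 * error handling omitted, needing <sys/socket.h>, <sys/uio.h> and
 * <string.h>), such a cycle can be created as follows:
 *
 *	int fd[2];
 *	char c = 0, buf[CMSG_SPACE(sizeof(int))];
 *	struct iovec iov = { &c, 1 };
 *	struct msghdr msg;
 *	struct cmsghdr *cm;
 *
 *	socketpair(PF_LOCAL, SOCK_STREAM, 0, fd);
 *	memset(&msg, 0, sizeof(msg));
 *	msg.msg_iov = &iov;
 *	msg.msg_iovlen = 1;
 *	msg.msg_control = buf;
 *	msg.msg_controllen = sizeof(buf);
 *	cm = CMSG_FIRSTHDR(&msg);
 *	cm->cmsg_level = SOL_SOCKET;
 *	cm->cmsg_type = SCM_RIGHTS;
 *	cm->cmsg_len = CMSG_LEN(sizeof(int));
 *	memcpy(CMSG_DATA(cm), &fd[1], sizeof(int));
 *	sendmsg(fd[0], &msg, 0);	(delivered to fd[1]'s receive buffer)
 *	close(fd[0]);
 *	close(fd[1]);
 *
 * After both close() calls the only reference to fd[1]'s file is the queued
 * SCM_RIGHTS message in its own receive buffer, which nothing can read;
 * unp_gc() notices that f_count equals unp_msgcount and reclaims it.
 */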
1975static void
1976unp_gc(__unused void *arg, int pending)
1977{
1978	struct unp_head *heads[] = { &unp_dhead, &unp_shead, NULL };
1979	struct unp_head **head;
1980	struct file **unref;
1981	struct unpcb *unp;
1982	int i;
1983
1984	unp_taskcount++;
1985	UNP_LIST_LOCK();
1986	/*
1987	 * First clear all gc flags from previous runs.
1988	 */
1989	for (head = heads; *head != NULL; head++)
1990		LIST_FOREACH(unp, *head, unp_link)
1991			unp->unp_gcflag = 0;
1992
1993	/*
1994	 * Scan marking all reachable sockets with UNPGC_REF.  Once a socket
1995	 * is reachable all of the sockets it references are reachable.
1996	 * Stop the scan once we do a complete loop without discovering
1997	 * a new reachable socket.
1998	 */
1999	do {
2000		unp_unreachable = 0;
2001		unp_marked = 0;
2002		for (head = heads; *head != NULL; head++)
2003			LIST_FOREACH(unp, *head, unp_link)
2004				unp_gc_process(unp);
2005	} while (unp_marked);
2006	UNP_LIST_UNLOCK();
2007	if (unp_unreachable == 0)
2008		return;
2009
2010	/*
2011	 * Allocate space for a local list of dead unpcbs.
2012	 */
2013	unref = malloc(unp_unreachable * sizeof(struct file *),
2014	    M_TEMP, M_WAITOK);
2015
2016	/*
2017	 * Iterate looking for sockets which have been specifically marked
2018	 * as unreachable and store them locally.
2019	 */
2020	UNP_LIST_LOCK();
2021	for (i = 0, head = heads; *head != NULL; head++)
2022		LIST_FOREACH(unp, *head, unp_link)
2023			if (unp->unp_gcflag & UNPGC_DEAD) {
2024				KASSERT(unp->unp_file != NULL,
2025				    ("unp_gc: Invalid unpcb."));
2026				unref[i++] = unp->unp_file;
2027				fhold(unp->unp_file);
2028				KASSERT(i <= unp_unreachable,
2029				    ("unp_gc: incorrect unreachable count."));
2030			}
2031	UNP_LIST_UNLOCK();
2032
2033	/*
2034	 * Now flush all sockets, freeing rights.  This will free the
2035	 * struct files associated with these sockets but leave each socket
2036	 * with one remaining ref.
2037	 */
2038	for (i = 0; i < unp_unreachable; i++)
2039		sorflush(unref[i]->f_data);
2040
2041	/*
2042	 * And finally release the sockets so they can be reclaimed.
2043	 */
2044	for (i = 0; i < unp_unreachable; i++)
2045		fdrop(unref[i], NULL);
2046	unp_recycled += unp_unreachable;
2047	free(unref, M_TEMP);
2048}
2049
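/*
 * Dispose of any file references embedded in an mbuf chain that is being
 * torn down, discarding each right found in an SCM_RIGHTS control message.
 */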
2050static void
2051unp_dispose(struct mbuf *m)
2052{
2053
2054	if (m)
2055		unp_scan(m, unp_discard);
2056}
2057
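/*
 * Walk a chain of records, locating MT_CONTROL mbufs and parsing their
 * control messages; invoke the supplied operation on every file pointer
 * carried in an SCM_RIGHTS message.
 */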
2058static void
2059unp_scan(struct mbuf *m0, void (*op)(struct file *))
2060{
2061	struct mbuf *m;
2062	struct file **rp;
2063	struct cmsghdr *cm;
2064	void *data;
2065	int i;
2066	socklen_t clen, datalen;
2067	int qfds;
2068
2069	while (m0 != NULL) {
2070		for (m = m0; m; m = m->m_next) {
2071			if (m->m_type != MT_CONTROL)
2072				continue;
2073
2074			cm = mtod(m, struct cmsghdr *);
2075			clen = m->m_len;
2076
2077			while (cm != NULL) {
2078				if (sizeof(*cm) > clen || cm->cmsg_len > clen)
2079					break;
2080
2081				data = CMSG_DATA(cm);
2082				datalen = (caddr_t)cm + cm->cmsg_len
2083				    - (caddr_t)data;
2084
2085				if (cm->cmsg_level == SOL_SOCKET &&
2086				    cm->cmsg_type == SCM_RIGHTS) {
2087					qfds = datalen / sizeof (struct file *);
2088					rp = data;
2089					for (i = 0; i < qfds; i++)
2090						(*op)(*rp++);
2091				}
2092
2093				if (CMSG_SPACE(datalen) < clen) {
2094					clen -= CMSG_SPACE(datalen);
2095					cm = (struct cmsghdr *)
2096					    ((caddr_t)cm + CMSG_SPACE(datalen));
2097				} else {
2098					clen = 0;
2099					cm = NULL;
2100				}
2101			}
2102		}
2103		m0 = m0->m_act;
2104	}
2105}
2106
2107#ifdef DDB
2108static void
2109db_print_indent(int indent)
2110{
2111	int i;
2112
2113	for (i = 0; i < indent; i++)
2114		db_printf(" ");
2115}
2116
2117static void
2118db_print_unpflags(int unp_flags)
2119{
2120	int comma;
2121
2122	comma = 0;
2123	if (unp_flags & UNP_HAVEPC) {
2124		db_printf("%sUNP_HAVEPC", comma ? ", " : "");
2125		comma = 1;
2126	}
2127	if (unp_flags & UNP_HAVEPCCACHED) {
2128		db_printf("%sUNP_HAVEPCCACHED", comma ? ", " : "");
2129		comma = 1;
2130	}
2131	if (unp_flags & UNP_WANTCRED) {
2132		db_printf("%sUNP_WANTCRED", comma ? ", " : "");
2133		comma = 1;
2134	}
2135	if (unp_flags & UNP_CONNWAIT) {
2136		db_printf("%sUNP_CONNWAIT", comma ? ", " : "");
2137		comma = 1;
2138	}
2139	if (unp_flags & UNP_CONNECTING) {
2140		db_printf("%sUNP_CONNECTING", comma ? ", " : "");
2141		comma = 1;
2142	}
2143	if (unp_flags & UNP_BINDING) {
2144		db_printf("%sUNP_BINDING", comma ? ", " : "");
2145		comma = 1;
2146	}
2147}
2148
2149static void
2150db_print_xucred(int indent, struct xucred *xu)
2151{
2152	int comma, i;
2153
2154	db_print_indent(indent);
2155	db_printf("cr_version: %u   cr_uid: %u   cr_ngroups: %d\n",
2156	    xu->cr_version, xu->cr_uid, xu->cr_ngroups);
2157	db_print_indent(indent);
2158	db_printf("cr_groups: ");
2159	comma = 0;
2160	for (i = 0; i < xu->cr_ngroups; i++) {
2161		db_printf("%s%u", comma ? ", " : "", xu->cr_groups[i]);
2162		comma = 1;
2163	}
2164	db_printf("\n");
2165}
2166
2167static void
2168db_print_unprefs(int indent, struct unp_head *uh)
2169{
2170	struct unpcb *unp;
2171	int counter;
2172
2173	counter = 0;
2174	LIST_FOREACH(unp, uh, unp_reflink) {
2175		if (counter % 4 == 0)
2176			db_print_indent(indent);
2177		db_printf("%p  ", unp);
2178		if (counter % 4 == 3)
2179			db_printf("\n");
2180		counter++;
2181	}
2182	if (counter != 0 && counter % 4 != 0)
2183		db_printf("\n");
2184}
2185
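/*
 * DDB command to dump a unpcb, invoked from the debugger as
 * "show unpcb <addr>".
 */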
2186DB_SHOW_COMMAND(unpcb, db_show_unpcb)
2187{
2188	struct unpcb *unp;
2189
2190	if (!have_addr) {
2191		db_printf("usage: show unpcb <addr>\n");
2192		return;
2193	}
2194	unp = (struct unpcb *)addr;
2195
2196	db_printf("unp_socket: %p   unp_vnode: %p\n", unp->unp_socket,
2197	    unp->unp_vnode);
2198
2199	db_printf("unp_ino: %d   unp_conn: %p\n", unp->unp_ino,
2200	    unp->unp_conn);
2201
2202	db_printf("unp_refs:\n");
2203	db_print_unprefs(2, &unp->unp_refs);
2204
2205	/* XXXRW: Would be nice to print the full address, if any. */
2206	db_printf("unp_addr: %p\n", unp->unp_addr);
2207
2208	db_printf("unp_cc: %d   unp_mbcnt: %d   unp_gencnt: %llu\n",
2209	    unp->unp_cc, unp->unp_mbcnt,
2210	    (unsigned long long)unp->unp_gencnt);
2211
2212	db_printf("unp_flags: %x (", unp->unp_flags);
2213	db_print_unpflags(unp->unp_flags);
2214	db_printf(")\n");
2215
2216	db_printf("unp_peercred:\n");
2217	db_print_xucred(2, &unp->unp_peercred);
2218
2219	db_printf("unp_refcount: %u\n", unp->unp_refcount);
2220}
2221#endif
2222