/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1982, 1986, 1989, 1991, 1993
 *	The Regents of the University of California. All Rights Reserved.
 * Copyright (c) 2004-2009 Robert N. M. Watson All Rights Reserved.
 * Copyright (c) 2018 Matthew Macy
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	From: @(#)uipc_usrreq.c	8.3 (Berkeley) 1/4/94
 */

/*
 * UNIX Domain (Local) Sockets
 *
 * This is an implementation of UNIX (local) domain sockets.  Each socket has
 * an associated struct unpcb (UNIX protocol control block).  Stream sockets
 * may be connected to 0 or 1 other socket.  Datagram sockets may be
 * connected to 0, 1, or many other sockets.  Sockets may be created and
 * connected in pairs (socketpair(2)), or bound/connected to using the file
 * system name space.  For most purposes, only the receive socket buffer is
 * used, as sending on one socket delivers directly to the receive socket
 * buffer of a second socket.
 *
 * The implementation is substantially complicated by the fact that
 * "ancillary data", such as file descriptors or credentials, may be passed
 * across UNIX domain sockets.  The potential for passing UNIX domain sockets
 * over other UNIX domain sockets requires the implementation of a simple
 * garbage collector to find and tear down cycles of disconnected sockets.
 *
 * TODO:
 *	RDM
 *	rethink name space problems
 *	need a proper out-of-band
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/capsicum.h>
#include <sys/domain.h>
#include <sys/eventhandler.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/queue.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/stat.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/taskqueue.h>
#include <sys/un.h>
#include <sys/unpcb.h>
#include <sys/vnode.h>

#include <net/vnet.h>

#ifdef DDB
#include <ddb/ddb.h>
#endif

#include <security/mac/mac_framework.h>

#include <vm/uma.h>

MALLOC_DECLARE(M_FILECAPS);

/*
 * See unpcb.h for the locking key.
 */

static uma_zone_t	unp_zone;
static unp_gen_t	unp_gencnt;	/* (l) */
static u_int		unp_count;	/* (l) Count of local sockets. */
static ino_t		unp_ino;	/* Prototype for fake inode numbers. */
static int		unp_rights;	/* (g) File descriptors in flight. */
static struct unp_head	unp_shead;	/* (l) List of stream sockets. */
static struct unp_head	unp_dhead;	/* (l) List of datagram sockets. */
static struct unp_head	unp_sphead;	/* (l) List of seqpacket sockets. */

struct unp_defer {
	SLIST_ENTRY(unp_defer) ud_link;
	struct file *ud_fp;
};
static SLIST_HEAD(, unp_defer) unp_defers;
static int unp_defers_count;

static const struct sockaddr	sun_noname = { sizeof(sun_noname), AF_LOCAL };

/*
 * Garbage collection of cyclic file descriptor/socket references occurs
 * asynchronously in a taskqueue context in order to avoid recursion and
 * reentrance in the UNIX domain socket, file descriptor, and socket layer
 * code.  See unp_gc() for a full description.
 */
static struct timeout_task unp_gc_task;

/*
 * The close of UNIX domain sockets attached as SCM_RIGHTS is
 * postponed to the taskqueue, to avoid arbitrary recursion depth:
 * the attached sockets might themselves have other sockets attached.
 */
static struct task	unp_defer_task;

/*
 * Both send and receive buffers are allocated PIPSIZ bytes of buffering for
 * stream sockets, although the total for sender and receiver is actually
 * only PIPSIZ.
 *
 * Datagram sockets really use the sendspace as the maximum datagram size,
 * and don't really want to reserve the sendspace.  Their recvspace should be
 * large enough for at least one max-size datagram plus address.
 */
#ifndef PIPSIZ
#define	PIPSIZ	8192
#endif
static u_long	unpst_sendspace = PIPSIZ;
static u_long	unpst_recvspace = PIPSIZ;
static u_long	unpdg_sendspace = 2*1024;	/* really max datagram size */
static u_long	unpdg_recvspace = 4*1024;
static u_long	unpsp_sendspace = PIPSIZ;	/* really max datagram size */
static u_long	unpsp_recvspace = PIPSIZ;

static SYSCTL_NODE(_net, PF_LOCAL, local, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Local domain");
static SYSCTL_NODE(_net_local, SOCK_STREAM, stream,
    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "SOCK_STREAM");
static SYSCTL_NODE(_net_local, SOCK_DGRAM, dgram,
    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "SOCK_DGRAM");
static SYSCTL_NODE(_net_local, SOCK_SEQPACKET, seqpacket,
    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "SOCK_SEQPACKET");

SYSCTL_ULONG(_net_local_stream, OID_AUTO, sendspace, CTLFLAG_RW,
	   &unpst_sendspace, 0, "Default stream send space.");
SYSCTL_ULONG(_net_local_stream, OID_AUTO, recvspace, CTLFLAG_RW,
	   &unpst_recvspace, 0, "Default stream receive space.");
SYSCTL_ULONG(_net_local_dgram, OID_AUTO, maxdgram, CTLFLAG_RW,
	   &unpdg_sendspace, 0, "Default datagram send space.");
SYSCTL_ULONG(_net_local_dgram, OID_AUTO, recvspace, CTLFLAG_RW,
	   &unpdg_recvspace, 0, "Default datagram receive space.");
SYSCTL_ULONG(_net_local_seqpacket, OID_AUTO, maxseqpacket, CTLFLAG_RW,
	   &unpsp_sendspace, 0, "Default seqpacket send space.");
SYSCTL_ULONG(_net_local_seqpacket, OID_AUTO, recvspace, CTLFLAG_RW,
	   &unpsp_recvspace, 0, "Default seqpacket receive space.");
SYSCTL_INT(_net_local, OID_AUTO, inflight, CTLFLAG_RD, &unp_rights, 0,
    "File descriptors in flight.");
SYSCTL_INT(_net_local, OID_AUTO, deferred, CTLFLAG_RD,
    &unp_defers_count, 0,
    "File descriptors deferred to taskqueue for close.");

/*
 * Locking and synchronization:
 *
 * Several types of locks exist in the local domain socket implementation:
 * - a global linkage lock
 * - a global connection list lock
 * - the mtxpool lock
 * - per-unpcb mutexes
 *
 * The linkage lock protects the global socket lists, the generation number
 * counter and garbage collector state.
 *
 * The connection list lock protects the list of referring sockets in a datagram
 * socket PCB.  This lock is also overloaded to protect a global list of
 * sockets whose buffers contain socket references in the form of SCM_RIGHTS
 * messages.  To avoid recursion, such references are released by a dedicated
 * thread.
 *
 * The mtxpool lock protects the vnode from being modified while referenced.
 * Lock ordering rules require that it be acquired before any PCB locks.
 *
 * The unpcb lock (unp_mtx) protects the most commonly referenced fields in the
 * unpcb.  This includes the unp_conn field, which either links two connected
 * PCBs together (for connected socket types) or points at the destination
 * socket (for connectionless socket types).  The operations of creating or
 * destroying a connection therefore involve locking multiple PCBs.  To avoid
 * lock order reversals, in some cases this involves dropping a PCB lock and
 * using a reference counter to maintain liveness.
 *
 * UNIX domain sockets each have an unpcb hung off of their so_pcb pointer,
 * allocated in pru_attach() and freed in pru_detach().  The validity of that
 * pointer is an invariant, so no lock is required to dereference the so_pcb
 * pointer if a valid socket reference is held by the caller.  In practice,
 * this is always true during operations performed on a socket.  Each unpcb
 * has a back-pointer to its socket, unp_socket, which will be stable under
 * the same circumstances.
 *
 * This pointer may only be safely dereferenced as long as a valid reference
 * to the unpcb is held.  Typically, this reference will be from the socket,
 * or from another unpcb when the referring unpcb's lock is held (in order
 * that the reference not be invalidated during use).  For example, to follow
 * unp->unp_conn->unp_socket, you need to hold a lock on unp_conn to guarantee
 * that detach is not run clearing unp_socket.
 *
 * Blocking with UNIX domain sockets is a tricky issue: unlike most network
 * protocols, bind() is a non-atomic operation, and connect() requires
 * potential sleeping in the protocol, due to potentially waiting on local or
 * distributed file systems.  We try to separate "lookup" operations, which
 * may sleep, and the IPC operations themselves, which typically can occur
 * with relative atomicity as locks can be held over the entire operation.
 *
 * Another tricky issue is simultaneous multi-threaded or multi-process
 * access to a single UNIX domain socket.  These are handled by the flags
 * UNP_CONNECTING and UNP_BINDING, which prevent concurrent connecting or
 * binding, both of which involve dropping UNIX domain socket locks in order
 * to perform namei() and other file system operations.
 */
static struct rwlock	unp_link_rwlock;
static struct mtx	unp_defers_lock;

#define	UNP_LINK_LOCK_INIT()		rw_init(&unp_link_rwlock,	\
					    "unp_link_rwlock")

#define	UNP_LINK_LOCK_ASSERT()		rw_assert(&unp_link_rwlock,	\
					    RA_LOCKED)
#define	UNP_LINK_UNLOCK_ASSERT()	rw_assert(&unp_link_rwlock,	\
					    RA_UNLOCKED)

#define	UNP_LINK_RLOCK()		rw_rlock(&unp_link_rwlock)
#define	UNP_LINK_RUNLOCK()		rw_runlock(&unp_link_rwlock)
#define	UNP_LINK_WLOCK()		rw_wlock(&unp_link_rwlock)
#define	UNP_LINK_WUNLOCK()		rw_wunlock(&unp_link_rwlock)
#define	UNP_LINK_WLOCK_ASSERT()		rw_assert(&unp_link_rwlock,	\
					    RA_WLOCKED)
#define	UNP_LINK_WOWNED()		rw_wowned(&unp_link_rwlock)

#define	UNP_DEFERRED_LOCK_INIT()	mtx_init(&unp_defers_lock, \
					    "unp_defer", NULL, MTX_DEF)
#define	UNP_DEFERRED_LOCK()		mtx_lock(&unp_defers_lock)
#define	UNP_DEFERRED_UNLOCK()		mtx_unlock(&unp_defers_lock)

#define UNP_REF_LIST_LOCK()		UNP_DEFERRED_LOCK();
#define UNP_REF_LIST_UNLOCK()		UNP_DEFERRED_UNLOCK();

#define UNP_PCB_LOCK_INIT(unp)		mtx_init(&(unp)->unp_mtx,	\
					    "unp", "unp",	\
					    MTX_DUPOK|MTX_DEF)
#define	UNP_PCB_LOCK_DESTROY(unp)	mtx_destroy(&(unp)->unp_mtx)
#define	UNP_PCB_LOCKPTR(unp)		(&(unp)->unp_mtx)
#define	UNP_PCB_LOCK(unp)		mtx_lock(&(unp)->unp_mtx)
#define	UNP_PCB_TRYLOCK(unp)		mtx_trylock(&(unp)->unp_mtx)
#define	UNP_PCB_UNLOCK(unp)		mtx_unlock(&(unp)->unp_mtx)
#define	UNP_PCB_OWNED(unp)		mtx_owned(&(unp)->unp_mtx)
#define	UNP_PCB_LOCK_ASSERT(unp)	mtx_assert(&(unp)->unp_mtx, MA_OWNED)
#define	UNP_PCB_UNLOCK_ASSERT(unp)	mtx_assert(&(unp)->unp_mtx, MA_NOTOWNED)

static int	uipc_connect2(struct socket *, struct socket *);
static int	uipc_ctloutput(struct socket *, struct sockopt *);
static int	unp_connect(struct socket *, struct sockaddr *,
		    struct thread *);
static int	unp_connectat(int, struct socket *, struct sockaddr *,
		    struct thread *);
static int	unp_connect2(struct socket *so, struct socket *so2, int);
static void	unp_disconnect(struct unpcb *unp, struct unpcb *unp2);
static void	unp_dispose(struct socket *so);
static void	unp_dispose_mbuf(struct mbuf *);
static void	unp_shutdown(struct unpcb *);
static void	unp_drop(struct unpcb *);
static void	unp_gc(__unused void *, int);
static void	unp_scan(struct mbuf *, void (*)(struct filedescent **, int));
static void	unp_discard(struct file *);
static void	unp_freerights(struct filedescent **, int);
static void	unp_init(void);
static int	unp_internalize(struct mbuf **, struct thread *);
static void	unp_internalize_fp(struct file *);
static int	unp_externalize(struct mbuf *, struct mbuf **, int);
static int	unp_externalize_fp(struct file *);
static struct mbuf	*unp_addsockcred(struct thread *, struct mbuf *, int);
static void	unp_process_defers(void * __unused, int);

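/*
 * Acquire a new reference on an unpcb.  The caller must already hold a
 * reference, so the count can never be observed at zero here.
 */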
static void
unp_pcb_hold(struct unpcb *unp)
{
	u_int old __unused;

	old = refcount_acquire(&unp->unp_refcount);
	KASSERT(old > 0, ("%s: unpcb %p has no references", __func__, unp));
}

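/*
 * Release a reference on an unpcb.  The caller must hold the PCB lock.  If
 * this was the last reference, the lock is dropped, the unpcb is freed, and
 * true is returned; otherwise the lock remains held and false is returned.
 */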
static __result_use_check bool
unp_pcb_rele(struct unpcb *unp)
{
	bool ret;

	UNP_PCB_LOCK_ASSERT(unp);

	if ((ret = refcount_release(&unp->unp_refcount))) {
		UNP_PCB_UNLOCK(unp);
		UNP_PCB_LOCK_DESTROY(unp);
		uma_zfree(unp_zone, unp);
	}
	return (ret);
}

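/*
 * Release a reference that is known not to be the last one, so the unpcb is
 * never freed here and its lock need not be held.
 */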
static void
unp_pcb_rele_notlast(struct unpcb *unp)
{
	bool ret __unused;

	ret = refcount_release(&unp->unp_refcount);
	KASSERT(!ret, ("%s: unpcb %p has no references", __func__, unp));
}

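/*
 * Lock the PCBs of both ends of a connection.  To avoid deadlock, the two
 * locks are always acquired in ascending address order.
 */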
static void
unp_pcb_lock_pair(struct unpcb *unp, struct unpcb *unp2)
{
	UNP_PCB_UNLOCK_ASSERT(unp);
	UNP_PCB_UNLOCK_ASSERT(unp2);

	if (unp == unp2) {
		UNP_PCB_LOCK(unp);
	} else if ((uintptr_t)unp2 > (uintptr_t)unp) {
		UNP_PCB_LOCK(unp);
		UNP_PCB_LOCK(unp2);
	} else {
		UNP_PCB_LOCK(unp2);
		UNP_PCB_LOCK(unp);
	}
}

static void
unp_pcb_unlock_pair(struct unpcb *unp, struct unpcb *unp2)
{
	UNP_PCB_UNLOCK(unp);
	if (unp != unp2)
		UNP_PCB_UNLOCK(unp2);
}

/*
 * Try to lock the connected peer of an already locked socket.  In some cases
 * this requires that we unlock the current socket.  The pairbusy counter is
 * used to block concurrent connection attempts while the lock is dropped.  The
 * caller must be careful to revalidate PCB state.
 */
static struct unpcb *
unp_pcb_lock_peer(struct unpcb *unp)
{
	struct unpcb *unp2;

	UNP_PCB_LOCK_ASSERT(unp);
	unp2 = unp->unp_conn;
	if (unp2 == NULL)
		return (NULL);
	if (__predict_false(unp == unp2))
		return (unp);

	UNP_PCB_UNLOCK_ASSERT(unp2);

	if (__predict_true(UNP_PCB_TRYLOCK(unp2)))
		return (unp2);
	if ((uintptr_t)unp2 > (uintptr_t)unp) {
		UNP_PCB_LOCK(unp2);
		return (unp2);
	}
	unp->unp_pairbusy++;
	unp_pcb_hold(unp2);
	UNP_PCB_UNLOCK(unp);

	UNP_PCB_LOCK(unp2);
	UNP_PCB_LOCK(unp);
	KASSERT(unp->unp_conn == unp2 || unp->unp_conn == NULL,
	    ("%s: socket %p was reconnected", __func__, unp));
	if (--unp->unp_pairbusy == 0 && (unp->unp_flags & UNP_WAITING) != 0) {
		unp->unp_flags &= ~UNP_WAITING;
		wakeup(unp);
	}
	if (unp_pcb_rele(unp2)) {
		/* unp2 is unlocked. */
		return (NULL);
	}
	if (unp->unp_conn == NULL) {
		UNP_PCB_UNLOCK(unp2);
		return (NULL);
	}
	return (unp2);
}

/*
 * Definitions of protocols supported in the LOCAL domain.
 */
static struct domain localdomain;
static struct pr_usrreqs uipc_usrreqs_dgram, uipc_usrreqs_stream;
static struct pr_usrreqs uipc_usrreqs_seqpacket;
static struct protosw localsw[] = {
{
	.pr_type =		SOCK_STREAM,
	.pr_domain =		&localdomain,
	.pr_flags =		PR_CONNREQUIRED|PR_WANTRCVD|PR_RIGHTS,
	.pr_ctloutput =		&uipc_ctloutput,
	.pr_usrreqs =		&uipc_usrreqs_stream
},
{
	.pr_type =		SOCK_DGRAM,
	.pr_domain =		&localdomain,
	.pr_flags =		PR_ATOMIC|PR_ADDR|PR_RIGHTS,
	.pr_ctloutput =		&uipc_ctloutput,
	.pr_usrreqs =		&uipc_usrreqs_dgram
},
{
	.pr_type =		SOCK_SEQPACKET,
	.pr_domain =		&localdomain,

	/*
	 * XXXRW: For now, PR_ADDR because soreceive will bump into them
	 * due to our use of sbappendaddr.  A new sbappend variant is needed
	 * that supports both atomic record writes and control data.
	 */
	.pr_flags =		PR_ADDR|PR_ATOMIC|PR_CONNREQUIRED|PR_WANTRCVD|
				    PR_RIGHTS,
	.pr_ctloutput =		&uipc_ctloutput,
	.pr_usrreqs =		&uipc_usrreqs_seqpacket,
},
};

static struct domain localdomain = {
	.dom_family =		AF_LOCAL,
	.dom_name =		"local",
	.dom_init =		unp_init,
	.dom_externalize =	unp_externalize,
	.dom_dispose =		unp_dispose,
	.dom_protosw =		localsw,
	.dom_protoswNPROTOSW =	&localsw[nitems(localsw)]
};
DOMAIN_SET(local);

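/*
 * pru_abort() for local sockets: reset the connection on the peer, if any,
 * taking a reference on it first so that it cannot go away once our own
 * PCB lock is dropped.
 */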
static void
uipc_abort(struct socket *so)
{
	struct unpcb *unp, *unp2;

	unp = sotounpcb(so);
	KASSERT(unp != NULL, ("uipc_abort: unp == NULL"));
	UNP_PCB_UNLOCK_ASSERT(unp);

	UNP_PCB_LOCK(unp);
	unp2 = unp->unp_conn;
	if (unp2 != NULL) {
		unp_pcb_hold(unp2);
		UNP_PCB_UNLOCK(unp);
		unp_drop(unp2);
	} else
		UNP_PCB_UNLOCK(unp);
}

static int
uipc_accept(struct socket *so, struct sockaddr **nam)
{
	struct unpcb *unp, *unp2;
	const struct sockaddr *sa;

	/*
	 * Pass back name of connected socket, if it was bound and we are
	 * still connected (our peer may have closed already!).
	 */
	unp = sotounpcb(so);
	KASSERT(unp != NULL, ("uipc_accept: unp == NULL"));

	*nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
	UNP_PCB_LOCK(unp);
	unp2 = unp_pcb_lock_peer(unp);
	if (unp2 != NULL && unp2->unp_addr != NULL)
		sa = (struct sockaddr *)unp2->unp_addr;
	else
		sa = &sun_noname;
	bcopy(sa, *nam, sa->sa_len);
	if (unp2 != NULL)
		unp_pcb_unlock_pair(unp, unp2);
	else
		UNP_PCB_UNLOCK(unp);
	return (0);
}

static int
uipc_attach(struct socket *so, int proto, struct thread *td)
{
	u_long sendspace, recvspace;
	struct unpcb *unp;
	int error;
	bool locked;

	KASSERT(so->so_pcb == NULL, ("uipc_attach: so_pcb != NULL"));
	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
		switch (so->so_type) {
		case SOCK_STREAM:
			sendspace = unpst_sendspace;
			recvspace = unpst_recvspace;
			break;

		case SOCK_DGRAM:
			sendspace = unpdg_sendspace;
			recvspace = unpdg_recvspace;
			break;

		case SOCK_SEQPACKET:
			sendspace = unpsp_sendspace;
			recvspace = unpsp_recvspace;
			break;

		default:
			panic("uipc_attach");
		}
		error = soreserve(so, sendspace, recvspace);
		if (error)
			return (error);
	}
	unp = uma_zalloc(unp_zone, M_NOWAIT | M_ZERO);
	if (unp == NULL)
		return (ENOBUFS);
	LIST_INIT(&unp->unp_refs);
	UNP_PCB_LOCK_INIT(unp);
	unp->unp_socket = so;
	so->so_pcb = unp;
	refcount_init(&unp->unp_refcount, 1);

	if ((locked = UNP_LINK_WOWNED()) == false)
		UNP_LINK_WLOCK();

	unp->unp_gencnt = ++unp_gencnt;
	unp->unp_ino = ++unp_ino;
	unp_count++;
	switch (so->so_type) {
	case SOCK_STREAM:
		LIST_INSERT_HEAD(&unp_shead, unp, unp_link);
		break;

	case SOCK_DGRAM:
		LIST_INSERT_HEAD(&unp_dhead, unp, unp_link);
		break;

	case SOCK_SEQPACKET:
		LIST_INSERT_HEAD(&unp_sphead, unp, unp_link);
		break;

	default:
		panic("uipc_attach");
	}

	if (locked == false)
		UNP_LINK_WUNLOCK();

	return (0);
}

static int
uipc_bindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td)
{
	struct sockaddr_un *soun = (struct sockaddr_un *)nam;
	struct vattr vattr;
	int error, namelen;
	struct nameidata nd;
	struct unpcb *unp;
	struct vnode *vp;
	struct mount *mp;
	cap_rights_t rights;
	char *buf;

	if (nam->sa_family != AF_UNIX)
		return (EAFNOSUPPORT);

	unp = sotounpcb(so);
	KASSERT(unp != NULL, ("uipc_bind: unp == NULL"));

	if (soun->sun_len > sizeof(struct sockaddr_un))
		return (EINVAL);
	namelen = soun->sun_len - offsetof(struct sockaddr_un, sun_path);
	if (namelen <= 0)
		return (EINVAL);

	/*
	 * We don't allow simultaneous bind() calls on a single UNIX domain
	 * socket, so flag in-progress operations, and return an error if an
	 * operation is already in progress.
	 *
	 * Historically, we have not allowed a socket to be rebound, so this
	 * also returns an error.  Not allowing re-binding simplifies the
	 * implementation and avoids a great many possible failure modes.
	 */
	UNP_PCB_LOCK(unp);
	if (unp->unp_vnode != NULL) {
		UNP_PCB_UNLOCK(unp);
		return (EINVAL);
	}
	if (unp->unp_flags & UNP_BINDING) {
		UNP_PCB_UNLOCK(unp);
		return (EALREADY);
	}
	unp->unp_flags |= UNP_BINDING;
	UNP_PCB_UNLOCK(unp);

	buf = malloc(namelen + 1, M_TEMP, M_WAITOK);
	bcopy(soun->sun_path, buf, namelen);
	buf[namelen] = 0;

restart:
	NDINIT_ATRIGHTS(&nd, CREATE, NOFOLLOW | LOCKPARENT | SAVENAME | NOCACHE,
	    UIO_SYSSPACE, buf, fd, cap_rights_init_one(&rights, CAP_BINDAT),
	    td);
/* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */
	error = namei(&nd);
	if (error)
		goto error;
	vp = nd.ni_vp;
	if (vp != NULL || vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
		NDFREE(&nd, NDF_ONLY_PNBUF);
		if (nd.ni_dvp == vp)
			vrele(nd.ni_dvp);
		else
			vput(nd.ni_dvp);
		if (vp != NULL) {
			vrele(vp);
			error = EADDRINUSE;
			goto error;
		}
		error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH);
		if (error)
			goto error;
		goto restart;
	}
	VATTR_NULL(&vattr);
	vattr.va_type = VSOCK;
	vattr.va_mode = (ACCESSPERMS & ~td->td_proc->p_pd->pd_cmask);
#ifdef MAC
	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
	    &vattr);
#endif
	if (error == 0)
		error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
	NDFREE(&nd, NDF_ONLY_PNBUF);
	if (error) {
		VOP_VPUT_PAIR(nd.ni_dvp, NULL, true);
		vn_finished_write(mp);
		if (error == ERELOOKUP)
			goto restart;
		goto error;
	}
	vp = nd.ni_vp;
	ASSERT_VOP_ELOCKED(vp, "uipc_bind");
	soun = (struct sockaddr_un *)sodupsockaddr(nam, M_WAITOK);

	UNP_PCB_LOCK(unp);
	VOP_UNP_BIND(vp, unp);
	unp->unp_vnode = vp;
	unp->unp_addr = soun;
	unp->unp_flags &= ~UNP_BINDING;
	UNP_PCB_UNLOCK(unp);
	vref(vp);
	VOP_VPUT_PAIR(nd.ni_dvp, &vp, true);
	vn_finished_write(mp);
	free(buf, M_TEMP);
	return (0);

error:
	UNP_PCB_LOCK(unp);
	unp->unp_flags &= ~UNP_BINDING;
	UNP_PCB_UNLOCK(unp);
	free(buf, M_TEMP);
	return (error);
}

static int
uipc_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
{

	return (uipc_bindat(AT_FDCWD, so, nam, td));
}

static int
uipc_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int error;

	KASSERT(td == curthread, ("uipc_connect: td != curthread"));
	error = unp_connect(so, nam, td);
	return (error);
}

static int
uipc_connectat(int fd, struct socket *so, struct sockaddr *nam,
    struct thread *td)
{
	int error;

	KASSERT(td == curthread, ("uipc_connectat: td != curthread"));
	error = unp_connectat(fd, so, nam, td);
	return (error);
}

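/*
 * pru_close() for local sockets: detach from the bound vnode, if any, and
 * disconnect from the peer.  The pool mutex associated with the vnode
 * serializes access to the vnode's socket binding.
 */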
static void
uipc_close(struct socket *so)
{
	struct unpcb *unp, *unp2;
	struct vnode *vp = NULL;
	struct mtx *vplock;

	unp = sotounpcb(so);
	KASSERT(unp != NULL, ("uipc_close: unp == NULL"));

	vplock = NULL;
	if ((vp = unp->unp_vnode) != NULL) {
		vplock = mtx_pool_find(mtxpool_sleep, vp);
		mtx_lock(vplock);
	}
	UNP_PCB_LOCK(unp);
	if (vp && unp->unp_vnode == NULL) {
		mtx_unlock(vplock);
		vp = NULL;
	}
	if (vp != NULL) {
		VOP_UNP_DETACH(vp);
		unp->unp_vnode = NULL;
	}
	if ((unp2 = unp_pcb_lock_peer(unp)) != NULL)
		unp_disconnect(unp, unp2);
	else
		UNP_PCB_UNLOCK(unp);
	if (vp) {
		mtx_unlock(vplock);
		vrele(vp);
	}
}

static int
uipc_connect2(struct socket *so1, struct socket *so2)
{
	struct unpcb *unp, *unp2;
	int error;

	unp = so1->so_pcb;
	KASSERT(unp != NULL, ("uipc_connect2: unp == NULL"));
	unp2 = so2->so_pcb;
	KASSERT(unp2 != NULL, ("uipc_connect2: unp2 == NULL"));
	unp_pcb_lock_pair(unp, unp2);
	error = unp_connect2(so1, so2, PRU_CONNECT2);
	unp_pcb_unlock_pair(unp, unp2);
	return (error);
}

static void
uipc_detach(struct socket *so)
{
	struct unpcb *unp, *unp2;
	struct mtx *vplock;
	struct vnode *vp;
	int local_unp_rights;

	unp = sotounpcb(so);
	KASSERT(unp != NULL, ("uipc_detach: unp == NULL"));

	vp = NULL;
	vplock = NULL;

	SOCK_LOCK(so);
	if (!SOLISTENING(so)) {
		/*
		 * Once the socket is removed from the global lists,
		 * uipc_ready() will not be able to locate its socket buffer, so
		 * clear the buffer now.  At this point internalized rights have
		 * already been disposed of.
		 */
		sbrelease(&so->so_rcv, so);
	}
	SOCK_UNLOCK(so);

	UNP_LINK_WLOCK();
	LIST_REMOVE(unp, unp_link);
	if (unp->unp_gcflag & UNPGC_DEAD)
		LIST_REMOVE(unp, unp_dead);
	unp->unp_gencnt = ++unp_gencnt;
	--unp_count;
	UNP_LINK_WUNLOCK();

	UNP_PCB_UNLOCK_ASSERT(unp);
 restart:
	if ((vp = unp->unp_vnode) != NULL) {
		vplock = mtx_pool_find(mtxpool_sleep, vp);
		mtx_lock(vplock);
	}
	UNP_PCB_LOCK(unp);
	if (unp->unp_vnode != vp && unp->unp_vnode != NULL) {
		if (vplock)
			mtx_unlock(vplock);
		UNP_PCB_UNLOCK(unp);
		goto restart;
	}
	if ((vp = unp->unp_vnode) != NULL) {
		VOP_UNP_DETACH(vp);
		unp->unp_vnode = NULL;
	}
	if ((unp2 = unp_pcb_lock_peer(unp)) != NULL)
		unp_disconnect(unp, unp2);
	else
		UNP_PCB_UNLOCK(unp);

	UNP_REF_LIST_LOCK();
	while (!LIST_EMPTY(&unp->unp_refs)) {
		struct unpcb *ref = LIST_FIRST(&unp->unp_refs);

		unp_pcb_hold(ref);
		UNP_REF_LIST_UNLOCK();

		MPASS(ref != unp);
		UNP_PCB_UNLOCK_ASSERT(ref);
		unp_drop(ref);
		UNP_REF_LIST_LOCK();
	}
	UNP_REF_LIST_UNLOCK();

	UNP_PCB_LOCK(unp);
	local_unp_rights = unp_rights;
	unp->unp_socket->so_pcb = NULL;
	unp->unp_socket = NULL;
	free(unp->unp_addr, M_SONAME);
	unp->unp_addr = NULL;
	if (!unp_pcb_rele(unp))
		UNP_PCB_UNLOCK(unp);
	if (vp) {
		mtx_unlock(vplock);
		vrele(vp);
	}
	if (local_unp_rights)
		taskqueue_enqueue_timeout(taskqueue_thread, &unp_gc_task, -1);
}

static int
uipc_disconnect(struct socket *so)
{
	struct unpcb *unp, *unp2;

	unp = sotounpcb(so);
	KASSERT(unp != NULL, ("uipc_disconnect: unp == NULL"));

	UNP_PCB_LOCK(unp);
	if ((unp2 = unp_pcb_lock_peer(unp)) != NULL)
		unp_disconnect(unp, unp2);
	else
		UNP_PCB_UNLOCK(unp);
	return (0);
}

static int
uipc_listen(struct socket *so, int backlog, struct thread *td)
{
	struct unpcb *unp;
	int error;

	if (so->so_type != SOCK_STREAM && so->so_type != SOCK_SEQPACKET)
		return (EOPNOTSUPP);

	unp = sotounpcb(so);
	KASSERT(unp != NULL, ("uipc_listen: unp == NULL"));

	UNP_PCB_LOCK(unp);
	if (unp->unp_vnode == NULL) {
		/* Already connected or not bound to an address. */
		error = unp->unp_conn != NULL ? EINVAL : EDESTADDRREQ;
		UNP_PCB_UNLOCK(unp);
		return (error);
	}

	SOCK_LOCK(so);
	error = solisten_proto_check(so);
	if (error == 0) {
		cru2xt(td, &unp->unp_peercred);
		solisten_proto(so, backlog);
	}
	SOCK_UNLOCK(so);
	UNP_PCB_UNLOCK(unp);
	return (error);
}

static int
uipc_peeraddr(struct socket *so, struct sockaddr **nam)
{
	struct unpcb *unp, *unp2;
	const struct sockaddr *sa;

	unp = sotounpcb(so);
	KASSERT(unp != NULL, ("uipc_peeraddr: unp == NULL"));

	*nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
	UNP_LINK_RLOCK();
	/*
	 * XXX: It seems that this test always fails even when the connection
	 * is established.  So, this else clause is added as a workaround to
	 * return a PF_LOCAL sockaddr.
	 */
	unp2 = unp->unp_conn;
	if (unp2 != NULL) {
		UNP_PCB_LOCK(unp2);
		if (unp2->unp_addr != NULL)
			sa = (struct sockaddr *) unp2->unp_addr;
		else
			sa = &sun_noname;
		bcopy(sa, *nam, sa->sa_len);
		UNP_PCB_UNLOCK(unp2);
	} else {
		sa = &sun_noname;
		bcopy(sa, *nam, sa->sa_len);
	}
	UNP_LINK_RUNLOCK();
	return (0);
}

static int
uipc_rcvd(struct socket *so, int flags)
{
	struct unpcb *unp, *unp2;
	struct socket *so2;
	u_int mbcnt, sbcc;

	unp = sotounpcb(so);
	KASSERT(unp != NULL, ("%s: unp == NULL", __func__));
	KASSERT(so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET,
	    ("%s: socktype %d", __func__, so->so_type));

	/*
	 * Adjust backpressure on sender and wakeup any waiting to write.
	 *
	 * The unp lock is acquired to maintain the validity of the unp_conn
	 * pointer; no lock on unp2 is required as unp2->unp_socket will be
	 * static as long as we don't permit unp2 to disconnect from unp,
	 * which is prevented by the lock on unp.  We cache values from
	 * so_rcv to avoid holding the so_rcv lock over the entire
	 * transaction on the remote so_snd.
	 */
	SOCKBUF_LOCK(&so->so_rcv);
	mbcnt = so->so_rcv.sb_mbcnt;
	sbcc = sbavail(&so->so_rcv);
	SOCKBUF_UNLOCK(&so->so_rcv);
	/*
	 * There is a benign race condition at this point.  If we're planning to
	 * clear SB_STOP, but uipc_send is called on the connected socket at
	 * this instant, it might add data to the sockbuf and set SB_STOP.  Then
	 * we would erroneously clear SB_STOP below, even though the sockbuf is
	 * full.  The race is benign because the only ill effect is to allow the
	 * sockbuf to exceed its size limit, and the size limits are not
	 * strictly guaranteed anyway.
	 */
	UNP_PCB_LOCK(unp);
	unp2 = unp->unp_conn;
	if (unp2 == NULL) {
		UNP_PCB_UNLOCK(unp);
		return (0);
	}
	so2 = unp2->unp_socket;
	SOCKBUF_LOCK(&so2->so_snd);
	if (sbcc < so2->so_snd.sb_hiwat && mbcnt < so2->so_snd.sb_mbmax)
		so2->so_snd.sb_flags &= ~SB_STOP;
	sowwakeup_locked(so2);
	UNP_PCB_UNLOCK(unp);
	return (0);
}

static int
uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
    struct mbuf *control, struct thread *td)
{
	struct unpcb *unp, *unp2;
	struct socket *so2;
	u_int mbcnt, sbcc;
	int freed, error;

	unp = sotounpcb(so);
	KASSERT(unp != NULL, ("%s: unp == NULL", __func__));
	KASSERT(so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM ||
	    so->so_type == SOCK_SEQPACKET,
	    ("%s: socktype %d", __func__, so->so_type));

	freed = error = 0;
	if (flags & PRUS_OOB) {
		error = EOPNOTSUPP;
		goto release;
	}
	if (control != NULL && (error = unp_internalize(&control, td)))
		goto release;

	unp2 = NULL;
	switch (so->so_type) {
	case SOCK_DGRAM:
	{
		const struct sockaddr *from;

		if (nam != NULL) {
			error = unp_connect(so, nam, td);
			if (error != 0)
				break;
		}
		UNP_PCB_LOCK(unp);

		/*
		 * Because connect() and send() are non-atomic in a sendto()
		 * with a target address, it's possible that the socket will
		 * have disconnected before the send() can run.  In that case
		 * return the slightly counter-intuitive but otherwise
		 * correct error that the socket is not connected.
		 */
		unp2 = unp_pcb_lock_peer(unp);
		if (unp2 == NULL) {
			UNP_PCB_UNLOCK(unp);
			error = ENOTCONN;
			break;
		}

		if (unp2->unp_flags & UNP_WANTCRED_MASK)
			control = unp_addsockcred(td, control,
			    unp2->unp_flags);
		if (unp->unp_addr != NULL)
			from = (struct sockaddr *)unp->unp_addr;
		else
			from = &sun_noname;
		so2 = unp2->unp_socket;
		SOCKBUF_LOCK(&so2->so_rcv);
		if (sbappendaddr_locked(&so2->so_rcv, from, m,
		    control)) {
			sorwakeup_locked(so2);
			m = NULL;
			control = NULL;
		} else {
			SOCKBUF_UNLOCK(&so2->so_rcv);
			error = ENOBUFS;
		}
		if (nam != NULL)
			unp_disconnect(unp, unp2);
		else
			unp_pcb_unlock_pair(unp, unp2);
		break;
	}

	case SOCK_SEQPACKET:
	case SOCK_STREAM:
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			if (nam != NULL) {
				error = unp_connect(so, nam, td);
				if (error != 0)
					break;
			} else {
				error = ENOTCONN;
				break;
			}
		}

		UNP_PCB_LOCK(unp);
		if ((unp2 = unp_pcb_lock_peer(unp)) == NULL) {
			UNP_PCB_UNLOCK(unp);
			error = ENOTCONN;
			break;
		} else if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
			unp_pcb_unlock_pair(unp, unp2);
			error = EPIPE;
			break;
		}
		UNP_PCB_UNLOCK(unp);
		if ((so2 = unp2->unp_socket) == NULL) {
			UNP_PCB_UNLOCK(unp2);
			error = ENOTCONN;
			break;
		}
		SOCKBUF_LOCK(&so2->so_rcv);
		if (unp2->unp_flags & UNP_WANTCRED_MASK) {
			/*
			 * Credentials are passed only once on SOCK_STREAM and
			 * SOCK_SEQPACKET (LOCAL_CREDS => WANTCRED_ONESHOT), or
			 * forever (LOCAL_CREDS_PERSISTENT => WANTCRED_ALWAYS).
			 */
			control = unp_addsockcred(td, control, unp2->unp_flags);
			unp2->unp_flags &= ~UNP_WANTCRED_ONESHOT;
		}

		/*
		 * Send to paired receive port and wake up readers.  Don't
		 * check for space available in the receive buffer if we're
		 * attaching ancillary data; Unix domain sockets only check
		 * for space in the sending sockbuf, and that check is
		 * performed one level up the stack.  At that level we cannot
		 * precisely account for the amount of buffer space used
		 * (e.g., because control messages are not yet internalized).
		 */
		switch (so->so_type) {
		case SOCK_STREAM:
			if (control != NULL) {
				sbappendcontrol_locked(&so2->so_rcv, m,
				    control, flags);
				control = NULL;
			} else
				sbappend_locked(&so2->so_rcv, m, flags);
			break;

		case SOCK_SEQPACKET:
			if (sbappendaddr_nospacecheck_locked(&so2->so_rcv,
			    &sun_noname, m, control))
				control = NULL;
			break;
		}

		mbcnt = so2->so_rcv.sb_mbcnt;
		sbcc = sbavail(&so2->so_rcv);
		if (sbcc)
			sorwakeup_locked(so2);
		else
			SOCKBUF_UNLOCK(&so2->so_rcv);

		/*
		 * The PCB lock on unp2 protects the SB_STOP flag.  Without it,
		 * it would be possible for uipc_rcvd to be called at this
		 * point, drain the receiving sockbuf, clear SB_STOP, and then
		 * we would set SB_STOP below.  That could lead to an empty
		 * sockbuf having SB_STOP set.
		 */
		SOCKBUF_LOCK(&so->so_snd);
		if (sbcc >= so->so_snd.sb_hiwat || mbcnt >= so->so_snd.sb_mbmax)
			so->so_snd.sb_flags |= SB_STOP;
		SOCKBUF_UNLOCK(&so->so_snd);
		UNP_PCB_UNLOCK(unp2);
		m = NULL;
		break;
	}

	/*
	 * PRUS_EOF is equivalent to pru_send followed by pru_shutdown.
	 */
	if (flags & PRUS_EOF) {
		UNP_PCB_LOCK(unp);
		socantsendmore(so);
		unp_shutdown(unp);
		UNP_PCB_UNLOCK(unp);
	}
	if (control != NULL && error != 0)
		unp_dispose_mbuf(control);

release:
	if (control != NULL)
		m_freem(control);
	/*
	 * In case of PRUS_NOTREADY, uipc_ready() is responsible
	 * for freeing memory.
	 */
	if (m != NULL && (flags & PRUS_NOTREADY) == 0)
		m_freem(m);
	return (error);
}

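/*
 * Search the receive buffer of a socket for the mbuf chain "m" and, if
 * found, mark it ready with sbready(), storing the result in *errorp.
 * Used by uipc_ready() when the sending socket has been disconnected and
 * the owning buffer must be located by an exhaustive search.  Returns true
 * if the chain was found in this socket's buffer.
 */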
static bool
uipc_ready_scan(struct socket *so, struct mbuf *m, int count, int *errorp)
{
	struct mbuf *mb, *n;
	struct sockbuf *sb;

	SOCK_LOCK(so);
	if (SOLISTENING(so)) {
		SOCK_UNLOCK(so);
		return (false);
	}
	mb = NULL;
	sb = &so->so_rcv;
	SOCKBUF_LOCK(sb);
	if (sb->sb_fnrdy != NULL) {
		for (mb = sb->sb_mb, n = mb->m_nextpkt; mb != NULL;) {
			if (mb == m) {
				*errorp = sbready(sb, m, count);
				break;
			}
			mb = mb->m_next;
			if (mb == NULL) {
				mb = n;
				if (mb != NULL)
					n = mb->m_nextpkt;
			}
		}
	}
	SOCKBUF_UNLOCK(sb);
	SOCK_UNLOCK(so);
	return (mb != NULL);
}

static int
uipc_ready(struct socket *so, struct mbuf *m, int count)
{
	struct unpcb *unp, *unp2;
	struct socket *so2;
	int error, i;

	unp = sotounpcb(so);

	KASSERT(so->so_type == SOCK_STREAM,
	    ("%s: unexpected socket type for %p", __func__, so));

	UNP_PCB_LOCK(unp);
	if ((unp2 = unp_pcb_lock_peer(unp)) != NULL) {
		UNP_PCB_UNLOCK(unp);
		so2 = unp2->unp_socket;
		SOCKBUF_LOCK(&so2->so_rcv);
		if ((error = sbready(&so2->so_rcv, m, count)) == 0)
			sorwakeup_locked(so2);
		else
			SOCKBUF_UNLOCK(&so2->so_rcv);
		UNP_PCB_UNLOCK(unp2);
		return (error);
	}
	UNP_PCB_UNLOCK(unp);

	/*
	 * The receiving socket has been disconnected, but may still be valid.
	 * In this case, the now-ready mbufs are still present in its socket
	 * buffer, so perform an exhaustive search before giving up and freeing
	 * the mbufs.
	 */
	UNP_LINK_RLOCK();
	LIST_FOREACH(unp, &unp_shead, unp_link) {
		if (uipc_ready_scan(unp->unp_socket, m, count, &error))
			break;
	}
	UNP_LINK_RUNLOCK();

	if (unp == NULL) {
		for (i = 0; i < count; i++)
			m = m_free(m);
		error = ECONNRESET;
	}
	return (error);
}

static int
uipc_sense(struct socket *so, struct stat *sb)
{
	struct unpcb *unp;

	unp = sotounpcb(so);
	KASSERT(unp != NULL, ("uipc_sense: unp == NULL"));

	sb->st_blksize = so->so_snd.sb_hiwat;
	sb->st_dev = NODEV;
	sb->st_ino = unp->unp_ino;
	return (0);
}

static int
uipc_shutdown(struct socket *so)
{
	struct unpcb *unp;

	unp = sotounpcb(so);
	KASSERT(unp != NULL, ("uipc_shutdown: unp == NULL"));

	UNP_PCB_LOCK(unp);
	socantsendmore(so);
	unp_shutdown(unp);
	UNP_PCB_UNLOCK(unp);
	return (0);
}

static int
uipc_sockaddr(struct socket *so, struct sockaddr **nam)
{
	struct unpcb *unp;
	const struct sockaddr *sa;

	unp = sotounpcb(so);
	KASSERT(unp != NULL, ("uipc_sockaddr: unp == NULL"));

	*nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
	UNP_PCB_LOCK(unp);
	if (unp->unp_addr != NULL)
		sa = (struct sockaddr *) unp->unp_addr;
	else
		sa = &sun_noname;
	bcopy(sa, *nam, sa->sa_len);
	UNP_PCB_UNLOCK(unp);
	return (0);
}

static struct pr_usrreqs uipc_usrreqs_dgram = {
	.pru_abort = 		uipc_abort,
	.pru_accept =		uipc_accept,
	.pru_attach =		uipc_attach,
	.pru_bind =		uipc_bind,
	.pru_bindat =		uipc_bindat,
	.pru_connect =		uipc_connect,
	.pru_connectat =	uipc_connectat,
	.pru_connect2 =		uipc_connect2,
	.pru_detach =		uipc_detach,
	.pru_disconnect =	uipc_disconnect,
	.pru_listen =		uipc_listen,
	.pru_peeraddr =		uipc_peeraddr,
	.pru_rcvd =		uipc_rcvd,
	.pru_send =		uipc_send,
	.pru_sense =		uipc_sense,
	.pru_shutdown =		uipc_shutdown,
	.pru_sockaddr =		uipc_sockaddr,
	.pru_soreceive =	soreceive_dgram,
	.pru_close =		uipc_close,
};

static struct pr_usrreqs uipc_usrreqs_seqpacket = {
	.pru_abort =		uipc_abort,
	.pru_accept =		uipc_accept,
	.pru_attach =		uipc_attach,
	.pru_bind =		uipc_bind,
	.pru_bindat =		uipc_bindat,
	.pru_connect =		uipc_connect,
	.pru_connectat =	uipc_connectat,
	.pru_connect2 =		uipc_connect2,
	.pru_detach =		uipc_detach,
	.pru_disconnect =	uipc_disconnect,
	.pru_listen =		uipc_listen,
	.pru_peeraddr =		uipc_peeraddr,
	.pru_rcvd =		uipc_rcvd,
	.pru_send =		uipc_send,
	.pru_sense =		uipc_sense,
	.pru_shutdown =		uipc_shutdown,
	.pru_sockaddr =		uipc_sockaddr,
	.pru_soreceive =	soreceive_generic,	/* XXX: or...? */
	.pru_close =		uipc_close,
};

static struct pr_usrreqs uipc_usrreqs_stream = {
	.pru_abort = 		uipc_abort,
	.pru_accept =		uipc_accept,
	.pru_attach =		uipc_attach,
	.pru_bind =		uipc_bind,
	.pru_bindat =		uipc_bindat,
	.pru_connect =		uipc_connect,
	.pru_connectat =	uipc_connectat,
	.pru_connect2 =		uipc_connect2,
	.pru_detach =		uipc_detach,
	.pru_disconnect =	uipc_disconnect,
	.pru_listen =		uipc_listen,
	.pru_peeraddr =		uipc_peeraddr,
	.pru_rcvd =		uipc_rcvd,
	.pru_send =		uipc_send,
	.pru_ready =		uipc_ready,
	.pru_sense =		uipc_sense,
	.pru_shutdown =		uipc_shutdown,
	.pru_sockaddr =		uipc_sockaddr,
	.pru_soreceive =	soreceive_generic,
	.pru_close =		uipc_close,
};

static int
uipc_ctloutput(struct socket *so, struct sockopt *sopt)
{
	struct unpcb *unp;
	struct xucred xu;
	int error, optval;

	if (sopt->sopt_level != SOL_LOCAL)
		return (EINVAL);

	unp = sotounpcb(so);
	KASSERT(unp != NULL, ("uipc_ctloutput: unp == NULL"));
	error = 0;
	switch (sopt->sopt_dir) {
	case SOPT_GET:
		switch (sopt->sopt_name) {
		case LOCAL_PEERCRED:
			UNP_PCB_LOCK(unp);
			if (unp->unp_flags & UNP_HAVEPC)
				xu = unp->unp_peercred;
			else {
				if (so->so_type == SOCK_STREAM)
					error = ENOTCONN;
				else
					error = EINVAL;
			}
			UNP_PCB_UNLOCK(unp);
			if (error == 0)
				error = sooptcopyout(sopt, &xu, sizeof(xu));
			break;

		case LOCAL_CREDS:
			/* Unlocked read. */
			optval = unp->unp_flags & UNP_WANTCRED_ONESHOT ? 1 : 0;
			error = sooptcopyout(sopt, &optval, sizeof(optval));
			break;

		case LOCAL_CREDS_PERSISTENT:
			/* Unlocked read. */
			optval = unp->unp_flags & UNP_WANTCRED_ALWAYS ? 1 : 0;
			error = sooptcopyout(sopt, &optval, sizeof(optval));
			break;

		case LOCAL_CONNWAIT:
			/* Unlocked read. */
			optval = unp->unp_flags & UNP_CONNWAIT ? 1 : 0;
			error = sooptcopyout(sopt, &optval, sizeof(optval));
			break;

		default:
			error = EOPNOTSUPP;
			break;
		}
		break;

	case SOPT_SET:
		switch (sopt->sopt_name) {
		case LOCAL_CREDS:
		case LOCAL_CREDS_PERSISTENT:
		case LOCAL_CONNWAIT:
			error = sooptcopyin(sopt, &optval, sizeof(optval),
					    sizeof(optval));
			if (error)
				break;

#define	OPTSET(bit, exclusive) do {					\
	UNP_PCB_LOCK(unp);						\
	if (optval) {							\
		if ((unp->unp_flags & (exclusive)) != 0) {		\
			UNP_PCB_UNLOCK(unp);				\
			error = EINVAL;					\
			break;						\
		}							\
		unp->unp_flags |= (bit);				\
	} else								\
		unp->unp_flags &= ~(bit);				\
	UNP_PCB_UNLOCK(unp);						\
} while (0)

			switch (sopt->sopt_name) {
			case LOCAL_CREDS:
				OPTSET(UNP_WANTCRED_ONESHOT, UNP_WANTCRED_ALWAYS);
				break;

			case LOCAL_CREDS_PERSISTENT:
				OPTSET(UNP_WANTCRED_ALWAYS, UNP_WANTCRED_ONESHOT);
				break;

			case LOCAL_CONNWAIT:
				OPTSET(UNP_CONNWAIT, 0);
				break;

			default:
				break;
			}
			break;
#undef	OPTSET
		default:
			error = ENOPROTOOPT;
			break;
		}
		break;

	default:
		error = EOPNOTSUPP;
		break;
	}
	return (error);
}

static int
unp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
{

	return (unp_connectat(AT_FDCWD, so, nam, td));
}

static int
unp_connectat(int fd, struct socket *so, struct sockaddr *nam,
    struct thread *td)
{
	struct mtx *vplock;
	struct sockaddr_un *soun;
	struct vnode *vp;
	struct socket *so2;
	struct unpcb *unp, *unp2, *unp3;
	struct nameidata nd;
	char buf[SOCK_MAXADDRLEN];
	struct sockaddr *sa;
	cap_rights_t rights;
	int error, len;
	bool connreq;

	if (nam->sa_family != AF_UNIX)
		return (EAFNOSUPPORT);
	if (nam->sa_len > sizeof(struct sockaddr_un))
		return (EINVAL);
	len = nam->sa_len - offsetof(struct sockaddr_un, sun_path);
	if (len <= 0)
		return (EINVAL);
	soun = (struct sockaddr_un *)nam;
	bcopy(soun->sun_path, buf, len);
	buf[len] = 0;

	unp = sotounpcb(so);
	UNP_PCB_LOCK(unp);
	for (;;) {
		/*
		 * Wait for connection state to stabilize.  If a connection
		 * already exists, give up.  For datagram sockets, which permit
		 * multiple consecutive connect(2) calls, upper layers are
		 * responsible for disconnecting in advance of a subsequent
		 * connect(2), but this is not synchronized with PCB connection
		 * state.
		 *
		 * Also make sure that no threads are currently attempting to
		 * lock the peer socket, to ensure that unp_conn cannot
		 * transition between two valid sockets while locks are dropped.
		 */
		if (unp->unp_conn != NULL) {
			UNP_PCB_UNLOCK(unp);
			return (EISCONN);
		}
		if ((unp->unp_flags & UNP_CONNECTING) != 0) {
			UNP_PCB_UNLOCK(unp);
			return (EALREADY);
		}
		if (unp->unp_pairbusy > 0) {
			unp->unp_flags |= UNP_WAITING;
			mtx_sleep(unp, UNP_PCB_LOCKPTR(unp), 0, "unpeer", 0);
			continue;
		}
		break;
	}
	unp->unp_flags |= UNP_CONNECTING;
	UNP_PCB_UNLOCK(unp);

	connreq = (so->so_proto->pr_flags & PR_CONNREQUIRED) != 0;
	if (connreq)
		sa = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
	else
		sa = NULL;
	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF,
	    UIO_SYSSPACE, buf, fd, cap_rights_init_one(&rights, CAP_CONNECTAT),
	    td);
	error = namei(&nd);
	if (error)
		vp = NULL;
	else
		vp = nd.ni_vp;
	ASSERT_VOP_LOCKED(vp, "unp_connect");
	NDFREE_NOTHING(&nd);
	if (error)
		goto bad;

	if (vp->v_type != VSOCK) {
		error = ENOTSOCK;
		goto bad;
	}
#ifdef MAC
	error = mac_vnode_check_open(td->td_ucred, vp, VWRITE | VREAD);
	if (error)
		goto bad;
#endif
	error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td);
	if (error)
		goto bad;

	unp = sotounpcb(so);
	KASSERT(unp != NULL, ("unp_connect: unp == NULL"));

	vplock = mtx_pool_find(mtxpool_sleep, vp);
	mtx_lock(vplock);
	VOP_UNP_CONNECT(vp, &unp2);
	if (unp2 == NULL) {
		error = ECONNREFUSED;
		goto bad2;
	}
	so2 = unp2->unp_socket;
	if (so->so_type != so2->so_type) {
		error = EPROTOTYPE;
		goto bad2;
	}
	if (connreq) {
		if (SOLISTENING(so2)) {
			CURVNET_SET(so2->so_vnet);
			so2 = sonewconn(so2, 0);
			CURVNET_RESTORE();
		} else
			so2 = NULL;
		if (so2 == NULL) {
			error = ECONNREFUSED;
			goto bad2;
		}
		unp3 = sotounpcb(so2);
		unp_pcb_lock_pair(unp2, unp3);
		if (unp2->unp_addr != NULL) {
			bcopy(unp2->unp_addr, sa, unp2->unp_addr->sun_len);
			unp3->unp_addr = (struct sockaddr_un *) sa;
			sa = NULL;
		}

		unp_copy_peercred(td, unp3, unp, unp2);

		UNP_PCB_UNLOCK(unp2);
		unp2 = unp3;

		/*
		 * It is safe to block on the PCB lock here since unp2 is
		 * nascent and cannot be connected to any other sockets.
		 */
		UNP_PCB_LOCK(unp);
#ifdef MAC
		mac_socketpeer_set_from_socket(so, so2);
		mac_socketpeer_set_from_socket(so2, so);
#endif
	} else {
		unp_pcb_lock_pair(unp, unp2);
	}
	KASSERT(unp2 != NULL && so2 != NULL && unp2->unp_socket == so2 &&
	    sotounpcb(so2) == unp2,
	    ("%s: unp2 %p so2 %p", __func__, unp2, so2));
	error = unp_connect2(so, so2, PRU_CONNECT);
	unp_pcb_unlock_pair(unp, unp2);
bad2:
	mtx_unlock(vplock);
bad:
	if (vp != NULL) {
		vput(vp);
	}
	free(sa, M_SONAME);
	UNP_PCB_LOCK(unp);
	KASSERT((unp->unp_flags & UNP_CONNECTING) != 0,
	    ("%s: unp %p has UNP_CONNECTING clear", __func__, unp));
	unp->unp_flags &= ~UNP_CONNECTING;
	UNP_PCB_UNLOCK(unp);
	return (error);
}

/*
 * Set socket peer credentials at connection time.
 *
 * The client's PCB credentials are copied from its process structure.  The
 * server's PCB credentials are copied from the socket on which it called
 * listen(2).  uipc_listen cached that process's credentials at the time.
 */
void
unp_copy_peercred(struct thread *td, struct unpcb *client_unp,
    struct unpcb *server_unp, struct unpcb *listen_unp)
{
	cru2xt(td, &client_unp->unp_peercred);
	client_unp->unp_flags |= UNP_HAVEPC;

	memcpy(&server_unp->unp_peercred, &listen_unp->unp_peercred,
	    sizeof(server_unp->unp_peercred));
	server_unp->unp_flags |= UNP_HAVEPC;
	client_unp->unp_flags |= (listen_unp->unp_flags & UNP_WANTCRED_MASK);
}

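/*
 * Connect two local sockets whose PCB locks are both held by the caller.
 * For SOCK_DGRAM the connection is one-way: only unp's unp_conn is set and
 * unp is added to the peer's reference list.  For connection-oriented types
 * both unp_conn pointers are set.
 */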
static int
unp_connect2(struct socket *so, struct socket *so2, int req)
{
	struct unpcb *unp;
	struct unpcb *unp2;

	unp = sotounpcb(so);
	KASSERT(unp != NULL, ("unp_connect2: unp == NULL"));
	unp2 = sotounpcb(so2);
	KASSERT(unp2 != NULL, ("unp_connect2: unp2 == NULL"));

	UNP_PCB_LOCK_ASSERT(unp);
	UNP_PCB_LOCK_ASSERT(unp2);
	KASSERT(unp->unp_conn == NULL,
	    ("%s: socket %p is already connected", __func__, unp));

	if (so2->so_type != so->so_type)
		return (EPROTOTYPE);
	unp->unp_conn = unp2;
	unp_pcb_hold(unp2);
	unp_pcb_hold(unp);
	switch (so->so_type) {
	case SOCK_DGRAM:
		UNP_REF_LIST_LOCK();
		LIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_reflink);
		UNP_REF_LIST_UNLOCK();
		soisconnected(so);
		break;

	case SOCK_STREAM:
	case SOCK_SEQPACKET:
		KASSERT(unp2->unp_conn == NULL,
		    ("%s: socket %p is already connected", __func__, unp2));
		unp2->unp_conn = unp;
		if (req == PRU_CONNECT &&
		    ((unp->unp_flags | unp2->unp_flags) & UNP_CONNWAIT))
			soisconnecting(so);
		else
			soisconnected(so);
		soisconnected(so2);
		break;

	default:
		panic("unp_connect2");
	}
	return (0);
}

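/*
 * Break the connection between two PCBs, both of whose locks must be held
 * by the caller.  Both locks are dropped on return, and either PCB may be
 * freed if the connection held its last reference.
 */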
static void
unp_disconnect(struct unpcb *unp, struct unpcb *unp2)
{
	struct socket *so, *so2;
#ifdef INVARIANTS
	struct unpcb *unptmp;
#endif

	UNP_PCB_LOCK_ASSERT(unp);
	UNP_PCB_LOCK_ASSERT(unp2);
	KASSERT(unp->unp_conn == unp2,
	    ("%s: unpcb %p is not connected to %p", __func__, unp, unp2));

	unp->unp_conn = NULL;
	so = unp->unp_socket;
	so2 = unp2->unp_socket;
	switch (unp->unp_socket->so_type) {
	case SOCK_DGRAM:
		UNP_REF_LIST_LOCK();
#ifdef INVARIANTS
		LIST_FOREACH(unptmp, &unp2->unp_refs, unp_reflink) {
			if (unptmp == unp)
				break;
		}
		KASSERT(unptmp != NULL,
		    ("%s: %p not found in reflist of %p", __func__, unp, unp2));
#endif
		LIST_REMOVE(unp, unp_reflink);
		UNP_REF_LIST_UNLOCK();
		if (so) {
			SOCK_LOCK(so);
			so->so_state &= ~SS_ISCONNECTED;
			SOCK_UNLOCK(so);
		}
		break;

	case SOCK_STREAM:
	case SOCK_SEQPACKET:
		if (so)
			soisdisconnected(so);
		MPASS(unp2->unp_conn == unp);
		unp2->unp_conn = NULL;
		if (so2)
			soisdisconnected(so2);
		break;
	}

	if (unp == unp2) {
		unp_pcb_rele_notlast(unp);
		if (!unp_pcb_rele(unp))
			UNP_PCB_UNLOCK(unp);
	} else {
		if (!unp_pcb_rele(unp))
			UNP_PCB_UNLOCK(unp);
		if (!unp_pcb_rele(unp2))
			UNP_PCB_UNLOCK(unp2);
	}
}

/*
 * unp_pcblist() walks the global list of struct unpcb's to generate a
 * pointer list, bumping the refcount on each unpcb.  It then copies them out
 * sequentially, validating the generation number on each to see if it has
 * been detached.  All of this is necessary because copyout() may sleep on
 * disk I/O.
 */
static int
unp_pcblist(SYSCTL_HANDLER_ARGS)
{
	struct unpcb *unp, **unp_list;
	unp_gen_t gencnt;
	struct xunpgen *xug;
	struct unp_head *head;
	struct xunpcb *xu;
	u_int i;
	int error, n;

	switch ((intptr_t)arg1) {
	case SOCK_STREAM:
		head = &unp_shead;
		break;

	case SOCK_DGRAM:
		head = &unp_dhead;
		break;

	case SOCK_SEQPACKET:
		head = &unp_sphead;
		break;

	default:
		panic("unp_pcblist: arg1 %d", (int)(intptr_t)arg1);
	}

	/*
	 * The process of preparing the PCB list is too time-consuming and
	 * resource-intensive to repeat twice on every request.
	 */
	if (req->oldptr == NULL) {
		n = unp_count;
		req->oldidx = 2 * (sizeof *xug)
			+ (n + n/8) * sizeof(struct xunpcb);
		return (0);
	}

	if (req->newptr != NULL)
		return (EPERM);

	/*
	 * OK, now we're committed to doing something.
	 */
	xug = malloc(sizeof(*xug), M_TEMP, M_WAITOK | M_ZERO);
	UNP_LINK_RLOCK();
	gencnt = unp_gencnt;
	n = unp_count;
	UNP_LINK_RUNLOCK();

	xug->xug_len = sizeof *xug;
	xug->xug_count = n;
	xug->xug_gen = gencnt;
	xug->xug_sogen = so_gencnt;
	error = SYSCTL_OUT(req, xug, sizeof *xug);
	if (error) {
		free(xug, M_TEMP);
		return (error);
	}

	unp_list = malloc(n * sizeof *unp_list, M_TEMP, M_WAITOK);

	UNP_LINK_RLOCK();
	for (unp = LIST_FIRST(head), i = 0; unp && i < n;
	     unp = LIST_NEXT(unp, unp_link)) {
		UNP_PCB_LOCK(unp);
		if (unp->unp_gencnt <= gencnt) {
			if (cr_cansee(req->td->td_ucred,
			    unp->unp_socket->so_cred)) {
				UNP_PCB_UNLOCK(unp);
				continue;
			}
			unp_list[i++] = unp;
			unp_pcb_hold(unp);
		}
		UNP_PCB_UNLOCK(unp);
	}
	UNP_LINK_RUNLOCK();
	n = i;			/* In case we lost some during malloc. */

	error = 0;
	xu = malloc(sizeof(*xu), M_TEMP, M_WAITOK | M_ZERO);
	for (i = 0; i < n; i++) {
		unp = unp_list[i];
		UNP_PCB_LOCK(unp);
		if (unp_pcb_rele(unp))
			continue;

		if (unp->unp_gencnt <= gencnt) {
			xu->xu_len = sizeof *xu;
			xu->xu_unpp = (uintptr_t)unp;
			/*
			 * XXX - need more locking here to protect against
			 * connect/disconnect races for SMP.
			 */
			if (unp->unp_addr != NULL)
				bcopy(unp->unp_addr, &xu->xu_addr,
				      unp->unp_addr->sun_len);
			else
				bzero(&xu->xu_addr, sizeof(xu->xu_addr));
			if (unp->unp_conn != NULL &&
			    unp->unp_conn->unp_addr != NULL)
				bcopy(unp->unp_conn->unp_addr,
				      &xu->xu_caddr,
				      unp->unp_conn->unp_addr->sun_len);
			else
				bzero(&xu->xu_caddr, sizeof(xu->xu_caddr));
			xu->unp_vnode = (uintptr_t)unp->unp_vnode;
			xu->unp_conn = (uintptr_t)unp->unp_conn;
			xu->xu_firstref = (uintptr_t)LIST_FIRST(&unp->unp_refs);
			xu->xu_nextref = (uintptr_t)LIST_NEXT(unp, unp_reflink);
			xu->unp_gencnt = unp->unp_gencnt;
			sotoxsocket(unp->unp_socket, &xu->xu_socket);
			UNP_PCB_UNLOCK(unp);
			error = SYSCTL_OUT(req, xu, sizeof *xu);
		} else {
			UNP_PCB_UNLOCK(unp);
		}
	}
	free(xu, M_TEMP);
	if (!error) {
		/*
		 * Give the user an updated idea of our state.  If the
		 * generation differs from what we told her before, she knows
		 * that something happened while we were processing this
		 * request, and it might be necessary to retry.
		 */
		xug->xug_gen = unp_gencnt;
		xug->xug_sogen = so_gencnt;
		xug->xug_count = unp_count;
		error = SYSCTL_OUT(req, xug, sizeof *xug);
	}
	free(unp_list, M_TEMP);
	free(xug, M_TEMP);
	return (error);
}

1932SYSCTL_PROC(_net_local_dgram, OID_AUTO, pcblist,
1933    CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE,
1934    (void *)(intptr_t)SOCK_DGRAM, 0, unp_pcblist, "S,xunpcb",
1935    "List of active local datagram sockets");
1936SYSCTL_PROC(_net_local_stream, OID_AUTO, pcblist,
1937    CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE,
1938    (void *)(intptr_t)SOCK_STREAM, 0, unp_pcblist, "S,xunpcb",
1939    "List of active local stream sockets");
1940SYSCTL_PROC(_net_local_seqpacket, OID_AUTO, pcblist,
1941    CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE,
1942    (void *)(intptr_t)SOCK_SEQPACKET, 0, unp_pcblist, "S,xunpcb",
1943    "List of active local seqpacket sockets");
1944
1945static void
1946unp_shutdown(struct unpcb *unp)
1947{
1948	struct unpcb *unp2;
1949	struct socket *so;
1950
1951	UNP_PCB_LOCK_ASSERT(unp);
1952
1953	unp2 = unp->unp_conn;
	if ((unp->unp_socket->so_type == SOCK_STREAM ||
	    unp->unp_socket->so_type == SOCK_SEQPACKET) && unp2 != NULL) {
1956		so = unp2->unp_socket;
1957		if (so != NULL)
1958			socantrcvmore(so);
1959	}
1960}
1961
1962static void
1963unp_drop(struct unpcb *unp)
1964{
1965	struct socket *so = unp->unp_socket;
1966	struct unpcb *unp2;
1967
1968	/*
1969	 * Regardless of whether the socket's peer dropped the connection
1970	 * with this socket by aborting or disconnecting, POSIX requires
1971	 * that ECONNRESET is returned.
1972	 */
1973
1974	UNP_PCB_LOCK(unp);
1975	if (so)
1976		so->so_error = ECONNRESET;
1977	if ((unp2 = unp_pcb_lock_peer(unp)) != NULL) {
1978		/* Last reference dropped in unp_disconnect(). */
1979		unp_pcb_rele_notlast(unp);
1980		unp_disconnect(unp, unp2);
1981	} else if (!unp_pcb_rele(unp)) {
1982		UNP_PCB_UNLOCK(unp);
1983	}
1984}
1985
1986static void
1987unp_freerights(struct filedescent **fdep, int fdcount)
1988{
1989	struct file *fp;
1990	int i;
1991
1992	KASSERT(fdcount > 0, ("%s: fdcount %d", __func__, fdcount));
1993
1994	for (i = 0; i < fdcount; i++) {
1995		fp = fdep[i]->fde_file;
1996		filecaps_free(&fdep[i]->fde_caps);
1997		unp_discard(fp);
1998	}
1999	free(fdep[0], M_FILECAPS);
2000}
2001
2002static int
2003unp_externalize(struct mbuf *control, struct mbuf **controlp, int flags)
2004{
2005	struct thread *td = curthread;		/* XXX */
2006	struct cmsghdr *cm = mtod(control, struct cmsghdr *);
2007	int i;
2008	int *fdp;
2009	struct filedesc *fdesc = td->td_proc->p_fd;
2010	struct filedescent **fdep;
2011	void *data;
2012	socklen_t clen = control->m_len, datalen;
2013	int error, newfds;
2014	u_int newlen;
2015
2016	UNP_LINK_UNLOCK_ASSERT();
2017
2018	error = 0;
2019	if (controlp != NULL) /* controlp == NULL => free control messages */
2020		*controlp = NULL;
2021	while (cm != NULL) {
2022		if (sizeof(*cm) > clen || cm->cmsg_len > clen) {
2023			error = EINVAL;
2024			break;
2025		}
2026		data = CMSG_DATA(cm);
2027		datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data;
2028		if (cm->cmsg_level == SOL_SOCKET
2029		    && cm->cmsg_type == SCM_RIGHTS) {
2030			newfds = datalen / sizeof(*fdep);
2031			if (newfds == 0)
2032				goto next;
2033			fdep = data;
2034
			/* If we're not outputting the descriptors, free them. */
2036			if (error || controlp == NULL) {
2037				unp_freerights(fdep, newfds);
2038				goto next;
2039			}
2040			FILEDESC_XLOCK(fdesc);
2041
2042			/*
2043			 * Now change each pointer to an fd in the global
2044			 * table to an integer that is the index to the local
2045			 * fd table entry that we set up to point to the
2046			 * global one we are transferring.
2047			 */
2048			newlen = newfds * sizeof(int);
2049			*controlp = sbcreatecontrol(NULL, newlen,
2050			    SCM_RIGHTS, SOL_SOCKET);
2051			if (*controlp == NULL) {
2052				FILEDESC_XUNLOCK(fdesc);
2053				error = E2BIG;
2054				unp_freerights(fdep, newfds);
2055				goto next;
2056			}
2057
2058			fdp = (int *)
2059			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
2060			if (fdallocn(td, 0, fdp, newfds) != 0) {
2061				FILEDESC_XUNLOCK(fdesc);
2062				error = EMSGSIZE;
2063				unp_freerights(fdep, newfds);
2064				m_freem(*controlp);
2065				*controlp = NULL;
2066				goto next;
2067			}
2068			for (i = 0; i < newfds; i++, fdp++) {
2069				_finstall(fdesc, fdep[i]->fde_file, *fdp,
2070				    (flags & MSG_CMSG_CLOEXEC) != 0 ? O_CLOEXEC : 0,
2071				    &fdep[i]->fde_caps);
2072				unp_externalize_fp(fdep[i]->fde_file);
2073			}
2074
2075			/*
2076			 * The new type indicates that the mbuf data refers to
2077			 * kernel resources that may need to be released before
2078			 * the mbuf is freed.
2079			 */
2080			m_chtype(*controlp, MT_EXTCONTROL);
2081			FILEDESC_XUNLOCK(fdesc);
2082			free(fdep[0], M_FILECAPS);
2083		} else {
2084			/* We can just copy anything else across. */
2085			if (error || controlp == NULL)
2086				goto next;
2087			*controlp = sbcreatecontrol(NULL, datalen,
2088			    cm->cmsg_type, cm->cmsg_level);
2089			if (*controlp == NULL) {
2090				error = ENOBUFS;
2091				goto next;
2092			}
2093			bcopy(data,
2094			    CMSG_DATA(mtod(*controlp, struct cmsghdr *)),
2095			    datalen);
2096		}
2097		controlp = &(*controlp)->m_next;
2098
2099next:
2100		if (CMSG_SPACE(datalen) < clen) {
2101			clen -= CMSG_SPACE(datalen);
2102			cm = (struct cmsghdr *)
2103			    ((caddr_t)cm + CMSG_SPACE(datalen));
2104		} else {
2105			clen = 0;
2106			cm = NULL;
2107		}
2108	}
2109
2110	m_freem(control);
2111	return (error);
2112}
2113
2114static void
2115unp_zone_change(void *tag)
2116{
2117
2118	uma_zone_set_max(unp_zone, maxsockets);
2119}
2120
2121#ifdef INVARIANTS
2122static void
2123unp_zdtor(void *mem, int size __unused, void *arg __unused)
2124{
2125	struct unpcb *unp;
2126
2127	unp = mem;
2128
2129	KASSERT(LIST_EMPTY(&unp->unp_refs),
2130	    ("%s: unpcb %p has lingering refs", __func__, unp));
2131	KASSERT(unp->unp_socket == NULL,
2132	    ("%s: unpcb %p has socket backpointer", __func__, unp));
2133	KASSERT(unp->unp_vnode == NULL,
2134	    ("%s: unpcb %p has vnode references", __func__, unp));
2135	KASSERT(unp->unp_conn == NULL,
2136	    ("%s: unpcb %p is still connected", __func__, unp));
2137	KASSERT(unp->unp_addr == NULL,
2138	    ("%s: unpcb %p has leaked addr", __func__, unp));
2139}
2140#endif
2141
2142static void
2143unp_init(void)
2144{
2145	uma_dtor dtor;
2146
2147#ifdef VIMAGE
2148	if (!IS_DEFAULT_VNET(curvnet))
2149		return;
2150#endif
2151
2152#ifdef INVARIANTS
2153	dtor = unp_zdtor;
2154#else
2155	dtor = NULL;
2156#endif
2157	unp_zone = uma_zcreate("unpcb", sizeof(struct unpcb), NULL, dtor,
2158	    NULL, NULL, UMA_ALIGN_CACHE, 0);
2159	uma_zone_set_max(unp_zone, maxsockets);
2160	uma_zone_set_warning(unp_zone, "kern.ipc.maxsockets limit reached");
2161	EVENTHANDLER_REGISTER(maxsockets_change, unp_zone_change,
2162	    NULL, EVENTHANDLER_PRI_ANY);
2163	LIST_INIT(&unp_dhead);
2164	LIST_INIT(&unp_shead);
2165	LIST_INIT(&unp_sphead);
2166	SLIST_INIT(&unp_defers);
2167	TIMEOUT_TASK_INIT(taskqueue_thread, &unp_gc_task, 0, unp_gc, NULL);
2168	TASK_INIT(&unp_defer_task, 0, unp_process_defers, NULL);
2169	UNP_LINK_LOCK_INIT();
2170	UNP_DEFERRED_LOCK_INIT();
2171}
2172
2173static void
2174unp_internalize_cleanup_rights(struct mbuf *control)
2175{
2176	struct cmsghdr *cp;
2177	struct mbuf *m;
2178	void *data;
2179	socklen_t datalen;
2180
2181	for (m = control; m != NULL; m = m->m_next) {
2182		cp = mtod(m, struct cmsghdr *);
2183		if (cp->cmsg_level != SOL_SOCKET ||
2184		    cp->cmsg_type != SCM_RIGHTS)
2185			continue;
2186		data = CMSG_DATA(cp);
2187		datalen = (caddr_t)cp + cp->cmsg_len - (caddr_t)data;
		unp_freerights(data, datalen / sizeof(struct filedescent *));
2189	}
2190}
2191
2192static int
2193unp_internalize(struct mbuf **controlp, struct thread *td)
2194{
2195	struct mbuf *control, **initial_controlp;
2196	struct proc *p;
2197	struct filedesc *fdesc;
2198	struct bintime *bt;
2199	struct cmsghdr *cm;
2200	struct cmsgcred *cmcred;
2201	struct filedescent *fde, **fdep, *fdev;
2202	struct file *fp;
2203	struct timeval *tv;
2204	struct timespec *ts;
2205	void *data;
2206	socklen_t clen, datalen;
2207	int i, j, error, *fdp, oldfds;
2208	u_int newlen;
2209
2210	UNP_LINK_UNLOCK_ASSERT();
2211
2212	p = td->td_proc;
2213	fdesc = p->p_fd;
2214	error = 0;
2215	control = *controlp;
2216	clen = control->m_len;
2217	*controlp = NULL;
2218	initial_controlp = controlp;
2219	for (cm = mtod(control, struct cmsghdr *); cm != NULL;) {
2220		if (sizeof(*cm) > clen || cm->cmsg_level != SOL_SOCKET
2221		    || cm->cmsg_len > clen || cm->cmsg_len < sizeof(*cm)) {
2222			error = EINVAL;
2223			goto out;
2224		}
2225		data = CMSG_DATA(cm);
2226		datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data;
2227
2228		switch (cm->cmsg_type) {
2229		/*
2230		 * Fill in credential information.
2231		 */
2232		case SCM_CREDS:
2233			*controlp = sbcreatecontrol(NULL, sizeof(*cmcred),
2234			    SCM_CREDS, SOL_SOCKET);
2235			if (*controlp == NULL) {
2236				error = ENOBUFS;
2237				goto out;
2238			}
2239			cmcred = (struct cmsgcred *)
2240			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
2241			cmcred->cmcred_pid = p->p_pid;
2242			cmcred->cmcred_uid = td->td_ucred->cr_ruid;
2243			cmcred->cmcred_gid = td->td_ucred->cr_rgid;
2244			cmcred->cmcred_euid = td->td_ucred->cr_uid;
2245			cmcred->cmcred_ngroups = MIN(td->td_ucred->cr_ngroups,
2246			    CMGROUP_MAX);
2247			for (i = 0; i < cmcred->cmcred_ngroups; i++)
2248				cmcred->cmcred_groups[i] =
2249				    td->td_ucred->cr_groups[i];
2250			break;
2251
2252		case SCM_RIGHTS:
2253			oldfds = datalen / sizeof (int);
2254			if (oldfds == 0)
2255				break;
2256			/*
2257			 * Check that all the FDs passed in refer to legal
2258			 * files.  If not, reject the entire operation.
2259			 */
2260			fdp = data;
2261			FILEDESC_SLOCK(fdesc);
2262			for (i = 0; i < oldfds; i++, fdp++) {
2263				fp = fget_locked(fdesc, *fdp);
2264				if (fp == NULL) {
2265					FILEDESC_SUNLOCK(fdesc);
2266					error = EBADF;
2267					goto out;
2268				}
2269				if (!(fp->f_ops->fo_flags & DFLAG_PASSABLE)) {
2270					FILEDESC_SUNLOCK(fdesc);
2271					error = EOPNOTSUPP;
2272					goto out;
2273				}
2274			}
2275
2276			/*
2277			 * Now replace the integer FDs with pointers to the
2278			 * file structure and capability rights.
2279			 */
2280			newlen = oldfds * sizeof(fdep[0]);
2281			*controlp = sbcreatecontrol(NULL, newlen,
2282			    SCM_RIGHTS, SOL_SOCKET);
2283			if (*controlp == NULL) {
2284				FILEDESC_SUNLOCK(fdesc);
2285				error = E2BIG;
2286				goto out;
2287			}
2288			fdp = data;
2289			for (i = 0; i < oldfds; i++, fdp++) {
2290				if (!fhold(fdesc->fd_ofiles[*fdp].fde_file)) {
2291					fdp = data;
2292					for (j = 0; j < i; j++, fdp++) {
2293						fdrop(fdesc->fd_ofiles[*fdp].
2294						    fde_file, td);
2295					}
2296					FILEDESC_SUNLOCK(fdesc);
2297					error = EBADF;
2298					goto out;
2299				}
2300			}
2301			fdp = data;
2302			fdep = (struct filedescent **)
2303			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
2304			fdev = malloc(sizeof(*fdev) * oldfds, M_FILECAPS,
2305			    M_WAITOK);
2306			for (i = 0; i < oldfds; i++, fdev++, fdp++) {
2307				fde = &fdesc->fd_ofiles[*fdp];
2308				fdep[i] = fdev;
2309				fdep[i]->fde_file = fde->fde_file;
2310				filecaps_copy(&fde->fde_caps,
2311				    &fdep[i]->fde_caps, true);
2312				unp_internalize_fp(fdep[i]->fde_file);
2313			}
2314			FILEDESC_SUNLOCK(fdesc);
2315			break;
2316
2317		case SCM_TIMESTAMP:
2318			*controlp = sbcreatecontrol(NULL, sizeof(*tv),
2319			    SCM_TIMESTAMP, SOL_SOCKET);
2320			if (*controlp == NULL) {
2321				error = ENOBUFS;
2322				goto out;
2323			}
2324			tv = (struct timeval *)
2325			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
2326			microtime(tv);
2327			break;
2328
2329		case SCM_BINTIME:
2330			*controlp = sbcreatecontrol(NULL, sizeof(*bt),
2331			    SCM_BINTIME, SOL_SOCKET);
2332			if (*controlp == NULL) {
2333				error = ENOBUFS;
2334				goto out;
2335			}
2336			bt = (struct bintime *)
2337			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
2338			bintime(bt);
2339			break;
2340
2341		case SCM_REALTIME:
2342			*controlp = sbcreatecontrol(NULL, sizeof(*ts),
2343			    SCM_REALTIME, SOL_SOCKET);
2344			if (*controlp == NULL) {
2345				error = ENOBUFS;
2346				goto out;
2347			}
2348			ts = (struct timespec *)
2349			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
2350			nanotime(ts);
2351			break;
2352
2353		case SCM_MONOTONIC:
2354			*controlp = sbcreatecontrol(NULL, sizeof(*ts),
2355			    SCM_MONOTONIC, SOL_SOCKET);
2356			if (*controlp == NULL) {
2357				error = ENOBUFS;
2358				goto out;
2359			}
2360			ts = (struct timespec *)
2361			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
2362			nanouptime(ts);
2363			break;
2364
2365		default:
2366			error = EINVAL;
2367			goto out;
2368		}
2369
2370		if (*controlp != NULL)
2371			controlp = &(*controlp)->m_next;
2372		if (CMSG_SPACE(datalen) < clen) {
2373			clen -= CMSG_SPACE(datalen);
2374			cm = (struct cmsghdr *)
2375			    ((caddr_t)cm + CMSG_SPACE(datalen));
2376		} else {
2377			clen = 0;
2378			cm = NULL;
2379		}
2380	}
2381
2382out:
2383	if (error != 0 && initial_controlp != NULL)
2384		unp_internalize_cleanup_rights(*initial_controlp);
2385	m_freem(control);
2386	return (error);
2387}
2388
2389static struct mbuf *
2390unp_addsockcred(struct thread *td, struct mbuf *control, int mode)
2391{
2392	struct mbuf *m, *n, *n_prev;
2393	const struct cmsghdr *cm;
2394	int ngroups, i, cmsgtype;
2395	size_t ctrlsz;
2396
2397	ngroups = MIN(td->td_ucred->cr_ngroups, CMGROUP_MAX);
2398	if (mode & UNP_WANTCRED_ALWAYS) {
2399		ctrlsz = SOCKCRED2SIZE(ngroups);
2400		cmsgtype = SCM_CREDS2;
2401	} else {
2402		ctrlsz = SOCKCREDSIZE(ngroups);
2403		cmsgtype = SCM_CREDS;
2404	}
2405
2406	m = sbcreatecontrol(NULL, ctrlsz, cmsgtype, SOL_SOCKET);
2407	if (m == NULL)
2408		return (control);
2409
2410	if (mode & UNP_WANTCRED_ALWAYS) {
2411		struct sockcred2 *sc;
2412
2413		sc = (void *)CMSG_DATA(mtod(m, struct cmsghdr *));
2414		sc->sc_version = 0;
2415		sc->sc_pid = td->td_proc->p_pid;
2416		sc->sc_uid = td->td_ucred->cr_ruid;
2417		sc->sc_euid = td->td_ucred->cr_uid;
2418		sc->sc_gid = td->td_ucred->cr_rgid;
2419		sc->sc_egid = td->td_ucred->cr_gid;
2420		sc->sc_ngroups = ngroups;
2421		for (i = 0; i < sc->sc_ngroups; i++)
2422			sc->sc_groups[i] = td->td_ucred->cr_groups[i];
2423	} else {
2424		struct sockcred *sc;
2425
2426		sc = (void *)CMSG_DATA(mtod(m, struct cmsghdr *));
2427		sc->sc_uid = td->td_ucred->cr_ruid;
2428		sc->sc_euid = td->td_ucred->cr_uid;
2429		sc->sc_gid = td->td_ucred->cr_rgid;
2430		sc->sc_egid = td->td_ucred->cr_gid;
2431		sc->sc_ngroups = ngroups;
2432		for (i = 0; i < sc->sc_ngroups; i++)
2433			sc->sc_groups[i] = td->td_ucred->cr_groups[i];
2434	}
2435
2436	/*
2437	 * Unlink SCM_CREDS control messages (struct cmsgcred), since just
2438	 * created SCM_CREDS control message (struct sockcred) has another
2439	 * format.
2440	 */
2441	if (control != NULL && cmsgtype == SCM_CREDS)
2442		for (n = control, n_prev = NULL; n != NULL;) {
2443			cm = mtod(n, struct cmsghdr *);
			if (cm->cmsg_level == SOL_SOCKET &&
			    cm->cmsg_type == SCM_CREDS) {
				if (n_prev == NULL)
2447					control = n->m_next;
2448				else
2449					n_prev->m_next = n->m_next;
2450				n = m_free(n);
2451			} else {
2452				n_prev = n;
2453				n = n->m_next;
2454			}
2455		}
2456
2457	/* Prepend it to the head. */
2458	m->m_next = control;
2459	return (m);
2460}
2461
2462static struct unpcb *
2463fptounp(struct file *fp)
2464{
2465	struct socket *so;
2466
2467	if (fp->f_type != DTYPE_SOCKET)
2468		return (NULL);
2469	if ((so = fp->f_data) == NULL)
2470		return (NULL);
2471	if (so->so_proto->pr_domain != &localdomain)
2472		return (NULL);
	return (sotounpcb(so));
2474}
2475
2476static void
2477unp_discard(struct file *fp)
2478{
2479	struct unp_defer *dr;
2480
2481	if (unp_externalize_fp(fp)) {
2482		dr = malloc(sizeof(*dr), M_TEMP, M_WAITOK);
2483		dr->ud_fp = fp;
2484		UNP_DEFERRED_LOCK();
2485		SLIST_INSERT_HEAD(&unp_defers, dr, ud_link);
2486		UNP_DEFERRED_UNLOCK();
2487		atomic_add_int(&unp_defers_count, 1);
2488		taskqueue_enqueue(taskqueue_thread, &unp_defer_task);
2489	} else
2490		closef_nothread(fp);
2491}
2492
2493static void
2494unp_process_defers(void *arg __unused, int pending)
2495{
2496	struct unp_defer *dr;
2497	SLIST_HEAD(, unp_defer) drl;
2498	int count;
2499
2500	SLIST_INIT(&drl);
2501	for (;;) {
2502		UNP_DEFERRED_LOCK();
2503		if (SLIST_FIRST(&unp_defers) == NULL) {
2504			UNP_DEFERRED_UNLOCK();
2505			break;
2506		}
2507		SLIST_SWAP(&unp_defers, &drl, unp_defer);
2508		UNP_DEFERRED_UNLOCK();
2509		count = 0;
2510		while ((dr = SLIST_FIRST(&drl)) != NULL) {
2511			SLIST_REMOVE_HEAD(&drl, ud_link);
2512			closef_nothread(dr->ud_fp);
2513			free(dr, M_TEMP);
2514			count++;
2515		}
2516		atomic_add_int(&unp_defers_count, -count);
2517	}
2518}
2519
2520static void
2521unp_internalize_fp(struct file *fp)
2522{
2523	struct unpcb *unp;
2524
2525	UNP_LINK_WLOCK();
2526	if ((unp = fptounp(fp)) != NULL) {
2527		unp->unp_file = fp;
2528		unp->unp_msgcount++;
2529	}
2530	unp_rights++;
2531	UNP_LINK_WUNLOCK();
2532}
2533
2534static int
2535unp_externalize_fp(struct file *fp)
2536{
2537	struct unpcb *unp;
2538	int ret;
2539
2540	UNP_LINK_WLOCK();
2541	if ((unp = fptounp(fp)) != NULL) {
2542		unp->unp_msgcount--;
2543		ret = 1;
2544	} else
2545		ret = 0;
2546	unp_rights--;
2547	UNP_LINK_WUNLOCK();
2548	return (ret);
2549}
2550
2551/*
2552 * unp_defer indicates whether additional work has been defered for a future
2553 * pass through unp_gc().  It is thread local and does not require explicit
2554 * synchronization.
2555 */
2556static int	unp_marked;
2557
2558static void
2559unp_remove_dead_ref(struct filedescent **fdep, int fdcount)
2560{
2561	struct unpcb *unp;
2562	struct file *fp;
2563	int i;
2564
2565	/*
2566	 * This function can only be called from the gc task.
2567	 */
2568	KASSERT(taskqueue_member(taskqueue_thread, curthread) != 0,
2569	    ("%s: not on gc callout", __func__));
2570	UNP_LINK_LOCK_ASSERT();
2571
2572	for (i = 0; i < fdcount; i++) {
2573		fp = fdep[i]->fde_file;
2574		if ((unp = fptounp(fp)) == NULL)
2575			continue;
2576		if ((unp->unp_gcflag & UNPGC_DEAD) == 0)
2577			continue;
2578		unp->unp_gcrefs--;
2579	}
2580}
2581
2582static void
2583unp_restore_undead_ref(struct filedescent **fdep, int fdcount)
2584{
2585	struct unpcb *unp;
2586	struct file *fp;
2587	int i;
2588
2589	/*
2590	 * This function can only be called from the gc task.
2591	 */
2592	KASSERT(taskqueue_member(taskqueue_thread, curthread) != 0,
2593	    ("%s: not on gc callout", __func__));
2594	UNP_LINK_LOCK_ASSERT();
2595
2596	for (i = 0; i < fdcount; i++) {
2597		fp = fdep[i]->fde_file;
2598		if ((unp = fptounp(fp)) == NULL)
2599			continue;
2600		if ((unp->unp_gcflag & UNPGC_DEAD) == 0)
2601			continue;
2602		unp->unp_gcrefs++;
2603		unp_marked++;
2604	}
2605}
2606
2607static void
2608unp_gc_scan(struct unpcb *unp, void (*op)(struct filedescent **, int))
2609{
2610	struct socket *so, *soa;
2611
2612	so = unp->unp_socket;
2613	SOCK_LOCK(so);
2614	if (SOLISTENING(so)) {
2615		/*
2616		 * Mark all sockets in our accept queue.
2617		 */
2618		TAILQ_FOREACH(soa, &so->sol_comp, so_list) {
2619			if (sotounpcb(soa)->unp_gcflag & UNPGC_IGNORE_RIGHTS)
2620				continue;
2621			SOCKBUF_LOCK(&soa->so_rcv);
2622			unp_scan(soa->so_rcv.sb_mb, op);
2623			SOCKBUF_UNLOCK(&soa->so_rcv);
2624		}
2625	} else {
2626		/*
2627		 * Mark all sockets we reference with RIGHTS.
2628		 */
2629		if ((unp->unp_gcflag & UNPGC_IGNORE_RIGHTS) == 0) {
2630			SOCKBUF_LOCK(&so->so_rcv);
2631			unp_scan(so->so_rcv.sb_mb, op);
2632			SOCKBUF_UNLOCK(&so->so_rcv);
2633		}
2634	}
2635	SOCK_UNLOCK(so);
2636}
2637
2638static int unp_recycled;
2639SYSCTL_INT(_net_local, OID_AUTO, recycled, CTLFLAG_RD, &unp_recycled, 0,
2640    "Number of unreachable sockets claimed by the garbage collector.");
2641
2642static int unp_taskcount;
2643SYSCTL_INT(_net_local, OID_AUTO, taskcount, CTLFLAG_RD, &unp_taskcount, 0,
2644    "Number of times the garbage collector has run.");
2645
2646SYSCTL_UINT(_net_local, OID_AUTO, sockcount, CTLFLAG_RD, &unp_count, 0,
2647    "Number of active local sockets.");
2648
2649static void
2650unp_gc(__unused void *arg, int pending)
2651{
2652	struct unp_head *heads[] = { &unp_dhead, &unp_shead, &unp_sphead,
2653				    NULL };
2654	struct unp_head **head;
2655	struct unp_head unp_deadhead;	/* List of potentially-dead sockets. */
2656	struct file *f, **unref;
2657	struct unpcb *unp, *unptmp;
2658	int i, total, unp_unreachable;
2659
2660	LIST_INIT(&unp_deadhead);
2661	unp_taskcount++;
2662	UNP_LINK_RLOCK();
2663	/*
2664	 * First determine which sockets may be in cycles.
2665	 */
2666	unp_unreachable = 0;
2667
2668	for (head = heads; *head != NULL; head++)
2669		LIST_FOREACH(unp, *head, unp_link) {
2670			KASSERT((unp->unp_gcflag & ~UNPGC_IGNORE_RIGHTS) == 0,
2671			    ("%s: unp %p has unexpected gc flags 0x%x",
2672			    __func__, unp, (unsigned int)unp->unp_gcflag));
2673
2674			f = unp->unp_file;
2675
2676			/*
2677			 * Check for an unreachable socket potentially in a
2678			 * cycle.  It must be in a queue as indicated by
2679			 * msgcount, and this must equal the file reference
2680			 * count.  Note that when msgcount is 0 the file is
2681			 * NULL.
2682			 */
2683			if (f != NULL && unp->unp_msgcount != 0 &&
2684			    refcount_load(&f->f_count) == unp->unp_msgcount) {
2685				LIST_INSERT_HEAD(&unp_deadhead, unp, unp_dead);
2686				unp->unp_gcflag |= UNPGC_DEAD;
2687				unp->unp_gcrefs = unp->unp_msgcount;
2688				unp_unreachable++;
2689			}
2690		}
2691
2692	/*
2693	 * Scan all sockets previously marked as potentially being in a cycle
2694	 * and remove the references each socket holds on any UNPGC_DEAD
2695	 * sockets in its queue.  After this step, all remaining references on
2696	 * sockets marked UNPGC_DEAD should not be part of any cycle.
2697	 */
2698	LIST_FOREACH(unp, &unp_deadhead, unp_dead)
2699		unp_gc_scan(unp, unp_remove_dead_ref);
2700
2701	/*
2702	 * If a socket still has a non-negative refcount, it cannot be in a
2703	 * cycle.  In this case increment refcount of all children iteratively.
2704	 * Stop the scan once we do a complete loop without discovering
2705	 * a new reachable socket.
2706	 */
2707	do {
2708		unp_marked = 0;
2709		LIST_FOREACH_SAFE(unp, &unp_deadhead, unp_dead, unptmp)
2710			if (unp->unp_gcrefs > 0) {
2711				unp->unp_gcflag &= ~UNPGC_DEAD;
2712				LIST_REMOVE(unp, unp_dead);
2713				KASSERT(unp_unreachable > 0,
2714				    ("%s: unp_unreachable underflow.",
2715				    __func__));
2716				unp_unreachable--;
2717				unp_gc_scan(unp, unp_restore_undead_ref);
2718			}
2719	} while (unp_marked);
2720
2721	UNP_LINK_RUNLOCK();
2722
2723	if (unp_unreachable == 0)
2724		return;
2725
2726	/*
2727	 * Allocate space for a local array of dead unpcbs.
2728	 * TODO: can this path be simplified by instead using the local
2729	 * dead list at unp_deadhead, after taking out references
2730	 * on the file object and/or unpcb and dropping the link lock?
2731	 */
2732	unref = malloc(unp_unreachable * sizeof(struct file *),
2733	    M_TEMP, M_WAITOK);
2734
2735	/*
2736	 * Iterate looking for sockets which have been specifically marked
2737	 * as unreachable and store them locally.
2738	 */
2739	UNP_LINK_RLOCK();
2740	total = 0;
2741	LIST_FOREACH(unp, &unp_deadhead, unp_dead) {
2742		KASSERT((unp->unp_gcflag & UNPGC_DEAD) != 0,
2743		    ("%s: unp %p not marked UNPGC_DEAD", __func__, unp));
2744		unp->unp_gcflag &= ~UNPGC_DEAD;
2745		f = unp->unp_file;
2746		if (unp->unp_msgcount == 0 || f == NULL ||
2747		    refcount_load(&f->f_count) != unp->unp_msgcount ||
2748		    !fhold(f))
2749			continue;
2750		unref[total++] = f;
2751		KASSERT(total <= unp_unreachable,
2752		    ("%s: incorrect unreachable count.", __func__));
2753	}
2754	UNP_LINK_RUNLOCK();
2755
2756	/*
2757	 * Now flush all sockets, free'ing rights.  This will free the
2758	 * struct files associated with these sockets but leave each socket
2759	 * with one remaining ref.
2760	 */
2761	for (i = 0; i < total; i++) {
2762		struct socket *so;
2763
2764		so = unref[i]->f_data;
2765		CURVNET_SET(so->so_vnet);
2766		sorflush(so);
2767		CURVNET_RESTORE();
2768	}
2769
2770	/*
2771	 * And finally release the sockets so they can be reclaimed.
2772	 */
2773	for (i = 0; i < total; i++)
2774		fdrop(unref[i], NULL);
2775	unp_recycled += total;
2776	free(unref, M_TEMP);
2777}
2778
2779static void
2780unp_dispose_mbuf(struct mbuf *m)
2781{
2782
2783	if (m)
2784		unp_scan(m, unp_freerights);
2785}
2786
2787/*
2788 * Synchronize against unp_gc, which can trip over data as we are freeing it.
2789 */
2790static void
2791unp_dispose(struct socket *so)
2792{
2793	struct unpcb *unp;
2794
2795	unp = sotounpcb(so);
2796	UNP_LINK_WLOCK();
2797	unp->unp_gcflag |= UNPGC_IGNORE_RIGHTS;
2798	UNP_LINK_WUNLOCK();
2799	if (!SOLISTENING(so))
2800		unp_dispose_mbuf(so->so_rcv.sb_mb);
2801}
2802
2803static void
2804unp_scan(struct mbuf *m0, void (*op)(struct filedescent **, int))
2805{
2806	struct mbuf *m;
2807	struct cmsghdr *cm;
2808	void *data;
2809	socklen_t clen, datalen;
2810
2811	while (m0 != NULL) {
2812		for (m = m0; m; m = m->m_next) {
2813			if (m->m_type != MT_CONTROL)
2814				continue;
2815
2816			cm = mtod(m, struct cmsghdr *);
2817			clen = m->m_len;
2818
2819			while (cm != NULL) {
2820				if (sizeof(*cm) > clen || cm->cmsg_len > clen)
2821					break;
2822
2823				data = CMSG_DATA(cm);
2824				datalen = (caddr_t)cm + cm->cmsg_len
2825				    - (caddr_t)data;
2826
2827				if (cm->cmsg_level == SOL_SOCKET &&
2828				    cm->cmsg_type == SCM_RIGHTS) {
2829					(*op)(data, datalen /
2830					    sizeof(struct filedescent *));
2831				}
2832
2833				if (CMSG_SPACE(datalen) < clen) {
2834					clen -= CMSG_SPACE(datalen);
2835					cm = (struct cmsghdr *)
2836					    ((caddr_t)cm + CMSG_SPACE(datalen));
2837				} else {
2838					clen = 0;
2839					cm = NULL;
2840				}
2841			}
2842		}
2843		m0 = m0->m_nextpkt;
2844	}
2845}
2846
2847/*
2848 * A helper function called by VFS before socket-type vnode reclamation.
2849 * For an active vnode it clears unp_vnode pointer and decrements unp_vnode
2850 * use count.
2851 */
2852void
2853vfs_unp_reclaim(struct vnode *vp)
2854{
2855	struct unpcb *unp;
2856	int active;
2857	struct mtx *vplock;
2858
2859	ASSERT_VOP_ELOCKED(vp, "vfs_unp_reclaim");
2860	KASSERT(vp->v_type == VSOCK,
2861	    ("vfs_unp_reclaim: vp->v_type != VSOCK"));
2862
2863	active = 0;
2864	vplock = mtx_pool_find(mtxpool_sleep, vp);
2865	mtx_lock(vplock);
2866	VOP_UNP_CONNECT(vp, &unp);
2867	if (unp == NULL)
2868		goto done;
2869	UNP_PCB_LOCK(unp);
2870	if (unp->unp_vnode == vp) {
2871		VOP_UNP_DETACH(vp);
2872		unp->unp_vnode = NULL;
2873		active = 1;
2874	}
2875	UNP_PCB_UNLOCK(unp);
2876 done:
2877	mtx_unlock(vplock);
2878	if (active)
2879		vunref(vp);
2880}
2881
2882#ifdef DDB
2883static void
2884db_print_indent(int indent)
2885{
2886	int i;
2887
2888	for (i = 0; i < indent; i++)
2889		db_printf(" ");
2890}
2891
2892static void
2893db_print_unpflags(int unp_flags)
2894{
2895	int comma;
2896
2897	comma = 0;
2898	if (unp_flags & UNP_HAVEPC) {
2899		db_printf("%sUNP_HAVEPC", comma ? ", " : "");
2900		comma = 1;
2901	}
2902	if (unp_flags & UNP_WANTCRED_ALWAYS) {
2903		db_printf("%sUNP_WANTCRED_ALWAYS", comma ? ", " : "");
2904		comma = 1;
2905	}
2906	if (unp_flags & UNP_WANTCRED_ONESHOT) {
2907		db_printf("%sUNP_WANTCRED_ONESHOT", comma ? ", " : "");
2908		comma = 1;
2909	}
2910	if (unp_flags & UNP_CONNWAIT) {
2911		db_printf("%sUNP_CONNWAIT", comma ? ", " : "");
2912		comma = 1;
2913	}
2914	if (unp_flags & UNP_CONNECTING) {
2915		db_printf("%sUNP_CONNECTING", comma ? ", " : "");
2916		comma = 1;
2917	}
2918	if (unp_flags & UNP_BINDING) {
2919		db_printf("%sUNP_BINDING", comma ? ", " : "");
2920		comma = 1;
2921	}
2922}
2923
2924static void
2925db_print_xucred(int indent, struct xucred *xu)
2926{
2927	int comma, i;
2928
2929	db_print_indent(indent);
2930	db_printf("cr_version: %u   cr_uid: %u   cr_pid: %d   cr_ngroups: %d\n",
2931	    xu->cr_version, xu->cr_uid, xu->cr_pid, xu->cr_ngroups);
2932	db_print_indent(indent);
2933	db_printf("cr_groups: ");
2934	comma = 0;
2935	for (i = 0; i < xu->cr_ngroups; i++) {
2936		db_printf("%s%u", comma ? ", " : "", xu->cr_groups[i]);
2937		comma = 1;
2938	}
2939	db_printf("\n");
2940}
2941
2942static void
2943db_print_unprefs(int indent, struct unp_head *uh)
2944{
2945	struct unpcb *unp;
2946	int counter;
2947
2948	counter = 0;
2949	LIST_FOREACH(unp, uh, unp_reflink) {
2950		if (counter % 4 == 0)
2951			db_print_indent(indent);
2952		db_printf("%p  ", unp);
2953		if (counter % 4 == 3)
2954			db_printf("\n");
2955		counter++;
2956	}
2957	if (counter != 0 && counter % 4 != 0)
2958		db_printf("\n");
2959}
2960
2961DB_SHOW_COMMAND(unpcb, db_show_unpcb)
2962{
2963	struct unpcb *unp;
2964
	if (!have_addr) {
		db_printf("usage: show unpcb <addr>\n");
		return;
	}
	unp = (struct unpcb *)addr;
2970
2971	db_printf("unp_socket: %p   unp_vnode: %p\n", unp->unp_socket,
2972	    unp->unp_vnode);
2973
2974	db_printf("unp_ino: %ju   unp_conn: %p\n", (uintmax_t)unp->unp_ino,
2975	    unp->unp_conn);
2976
2977	db_printf("unp_refs:\n");
2978	db_print_unprefs(2, &unp->unp_refs);
2979
2980	/* XXXRW: Would be nice to print the full address, if any. */
2981	db_printf("unp_addr: %p\n", unp->unp_addr);
2982
2983	db_printf("unp_gencnt: %llu\n",
2984	    (unsigned long long)unp->unp_gencnt);
2985
2986	db_printf("unp_flags: %x (", unp->unp_flags);
2987	db_print_unpflags(unp->unp_flags);
2988	db_printf(")\n");
2989
2990	db_printf("unp_peercred:\n");
2991	db_print_xucred(2, &unp->unp_peercred);
2992
2993	db_printf("unp_refcount: %u\n", unp->unp_refcount);
2994}
2995#endif
2996