uipc_usrreq.c revision 81875
1/*
2 * Copyright (c) 1982, 1986, 1989, 1991, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 3. All advertising materials mentioning features or use of this software
14 *    must display the following acknowledgement:
15 *	This product includes software developed by the University of
16 *	California, Berkeley and its contributors.
17 * 4. Neither the name of the University nor the names of its contributors
18 *    may be used to endorse or promote products derived from this software
19 *    without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 *
33 *	From: @(#)uipc_usrreq.c	8.3 (Berkeley) 1/4/94
34 * $FreeBSD: head/sys/kern/uipc_usrreq.c 81875 2001-08-18 02:53:50Z julian $
35 */
36
37#include <sys/param.h>
38#include <sys/systm.h>
39#include <sys/kernel.h>
40#include <sys/fcntl.h>
41#include <sys/domain.h>
42#include <sys/filedesc.h>
43#include <sys/lock.h>
44#include <sys/malloc.h>		/* XXX must be before <sys/file.h> */
45#include <sys/file.h>
46#include <sys/mutex.h>
47#include <sys/mbuf.h>
48#include <sys/namei.h>
49#include <sys/proc.h>
50#include <sys/protosw.h>
51#include <sys/socket.h>
52#include <sys/socketvar.h>
53#include <sys/resourcevar.h>
54#include <sys/stat.h>
55#include <sys/sysctl.h>
56#include <sys/un.h>
57#include <sys/unpcb.h>
58#include <sys/vnode.h>
59#include <sys/jail.h>
60
61#include <vm/vm_zone.h>
62
static	struct vm_zone *unp_zone;	/* zone unpcbs are allocated from; set up in unp_init() */
static	unp_gen_t unp_gencnt;		/* bumped each time a pcb is attached, detached or dropped */
static	u_int unp_count;		/* number of pcbs currently allocated */

/* Lists of all stream (unp_shead) and datagram (unp_dhead) pcbs, linked via unp_link. */
static	struct unp_head unp_shead, unp_dhead;
68
69/*
70 * Unix communications domain.
71 *
72 * TODO:
73 *	SEQPACKET, RDM
74 *	rethink name space problems
75 *	need a proper out-of-band
76 *	lock pushdown
77 */
/* Placeholder address handed back for sockets that have no bound name. */
static struct	sockaddr sun_noname = { sizeof(sun_noname), AF_LOCAL };
static ino_t	unp_ino;		/* prototype for fake inode numbers */
80
81static int     unp_attach __P((struct socket *));
82static void    unp_detach __P((struct unpcb *));
83static int     unp_bind __P((struct unpcb *,struct sockaddr *, struct proc *));
84static int     unp_connect __P((struct socket *,struct sockaddr *,
85				struct proc *));
86static void    unp_disconnect __P((struct unpcb *));
87static void    unp_shutdown __P((struct unpcb *));
88static void    unp_drop __P((struct unpcb *, int));
89static void    unp_gc __P((void));
90static void    unp_scan __P((struct mbuf *, void (*)(struct file *)));
91static void    unp_mark __P((struct file *));
92static void    unp_discard __P((struct file *));
93static int     unp_internalize __P((struct mbuf *, struct proc *));
94static int     unp_listen __P((struct unpcb *, struct proc *));
95
96static int
97uipc_abort(struct socket *so)
98{
99	struct unpcb *unp = sotounpcb(so);
100
101	if (unp == 0)
102		return EINVAL;
103	unp_drop(unp, ECONNABORTED);
104	return 0;
105}
106
107static int
108uipc_accept(struct socket *so, struct sockaddr **nam)
109{
110	struct unpcb *unp = sotounpcb(so);
111
112	if (unp == 0)
113		return EINVAL;
114
115	/*
116	 * Pass back name of connected socket,
117	 * if it was bound and we are still connected
118	 * (our peer may have closed already!).
119	 */
120	if (unp->unp_conn && unp->unp_conn->unp_addr) {
121		*nam = dup_sockaddr((struct sockaddr *)unp->unp_conn->unp_addr,
122				    1);
123	} else {
124		*nam = dup_sockaddr((struct sockaddr *)&sun_noname, 1);
125	}
126	return 0;
127}
128
129static int
130uipc_attach(struct socket *so, int proto, struct proc *p)
131{
132	struct unpcb *unp = sotounpcb(so);
133
134	if (unp != 0)
135		return EISCONN;
136	return unp_attach(so);
137}
138
139static int
140uipc_bind(struct socket *so, struct sockaddr *nam, struct proc *p)
141{
142	struct unpcb *unp = sotounpcb(so);
143
144	if (unp == 0)
145		return EINVAL;
146
147	return unp_bind(unp, nam, p);
148}
149
150static int
151uipc_connect(struct socket *so, struct sockaddr *nam, struct proc *p)
152{
153	struct unpcb *unp = sotounpcb(so);
154
155	if (unp == 0)
156		return EINVAL;
157	return unp_connect(so, nam, curproc);
158}
159
160static int
161uipc_connect2(struct socket *so1, struct socket *so2)
162{
163	struct unpcb *unp = sotounpcb(so1);
164
165	if (unp == 0)
166		return EINVAL;
167
168	return unp_connect2(so1, so2);
169}
170
171/* control is EOPNOTSUPP */
172
173static int
174uipc_detach(struct socket *so)
175{
176	struct unpcb *unp = sotounpcb(so);
177
178	if (unp == 0)
179		return EINVAL;
180
181	unp_detach(unp);
182	return 0;
183}
184
185static int
186uipc_disconnect(struct socket *so)
187{
188	struct unpcb *unp = sotounpcb(so);
189
190	if (unp == 0)
191		return EINVAL;
192	unp_disconnect(unp);
193	return 0;
194}
195
196static int
197uipc_listen(struct socket *so, struct proc *p)
198{
199	struct unpcb *unp = sotounpcb(so);
200
201	if (unp == 0 || unp->unp_vnode == 0)
202		return EINVAL;
203	return unp_listen(unp, p);
204}
205
206static int
207uipc_peeraddr(struct socket *so, struct sockaddr **nam)
208{
209	struct unpcb *unp = sotounpcb(so);
210
211	if (unp == 0)
212		return EINVAL;
213	if (unp->unp_conn && unp->unp_conn->unp_addr)
214		*nam = dup_sockaddr((struct sockaddr *)unp->unp_conn->unp_addr,
215				    1);
216	return 0;
217}
218
219static int
220uipc_rcvd(struct socket *so, int flags)
221{
222	struct unpcb *unp = sotounpcb(so);
223	struct socket *so2;
224	u_long newhiwat;
225
226	if (unp == 0)
227		return EINVAL;
228	switch (so->so_type) {
229	case SOCK_DGRAM:
230		panic("uipc_rcvd DGRAM?");
231		/*NOTREACHED*/
232
233	case SOCK_STREAM:
234		if (unp->unp_conn == 0)
235			break;
236		so2 = unp->unp_conn->unp_socket;
237		/*
238		 * Adjust backpressure on sender
239		 * and wakeup any waiting to write.
240		 */
241		so2->so_snd.sb_mbmax += unp->unp_mbcnt - so->so_rcv.sb_mbcnt;
242		unp->unp_mbcnt = so->so_rcv.sb_mbcnt;
243		newhiwat = so2->so_snd.sb_hiwat + unp->unp_cc -
244		    so->so_rcv.sb_cc;
245		(void)chgsbsize(so2->so_cred->cr_uidinfo, &so2->so_snd.sb_hiwat,
246		    newhiwat, RLIM_INFINITY);
247		unp->unp_cc = so->so_rcv.sb_cc;
248		sowwakeup(so2);
249		break;
250
251	default:
252		panic("uipc_rcvd unknown socktype");
253	}
254	return 0;
255}
256
257/* pru_rcvoob is EOPNOTSUPP */
258
259static int
260uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
261	  struct mbuf *control, struct proc *p)
262{
263	int error = 0;
264	struct unpcb *unp = sotounpcb(so);
265	struct socket *so2;
266	u_long newhiwat;
267
268	if (unp == 0) {
269		error = EINVAL;
270		goto release;
271	}
272	if (flags & PRUS_OOB) {
273		error = EOPNOTSUPP;
274		goto release;
275	}
276
277	if (control && (error = unp_internalize(control, p)))
278		goto release;
279
280	switch (so->so_type) {
281	case SOCK_DGRAM:
282	{
283		struct sockaddr *from;
284
285		if (nam) {
286			if (unp->unp_conn) {
287				error = EISCONN;
288				break;
289			}
290			error = unp_connect(so, nam, p);
291			if (error)
292				break;
293		} else {
294			if (unp->unp_conn == 0) {
295				error = ENOTCONN;
296				break;
297			}
298		}
299		so2 = unp->unp_conn->unp_socket;
300		if (unp->unp_addr)
301			from = (struct sockaddr *)unp->unp_addr;
302		else
303			from = &sun_noname;
304		if (sbappendaddr(&so2->so_rcv, from, m, control)) {
305			sorwakeup(so2);
306			m = 0;
307			control = 0;
308		} else
309			error = ENOBUFS;
310		if (nam)
311			unp_disconnect(unp);
312		break;
313	}
314
315	case SOCK_STREAM:
316		/* Connect if not connected yet. */
317		/*
318		 * Note: A better implementation would complain
319		 * if not equal to the peer's address.
320		 */
321		if ((so->so_state & SS_ISCONNECTED) == 0) {
322			if (nam) {
323				error = unp_connect(so, nam, p);
324				if (error)
325					break;	/* XXX */
326			} else {
327				error = ENOTCONN;
328				break;
329			}
330		}
331
332		if (so->so_state & SS_CANTSENDMORE) {
333			error = EPIPE;
334			break;
335		}
336		if (unp->unp_conn == 0)
337			panic("uipc_send connected but no connection?");
338		so2 = unp->unp_conn->unp_socket;
339		/*
340		 * Send to paired receive port, and then reduce
341		 * send buffer hiwater marks to maintain backpressure.
342		 * Wake up readers.
343		 */
344		if (control) {
345			if (sbappendcontrol(&so2->so_rcv, m, control))
346				control = 0;
347		} else
348			sbappend(&so2->so_rcv, m);
349		so->so_snd.sb_mbmax -=
350			so2->so_rcv.sb_mbcnt - unp->unp_conn->unp_mbcnt;
351		unp->unp_conn->unp_mbcnt = so2->so_rcv.sb_mbcnt;
352		newhiwat = so->so_snd.sb_hiwat -
353		    (so2->so_rcv.sb_cc - unp->unp_conn->unp_cc);
354		(void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_snd.sb_hiwat,
355		    newhiwat, RLIM_INFINITY);
356		unp->unp_conn->unp_cc = so2->so_rcv.sb_cc;
357		sorwakeup(so2);
358		m = 0;
359		break;
360
361	default:
362		panic("uipc_send unknown socktype");
363	}
364
365	/*
366	 * SEND_EOF is equivalent to a SEND followed by
367	 * a SHUTDOWN.
368	 */
369	if (flags & PRUS_EOF) {
370		socantsendmore(so);
371		unp_shutdown(unp);
372	}
373
374	if (control && error != 0)
375		unp_dispose(control);
376
377release:
378	if (control)
379		m_freem(control);
380	if (m)
381		m_freem(m);
382	return error;
383}
384
385static int
386uipc_sense(struct socket *so, struct stat *sb)
387{
388	struct unpcb *unp = sotounpcb(so);
389	struct socket *so2;
390
391	if (unp == 0)
392		return EINVAL;
393	sb->st_blksize = so->so_snd.sb_hiwat;
394	if (so->so_type == SOCK_STREAM && unp->unp_conn != 0) {
395		so2 = unp->unp_conn->unp_socket;
396		sb->st_blksize += so2->so_rcv.sb_cc;
397	}
398	sb->st_dev = NOUDEV;
399	if (unp->unp_ino == 0)
400		unp->unp_ino = unp_ino++;
401	sb->st_ino = unp->unp_ino;
402	return (0);
403}
404
405static int
406uipc_shutdown(struct socket *so)
407{
408	struct unpcb *unp = sotounpcb(so);
409
410	if (unp == 0)
411		return EINVAL;
412	socantsendmore(so);
413	unp_shutdown(unp);
414	return 0;
415}
416
417static int
418uipc_sockaddr(struct socket *so, struct sockaddr **nam)
419{
420	struct unpcb *unp = sotounpcb(so);
421
422	if (unp == 0)
423		return EINVAL;
424	if (unp->unp_addr)
425		*nam = dup_sockaddr((struct sockaddr *)unp->unp_addr, 1);
426	else
427		*nam = dup_sockaddr((struct sockaddr *)&sun_noname, 1);
428	return 0;
429}
430
431struct pr_usrreqs uipc_usrreqs = {
432	uipc_abort, uipc_accept, uipc_attach, uipc_bind, uipc_connect,
433	uipc_connect2, pru_control_notsupp, uipc_detach, uipc_disconnect,
434	uipc_listen, uipc_peeraddr, uipc_rcvd, pru_rcvoob_notsupp,
435	uipc_send, uipc_sense, uipc_shutdown, uipc_sockaddr,
436	sosend, soreceive, sopoll
437};
438
439int
440uipc_ctloutput(so, sopt)
441	struct socket *so;
442	struct sockopt *sopt;
443{
444	struct unpcb *unp = sotounpcb(so);
445	int error;
446
447	switch (sopt->sopt_dir) {
448	case SOPT_GET:
449		switch (sopt->sopt_name) {
450		case LOCAL_PEERCRED:
451			if (unp->unp_flags & UNP_HAVEPC)
452				error = sooptcopyout(sopt, &unp->unp_peercred,
453				    sizeof(unp->unp_peercred));
454			else {
455				if (so->so_type == SOCK_STREAM)
456					error = ENOTCONN;
457				else
458					error = EINVAL;
459			}
460			break;
461		default:
462			error = EOPNOTSUPP;
463			break;
464		}
465		break;
466	case SOPT_SET:
467	default:
468		error = EOPNOTSUPP;
469		break;
470	}
471	return (error);
472}
473
474/*
475 * Both send and receive buffers are allocated PIPSIZ bytes of buffering
476 * for stream sockets, although the total for sender and receiver is
477 * actually only PIPSIZ.
478 * Datagram sockets really use the sendspace as the maximum datagram size,
479 * and don't really want to reserve the sendspace.  Their recvspace should
480 * be large enough for at least one max-size datagram plus address.
481 */
#ifndef PIPSIZ
#define	PIPSIZ	8192
#endif
/* Buffer reservations applied by unp_attach() when the socket has none. */
static u_long	unpst_sendspace = PIPSIZ;
static u_long	unpst_recvspace = PIPSIZ;
static u_long	unpdg_sendspace = 2*1024;	/* really max datagram size */
static u_long	unpdg_recvspace = 4*1024;

static int	unp_rights;			/* file descriptors in flight */

/*
 * Export the tunables above under net.local.{stream,dgram} (read-write)
 * and the in-flight descriptor count under net.local (read-only).
 * NOTE(review): the four space variables are u_long but are exported with
 * SYSCTL_INT -- confirm this is correct where long is wider than int.
 */
SYSCTL_DECL(_net_local_stream);
SYSCTL_INT(_net_local_stream, OID_AUTO, sendspace, CTLFLAG_RW,
	   &unpst_sendspace, 0, "");
SYSCTL_INT(_net_local_stream, OID_AUTO, recvspace, CTLFLAG_RW,
	   &unpst_recvspace, 0, "");
SYSCTL_DECL(_net_local_dgram);
SYSCTL_INT(_net_local_dgram, OID_AUTO, maxdgram, CTLFLAG_RW,
	   &unpdg_sendspace, 0, "");
SYSCTL_INT(_net_local_dgram, OID_AUTO, recvspace, CTLFLAG_RW,
	   &unpdg_recvspace, 0, "");
SYSCTL_DECL(_net_local);
SYSCTL_INT(_net_local, OID_AUTO, inflight, CTLFLAG_RD, &unp_rights, 0, "");
504
505static int
506unp_attach(so)
507	struct socket *so;
508{
509	register struct unpcb *unp;
510	int error;
511
512	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
513		switch (so->so_type) {
514
515		case SOCK_STREAM:
516			error = soreserve(so, unpst_sendspace, unpst_recvspace);
517			break;
518
519		case SOCK_DGRAM:
520			error = soreserve(so, unpdg_sendspace, unpdg_recvspace);
521			break;
522
523		default:
524			panic("unp_attach");
525		}
526		if (error)
527			return (error);
528	}
529	unp = zalloc(unp_zone);
530	if (unp == NULL)
531		return (ENOBUFS);
532	bzero(unp, sizeof *unp);
533	unp->unp_gencnt = ++unp_gencnt;
534	unp_count++;
535	LIST_INIT(&unp->unp_refs);
536	unp->unp_socket = so;
537	unp->unp_rvnode = curproc->p_fd->fd_rdir;
538	LIST_INSERT_HEAD(so->so_type == SOCK_DGRAM ? &unp_dhead
539			 : &unp_shead, unp, unp_link);
540	so->so_pcb = (caddr_t)unp;
541	return (0);
542}
543
544static void
545unp_detach(unp)
546	register struct unpcb *unp;
547{
548	LIST_REMOVE(unp, unp_link);
549	unp->unp_gencnt = ++unp_gencnt;
550	--unp_count;
551	if (unp->unp_vnode) {
552		unp->unp_vnode->v_socket = 0;
553		vrele(unp->unp_vnode);
554		unp->unp_vnode = 0;
555	}
556	if (unp->unp_conn)
557		unp_disconnect(unp);
558	while (!LIST_EMPTY(&unp->unp_refs))
559		unp_drop(LIST_FIRST(&unp->unp_refs), ECONNRESET);
560	soisdisconnected(unp->unp_socket);
561	unp->unp_socket->so_pcb = 0;
562	if (unp_rights) {
563		/*
564		 * Normally the receive buffer is flushed later,
565		 * in sofree, but if our receive buffer holds references
566		 * to descriptors that are now garbage, we will dispose
567		 * of those descriptor references after the garbage collector
568		 * gets them (resulting in a "panic: closef: count < 0").
569		 */
570		sorflush(unp->unp_socket);
571		unp_gc();
572	}
573	if (unp->unp_addr)
574		FREE(unp->unp_addr, M_SONAME);
575	zfree(unp_zone, unp);
576}
577
578static int
579unp_bind(unp, nam, p)
580	struct unpcb *unp;
581	struct sockaddr *nam;
582	struct proc *p;
583{
584	struct sockaddr_un *soun = (struct sockaddr_un *)nam;
585	struct vnode *vp;
586	struct mount *mp;
587	struct vattr vattr;
588	int error, namelen;
589	struct nameidata nd;
590	char *buf;
591
592	if (unp->unp_vnode != NULL)
593		return (EINVAL);
594	namelen = soun->sun_len - offsetof(struct sockaddr_un, sun_path);
595	if (namelen <= 0)
596		return EINVAL;
597	buf = malloc(SOCK_MAXADDRLEN, M_TEMP, M_WAITOK);
598	strncpy(buf, soun->sun_path, namelen);
599	buf[namelen] = 0;	/* null-terminate the string */
600restart:
601	NDINIT(&nd, CREATE, NOFOLLOW | LOCKPARENT, UIO_SYSSPACE,
602	    buf, p);
603/* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */
604	error = namei(&nd);
605	if (error) {
606		return (error);
607		free(buf, M_TEMP);
608	}
609	vp = nd.ni_vp;
610	if (vp != NULL || vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
611		NDFREE(&nd, NDF_ONLY_PNBUF);
612		if (nd.ni_dvp == vp)
613			vrele(nd.ni_dvp);
614		else
615			vput(nd.ni_dvp);
616		if (vp != NULL) {
617			vrele(vp);
618			free(buf, M_TEMP);
619			return (EADDRINUSE);
620		}
621		error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH);
622		if (error) {
623			free(buf, M_TEMP);
624			return (error);
625		}
626		goto restart;
627	}
628	VATTR_NULL(&vattr);
629	vattr.va_type = VSOCK;
630	vattr.va_mode = (ACCESSPERMS & ~p->p_fd->fd_cmask);
631	VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE);
632	error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
633	NDFREE(&nd, NDF_ONLY_PNBUF);
634	vput(nd.ni_dvp);
635	if (error) {
636		free(buf, M_TEMP);
637		return (error);
638	}
639	vp = nd.ni_vp;
640	vp->v_socket = unp->unp_socket;
641	unp->unp_vnode = vp;
642	unp->unp_addr = (struct sockaddr_un *)dup_sockaddr(nam, 1);
643	VOP_UNLOCK(vp, 0, p);
644	vn_finished_write(mp);
645	free(buf, M_TEMP);
646	return (0);
647}
648
/*
 * Connect socket so to the socket bound at the path name in nam.
 *
 * The path is looked up on behalf of p, must name a VSOCK vnode that p
 * may write to, and must have a socket of the same type attached.
 * For connection-oriented protocols the target must be listening; a new
 * socket is spawned from it with sonewconn3(), inherits the listener's
 * bound address, and peer credentials are recorded on both ends before
 * the two sockets are joined by unp_connect2().
 *
 * Returns 0 on success or EINVAL, ENOTSOCK, ECONNREFUSED, EPROTOTYPE,
 * or the error from namei(), VOP_ACCESS() or unp_connect2().
 */
static int
unp_connect(struct socket *so, struct sockaddr *nam, struct proc *p)
{
	struct sockaddr_un *soun;
	struct vnode *vp;
	struct socket *target, *newso;
	struct unpcb *unp, *lunp, *nunp;
	struct nameidata nd;
	char path[SOCK_MAXADDRLEN];
	int error, pathlen;

	soun = (struct sockaddr_un *)nam;
	pathlen = nam->sa_len - offsetof(struct sockaddr_un, sun_path);
	if (pathlen <= 0)
		return (EINVAL);
	strncpy(path, soun->sun_path, pathlen);
	path[pathlen] = 0;

	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, path, p);
	error = namei(&nd);
	if (error)
		return (error);
	vp = nd.ni_vp;
	NDFREE(&nd, NDF_ONLY_PNBUF);

	if (vp->v_type != VSOCK) {
		error = ENOTSOCK;
		goto bad;
	}
	error = VOP_ACCESS(vp, VWRITE, p->p_ucred, p);
	if (error)
		goto bad;
	target = vp->v_socket;
	if (target == NULL) {
		error = ECONNREFUSED;
		goto bad;
	}
	if (so->so_type != target->so_type) {
		error = EPROTOTYPE;
		goto bad;
	}

	if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
		if ((target->so_options & SO_ACCEPTCONN) == 0) {
			error = ECONNREFUSED;
			goto bad;
		}
		newso = sonewconn3(target, 0, p);
		if (newso == NULL) {
			error = ECONNREFUSED;
			goto bad;
		}
		unp = sotounpcb(so);
		lunp = sotounpcb(target);
		nunp = sotounpcb(newso);

		/* The new server-side socket inherits the listener's name. */
		if (lunp->unp_addr != NULL)
			nunp->unp_addr = (struct sockaddr_un *)
			    dup_sockaddr((struct sockaddr *)lunp->unp_addr, 1);

		/*
		 * unp_peercred management:
		 *
		 * The connecter's (client's) credentials are copied
		 * from its process structure at the time of connect()
		 * (which is now).
		 */
		memset(&nunp->unp_peercred, '\0', sizeof(nunp->unp_peercred));
		nunp->unp_peercred.cr_uid = p->p_ucred->cr_uid;
		nunp->unp_peercred.cr_ngroups = p->p_ucred->cr_ngroups;
		memcpy(nunp->unp_peercred.cr_groups, p->p_ucred->cr_groups,
		    sizeof(nunp->unp_peercred.cr_groups));
		nunp->unp_flags |= UNP_HAVEPC;

		/*
		 * The receiver's (server's) credentials are copied
		 * from the unp_peercred member of socket on which the
		 * former called listen(); unp_listen() cached that
		 * process's credentials at that time so we can use
		 * them now.
		 */
		KASSERT(lunp->unp_flags & UNP_HAVEPCCACHED,
		    ("unp_connect: listener without cached peercred"));
		memcpy(&unp->unp_peercred, &lunp->unp_peercred,
		    sizeof(unp->unp_peercred));
		unp->unp_flags |= UNP_HAVEPC;

		/* From here on we connect to the spawned socket. */
		target = newso;
	}
	error = unp_connect2(so, target);
bad:
	vput(vp);
	return (error);
}
738
739int
740unp_connect2(so, so2)
741	register struct socket *so;
742	register struct socket *so2;
743{
744	register struct unpcb *unp = sotounpcb(so);
745	register struct unpcb *unp2;
746
747	if (so2->so_type != so->so_type)
748		return (EPROTOTYPE);
749	unp2 = sotounpcb(so2);
750	unp->unp_conn = unp2;
751	switch (so->so_type) {
752
753	case SOCK_DGRAM:
754		LIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_reflink);
755		soisconnected(so);
756		break;
757
758	case SOCK_STREAM:
759		unp2->unp_conn = unp;
760		soisconnected(so);
761		soisconnected(so2);
762		break;
763
764	default:
765		panic("unp_connect2");
766	}
767	return (0);
768}
769
770static void
771unp_disconnect(unp)
772	struct unpcb *unp;
773{
774	register struct unpcb *unp2 = unp->unp_conn;
775
776	if (unp2 == 0)
777		return;
778	unp->unp_conn = 0;
779	switch (unp->unp_socket->so_type) {
780
781	case SOCK_DGRAM:
782		LIST_REMOVE(unp, unp_reflink);
783		unp->unp_socket->so_state &= ~SS_ISCONNECTED;
784		break;
785
786	case SOCK_STREAM:
787		soisdisconnected(unp->unp_socket);
788		unp2->unp_conn = 0;
789		soisdisconnected(unp2->unp_socket);
790		break;
791	}
792}
793
#ifdef notdef
/*
 * Abort a pcb by simply detaching it.  Only compiled when "notdef" is
 * defined -- presumably never, i.e. this is kept for reference only.
 */
void
unp_abort(unp)
	struct unpcb *unp;
{

	unp_detach(unp);
}
#endif
803
804static int
805prison_unpcb(struct proc *p, struct unpcb *unp)
806{
807	if (!jailed(p->p_ucred))
808		return (0);
809	if (p->p_fd->fd_rdir == unp->unp_rvnode)
810		return (0);
811	return (1);
812}
813
814static int
815unp_pcblist(SYSCTL_HANDLER_ARGS)
816{
817	int error, i, n;
818	struct unpcb *unp, **unp_list;
819	unp_gen_t gencnt;
820	struct xunpgen *xug;
821	struct unp_head *head;
822	struct xunpcb *xu;
823
824	head = ((intptr_t)arg1 == SOCK_DGRAM ? &unp_dhead : &unp_shead);
825
826	/*
827	 * The process of preparing the PCB list is too time-consuming and
828	 * resource-intensive to repeat twice on every request.
829	 */
830	if (req->oldptr == 0) {
831		n = unp_count;
832		req->oldidx = 2 * (sizeof *xug)
833			+ (n + n/8) * sizeof(struct xunpcb);
834		return 0;
835	}
836
837	if (req->newptr != 0)
838		return EPERM;
839
840	/*
841	 * OK, now we're committed to doing something.
842	 */
843	xug = malloc(sizeof(*xug), M_TEMP, M_WAITOK);
844	gencnt = unp_gencnt;
845	n = unp_count;
846
847	xug->xug_len = sizeof *xug;
848	xug->xug_count = n;
849	xug->xug_gen = gencnt;
850	xug->xug_sogen = so_gencnt;
851	error = SYSCTL_OUT(req, xug, sizeof *xug);
852	if (error) {
853		free(xug, M_TEMP);
854		return error;
855	}
856
857	unp_list = malloc(n * sizeof *unp_list, M_TEMP, M_WAITOK);
858	if (unp_list == 0)
859		return ENOMEM;
860
861	for (unp = LIST_FIRST(head), i = 0; unp && i < n;
862	     unp = LIST_NEXT(unp, unp_link)) {
863		if (unp->unp_gencnt <= gencnt && !prison_unpcb(req->p, unp))
864			unp_list[i++] = unp;
865	}
866	n = i;			/* in case we lost some during malloc */
867
868	error = 0;
869	xu = malloc(sizeof(*xu), M_TEMP, M_WAITOK);
870	for (i = 0; i < n; i++) {
871		unp = unp_list[i];
872		if (unp->unp_gencnt <= gencnt) {
873			xu->xu_len = sizeof *xu;
874			xu->xu_unpp = unp;
875			/*
876			 * XXX - need more locking here to protect against
877			 * connect/disconnect races for SMP.
878			 */
879			if (unp->unp_addr)
880				bcopy(unp->unp_addr, &xu->xu_addr,
881				      unp->unp_addr->sun_len);
882			if (unp->unp_conn && unp->unp_conn->unp_addr)
883				bcopy(unp->unp_conn->unp_addr,
884				      &xu->xu_caddr,
885				      unp->unp_conn->unp_addr->sun_len);
886			bcopy(unp, &xu->xu_unp, sizeof *unp);
887			sotoxsocket(unp->unp_socket, &xu->xu_socket);
888			error = SYSCTL_OUT(req, xu, sizeof *xu);
889		}
890	}
891	free(xu, M_TEMP);
892	if (!error) {
893		/*
894		 * Give the user an updated idea of our state.
895		 * If the generation differs from what we told
896		 * her before, she knows that something happened
897		 * while we were processing this request, and it
898		 * might be necessary to retry.
899		 */
900		xug->xug_gen = unp_gencnt;
901		xug->xug_sogen = so_gencnt;
902		xug->xug_count = unp_count;
903		error = SYSCTL_OUT(req, xug, sizeof *xug);
904	}
905	free(unp_list, M_TEMP);
906	free(xug, M_TEMP);
907	return error;
908}
909
/*
 * Read-only "pcblist" nodes under net.local.dgram and net.local.stream.
 * Both are served by unp_pcblist(); the socket type is passed as arg1
 * and selects which pcb list is reported.
 */
SYSCTL_PROC(_net_local_dgram, OID_AUTO, pcblist, CTLFLAG_RD,
	    (caddr_t)(long)SOCK_DGRAM, 0, unp_pcblist, "S,xunpcb",
	    "List of active local datagram sockets");
SYSCTL_PROC(_net_local_stream, OID_AUTO, pcblist, CTLFLAG_RD,
	    (caddr_t)(long)SOCK_STREAM, 0, unp_pcblist, "S,xunpcb",
	    "List of active local stream sockets");
916
917static void
918unp_shutdown(unp)
919	struct unpcb *unp;
920{
921	struct socket *so;
922
923	if (unp->unp_socket->so_type == SOCK_STREAM && unp->unp_conn &&
924	    (so = unp->unp_conn->unp_socket))
925		socantrcvmore(so);
926}
927
928static void
929unp_drop(unp, errno)
930	struct unpcb *unp;
931	int errno;
932{
933	struct socket *so = unp->unp_socket;
934
935	so->so_error = errno;
936	unp_disconnect(unp);
937	if (so->so_head) {
938		LIST_REMOVE(unp, unp_link);
939		unp->unp_gencnt = ++unp_gencnt;
940		unp_count--;
941		so->so_pcb = (caddr_t) 0;
942		if (unp->unp_addr)
943			FREE(unp->unp_addr, M_SONAME);
944		zfree(unp_zone, unp);
945		sofree(so);
946	}
947}
948
#ifdef notdef
/*
 * Empty drain hook.  Only compiled when "notdef" is defined --
 * presumably never, i.e. this is a placeholder.
 */
void
unp_drain()
{

}
#endif
956
957int
958unp_externalize(rights)
959	struct mbuf *rights;
960{
961	struct proc *p = curproc;		/* XXX */
962	register int i;
963	register struct cmsghdr *cm = mtod(rights, struct cmsghdr *);
964	register int *fdp;
965	register struct file **rp;
966	register struct file *fp;
967	int newfds = (cm->cmsg_len - (CMSG_DATA(cm) - (u_char *)cm))
968		/ sizeof (struct file *);
969	int f;
970
971	/*
972	 * if the new FD's will not fit, then we free them all
973	 */
974	if (!fdavail(p, newfds)) {
975		rp = (struct file **)CMSG_DATA(cm);
976		for (i = 0; i < newfds; i++) {
977			fp = *rp;
978			/*
979			 * zero the pointer before calling unp_discard,
980			 * since it may end up in unp_gc()..
981			 */
982			*rp++ = 0;
983			unp_discard(fp);
984		}
985		return (EMSGSIZE);
986	}
987	/*
988	 * now change each pointer to an fd in the global table to
989	 * an integer that is the index to the local fd table entry
990	 * that we set up to point to the global one we are transferring.
991	 * If sizeof (struct file *) is bigger than or equal to sizeof int,
992	 * then do it in forward order. In that case, an integer will
993	 * always come in the same place or before its corresponding
994	 * struct file pointer.
995	 * If sizeof (struct file *) is smaller than sizeof int, then
996	 * do it in reverse order.
997	 */
998	if (sizeof (struct file *) >= sizeof (int)) {
999		fdp = (int *)(cm + 1);
1000		rp = (struct file **)CMSG_DATA(cm);
1001		for (i = 0; i < newfds; i++) {
1002			if (fdalloc(p, 0, &f))
1003				panic("unp_externalize");
1004			fp = *rp++;
1005			p->p_fd->fd_ofiles[f] = fp;
1006			fp->f_msgcount--;
1007			unp_rights--;
1008			*fdp++ = f;
1009		}
1010	} else {
1011		fdp = (int *)(cm + 1) + newfds - 1;
1012		rp = (struct file **)CMSG_DATA(cm) + newfds - 1;
1013		for (i = 0; i < newfds; i++) {
1014			if (fdalloc(p, 0, &f))
1015				panic("unp_externalize");
1016			fp = *rp--;
1017			p->p_fd->fd_ofiles[f] = fp;
1018			fp->f_msgcount--;
1019			unp_rights--;
1020			*fdp-- = f;
1021		}
1022	}
1023
1024	/*
1025	 * Adjust length, in case sizeof(struct file *) and sizeof(int)
1026	 * differs.
1027	 */
1028	cm->cmsg_len = CMSG_LEN(newfds * sizeof(int));
1029	rights->m_len = cm->cmsg_len;
1030	return (0);
1031}
1032
1033void
1034unp_init(void)
1035{
1036	unp_zone = zinit("unpcb", sizeof(struct unpcb), nmbclusters, 0, 0);
1037	if (unp_zone == 0)
1038		panic("unp_init");
1039	LIST_INIT(&unp_dhead);
1040	LIST_INIT(&unp_shead);
1041}
1042
/* Smaller of two values; note that each argument may be evaluated twice. */
#if !defined(MIN)
#define	MIN(a, b)	((a) < (b) ? (a) : (b))
#endif
1046
1047static int
1048unp_internalize(control, p)
1049	struct mbuf *control;
1050	struct proc *p;
1051{
1052	struct filedesc *fdescp = p->p_fd;
1053	register struct cmsghdr *cm = mtod(control, struct cmsghdr *);
1054	register struct file **rp;
1055	register struct file *fp;
1056	register int i, fd, *fdp;
1057	register struct cmsgcred *cmcred;
1058	int oldfds;
1059	u_int newlen;
1060
1061	if ((cm->cmsg_type != SCM_RIGHTS && cm->cmsg_type != SCM_CREDS) ||
1062	    cm->cmsg_level != SOL_SOCKET || cm->cmsg_len != control->m_len)
1063		return (EINVAL);
1064
1065	/*
1066	 * Fill in credential information.
1067	 */
1068	if (cm->cmsg_type == SCM_CREDS) {
1069		cmcred = (struct cmsgcred *)(cm + 1);
1070		cmcred->cmcred_pid = p->p_pid;
1071		cmcred->cmcred_uid = p->p_ucred->cr_ruid;
1072		cmcred->cmcred_gid = p->p_ucred->cr_rgid;
1073		cmcred->cmcred_euid = p->p_ucred->cr_uid;
1074		cmcred->cmcred_ngroups = MIN(p->p_ucred->cr_ngroups,
1075							CMGROUP_MAX);
1076		for (i = 0; i < cmcred->cmcred_ngroups; i++)
1077			cmcred->cmcred_groups[i] = p->p_ucred->cr_groups[i];
1078		return(0);
1079	}
1080
1081	oldfds = (cm->cmsg_len - sizeof (*cm)) / sizeof (int);
1082	/*
1083	 * check that all the FDs passed in refer to legal OPEN files
1084	 * If not, reject the entire operation.
1085	 */
1086	fdp = (int *)(cm + 1);
1087	for (i = 0; i < oldfds; i++) {
1088		fd = *fdp++;
1089		if ((unsigned)fd >= fdescp->fd_nfiles ||
1090		    fdescp->fd_ofiles[fd] == NULL)
1091			return (EBADF);
1092	}
1093	/*
1094	 * Now replace the integer FDs with pointers to
1095	 * the associated global file table entry..
1096	 * Allocate a bigger buffer as necessary. But if an cluster is not
1097	 * enough, return E2BIG.
1098	 */
1099	newlen = CMSG_LEN(oldfds * sizeof(struct file *));
1100	if (newlen > MCLBYTES)
1101		return (E2BIG);
1102	if (newlen - control->m_len > M_TRAILINGSPACE(control)) {
1103		if (control->m_flags & M_EXT)
1104			return (E2BIG);
1105		MCLGET(control, M_TRYWAIT);
1106		if ((control->m_flags & M_EXT) == 0)
1107			return (ENOBUFS);
1108
1109		/* copy the data to the cluster */
1110		memcpy(mtod(control, char *), cm, cm->cmsg_len);
1111		cm = mtod(control, struct cmsghdr *);
1112	}
1113
1114	/*
1115	 * Adjust length, in case sizeof(struct file *) and sizeof(int)
1116	 * differs.
1117	 */
1118	control->m_len = cm->cmsg_len = newlen;
1119
1120	/*
1121	 * Transform the file descriptors into struct file pointers.
1122	 * If sizeof (struct file *) is bigger than or equal to sizeof int,
1123	 * then do it in reverse order so that the int won't get until
1124	 * we're done.
1125	 * If sizeof (struct file *) is smaller than sizeof int, then
1126	 * do it in forward order.
1127	 */
1128	if (sizeof (struct file *) >= sizeof (int)) {
1129		fdp = (int *)(cm + 1) + oldfds - 1;
1130		rp = (struct file **)CMSG_DATA(cm) + oldfds - 1;
1131		for (i = 0; i < oldfds; i++) {
1132			fp = fdescp->fd_ofiles[*fdp--];
1133			*rp-- = fp;
1134			fp->f_count++;
1135			fp->f_msgcount++;
1136			unp_rights++;
1137		}
1138	} else {
1139		fdp = (int *)(cm + 1);
1140		rp = (struct file **)CMSG_DATA(cm);
1141		for (i = 0; i < oldfds; i++) {
1142			fp = fdescp->fd_ofiles[*fdp++];
1143			*rp++ = fp;
1144			fp->f_count++;
1145			fp->f_msgcount++;
1146			unp_rights++;
1147		}
1148	}
1149	return (0);
1150}
1151
1152static int	unp_defer, unp_gcing;
1153
1154static void
1155unp_gc()
1156{
1157	register struct file *fp, *nextfp;
1158	register struct socket *so;
1159	struct file **extra_ref, **fpp;
1160	int nunref, i;
1161
1162	if (unp_gcing)
1163		return;
1164	unp_gcing = 1;
1165	unp_defer = 0;
1166	/*
1167	 * before going through all this, set all FDs to
1168	 * be NOT defered and NOT externally accessible
1169	 */
1170	LIST_FOREACH(fp, &filehead, f_list)
1171		fp->f_flag &= ~(FMARK|FDEFER);
1172	do {
1173		LIST_FOREACH(fp, &filehead, f_list) {
1174			/*
1175			 * If the file is not open, skip it
1176			 */
1177			if (fp->f_count == 0)
1178				continue;
1179			/*
1180			 * If we already marked it as 'defer'  in a
1181			 * previous pass, then try process it this time
1182			 * and un-mark it
1183			 */
1184			if (fp->f_flag & FDEFER) {
1185				fp->f_flag &= ~FDEFER;
1186				unp_defer--;
1187			} else {
1188				/*
1189				 * if it's not defered, then check if it's
1190				 * already marked.. if so skip it
1191				 */
1192				if (fp->f_flag & FMARK)
1193					continue;
1194				/*
1195				 * If all references are from messages
1196				 * in transit, then skip it. it's not
1197				 * externally accessible.
1198				 */
1199				if (fp->f_count == fp->f_msgcount)
1200					continue;
1201				/*
1202				 * If it got this far then it must be
1203				 * externally accessible.
1204				 */
1205				fp->f_flag |= FMARK;
1206			}
1207			/*
1208			 * either it was defered, or it is externally
1209			 * accessible and not already marked so.
1210			 * Now check if it is possibly one of OUR sockets.
1211			 */
1212			if (fp->f_type != DTYPE_SOCKET ||
1213			    (so = (struct socket *)fp->f_data) == 0)
1214				continue;
1215			if (so->so_proto->pr_domain != &localdomain ||
1216			    (so->so_proto->pr_flags&PR_RIGHTS) == 0)
1217				continue;
1218#ifdef notdef
1219			if (so->so_rcv.sb_flags & SB_LOCK) {
1220				/*
1221				 * This is problematical; it's not clear
1222				 * we need to wait for the sockbuf to be
1223				 * unlocked (on a uniprocessor, at least),
1224				 * and it's also not clear what to do
1225				 * if sbwait returns an error due to receipt
1226				 * of a signal.  If sbwait does return
1227				 * an error, we'll go into an infinite
1228				 * loop.  Delete all of this for now.
1229				 */
1230				(void) sbwait(&so->so_rcv);
1231				goto restart;
1232			}
1233#endif
1234			/*
1235			 * So, Ok, it's one of our sockets and it IS externally
1236			 * accessible (or was defered). Now we look
1237			 * to see if we hold any file descriptors in its
1238			 * message buffers. Follow those links and mark them
1239			 * as accessible too.
1240			 */
1241			unp_scan(so->so_rcv.sb_mb, unp_mark);
1242		}
1243	} while (unp_defer);
1244	/*
1245	 * We grab an extra reference to each of the file table entries
1246	 * that are not otherwise accessible and then free the rights
1247	 * that are stored in messages on them.
1248	 *
1249	 * The bug in the orginal code is a little tricky, so I'll describe
1250	 * what's wrong with it here.
1251	 *
1252	 * It is incorrect to simply unp_discard each entry for f_msgcount
1253	 * times -- consider the case of sockets A and B that contain
1254	 * references to each other.  On a last close of some other socket,
1255	 * we trigger a gc since the number of outstanding rights (unp_rights)
1256	 * is non-zero.  If during the sweep phase the gc code un_discards,
1257	 * we end up doing a (full) closef on the descriptor.  A closef on A
1258	 * results in the following chain.  Closef calls soo_close, which
1259	 * calls soclose.   Soclose calls first (through the switch
1260	 * uipc_usrreq) unp_detach, which re-invokes unp_gc.  Unp_gc simply
1261	 * returns because the previous instance had set unp_gcing, and
1262	 * we return all the way back to soclose, which marks the socket
1263	 * with SS_NOFDREF, and then calls sofree.  Sofree calls sorflush
1264	 * to free up the rights that are queued in messages on the socket A,
1265	 * i.e., the reference on B.  The sorflush calls via the dom_dispose
1266	 * switch unp_dispose, which unp_scans with unp_discard.  This second
1267	 * instance of unp_discard just calls closef on B.
1268	 *
1269	 * Well, a similar chain occurs on B, resulting in a sorflush on B,
1270	 * which results in another closef on A.  Unfortunately, A is already
1271	 * being closed, and the descriptor has already been marked with
1272	 * SS_NOFDREF, and soclose panics at this point.
1273	 *
1274	 * Here, we first take an extra reference to each inaccessible
1275	 * descriptor.  Then, we call sorflush ourself, since we know
1276	 * it is a Unix domain socket anyhow.  After we destroy all the
1277	 * rights carried in messages, we do a last closef to get rid
1278	 * of our extra reference.  This is the last close, and the
1279	 * unp_detach etc will shut down the socket.
1280	 *
1281	 * 91/09/19, bsy@cs.cmu.edu
1282	 */
1283	extra_ref = malloc(nfiles * sizeof(struct file *), M_FILE, M_WAITOK);
1284	for (nunref = 0, fp = LIST_FIRST(&filehead), fpp = extra_ref; fp != 0;
1285	    fp = nextfp) {
1286		nextfp = LIST_NEXT(fp, f_list);
1287		/*
1288		 * If it's not open, skip it
1289		 */
1290		if (fp->f_count == 0)
1291			continue;
1292		/*
1293		 * If all refs are from msgs, and it's not marked accessible
1294		 * then it must be referenced from some unreachable cycle
1295		 * of (shut-down) FDs, so include it in our
1296		 * list of FDs to remove
1297		 */
1298		if (fp->f_count == fp->f_msgcount && !(fp->f_flag & FMARK)) {
1299			*fpp++ = fp;
1300			nunref++;
1301			fp->f_count++;
1302		}
1303	}
1304	/*
1305	 * for each FD on our hit list, do the following two things
1306	 */
1307	for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) {
1308		struct file *tfp = *fpp;
1309		if (tfp->f_type == DTYPE_SOCKET && tfp->f_data != NULL)
1310			sorflush((struct socket *)(tfp->f_data));
1311	}
1312	for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp)
1313		closef(*fpp, (struct proc *) NULL);
1314	free((caddr_t)extra_ref, M_FILE);
1315	unp_gcing = 0;
1316}
1317
1318void
1319unp_dispose(m)
1320	struct mbuf *m;
1321{
1322
1323	if (m)
1324		unp_scan(m, unp_discard);
1325}
1326
1327static int
1328unp_listen(unp, p)
1329	struct unpcb *unp;
1330	struct proc *p;
1331{
1332
1333	bzero(&unp->unp_peercred, sizeof(unp->unp_peercred));
1334	unp->unp_peercred.cr_uid = p->p_ucred->cr_uid;
1335	unp->unp_peercred.cr_ngroups = p->p_ucred->cr_ngroups;
1336	bcopy(p->p_ucred->cr_groups, unp->unp_peercred.cr_groups,
1337	    sizeof(unp->unp_peercred.cr_groups));
1338	unp->unp_flags |= UNP_HAVEPCCACHED;
1339	return (0);
1340}
1341
1342static void
1343unp_scan(m0, op)
1344	register struct mbuf *m0;
1345	void (*op) __P((struct file *));
1346{
1347	register struct mbuf *m;
1348	register struct file **rp;
1349	register struct cmsghdr *cm;
1350	register int i;
1351	int qfds;
1352
1353	while (m0) {
1354		for (m = m0; m; m = m->m_next)
1355			if (m->m_type == MT_CONTROL &&
1356			    m->m_len >= sizeof(*cm)) {
1357				cm = mtod(m, struct cmsghdr *);
1358				if (cm->cmsg_level != SOL_SOCKET ||
1359				    cm->cmsg_type != SCM_RIGHTS)
1360					continue;
1361				qfds = (cm->cmsg_len -
1362					(CMSG_DATA(cm) - (u_char *)cm))
1363						/ sizeof (struct file *);
1364				rp = (struct file **)CMSG_DATA(cm);
1365				for (i = 0; i < qfds; i++)
1366					(*op)(*rp++);
1367				break;		/* XXX, but saves time */
1368			}
1369		m0 = m0->m_act;
1370	}
1371}
1372
1373static void
1374unp_mark(fp)
1375	struct file *fp;
1376{
1377
1378	if (fp->f_flag & FMARK)
1379		return;
1380	unp_defer++;
1381	fp->f_flag |= (FMARK|FDEFER);
1382}
1383
1384static void
1385unp_discard(fp)
1386	struct file *fp;
1387{
1388
1389	fp->f_msgcount--;
1390	unp_rights--;
1391	(void) closef(fp, (struct proc *)NULL);
1392}
1393