kern_jail.c revision 298832
1/*-
2 * Copyright (c) 1999 Poul-Henning Kamp.
3 * Copyright (c) 2008 Bjoern A. Zeeb.
4 * Copyright (c) 2009 James Gritton.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29#include <sys/cdefs.h>
30__FBSDID("$FreeBSD: stable/10/sys/kern/kern_jail.c 298832 2016-04-30 03:05:36Z jamie $");
31
32#include "opt_compat.h"
33#include "opt_ddb.h"
34#include "opt_inet.h"
35#include "opt_inet6.h"
36
37#include <sys/param.h>
38#include <sys/types.h>
39#include <sys/kernel.h>
40#include <sys/systm.h>
41#include <sys/errno.h>
42#include <sys/sysproto.h>
43#include <sys/malloc.h>
44#include <sys/osd.h>
45#include <sys/priv.h>
46#include <sys/proc.h>
47#include <sys/taskqueue.h>
48#include <sys/fcntl.h>
49#include <sys/jail.h>
50#include <sys/lock.h>
51#include <sys/mutex.h>
52#include <sys/racct.h>
53#include <sys/refcount.h>
54#include <sys/sx.h>
55#include <sys/sysent.h>
56#include <sys/namei.h>
57#include <sys/mount.h>
58#include <sys/queue.h>
59#include <sys/socket.h>
60#include <sys/syscallsubr.h>
61#include <sys/sysctl.h>
62#include <sys/vnode.h>
63
64#include <net/if.h>
65#include <net/vnet.h>
66
67#include <netinet/in.h>
68
69#ifdef DDB
70#include <ddb/ddb.h>
71#ifdef INET6
72#include <netinet6/in6_var.h>
73#endif /* INET6 */
74#endif /* DDB */
75
76#include <security/mac/mac_framework.h>
77
78#define	DEFAULT_HOSTUUID	"00000000-0000-0000-0000-000000000000"
79
80MALLOC_DEFINE(M_PRISON, "prison", "Prison structures");
81static MALLOC_DEFINE(M_PRISON_RACCT, "prison_racct", "Prison racct structures");
82
/*
 * Source address selection flags for the address families compiled into
 * this kernel.  Keeps struct prison prison0 and some code in
 * kern_jail_set() readable.
 *
 * The replacement list is parenthesized so the macro stays a single
 * operand when it is combined with operators that bind tighter than '|'
 * (for example '&').
 */
#ifdef INET
#ifdef INET6
#define	_PR_IP_SADDRSEL	(PR_IP4_SADDRSEL|PR_IP6_SADDRSEL)
#else
#define	_PR_IP_SADDRSEL	(PR_IP4_SADDRSEL)
#endif
#else /* !INET */
#ifdef INET6
#define	_PR_IP_SADDRSEL	(PR_IP6_SADDRSEL)
#else
#define	_PR_IP_SADDRSEL	0
#endif
#endif
97
98/* prison0 describes what is "real" about the system. */
99struct prison prison0 = {
100	.pr_id		= 0,
101	.pr_name	= "0",
102	.pr_ref		= 1,
103	.pr_uref	= 1,
104	.pr_path	= "/",
105	.pr_securelevel	= -1,
106	.pr_devfs_rsnum = 0,
107	.pr_childmax	= JAIL_MAX,
108	.pr_hostuuid	= DEFAULT_HOSTUUID,
109	.pr_children	= LIST_HEAD_INITIALIZER(prison0.pr_children),
110#ifdef VIMAGE
111	.pr_flags	= PR_HOST|PR_VNET|_PR_IP_SADDRSEL,
112#else
113	.pr_flags	= PR_HOST|_PR_IP_SADDRSEL,
114#endif
115	.pr_allow	= PR_ALLOW_ALL,
116};
117MTX_SYSINIT(prison0, &prison0.pr_mtx, "jail mutex", MTX_DEF);
118
119/* allprison, allprison_racct and lastprid are protected by allprison_lock. */
120struct	sx allprison_lock;
121SX_SYSINIT(allprison_lock, &allprison_lock, "allprison");
122struct	prisonlist allprison = TAILQ_HEAD_INITIALIZER(allprison);
123LIST_HEAD(, prison_racct) allprison_racct;
124int	lastprid = 0;
125
/* Forward declarations of file-local helpers. */
static int do_jail_attach(struct thread *td, struct prison *pr);
static void prison_complete(void *context, int pending);
static void prison_deref(struct prison *pr, int flags);
static char *prison_path(struct prison *pr1, struct prison *pr2);
static void prison_remove_one(struct prison *pr);
#ifdef RACCT
/* Resource accounting hooks, present only in RACCT kernels. */
static void prison_racct_attach(struct prison *pr);
static void prison_racct_modify(struct prison *pr);
static void prison_racct_detach(struct prison *pr);
#endif
#ifdef INET
static int _prison_check_ip4(struct prison *pr, struct in_addr *ia);
static int prison_restrict_ip4(struct prison *pr, struct in_addr *newip4);
#endif
#ifdef INET6
static int _prison_check_ip6(struct prison *pr, struct in6_addr *ia6);
static int prison_restrict_ip6(struct prison *pr, struct in6_addr *newip6);
#endif
144
/*
 * Flags for prison_deref.  Each value is a distinct bit so that several
 * flags can be or'ed together in one call.
 */
#define	PD_DEREF	0x01
#define	PD_DEUREF	0x02
#define	PD_LOCKED	0x04
#define	PD_LIST_SLOCKED	0x08
#define	PD_LIST_XLOCKED	0x10
151
/*
 * Parameter names corresponding to PR_* flag values.  The array index is
 * the bit number of the flag, so the array is sparse and unnamed bits are
 * NULL.  Size values are for kvm as we cannot figure out the size of a
 * sparse array, or an array without a terminating entry.
 */
static char *pr_flag_names[] = {
	[0] = "persist",
#ifdef INET
	[7] = "ip4.saddrsel",
#endif
#ifdef INET6
	[8] = "ip6.saddrsel",
#endif
};
const size_t pr_flag_names_size = sizeof(pr_flag_names);
167
/*
 * Negated ("no") spellings of the flag parameters, indexed by the same
 * bit numbers as pr_flag_names.  The byte size is exported alongside.
 */
static char *pr_flag_nonames[] = {
	[0] = "nopersist",
#ifdef INET
	[7] = "ip4.nosaddrsel",
#endif
#ifdef INET6
	[8] = "ip6.nosaddrsel",
#endif
};
const size_t pr_flag_nonames_size = sizeof(pr_flag_nonames);
178
179struct jailsys_flags {
180	const char	*name;
181	unsigned	 disable;
182	unsigned	 new;
183} pr_flag_jailsys[] = {
184	{ "host", 0, PR_HOST },
185#ifdef VIMAGE
186	{ "vnet", 0, PR_VNET },
187#endif
188#ifdef INET
189	{ "ip4", PR_IP4_USER | PR_IP4_DISABLE, PR_IP4_USER },
190#endif
191#ifdef INET6
192	{ "ip6", PR_IP6_USER | PR_IP6_DISABLE, PR_IP6_USER },
193#endif
194};
195const size_t pr_flag_jailsys_size = sizeof(pr_flag_jailsys);
196
/*
 * Names of the allow.* permission parameters.  Entry N names permission
 * bit (1 << N); the order must stay in step with pr_allow_nonames.
 */
static char *pr_allow_names[] = {
	"allow.set_hostname",
	"allow.sysvipc",
	"allow.raw_sockets",
	"allow.chflags",
	"allow.mount",
	"allow.quotas",
	"allow.socket_af",
	"allow.mount.devfs",
	"allow.mount.nullfs",
	"allow.mount.zfs",
	"allow.mount.procfs",
	"allow.mount.tmpfs",
	"allow.mount.fdescfs",
	"allow.mount.linprocfs",
	"allow.mount.linsysfs",
};
const size_t pr_allow_names_size = sizeof(pr_allow_names);
215
/*
 * Negated spellings of the allow.* permission parameters.  Entry N is the
 * "no" form of entry N in pr_allow_names and refers to the same bit.
 */
static char *pr_allow_nonames[] = {
	"allow.noset_hostname",
	"allow.nosysvipc",
	"allow.noraw_sockets",
	"allow.nochflags",
	"allow.nomount",
	"allow.noquotas",
	"allow.nosocket_af",
	"allow.mount.nodevfs",
	"allow.mount.nonullfs",
	"allow.mount.nozfs",
	"allow.mount.noprocfs",
	"allow.mount.notmpfs",
	"allow.mount.nofdescfs",
	"allow.mount.nolinprocfs",
	"allow.mount.nolinsysfs",
};
const size_t pr_allow_nonames_size = sizeof(pr_allow_nonames);
234
235#define	JAIL_DEFAULT_ALLOW		PR_ALLOW_SET_HOSTNAME
236#define	JAIL_DEFAULT_ENFORCE_STATFS	2
237#define	JAIL_DEFAULT_DEVFS_RSNUM	0
238static unsigned jail_default_allow = JAIL_DEFAULT_ALLOW;
239static int jail_default_enforce_statfs = JAIL_DEFAULT_ENFORCE_STATFS;
240static int jail_default_devfs_rsnum = JAIL_DEFAULT_DEVFS_RSNUM;
241#if defined(INET) || defined(INET6)
242static unsigned jail_max_af_ips = 255;
243#endif
244
245/*
246 * Initialize the parts of prison0 that can't be static-initialized with
247 * constants.  This is called from proc0_init() after creating thread0 cpuset.
248 */
249void
250prison0_init(void)
251{
252
253	prison0.pr_cpuset = cpuset_ref(thread0.td_cpuset);
254	prison0.pr_osreldate = osreldate;
255	strlcpy(prison0.pr_osrelease, osrelease, sizeof(prison0.pr_osrelease));
256}
257
258#ifdef INET
/*
 * qsort(3) comparison function for arrays of struct in_addr.
 *
 * ip1 and ip2 point at struct in_addr elements holding addresses in
 * network byte order.  Returns -1, 0 or 1 when *ip1 is lower than, equal
 * to or higher than *ip2.
 */
static int
qcmp_v4(const void *ip1, const void *ip2)
{
	in_addr_t iaa, iab;

	/*
	 * We need to compare in HBO here to get the list sorted as expected
	 * by the result of the code.  Sorting NBO addresses gives you
	 * interesting results.  If you do not understand, do not try.
	 */
	iaa = ntohl(((const struct in_addr *)ip1)->s_addr);
	iab = ntohl(((const struct in_addr *)ip2)->s_addr);

	/*
	 * Do not simply return the difference of the two numbers, the int is
	 * not wide enough.  Each comparison yields 0 or 1, so the expression
	 * below evaluates to exactly -1, 0 or 1.
	 */
	return ((iaa > iab) - (iaa < iab));
}
283#endif
284
285#ifdef INET6
/*
 * qsort(3) comparison function for arrays of struct in6_addr.
 *
 * The addresses are compared byte by byte from the first byte; the first
 * differing byte decides.  Returns -1, 0 or 1 when *ip1 is lower than,
 * equal to or higher than *ip2.
 */
static int
qcmp_v6(const void *ip1, const void *ip2)
{
	const struct in6_addr *ia6a, *ia6b;
	size_t i;

	ia6a = (const struct in6_addr *)ip1;
	ia6b = (const struct in6_addr *)ip2;

	/* size_t index: no signed/unsigned mix against sizeof. */
	for (i = 0; i < sizeof(struct in6_addr); i++) {
		if (ia6a->s6_addr[i] > ia6b->s6_addr[i])
			return (1);
		if (ia6a->s6_addr[i] < ia6b->s6_addr[i])
			return (-1);
	}
	return (0);
}
304#endif
305
306/*
307 * struct jail_args {
308 *	struct jail *jail;
309 * };
310 */
311int
312sys_jail(struct thread *td, struct jail_args *uap)
313{
314	uint32_t version;
315	int error;
316	struct jail j;
317
318	error = copyin(uap->jail, &version, sizeof(uint32_t));
319	if (error)
320		return (error);
321
322	switch (version) {
323	case 0:
324	{
325		struct jail_v0 j0;
326
327		/* FreeBSD single IPv4 jails. */
328		bzero(&j, sizeof(struct jail));
329		error = copyin(uap->jail, &j0, sizeof(struct jail_v0));
330		if (error)
331			return (error);
332		j.version = j0.version;
333		j.path = j0.path;
334		j.hostname = j0.hostname;
335		j.ip4s = htonl(j0.ip_number);	/* jail_v0 is host order */
336		break;
337	}
338
339	case 1:
340		/*
341		 * Version 1 was used by multi-IPv4 jail implementations
342		 * that never made it into the official kernel.
343		 */
344		return (EINVAL);
345
346	case 2:	/* JAIL_API_VERSION */
347		/* FreeBSD multi-IPv4/IPv6,noIP jails. */
348		error = copyin(uap->jail, &j, sizeof(struct jail));
349		if (error)
350			return (error);
351		break;
352
353	default:
354		/* Sci-Fi jails are not supported, sorry. */
355		return (EINVAL);
356	}
357	return (kern_jail(td, &j));
358}
359
360int
361kern_jail(struct thread *td, struct jail *j)
362{
363	struct iovec optiov[2 * (4
364			    + sizeof(pr_allow_names) / sizeof(pr_allow_names[0])
365#ifdef INET
366			    + 1
367#endif
368#ifdef INET6
369			    + 1
370#endif
371			    )];
372	struct uio opt;
373	char *u_path, *u_hostname, *u_name;
374#ifdef INET
375	uint32_t ip4s;
376	struct in_addr *u_ip4;
377#endif
378#ifdef INET6
379	struct in6_addr *u_ip6;
380#endif
381	size_t tmplen;
382	int error, enforce_statfs, fi;
383
384	bzero(&optiov, sizeof(optiov));
385	opt.uio_iov = optiov;
386	opt.uio_iovcnt = 0;
387	opt.uio_offset = -1;
388	opt.uio_resid = -1;
389	opt.uio_segflg = UIO_SYSSPACE;
390	opt.uio_rw = UIO_READ;
391	opt.uio_td = td;
392
393	/* Set permissions for top-level jails from sysctls. */
394	if (!jailed(td->td_ucred)) {
395		for (fi = 0; fi < sizeof(pr_allow_names) /
396		     sizeof(pr_allow_names[0]); fi++) {
397			optiov[opt.uio_iovcnt].iov_base =
398			    (jail_default_allow & (1 << fi))
399			    ? pr_allow_names[fi] : pr_allow_nonames[fi];
400			optiov[opt.uio_iovcnt].iov_len =
401			    strlen(optiov[opt.uio_iovcnt].iov_base) + 1;
402			opt.uio_iovcnt += 2;
403		}
404		optiov[opt.uio_iovcnt].iov_base = "enforce_statfs";
405		optiov[opt.uio_iovcnt].iov_len = sizeof("enforce_statfs");
406		opt.uio_iovcnt++;
407		enforce_statfs = jail_default_enforce_statfs;
408		optiov[opt.uio_iovcnt].iov_base = &enforce_statfs;
409		optiov[opt.uio_iovcnt].iov_len = sizeof(enforce_statfs);
410		opt.uio_iovcnt++;
411	}
412
413	tmplen = MAXPATHLEN + MAXHOSTNAMELEN + MAXHOSTNAMELEN;
414#ifdef INET
415	ip4s = (j->version == 0) ? 1 : j->ip4s;
416	if (ip4s > jail_max_af_ips)
417		return (EINVAL);
418	tmplen += ip4s * sizeof(struct in_addr);
419#else
420	if (j->ip4s > 0)
421		return (EINVAL);
422#endif
423#ifdef INET6
424	if (j->ip6s > jail_max_af_ips)
425		return (EINVAL);
426	tmplen += j->ip6s * sizeof(struct in6_addr);
427#else
428	if (j->ip6s > 0)
429		return (EINVAL);
430#endif
431	u_path = malloc(tmplen, M_TEMP, M_WAITOK);
432	u_hostname = u_path + MAXPATHLEN;
433	u_name = u_hostname + MAXHOSTNAMELEN;
434#ifdef INET
435	u_ip4 = (struct in_addr *)(u_name + MAXHOSTNAMELEN);
436#endif
437#ifdef INET6
438#ifdef INET
439	u_ip6 = (struct in6_addr *)(u_ip4 + ip4s);
440#else
441	u_ip6 = (struct in6_addr *)(u_name + MAXHOSTNAMELEN);
442#endif
443#endif
444	optiov[opt.uio_iovcnt].iov_base = "path";
445	optiov[opt.uio_iovcnt].iov_len = sizeof("path");
446	opt.uio_iovcnt++;
447	optiov[opt.uio_iovcnt].iov_base = u_path;
448	error = copyinstr(j->path, u_path, MAXPATHLEN,
449	    &optiov[opt.uio_iovcnt].iov_len);
450	if (error) {
451		free(u_path, M_TEMP);
452		return (error);
453	}
454	opt.uio_iovcnt++;
455	optiov[opt.uio_iovcnt].iov_base = "host.hostname";
456	optiov[opt.uio_iovcnt].iov_len = sizeof("host.hostname");
457	opt.uio_iovcnt++;
458	optiov[opt.uio_iovcnt].iov_base = u_hostname;
459	error = copyinstr(j->hostname, u_hostname, MAXHOSTNAMELEN,
460	    &optiov[opt.uio_iovcnt].iov_len);
461	if (error) {
462		free(u_path, M_TEMP);
463		return (error);
464	}
465	opt.uio_iovcnt++;
466	if (j->jailname != NULL) {
467		optiov[opt.uio_iovcnt].iov_base = "name";
468		optiov[opt.uio_iovcnt].iov_len = sizeof("name");
469		opt.uio_iovcnt++;
470		optiov[opt.uio_iovcnt].iov_base = u_name;
471		error = copyinstr(j->jailname, u_name, MAXHOSTNAMELEN,
472		    &optiov[opt.uio_iovcnt].iov_len);
473		if (error) {
474			free(u_path, M_TEMP);
475			return (error);
476		}
477		opt.uio_iovcnt++;
478	}
479#ifdef INET
480	optiov[opt.uio_iovcnt].iov_base = "ip4.addr";
481	optiov[opt.uio_iovcnt].iov_len = sizeof("ip4.addr");
482	opt.uio_iovcnt++;
483	optiov[opt.uio_iovcnt].iov_base = u_ip4;
484	optiov[opt.uio_iovcnt].iov_len = ip4s * sizeof(struct in_addr);
485	if (j->version == 0)
486		u_ip4->s_addr = j->ip4s;
487	else {
488		error = copyin(j->ip4, u_ip4, optiov[opt.uio_iovcnt].iov_len);
489		if (error) {
490			free(u_path, M_TEMP);
491			return (error);
492		}
493	}
494	opt.uio_iovcnt++;
495#endif
496#ifdef INET6
497	optiov[opt.uio_iovcnt].iov_base = "ip6.addr";
498	optiov[opt.uio_iovcnt].iov_len = sizeof("ip6.addr");
499	opt.uio_iovcnt++;
500	optiov[opt.uio_iovcnt].iov_base = u_ip6;
501	optiov[opt.uio_iovcnt].iov_len = j->ip6s * sizeof(struct in6_addr);
502	error = copyin(j->ip6, u_ip6, optiov[opt.uio_iovcnt].iov_len);
503	if (error) {
504		free(u_path, M_TEMP);
505		return (error);
506	}
507	opt.uio_iovcnt++;
508#endif
509	KASSERT(opt.uio_iovcnt <= sizeof(optiov) / sizeof(optiov[0]),
510	    ("kern_jail: too many iovecs (%d)", opt.uio_iovcnt));
511	error = kern_jail_set(td, &opt, JAIL_CREATE | JAIL_ATTACH);
512	free(u_path, M_TEMP);
513	return (error);
514}
515
516
517/*
518 * struct jail_set_args {
519 *	struct iovec *iovp;
520 *	unsigned int iovcnt;
521 *	int flags;
522 * };
523 */
524int
525sys_jail_set(struct thread *td, struct jail_set_args *uap)
526{
527	struct uio *auio;
528	int error;
529
530	/* Check that we have an even number of iovecs. */
531	if (uap->iovcnt & 1)
532		return (EINVAL);
533
534	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
535	if (error)
536		return (error);
537	error = kern_jail_set(td, auio, uap->flags);
538	free(auio, M_IOV);
539	return (error);
540}
541
542int
543kern_jail_set(struct thread *td, struct uio *optuio, int flags)
544{
545	struct nameidata nd;
546#ifdef INET
547	struct in_addr *ip4;
548#endif
549#ifdef INET6
550	struct in6_addr *ip6;
551#endif
552	struct vfsopt *opt;
553	struct vfsoptlist *opts;
554	struct prison *pr, *deadpr, *mypr, *ppr, *tpr;
555	struct vnode *root;
556	char *domain, *errmsg, *host, *name, *namelc, *p, *path, *uuid;
557	char *g_path, *osrelstr;
558#if defined(INET) || defined(INET6)
559	struct prison *tppr;
560	void *op;
561#endif
562	unsigned long hid;
563	size_t namelen, onamelen;
564	int created, cuflags, descend, enforce, error, errmsg_len, errmsg_pos;
565	int gotchildmax, gotenforce, gothid, gotrsnum, gotslevel;
566	int fi, jid, jsys, len, level;
567	int childmax, osreldt, rsnum, slevel;
568	int fullpath_disabled;
569#if defined(INET) || defined(INET6)
570	int ii, ij;
571#endif
572#ifdef INET
573	int ip4s, redo_ip4;
574#endif
575#ifdef INET6
576	int ip6s, redo_ip6;
577#endif
578	uint64_t pr_allow, ch_allow, pr_flags, ch_flags;
579	unsigned tallow;
580	char numbuf[12];
581
582	error = priv_check(td, PRIV_JAIL_SET);
583	if (!error && (flags & JAIL_ATTACH))
584		error = priv_check(td, PRIV_JAIL_ATTACH);
585	if (error)
586		return (error);
587	mypr = ppr = td->td_ucred->cr_prison;
588	if ((flags & JAIL_CREATE) && mypr->pr_childmax == 0)
589		return (EPERM);
590	if (flags & ~JAIL_SET_MASK)
591		return (EINVAL);
592
593	/*
594	 * Check all the parameters before committing to anything.  Not all
595	 * errors can be caught early, but we may as well try.  Also, this
596	 * takes care of some expensive stuff (path lookup) before getting
597	 * the allprison lock.
598	 *
599	 * XXX Jails are not filesystems, and jail parameters are not mount
600	 *     options.  But it makes more sense to re-use the vfsopt code
601	 *     than duplicate it under a different name.
602	 */
603	error = vfs_buildopts(optuio, &opts);
604	if (error)
605		return (error);
606#ifdef INET
607	ip4 = NULL;
608#endif
609#ifdef INET6
610	ip6 = NULL;
611#endif
612	g_path = NULL;
613
614	error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
615	if (error == ENOENT)
616		jid = 0;
617	else if (error != 0)
618		goto done_free;
619
620	error = vfs_copyopt(opts, "securelevel", &slevel, sizeof(slevel));
621	if (error == ENOENT)
622		gotslevel = 0;
623	else if (error != 0)
624		goto done_free;
625	else
626		gotslevel = 1;
627
628	error =
629	    vfs_copyopt(opts, "children.max", &childmax, sizeof(childmax));
630	if (error == ENOENT)
631		gotchildmax = 0;
632	else if (error != 0)
633		goto done_free;
634	else
635		gotchildmax = 1;
636
637	error = vfs_copyopt(opts, "enforce_statfs", &enforce, sizeof(enforce));
638	if (error == ENOENT)
639		gotenforce = 0;
640	else if (error != 0)
641		goto done_free;
642	else if (enforce < 0 || enforce > 2) {
643		error = EINVAL;
644		goto done_free;
645	} else
646		gotenforce = 1;
647
648	error = vfs_copyopt(opts, "devfs_ruleset", &rsnum, sizeof(rsnum));
649	if (error == ENOENT)
650		gotrsnum = 0;
651	else if (error != 0)
652		goto done_free;
653	else
654		gotrsnum = 1;
655
656	pr_flags = ch_flags = 0;
657	for (fi = 0; fi < sizeof(pr_flag_names) / sizeof(pr_flag_names[0]);
658	    fi++) {
659		if (pr_flag_names[fi] == NULL)
660			continue;
661		vfs_flagopt(opts, pr_flag_names[fi], &pr_flags, 1 << fi);
662		vfs_flagopt(opts, pr_flag_nonames[fi], &ch_flags, 1 << fi);
663	}
664	ch_flags |= pr_flags;
665	for (fi = 0; fi < sizeof(pr_flag_jailsys) / sizeof(pr_flag_jailsys[0]);
666	    fi++) {
667		error = vfs_copyopt(opts, pr_flag_jailsys[fi].name, &jsys,
668		    sizeof(jsys));
669		if (error == ENOENT)
670			continue;
671		if (error != 0)
672			goto done_free;
673		switch (jsys) {
674		case JAIL_SYS_DISABLE:
675			if (!pr_flag_jailsys[fi].disable) {
676				error = EINVAL;
677				goto done_free;
678			}
679			pr_flags |= pr_flag_jailsys[fi].disable;
680			break;
681		case JAIL_SYS_NEW:
682			pr_flags |= pr_flag_jailsys[fi].new;
683			break;
684		case JAIL_SYS_INHERIT:
685			break;
686		default:
687			error = EINVAL;
688			goto done_free;
689		}
690		ch_flags |=
691		    pr_flag_jailsys[fi].new | pr_flag_jailsys[fi].disable;
692	}
693	if ((flags & (JAIL_CREATE | JAIL_UPDATE | JAIL_ATTACH)) == JAIL_CREATE
694	    && !(pr_flags & PR_PERSIST)) {
695		error = EINVAL;
696		vfs_opterror(opts, "new jail must persist or attach");
697		goto done_errmsg;
698	}
699#ifdef VIMAGE
700	if ((flags & JAIL_UPDATE) && (ch_flags & PR_VNET)) {
701		error = EINVAL;
702		vfs_opterror(opts, "vnet cannot be changed after creation");
703		goto done_errmsg;
704	}
705#endif
706#ifdef INET
707	if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP4_USER)) {
708		error = EINVAL;
709		vfs_opterror(opts, "ip4 cannot be changed after creation");
710		goto done_errmsg;
711	}
712#endif
713#ifdef INET6
714	if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP6_USER)) {
715		error = EINVAL;
716		vfs_opterror(opts, "ip6 cannot be changed after creation");
717		goto done_errmsg;
718	}
719#endif
720
721	pr_allow = ch_allow = 0;
722	for (fi = 0; fi < sizeof(pr_allow_names) / sizeof(pr_allow_names[0]);
723	    fi++) {
724		vfs_flagopt(opts, pr_allow_names[fi], &pr_allow, 1 << fi);
725		vfs_flagopt(opts, pr_allow_nonames[fi], &ch_allow, 1 << fi);
726	}
727	ch_allow |= pr_allow;
728
729	error = vfs_getopt(opts, "name", (void **)&name, &len);
730	if (error == ENOENT)
731		name = NULL;
732	else if (error != 0)
733		goto done_free;
734	else {
735		if (len == 0 || name[len - 1] != '\0') {
736			error = EINVAL;
737			goto done_free;
738		}
739		if (len > MAXHOSTNAMELEN) {
740			error = ENAMETOOLONG;
741			goto done_free;
742		}
743	}
744
745	error = vfs_getopt(opts, "host.hostname", (void **)&host, &len);
746	if (error == ENOENT)
747		host = NULL;
748	else if (error != 0)
749		goto done_free;
750	else {
751		ch_flags |= PR_HOST;
752		pr_flags |= PR_HOST;
753		if (len == 0 || host[len - 1] != '\0') {
754			error = EINVAL;
755			goto done_free;
756		}
757		if (len > MAXHOSTNAMELEN) {
758			error = ENAMETOOLONG;
759			goto done_free;
760		}
761	}
762
763	error = vfs_getopt(opts, "host.domainname", (void **)&domain, &len);
764	if (error == ENOENT)
765		domain = NULL;
766	else if (error != 0)
767		goto done_free;
768	else {
769		ch_flags |= PR_HOST;
770		pr_flags |= PR_HOST;
771		if (len == 0 || domain[len - 1] != '\0') {
772			error = EINVAL;
773			goto done_free;
774		}
775		if (len > MAXHOSTNAMELEN) {
776			error = ENAMETOOLONG;
777			goto done_free;
778		}
779	}
780
781	error = vfs_getopt(opts, "host.hostuuid", (void **)&uuid, &len);
782	if (error == ENOENT)
783		uuid = NULL;
784	else if (error != 0)
785		goto done_free;
786	else {
787		ch_flags |= PR_HOST;
788		pr_flags |= PR_HOST;
789		if (len == 0 || uuid[len - 1] != '\0') {
790			error = EINVAL;
791			goto done_free;
792		}
793		if (len > HOSTUUIDLEN) {
794			error = ENAMETOOLONG;
795			goto done_free;
796		}
797	}
798
799#ifdef COMPAT_FREEBSD32
800	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
801		uint32_t hid32;
802
803		error = vfs_copyopt(opts, "host.hostid", &hid32, sizeof(hid32));
804		hid = hid32;
805	} else
806#endif
807		error = vfs_copyopt(opts, "host.hostid", &hid, sizeof(hid));
808	if (error == ENOENT)
809		gothid = 0;
810	else if (error != 0)
811		goto done_free;
812	else {
813		gothid = 1;
814		ch_flags |= PR_HOST;
815		pr_flags |= PR_HOST;
816	}
817
818#ifdef INET
819	error = vfs_getopt(opts, "ip4.addr", &op, &ip4s);
820	if (error == ENOENT)
821		ip4s = 0;
822	else if (error != 0)
823		goto done_free;
824	else if (ip4s & (sizeof(*ip4) - 1)) {
825		error = EINVAL;
826		goto done_free;
827	} else {
828		ch_flags |= PR_IP4_USER | PR_IP4_DISABLE;
829		if (ip4s == 0)
830			pr_flags |= PR_IP4_USER | PR_IP4_DISABLE;
831		else {
832			pr_flags = (pr_flags & ~PR_IP4_DISABLE) | PR_IP4_USER;
833			ip4s /= sizeof(*ip4);
834			if (ip4s > jail_max_af_ips) {
835				error = EINVAL;
836				vfs_opterror(opts, "too many IPv4 addresses");
837				goto done_errmsg;
838			}
839			ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK);
840			bcopy(op, ip4, ip4s * sizeof(*ip4));
841			/*
842			 * IP addresses are all sorted but ip[0] to preserve
843			 * the primary IP address as given from userland.
844			 * This special IP is used for unbound outgoing
845			 * connections as well for "loopback" traffic in case
846			 * source address selection cannot find any more fitting
847			 * address to connect from.
848			 */
849			if (ip4s > 1)
850				qsort(ip4 + 1, ip4s - 1, sizeof(*ip4), qcmp_v4);
851			/*
852			 * Check for duplicate addresses and do some simple
853			 * zero and broadcast checks. If users give other bogus
854			 * addresses it is their problem.
855			 *
856			 * We do not have to care about byte order for these
857			 * checks so we will do them in NBO.
858			 */
859			for (ii = 0; ii < ip4s; ii++) {
860				if (ip4[ii].s_addr == INADDR_ANY ||
861				    ip4[ii].s_addr == INADDR_BROADCAST) {
862					error = EINVAL;
863					goto done_free;
864				}
865				if ((ii+1) < ip4s &&
866				    (ip4[0].s_addr == ip4[ii+1].s_addr ||
867				     ip4[ii].s_addr == ip4[ii+1].s_addr)) {
868					error = EINVAL;
869					goto done_free;
870				}
871			}
872		}
873	}
874#endif
875
876#ifdef INET6
877	error = vfs_getopt(opts, "ip6.addr", &op, &ip6s);
878	if (error == ENOENT)
879		ip6s = 0;
880	else if (error != 0)
881		goto done_free;
882	else if (ip6s & (sizeof(*ip6) - 1)) {
883		error = EINVAL;
884		goto done_free;
885	} else {
886		ch_flags |= PR_IP6_USER | PR_IP6_DISABLE;
887		if (ip6s == 0)
888			pr_flags |= PR_IP6_USER | PR_IP6_DISABLE;
889		else {
890			pr_flags = (pr_flags & ~PR_IP6_DISABLE) | PR_IP6_USER;
891			ip6s /= sizeof(*ip6);
892			if (ip6s > jail_max_af_ips) {
893				error = EINVAL;
894				vfs_opterror(opts, "too many IPv6 addresses");
895				goto done_errmsg;
896			}
897			ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK);
898			bcopy(op, ip6, ip6s * sizeof(*ip6));
899			if (ip6s > 1)
900				qsort(ip6 + 1, ip6s - 1, sizeof(*ip6), qcmp_v6);
901			for (ii = 0; ii < ip6s; ii++) {
902				if (IN6_IS_ADDR_UNSPECIFIED(&ip6[ii])) {
903					error = EINVAL;
904					goto done_free;
905				}
906				if ((ii+1) < ip6s &&
907				    (IN6_ARE_ADDR_EQUAL(&ip6[0], &ip6[ii+1]) ||
908				     IN6_ARE_ADDR_EQUAL(&ip6[ii], &ip6[ii+1])))
909				{
910					error = EINVAL;
911					goto done_free;
912				}
913			}
914		}
915	}
916#endif
917
918#if defined(VIMAGE) && (defined(INET) || defined(INET6))
919	if ((ch_flags & PR_VNET) && (ch_flags & (PR_IP4_USER | PR_IP6_USER))) {
920		error = EINVAL;
921		vfs_opterror(opts,
922		    "vnet jails cannot have IP address restrictions");
923		goto done_errmsg;
924	}
925#endif
926
927	fullpath_disabled = 0;
928	root = NULL;
929	error = vfs_getopt(opts, "path", (void **)&path, &len);
930	if (error == ENOENT)
931		path = NULL;
932	else if (error != 0)
933		goto done_free;
934	else {
935		if (flags & JAIL_UPDATE) {
936			error = EINVAL;
937			vfs_opterror(opts,
938			    "path cannot be changed after creation");
939			goto done_errmsg;
940		}
941		if (len == 0 || path[len - 1] != '\0') {
942			error = EINVAL;
943			goto done_free;
944		}
945		NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE,
946		    path, td);
947		error = namei(&nd);
948		if (error)
949			goto done_free;
950		root = nd.ni_vp;
951		NDFREE(&nd, NDF_ONLY_PNBUF);
952		g_path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
953		strlcpy(g_path, path, MAXPATHLEN);
954		error = vn_path_to_global_path(td, root, g_path, MAXPATHLEN);
955		if (error == 0)
956			path = g_path;
957		else if (error == ENODEV) {
958			/* proceed if sysctl debug.disablefullpath == 1 */
959			fullpath_disabled = 1;
960			if (len < 2 || (len == 2 && path[0] == '/'))
961				path = NULL;
962		} else {
963			/* exit on other errors */
964			goto done_free;
965		}
966		if (root->v_type != VDIR) {
967			error = ENOTDIR;
968			vput(root);
969			goto done_free;
970		}
971		VOP_UNLOCK(root, 0);
972		if (fullpath_disabled) {
973			/* Leave room for a real-root full pathname. */
974			if (len + (path[0] == '/' && strcmp(mypr->pr_path, "/")
975			    ? strlen(mypr->pr_path) : 0) > MAXPATHLEN) {
976				error = ENAMETOOLONG;
977				goto done_free;
978			}
979		}
980	}
981
982	error = vfs_getopt(opts, "osrelease", (void **)&osrelstr, &len);
983	if (error == ENOENT)
984		osrelstr = NULL;
985	else if (error != 0)
986		goto done_free;
987	else {
988		if (flags & JAIL_UPDATE) {
989			error = EINVAL;
990			vfs_opterror(opts,
991			    "osrelease cannot be changed after creation");
992			goto done_errmsg;
993		}
994		if (len == 0 || len >= OSRELEASELEN) {
995			error = EINVAL;
996			vfs_opterror(opts,
997			    "osrelease string must be 1-%d bytes long",
998			    OSRELEASELEN - 1);
999			goto done_errmsg;
1000		}
1001	}
1002
1003	error = vfs_copyopt(opts, "osreldate", &osreldt, sizeof(osreldt));
1004	if (error == ENOENT)
1005		osreldt = 0;
1006	else if (error != 0)
1007		goto done_free;
1008	else {
1009		if (flags & JAIL_UPDATE) {
1010			error = EINVAL;
1011			vfs_opterror(opts,
1012			    "osreldate cannot be changed after creation");
1013			goto done_errmsg;
1014		}
1015		if (osreldt == 0) {
1016			error = EINVAL;
1017			vfs_opterror(opts, "osreldate cannot be 0");
1018			goto done_errmsg;
1019		}
1020	}
1021
1022	/*
1023	 * Grab the allprison lock before letting modules check their
1024	 * parameters.  Once we have it, do not let go so we'll have a
1025	 * consistent view of the OSD list.
1026	 */
1027	sx_xlock(&allprison_lock);
1028	error = osd_jail_call(NULL, PR_METHOD_CHECK, opts);
1029	if (error)
1030		goto done_unlock_list;
1031
1032	/* By now, all parameters should have been noted. */
1033	TAILQ_FOREACH(opt, opts, link) {
1034		if (!opt->seen && strcmp(opt->name, "errmsg")) {
1035			error = EINVAL;
1036			vfs_opterror(opts, "unknown parameter: %s", opt->name);
1037			goto done_unlock_list;
1038		}
1039	}
1040
1041	/*
1042	 * See if we are creating a new record or updating an existing one.
1043	 * This abuses the file error codes ENOENT and EEXIST.
1044	 */
1045	cuflags = flags & (JAIL_CREATE | JAIL_UPDATE);
1046	if (!cuflags) {
1047		error = EINVAL;
1048		vfs_opterror(opts, "no valid operation (create or update)");
1049		goto done_unlock_list;
1050	}
1051	pr = NULL;
1052	namelc = NULL;
1053	if (cuflags == JAIL_CREATE && jid == 0 && name != NULL) {
1054		namelc = strrchr(name, '.');
1055		jid = strtoul(namelc != NULL ? namelc + 1 : name, &p, 10);
1056		if (*p != '\0')
1057			jid = 0;
1058	}
1059	if (jid != 0) {
1060		/*
1061		 * See if a requested jid already exists.  There is an
1062		 * information leak here if the jid exists but is not within
1063		 * the caller's jail hierarchy.  Jail creators will get EEXIST
1064		 * even though they cannot see the jail, and CREATE | UPDATE
1065		 * will return ENOENT which is not normally a valid error.
1066		 */
1067		if (jid < 0) {
1068			error = EINVAL;
1069			vfs_opterror(opts, "negative jid");
1070			goto done_unlock_list;
1071		}
1072		pr = prison_find(jid);
1073		if (pr != NULL) {
1074			ppr = pr->pr_parent;
1075			/* Create: jid must not exist. */
1076			if (cuflags == JAIL_CREATE) {
1077				mtx_unlock(&pr->pr_mtx);
1078				error = EEXIST;
1079				vfs_opterror(opts, "jail %d already exists",
1080				    jid);
1081				goto done_unlock_list;
1082			}
1083			if (!prison_ischild(mypr, pr)) {
1084				mtx_unlock(&pr->pr_mtx);
1085				pr = NULL;
1086			} else if (pr->pr_uref == 0) {
1087				if (!(flags & JAIL_DYING)) {
1088					mtx_unlock(&pr->pr_mtx);
1089					error = ENOENT;
1090					vfs_opterror(opts, "jail %d is dying",
1091					    jid);
1092					goto done_unlock_list;
1093				} else if ((flags & JAIL_ATTACH) ||
1094				    (pr_flags & PR_PERSIST)) {
1095					/*
1096					 * A dying jail might be resurrected
1097					 * (via attach or persist), but first
1098					 * it must determine if another jail
1099					 * has claimed its name.  Accomplish
1100					 * this by implicitly re-setting the
1101					 * name.
1102					 */
1103					if (name == NULL)
1104						name = prison_name(mypr, pr);
1105				}
1106			}
1107		}
1108		if (pr == NULL) {
1109			/* Update: jid must exist. */
1110			if (cuflags == JAIL_UPDATE) {
1111				error = ENOENT;
1112				vfs_opterror(opts, "jail %d not found", jid);
1113				goto done_unlock_list;
1114			}
1115		}
1116	}
1117	/*
1118	 * If the caller provided a name, look for a jail by that name.
1119	 * This has different semantics for creates and updates keyed by jid
1120	 * (where the name must not already exist in a different jail),
1121	 * and updates keyed by the name itself (where the name must exist
1122	 * because that is the jail being updated).
1123	 */
1124	if (name != NULL) {
1125		namelc = strrchr(name, '.');
1126		if (namelc == NULL)
1127			namelc = name;
1128		else {
1129			/*
1130			 * This is a hierarchical name.  Split it into the
1131			 * parent and child names, and make sure the parent
1132			 * exists or matches an already found jail.
1133			 */
1134			*namelc = '\0';
1135			if (pr != NULL) {
1136				if (strncmp(name, ppr->pr_name, namelc - name)
1137				    || ppr->pr_name[namelc - name] != '\0') {
1138					mtx_unlock(&pr->pr_mtx);
1139					error = EINVAL;
1140					vfs_opterror(opts,
1141					    "cannot change jail's parent");
1142					goto done_unlock_list;
1143				}
1144			} else {
1145				ppr = prison_find_name(mypr, name);
1146				if (ppr == NULL) {
1147					error = ENOENT;
1148					vfs_opterror(opts,
1149					    "jail \"%s\" not found", name);
1150					goto done_unlock_list;
1151				}
1152				mtx_unlock(&ppr->pr_mtx);
1153			}
1154			name = ++namelc;
1155		}
1156		if (name[0] != '\0') {
1157			namelen =
1158			    (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1;
1159 name_again:
1160			deadpr = NULL;
1161			FOREACH_PRISON_CHILD(ppr, tpr) {
1162				if (tpr != pr && tpr->pr_ref > 0 &&
1163				    !strcmp(tpr->pr_name + namelen, name)) {
1164					if (pr == NULL &&
1165					    cuflags != JAIL_CREATE) {
1166						mtx_lock(&tpr->pr_mtx);
1167						if (tpr->pr_ref > 0) {
1168							/*
1169							 * Use this jail
1170							 * for updates.
1171							 */
1172							if (tpr->pr_uref > 0) {
1173								pr = tpr;
1174								break;
1175							}
1176							deadpr = tpr;
1177						}
1178						mtx_unlock(&tpr->pr_mtx);
1179					} else if (tpr->pr_uref > 0) {
1180						/*
1181						 * Create, or update(jid):
1182						 * name must not exist in an
1183						 * active sibling jail.
1184						 */
1185						error = EEXIST;
1186						if (pr != NULL)
1187							mtx_unlock(&pr->pr_mtx);
1188						vfs_opterror(opts,
1189						   "jail \"%s\" already exists",
1190						   name);
1191						goto done_unlock_list;
1192					}
1193				}
1194			}
1195			/* If no active jail is found, use a dying one. */
1196			if (deadpr != NULL && pr == NULL) {
1197				if (flags & JAIL_DYING) {
1198					mtx_lock(&deadpr->pr_mtx);
1199					if (deadpr->pr_ref == 0) {
1200						mtx_unlock(&deadpr->pr_mtx);
1201						goto name_again;
1202					}
1203					pr = deadpr;
1204				} else if (cuflags == JAIL_UPDATE) {
1205					error = ENOENT;
1206					vfs_opterror(opts,
1207					    "jail \"%s\" is dying", name);
1208					goto done_unlock_list;
1209				}
1210			}
1211			/* Update: name must exist if no jid. */
1212			else if (cuflags == JAIL_UPDATE && pr == NULL) {
1213				error = ENOENT;
1214				vfs_opterror(opts, "jail \"%s\" not found",
1215				    name);
1216				goto done_unlock_list;
1217			}
1218		}
1219	}
1220	/* Update: must provide a jid or name. */
1221	else if (cuflags == JAIL_UPDATE && pr == NULL) {
1222		error = ENOENT;
1223		vfs_opterror(opts, "update specified no jail");
1224		goto done_unlock_list;
1225	}
1226
1227	/* If there's no prison to update, create a new one and link it in. */
1228	if (pr == NULL) {
1229		for (tpr = mypr; tpr != NULL; tpr = tpr->pr_parent)
1230			if (tpr->pr_childcount >= tpr->pr_childmax) {
1231				error = EPERM;
1232				vfs_opterror(opts, "prison limit exceeded");
1233				goto done_unlock_list;
1234			}
1235		created = 1;
1236		mtx_lock(&ppr->pr_mtx);
1237		if (ppr->pr_ref == 0) {
1238			mtx_unlock(&ppr->pr_mtx);
1239			error = ENOENT;
1240			vfs_opterror(opts, "parent jail went away!");
1241			goto done_unlock_list;
1242		}
1243		ppr->pr_ref++;
1244		ppr->pr_uref++;
1245		mtx_unlock(&ppr->pr_mtx);
1246		pr = malloc(sizeof(*pr), M_PRISON, M_WAITOK | M_ZERO);
1247		if (jid == 0) {
1248			/* Find the next free jid. */
1249			jid = lastprid + 1;
1250 findnext:
1251			if (jid == JAIL_MAX)
1252				jid = 1;
1253			TAILQ_FOREACH(tpr, &allprison, pr_list) {
1254				if (tpr->pr_id < jid)
1255					continue;
1256				if (tpr->pr_id > jid || tpr->pr_ref == 0) {
1257					TAILQ_INSERT_BEFORE(tpr, pr, pr_list);
1258					break;
1259				}
1260				if (jid == lastprid) {
1261					error = EAGAIN;
1262					vfs_opterror(opts,
1263					    "no available jail IDs");
1264					free(pr, M_PRISON);
1265					prison_deref(ppr, PD_DEREF |
1266					    PD_DEUREF | PD_LIST_XLOCKED);
1267					goto done_releroot;
1268				}
1269				jid++;
1270				goto findnext;
1271			}
1272			lastprid = jid;
1273		} else {
1274			/*
1275			 * The jail already has a jid (that did not yet exist),
1276			 * so just find where to insert it.
1277			 */
1278			TAILQ_FOREACH(tpr, &allprison, pr_list)
1279				if (tpr->pr_id >= jid) {
1280					TAILQ_INSERT_BEFORE(tpr, pr, pr_list);
1281					break;
1282				}
1283		}
1284		if (tpr == NULL)
1285			TAILQ_INSERT_TAIL(&allprison, pr, pr_list);
1286		LIST_INSERT_HEAD(&ppr->pr_children, pr, pr_sibling);
1287		for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent)
1288			tpr->pr_childcount++;
1289
1290		pr->pr_parent = ppr;
1291		pr->pr_id = jid;
1292
1293		/* Set some default values, and inherit some from the parent. */
1294		if (name == NULL)
1295			name = "";
1296		if (path == NULL) {
1297			path = "/";
1298			root = mypr->pr_root;
1299			vref(root);
1300		}
1301		strlcpy(pr->pr_hostuuid, DEFAULT_HOSTUUID, HOSTUUIDLEN);
1302		pr->pr_flags |= PR_HOST;
1303#if defined(INET) || defined(INET6)
1304#ifdef VIMAGE
1305		if (!(pr_flags & PR_VNET))
1306#endif
1307		{
1308#ifdef INET
1309			if (!(ch_flags & PR_IP4_USER))
1310				pr->pr_flags |=
1311				    PR_IP4 | PR_IP4_USER | PR_IP4_DISABLE;
1312			else if (!(pr_flags & PR_IP4_USER)) {
1313				pr->pr_flags |= ppr->pr_flags & PR_IP4;
1314				if (ppr->pr_ip4 != NULL) {
1315					pr->pr_ip4s = ppr->pr_ip4s;
1316					pr->pr_ip4 = malloc(pr->pr_ip4s *
1317					    sizeof(struct in_addr), M_PRISON,
1318					    M_WAITOK);
1319					bcopy(ppr->pr_ip4, pr->pr_ip4,
1320					    pr->pr_ip4s * sizeof(*pr->pr_ip4));
1321				}
1322			}
1323#endif
1324#ifdef INET6
1325			if (!(ch_flags & PR_IP6_USER))
1326				pr->pr_flags |=
1327				    PR_IP6 | PR_IP6_USER | PR_IP6_DISABLE;
1328			else if (!(pr_flags & PR_IP6_USER)) {
1329				pr->pr_flags |= ppr->pr_flags & PR_IP6;
1330				if (ppr->pr_ip6 != NULL) {
1331					pr->pr_ip6s = ppr->pr_ip6s;
1332					pr->pr_ip6 = malloc(pr->pr_ip6s *
1333					    sizeof(struct in6_addr), M_PRISON,
1334					    M_WAITOK);
1335					bcopy(ppr->pr_ip6, pr->pr_ip6,
1336					    pr->pr_ip6s * sizeof(*pr->pr_ip6));
1337				}
1338			}
1339#endif
1340		}
1341#endif
1342		/* Source address selection is always on by default. */
1343		pr->pr_flags |= _PR_IP_SADDRSEL;
1344
1345		pr->pr_securelevel = ppr->pr_securelevel;
1346		pr->pr_allow = JAIL_DEFAULT_ALLOW & ppr->pr_allow;
1347		pr->pr_enforce_statfs = JAIL_DEFAULT_ENFORCE_STATFS;
1348		pr->pr_devfs_rsnum = ppr->pr_devfs_rsnum;
1349
1350		pr->pr_osreldate = osreldt ? osreldt : ppr->pr_osreldate;
1351		if (osrelstr == NULL)
1352		    strcpy(pr->pr_osrelease, ppr->pr_osrelease);
1353		else
1354		    strcpy(pr->pr_osrelease, osrelstr);
1355
1356		LIST_INIT(&pr->pr_children);
1357		mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF | MTX_DUPOK);
1358
1359#ifdef VIMAGE
1360		/* Allocate a new vnet if specified. */
1361		pr->pr_vnet = (pr_flags & PR_VNET)
1362		    ? vnet_alloc() : ppr->pr_vnet;
1363#endif
1364		/*
1365		 * Allocate a dedicated cpuset for each jail.
1366		 * Unlike other initial settings, this may return an error.
1367		 */
1368		error = cpuset_create_root(ppr, &pr->pr_cpuset);
1369		if (error) {
1370			prison_deref(pr, PD_LIST_XLOCKED);
1371			goto done_releroot;
1372		}
1373
1374		mtx_lock(&pr->pr_mtx);
1375		/*
1376		 * New prisons do not yet have a reference, because we do not
1377		 * want others to see the incomplete prison once the
1378		 * allprison_lock is downgraded.
1379		 */
1380	} else {
1381		created = 0;
1382		/*
1383		 * Grab a reference for existing prisons, to ensure they
1384		 * continue to exist for the duration of the call.
1385		 */
1386		pr->pr_ref++;
1387#if defined(VIMAGE) && (defined(INET) || defined(INET6))
1388		if ((pr->pr_flags & PR_VNET) &&
1389		    (ch_flags & (PR_IP4_USER | PR_IP6_USER))) {
1390			error = EINVAL;
1391			vfs_opterror(opts,
1392			    "vnet jails cannot have IP address restrictions");
1393			goto done_deref_locked;
1394		}
1395#endif
1396#ifdef INET
1397		if (PR_IP4_USER & ch_flags & (pr_flags ^ pr->pr_flags)) {
1398			error = EINVAL;
1399			vfs_opterror(opts,
1400			    "ip4 cannot be changed after creation");
1401			goto done_deref_locked;
1402		}
1403#endif
1404#ifdef INET6
1405		if (PR_IP6_USER & ch_flags & (pr_flags ^ pr->pr_flags)) {
1406			error = EINVAL;
1407			vfs_opterror(opts,
1408			    "ip6 cannot be changed after creation");
1409			goto done_deref_locked;
1410		}
1411#endif
1412	}
1413
1414	/* Do final error checking before setting anything. */
1415	if (gotslevel) {
1416		if (slevel < ppr->pr_securelevel) {
1417			error = EPERM;
1418			goto done_deref_locked;
1419		}
1420	}
1421	if (gotchildmax) {
1422		if (childmax >= ppr->pr_childmax) {
1423			error = EPERM;
1424			goto done_deref_locked;
1425		}
1426	}
1427	if (gotenforce) {
1428		if (enforce < ppr->pr_enforce_statfs) {
1429			error = EPERM;
1430			goto done_deref_locked;
1431		}
1432	}
1433	if (gotrsnum) {
1434		/*
1435		 * devfs_rsnum is a uint16_t
1436		 */
1437		if (rsnum < 0 || rsnum > 65535) {
1438			error = EINVAL;
1439			goto done_deref_locked;
1440		}
1441		/*
1442		 * Nested jails always inherit parent's devfs ruleset
1443		 */
1444		if (jailed(td->td_ucred)) {
1445			if (rsnum > 0 && rsnum != ppr->pr_devfs_rsnum) {
1446				error = EPERM;
1447				goto done_deref_locked;
1448			} else
1449				rsnum = ppr->pr_devfs_rsnum;
1450		}
1451	}
1452#ifdef INET
1453	if (ip4s > 0) {
1454		if (ppr->pr_flags & PR_IP4) {
1455			/*
1456			 * Make sure the new set of IP addresses is a
1457			 * subset of the parent's list.  Don't worry
1458			 * about the parent being unlocked, as any
1459			 * setting is done with allprison_lock held.
1460			 */
1461			for (ij = 0; ij < ppr->pr_ip4s; ij++)
1462				if (ip4[0].s_addr == ppr->pr_ip4[ij].s_addr)
1463					break;
1464			if (ij == ppr->pr_ip4s) {
1465				error = EPERM;
1466				goto done_deref_locked;
1467			}
1468			if (ip4s > 1) {
1469				for (ii = ij = 1; ii < ip4s; ii++) {
1470					if (ip4[ii].s_addr ==
1471					    ppr->pr_ip4[0].s_addr)
1472						continue;
1473					for (; ij < ppr->pr_ip4s; ij++)
1474						if (ip4[ii].s_addr ==
1475						    ppr->pr_ip4[ij].s_addr)
1476							break;
1477					if (ij == ppr->pr_ip4s)
1478						break;
1479				}
1480				if (ij == ppr->pr_ip4s) {
1481					error = EPERM;
1482					goto done_deref_locked;
1483				}
1484			}
1485		}
1486		/*
1487		 * Check for conflicting IP addresses.  We permit them
1488		 * if there is no more than one IP on each jail.  If
1489		 * there is a duplicate on a jail with more than one
1490		 * IP stop checking and return error.
1491		 */
1492		tppr = ppr;
1493#ifdef VIMAGE
1494		for (; tppr != &prison0; tppr = tppr->pr_parent)
1495			if (tppr->pr_flags & PR_VNET)
1496				break;
1497#endif
1498		FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) {
1499			if (tpr == pr ||
1500#ifdef VIMAGE
1501			    (tpr != tppr && (tpr->pr_flags & PR_VNET)) ||
1502#endif
1503			    tpr->pr_uref == 0) {
1504				descend = 0;
1505				continue;
1506			}
1507			if (!(tpr->pr_flags & PR_IP4_USER))
1508				continue;
1509			descend = 0;
1510			if (tpr->pr_ip4 == NULL ||
1511			    (ip4s == 1 && tpr->pr_ip4s == 1))
1512				continue;
1513			for (ii = 0; ii < ip4s; ii++) {
1514				if (_prison_check_ip4(tpr, &ip4[ii]) == 0) {
1515					error = EADDRINUSE;
1516					vfs_opterror(opts,
1517					    "IPv4 addresses clash");
1518					goto done_deref_locked;
1519				}
1520			}
1521		}
1522	}
1523#endif
1524#ifdef INET6
1525	if (ip6s > 0) {
1526		if (ppr->pr_flags & PR_IP6) {
1527			/*
1528			 * Make sure the new set of IP addresses is a
1529			 * subset of the parent's list.
1530			 */
1531			for (ij = 0; ij < ppr->pr_ip6s; ij++)
1532				if (IN6_ARE_ADDR_EQUAL(&ip6[0],
1533				    &ppr->pr_ip6[ij]))
1534					break;
1535			if (ij == ppr->pr_ip6s) {
1536				error = EPERM;
1537				goto done_deref_locked;
1538			}
1539			if (ip6s > 1) {
1540				for (ii = ij = 1; ii < ip6s; ii++) {
1541					if (IN6_ARE_ADDR_EQUAL(&ip6[ii],
1542					     &ppr->pr_ip6[0]))
1543						continue;
1544					for (; ij < ppr->pr_ip6s; ij++)
1545						if (IN6_ARE_ADDR_EQUAL(
1546						    &ip6[ii], &ppr->pr_ip6[ij]))
1547							break;
1548					if (ij == ppr->pr_ip6s)
1549						break;
1550				}
1551				if (ij == ppr->pr_ip6s) {
1552					error = EPERM;
1553					goto done_deref_locked;
1554				}
1555			}
1556		}
1557		/* Check for conflicting IP addresses. */
1558		tppr = ppr;
1559#ifdef VIMAGE
1560		for (; tppr != &prison0; tppr = tppr->pr_parent)
1561			if (tppr->pr_flags & PR_VNET)
1562				break;
1563#endif
1564		FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) {
1565			if (tpr == pr ||
1566#ifdef VIMAGE
1567			    (tpr != tppr && (tpr->pr_flags & PR_VNET)) ||
1568#endif
1569			    tpr->pr_uref == 0) {
1570				descend = 0;
1571				continue;
1572			}
1573			if (!(tpr->pr_flags & PR_IP6_USER))
1574				continue;
1575			descend = 0;
1576			if (tpr->pr_ip6 == NULL ||
1577			    (ip6s == 1 && tpr->pr_ip6s == 1))
1578				continue;
1579			for (ii = 0; ii < ip6s; ii++) {
1580				if (_prison_check_ip6(tpr, &ip6[ii]) == 0) {
1581					error = EADDRINUSE;
1582					vfs_opterror(opts,
1583					    "IPv6 addresses clash");
1584					goto done_deref_locked;
1585				}
1586			}
1587		}
1588	}
1589#endif
1590	onamelen = namelen = 0;
1591	if (name != NULL) {
1592		/* Give a default name of the jid.  Also allow the name to be
1593		 * explicitly the jid - but not any other number, and only in
1594		 * normal form (no leading zero/etc).
1595		 */
1596		if (name[0] == '\0')
1597			snprintf(name = numbuf, sizeof(numbuf), "%d", jid);
1598		else if ((strtoul(namelc, &p, 10) != jid ||
1599			  namelc[0] < '1' || namelc[0] > '9') && *p == '\0') {
1600			error = EINVAL;
1601			vfs_opterror(opts,
1602			    "name cannot be numeric (unless it is the jid)");
1603			goto done_deref_locked;
1604		}
1605		/*
1606		 * Make sure the name isn't too long for the prison or its
1607		 * children.
1608		 */
1609		onamelen = strlen(pr->pr_name);
1610		namelen = strlen(name);
1611		if (strlen(ppr->pr_name) + namelen + 2 > sizeof(pr->pr_name)) {
1612			error = ENAMETOOLONG;
1613			goto done_deref_locked;
1614		}
1615		FOREACH_PRISON_DESCENDANT(pr, tpr, descend) {
1616			if (strlen(tpr->pr_name) + (namelen - onamelen) >=
1617			    sizeof(pr->pr_name)) {
1618				error = ENAMETOOLONG;
1619				goto done_deref_locked;
1620			}
1621		}
1622	}
1623	if (pr_allow & ~ppr->pr_allow) {
1624		error = EPERM;
1625		goto done_deref_locked;
1626	}
1627
1628	/* Set the parameters of the prison. */
1629#ifdef INET
1630	redo_ip4 = 0;
1631	if (pr_flags & PR_IP4_USER) {
1632		pr->pr_flags |= PR_IP4;
1633		free(pr->pr_ip4, M_PRISON);
1634		pr->pr_ip4s = ip4s;
1635		pr->pr_ip4 = ip4;
1636		ip4 = NULL;
1637		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1638#ifdef VIMAGE
1639			if (tpr->pr_flags & PR_VNET) {
1640				descend = 0;
1641				continue;
1642			}
1643#endif
1644			if (prison_restrict_ip4(tpr, NULL)) {
1645				redo_ip4 = 1;
1646				descend = 0;
1647			}
1648		}
1649	}
1650#endif
1651#ifdef INET6
1652	redo_ip6 = 0;
1653	if (pr_flags & PR_IP6_USER) {
1654		pr->pr_flags |= PR_IP6;
1655		free(pr->pr_ip6, M_PRISON);
1656		pr->pr_ip6s = ip6s;
1657		pr->pr_ip6 = ip6;
1658		ip6 = NULL;
1659		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1660#ifdef VIMAGE
1661			if (tpr->pr_flags & PR_VNET) {
1662				descend = 0;
1663				continue;
1664			}
1665#endif
1666			if (prison_restrict_ip6(tpr, NULL)) {
1667				redo_ip6 = 1;
1668				descend = 0;
1669			}
1670		}
1671	}
1672#endif
1673	if (gotslevel) {
1674		pr->pr_securelevel = slevel;
1675		/* Set all child jails to be at least this level. */
1676		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
1677			if (tpr->pr_securelevel < slevel)
1678				tpr->pr_securelevel = slevel;
1679	}
1680	if (gotchildmax) {
1681		pr->pr_childmax = childmax;
1682		/* Set all child jails to under this limit. */
1683		FOREACH_PRISON_DESCENDANT_LOCKED_LEVEL(pr, tpr, descend, level)
1684			if (tpr->pr_childmax > childmax - level)
1685				tpr->pr_childmax = childmax > level
1686				    ? childmax - level : 0;
1687	}
1688	if (gotenforce) {
1689		pr->pr_enforce_statfs = enforce;
1690		/* Pass this restriction on to the children. */
1691		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
1692			if (tpr->pr_enforce_statfs < enforce)
1693				tpr->pr_enforce_statfs = enforce;
1694	}
1695	if (gotrsnum) {
1696		pr->pr_devfs_rsnum = rsnum;
1697		/* Pass this restriction on to the children. */
1698		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
1699			tpr->pr_devfs_rsnum = rsnum;
1700	}
1701	if (name != NULL) {
1702		if (ppr == &prison0)
1703			strlcpy(pr->pr_name, name, sizeof(pr->pr_name));
1704		else
1705			snprintf(pr->pr_name, sizeof(pr->pr_name), "%s.%s",
1706			    ppr->pr_name, name);
1707		/* Change this component of child names. */
1708		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1709			bcopy(tpr->pr_name + onamelen, tpr->pr_name + namelen,
1710			    strlen(tpr->pr_name + onamelen) + 1);
1711			bcopy(pr->pr_name, tpr->pr_name, namelen);
1712		}
1713	}
1714	if (path != NULL) {
1715		/* Try to keep a real-rooted full pathname. */
1716		if (fullpath_disabled && path[0] == '/' &&
1717		    strcmp(mypr->pr_path, "/"))
1718			snprintf(pr->pr_path, sizeof(pr->pr_path), "%s%s",
1719			    mypr->pr_path, path);
1720		else
1721			strlcpy(pr->pr_path, path, sizeof(pr->pr_path));
1722		pr->pr_root = root;
1723	}
1724	if (PR_HOST & ch_flags & ~pr_flags) {
1725		if (pr->pr_flags & PR_HOST) {
1726			/*
1727			 * Copy the parent's host info.  As with pr_ip4 above,
1728			 * the lack of a lock on the parent is not a problem;
1729			 * it is always set with allprison_lock at least
1730			 * shared, and is held exclusively here.
1731			 */
1732			strlcpy(pr->pr_hostname, pr->pr_parent->pr_hostname,
1733			    sizeof(pr->pr_hostname));
1734			strlcpy(pr->pr_domainname, pr->pr_parent->pr_domainname,
1735			    sizeof(pr->pr_domainname));
1736			strlcpy(pr->pr_hostuuid, pr->pr_parent->pr_hostuuid,
1737			    sizeof(pr->pr_hostuuid));
1738			pr->pr_hostid = pr->pr_parent->pr_hostid;
1739		}
1740	} else if (host != NULL || domain != NULL || uuid != NULL || gothid) {
1741		/* Set this prison, and any descendants without PR_HOST. */
1742		if (host != NULL)
1743			strlcpy(pr->pr_hostname, host, sizeof(pr->pr_hostname));
1744		if (domain != NULL)
1745			strlcpy(pr->pr_domainname, domain,
1746			    sizeof(pr->pr_domainname));
1747		if (uuid != NULL)
1748			strlcpy(pr->pr_hostuuid, uuid, sizeof(pr->pr_hostuuid));
1749		if (gothid)
1750			pr->pr_hostid = hid;
1751		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1752			if (tpr->pr_flags & PR_HOST)
1753				descend = 0;
1754			else {
1755				if (host != NULL)
1756					strlcpy(tpr->pr_hostname,
1757					    pr->pr_hostname,
1758					    sizeof(tpr->pr_hostname));
1759				if (domain != NULL)
1760					strlcpy(tpr->pr_domainname,
1761					    pr->pr_domainname,
1762					    sizeof(tpr->pr_domainname));
1763				if (uuid != NULL)
1764					strlcpy(tpr->pr_hostuuid,
1765					    pr->pr_hostuuid,
1766					    sizeof(tpr->pr_hostuuid));
1767				if (gothid)
1768					tpr->pr_hostid = hid;
1769			}
1770		}
1771	}
1772	if ((tallow = ch_allow & ~pr_allow)) {
1773		/* Clear allow bits in all children. */
1774		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
1775			tpr->pr_allow &= ~tallow;
1776	}
1777	pr->pr_allow = (pr->pr_allow & ~ch_allow) | pr_allow;
1778	/*
1779	 * Persistent prisons get an extra reference, and prisons losing their
1780	 * persist flag lose that reference.  Only do this for existing prisons
1781	 * for now, so new ones will remain unseen until after the module
1782	 * handlers have completed.
1783	 */
1784	if (!created && (ch_flags & PR_PERSIST & (pr_flags ^ pr->pr_flags))) {
1785		if (pr_flags & PR_PERSIST) {
1786			pr->pr_ref++;
1787			pr->pr_uref++;
1788		} else {
1789			pr->pr_ref--;
1790			pr->pr_uref--;
1791		}
1792	}
1793	pr->pr_flags = (pr->pr_flags & ~ch_flags) | pr_flags;
1794	mtx_unlock(&pr->pr_mtx);
1795
1796#ifdef RACCT
1797	if (racct_enable && created)
1798		prison_racct_attach(pr);
1799#endif
1800
1801	/* Locks may have prevented a complete restriction of child IP
1802	 * addresses.  If so, allocate some more memory and try again.
1803	 */
1804#ifdef INET
1805	while (redo_ip4) {
1806		ip4s = pr->pr_ip4s;
1807		ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK);
1808		mtx_lock(&pr->pr_mtx);
1809		redo_ip4 = 0;
1810		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1811#ifdef VIMAGE
1812			if (tpr->pr_flags & PR_VNET) {
1813				descend = 0;
1814				continue;
1815			}
1816#endif
1817			if (prison_restrict_ip4(tpr, ip4)) {
1818				if (ip4 != NULL)
1819					ip4 = NULL;
1820				else
1821					redo_ip4 = 1;
1822			}
1823		}
1824		mtx_unlock(&pr->pr_mtx);
1825	}
1826#endif
1827#ifdef INET6
1828	while (redo_ip6) {
1829		ip6s = pr->pr_ip6s;
1830		ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK);
1831		mtx_lock(&pr->pr_mtx);
1832		redo_ip6 = 0;
1833		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1834#ifdef VIMAGE
1835			if (tpr->pr_flags & PR_VNET) {
1836				descend = 0;
1837				continue;
1838			}
1839#endif
1840			if (prison_restrict_ip6(tpr, ip6)) {
1841				if (ip6 != NULL)
1842					ip6 = NULL;
1843				else
1844					redo_ip6 = 1;
1845			}
1846		}
1847		mtx_unlock(&pr->pr_mtx);
1848	}
1849#endif
1850
1851	/* Let the modules do their work. */
1852	sx_downgrade(&allprison_lock);
1853	if (created) {
1854		error = osd_jail_call(pr, PR_METHOD_CREATE, opts);
1855		if (error) {
1856			prison_deref(pr, PD_LIST_SLOCKED);
1857			goto done_errmsg;
1858		}
1859	}
1860	error = osd_jail_call(pr, PR_METHOD_SET, opts);
1861	if (error) {
1862		prison_deref(pr, created
1863		    ? PD_LIST_SLOCKED
1864		    : PD_DEREF | PD_LIST_SLOCKED);
1865		goto done_errmsg;
1866	}
1867
1868	/* Attach this process to the prison if requested. */
1869	if (flags & JAIL_ATTACH) {
1870		mtx_lock(&pr->pr_mtx);
1871		error = do_jail_attach(td, pr);
1872		if (error) {
1873			vfs_opterror(opts, "attach failed");
1874			if (!created)
1875				prison_deref(pr, PD_DEREF);
1876			goto done_errmsg;
1877		}
1878	}
1879
1880#ifdef RACCT
1881	if (racct_enable && !created) {
1882		if (!(flags & JAIL_ATTACH))
1883			sx_sunlock(&allprison_lock);
1884		prison_racct_modify(pr);
1885		if (!(flags & JAIL_ATTACH))
1886			sx_slock(&allprison_lock);
1887	}
1888#endif
1889
1890	td->td_retval[0] = pr->pr_id;
1891
1892	/*
1893	 * Now that it is all there, drop the temporary reference from existing
1894	 * prisons.  Or add a reference to newly created persistent prisons
1895	 * (which was not done earlier so that the prison would not be publicly
1896	 * visible).
1897	 */
1898	if (!created) {
1899		prison_deref(pr, (flags & JAIL_ATTACH)
1900		    ? PD_DEREF
1901		    : PD_DEREF | PD_LIST_SLOCKED);
1902	} else {
1903		if (pr_flags & PR_PERSIST) {
1904			mtx_lock(&pr->pr_mtx);
1905			pr->pr_ref++;
1906			pr->pr_uref++;
1907			mtx_unlock(&pr->pr_mtx);
1908		}
1909		if (!(flags & JAIL_ATTACH))
1910			sx_sunlock(&allprison_lock);
1911	}
1912
1913	goto done_errmsg;
1914
1915 done_deref_locked:
1916	prison_deref(pr, created
1917	    ? PD_LOCKED | PD_LIST_XLOCKED
1918	    : PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED);
1919	goto done_releroot;
1920 done_unlock_list:
1921	sx_xunlock(&allprison_lock);
1922 done_releroot:
1923	if (root != NULL)
1924		vrele(root);
1925 done_errmsg:
1926	if (error) {
1927		vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len);
1928		if (errmsg_len > 0) {
1929			errmsg_pos = 2 * vfs_getopt_pos(opts, "errmsg") + 1;
1930			if (errmsg_pos > 0) {
1931				if (optuio->uio_segflg == UIO_SYSSPACE)
1932					bcopy(errmsg,
1933					   optuio->uio_iov[errmsg_pos].iov_base,
1934					   errmsg_len);
1935				else
1936					copyout(errmsg,
1937					   optuio->uio_iov[errmsg_pos].iov_base,
1938					   errmsg_len);
1939			}
1940		}
1941	}
1942 done_free:
1943#ifdef INET
1944	free(ip4, M_PRISON);
1945#endif
1946#ifdef INET6
1947	free(ip6, M_PRISON);
1948#endif
1949	if (g_path != NULL)
1950		free(g_path, M_TEMP);
1951	vfs_freeopts(opts);
1952	return (error);
1953}
1954
1955
1956/*
1957 * struct jail_get_args {
1958 *	struct iovec *iovp;
1959 *	unsigned int iovcnt;
1960 *	int flags;
1961 * };
1962 */
1963int
1964sys_jail_get(struct thread *td, struct jail_get_args *uap)
1965{
1966	struct uio *auio;
1967	int error;
1968
1969	/* Check that we have an even number of iovecs. */
1970	if (uap->iovcnt & 1)
1971		return (EINVAL);
1972
1973	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
1974	if (error)
1975		return (error);
1976	error = kern_jail_get(td, auio, uap->flags);
1977	if (error == 0)
1978		error = copyout(auio->uio_iov, uap->iovp,
1979		    uap->iovcnt * sizeof (struct iovec));
1980	free(auio, M_IOV);
1981	return (error);
1982}
1983
1984int
1985kern_jail_get(struct thread *td, struct uio *optuio, int flags)
1986{
1987	struct prison *pr, *mypr;
1988	struct vfsopt *opt;
1989	struct vfsoptlist *opts;
1990	char *errmsg, *name;
1991	int error, errmsg_len, errmsg_pos, fi, i, jid, len, locked, pos;
1992
1993	if (flags & ~JAIL_GET_MASK)
1994		return (EINVAL);
1995
1996	/* Get the parameter list. */
1997	error = vfs_buildopts(optuio, &opts);
1998	if (error)
1999		return (error);
2000	errmsg_pos = vfs_getopt_pos(opts, "errmsg");
2001	mypr = td->td_ucred->cr_prison;
2002
2003	/*
2004	 * Find the prison specified by one of: lastjid, jid, name.
2005	 */
2006	sx_slock(&allprison_lock);
2007	error = vfs_copyopt(opts, "lastjid", &jid, sizeof(jid));
2008	if (error == 0) {
2009		TAILQ_FOREACH(pr, &allprison, pr_list) {
2010			if (pr->pr_id > jid && prison_ischild(mypr, pr)) {
2011				mtx_lock(&pr->pr_mtx);
2012				if (pr->pr_ref > 0 &&
2013				    (pr->pr_uref > 0 || (flags & JAIL_DYING)))
2014					break;
2015				mtx_unlock(&pr->pr_mtx);
2016			}
2017		}
2018		if (pr != NULL)
2019			goto found_prison;
2020		error = ENOENT;
2021		vfs_opterror(opts, "no jail after %d", jid);
2022		goto done_unlock_list;
2023	} else if (error != ENOENT)
2024		goto done_unlock_list;
2025
2026	error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
2027	if (error == 0) {
2028		if (jid != 0) {
2029			pr = prison_find_child(mypr, jid);
2030			if (pr != NULL) {
2031				if (pr->pr_uref == 0 && !(flags & JAIL_DYING)) {
2032					mtx_unlock(&pr->pr_mtx);
2033					error = ENOENT;
2034					vfs_opterror(opts, "jail %d is dying",
2035					    jid);
2036					goto done_unlock_list;
2037				}
2038				goto found_prison;
2039			}
2040			error = ENOENT;
2041			vfs_opterror(opts, "jail %d not found", jid);
2042			goto done_unlock_list;
2043		}
2044	} else if (error != ENOENT)
2045		goto done_unlock_list;
2046
2047	error = vfs_getopt(opts, "name", (void **)&name, &len);
2048	if (error == 0) {
2049		if (len == 0 || name[len - 1] != '\0') {
2050			error = EINVAL;
2051			goto done_unlock_list;
2052		}
2053		pr = prison_find_name(mypr, name);
2054		if (pr != NULL) {
2055			if (pr->pr_uref == 0 && !(flags & JAIL_DYING)) {
2056				mtx_unlock(&pr->pr_mtx);
2057				error = ENOENT;
2058				vfs_opterror(opts, "jail \"%s\" is dying",
2059				    name);
2060				goto done_unlock_list;
2061			}
2062			goto found_prison;
2063		}
2064		error = ENOENT;
2065		vfs_opterror(opts, "jail \"%s\" not found", name);
2066		goto done_unlock_list;
2067	} else if (error != ENOENT)
2068		goto done_unlock_list;
2069
2070	vfs_opterror(opts, "no jail specified");
2071	error = ENOENT;
2072	goto done_unlock_list;
2073
2074 found_prison:
2075	/* Get the parameters of the prison. */
2076	pr->pr_ref++;
2077	locked = PD_LOCKED;
2078	td->td_retval[0] = pr->pr_id;
2079	error = vfs_setopt(opts, "jid", &pr->pr_id, sizeof(pr->pr_id));
2080	if (error != 0 && error != ENOENT)
2081		goto done_deref;
2082	i = (pr->pr_parent == mypr) ? 0 : pr->pr_parent->pr_id;
2083	error = vfs_setopt(opts, "parent", &i, sizeof(i));
2084	if (error != 0 && error != ENOENT)
2085		goto done_deref;
2086	error = vfs_setopts(opts, "name", prison_name(mypr, pr));
2087	if (error != 0 && error != ENOENT)
2088		goto done_deref;
2089	error = vfs_setopt(opts, "cpuset.id", &pr->pr_cpuset->cs_id,
2090	    sizeof(pr->pr_cpuset->cs_id));
2091	if (error != 0 && error != ENOENT)
2092		goto done_deref;
2093	error = vfs_setopts(opts, "path", prison_path(mypr, pr));
2094	if (error != 0 && error != ENOENT)
2095		goto done_deref;
2096#ifdef INET
2097	error = vfs_setopt_part(opts, "ip4.addr", pr->pr_ip4,
2098	    pr->pr_ip4s * sizeof(*pr->pr_ip4));
2099	if (error != 0 && error != ENOENT)
2100		goto done_deref;
2101#endif
2102#ifdef INET6
2103	error = vfs_setopt_part(opts, "ip6.addr", pr->pr_ip6,
2104	    pr->pr_ip6s * sizeof(*pr->pr_ip6));
2105	if (error != 0 && error != ENOENT)
2106		goto done_deref;
2107#endif
2108	error = vfs_setopt(opts, "securelevel", &pr->pr_securelevel,
2109	    sizeof(pr->pr_securelevel));
2110	if (error != 0 && error != ENOENT)
2111		goto done_deref;
2112	error = vfs_setopt(opts, "children.cur", &pr->pr_childcount,
2113	    sizeof(pr->pr_childcount));
2114	if (error != 0 && error != ENOENT)
2115		goto done_deref;
2116	error = vfs_setopt(opts, "children.max", &pr->pr_childmax,
2117	    sizeof(pr->pr_childmax));
2118	if (error != 0 && error != ENOENT)
2119		goto done_deref;
2120	error = vfs_setopts(opts, "host.hostname", pr->pr_hostname);
2121	if (error != 0 && error != ENOENT)
2122		goto done_deref;
2123	error = vfs_setopts(opts, "host.domainname", pr->pr_domainname);
2124	if (error != 0 && error != ENOENT)
2125		goto done_deref;
2126	error = vfs_setopts(opts, "host.hostuuid", pr->pr_hostuuid);
2127	if (error != 0 && error != ENOENT)
2128		goto done_deref;
2129#ifdef COMPAT_FREEBSD32
2130	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
2131		uint32_t hid32 = pr->pr_hostid;
2132
2133		error = vfs_setopt(opts, "host.hostid", &hid32, sizeof(hid32));
2134	} else
2135#endif
2136	error = vfs_setopt(opts, "host.hostid", &pr->pr_hostid,
2137	    sizeof(pr->pr_hostid));
2138	if (error != 0 && error != ENOENT)
2139		goto done_deref;
2140	error = vfs_setopt(opts, "enforce_statfs", &pr->pr_enforce_statfs,
2141	    sizeof(pr->pr_enforce_statfs));
2142	if (error != 0 && error != ENOENT)
2143		goto done_deref;
2144	error = vfs_setopt(opts, "devfs_ruleset", &pr->pr_devfs_rsnum,
2145	    sizeof(pr->pr_devfs_rsnum));
2146	if (error != 0 && error != ENOENT)
2147		goto done_deref;
2148	for (fi = 0; fi < sizeof(pr_flag_names) / sizeof(pr_flag_names[0]);
2149	    fi++) {
2150		if (pr_flag_names[fi] == NULL)
2151			continue;
2152		i = (pr->pr_flags & (1 << fi)) ? 1 : 0;
2153		error = vfs_setopt(opts, pr_flag_names[fi], &i, sizeof(i));
2154		if (error != 0 && error != ENOENT)
2155			goto done_deref;
2156		i = !i;
2157		error = vfs_setopt(opts, pr_flag_nonames[fi], &i, sizeof(i));
2158		if (error != 0 && error != ENOENT)
2159			goto done_deref;
2160	}
2161	for (fi = 0; fi < sizeof(pr_flag_jailsys) / sizeof(pr_flag_jailsys[0]);
2162	    fi++) {
2163		i = pr->pr_flags &
2164		    (pr_flag_jailsys[fi].disable | pr_flag_jailsys[fi].new);
2165		i = pr_flag_jailsys[fi].disable &&
2166		      (i == pr_flag_jailsys[fi].disable) ? JAIL_SYS_DISABLE
2167		    : (i == pr_flag_jailsys[fi].new) ? JAIL_SYS_NEW
2168		    : JAIL_SYS_INHERIT;
2169		error =
2170		    vfs_setopt(opts, pr_flag_jailsys[fi].name, &i, sizeof(i));
2171		if (error != 0 && error != ENOENT)
2172			goto done_deref;
2173	}
2174	for (fi = 0; fi < sizeof(pr_allow_names) / sizeof(pr_allow_names[0]);
2175	    fi++) {
2176		if (pr_allow_names[fi] == NULL)
2177			continue;
2178		i = (pr->pr_allow & (1 << fi)) ? 1 : 0;
2179		error = vfs_setopt(opts, pr_allow_names[fi], &i, sizeof(i));
2180		if (error != 0 && error != ENOENT)
2181			goto done_deref;
2182		i = !i;
2183		error = vfs_setopt(opts, pr_allow_nonames[fi], &i, sizeof(i));
2184		if (error != 0 && error != ENOENT)
2185			goto done_deref;
2186	}
2187	i = (pr->pr_uref == 0);
2188	error = vfs_setopt(opts, "dying", &i, sizeof(i));
2189	if (error != 0 && error != ENOENT)
2190		goto done_deref;
2191	i = !i;
2192	error = vfs_setopt(opts, "nodying", &i, sizeof(i));
2193	if (error != 0 && error != ENOENT)
2194		goto done_deref;
2195	error = vfs_setopt(opts, "osreldate", &pr->pr_osreldate,
2196	    sizeof(pr->pr_osreldate));
2197	if (error != 0 && error != ENOENT)
2198		goto done_deref;
2199	error = vfs_setopts(opts, "osrelease", pr->pr_osrelease);
2200	if (error != 0 && error != ENOENT)
2201		goto done_deref;
2202
2203	/* Get the module parameters. */
2204	mtx_unlock(&pr->pr_mtx);
2205	locked = 0;
2206	error = osd_jail_call(pr, PR_METHOD_GET, opts);
2207	if (error)
2208		goto done_deref;
2209	prison_deref(pr, PD_DEREF | PD_LIST_SLOCKED);
2210
2211	/* By now, all parameters should have been noted. */
2212	TAILQ_FOREACH(opt, opts, link) {
2213		if (!opt->seen && strcmp(opt->name, "errmsg")) {
2214			error = EINVAL;
2215			vfs_opterror(opts, "unknown parameter: %s", opt->name);
2216			goto done_errmsg;
2217		}
2218	}
2219
2220	/* Write the fetched parameters back to userspace. */
2221	error = 0;
2222	TAILQ_FOREACH(opt, opts, link) {
2223		if (opt->pos >= 0 && opt->pos != errmsg_pos) {
2224			pos = 2 * opt->pos + 1;
2225			optuio->uio_iov[pos].iov_len = opt->len;
2226			if (opt->value != NULL) {
2227				if (optuio->uio_segflg == UIO_SYSSPACE) {
2228					bcopy(opt->value,
2229					    optuio->uio_iov[pos].iov_base,
2230					    opt->len);
2231				} else {
2232					error = copyout(opt->value,
2233					    optuio->uio_iov[pos].iov_base,
2234					    opt->len);
2235					if (error)
2236						break;
2237				}
2238			}
2239		}
2240	}
2241	goto done_errmsg;
2242
2243 done_deref:
2244	prison_deref(pr, locked | PD_DEREF | PD_LIST_SLOCKED);
2245	goto done_errmsg;
2246
2247 done_unlock_list:
2248	sx_sunlock(&allprison_lock);
2249 done_errmsg:
2250	if (error && errmsg_pos >= 0) {
2251		vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len);
2252		errmsg_pos = 2 * errmsg_pos + 1;
2253		if (errmsg_len > 0) {
2254			if (optuio->uio_segflg == UIO_SYSSPACE)
2255				bcopy(errmsg,
2256				    optuio->uio_iov[errmsg_pos].iov_base,
2257				    errmsg_len);
2258			else
2259				copyout(errmsg,
2260				    optuio->uio_iov[errmsg_pos].iov_base,
2261				    errmsg_len);
2262		}
2263	}
2264	vfs_freeopts(opts);
2265	return (error);
2266}
2267
2268
/*
 * struct jail_remove_args {
 *	int jid;
 * };
 *
 * System call: remove the prison with the given jid, which must be a
 * descendant of the calling thread's prison, along with all of that
 * prison's own descendants.
 *
 * Returns 0 on success, the error from priv_check() if the caller lacks
 * PRIV_JAIL_REMOVE, or EINVAL if no matching prison was found.
 */
int
sys_jail_remove(struct thread *td, struct jail_remove_args *uap)
{
	struct prison *pr, *cpr, *lpr, *tpr;
	int descend, error;

	error = priv_check(td, PRIV_JAIL_REMOVE);
	if (error)
		return (error);

	/* prison_find_child returns the prison with its mutex held. */
	sx_xlock(&allprison_lock);
	pr = prison_find_child(td->td_ucred->cr_prison, uap->jid);
	if (pr == NULL) {
		sx_xunlock(&allprison_lock);
		return (EINVAL);
	}

	/* Remove all descendants of this prison, then remove this prison. */
	pr->pr_ref++;
	if (!LIST_EMPTY(&pr->pr_children)) {
		mtx_unlock(&pr->pr_mtx);
		/*
		 * lpr is the previously visited descendant that still needs
		 * removing.  Its removal is deferred until the walk has moved
		 * on to cpr and taken a reference on it.
		 */
		lpr = NULL;
		FOREACH_PRISON_DESCENDANT(pr, cpr, descend) {
			mtx_lock(&cpr->pr_mtx);
			if (cpr->pr_ref > 0) {
				tpr = cpr;
				cpr->pr_ref++;
			} else {
				/* Already removed - do not do it again. */
				tpr = NULL;
			}
			mtx_unlock(&cpr->pr_mtx);
			if (lpr != NULL) {
				/*
				 * prison_remove_one expects the prison mutex
				 * held and releases allprison_lock, so take
				 * the list lock again afterwards.
				 */
				mtx_lock(&lpr->pr_mtx);
				prison_remove_one(lpr);
				sx_xlock(&allprison_lock);
			}
			lpr = tpr;
		}
		/* Remove the last descendant left pending by the loop. */
		if (lpr != NULL) {
			mtx_lock(&lpr->pr_mtx);
			prison_remove_one(lpr);
			sx_xlock(&allprison_lock);
		}
		mtx_lock(&pr->pr_mtx);
	}
	/* This releases both the prison mutex and allprison_lock. */
	prison_remove_one(pr);
	return (0);
}
2323
2324static void
2325prison_remove_one(struct prison *pr)
2326{
2327	struct proc *p;
2328	int deuref;
2329
2330	/* If the prison was persistent, it is not anymore. */
2331	deuref = 0;
2332	if (pr->pr_flags & PR_PERSIST) {
2333		pr->pr_ref--;
2334		deuref = PD_DEUREF;
2335		pr->pr_flags &= ~PR_PERSIST;
2336	}
2337
2338	/*
2339	 * jail_remove added a reference.  If that's the only one, remove
2340	 * the prison now.
2341	 */
2342	KASSERT(pr->pr_ref > 0,
2343	    ("prison_remove_one removing a dead prison (jid=%d)", pr->pr_id));
2344	if (pr->pr_ref == 1) {
2345		prison_deref(pr,
2346		    deuref | PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED);
2347		return;
2348	}
2349
2350	mtx_unlock(&pr->pr_mtx);
2351	sx_xunlock(&allprison_lock);
2352	/*
2353	 * Kill all processes unfortunate enough to be attached to this prison.
2354	 */
2355	sx_slock(&allproc_lock);
2356	LIST_FOREACH(p, &allproc, p_list) {
2357		PROC_LOCK(p);
2358		if (p->p_state != PRS_NEW && p->p_ucred &&
2359		    p->p_ucred->cr_prison == pr)
2360			kern_psignal(p, SIGKILL);
2361		PROC_UNLOCK(p);
2362	}
2363	sx_sunlock(&allproc_lock);
2364	/* Remove the temporary reference added by jail_remove. */
2365	prison_deref(pr, deuref | PD_DEREF);
2366}
2367
2368
2369/*
2370 * struct jail_attach_args {
2371 *	int jid;
2372 * };
2373 */
2374int
2375sys_jail_attach(struct thread *td, struct jail_attach_args *uap)
2376{
2377	struct prison *pr;
2378	int error;
2379
2380	error = priv_check(td, PRIV_JAIL_ATTACH);
2381	if (error)
2382		return (error);
2383
2384	sx_slock(&allprison_lock);
2385	pr = prison_find_child(td->td_ucred->cr_prison, uap->jid);
2386	if (pr == NULL) {
2387		sx_sunlock(&allprison_lock);
2388		return (EINVAL);
2389	}
2390
2391	/*
2392	 * Do not allow a process to attach to a prison that is not
2393	 * considered to be "alive".
2394	 */
2395	if (pr->pr_uref == 0) {
2396		mtx_unlock(&pr->pr_mtx);
2397		sx_sunlock(&allprison_lock);
2398		return (EINVAL);
2399	}
2400
2401	return (do_jail_attach(td, pr));
2402}
2403
/*
 * Attach the calling thread's process to prison pr.
 *
 * Entered with pr's mutex held and allprison_lock held shared (see
 * sys_jail_attach); both are released on every path.  On success the
 * process is chdir'ed and chrooted to pr->pr_root and given a new
 * credential pointing at pr.  Returns 0 or an errno value.
 */
static int
do_jail_attach(struct thread *td, struct prison *pr)
{
	struct prison *ppr;
	struct proc *p;
	struct ucred *newcred, *oldcred;
	int error;

	/*
	 * XXX: Note that there is a slight race here if two threads
	 * in the same privileged process attempt to attach to two
	 * different jails at the same time.  It is important for
	 * user processes not to do this, or they might end up with
	 * a process root from one prison, but attached to the jail
	 * of another.
	 */
	/* Take the reference pair the process will hold on its new prison. */
	pr->pr_ref++;
	pr->pr_uref++;
	mtx_unlock(&pr->pr_mtx);

	/* Let modules do whatever they need to prepare for attaching. */
	error = osd_jail_call(pr, PR_METHOD_ATTACH, td);
	if (error) {
		/* Undo the references; this also drops allprison_lock. */
		prison_deref(pr, PD_DEREF | PD_DEUREF | PD_LIST_SLOCKED);
		return (error);
	}
	sx_sunlock(&allprison_lock);

	/*
	 * Reparent the newly attached process to this jail.
	 */
	ppr = td->td_ucred->cr_prison;
	p = td->td_proc;
	error = cpuset_setproc_update_set(p, pr->pr_cpuset);
	if (error)
		goto e_revert_osd;

	/* Change directory and root to the prison root. */
	vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY);
	if ((error = change_dir(pr->pr_root, td)) != 0)
		goto e_unlock;
#ifdef MAC
	if ((error = mac_vnode_check_chroot(td->td_ucred, pr->pr_root)))
		goto e_unlock;
#endif
	VOP_UNLOCK(pr->pr_root, 0);
	if ((error = change_root(pr->pr_root, td)))
		goto e_revert_osd;

	/* Swap in a copy of the old credential that points at pr. */
	newcred = crget();
	PROC_LOCK(p);
	oldcred = p->p_ucred;
	setsugid(p);
	crcopy(newcred, oldcred);
	newcred->cr_prison = pr;
	p->p_ucred = newcred;
	PROC_UNLOCK(p);
#ifdef RACCT
	racct_proc_ucred_changed(p, oldcred, newcred);
#endif
	crfree(oldcred);
	/* Drop one reference and one user reference on the old prison. */
	prison_deref(ppr, PD_DEREF | PD_DEUREF);
	return (0);
 e_unlock:
	VOP_UNLOCK(pr->pr_root, 0);
 e_revert_osd:
	/* Tell modules this thread is still in its old jail after all. */
	(void)osd_jail_call(ppr, PR_METHOD_ATTACH, td);
	/* Give back the references taken on pr at the top. */
	prison_deref(pr, PD_DEREF | PD_DEUREF);
	return (error);
}
2474
2475
2476/*
2477 * Returns a locked prison instance, or NULL on failure.
2478 */
2479struct prison *
2480prison_find(int prid)
2481{
2482	struct prison *pr;
2483
2484	sx_assert(&allprison_lock, SX_LOCKED);
2485	TAILQ_FOREACH(pr, &allprison, pr_list) {
2486		if (pr->pr_id == prid) {
2487			mtx_lock(&pr->pr_mtx);
2488			if (pr->pr_ref > 0)
2489				return (pr);
2490			mtx_unlock(&pr->pr_mtx);
2491		}
2492	}
2493	return (NULL);
2494}
2495
2496/*
2497 * Find a prison that is a descendant of mypr.  Returns a locked prison or NULL.
2498 */
2499struct prison *
2500prison_find_child(struct prison *mypr, int prid)
2501{
2502	struct prison *pr;
2503	int descend;
2504
2505	sx_assert(&allprison_lock, SX_LOCKED);
2506	FOREACH_PRISON_DESCENDANT(mypr, pr, descend) {
2507		if (pr->pr_id == prid) {
2508			mtx_lock(&pr->pr_mtx);
2509			if (pr->pr_ref > 0)
2510				return (pr);
2511			mtx_unlock(&pr->pr_mtx);
2512		}
2513	}
2514	return (NULL);
2515}
2516
2517/*
2518 * Look for the name relative to mypr.  Returns a locked prison or NULL.
2519 */
2520struct prison *
2521prison_find_name(struct prison *mypr, const char *name)
2522{
2523	struct prison *pr, *deadpr;
2524	size_t mylen;
2525	int descend;
2526
2527	sx_assert(&allprison_lock, SX_LOCKED);
2528	mylen = (mypr == &prison0) ? 0 : strlen(mypr->pr_name) + 1;
2529 again:
2530	deadpr = NULL;
2531	FOREACH_PRISON_DESCENDANT(mypr, pr, descend) {
2532		if (!strcmp(pr->pr_name + mylen, name)) {
2533			mtx_lock(&pr->pr_mtx);
2534			if (pr->pr_ref > 0) {
2535				if (pr->pr_uref > 0)
2536					return (pr);
2537				deadpr = pr;
2538			}
2539			mtx_unlock(&pr->pr_mtx);
2540		}
2541	}
2542	/* There was no valid prison - perhaps there was a dying one. */
2543	if (deadpr != NULL) {
2544		mtx_lock(&deadpr->pr_mtx);
2545		if (deadpr->pr_ref == 0) {
2546			mtx_unlock(&deadpr->pr_mtx);
2547			goto again;
2548		}
2549	}
2550	return (deadpr);
2551}
2552
2553/*
2554 * See if a prison has the specific flag set.
2555 */
2556int
2557prison_flag(struct ucred *cred, unsigned flag)
2558{
2559
2560	/* This is an atomic read, so no locking is necessary. */
2561	return (cred->cr_prison->pr_flags & flag);
2562}
2563
2564int
2565prison_allow(struct ucred *cred, unsigned flag)
2566{
2567
2568	/* This is an atomic read, so no locking is necessary. */
2569	return (cred->cr_prison->pr_allow & flag);
2570}
2571
2572/*
2573 * Remove a prison reference.  If that was the last reference, remove the
2574 * prison itself - but not in this context in case there are locks held.
2575 */
2576void
2577prison_free_locked(struct prison *pr)
2578{
2579
2580	mtx_assert(&pr->pr_mtx, MA_OWNED);
2581	pr->pr_ref--;
2582	if (pr->pr_ref == 0) {
2583		mtx_unlock(&pr->pr_mtx);
2584		TASK_INIT(&pr->pr_task, 0, prison_complete, pr);
2585		taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
2586		return;
2587	}
2588	mtx_unlock(&pr->pr_mtx);
2589}
2590
2591void
2592prison_free(struct prison *pr)
2593{
2594
2595	mtx_lock(&pr->pr_mtx);
2596	prison_free_locked(pr);
2597}
2598
/*
 * Task handler queued by prison_free_locked: finish releasing the prison
 * passed as context.  The pending count is not used.
 */
static void
prison_complete(void *context, int pending)
{
	struct prison *pr;

	pr = (struct prison *)context;
	/* No flags: nothing more to drop and no locks are held. */
	prison_deref(pr, 0);
}
2605
/*
 * Remove a prison reference (usually).  This internal version assumes no
 * mutexes are held, except perhaps the prison itself.  If there are no more
 * references, release and delist the prison.  On completion, the prison lock
 * and the allprison lock are both unlocked.
 *
 * The flags argument is a mask of PD_* bits, interpreted below as:
 *	PD_DEREF	drop one pr_ref reference
 *	PD_DEUREF	drop one pr_uref (user) reference
 *	PD_LOCKED	the caller already holds pr->pr_mtx
 *	PD_LIST_SLOCKED	the caller holds allprison_lock shared
 *	PD_LIST_XLOCKED	the caller holds allprison_lock exclusively
 */
static void
prison_deref(struct prison *pr, int flags)
{
	struct prison *ppr, *tpr;

	if (!(flags & PD_LOCKED))
		mtx_lock(&pr->pr_mtx);
	/* Each pass handles one prison, then moves up to its parent. */
	for (;;) {
		if (flags & PD_DEUREF) {
			pr->pr_uref--;
			KASSERT(prison0.pr_uref != 0, ("prison0 pr_uref=0"));
		}
		if (flags & PD_DEREF)
			pr->pr_ref--;
		/* If the prison still has references, nothing else to do. */
		if (pr->pr_ref > 0) {
			mtx_unlock(&pr->pr_mtx);
			if (flags & PD_LIST_SLOCKED)
				sx_sunlock(&allprison_lock);
			else if (flags & PD_LIST_XLOCKED)
				sx_xunlock(&allprison_lock);
			return;
		}

		/*
		 * No references remain.  Get allprison_lock exclusively,
		 * whatever state the caller left it in, to delist the prison.
		 */
		mtx_unlock(&pr->pr_mtx);
		if (flags & PD_LIST_SLOCKED) {
			if (!sx_try_upgrade(&allprison_lock)) {
				sx_sunlock(&allprison_lock);
				sx_xlock(&allprison_lock);
			}
		} else if (!(flags & PD_LIST_XLOCKED))
			sx_xlock(&allprison_lock);

		/* Unlink from both lists and fix the ancestors' child counts. */
		TAILQ_REMOVE(&allprison, pr, pr_list);
		LIST_REMOVE(pr, pr_sibling);
		ppr = pr->pr_parent;
		for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent)
			tpr->pr_childcount--;
		sx_xunlock(&allprison_lock);

		/* Release everything the prison owned, then the prison. */
#ifdef VIMAGE
		/* Only destroy a vnet that is not shared with the parent. */
		if (pr->pr_vnet != ppr->pr_vnet)
			vnet_destroy(pr->pr_vnet);
#endif
		if (pr->pr_root != NULL)
			vrele(pr->pr_root);
		mtx_destroy(&pr->pr_mtx);
#ifdef INET
		free(pr->pr_ip4, M_PRISON);
#endif
#ifdef INET6
		free(pr->pr_ip6, M_PRISON);
#endif
		if (pr->pr_cpuset != NULL)
			cpuset_rel(pr->pr_cpuset);
		osd_jail_exit(pr);
#ifdef RACCT
		if (racct_enable)
			prison_racct_detach(pr);
#endif
		free(pr, M_PRISON);

		/* Removing a prison frees a reference on its parent. */
		pr = ppr;
		mtx_lock(&pr->pr_mtx);
		/* The list lock is no longer held, so no PD_LIST_* bits. */
		flags = PD_DEREF | PD_DEUREF;
	}
}
2680
2681void
2682prison_hold_locked(struct prison *pr)
2683{
2684
2685	mtx_assert(&pr->pr_mtx, MA_OWNED);
2686	KASSERT(pr->pr_ref > 0,
2687	    ("Trying to hold dead prison (jid=%d).", pr->pr_id));
2688	pr->pr_ref++;
2689}
2690
2691void
2692prison_hold(struct prison *pr)
2693{
2694
2695	mtx_lock(&pr->pr_mtx);
2696	prison_hold_locked(pr);
2697	mtx_unlock(&pr->pr_mtx);
2698}
2699
2700void
2701prison_proc_hold(struct prison *pr)
2702{
2703
2704	mtx_lock(&pr->pr_mtx);
2705	KASSERT(pr->pr_uref > 0,
2706	    ("Cannot add a process to a non-alive prison (jid=%d)", pr->pr_id));
2707	pr->pr_uref++;
2708	mtx_unlock(&pr->pr_mtx);
2709}
2710
2711void
2712prison_proc_free(struct prison *pr)
2713{
2714
2715	mtx_lock(&pr->pr_mtx);
2716	KASSERT(pr->pr_uref > 0,
2717	    ("Trying to kill a process in a dead prison (jid=%d)", pr->pr_id));
2718	prison_deref(pr, PD_DEUREF | PD_LOCKED);
2719}
2720
2721
2722#ifdef INET
/*
 * Restrict a prison's IP address list with its parent's, possibly replacing
 * it.  Return true if the replacement buffer was used (or would have been).
 *
 * newip4 is an optional caller-supplied replacement buffer, or NULL.  The
 * return value is 1 when the parent's list did not fit and either newip4
 * was supplied or no buffer could be allocated here; it is 0 in every
 * other case.
 */
static int
prison_restrict_ip4(struct prison *pr, struct in_addr *newip4)
{
	int ii, ij, used;
	struct prison *ppr;

	ppr = pr->pr_parent;
	if (!(pr->pr_flags & PR_IP4_USER)) {
		/* This has no user settings, so just copy the parent's list. */
		if (pr->pr_ip4s < ppr->pr_ip4s) {
			/*
			 * There's no room for the parent's list.  Use the
			 * new list buffer, which is assumed to be big enough
			 * (if it was passed).  If there's no buffer, try to
			 * allocate one.
			 */
			used = 1;
			if (newip4 == NULL) {
				newip4 = malloc(ppr->pr_ip4s * sizeof(*newip4),
				    M_PRISON, M_NOWAIT);
				/* A locally allocated buffer is not "used". */
				if (newip4 != NULL)
					used = 0;
			}
			if (newip4 != NULL) {
				bcopy(ppr->pr_ip4, newip4,
				    ppr->pr_ip4s * sizeof(*newip4));
				free(pr->pr_ip4, M_PRISON);
				pr->pr_ip4 = newip4;
				pr->pr_ip4s = ppr->pr_ip4s;
			}
			return (used);
		}
		/* The existing array is large enough; copy in place. */
		pr->pr_ip4s = ppr->pr_ip4s;
		if (pr->pr_ip4s > 0)
			bcopy(ppr->pr_ip4, pr->pr_ip4,
			    pr->pr_ip4s * sizeof(*newip4));
		else if (pr->pr_ip4 != NULL) {
			free(pr->pr_ip4, M_PRISON);
			pr->pr_ip4 = NULL;
		}
	} else if (pr->pr_ip4s > 0) {
		/* Remove addresses that aren't in the parent. */
		/*
		 * The first entry is looked up anywhere in the parent's list.
		 * Keep it (ii = 1) if found, else shift it out (ii = 0).
		 */
		for (ij = 0; ij < ppr->pr_ip4s; ij++)
			if (pr->pr_ip4[0].s_addr == ppr->pr_ip4[ij].s_addr)
				break;
		if (ij < ppr->pr_ip4s)
			ii = 1;
		else {
			bcopy(pr->pr_ip4 + 1, pr->pr_ip4,
			    --pr->pr_ip4s * sizeof(*pr->pr_ip4));
			ii = 0;
		}
		/*
		 * Walk the remaining entries against the parent's entries
		 * from index 1 on, using the qcmp_v4 ordering (presumably
		 * both tails are sorted by it).
		 */
		for (ij = 1; ii < pr->pr_ip4s; ) {
			/* The parent's first address is always acceptable. */
			if (pr->pr_ip4[ii].s_addr == ppr->pr_ip4[0].s_addr) {
				ii++;
				continue;
			}
			switch (ij >= ppr->pr_ip4s ? -1 :
				qcmp_v4(&pr->pr_ip4[ii], &ppr->pr_ip4[ij])) {
			case -1:
				/* Not in the parent: delete this entry. */
				bcopy(pr->pr_ip4 + ii + 1, pr->pr_ip4 + ii,
				    (--pr->pr_ip4s - ii) * sizeof(*pr->pr_ip4));
				break;
			case 0:
				/* Present in both: keep it and advance both. */
				ii++;
				ij++;
				break;
			case 1:
				/* Parent entry sorts first: skip it. */
				ij++;
				break;
			}
		}
		/* Nothing survived: disable IPv4 and release the array. */
		if (pr->pr_ip4s == 0) {
			pr->pr_flags |= PR_IP4_DISABLE;
			free(pr->pr_ip4, M_PRISON);
			pr->pr_ip4 = NULL;
		}
	}
	return (0);
}
2807
2808/*
2809 * Pass back primary IPv4 address of this jail.
2810 *
2811 * If not restricted return success but do not alter the address.  Caller has
2812 * to make sure to initialize it correctly (e.g. INADDR_ANY).
2813 *
2814 * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv4.
2815 * Address returned in NBO.
2816 */
2817int
2818prison_get_ip4(struct ucred *cred, struct in_addr *ia)
2819{
2820	struct prison *pr;
2821
2822	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
2823	KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
2824
2825	pr = cred->cr_prison;
2826	if (!(pr->pr_flags & PR_IP4))
2827		return (0);
2828	mtx_lock(&pr->pr_mtx);
2829	if (!(pr->pr_flags & PR_IP4)) {
2830		mtx_unlock(&pr->pr_mtx);
2831		return (0);
2832	}
2833	if (pr->pr_ip4 == NULL) {
2834		mtx_unlock(&pr->pr_mtx);
2835		return (EAFNOSUPPORT);
2836	}
2837
2838	ia->s_addr = pr->pr_ip4[0].s_addr;
2839	mtx_unlock(&pr->pr_mtx);
2840	return (0);
2841}
2842
2843/*
2844 * Return 1 if we should do proper source address selection or are not jailed.
2845 * We will return 0 if we should bypass source address selection in favour
2846 * of the primary jail IPv4 address. Only in this case *ia will be updated and
2847 * returned in NBO.
2848 * Return EAFNOSUPPORT, in case this jail does not allow IPv4.
2849 */
2850int
2851prison_saddrsel_ip4(struct ucred *cred, struct in_addr *ia)
2852{
2853	struct prison *pr;
2854	struct in_addr lia;
2855	int error;
2856
2857	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
2858	KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
2859
2860	if (!jailed(cred))
2861		return (1);
2862
2863	pr = cred->cr_prison;
2864	if (pr->pr_flags & PR_IP4_SADDRSEL)
2865		return (1);
2866
2867	lia.s_addr = INADDR_ANY;
2868	error = prison_get_ip4(cred, &lia);
2869	if (error)
2870		return (error);
2871	if (lia.s_addr == INADDR_ANY)
2872		return (1);
2873
2874	ia->s_addr = lia.s_addr;
2875	return (0);
2876}
2877
2878/*
2879 * Return true if pr1 and pr2 have the same IPv4 address restrictions.
2880 */
2881int
2882prison_equal_ip4(struct prison *pr1, struct prison *pr2)
2883{
2884
2885	if (pr1 == pr2)
2886		return (1);
2887
2888	/*
2889	 * No need to lock since the PR_IP4_USER flag can't be altered for
2890	 * existing prisons.
2891	 */
2892	while (pr1 != &prison0 &&
2893#ifdef VIMAGE
2894	       !(pr1->pr_flags & PR_VNET) &&
2895#endif
2896	       !(pr1->pr_flags & PR_IP4_USER))
2897		pr1 = pr1->pr_parent;
2898	while (pr2 != &prison0 &&
2899#ifdef VIMAGE
2900	       !(pr2->pr_flags & PR_VNET) &&
2901#endif
2902	       !(pr2->pr_flags & PR_IP4_USER))
2903		pr2 = pr2->pr_parent;
2904	return (pr1 == pr2);
2905}
2906
2907/*
2908 * Make sure our (source) address is set to something meaningful to this
2909 * jail.
2910 *
2911 * Returns 0 if jail doesn't restrict IPv4 or if address belongs to jail,
2912 * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail
2913 * doesn't allow IPv4.  Address passed in in NBO and returned in NBO.
2914 */
2915int
2916prison_local_ip4(struct ucred *cred, struct in_addr *ia)
2917{
2918	struct prison *pr;
2919	struct in_addr ia0;
2920	int error;
2921
2922	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
2923	KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
2924
2925	pr = cred->cr_prison;
2926	if (!(pr->pr_flags & PR_IP4))
2927		return (0);
2928	mtx_lock(&pr->pr_mtx);
2929	if (!(pr->pr_flags & PR_IP4)) {
2930		mtx_unlock(&pr->pr_mtx);
2931		return (0);
2932	}
2933	if (pr->pr_ip4 == NULL) {
2934		mtx_unlock(&pr->pr_mtx);
2935		return (EAFNOSUPPORT);
2936	}
2937
2938	ia0.s_addr = ntohl(ia->s_addr);
2939	if (ia0.s_addr == INADDR_LOOPBACK) {
2940		ia->s_addr = pr->pr_ip4[0].s_addr;
2941		mtx_unlock(&pr->pr_mtx);
2942		return (0);
2943	}
2944
2945	if (ia0.s_addr == INADDR_ANY) {
2946		/*
2947		 * In case there is only 1 IPv4 address, bind directly.
2948		 */
2949		if (pr->pr_ip4s == 1)
2950			ia->s_addr = pr->pr_ip4[0].s_addr;
2951		mtx_unlock(&pr->pr_mtx);
2952		return (0);
2953	}
2954
2955	error = _prison_check_ip4(pr, ia);
2956	mtx_unlock(&pr->pr_mtx);
2957	return (error);
2958}
2959
2960/*
2961 * Rewrite destination address in case we will connect to loopback address.
2962 *
2963 * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv4.
2964 * Address passed in in NBO and returned in NBO.
2965 */
2966int
2967prison_remote_ip4(struct ucred *cred, struct in_addr *ia)
2968{
2969	struct prison *pr;
2970
2971	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
2972	KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
2973
2974	pr = cred->cr_prison;
2975	if (!(pr->pr_flags & PR_IP4))
2976		return (0);
2977	mtx_lock(&pr->pr_mtx);
2978	if (!(pr->pr_flags & PR_IP4)) {
2979		mtx_unlock(&pr->pr_mtx);
2980		return (0);
2981	}
2982	if (pr->pr_ip4 == NULL) {
2983		mtx_unlock(&pr->pr_mtx);
2984		return (EAFNOSUPPORT);
2985	}
2986
2987	if (ntohl(ia->s_addr) == INADDR_LOOPBACK) {
2988		ia->s_addr = pr->pr_ip4[0].s_addr;
2989		mtx_unlock(&pr->pr_mtx);
2990		return (0);
2991	}
2992
2993	/*
2994	 * Return success because nothing had to be changed.
2995	 */
2996	mtx_unlock(&pr->pr_mtx);
2997	return (0);
2998}
2999
3000/*
3001 * Check if given address belongs to the jail referenced by cred/prison.
3002 *
3003 * Returns 0 if jail doesn't restrict IPv4 or if address belongs to jail,
3004 * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail
3005 * doesn't allow IPv4.  Address passed in in NBO.
3006 */
3007static int
3008_prison_check_ip4(struct prison *pr, struct in_addr *ia)
3009{
3010	int i, a, z, d;
3011
3012	/*
3013	 * Check the primary IP.
3014	 */
3015	if (pr->pr_ip4[0].s_addr == ia->s_addr)
3016		return (0);
3017
3018	/*
3019	 * All the other IPs are sorted so we can do a binary search.
3020	 */
3021	a = 0;
3022	z = pr->pr_ip4s - 2;
3023	while (a <= z) {
3024		i = (a + z) / 2;
3025		d = qcmp_v4(&pr->pr_ip4[i+1], ia);
3026		if (d > 0)
3027			z = i - 1;
3028		else if (d < 0)
3029			a = i + 1;
3030		else
3031			return (0);
3032	}
3033
3034	return (EADDRNOTAVAIL);
3035}
3036
3037int
3038prison_check_ip4(struct ucred *cred, struct in_addr *ia)
3039{
3040	struct prison *pr;
3041	int error;
3042
3043	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3044	KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
3045
3046	pr = cred->cr_prison;
3047	if (!(pr->pr_flags & PR_IP4))
3048		return (0);
3049	mtx_lock(&pr->pr_mtx);
3050	if (!(pr->pr_flags & PR_IP4)) {
3051		mtx_unlock(&pr->pr_mtx);
3052		return (0);
3053	}
3054	if (pr->pr_ip4 == NULL) {
3055		mtx_unlock(&pr->pr_mtx);
3056		return (EAFNOSUPPORT);
3057	}
3058
3059	error = _prison_check_ip4(pr, ia);
3060	mtx_unlock(&pr->pr_mtx);
3061	return (error);
3062}
3063#endif
3064
3065#ifdef INET6
/*
 * Restrict a prison's IPv6 address list with its parent's, possibly
 * replacing it.
 *
 * newip6 is an optional caller-supplied replacement buffer, or NULL.  The
 * return value is 1 when the parent's list did not fit and either newip6
 * was supplied or no buffer could be allocated here; it is 0 in every
 * other case.
 */
static int
prison_restrict_ip6(struct prison *pr, struct in6_addr *newip6)
{
	int ii, ij, used;
	struct prison *ppr;

	ppr = pr->pr_parent;
	if (!(pr->pr_flags & PR_IP6_USER)) {
		/* This has no user settings, so just copy the parent's list. */
		if (pr->pr_ip6s < ppr->pr_ip6s) {
			/*
			 * There's no room for the parent's list.  Use the
			 * new list buffer, which is assumed to be big enough
			 * (if it was passed).  If there's no buffer, try to
			 * allocate one.
			 */
			used = 1;
			if (newip6 == NULL) {
				newip6 = malloc(ppr->pr_ip6s * sizeof(*newip6),
				    M_PRISON, M_NOWAIT);
				/* A locally allocated buffer is not "used". */
				if (newip6 != NULL)
					used = 0;
			}
			if (newip6 != NULL) {
				bcopy(ppr->pr_ip6, newip6,
				    ppr->pr_ip6s * sizeof(*newip6));
				free(pr->pr_ip6, M_PRISON);
				pr->pr_ip6 = newip6;
				pr->pr_ip6s = ppr->pr_ip6s;
			}
			return (used);
		}
		/* The existing array is large enough; copy in place. */
		pr->pr_ip6s = ppr->pr_ip6s;
		if (pr->pr_ip6s > 0)
			bcopy(ppr->pr_ip6, pr->pr_ip6,
			    pr->pr_ip6s * sizeof(*newip6));
		else if (pr->pr_ip6 != NULL) {
			free(pr->pr_ip6, M_PRISON);
			pr->pr_ip6 = NULL;
		}
	} else if (pr->pr_ip6s > 0) {
		/* Remove addresses that aren't in the parent. */
		/*
		 * The first entry is looked up anywhere in the parent's list.
		 * Keep it (ii = 1) if found, else shift it out (ii = 0).
		 */
		for (ij = 0; ij < ppr->pr_ip6s; ij++)
			if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[0],
			    &ppr->pr_ip6[ij]))
				break;
		if (ij < ppr->pr_ip6s)
			ii = 1;
		else {
			bcopy(pr->pr_ip6 + 1, pr->pr_ip6,
			    --pr->pr_ip6s * sizeof(*pr->pr_ip6));
			ii = 0;
		}
		/*
		 * Walk the remaining entries against the parent's entries
		 * from index 1 on, using the qcmp_v6 ordering (presumably
		 * both tails are sorted by it).
		 */
		for (ij = 1; ii < pr->pr_ip6s; ) {
			/* The parent's first address is always acceptable. */
			if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[ii],
			    &ppr->pr_ip6[0])) {
				ii++;
				continue;
			}
			switch (ij >= ppr->pr_ip6s ? -1 :
				qcmp_v6(&pr->pr_ip6[ii], &ppr->pr_ip6[ij])) {
			case -1:
				/* Not in the parent: delete this entry. */
				bcopy(pr->pr_ip6 + ii + 1, pr->pr_ip6 + ii,
				    (--pr->pr_ip6s - ii) * sizeof(*pr->pr_ip6));
				break;
			case 0:
				/* Present in both: keep it and advance both. */
				ii++;
				ij++;
				break;
			case 1:
				/* Parent entry sorts first: skip it. */
				ij++;
				break;
			}
		}
		/* Nothing survived: disable IPv6 and release the array. */
		if (pr->pr_ip6s == 0) {
			pr->pr_flags |= PR_IP6_DISABLE;
			free(pr->pr_ip6, M_PRISON);
			pr->pr_ip6 = NULL;
		}
	}
	return 0;
}
3148
3149/*
3150 * Pass back primary IPv6 address for this jail.
3151 *
3152 * If not restricted return success but do not alter the address.  Caller has
3153 * to make sure to initialize it correctly (e.g. IN6ADDR_ANY_INIT).
3154 *
3155 * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv6.
3156 */
3157int
3158prison_get_ip6(struct ucred *cred, struct in6_addr *ia6)
3159{
3160	struct prison *pr;
3161
3162	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3163	KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
3164
3165	pr = cred->cr_prison;
3166	if (!(pr->pr_flags & PR_IP6))
3167		return (0);
3168	mtx_lock(&pr->pr_mtx);
3169	if (!(pr->pr_flags & PR_IP6)) {
3170		mtx_unlock(&pr->pr_mtx);
3171		return (0);
3172	}
3173	if (pr->pr_ip6 == NULL) {
3174		mtx_unlock(&pr->pr_mtx);
3175		return (EAFNOSUPPORT);
3176	}
3177
3178	bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
3179	mtx_unlock(&pr->pr_mtx);
3180	return (0);
3181}
3182
3183/*
3184 * Return 1 if we should do proper source address selection or are not jailed.
3185 * We will return 0 if we should bypass source address selection in favour
3186 * of the primary jail IPv6 address. Only in this case *ia will be updated and
3187 * returned in NBO.
3188 * Return EAFNOSUPPORT, in case this jail does not allow IPv6.
3189 */
3190int
3191prison_saddrsel_ip6(struct ucred *cred, struct in6_addr *ia6)
3192{
3193	struct prison *pr;
3194	struct in6_addr lia6;
3195	int error;
3196
3197	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3198	KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
3199
3200	if (!jailed(cred))
3201		return (1);
3202
3203	pr = cred->cr_prison;
3204	if (pr->pr_flags & PR_IP6_SADDRSEL)
3205		return (1);
3206
3207	lia6 = in6addr_any;
3208	error = prison_get_ip6(cred, &lia6);
3209	if (error)
3210		return (error);
3211	if (IN6_IS_ADDR_UNSPECIFIED(&lia6))
3212		return (1);
3213
3214	bcopy(&lia6, ia6, sizeof(struct in6_addr));
3215	return (0);
3216}
3217
3218/*
3219 * Return true if pr1 and pr2 have the same IPv6 address restrictions.
3220 */
3221int
3222prison_equal_ip6(struct prison *pr1, struct prison *pr2)
3223{
3224
3225	if (pr1 == pr2)
3226		return (1);
3227
3228	while (pr1 != &prison0 &&
3229#ifdef VIMAGE
3230	       !(pr1->pr_flags & PR_VNET) &&
3231#endif
3232	       !(pr1->pr_flags & PR_IP6_USER))
3233		pr1 = pr1->pr_parent;
3234	while (pr2 != &prison0 &&
3235#ifdef VIMAGE
3236	       !(pr2->pr_flags & PR_VNET) &&
3237#endif
3238	       !(pr2->pr_flags & PR_IP6_USER))
3239		pr2 = pr2->pr_parent;
3240	return (pr1 == pr2);
3241}
3242
3243/*
3244 * Make sure our (source) address is set to something meaningful to this jail.
3245 *
3246 * v6only should be set based on (inp->inp_flags & IN6P_IPV6_V6ONLY != 0)
3247 * when needed while binding.
3248 *
3249 * Returns 0 if jail doesn't restrict IPv6 or if address belongs to jail,
3250 * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail
3251 * doesn't allow IPv6.
3252 */
3253int
3254prison_local_ip6(struct ucred *cred, struct in6_addr *ia6, int v6only)
3255{
3256	struct prison *pr;
3257	int error;
3258
3259	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3260	KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
3261
3262	pr = cred->cr_prison;
3263	if (!(pr->pr_flags & PR_IP6))
3264		return (0);
3265	mtx_lock(&pr->pr_mtx);
3266	if (!(pr->pr_flags & PR_IP6)) {
3267		mtx_unlock(&pr->pr_mtx);
3268		return (0);
3269	}
3270	if (pr->pr_ip6 == NULL) {
3271		mtx_unlock(&pr->pr_mtx);
3272		return (EAFNOSUPPORT);
3273	}
3274
3275	if (IN6_IS_ADDR_LOOPBACK(ia6)) {
3276		bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
3277		mtx_unlock(&pr->pr_mtx);
3278		return (0);
3279	}
3280
3281	if (IN6_IS_ADDR_UNSPECIFIED(ia6)) {
3282		/*
3283		 * In case there is only 1 IPv6 address, and v6only is true,
3284		 * then bind directly.
3285		 */
3286		if (v6only != 0 && pr->pr_ip6s == 1)
3287			bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
3288		mtx_unlock(&pr->pr_mtx);
3289		return (0);
3290	}
3291
3292	error = _prison_check_ip6(pr, ia6);
3293	mtx_unlock(&pr->pr_mtx);
3294	return (error);
3295}
3296
3297/*
3298 * Rewrite destination address in case we will connect to loopback address.
3299 *
3300 * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv6.
3301 */
3302int
3303prison_remote_ip6(struct ucred *cred, struct in6_addr *ia6)
3304{
3305	struct prison *pr;
3306
3307	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3308	KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
3309
3310	pr = cred->cr_prison;
3311	if (!(pr->pr_flags & PR_IP6))
3312		return (0);
3313	mtx_lock(&pr->pr_mtx);
3314	if (!(pr->pr_flags & PR_IP6)) {
3315		mtx_unlock(&pr->pr_mtx);
3316		return (0);
3317	}
3318	if (pr->pr_ip6 == NULL) {
3319		mtx_unlock(&pr->pr_mtx);
3320		return (EAFNOSUPPORT);
3321	}
3322
3323	if (IN6_IS_ADDR_LOOPBACK(ia6)) {
3324		bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
3325		mtx_unlock(&pr->pr_mtx);
3326		return (0);
3327	}
3328
3329	/*
3330	 * Return success because nothing had to be changed.
3331	 */
3332	mtx_unlock(&pr->pr_mtx);
3333	return (0);
3334}
3335
3336/*
3337 * Check if given address belongs to the jail referenced by cred/prison.
3338 *
3339 * Returns 0 if jail doesn't restrict IPv6 or if address belongs to jail,
3340 * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail
3341 * doesn't allow IPv6.
3342 */
3343static int
3344_prison_check_ip6(struct prison *pr, struct in6_addr *ia6)
3345{
3346	int i, a, z, d;
3347
3348	/*
3349	 * Check the primary IP.
3350	 */
3351	if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[0], ia6))
3352		return (0);
3353
3354	/*
3355	 * All the other IPs are sorted so we can do a binary search.
3356	 */
3357	a = 0;
3358	z = pr->pr_ip6s - 2;
3359	while (a <= z) {
3360		i = (a + z) / 2;
3361		d = qcmp_v6(&pr->pr_ip6[i+1], ia6);
3362		if (d > 0)
3363			z = i - 1;
3364		else if (d < 0)
3365			a = i + 1;
3366		else
3367			return (0);
3368	}
3369
3370	return (EADDRNOTAVAIL);
3371}
3372
3373int
3374prison_check_ip6(struct ucred *cred, struct in6_addr *ia6)
3375{
3376	struct prison *pr;
3377	int error;
3378
3379	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3380	KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
3381
3382	pr = cred->cr_prison;
3383	if (!(pr->pr_flags & PR_IP6))
3384		return (0);
3385	mtx_lock(&pr->pr_mtx);
3386	if (!(pr->pr_flags & PR_IP6)) {
3387		mtx_unlock(&pr->pr_mtx);
3388		return (0);
3389	}
3390	if (pr->pr_ip6 == NULL) {
3391		mtx_unlock(&pr->pr_mtx);
3392		return (EAFNOSUPPORT);
3393	}
3394
3395	error = _prison_check_ip6(pr, ia6);
3396	mtx_unlock(&pr->pr_mtx);
3397	return (error);
3398}
3399#endif
3400
3401/*
3402 * Check if a jail supports the given address family.
3403 *
3404 * Returns 0 if not jailed or the address family is supported, EAFNOSUPPORT
3405 * if not.
3406 */
3407int
3408prison_check_af(struct ucred *cred, int af)
3409{
3410	struct prison *pr;
3411	int error;
3412
3413	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3414
3415	pr = cred->cr_prison;
3416#ifdef VIMAGE
3417	/* Prisons with their own network stack are not limited. */
3418	if (prison_owns_vnet(cred))
3419		return (0);
3420#endif
3421
3422	error = 0;
3423	switch (af)
3424	{
3425#ifdef INET
3426	case AF_INET:
3427		if (pr->pr_flags & PR_IP4)
3428		{
3429			mtx_lock(&pr->pr_mtx);
3430			if ((pr->pr_flags & PR_IP4) && pr->pr_ip4 == NULL)
3431				error = EAFNOSUPPORT;
3432			mtx_unlock(&pr->pr_mtx);
3433		}
3434		break;
3435#endif
3436#ifdef INET6
3437	case AF_INET6:
3438		if (pr->pr_flags & PR_IP6)
3439		{
3440			mtx_lock(&pr->pr_mtx);
3441			if ((pr->pr_flags & PR_IP6) && pr->pr_ip6 == NULL)
3442				error = EAFNOSUPPORT;
3443			mtx_unlock(&pr->pr_mtx);
3444		}
3445		break;
3446#endif
3447	case AF_LOCAL:
3448	case AF_ROUTE:
3449		break;
3450	default:
3451		if (!(pr->pr_allow & PR_ALLOW_SOCKET_AF))
3452			error = EAFNOSUPPORT;
3453	}
3454	return (error);
3455}
3456
3457/*
3458 * Check if given address belongs to the jail referenced by cred (wrapper to
3459 * prison_check_ip[46]).
3460 *
3461 * Returns 0 if jail doesn't restrict the address family or if address belongs
3462 * to jail, EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if
3463 * the jail doesn't allow the address family.  IPv4 Address passed in in NBO.
3464 */
3465int
3466prison_if(struct ucred *cred, struct sockaddr *sa)
3467{
3468#ifdef INET
3469	struct sockaddr_in *sai;
3470#endif
3471#ifdef INET6
3472	struct sockaddr_in6 *sai6;
3473#endif
3474	int error;
3475
3476	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3477	KASSERT(sa != NULL, ("%s: sa is NULL", __func__));
3478
3479#ifdef VIMAGE
3480	if (prison_owns_vnet(cred))
3481		return (0);
3482#endif
3483
3484	error = 0;
3485	switch (sa->sa_family)
3486	{
3487#ifdef INET
3488	case AF_INET:
3489		sai = (struct sockaddr_in *)sa;
3490		error = prison_check_ip4(cred, &sai->sin_addr);
3491		break;
3492#endif
3493#ifdef INET6
3494	case AF_INET6:
3495		sai6 = (struct sockaddr_in6 *)sa;
3496		error = prison_check_ip6(cred, &sai6->sin6_addr);
3497		break;
3498#endif
3499	default:
3500		if (!(cred->cr_prison->pr_allow & PR_ALLOW_SOCKET_AF))
3501			error = EAFNOSUPPORT;
3502	}
3503	return (error);
3504}
3505
3506/*
3507 * Return 0 if jails permit p1 to frob p2, otherwise ESRCH.
3508 */
3509int
3510prison_check(struct ucred *cred1, struct ucred *cred2)
3511{
3512
3513	return ((cred1->cr_prison == cred2->cr_prison ||
3514	    prison_ischild(cred1->cr_prison, cred2->cr_prison)) ? 0 : ESRCH);
3515}
3516
3517/*
3518 * Return 1 if p2 is a child of p1, otherwise 0.
3519 */
3520int
3521prison_ischild(struct prison *pr1, struct prison *pr2)
3522{
3523
3524	for (pr2 = pr2->pr_parent; pr2 != NULL; pr2 = pr2->pr_parent)
3525		if (pr1 == pr2)
3526			return (1);
3527	return (0);
3528}
3529
3530/*
3531 * Return 1 if the passed credential is in a jail, otherwise 0.
3532 */
3533int
3534jailed(struct ucred *cred)
3535{
3536
3537	return (cred->cr_prison != &prison0);
3538}
3539
/*
 * Return 1 if the passed credential is in a jail and that jail does not
 * own a virtual network stack, otherwise 0.
 */
int
jailed_without_vnet(struct ucred *cred)
{
	int restricted;

	restricted = jailed(cred) ? 1 : 0;
#ifdef VIMAGE
	/* A jail with its own vnet does not count. */
	if (restricted && prison_owns_vnet(cred))
		restricted = 0;
#endif
	return (restricted);
}
3557
3558/*
3559 * Return the correct hostname (domainname, et al) for the passed credential.
3560 */
3561void
3562getcredhostname(struct ucred *cred, char *buf, size_t size)
3563{
3564	struct prison *pr;
3565
3566	/*
3567	 * A NULL credential can be used to shortcut to the physical
3568	 * system's hostname.
3569	 */
3570	pr = (cred != NULL) ? cred->cr_prison : &prison0;
3571	mtx_lock(&pr->pr_mtx);
3572	strlcpy(buf, pr->pr_hostname, size);
3573	mtx_unlock(&pr->pr_mtx);
3574}
3575
3576void
3577getcreddomainname(struct ucred *cred, char *buf, size_t size)
3578{
3579
3580	mtx_lock(&cred->cr_prison->pr_mtx);
3581	strlcpy(buf, cred->cr_prison->pr_domainname, size);
3582	mtx_unlock(&cred->cr_prison->pr_mtx);
3583}
3584
3585void
3586getcredhostuuid(struct ucred *cred, char *buf, size_t size)
3587{
3588
3589	mtx_lock(&cred->cr_prison->pr_mtx);
3590	strlcpy(buf, cred->cr_prison->pr_hostuuid, size);
3591	mtx_unlock(&cred->cr_prison->pr_mtx);
3592}
3593
3594void
3595getcredhostid(struct ucred *cred, unsigned long *hostid)
3596{
3597
3598	mtx_lock(&cred->cr_prison->pr_mtx);
3599	*hostid = cred->cr_prison->pr_hostid;
3600	mtx_unlock(&cred->cr_prison->pr_mtx);
3601}
3602
3603#ifdef VIMAGE
3604/*
3605 * Determine whether the prison represented by cred owns
3606 * its vnet rather than having it inherited.
3607 *
3608 * Returns 1 in case the prison owns the vnet, 0 otherwise.
3609 */
3610int
3611prison_owns_vnet(struct ucred *cred)
3612{
3613
3614	/*
3615	 * vnets cannot be added/removed after jail creation,
3616	 * so no need to lock here.
3617	 */
3618	return (cred->cr_prison->pr_flags & PR_VNET ? 1 : 0);
3619}
3620#endif
3621
3622/*
3623 * Determine whether the subject represented by cred can "see"
3624 * status of a mount point.
3625 * Returns: 0 for permitted, ENOENT otherwise.
3626 * XXX: This function should be called cr_canseemount() and should be
3627 *      placed in kern_prot.c.
3628 */
3629int
3630prison_canseemount(struct ucred *cred, struct mount *mp)
3631{
3632	struct prison *pr;
3633	struct statfs *sp;
3634	size_t len;
3635
3636	pr = cred->cr_prison;
3637	if (pr->pr_enforce_statfs == 0)
3638		return (0);
3639	if (pr->pr_root->v_mount == mp)
3640		return (0);
3641	if (pr->pr_enforce_statfs == 2)
3642		return (ENOENT);
3643	/*
3644	 * If jail's chroot directory is set to "/" we should be able to see
3645	 * all mount-points from inside a jail.
3646	 * This is ugly check, but this is the only situation when jail's
3647	 * directory ends with '/'.
3648	 */
3649	if (strcmp(pr->pr_path, "/") == 0)
3650		return (0);
3651	len = strlen(pr->pr_path);
3652	sp = &mp->mnt_stat;
3653	if (strncmp(pr->pr_path, sp->f_mntonname, len) != 0)
3654		return (ENOENT);
3655	/*
3656	 * Be sure that we don't have situation where jail's root directory
3657	 * is "/some/path" and mount point is "/some/pathpath".
3658	 */
3659	if (sp->f_mntonname[len] != '\0' && sp->f_mntonname[len] != '/')
3660		return (ENOENT);
3661	return (0);
3662}
3663
3664void
3665prison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp)
3666{
3667	char jpath[MAXPATHLEN];
3668	struct prison *pr;
3669	size_t len;
3670
3671	pr = cred->cr_prison;
3672	if (pr->pr_enforce_statfs == 0)
3673		return;
3674	if (prison_canseemount(cred, mp) != 0) {
3675		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
3676		strlcpy(sp->f_mntonname, "[restricted]",
3677		    sizeof(sp->f_mntonname));
3678		return;
3679	}
3680	if (pr->pr_root->v_mount == mp) {
3681		/*
3682		 * Clear current buffer data, so we are sure nothing from
3683		 * the valid path left there.
3684		 */
3685		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
3686		*sp->f_mntonname = '/';
3687		return;
3688	}
3689	/*
3690	 * If jail's chroot directory is set to "/" we should be able to see
3691	 * all mount-points from inside a jail.
3692	 */
3693	if (strcmp(pr->pr_path, "/") == 0)
3694		return;
3695	len = strlen(pr->pr_path);
3696	strlcpy(jpath, sp->f_mntonname + len, sizeof(jpath));
3697	/*
3698	 * Clear current buffer data, so we are sure nothing from
3699	 * the valid path left there.
3700	 */
3701	bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
3702	if (*jpath == '\0') {
3703		/* Should never happen. */
3704		*sp->f_mntonname = '/';
3705	} else {
3706		strlcpy(sp->f_mntonname, jpath, sizeof(sp->f_mntonname));
3707	}
3708}
3709
/*
 * Check whether permission for a specific privilege is granted within jail.
 * We have a specific list of accepted privileges; the rest are denied.
 *
 * Returns 0 if cred is not jailed or if priv is permitted for cred's prison,
 * and EPERM otherwise.  The check is done in two stages: under VIMAGE a
 * first switch grants a set of network privileges to prisons that have
 * PR_VNET set; everything not granted there is decided by the second
 * switch, which applies to all jails.
 */
int
prison_priv_check(struct ucred *cred, int priv)
{

	/* Credentials outside any jail are not restricted here. */
	if (!jailed(cred))
		return (0);

#ifdef VIMAGE
	/*
	 * Privileges specific to prisons with a virtual network stack.
	 * There might be a duplicate entry here in case the privilege
	 * is only granted conditionally in the legacy jail case.
	 */
	switch (priv) {
#ifdef notyet
		/*
		 * NFS-specific privileges.
		 */
	case PRIV_NFS_DAEMON:
	case PRIV_NFS_LOCKD:
#endif
		/*
		 * Network stack privileges.
		 */
	case PRIV_NET_BRIDGE:
	case PRIV_NET_GRE:
	case PRIV_NET_BPF:
	case PRIV_NET_RAW:		/* Dup, cond. in legacy jail case. */
	case PRIV_NET_ROUTE:
	case PRIV_NET_TAP:
	case PRIV_NET_SETIFMTU:
	case PRIV_NET_SETIFFLAGS:
	case PRIV_NET_SETIFCAP:
	case PRIV_NET_SETIFDESCR:
	case PRIV_NET_SETIFNAME	:
	case PRIV_NET_SETIFMETRIC:
	case PRIV_NET_SETIFPHYS:
	case PRIV_NET_SETIFMAC:
	case PRIV_NET_ADDMULTI:
	case PRIV_NET_DELMULTI:
	case PRIV_NET_HWIOCTL:
	case PRIV_NET_SETLLADDR:
	case PRIV_NET_ADDIFGROUP:
	case PRIV_NET_DELIFGROUP:
	case PRIV_NET_IFCREATE:
	case PRIV_NET_IFDESTROY:
	case PRIV_NET_ADDIFADDR:
	case PRIV_NET_DELIFADDR:
	case PRIV_NET_LAGG:
	case PRIV_NET_GIF:
	case PRIV_NET_SETIFVNET:
	case PRIV_NET_SETIFFIB:

		/*
		 * 802.11-related privileges.
		 */
	case PRIV_NET80211_GETKEY:
#ifdef notyet
	case PRIV_NET80211_MANAGE:		/* XXX-BZ discuss with sam@ */
#endif

#ifdef notyet
		/*
		 * AppleTalk privileges.
		 */
	case PRIV_NETATALK_RESERVEDPORT:

		/*
		 * ATM privileges.
		 */
	case PRIV_NETATM_CFG:
	case PRIV_NETATM_ADD:
	case PRIV_NETATM_DEL:
	case PRIV_NETATM_SET:

		/*
		 * Bluetooth privileges.
		 */
	case PRIV_NETBLUETOOTH_RAW:
#endif

		/*
		 * Netgraph and netgraph module privileges.
		 */
	case PRIV_NETGRAPH_CONTROL:
#ifdef notyet
	case PRIV_NETGRAPH_TTY:
#endif

		/*
		 * IPv4 and IPv6 privileges.
		 */
	case PRIV_NETINET_IPFW:
	case PRIV_NETINET_DIVERT:
	case PRIV_NETINET_PF:
	case PRIV_NETINET_DUMMYNET:
	case PRIV_NETINET_CARP:
	case PRIV_NETINET_MROUTE:
	case PRIV_NETINET_RAW:
	case PRIV_NETINET_ADDRCTRL6:
	case PRIV_NETINET_ND6:
	case PRIV_NETINET_SCOPE6:
	case PRIV_NETINET_ALIFETIME6:
	case PRIV_NETINET_IPSEC:
	case PRIV_NETINET_BINDANY:

#ifdef notyet
		/*
		 * IPX/SPX privileges.
		 */
	case PRIV_NETIPX_RESERVEDPORT:
	case PRIV_NETIPX_RAW:

		/*
		 * NCP privileges.
		 */
	case PRIV_NETNCP:

		/*
		 * SMB privileges.
		 */
	case PRIV_NETSMB:
#endif

	/*
	 * No default: or deny here.
	 * In case of no permit fall through to next switch().
	 */
		/* All cases above share this one body. */
		if (cred->cr_prison->pr_flags & PR_VNET)
			return (0);
	}
#endif /* VIMAGE */

	/* Second stage: the list that applies to every jail. */
	switch (priv) {

		/*
		 * Allow ktrace privileges for root in jail.
		 */
	case PRIV_KTRACE:

#if 0
		/*
		 * Allow jailed processes to configure audit identity and
		 * submit audit records (login, etc).  In the future we may
		 * want to further refine the relationship between audit and
		 * jail.
		 */
	case PRIV_AUDIT_GETAUDIT:
	case PRIV_AUDIT_SETAUDIT:
	case PRIV_AUDIT_SUBMIT:
#endif

		/*
		 * Allow jailed processes to manipulate process UNIX
		 * credentials in any way they see fit.
		 */
	case PRIV_CRED_SETUID:
	case PRIV_CRED_SETEUID:
	case PRIV_CRED_SETGID:
	case PRIV_CRED_SETEGID:
	case PRIV_CRED_SETGROUPS:
	case PRIV_CRED_SETREUID:
	case PRIV_CRED_SETREGID:
	case PRIV_CRED_SETRESUID:
	case PRIV_CRED_SETRESGID:

		/*
		 * Jail implements visibility constraints already, so allow
		 * jailed root to override uid/gid-based constraints.
		 */
	case PRIV_SEEOTHERGIDS:
	case PRIV_SEEOTHERUIDS:

		/*
		 * Jail implements inter-process debugging limits already, so
		 * allow jailed root various debugging privileges.
		 */
	case PRIV_DEBUG_DIFFCRED:
	case PRIV_DEBUG_SUGID:
	case PRIV_DEBUG_UNPRIV:

		/*
		 * Allow jail to set various resource limits and login
		 * properties, and for now, exceed process resource limits.
		 */
	case PRIV_PROC_LIMIT:
	case PRIV_PROC_SETLOGIN:
	case PRIV_PROC_SETRLIMIT:

		/*
		 * System V and POSIX IPC privileges are granted in jail.
		 */
	case PRIV_IPC_READ:
	case PRIV_IPC_WRITE:
	case PRIV_IPC_ADMIN:
	case PRIV_IPC_MSGSIZE:
	case PRIV_MQ_ADMIN:

		/*
		 * Jail operations within a jail work on child jails.
		 */
	case PRIV_JAIL_ATTACH:
	case PRIV_JAIL_SET:
	case PRIV_JAIL_REMOVE:

		/*
		 * Jail implements its own inter-process limits, so allow
		 * root processes in jail to change scheduling on other
		 * processes in the same jail.  Likewise for signalling.
		 */
	case PRIV_SCHED_DIFFCRED:
	case PRIV_SCHED_CPUSET:
	case PRIV_SIGNAL_DIFFCRED:
	case PRIV_SIGNAL_SUGID:

		/*
		 * Allow jailed processes to write to sysctls marked as jail
		 * writable.
		 */
	case PRIV_SYSCTL_WRITEJAIL:

		/*
		 * Allow root in jail to manage a variety of quota
		 * properties.  These should likely be conditional on a
		 * configuration option.
		 */
	case PRIV_VFS_GETQUOTA:
	case PRIV_VFS_SETQUOTA:

		/*
		 * Since Jail relies on chroot() to implement file system
		 * protections, grant many VFS privileges to root in jail.
		 * Be careful to exclude mount-related and NFS-related
		 * privileges.
		 */
	case PRIV_VFS_READ:
	case PRIV_VFS_WRITE:
	case PRIV_VFS_ADMIN:
	case PRIV_VFS_EXEC:
	case PRIV_VFS_LOOKUP:
	case PRIV_VFS_BLOCKRESERVE:	/* XXXRW: Slightly surprising. */
	case PRIV_VFS_CHFLAGS_DEV:
	case PRIV_VFS_CHOWN:
	case PRIV_VFS_CHROOT:
	case PRIV_VFS_RETAINSUGID:
	case PRIV_VFS_FCHROOT:
	case PRIV_VFS_LINK:
	case PRIV_VFS_SETGID:
	case PRIV_VFS_STAT:
	case PRIV_VFS_STICKYFILE:

		/*
		 * As in the non-jail case, non-root users are expected to be
		 * able to read kernel/physical memory (provided /dev/[k]mem
		 * exists in the jail and they have permission to access it).
		 */
	case PRIV_KMEM_READ:
		/* Every case label from PRIV_KTRACE down to here is granted. */
		return (0);

		/*
		 * Depending on the global setting, allow privilege of
		 * setting system flags.
		 */
	case PRIV_VFS_SYSFLAGS:
		if (cred->cr_prison->pr_allow & PR_ALLOW_CHFLAGS)
			return (0);
		else
			return (EPERM);

		/*
		 * Depending on the global setting, allow privilege of
		 * mounting/unmounting file systems.
		 */
	case PRIV_VFS_MOUNT:
	case PRIV_VFS_UNMOUNT:
	case PRIV_VFS_MOUNT_NONUSER:
	case PRIV_VFS_MOUNT_OWNER:
		/* Needs the mount allow bit and pr_enforce_statfs below 2. */
		if (cred->cr_prison->pr_allow & PR_ALLOW_MOUNT &&
		    cred->cr_prison->pr_enforce_statfs < 2)
			return (0);
		else
			return (EPERM);

		/*
		 * Allow jailed root to bind reserved ports and reuse in-use
		 * ports.
		 */
	case PRIV_NETINET_RESERVEDPORT:
	case PRIV_NETINET_REUSEPORT:
		return (0);

		/*
		 * Allow jailed root to set certain IPv4/6 (option) headers.
		 */
	case PRIV_NETINET_SETHDROPTS:
		return (0);

		/*
		 * Conditionally allow creating raw sockets in jail.
		 */
	case PRIV_NETINET_RAW:
		if (cred->cr_prison->pr_allow & PR_ALLOW_RAW_SOCKETS)
			return (0);
		else
			return (EPERM);

		/*
		 * Since jail implements its own visibility limits on netstat
		 * sysctls, allow getcred.  This allows identd to work in
		 * jail.
		 */
	case PRIV_NETINET_GETCRED:
		return (0);

		/*
		 * Allow jailed root to set loginclass.
		 */
	case PRIV_PROC_SETLOGINCLASS:
		return (0);

	default:
		/*
		 * In all remaining cases, deny the privilege request.  This
		 * includes almost all network privileges, many system
		 * configuration privileges.
		 */
		return (EPERM);
	}
}
4043
4044/*
4045 * Return the part of pr2's name that is relative to pr1, or the whole name
4046 * if it does not directly follow.
4047 */
4048
4049char *
4050prison_name(struct prison *pr1, struct prison *pr2)
4051{
4052	char *name;
4053
4054	/* Jails see themselves as "0" (if they see themselves at all). */
4055	if (pr1 == pr2)
4056		return "0";
4057	name = pr2->pr_name;
4058	if (prison_ischild(pr1, pr2)) {
4059		/*
4060		 * pr1 isn't locked (and allprison_lock may not be either)
4061		 * so its length can't be counted on.  But the number of dots
4062		 * can be counted on - and counted.
4063		 */
4064		for (; pr1 != &prison0; pr1 = pr1->pr_parent)
4065			name = strchr(name, '.') + 1;
4066	}
4067	return (name);
4068}
4069
4070/*
4071 * Return the part of pr2's path that is relative to pr1, or the whole path
4072 * if it does not directly follow.
4073 */
4074static char *
4075prison_path(struct prison *pr1, struct prison *pr2)
4076{
4077	char *path1, *path2;
4078	int len1;
4079
4080	path1 = pr1->pr_path;
4081	path2 = pr2->pr_path;
4082	if (!strcmp(path1, "/"))
4083		return (path2);
4084	len1 = strlen(path1);
4085	if (strncmp(path1, path2, len1))
4086		return (path2);
4087	if (path2[len1] == '\0')
4088		return "/";
4089	if (path2[len1] == '/')
4090		return (path2 + len1);
4091	return (path2);
4092}
4093
4094
/*
 * Jail-related sysctls.
 *
 * This node is the "jail" child of _security; the SYSCTL_* declarations
 * below that name _security_jail as their parent hang off it.
 */
static SYSCTL_NODE(_security, OID_AUTO, jail, CTLFLAG_RW, 0,
    "Jails");
4100
/*
 * Sysctl handler that walks the descendants of the requesting thread's
 * prison and, for each one with a non-zero pr_ref, writes out a struct
 * xprison followed by that prison's IPv4 and then IPv6 address arrays
 * (when INET / INET6 are configured and the prison has such addresses).
 *
 * Returns 0 on success or the first error reported by SYSCTL_OUT().
 */
static int
sysctl_jail_list(SYSCTL_HANDLER_ARGS)
{
	struct xprison *xp;
	struct prison *pr, *cpr;
#ifdef INET
	/* Scratch copy of the current prison's IPv4 list; grown on demand. */
	struct in_addr *ip4 = NULL;
	int ip4s = 0;
#endif
#ifdef INET6
	/* Scratch copy of the current prison's IPv6 list; grown on demand. */
	struct in6_addr *ip6 = NULL;
	int ip6s = 0;
#endif
	int descend, error;

	xp = malloc(sizeof(*xp), M_TEMP, M_WAITOK);
	pr = req->td->td_ucred->cr_prison;
	error = 0;
	sx_slock(&allprison_lock);
	FOREACH_PRISON_DESCENDANT(pr, cpr, descend) {
#if defined(INET) || defined(INET6)
 again:
#endif
		mtx_lock(&cpr->pr_mtx);
#ifdef INET
		if (cpr->pr_ip4s > 0) {
			if (ip4s < cpr->pr_ip4s) {
				/*
				 * Scratch buffer too small.  The mutex is
				 * dropped before realloc(M_WAITOK)
				 * (presumably because the allocation may
				 * sleep), and the prison is then re-examined
				 * from the top.
				 */
				ip4s = cpr->pr_ip4s;
				mtx_unlock(&cpr->pr_mtx);
				ip4 = realloc(ip4, ip4s *
				    sizeof(struct in_addr), M_TEMP, M_WAITOK);
				goto again;
			}
			/* Snapshot the addresses while the mutex is held. */
			bcopy(cpr->pr_ip4, ip4,
			    cpr->pr_ip4s * sizeof(struct in_addr));
		}
#endif
#ifdef INET6
		if (cpr->pr_ip6s > 0) {
			if (ip6s < cpr->pr_ip6s) {
				/* Same grow-and-retry scheme as for IPv4. */
				ip6s = cpr->pr_ip6s;
				mtx_unlock(&cpr->pr_mtx);
				ip6 = realloc(ip6, ip6s *
				    sizeof(struct in6_addr), M_TEMP, M_WAITOK);
				goto again;
			}
			bcopy(cpr->pr_ip6, ip6,
			    cpr->pr_ip6s * sizeof(struct in6_addr));
		}
#endif
		/* Prisons with no references are not reported. */
		if (cpr->pr_ref == 0) {
			mtx_unlock(&cpr->pr_mtx);
			continue;
		}
		/* Fill in the fixed-size record, still under the mutex. */
		bzero(xp, sizeof(*xp));
		xp->pr_version = XPRISON_VERSION;
		xp->pr_id = cpr->pr_id;
		xp->pr_state = cpr->pr_uref > 0
		    ? PRISON_STATE_ALIVE : PRISON_STATE_DYING;
		/* Path and name are reported relative to the caller's prison. */
		strlcpy(xp->pr_path, prison_path(pr, cpr), sizeof(xp->pr_path));
		strlcpy(xp->pr_host, cpr->pr_hostname, sizeof(xp->pr_host));
		strlcpy(xp->pr_name, prison_name(pr, cpr), sizeof(xp->pr_name));
#ifdef INET
		xp->pr_ip4s = cpr->pr_ip4s;
#endif
#ifdef INET6
		xp->pr_ip6s = cpr->pr_ip6s;
#endif
		mtx_unlock(&cpr->pr_mtx);
		/* Output happens from the snapshots, after the mutex is dropped. */
		error = SYSCTL_OUT(req, xp, sizeof(*xp));
		if (error)
			break;
#ifdef INET
		if (xp->pr_ip4s > 0) {
			error = SYSCTL_OUT(req, ip4,
			    xp->pr_ip4s * sizeof(struct in_addr));
			if (error)
				break;
		}
#endif
#ifdef INET6
		if (xp->pr_ip6s > 0) {
			error = SYSCTL_OUT(req, ip6,
			    xp->pr_ip6s * sizeof(struct in6_addr));
			if (error)
				break;
		}
#endif
	}
	sx_sunlock(&allprison_lock);
	/* ip4/ip6 may still be NULL if no prison had addresses. */
	free(xp, M_TEMP);
#ifdef INET
	free(ip4, M_TEMP);
#endif
#ifdef INET6
	free(ip6, M_TEMP);
#endif
	return (error);
}
4200
/* "list" under _security_jail: read-only struct OID served by sysctl_jail_list(). */
SYSCTL_OID(_security_jail, OID_AUTO, list,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_jail_list, "S", "List of active jails");
4204
4205static int
4206sysctl_jail_jailed(SYSCTL_HANDLER_ARGS)
4207{
4208	int error, injail;
4209
4210	injail = jailed(req->td->td_ucred);
4211	error = SYSCTL_OUT(req, &injail, sizeof(injail));
4212
4213	return (error);
4214}
4215
/* "jailed" under _security_jail: read-only int served by sysctl_jail_jailed(). */
SYSCTL_PROC(_security_jail, OID_AUTO, jailed,
    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_jail_jailed, "I", "Process in jail?");
4219
4220static int
4221sysctl_jail_vnet(SYSCTL_HANDLER_ARGS)
4222{
4223	int error, havevnet;
4224#ifdef VIMAGE
4225	struct ucred *cred = req->td->td_ucred;
4226
4227	havevnet = jailed(cred) && prison_owns_vnet(cred);
4228#else
4229	havevnet = 0;
4230#endif
4231	error = SYSCTL_OUT(req, &havevnet, sizeof(havevnet));
4232
4233	return (error);
4234}
4235
/* "vnet" under _security_jail: read-only int served by sysctl_jail_vnet(). */
SYSCTL_PROC(_security_jail, OID_AUTO, vnet,
    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_jail_vnet, "I", "Jail owns VNET?");
4239
#if defined(INET) || defined(INET6)
/* Read-write knob backed directly by the jail_max_af_ips variable. */
SYSCTL_UINT(_security_jail, OID_AUTO, jail_max_af_ips, CTLFLAG_RW,
    &jail_max_af_ips, 0,
    "Number of IP addresses a jail may have at most per address family");
#endif
4245
4246/*
4247 * Default parameters for jail(2) compatability.  For historical reasons,
4248 * the sysctl names have varying similarity to the parameter names.  Prisons
4249 * just see their own parameters, and can't change them.
4250 */
4251static int
4252sysctl_jail_default_allow(SYSCTL_HANDLER_ARGS)
4253{
4254	struct prison *pr;
4255	int allow, error, i;
4256
4257	pr = req->td->td_ucred->cr_prison;
4258	allow = (pr == &prison0) ? jail_default_allow : pr->pr_allow;
4259
4260	/* Get the current flag value, and convert it to a boolean. */
4261	i = (allow & arg2) ? 1 : 0;
4262	if (arg1 != NULL)
4263		i = !i;
4264	error = sysctl_handle_int(oidp, &i, 0, req);
4265	if (error || !req->newptr)
4266		return (error);
4267	i = i ? arg2 : 0;
4268	if (arg1 != NULL)
4269		i ^= arg2;
4270	/*
4271	 * The sysctls don't have CTLFLAGS_PRISON, so assume prison0
4272	 * for writing.
4273	 */
4274	mtx_lock(&prison0.pr_mtx);
4275	jail_default_allow = (jail_default_allow & ~arg2) | i;
4276	mtx_unlock(&prison0.pr_mtx);
4277	return (0);
4278}
4279
/*
 * Legacy per-flag sysctls under _security_jail.  Each one passes a single
 * PR_ALLOW_* bit as arg2 to sysctl_jail_default_allow().  All pass NULL as
 * arg1 except socket_unixiproute_only, whose non-NULL arg1 makes the
 * handler report and accept the inverse of the PR_ALLOW_SOCKET_AF bit.
 */
SYSCTL_PROC(_security_jail, OID_AUTO, set_hostname_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_SET_HOSTNAME, sysctl_jail_default_allow, "I",
    "Processes in jail can set their hostnames");
SYSCTL_PROC(_security_jail, OID_AUTO, socket_unixiproute_only,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    (void *)1, PR_ALLOW_SOCKET_AF, sysctl_jail_default_allow, "I",
    "Processes in jail are limited to creating UNIX/IP/route sockets only");
SYSCTL_PROC(_security_jail, OID_AUTO, sysvipc_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_SYSVIPC, sysctl_jail_default_allow, "I",
    "Processes in jail can use System V IPC primitives");
SYSCTL_PROC(_security_jail, OID_AUTO, allow_raw_sockets,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_RAW_SOCKETS, sysctl_jail_default_allow, "I",
    "Prison root can create raw sockets");
SYSCTL_PROC(_security_jail, OID_AUTO, chflags_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_CHFLAGS, sysctl_jail_default_allow, "I",
    "Processes in jail can alter system file flags");
SYSCTL_PROC(_security_jail, OID_AUTO, mount_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_MOUNT, sysctl_jail_default_allow, "I",
    "Processes in jail can mount/unmount jail-friendly file systems");
/* One sysctl per file system type follows. */
SYSCTL_PROC(_security_jail, OID_AUTO, mount_devfs_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_MOUNT_DEVFS, sysctl_jail_default_allow, "I",
    "Processes in jail can mount the devfs file system");
SYSCTL_PROC(_security_jail, OID_AUTO, mount_fdescfs_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_MOUNT_FDESCFS, sysctl_jail_default_allow, "I",
    "Processes in jail can mount the fdescfs file system");
SYSCTL_PROC(_security_jail, OID_AUTO, mount_nullfs_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_MOUNT_NULLFS, sysctl_jail_default_allow, "I",
    "Processes in jail can mount the nullfs file system");
SYSCTL_PROC(_security_jail, OID_AUTO, mount_procfs_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_MOUNT_PROCFS, sysctl_jail_default_allow, "I",
    "Processes in jail can mount the procfs file system");
SYSCTL_PROC(_security_jail, OID_AUTO, mount_linprocfs_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_MOUNT_LINPROCFS, sysctl_jail_default_allow, "I",
    "Processes in jail can mount the linprocfs file system");
SYSCTL_PROC(_security_jail, OID_AUTO, mount_linsysfs_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_MOUNT_LINSYSFS, sysctl_jail_default_allow, "I",
    "Processes in jail can mount the linsysfs file system");
SYSCTL_PROC(_security_jail, OID_AUTO, mount_tmpfs_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_MOUNT_TMPFS, sysctl_jail_default_allow, "I",
    "Processes in jail can mount the tmpfs file system");
SYSCTL_PROC(_security_jail, OID_AUTO, mount_zfs_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_MOUNT_ZFS, sysctl_jail_default_allow, "I",
    "Processes in jail can mount the zfs file system");
4336
4337static int
4338sysctl_jail_default_level(SYSCTL_HANDLER_ARGS)
4339{
4340	struct prison *pr;
4341	int level, error;
4342
4343	pr = req->td->td_ucred->cr_prison;
4344	level = (pr == &prison0) ? *(int *)arg1 : *(int *)((char *)pr + arg2);
4345	error = sysctl_handle_int(oidp, &level, 0, req);
4346	if (error || !req->newptr)
4347		return (error);
4348	*(int *)arg1 = level;
4349	return (0);
4350}
4351
/*
 * Integer defaults served by sysctl_jail_default_level(): arg1 is the
 * global default and arg2 the offset of the per-prison field.
 */
SYSCTL_PROC(_security_jail, OID_AUTO, enforce_statfs,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    &jail_default_enforce_statfs, offsetof(struct prison, pr_enforce_statfs),
    sysctl_jail_default_level, "I",
    "Processes in jail cannot see all mounted file systems");

/* Unlike enforce_statfs above, this one is declared read-only. */
SYSCTL_PROC(_security_jail, OID_AUTO, devfs_ruleset,
    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
    &jail_default_devfs_rsnum, offsetof(struct prison, pr_devfs_rsnum),
    sysctl_jail_default_level, "I",
    "Ruleset for the devfs filesystem in jail");
4363
/*
 * Nodes to describe jail parameters.  Maximum length of string parameters
 * is returned in the string itself, and the other parameters exist merely
 * to make themselves and their types known.
 *
 * This is the "param" node under _security_jail.
 */
SYSCTL_NODE(_security_jail, OID_AUTO, param, CTLFLAG_RW, 0,
    "Jail parameters");
4371
/*
 * Sysctl handler shared by the jail parameter description nodes.  The
 * value written out depends only on the OID's type:
 *   - long/ulong: a zero long (or a zero int, see below);
 *   - int/uint:   a zero int;
 *   - string:     arg2 formatted as a decimal string;
 *   - struct:     arg2 as a size_t.
 * Any other type outputs nothing and returns 0.
 */
int
sysctl_jail_param(SYSCTL_HANDLER_ARGS)
{
	int i;
	long l;
	size_t s;
	char numbuf[12];

	switch (oidp->oid_kind & CTLTYPE)
	{
	case CTLTYPE_LONG:
	case CTLTYPE_ULONG:
		l = 0;
#ifdef SCTL_MASK32
		if (!(req->flags & SCTL_MASK32))
#endif
			return (SYSCTL_OUT(req, &l, sizeof(l)));
		/*
		 * FALLTHROUGH: reached only when SCTL_MASK32 is defined and
		 * set in req->flags; the value is then emitted as an int.
		 */
	case CTLTYPE_INT:
	case CTLTYPE_UINT:
		i = 0;
		return (SYSCTL_OUT(req, &i, sizeof(i)));
	case CTLTYPE_STRING:
		/* arg2 is rendered as text and returned as the string value. */
		snprintf(numbuf, sizeof(numbuf), "%jd", (intmax_t)arg2);
		return
		    (sysctl_handle_string(oidp, numbuf, sizeof(numbuf), req));
	case CTLTYPE_STRUCT:
		s = (size_t)arg2;
		return (SYSCTL_OUT(req, &s, sizeof(s)));
	}
	return (0);
}
4403
4404/*
4405 * CTLFLAG_RDTUN in the following indicates jail parameters that can be set at
4406 * jail creation time but cannot be changed in an existing jail.
4407 */
4408SYSCTL_JAIL_PARAM(, jid, CTLTYPE_INT | CTLFLAG_RDTUN, "I", "Jail ID");
4409SYSCTL_JAIL_PARAM(, parent, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail parent ID");
4410SYSCTL_JAIL_PARAM_STRING(, name, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail name");
4411SYSCTL_JAIL_PARAM_STRING(, path, CTLFLAG_RDTUN, MAXPATHLEN, "Jail root path");
4412SYSCTL_JAIL_PARAM(, securelevel, CTLTYPE_INT | CTLFLAG_RW,
4413    "I", "Jail secure level");
4414SYSCTL_JAIL_PARAM(, osreldate, CTLTYPE_INT | CTLFLAG_RDTUN, "I",
4415    "Jail value for kern.osreldate and uname -K");
4416SYSCTL_JAIL_PARAM_STRING(, osrelease, CTLFLAG_RDTUN, OSRELEASELEN,
4417    "Jail value for kern.osrelease and uname -r");
4418SYSCTL_JAIL_PARAM(, enforce_statfs, CTLTYPE_INT | CTLFLAG_RW,
4419    "I", "Jail cannot see all mounted file systems");
4420SYSCTL_JAIL_PARAM(, devfs_ruleset, CTLTYPE_INT | CTLFLAG_RW,
4421    "I", "Ruleset for in-jail devfs mounts");
/*
 * Remaining top-level jail parameters, followed by the grouped parameter
 * nodes (children, host, cpuset, ip4, ip6, allow, allow.mount).  Each
 * invocation names the parameter, gives its type/access flags and a format
 * string, and ends with a human-readable description.
 */
4422SYSCTL_JAIL_PARAM(, persist, CTLTYPE_INT | CTLFLAG_RW,
4423    "B", "Jail persistence");
/* The vnet parameter is only declared when VIMAGE is defined. */
4424#ifdef VIMAGE
4425SYSCTL_JAIL_PARAM(, vnet, CTLTYPE_INT | CTLFLAG_RDTUN,
4426    "E,jailsys", "Virtual network stack");
4427#endif
4428SYSCTL_JAIL_PARAM(, dying, CTLTYPE_INT | CTLFLAG_RD,
4429    "B", "Jail is in the process of shutting down");
4430
/* Child jail accounting: the current count is read-only, the maximum is writable. */
4431SYSCTL_JAIL_PARAM_NODE(children, "Number of child jails");
4432SYSCTL_JAIL_PARAM(_children, cur, CTLTYPE_INT | CTLFLAG_RD,
4433    "I", "Current number of child jails");
4434SYSCTL_JAIL_PARAM(_children, max, CTLTYPE_INT | CTLFLAG_RW,
4435    "I", "Maximum number of child jails");
4436
/* Per-jail host identity: hostname, NIS domainname, host UUID and host ID. */
4437SYSCTL_JAIL_PARAM_SYS_NODE(host, CTLFLAG_RW, "Jail host info");
4438SYSCTL_JAIL_PARAM_STRING(_host, hostname, CTLFLAG_RW, MAXHOSTNAMELEN,
4439    "Jail hostname");
4440SYSCTL_JAIL_PARAM_STRING(_host, domainname, CTLFLAG_RW, MAXHOSTNAMELEN,
4441    "Jail NIS domainname");
4442SYSCTL_JAIL_PARAM_STRING(_host, hostuuid, CTLFLAG_RW, HOSTUUIDLEN,
4443    "Jail host UUID");
4444SYSCTL_JAIL_PARAM(_host, hostid, CTLTYPE_ULONG | CTLFLAG_RW,
4445    "LU", "Jail host ID");
4446
/* Read-only ID of the jail's cpuset. */
4447SYSCTL_JAIL_PARAM_NODE(cpuset, "Jail cpuset");
4448SYSCTL_JAIL_PARAM(_cpuset, id, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail cpuset ID");
4449
/* IPv4 parameters, declared only when INET is defined. */
4450#ifdef INET
4451SYSCTL_JAIL_PARAM_SYS_NODE(ip4, CTLFLAG_RDTUN,
4452    "Jail IPv4 address virtualization");
4453SYSCTL_JAIL_PARAM_STRUCT(_ip4, addr, CTLFLAG_RW, sizeof(struct in_addr),
4454    "S,in_addr,a", "Jail IPv4 addresses");
4455SYSCTL_JAIL_PARAM(_ip4, saddrsel, CTLTYPE_INT | CTLFLAG_RW,
4456    "B", "Do (not) use IPv4 source address selection rather than the "
4457    "primary jail IPv4 address.");
4458#endif
/* IPv6 parameters, declared only when INET6 is defined. */
4459#ifdef INET6
4460SYSCTL_JAIL_PARAM_SYS_NODE(ip6, CTLFLAG_RDTUN,
4461    "Jail IPv6 address virtualization");
4462SYSCTL_JAIL_PARAM_STRUCT(_ip6, addr, CTLFLAG_RW, sizeof(struct in6_addr),
4463    "S,in6_addr,a", "Jail IPv6 addresses");
4464SYSCTL_JAIL_PARAM(_ip6, saddrsel, CTLTYPE_INT | CTLFLAG_RW,
4465    "B", "Do (not) use IPv6 source address selection rather than the "
4466    "primary jail IPv6 address.");
4467#endif
4468
/* Permission flags; each description states what the flag lets the jail do. */
4469SYSCTL_JAIL_PARAM_NODE(allow, "Jail permission flags");
4470SYSCTL_JAIL_PARAM(_allow, set_hostname, CTLTYPE_INT | CTLFLAG_RW,
4471    "B", "Jail may set hostname");
4472SYSCTL_JAIL_PARAM(_allow, sysvipc, CTLTYPE_INT | CTLFLAG_RW,
4473    "B", "Jail may use SYSV IPC");
4474SYSCTL_JAIL_PARAM(_allow, raw_sockets, CTLTYPE_INT | CTLFLAG_RW,
4475    "B", "Jail may create raw sockets");
4476SYSCTL_JAIL_PARAM(_allow, chflags, CTLTYPE_INT | CTLFLAG_RW,
4477    "B", "Jail may alter system file flags");
4478SYSCTL_JAIL_PARAM(_allow, quotas, CTLTYPE_INT | CTLFLAG_RW,
4479    "B", "Jail may set file quotas");
4480SYSCTL_JAIL_PARAM(_allow, socket_af, CTLTYPE_INT | CTLFLAG_RW,
4481    "B", "Jail may create sockets other than just UNIX/IPv4/IPv6/route");
4482
/*
 * Mount permissions under allow.mount: the entry with the empty name is the
 * general mount/unmount flag, the named entries are per file system type.
 */
4483SYSCTL_JAIL_PARAM_SUBNODE(allow, mount, "Jail mount/unmount permission flags");
4484SYSCTL_JAIL_PARAM(_allow_mount, , CTLTYPE_INT | CTLFLAG_RW,
4485    "B", "Jail may mount/unmount jail-friendly file systems in general");
4486SYSCTL_JAIL_PARAM(_allow_mount, devfs, CTLTYPE_INT | CTLFLAG_RW,
4487    "B", "Jail may mount the devfs file system");
4488SYSCTL_JAIL_PARAM(_allow_mount, fdescfs, CTLTYPE_INT | CTLFLAG_RW,
4489    "B", "Jail may mount the fdescfs file system");
4490SYSCTL_JAIL_PARAM(_allow_mount, nullfs, CTLTYPE_INT | CTLFLAG_RW,
4491    "B", "Jail may mount the nullfs file system");
4492SYSCTL_JAIL_PARAM(_allow_mount, procfs, CTLTYPE_INT | CTLFLAG_RW,
4493    "B", "Jail may mount the procfs file system");
4494SYSCTL_JAIL_PARAM(_allow_mount, linprocfs, CTLTYPE_INT | CTLFLAG_RW,
4495    "B", "Jail may mount the linprocfs file system");
4496SYSCTL_JAIL_PARAM(_allow_mount, linsysfs, CTLTYPE_INT | CTLFLAG_RW,
4497    "B", "Jail may mount the linsysfs file system");
4498SYSCTL_JAIL_PARAM(_allow_mount, tmpfs, CTLTYPE_INT | CTLFLAG_RW,
4499    "B", "Jail may mount the tmpfs file system");
4500SYSCTL_JAIL_PARAM(_allow_mount, zfs, CTLTYPE_INT | CTLFLAG_RW,
4501    "B", "Jail may mount the zfs file system");
4502
4503#ifdef RACCT
4504void
4505prison_racct_foreach(void (*callback)(struct racct *racct,
4506    void *arg2, void *arg3), void *arg2, void *arg3)
4507{
4508	struct prison_racct *prr;
4509
4510	ASSERT_RACCT_ENABLED();
4511
4512	sx_slock(&allprison_lock);
4513	LIST_FOREACH(prr, &allprison_racct, prr_next)
4514		(callback)(prr->prr_racct, arg2, arg3);
4515	sx_sunlock(&allprison_lock);
4516}
4517
4518static struct prison_racct *
4519prison_racct_find_locked(const char *name)
4520{
4521	struct prison_racct *prr;
4522
4523	ASSERT_RACCT_ENABLED();
4524	sx_assert(&allprison_lock, SA_XLOCKED);
4525
4526	if (name[0] == '\0' || strlen(name) >= MAXHOSTNAMELEN)
4527		return (NULL);
4528
4529	LIST_FOREACH(prr, &allprison_racct, prr_next) {
4530		if (strcmp(name, prr->prr_name) != 0)
4531			continue;
4532
4533		/* Found prison_racct with a matching name? */
4534		prison_racct_hold(prr);
4535		return (prr);
4536	}
4537
4538	/* Add new prison_racct. */
4539	prr = malloc(sizeof(*prr), M_PRISON_RACCT, M_ZERO | M_WAITOK);
4540	racct_create(&prr->prr_racct);
4541
4542	strcpy(prr->prr_name, name);
4543	refcount_init(&prr->prr_refcount, 1);
4544	LIST_INSERT_HEAD(&allprison_racct, prr, prr_next);
4545
4546	return (prr);
4547}
4548
4549struct prison_racct *
4550prison_racct_find(const char *name)
4551{
4552	struct prison_racct *prr;
4553
4554	ASSERT_RACCT_ENABLED();
4555
4556	sx_xlock(&allprison_lock);
4557	prr = prison_racct_find_locked(name);
4558	sx_xunlock(&allprison_lock);
4559	return (prr);
4560}
4561
4562void
4563prison_racct_hold(struct prison_racct *prr)
4564{
4565
4566	ASSERT_RACCT_ENABLED();
4567
4568	refcount_acquire(&prr->prr_refcount);
4569}
4570
4571static void
4572prison_racct_free_locked(struct prison_racct *prr)
4573{
4574
4575	ASSERT_RACCT_ENABLED();
4576	sx_assert(&allprison_lock, SA_XLOCKED);
4577
4578	if (refcount_release(&prr->prr_refcount)) {
4579		racct_destroy(&prr->prr_racct);
4580		LIST_REMOVE(prr, prr_next);
4581		free(prr, M_PRISON_RACCT);
4582	}
4583}
4584
4585void
4586prison_racct_free(struct prison_racct *prr)
4587{
4588	int old;
4589
4590	ASSERT_RACCT_ENABLED();
4591	sx_assert(&allprison_lock, SA_UNLOCKED);
4592
4593	old = prr->prr_refcount;
4594	if (old > 1 && atomic_cmpset_int(&prr->prr_refcount, old, old - 1))
4595		return;
4596
4597	sx_xlock(&allprison_lock);
4598	prison_racct_free_locked(prr);
4599	sx_xunlock(&allprison_lock);
4600}
4601
4602static void
4603prison_racct_attach(struct prison *pr)
4604{
4605	struct prison_racct *prr;
4606
4607	ASSERT_RACCT_ENABLED();
4608	sx_assert(&allprison_lock, SA_XLOCKED);
4609
4610	prr = prison_racct_find_locked(pr->pr_name);
4611	KASSERT(prr != NULL, ("cannot find prison_racct"));
4612
4613	pr->pr_prison_racct = prr;
4614}
4615
4616/*
4617 * Handle jail renaming.  From the racct point of view, renaming means
4618 * moving from one prison_racct to another.
4619 */
4620static void
4621prison_racct_modify(struct prison *pr)
4622{
4623	struct proc *p;
4624	struct ucred *cred;
4625	struct prison_racct *oldprr;
4626
4627	ASSERT_RACCT_ENABLED();
4628
4629	sx_slock(&allproc_lock);
4630	sx_xlock(&allprison_lock);
4631
4632	if (strcmp(pr->pr_name, pr->pr_prison_racct->prr_name) == 0) {
4633		sx_xunlock(&allprison_lock);
4634		sx_sunlock(&allproc_lock);
4635		return;
4636	}
4637
4638	oldprr = pr->pr_prison_racct;
4639	pr->pr_prison_racct = NULL;
4640
4641	prison_racct_attach(pr);
4642
4643	/*
4644	 * Move resource utilisation records.
4645	 */
4646	racct_move(pr->pr_prison_racct->prr_racct, oldprr->prr_racct);
4647
4648	/*
4649	 * Force rctl to reattach rules to processes.
4650	 */
4651	FOREACH_PROC_IN_SYSTEM(p) {
4652		PROC_LOCK(p);
4653		cred = crhold(p->p_ucred);
4654		PROC_UNLOCK(p);
4655		racct_proc_ucred_changed(p, cred, cred);
4656		crfree(cred);
4657	}
4658
4659	sx_sunlock(&allproc_lock);
4660	prison_racct_free_locked(oldprr);
4661	sx_xunlock(&allprison_lock);
4662}
4663
4664static void
4665prison_racct_detach(struct prison *pr)
4666{
4667
4668	ASSERT_RACCT_ENABLED();
4669	sx_assert(&allprison_lock, SA_UNLOCKED);
4670
4671	if (pr->pr_prison_racct == NULL)
4672		return;
4673	prison_racct_free(pr->pr_prison_racct);
4674	pr->pr_prison_racct = NULL;
4675}
4676#endif /* RACCT */
4677
4678#ifdef DDB
4679
4680static void
4681db_show_prison(struct prison *pr)
4682{
4683	int fi;
4684#if defined(INET) || defined(INET6)
4685	int ii;
4686#endif
4687	unsigned jsf;
4688#ifdef INET6
4689	char ip6buf[INET6_ADDRSTRLEN];
4690#endif
4691
4692	db_printf("prison %p:\n", pr);
4693	db_printf(" jid             = %d\n", pr->pr_id);
4694	db_printf(" name            = %s\n", pr->pr_name);
4695	db_printf(" parent          = %p\n", pr->pr_parent);
4696	db_printf(" ref             = %d\n", pr->pr_ref);
4697	db_printf(" uref            = %d\n", pr->pr_uref);
4698	db_printf(" path            = %s\n", pr->pr_path);
4699	db_printf(" cpuset          = %d\n", pr->pr_cpuset
4700	    ? pr->pr_cpuset->cs_id : -1);
4701#ifdef VIMAGE
4702	db_printf(" vnet            = %p\n", pr->pr_vnet);
4703#endif
4704	db_printf(" root            = %p\n", pr->pr_root);
4705	db_printf(" securelevel     = %d\n", pr->pr_securelevel);
4706	db_printf(" devfs_rsnum     = %d\n", pr->pr_devfs_rsnum);
4707	db_printf(" children.max    = %d\n", pr->pr_childmax);
4708	db_printf(" children.cur    = %d\n", pr->pr_childcount);
4709	db_printf(" child           = %p\n", LIST_FIRST(&pr->pr_children));
4710	db_printf(" sibling         = %p\n", LIST_NEXT(pr, pr_sibling));
4711	db_printf(" flags           = 0x%x", pr->pr_flags);
4712	for (fi = 0; fi < sizeof(pr_flag_names) / sizeof(pr_flag_names[0]);
4713	    fi++)
4714		if (pr_flag_names[fi] != NULL && (pr->pr_flags & (1 << fi)))
4715			db_printf(" %s", pr_flag_names[fi]);
4716	for (fi = 0; fi < sizeof(pr_flag_jailsys) / sizeof(pr_flag_jailsys[0]);
4717	    fi++) {
4718		jsf = pr->pr_flags &
4719		    (pr_flag_jailsys[fi].disable | pr_flag_jailsys[fi].new);
4720		db_printf(" %-16s= %s\n", pr_flag_jailsys[fi].name,
4721		    pr_flag_jailsys[fi].disable &&
4722		      (jsf == pr_flag_jailsys[fi].disable) ? "disable"
4723		    : (jsf == pr_flag_jailsys[fi].new) ? "new"
4724		    : "inherit");
4725	}
4726	db_printf(" allow           = 0x%x", pr->pr_allow);
4727	for (fi = 0; fi < sizeof(pr_allow_names) / sizeof(pr_allow_names[0]);
4728	    fi++)
4729		if (pr_allow_names[fi] != NULL && (pr->pr_allow & (1 << fi)))
4730			db_printf(" %s", pr_allow_names[fi]);
4731	db_printf("\n");
4732	db_printf(" enforce_statfs  = %d\n", pr->pr_enforce_statfs);
4733	db_printf(" host.hostname   = %s\n", pr->pr_hostname);
4734	db_printf(" host.domainname = %s\n", pr->pr_domainname);
4735	db_printf(" host.hostuuid   = %s\n", pr->pr_hostuuid);
4736	db_printf(" host.hostid     = %lu\n", pr->pr_hostid);
4737#ifdef INET
4738	db_printf(" ip4s            = %d\n", pr->pr_ip4s);
4739	for (ii = 0; ii < pr->pr_ip4s; ii++)
4740		db_printf(" %s %s\n",
4741		    ii == 0 ? "ip4.addr        =" : "                 ",
4742		    inet_ntoa(pr->pr_ip4[ii]));
4743#endif
4744#ifdef INET6
4745	db_printf(" ip6s            = %d\n", pr->pr_ip6s);
4746	for (ii = 0; ii < pr->pr_ip6s; ii++)
4747		db_printf(" %s %s\n",
4748		    ii == 0 ? "ip6.addr        =" : "                 ",
4749		    ip6_sprintf(ip6buf, &pr->pr_ip6[ii]));
4750#endif
4751}
4752
4753DB_SHOW_COMMAND(prison, db_show_prison_command)
4754{
4755	struct prison *pr;
4756
4757	if (!have_addr) {
4758		/*
4759		 * Show all prisons in the list, and prison0 which is not
4760		 * listed.
4761		 */
4762		db_show_prison(&prison0);
4763		if (!db_pager_quit) {
4764			TAILQ_FOREACH(pr, &allprison, pr_list) {
4765				db_show_prison(pr);
4766				if (db_pager_quit)
4767					break;
4768			}
4769		}
4770		return;
4771	}
4772
4773	if (addr == 0)
4774		pr = &prison0;
4775	else {
4776		/* Look for a prison with the ID and with references. */
4777		TAILQ_FOREACH(pr, &allprison, pr_list)
4778			if (pr->pr_id == addr && pr->pr_ref > 0)
4779				break;
4780		if (pr == NULL)
4781			/* Look again, without requiring a reference. */
4782			TAILQ_FOREACH(pr, &allprison, pr_list)
4783				if (pr->pr_id == addr)
4784					break;
4785		if (pr == NULL)
4786			/* Assume address points to a valid prison. */
4787			pr = (struct prison *)addr;
4788	}
4789	db_show_prison(pr);
4790}
4791
4792#endif /* DDB */
4793