kern_jail.c revision 196835
1/*-
2 * Copyright (c) 1999 Poul-Henning Kamp.
3 * Copyright (c) 2008 Bjoern A. Zeeb.
4 * Copyright (c) 2009 James Gritton.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29#include <sys/cdefs.h>
30__FBSDID("$FreeBSD: head/sys/kern/kern_jail.c 196835 2009-09-04 19:00:48Z jamie $");
31
32#include "opt_compat.h"
33#include "opt_ddb.h"
34#include "opt_inet.h"
35#include "opt_inet6.h"
36
37#include <sys/param.h>
38#include <sys/types.h>
39#include <sys/kernel.h>
40#include <sys/systm.h>
41#include <sys/errno.h>
42#include <sys/sysproto.h>
43#include <sys/malloc.h>
44#include <sys/osd.h>
45#include <sys/priv.h>
46#include <sys/proc.h>
47#include <sys/taskqueue.h>
48#include <sys/fcntl.h>
49#include <sys/jail.h>
50#include <sys/lock.h>
51#include <sys/mutex.h>
52#include <sys/sx.h>
53#include <sys/sysent.h>
54#include <sys/namei.h>
55#include <sys/mount.h>
56#include <sys/queue.h>
57#include <sys/socket.h>
58#include <sys/syscallsubr.h>
59#include <sys/sysctl.h>
60#include <sys/vnode.h>
61
62#include <net/if.h>
63#include <net/vnet.h>
64
65#include <netinet/in.h>
66
67#ifdef DDB
68#include <ddb/ddb.h>
69#ifdef INET6
70#include <netinet6/in6_var.h>
71#endif /* INET6 */
72#endif /* DDB */
73
74#include <security/mac/mac_framework.h>
75
76#define	DEFAULT_HOSTUUID	"00000000-0000-0000-0000-000000000000"
77
78MALLOC_DEFINE(M_PRISON, "prison", "Prison structures");
79
80/* prison0 describes what is "real" about the system. */
81struct prison prison0 = {
82	.pr_id		= 0,
83	.pr_name	= "0",
84	.pr_ref		= 1,
85	.pr_uref	= 1,
86	.pr_path	= "/",
87	.pr_securelevel	= -1,
88	.pr_childmax	= JAIL_MAX,
89	.pr_hostuuid	= DEFAULT_HOSTUUID,
90	.pr_children	= LIST_HEAD_INITIALIZER(&prison0.pr_children),
91#ifdef VIMAGE
92	.pr_flags	= PR_HOST|PR_VNET,
93#else
94	.pr_flags	= PR_HOST,
95#endif
96	.pr_allow	= PR_ALLOW_ALL,
97};
98MTX_SYSINIT(prison0, &prison0.pr_mtx, "jail mutex", MTX_DEF);
99
100/* allprison and lastprid are protected by allprison_lock. */
101struct	sx allprison_lock;
102SX_SYSINIT(allprison_lock, &allprison_lock, "allprison");
103struct	prisonlist allprison = TAILQ_HEAD_INITIALIZER(allprison);
104int	lastprid = 0;
105
/* Forward declarations of helpers that are private to this file. */
static int	do_jail_attach(struct thread *td, struct prison *pr);
static void	prison_complete(void *context, int pending);
static void	prison_deref(struct prison *pr, int flags);
static char	*prison_path(struct prison *pr1, struct prison *pr2);
static void	prison_remove_one(struct prison *pr);
#ifdef INET
static int	_prison_check_ip4(struct prison *pr, struct in_addr *ia);
static int	prison_restrict_ip4(struct prison *pr,
		    struct in_addr *newip4);
#endif
#ifdef INET6
static int	_prison_check_ip6(struct prison *pr, struct in6_addr *ia6);
static int	prison_restrict_ip6(struct prison *pr,
		    struct in6_addr *newip6);
#endif
119
/*
 * Flags for prison_deref; each occupies its own bit so they can be OR'ed.
 * NOTE(review): judging by the names and by the caller that passes
 * PD_DEREF | PD_DEUREF | PD_LIST_XLOCKED after bumping pr_ref and pr_uref
 * with allprison_lock exclusively held, these presumably select which
 * reference to drop and describe the lock state -- confirm against
 * prison_deref().
 */
#define	PD_DEREF	(1 << 0)
#define	PD_DEUREF	(1 << 1)
#define	PD_LOCKED	(1 << 2)
#define	PD_LIST_SLOCKED	(1 << 3)
#define	PD_LIST_XLOCKED	(1 << 4)
126
/*
 * Parameter names corresponding to PR_* flag values.  The entry at index i
 * names the flag bit (1 << i); pr_flag_nonames holds the negated spelling
 * of the same bit.
 */
static char *pr_flag_names[] = {
	"persist",	/* index 0 */
};

static char *pr_flag_nonames[] = {
	"nopersist",	/* index 0 */
};
137
138struct jailsys_flags {
139	const char	*name;
140	unsigned	 disable;
141	unsigned	 new;
142} pr_flag_jailsys[] = {
143	{ "host", 0, PR_HOST },
144#ifdef VIMAGE
145	{ "vnet", 0, PR_VNET },
146#endif
147#ifdef INET
148	{ "ip4", PR_IP4_USER | PR_IP4_DISABLE, PR_IP4_USER },
149#endif
150#ifdef INET6
151	{ "ip6", PR_IP6_USER | PR_IP6_DISABLE, PR_IP6_USER },
152#endif
153};
154
/*
 * Parameter names for the jail permission ("allow") bits.  The entry at
 * index i corresponds to bit (1 << i); pr_allow_nonames holds the negated
 * spelling of the same bit, so both tables must stay in the same order.
 */
static char *pr_allow_names[] = {
	[0] = "allow.set_hostname",
	[1] = "allow.sysvipc",
	[2] = "allow.raw_sockets",
	[3] = "allow.chflags",
	[4] = "allow.mount",
	[5] = "allow.quotas",
	[6] = "allow.socket_af",
};

static char *pr_allow_nonames[] = {
	[0] = "allow.noset_hostname",
	[1] = "allow.nosysvipc",
	[2] = "allow.noraw_sockets",
	[3] = "allow.nochflags",
	[4] = "allow.nomount",
	[5] = "allow.noquotas",
	[6] = "allow.nosocket_af",
};
174
175#define	JAIL_DEFAULT_ALLOW		PR_ALLOW_SET_HOSTNAME
176#define	JAIL_DEFAULT_ENFORCE_STATFS	2
177static unsigned jail_default_allow = JAIL_DEFAULT_ALLOW;
178static int jail_default_enforce_statfs = JAIL_DEFAULT_ENFORCE_STATFS;
179#if defined(INET) || defined(INET6)
180static unsigned jail_max_af_ips = 255;
181#endif
182
183#ifdef INET
/*
 * qsort() comparison callback ordering two struct in_addr values.
 *
 * The addresses are stored in network byte order but are compared in host
 * byte order, which is the ordering the rest of the code expects from a
 * sorted list.
 *
 * Returns -1, 0 or 1 when *ip1 is less than, equal to or greater than *ip2.
 */
static int
qcmp_v4(const void *ip1, const void *ip2)
{
	const struct in_addr *lhs, *rhs;
	in_addr_t a, b;

	lhs = ip1;
	rhs = ip2;
	a = ntohl(lhs->s_addr);
	b = ntohl(rhs->s_addr);

	/*
	 * Compare explicitly: the difference of two 32-bit unsigned values
	 * does not fit in the int return value.
	 */
	if (a == b)
		return (0);
	return ((a < b) ? -1 : 1);
}
208#endif
209
210#ifdef INET6
/*
 * qsort() comparison callback ordering two struct in6_addr values
 * byte by byte, starting with s6_addr[0].
 *
 * Returns -1, 0 or 1 when *ip1 is less than, equal to or greater than *ip2.
 */
static int
qcmp_v6(const void *ip1, const void *ip2)
{
	const uint8_t *a, *b;
	size_t idx;

	a = ((const struct in6_addr *)ip1)->s6_addr;
	b = ((const struct in6_addr *)ip2)->s6_addr;

	/* The first differing byte decides the order. */
	for (idx = 0; idx < sizeof(struct in6_addr); idx++) {
		if (a[idx] != b[idx])
			return ((a[idx] > b[idx]) ? 1 : -1);
	}
	return (0);
}
229#endif
230
231/*
232 * struct jail_args {
233 *	struct jail *jail;
234 * };
235 */
236int
237jail(struct thread *td, struct jail_args *uap)
238{
239	uint32_t version;
240	int error;
241	struct jail j;
242
243	error = copyin(uap->jail, &version, sizeof(uint32_t));
244	if (error)
245		return (error);
246
247	switch (version) {
248	case 0:
249	{
250		struct jail_v0 j0;
251
252		/* FreeBSD single IPv4 jails. */
253		bzero(&j, sizeof(struct jail));
254		error = copyin(uap->jail, &j0, sizeof(struct jail_v0));
255		if (error)
256			return (error);
257		j.version = j0.version;
258		j.path = j0.path;
259		j.hostname = j0.hostname;
260		j.ip4s = j0.ip_number;
261		break;
262	}
263
264	case 1:
265		/*
266		 * Version 1 was used by multi-IPv4 jail implementations
267		 * that never made it into the official kernel.
268		 */
269		return (EINVAL);
270
271	case 2:	/* JAIL_API_VERSION */
272		/* FreeBSD multi-IPv4/IPv6,noIP jails. */
273		error = copyin(uap->jail, &j, sizeof(struct jail));
274		if (error)
275			return (error);
276		break;
277
278	default:
279		/* Sci-Fi jails are not supported, sorry. */
280		return (EINVAL);
281	}
282	return (kern_jail(td, &j));
283}
284
285int
286kern_jail(struct thread *td, struct jail *j)
287{
288	struct iovec optiov[2 * (4
289			    + sizeof(pr_allow_names) / sizeof(pr_allow_names[0])
290#ifdef INET
291			    + 1
292#endif
293#ifdef INET6
294			    + 1
295#endif
296			    )];
297	struct uio opt;
298	char *u_path, *u_hostname, *u_name;
299#ifdef INET
300	uint32_t ip4s;
301	struct in_addr *u_ip4;
302#endif
303#ifdef INET6
304	struct in6_addr *u_ip6;
305#endif
306	size_t tmplen;
307	int error, enforce_statfs, fi;
308
309	bzero(&optiov, sizeof(optiov));
310	opt.uio_iov = optiov;
311	opt.uio_iovcnt = 0;
312	opt.uio_offset = -1;
313	opt.uio_resid = -1;
314	opt.uio_segflg = UIO_SYSSPACE;
315	opt.uio_rw = UIO_READ;
316	opt.uio_td = td;
317
318	/* Set permissions for top-level jails from sysctls. */
319	if (!jailed(td->td_ucred)) {
320		for (fi = 0; fi < sizeof(pr_allow_names) /
321		     sizeof(pr_allow_names[0]); fi++) {
322			optiov[opt.uio_iovcnt].iov_base =
323			    (jail_default_allow & (1 << fi))
324			    ? pr_allow_names[fi] : pr_allow_nonames[fi];
325			optiov[opt.uio_iovcnt].iov_len =
326			    strlen(optiov[opt.uio_iovcnt].iov_base) + 1;
327			opt.uio_iovcnt += 2;
328		}
329		optiov[opt.uio_iovcnt].iov_base = "enforce_statfs";
330		optiov[opt.uio_iovcnt].iov_len = sizeof("enforce_statfs");
331		opt.uio_iovcnt++;
332		enforce_statfs = jail_default_enforce_statfs;
333		optiov[opt.uio_iovcnt].iov_base = &enforce_statfs;
334		optiov[opt.uio_iovcnt].iov_len = sizeof(enforce_statfs);
335		opt.uio_iovcnt++;
336	}
337
338	tmplen = MAXPATHLEN + MAXHOSTNAMELEN + MAXHOSTNAMELEN;
339#ifdef INET
340	ip4s = (j->version == 0) ? 1 : j->ip4s;
341	if (ip4s > jail_max_af_ips)
342		return (EINVAL);
343	tmplen += ip4s * sizeof(struct in_addr);
344#else
345	if (j->ip4s > 0)
346		return (EINVAL);
347#endif
348#ifdef INET6
349	if (j->ip6s > jail_max_af_ips)
350		return (EINVAL);
351	tmplen += j->ip6s * sizeof(struct in6_addr);
352#else
353	if (j->ip6s > 0)
354		return (EINVAL);
355#endif
356	u_path = malloc(tmplen, M_TEMP, M_WAITOK);
357	u_hostname = u_path + MAXPATHLEN;
358	u_name = u_hostname + MAXHOSTNAMELEN;
359#ifdef INET
360	u_ip4 = (struct in_addr *)(u_name + MAXHOSTNAMELEN);
361#endif
362#ifdef INET6
363#ifdef INET
364	u_ip6 = (struct in6_addr *)(u_ip4 + ip4s);
365#else
366	u_ip6 = (struct in6_addr *)(u_name + MAXHOSTNAMELEN);
367#endif
368#endif
369	optiov[opt.uio_iovcnt].iov_base = "path";
370	optiov[opt.uio_iovcnt].iov_len = sizeof("path");
371	opt.uio_iovcnt++;
372	optiov[opt.uio_iovcnt].iov_base = u_path;
373	error = copyinstr(j->path, u_path, MAXPATHLEN,
374	    &optiov[opt.uio_iovcnt].iov_len);
375	if (error) {
376		free(u_path, M_TEMP);
377		return (error);
378	}
379	opt.uio_iovcnt++;
380	optiov[opt.uio_iovcnt].iov_base = "host.hostname";
381	optiov[opt.uio_iovcnt].iov_len = sizeof("host.hostname");
382	opt.uio_iovcnt++;
383	optiov[opt.uio_iovcnt].iov_base = u_hostname;
384	error = copyinstr(j->hostname, u_hostname, MAXHOSTNAMELEN,
385	    &optiov[opt.uio_iovcnt].iov_len);
386	if (error) {
387		free(u_path, M_TEMP);
388		return (error);
389	}
390	opt.uio_iovcnt++;
391	if (j->jailname != NULL) {
392		optiov[opt.uio_iovcnt].iov_base = "name";
393		optiov[opt.uio_iovcnt].iov_len = sizeof("name");
394		opt.uio_iovcnt++;
395		optiov[opt.uio_iovcnt].iov_base = u_name;
396		error = copyinstr(j->jailname, u_name, MAXHOSTNAMELEN,
397		    &optiov[opt.uio_iovcnt].iov_len);
398		if (error) {
399			free(u_path, M_TEMP);
400			return (error);
401		}
402		opt.uio_iovcnt++;
403	}
404#ifdef INET
405	optiov[opt.uio_iovcnt].iov_base = "ip4.addr";
406	optiov[opt.uio_iovcnt].iov_len = sizeof("ip4.addr");
407	opt.uio_iovcnt++;
408	optiov[opt.uio_iovcnt].iov_base = u_ip4;
409	optiov[opt.uio_iovcnt].iov_len = ip4s * sizeof(struct in_addr);
410	if (j->version == 0)
411		u_ip4->s_addr = j->ip4s;
412	else {
413		error = copyin(j->ip4, u_ip4, optiov[opt.uio_iovcnt].iov_len);
414		if (error) {
415			free(u_path, M_TEMP);
416			return (error);
417		}
418	}
419	opt.uio_iovcnt++;
420#endif
421#ifdef INET6
422	optiov[opt.uio_iovcnt].iov_base = "ip6.addr";
423	optiov[opt.uio_iovcnt].iov_len = sizeof("ip6.addr");
424	opt.uio_iovcnt++;
425	optiov[opt.uio_iovcnt].iov_base = u_ip6;
426	optiov[opt.uio_iovcnt].iov_len = j->ip6s * sizeof(struct in6_addr);
427	error = copyin(j->ip6, u_ip6, optiov[opt.uio_iovcnt].iov_len);
428	if (error) {
429		free(u_path, M_TEMP);
430		return (error);
431	}
432	opt.uio_iovcnt++;
433#endif
434	KASSERT(opt.uio_iovcnt <= sizeof(optiov) / sizeof(optiov[0]),
435	    ("kern_jail: too many iovecs (%d)", opt.uio_iovcnt));
436	error = kern_jail_set(td, &opt, JAIL_CREATE | JAIL_ATTACH);
437	free(u_path, M_TEMP);
438	return (error);
439}
440
441
442/*
443 * struct jail_set_args {
444 *	struct iovec *iovp;
445 *	unsigned int iovcnt;
446 *	int flags;
447 * };
448 */
449int
450jail_set(struct thread *td, struct jail_set_args *uap)
451{
452	struct uio *auio;
453	int error;
454
455	/* Check that we have an even number of iovecs. */
456	if (uap->iovcnt & 1)
457		return (EINVAL);
458
459	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
460	if (error)
461		return (error);
462	error = kern_jail_set(td, auio, uap->flags);
463	free(auio, M_IOV);
464	return (error);
465}
466
467int
468kern_jail_set(struct thread *td, struct uio *optuio, int flags)
469{
470	struct nameidata nd;
471#ifdef INET
472	struct in_addr *ip4;
473#endif
474#ifdef INET6
475	struct in6_addr *ip6;
476#endif
477	struct vfsopt *opt;
478	struct vfsoptlist *opts;
479	struct prison *pr, *deadpr, *mypr, *ppr, *tpr;
480	struct vnode *root;
481	char *domain, *errmsg, *host, *name, *namelc, *p, *path, *uuid;
482#if defined(INET) || defined(INET6)
483	struct prison *tppr;
484	void *op;
485#endif
486	unsigned long hid;
487	size_t namelen, onamelen;
488	int created, cuflags, descend, enforce, error, errmsg_len, errmsg_pos;
489	int gotchildmax, gotenforce, gothid, gotslevel;
490	int fi, jid, jsys, len, level;
491	int childmax, slevel, vfslocked;
492#if defined(INET) || defined(INET6)
493	int ii, ij;
494#endif
495#ifdef INET
496	int ip4s, redo_ip4;
497#endif
498#ifdef INET6
499	int ip6s, redo_ip6;
500#endif
501	unsigned pr_flags, ch_flags;
502	unsigned pr_allow, ch_allow, tallow;
503	char numbuf[12];
504
505	error = priv_check(td, PRIV_JAIL_SET);
506	if (!error && (flags & JAIL_ATTACH))
507		error = priv_check(td, PRIV_JAIL_ATTACH);
508	if (error)
509		return (error);
510	mypr = ppr = td->td_ucred->cr_prison;
511	if ((flags & JAIL_CREATE) && mypr->pr_childmax == 0)
512		return (EPERM);
513	if (flags & ~JAIL_SET_MASK)
514		return (EINVAL);
515
516	/*
517	 * Check all the parameters before committing to anything.  Not all
518	 * errors can be caught early, but we may as well try.  Also, this
519	 * takes care of some expensive stuff (path lookup) before getting
520	 * the allprison lock.
521	 *
522	 * XXX Jails are not filesystems, and jail parameters are not mount
523	 *     options.  But it makes more sense to re-use the vfsopt code
524	 *     than duplicate it under a different name.
525	 */
526	error = vfs_buildopts(optuio, &opts);
527	if (error)
528		return (error);
529#ifdef INET
530	ip4 = NULL;
531#endif
532#ifdef INET6
533	ip6 = NULL;
534#endif
535
536	error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
537	if (error == ENOENT)
538		jid = 0;
539	else if (error != 0)
540		goto done_free;
541
542	error = vfs_copyopt(opts, "securelevel", &slevel, sizeof(slevel));
543	if (error == ENOENT)
544		gotslevel = 0;
545	else if (error != 0)
546		goto done_free;
547	else
548		gotslevel = 1;
549
550	error =
551	    vfs_copyopt(opts, "children.max", &childmax, sizeof(childmax));
552	if (error == ENOENT)
553		gotchildmax = 0;
554	else if (error != 0)
555		goto done_free;
556	else
557		gotchildmax = 1;
558
559	error = vfs_copyopt(opts, "enforce_statfs", &enforce, sizeof(enforce));
560	gotenforce = (error == 0);
561	if (gotenforce) {
562		if (enforce < 0 || enforce > 2)
563			return (EINVAL);
564	} else if (error != ENOENT)
565		goto done_free;
566
567	pr_flags = ch_flags = 0;
568	for (fi = 0; fi < sizeof(pr_flag_names) / sizeof(pr_flag_names[0]);
569	    fi++) {
570		if (pr_flag_names[fi] == NULL)
571			continue;
572		vfs_flagopt(opts, pr_flag_names[fi], &pr_flags, 1 << fi);
573		vfs_flagopt(opts, pr_flag_nonames[fi], &ch_flags, 1 << fi);
574	}
575	ch_flags |= pr_flags;
576	for (fi = 0; fi < sizeof(pr_flag_jailsys) / sizeof(pr_flag_jailsys[0]);
577	    fi++) {
578		error = vfs_copyopt(opts, pr_flag_jailsys[fi].name, &jsys,
579		    sizeof(jsys));
580		if (error == ENOENT)
581			continue;
582		if (error != 0)
583			goto done_free;
584		switch (jsys) {
585		case JAIL_SYS_DISABLE:
586			if (!pr_flag_jailsys[fi].disable) {
587				error = EINVAL;
588				goto done_free;
589			}
590			pr_flags |= pr_flag_jailsys[fi].disable;
591			break;
592		case JAIL_SYS_NEW:
593			pr_flags |= pr_flag_jailsys[fi].new;
594			break;
595		case JAIL_SYS_INHERIT:
596			break;
597		default:
598			error = EINVAL;
599			goto done_free;
600		}
601		ch_flags |=
602		    pr_flag_jailsys[fi].new | pr_flag_jailsys[fi].disable;
603	}
604	if ((flags & (JAIL_CREATE | JAIL_UPDATE | JAIL_ATTACH)) == JAIL_CREATE
605	    && !(pr_flags & PR_PERSIST)) {
606		error = EINVAL;
607		vfs_opterror(opts, "new jail must persist or attach");
608		goto done_errmsg;
609	}
610#ifdef VIMAGE
611	if ((flags & JAIL_UPDATE) && (ch_flags & PR_VNET)) {
612		error = EINVAL;
613		vfs_opterror(opts, "vnet cannot be changed after creation");
614		goto done_errmsg;
615	}
616#endif
617#ifdef INET
618	if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP4_USER)) {
619		error = EINVAL;
620		vfs_opterror(opts, "ip4 cannot be changed after creation");
621		goto done_errmsg;
622	}
623#endif
624#ifdef INET6
625	if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP6_USER)) {
626		error = EINVAL;
627		vfs_opterror(opts, "ip6 cannot be changed after creation");
628		goto done_errmsg;
629	}
630#endif
631
632	pr_allow = ch_allow = 0;
633	for (fi = 0; fi < sizeof(pr_allow_names) / sizeof(pr_allow_names[0]);
634	    fi++) {
635		vfs_flagopt(opts, pr_allow_names[fi], &pr_allow, 1 << fi);
636		vfs_flagopt(opts, pr_allow_nonames[fi], &ch_allow, 1 << fi);
637	}
638	ch_allow |= pr_allow;
639
640	error = vfs_getopt(opts, "name", (void **)&name, &len);
641	if (error == ENOENT)
642		name = NULL;
643	else if (error != 0)
644		goto done_free;
645	else {
646		if (len == 0 || name[len - 1] != '\0') {
647			error = EINVAL;
648			goto done_free;
649		}
650		if (len > MAXHOSTNAMELEN) {
651			error = ENAMETOOLONG;
652			goto done_free;
653		}
654	}
655
656	error = vfs_getopt(opts, "host.hostname", (void **)&host, &len);
657	if (error == ENOENT)
658		host = NULL;
659	else if (error != 0)
660		goto done_free;
661	else {
662		ch_flags |= PR_HOST;
663		pr_flags |= PR_HOST;
664		if (len == 0 || host[len - 1] != '\0') {
665			error = EINVAL;
666			goto done_free;
667		}
668		if (len > MAXHOSTNAMELEN) {
669			error = ENAMETOOLONG;
670			goto done_free;
671		}
672	}
673
674	error = vfs_getopt(opts, "host.domainname", (void **)&domain, &len);
675	if (error == ENOENT)
676		domain = NULL;
677	else if (error != 0)
678		goto done_free;
679	else {
680		ch_flags |= PR_HOST;
681		pr_flags |= PR_HOST;
682		if (len == 0 || domain[len - 1] != '\0') {
683			error = EINVAL;
684			goto done_free;
685		}
686		if (len > MAXHOSTNAMELEN) {
687			error = ENAMETOOLONG;
688			goto done_free;
689		}
690	}
691
692	error = vfs_getopt(opts, "host.hostuuid", (void **)&uuid, &len);
693	if (error == ENOENT)
694		uuid = NULL;
695	else if (error != 0)
696		goto done_free;
697	else {
698		ch_flags |= PR_HOST;
699		pr_flags |= PR_HOST;
700		if (len == 0 || uuid[len - 1] != '\0') {
701			error = EINVAL;
702			goto done_free;
703		}
704		if (len > HOSTUUIDLEN) {
705			error = ENAMETOOLONG;
706			goto done_free;
707		}
708	}
709
710#ifdef COMPAT_IA32
711	if (td->td_proc->p_sysent->sv_flags & SV_IA32) {
712		uint32_t hid32;
713
714		error = vfs_copyopt(opts, "host.hostid", &hid32, sizeof(hid32));
715		hid = hid32;
716	} else
717#endif
718		error = vfs_copyopt(opts, "host.hostid", &hid, sizeof(hid));
719	if (error == ENOENT)
720		gothid = 0;
721	else if (error != 0)
722		goto done_free;
723	else {
724		gothid = 1;
725		ch_flags |= PR_HOST;
726		pr_flags |= PR_HOST;
727	}
728
729#ifdef INET
730	error = vfs_getopt(opts, "ip4.addr", &op, &ip4s);
731	if (error == ENOENT)
732		ip4s = (pr_flags & PR_IP4_DISABLE) ? 0 : -1;
733	else if (error != 0)
734		goto done_free;
735	else if (ip4s & (sizeof(*ip4) - 1)) {
736		error = EINVAL;
737		goto done_free;
738	} else {
739		ch_flags |= PR_IP4_USER | PR_IP4_DISABLE;
740		if (ip4s == 0)
741			pr_flags |= PR_IP4_USER | PR_IP4_DISABLE;
742		else {
743			pr_flags = (pr_flags & ~PR_IP4_DISABLE) | PR_IP4_USER;
744			ip4s /= sizeof(*ip4);
745			if (ip4s > jail_max_af_ips) {
746				error = EINVAL;
747				vfs_opterror(opts, "too many IPv4 addresses");
748				goto done_errmsg;
749			}
750			ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK);
751			bcopy(op, ip4, ip4s * sizeof(*ip4));
752			/*
753			 * IP addresses are all sorted but ip[0] to preserve
754			 * the primary IP address as given from userland.
755			 * This special IP is used for unbound outgoing
756			 * connections as well for "loopback" traffic.
757			 */
758			if (ip4s > 1)
759				qsort(ip4 + 1, ip4s - 1, sizeof(*ip4), qcmp_v4);
760			/*
761			 * Check for duplicate addresses and do some simple
762			 * zero and broadcast checks. If users give other bogus
763			 * addresses it is their problem.
764			 *
765			 * We do not have to care about byte order for these
766			 * checks so we will do them in NBO.
767			 */
768			for (ii = 0; ii < ip4s; ii++) {
769				if (ip4[ii].s_addr == INADDR_ANY ||
770				    ip4[ii].s_addr == INADDR_BROADCAST) {
771					error = EINVAL;
772					goto done_free;
773				}
774				if ((ii+1) < ip4s &&
775				    (ip4[0].s_addr == ip4[ii+1].s_addr ||
776				     ip4[ii].s_addr == ip4[ii+1].s_addr)) {
777					error = EINVAL;
778					goto done_free;
779				}
780			}
781		}
782	}
783#endif
784
785#ifdef INET6
786	error = vfs_getopt(opts, "ip6.addr", &op, &ip6s);
787	if (error == ENOENT)
788		ip6s = (pr_flags & PR_IP6_DISABLE) ? 0 : -1;
789	else if (error != 0)
790		goto done_free;
791	else if (ip6s & (sizeof(*ip6) - 1)) {
792		error = EINVAL;
793		goto done_free;
794	} else {
795		ch_flags |= PR_IP6_USER | PR_IP6_DISABLE;
796		if (ip6s == 0)
797			pr_flags |= PR_IP6_USER | PR_IP6_DISABLE;
798		else {
799			pr_flags = (pr_flags & ~PR_IP6_DISABLE) | PR_IP6_USER;
800			ip6s /= sizeof(*ip6);
801			if (ip6s > jail_max_af_ips) {
802				error = EINVAL;
803				vfs_opterror(opts, "too many IPv6 addresses");
804				goto done_errmsg;
805			}
806			ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK);
807			bcopy(op, ip6, ip6s * sizeof(*ip6));
808			if (ip6s > 1)
809				qsort(ip6 + 1, ip6s - 1, sizeof(*ip6), qcmp_v6);
810			for (ii = 0; ii < ip6s; ii++) {
811				if (IN6_IS_ADDR_UNSPECIFIED(&ip6[ii])) {
812					error = EINVAL;
813					goto done_free;
814				}
815				if ((ii+1) < ip6s &&
816				    (IN6_ARE_ADDR_EQUAL(&ip6[0], &ip6[ii+1]) ||
817				     IN6_ARE_ADDR_EQUAL(&ip6[ii], &ip6[ii+1])))
818				{
819					error = EINVAL;
820					goto done_free;
821				}
822			}
823		}
824	}
825#endif
826
827#if defined(VIMAGE) && (defined(INET) || defined(INET6))
828	if ((ch_flags & PR_VNET) && (ch_flags & (PR_IP4_USER | PR_IP6_USER))) {
829		error = EINVAL;
830		vfs_opterror(opts,
831		    "vnet jails cannot have IP address restrictions");
832		goto done_errmsg;
833	}
834#endif
835
836	root = NULL;
837	error = vfs_getopt(opts, "path", (void **)&path, &len);
838	if (error == ENOENT)
839		path = NULL;
840	else if (error != 0)
841		goto done_free;
842	else {
843		if (flags & JAIL_UPDATE) {
844			error = EINVAL;
845			vfs_opterror(opts,
846			    "path cannot be changed after creation");
847			goto done_errmsg;
848		}
849		if (len == 0 || path[len - 1] != '\0') {
850			error = EINVAL;
851			goto done_free;
852		}
853		if (len < 2 || (len == 2 && path[0] == '/'))
854			path = NULL;
855		else {
856			/* Leave room for a real-root full pathname. */
857			if (len + (path[0] == '/' && strcmp(mypr->pr_path, "/")
858			    ? strlen(mypr->pr_path) : 0) > MAXPATHLEN) {
859				error = ENAMETOOLONG;
860				goto done_free;
861			}
862			NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW, UIO_SYSSPACE,
863			    path, td);
864			error = namei(&nd);
865			if (error)
866				goto done_free;
867			vfslocked = NDHASGIANT(&nd);
868			root = nd.ni_vp;
869			NDFREE(&nd, NDF_ONLY_PNBUF);
870			if (root->v_type != VDIR) {
871				error = ENOTDIR;
872				vrele(root);
873				VFS_UNLOCK_GIANT(vfslocked);
874				goto done_free;
875			}
876			VFS_UNLOCK_GIANT(vfslocked);
877		}
878	}
879
880	/*
881	 * Grab the allprison lock before letting modules check their
882	 * parameters.  Once we have it, do not let go so we'll have a
883	 * consistent view of the OSD list.
884	 */
885	sx_xlock(&allprison_lock);
886	error = osd_jail_call(NULL, PR_METHOD_CHECK, opts);
887	if (error)
888		goto done_unlock_list;
889
890	/* By now, all parameters should have been noted. */
891	TAILQ_FOREACH(opt, opts, link) {
892		if (!opt->seen && strcmp(opt->name, "errmsg")) {
893			error = EINVAL;
894			vfs_opterror(opts, "unknown parameter: %s", opt->name);
895			goto done_unlock_list;
896		}
897	}
898
899	/*
900	 * See if we are creating a new record or updating an existing one.
901	 * This abuses the file error codes ENOENT and EEXIST.
902	 */
903	cuflags = flags & (JAIL_CREATE | JAIL_UPDATE);
904	if (!cuflags) {
905		error = EINVAL;
906		vfs_opterror(opts, "no valid operation (create or update)");
907		goto done_unlock_list;
908	}
909	pr = NULL;
910	namelc = NULL;
911	if (cuflags == JAIL_CREATE && jid == 0 && name != NULL) {
912		namelc = strrchr(name, '.');
913		jid = strtoul(namelc != NULL ? namelc + 1 : name, &p, 10);
914		if (*p != '\0')
915			jid = 0;
916	}
917	if (jid != 0) {
918		/*
919		 * See if a requested jid already exists.  There is an
920		 * information leak here if the jid exists but is not within
921		 * the caller's jail hierarchy.  Jail creators will get EEXIST
922		 * even though they cannot see the jail, and CREATE | UPDATE
923		 * will return ENOENT which is not normally a valid error.
924		 */
925		if (jid < 0) {
926			error = EINVAL;
927			vfs_opterror(opts, "negative jid");
928			goto done_unlock_list;
929		}
930		pr = prison_find(jid);
931		if (pr != NULL) {
932			ppr = pr->pr_parent;
933			/* Create: jid must not exist. */
934			if (cuflags == JAIL_CREATE) {
935				mtx_unlock(&pr->pr_mtx);
936				error = EEXIST;
937				vfs_opterror(opts, "jail %d already exists",
938				    jid);
939				goto done_unlock_list;
940			}
941			if (!prison_ischild(mypr, pr)) {
942				mtx_unlock(&pr->pr_mtx);
943				pr = NULL;
944			} else if (pr->pr_uref == 0) {
945				if (!(flags & JAIL_DYING)) {
946					mtx_unlock(&pr->pr_mtx);
947					error = ENOENT;
948					vfs_opterror(opts, "jail %d is dying",
949					    jid);
950					goto done_unlock_list;
951				} else if ((flags & JAIL_ATTACH) ||
952				    (pr_flags & PR_PERSIST)) {
953					/*
954					 * A dying jail might be resurrected
955					 * (via attach or persist), but first
956					 * it must determine if another jail
957					 * has claimed its name.  Accomplish
958					 * this by implicitly re-setting the
959					 * name.
960					 */
961					if (name == NULL)
962						name = prison_name(mypr, pr);
963				}
964			}
965		}
966		if (pr == NULL) {
967			/* Update: jid must exist. */
968			if (cuflags == JAIL_UPDATE) {
969				error = ENOENT;
970				vfs_opterror(opts, "jail %d not found", jid);
971				goto done_unlock_list;
972			}
973		}
974	}
975	/*
976	 * If the caller provided a name, look for a jail by that name.
977	 * This has different semantics for creates and updates keyed by jid
978	 * (where the name must not already exist in a different jail),
979	 * and updates keyed by the name itself (where the name must exist
980	 * because that is the jail being updated).
981	 */
982	if (name != NULL) {
983		namelc = strrchr(name, '.');
984		if (namelc == NULL)
985			namelc = name;
986		else {
987			/*
988			 * This is a hierarchical name.  Split it into the
989			 * parent and child names, and make sure the parent
990			 * exists or matches an already found jail.
991			 */
992			*namelc = '\0';
993			if (pr != NULL) {
994				if (strncmp(name, ppr->pr_name, namelc - name)
995				    || ppr->pr_name[namelc - name] != '\0') {
996					mtx_unlock(&pr->pr_mtx);
997					error = EINVAL;
998					vfs_opterror(opts,
999					    "cannot change jail's parent");
1000					goto done_unlock_list;
1001				}
1002			} else {
1003				ppr = prison_find_name(mypr, name);
1004				if (ppr == NULL) {
1005					error = ENOENT;
1006					vfs_opterror(opts,
1007					    "jail \"%s\" not found", name);
1008					goto done_unlock_list;
1009				}
1010				mtx_unlock(&ppr->pr_mtx);
1011			}
1012			name = ++namelc;
1013		}
1014		if (name[0] != '\0') {
1015			namelen =
1016			    (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1;
1017 name_again:
1018			deadpr = NULL;
1019			FOREACH_PRISON_CHILD(ppr, tpr) {
1020				if (tpr != pr && tpr->pr_ref > 0 &&
1021				    !strcmp(tpr->pr_name + namelen, name)) {
1022					if (pr == NULL &&
1023					    cuflags != JAIL_CREATE) {
1024						mtx_lock(&tpr->pr_mtx);
1025						if (tpr->pr_ref > 0) {
1026							/*
1027							 * Use this jail
1028							 * for updates.
1029							 */
1030							if (tpr->pr_uref > 0) {
1031								pr = tpr;
1032								break;
1033							}
1034							deadpr = tpr;
1035						}
1036						mtx_unlock(&tpr->pr_mtx);
1037					} else if (tpr->pr_uref > 0) {
1038						/*
1039						 * Create, or update(jid):
1040						 * name must not exist in an
1041						 * active sibling jail.
1042						 */
1043						error = EEXIST;
1044						if (pr != NULL)
1045							mtx_unlock(&pr->pr_mtx);
1046						vfs_opterror(opts,
1047						   "jail \"%s\" already exists",
1048						   name);
1049						goto done_unlock_list;
1050					}
1051				}
1052			}
1053			/* If no active jail is found, use a dying one. */
1054			if (deadpr != NULL && pr == NULL) {
1055				if (flags & JAIL_DYING) {
1056					mtx_lock(&deadpr->pr_mtx);
1057					if (deadpr->pr_ref == 0) {
1058						mtx_unlock(&deadpr->pr_mtx);
1059						goto name_again;
1060					}
1061					pr = deadpr;
1062				} else if (cuflags == JAIL_UPDATE) {
1063					error = ENOENT;
1064					vfs_opterror(opts,
1065					    "jail \"%s\" is dying", name);
1066					goto done_unlock_list;
1067				}
1068			}
1069			/* Update: name must exist if no jid. */
1070			else if (cuflags == JAIL_UPDATE && pr == NULL) {
1071				error = ENOENT;
1072				vfs_opterror(opts, "jail \"%s\" not found",
1073				    name);
1074				goto done_unlock_list;
1075			}
1076		}
1077	}
1078	/* Update: must provide a jid or name. */
1079	else if (cuflags == JAIL_UPDATE && pr == NULL) {
1080		error = ENOENT;
1081		vfs_opterror(opts, "update specified no jail");
1082		goto done_unlock_list;
1083	}
1084
1085	/* If there's no prison to update, create a new one and link it in. */
1086	if (pr == NULL) {
1087		for (tpr = mypr; tpr != NULL; tpr = tpr->pr_parent)
1088			if (tpr->pr_childcount >= tpr->pr_childmax) {
1089				error = EPERM;
1090				vfs_opterror(opts, "prison limit exceeded");
1091				goto done_unlock_list;
1092			}
1093		created = 1;
1094		mtx_lock(&ppr->pr_mtx);
1095		if (ppr->pr_ref == 0 || (ppr->pr_flags & PR_REMOVE)) {
1096			mtx_unlock(&ppr->pr_mtx);
1097			error = ENOENT;
1098			vfs_opterror(opts, "parent jail went away!");
1099			goto done_unlock_list;
1100		}
1101		ppr->pr_ref++;
1102		ppr->pr_uref++;
1103		mtx_unlock(&ppr->pr_mtx);
1104		pr = malloc(sizeof(*pr), M_PRISON, M_WAITOK | M_ZERO);
1105		if (jid == 0) {
1106			/* Find the next free jid. */
1107			jid = lastprid + 1;
1108 findnext:
1109			if (jid == JAIL_MAX)
1110				jid = 1;
1111			TAILQ_FOREACH(tpr, &allprison, pr_list) {
1112				if (tpr->pr_id < jid)
1113					continue;
1114				if (tpr->pr_id > jid || tpr->pr_ref == 0) {
1115					TAILQ_INSERT_BEFORE(tpr, pr, pr_list);
1116					break;
1117				}
1118				if (jid == lastprid) {
1119					error = EAGAIN;
1120					vfs_opterror(opts,
1121					    "no available jail IDs");
1122					free(pr, M_PRISON);
1123					prison_deref(ppr, PD_DEREF |
1124					    PD_DEUREF | PD_LIST_XLOCKED);
1125					goto done_releroot;
1126				}
1127				jid++;
1128				goto findnext;
1129			}
1130			lastprid = jid;
1131		} else {
1132			/*
1133			 * The jail already has a jid (that did not yet exist),
1134			 * so just find where to insert it.
1135			 */
1136			TAILQ_FOREACH(tpr, &allprison, pr_list)
1137				if (tpr->pr_id >= jid) {
1138					TAILQ_INSERT_BEFORE(tpr, pr, pr_list);
1139					break;
1140				}
1141		}
1142		if (tpr == NULL)
1143			TAILQ_INSERT_TAIL(&allprison, pr, pr_list);
1144		LIST_INSERT_HEAD(&ppr->pr_children, pr, pr_sibling);
1145		for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent)
1146			tpr->pr_childcount++;
1147
1148		pr->pr_parent = ppr;
1149		pr->pr_id = jid;
1150
1151		/* Set some default values, and inherit some from the parent. */
1152		if (name == NULL)
1153			name = "";
1154		if (path == NULL) {
1155			path = "/";
1156			root = mypr->pr_root;
1157			vref(root);
1158		}
1159		strlcpy(pr->pr_hostuuid, DEFAULT_HOSTUUID, HOSTUUIDLEN);
1160		pr->pr_flags |= PR_HOST;
1161#if defined(INET) || defined(INET6)
1162#ifdef VIMAGE
1163		if (!(pr_flags & PR_VNET))
1164#endif
1165		{
1166#ifdef INET
1167			if (!(ch_flags & PR_IP4_USER))
1168				pr->pr_flags |=
1169				    PR_IP4 | PR_IP4_USER | PR_IP4_DISABLE;
1170			else if (!(pr_flags & PR_IP4_USER)) {
1171				pr->pr_flags |= ppr->pr_flags & PR_IP4;
1172				if (ppr->pr_ip4 != NULL) {
1173					pr->pr_ip4s = ppr->pr_ip4s;
1174					pr->pr_ip4 = malloc(pr->pr_ip4s *
1175					    sizeof(struct in_addr), M_PRISON,
1176					    M_WAITOK);
1177					bcopy(ppr->pr_ip4, pr->pr_ip4,
1178					    pr->pr_ip4s * sizeof(*pr->pr_ip4));
1179				}
1180			}
1181#endif
1182#ifdef INET6
1183			if (!(ch_flags & PR_IP6_USER))
1184				pr->pr_flags |=
1185				    PR_IP6 | PR_IP6_USER | PR_IP6_DISABLE;
1186			else if (!(pr_flags & PR_IP6_USER)) {
1187				pr->pr_flags |= ppr->pr_flags & PR_IP6;
1188				if (ppr->pr_ip6 != NULL) {
1189					pr->pr_ip6s = ppr->pr_ip6s;
1190					pr->pr_ip6 = malloc(pr->pr_ip6s *
1191					    sizeof(struct in6_addr), M_PRISON,
1192					    M_WAITOK);
1193					bcopy(ppr->pr_ip6, pr->pr_ip6,
1194					    pr->pr_ip6s * sizeof(*pr->pr_ip6));
1195				}
1196			}
1197#endif
1198		}
1199#endif
1200		pr->pr_securelevel = ppr->pr_securelevel;
1201		pr->pr_allow = JAIL_DEFAULT_ALLOW & ppr->pr_allow;
1202		pr->pr_enforce_statfs = JAIL_DEFAULT_ENFORCE_STATFS;
1203
1204		LIST_INIT(&pr->pr_children);
1205		mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF | MTX_DUPOK);
1206
1207#ifdef VIMAGE
1208		/* Allocate a new vnet if specified. */
1209		pr->pr_vnet = (pr_flags & PR_VNET)
1210		    ? vnet_alloc() : ppr->pr_vnet;
1211#endif
1212		/*
1213		 * Allocate a dedicated cpuset for each jail.
1214		 * Unlike other initial settings, this may return an erorr.
1215		 */
1216		error = cpuset_create_root(ppr, &pr->pr_cpuset);
1217		if (error) {
1218			prison_deref(pr, PD_LIST_XLOCKED);
1219			goto done_releroot;
1220		}
1221
1222		mtx_lock(&pr->pr_mtx);
1223		/*
1224		 * New prisons do not yet have a reference, because we do not
1225		 * want other to see the incomplete prison once the
1226		 * allprison_lock is downgraded.
1227		 */
1228	} else {
1229		created = 0;
1230		/*
1231		 * Grab a reference for existing prisons, to ensure they
1232		 * continue to exist for the duration of the call.
1233		 */
1234		pr->pr_ref++;
1235#if defined(VIMAGE) && (defined(INET) || defined(INET6))
1236		if ((pr->pr_flags & PR_VNET) &&
1237		    (ch_flags & (PR_IP4_USER | PR_IP6_USER))) {
1238			error = EINVAL;
1239			vfs_opterror(opts,
1240			    "vnet jails cannot have IP address restrictions");
1241			goto done_deref_locked;
1242		}
1243#endif
1244#ifdef INET
1245		if (PR_IP4_USER & ch_flags & (pr_flags ^ pr->pr_flags)) {
1246			error = EINVAL;
1247			vfs_opterror(opts,
1248			    "ip4 cannot be changed after creation");
1249			goto done_deref_locked;
1250		}
1251#endif
1252#ifdef INET6
1253		if (PR_IP6_USER & ch_flags & (pr_flags ^ pr->pr_flags)) {
1254			error = EINVAL;
1255			vfs_opterror(opts,
1256			    "ip6 cannot be changed after creation");
1257			goto done_deref_locked;
1258		}
1259#endif
1260	}
1261
1262	/* Do final error checking before setting anything. */
1263	if (gotslevel) {
1264		if (slevel < ppr->pr_securelevel) {
1265			error = EPERM;
1266			goto done_deref_locked;
1267		}
1268	}
1269	if (gotchildmax) {
1270		if (childmax >= ppr->pr_childmax) {
1271			error = EPERM;
1272			goto done_deref_locked;
1273		}
1274	}
1275	if (gotenforce) {
1276		if (enforce < ppr->pr_enforce_statfs) {
1277			error = EPERM;
1278			goto done_deref_locked;
1279		}
1280	}
1281#ifdef INET
1282	if (ip4s > 0) {
1283		if (ppr->pr_flags & PR_IP4) {
1284			/*
1285			 * Make sure the new set of IP addresses is a
1286			 * subset of the parent's list.  Don't worry
1287			 * about the parent being unlocked, as any
1288			 * setting is done with allprison_lock held.
1289			 */
1290			for (ij = 0; ij < ppr->pr_ip4s; ij++)
1291				if (ip4[0].s_addr == ppr->pr_ip4[ij].s_addr)
1292					break;
1293			if (ij == ppr->pr_ip4s) {
1294				error = EPERM;
1295				goto done_deref_locked;
1296			}
1297			if (ip4s > 1) {
1298				for (ii = ij = 1; ii < ip4s; ii++) {
1299					if (ip4[ii].s_addr ==
1300					    ppr->pr_ip4[0].s_addr)
1301						continue;
1302					for (; ij < ppr->pr_ip4s; ij++)
1303						if (ip4[ii].s_addr ==
1304						    ppr->pr_ip4[ij].s_addr)
1305							break;
1306					if (ij == ppr->pr_ip4s)
1307						break;
1308				}
1309				if (ij == ppr->pr_ip4s) {
1310					error = EPERM;
1311					goto done_deref_locked;
1312				}
1313			}
1314		}
1315		/*
1316		 * Check for conflicting IP addresses.  We permit them
1317		 * if there is no more than one IP on each jail.  If
1318		 * there is a duplicate on a jail with more than one
1319		 * IP stop checking and return error.
1320		 */
1321		tppr = ppr;
1322#ifdef VIMAGE
1323		for (; tppr != &prison0; tppr = tppr->pr_parent)
1324			if (tppr->pr_flags & PR_VNET)
1325				break;
1326#endif
1327		FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) {
1328			if (tpr == pr ||
1329#ifdef VIMAGE
1330			    (tpr != tppr && (tpr->pr_flags & PR_VNET)) ||
1331#endif
1332			    tpr->pr_uref == 0) {
1333				descend = 0;
1334				continue;
1335			}
1336			if (!(tpr->pr_flags & PR_IP4_USER))
1337				continue;
1338			descend = 0;
1339			if (tpr->pr_ip4 == NULL ||
1340			    (ip4s == 1 && tpr->pr_ip4s == 1))
1341				continue;
1342			for (ii = 0; ii < ip4s; ii++) {
1343				if (_prison_check_ip4(tpr, &ip4[ii]) == 0) {
1344					error = EADDRINUSE;
1345					vfs_opterror(opts,
1346					    "IPv4 addresses clash");
1347					goto done_deref_locked;
1348				}
1349			}
1350		}
1351	}
1352#endif
1353#ifdef INET6
1354	if (ip6s > 0) {
1355		if (ppr->pr_flags & PR_IP6) {
1356			/*
1357			 * Make sure the new set of IP addresses is a
1358			 * subset of the parent's list.
1359			 */
1360			for (ij = 0; ij < ppr->pr_ip6s; ij++)
1361				if (IN6_ARE_ADDR_EQUAL(&ip6[0],
1362				    &ppr->pr_ip6[ij]))
1363					break;
1364			if (ij == ppr->pr_ip6s) {
1365				error = EPERM;
1366				goto done_deref_locked;
1367			}
1368			if (ip6s > 1) {
1369				for (ii = ij = 1; ii < ip6s; ii++) {
1370					if (IN6_ARE_ADDR_EQUAL(&ip6[ii],
1371					     &ppr->pr_ip6[0]))
1372						continue;
1373					for (; ij < ppr->pr_ip6s; ij++)
1374						if (IN6_ARE_ADDR_EQUAL(
1375						    &ip6[ii], &ppr->pr_ip6[ij]))
1376							break;
1377					if (ij == ppr->pr_ip6s)
1378						break;
1379				}
1380				if (ij == ppr->pr_ip6s) {
1381					error = EPERM;
1382					goto done_deref_locked;
1383				}
1384			}
1385		}
1386		/* Check for conflicting IP addresses. */
1387		tppr = ppr;
1388#ifdef VIMAGE
1389		for (; tppr != &prison0; tppr = tppr->pr_parent)
1390			if (tppr->pr_flags & PR_VNET)
1391				break;
1392#endif
1393		FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) {
1394			if (tpr == pr ||
1395#ifdef VIMAGE
1396			    (tpr != tppr && (tpr->pr_flags & PR_VNET)) ||
1397#endif
1398			    tpr->pr_uref == 0) {
1399				descend = 0;
1400				continue;
1401			}
1402			if (!(tpr->pr_flags & PR_IP6_USER))
1403				continue;
1404			descend = 0;
1405			if (tpr->pr_ip6 == NULL ||
1406			    (ip6s == 1 && tpr->pr_ip6s == 1))
1407				continue;
1408			for (ii = 0; ii < ip6s; ii++) {
1409				if (_prison_check_ip6(tpr, &ip6[ii]) == 0) {
1410					error = EADDRINUSE;
1411					vfs_opterror(opts,
1412					    "IPv6 addresses clash");
1413					goto done_deref_locked;
1414				}
1415			}
1416		}
1417	}
1418#endif
1419	onamelen = namelen = 0;
1420	if (name != NULL) {
1421		/* Give a default name of the jid. */
1422		if (name[0] == '\0')
1423			snprintf(name = numbuf, sizeof(numbuf), "%d", jid);
1424		else if (*namelc == '0' || (strtoul(namelc, &p, 10) != jid &&
1425		    *p == '\0')) {
1426			error = EINVAL;
1427			vfs_opterror(opts,
1428			    "name cannot be numeric (unless it is the jid)");
1429			goto done_deref_locked;
1430		}
1431		/*
1432		 * Make sure the name isn't too long for the prison or its
1433		 * children.
1434		 */
1435		onamelen = strlen(pr->pr_name);
1436		namelen = strlen(name);
1437		if (strlen(ppr->pr_name) + namelen + 2 > sizeof(pr->pr_name)) {
1438			error = ENAMETOOLONG;
1439			goto done_deref_locked;
1440		}
1441		FOREACH_PRISON_DESCENDANT(pr, tpr, descend) {
1442			if (strlen(tpr->pr_name) + (namelen - onamelen) >=
1443			    sizeof(pr->pr_name)) {
1444				error = ENAMETOOLONG;
1445				goto done_deref_locked;
1446			}
1447		}
1448	}
1449	if (pr_allow & ~ppr->pr_allow) {
1450		error = EPERM;
1451		goto done_deref_locked;
1452	}
1453
1454	/* Set the parameters of the prison. */
1455#ifdef INET
1456	redo_ip4 = 0;
1457	if (pr_flags & PR_IP4_USER) {
1458		pr->pr_flags |= PR_IP4;
1459		free(pr->pr_ip4, M_PRISON);
1460		pr->pr_ip4s = ip4s;
1461		pr->pr_ip4 = ip4;
1462		ip4 = NULL;
1463		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1464#ifdef VIMAGE
1465			if (tpr->pr_flags & PR_VNET) {
1466				descend = 0;
1467				continue;
1468			}
1469#endif
1470			if (prison_restrict_ip4(tpr, NULL)) {
1471				redo_ip4 = 1;
1472				descend = 0;
1473			}
1474		}
1475	}
1476#endif
1477#ifdef INET6
1478	redo_ip6 = 0;
1479	if (pr_flags & PR_IP6_USER) {
1480		pr->pr_flags |= PR_IP6;
1481		free(pr->pr_ip6, M_PRISON);
1482		pr->pr_ip6s = ip6s;
1483		pr->pr_ip6 = ip6;
1484		ip6 = NULL;
1485		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1486#ifdef VIMAGE
1487			if (tpr->pr_flags & PR_VNET) {
1488				descend = 0;
1489				continue;
1490			}
1491#endif
1492			if (prison_restrict_ip6(tpr, NULL)) {
1493				redo_ip6 = 1;
1494				descend = 0;
1495			}
1496		}
1497	}
1498#endif
1499	if (gotslevel) {
1500		pr->pr_securelevel = slevel;
1501		/* Set all child jails to be at least this level. */
1502		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
1503			if (tpr->pr_securelevel < slevel)
1504				tpr->pr_securelevel = slevel;
1505	}
1506	if (gotchildmax) {
1507		pr->pr_childmax = childmax;
1508		/* Set all child jails to under this limit. */
1509		FOREACH_PRISON_DESCENDANT_LOCKED_LEVEL(pr, tpr, descend, level)
1510			if (tpr->pr_childmax > childmax - level)
1511				tpr->pr_childmax = childmax > level
1512				    ? childmax - level : 0;
1513	}
1514	if (gotenforce) {
1515		pr->pr_enforce_statfs = enforce;
1516		/* Pass this restriction on to the children. */
1517		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
1518			if (tpr->pr_enforce_statfs < enforce)
1519				tpr->pr_enforce_statfs = enforce;
1520	}
1521	if (name != NULL) {
1522		if (ppr == &prison0)
1523			strlcpy(pr->pr_name, name, sizeof(pr->pr_name));
1524		else
1525			snprintf(pr->pr_name, sizeof(pr->pr_name), "%s.%s",
1526			    ppr->pr_name, name);
1527		/* Change this component of child names. */
1528		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1529			bcopy(tpr->pr_name + onamelen, tpr->pr_name + namelen,
1530			    strlen(tpr->pr_name + onamelen) + 1);
1531			bcopy(pr->pr_name, tpr->pr_name, namelen);
1532		}
1533	}
1534	if (path != NULL) {
1535		/* Try to keep a real-rooted full pathname. */
1536		if (path[0] == '/' && strcmp(mypr->pr_path, "/"))
1537			snprintf(pr->pr_path, sizeof(pr->pr_path), "%s%s",
1538			    mypr->pr_path, path);
1539		else
1540			strlcpy(pr->pr_path, path, sizeof(pr->pr_path));
1541		pr->pr_root = root;
1542	}
1543	if (PR_HOST & ch_flags & ~pr_flags) {
1544		if (pr->pr_flags & PR_HOST) {
1545			/*
1546			 * Copy the parent's host info.  As with pr_ip4 above,
1547			 * the lack of a lock on the parent is not a problem;
1548			 * it is always set with allprison_lock at least
1549			 * shared, and is held exclusively here.
1550			 */
1551			strlcpy(pr->pr_hostname, pr->pr_parent->pr_hostname,
1552			    sizeof(pr->pr_hostname));
1553			strlcpy(pr->pr_domainname, pr->pr_parent->pr_domainname,
1554			    sizeof(pr->pr_domainname));
1555			strlcpy(pr->pr_hostuuid, pr->pr_parent->pr_hostuuid,
1556			    sizeof(pr->pr_hostuuid));
1557			pr->pr_hostid = pr->pr_parent->pr_hostid;
1558		}
1559	} else if (host != NULL || domain != NULL || uuid != NULL || gothid) {
1560		/* Set this prison, and any descendants without PR_HOST. */
1561		if (host != NULL)
1562			strlcpy(pr->pr_hostname, host, sizeof(pr->pr_hostname));
1563		if (domain != NULL)
1564			strlcpy(pr->pr_domainname, domain,
1565			    sizeof(pr->pr_domainname));
1566		if (uuid != NULL)
1567			strlcpy(pr->pr_hostuuid, uuid, sizeof(pr->pr_hostuuid));
1568		if (gothid)
1569			pr->pr_hostid = hid;
1570		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1571			if (tpr->pr_flags & PR_HOST)
1572				descend = 0;
1573			else {
1574				if (host != NULL)
1575					strlcpy(tpr->pr_hostname,
1576					    pr->pr_hostname,
1577					    sizeof(tpr->pr_hostname));
1578				if (domain != NULL)
1579					strlcpy(tpr->pr_domainname,
1580					    pr->pr_domainname,
1581					    sizeof(tpr->pr_domainname));
1582				if (uuid != NULL)
1583					strlcpy(tpr->pr_hostuuid,
1584					    pr->pr_hostuuid,
1585					    sizeof(tpr->pr_hostuuid));
1586				if (gothid)
1587					tpr->pr_hostid = hid;
1588			}
1589		}
1590	}
1591	if ((tallow = ch_allow & ~pr_allow)) {
1592		/* Clear allow bits in all children. */
1593		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
1594			tpr->pr_allow &= ~tallow;
1595	}
1596	pr->pr_allow = (pr->pr_allow & ~ch_allow) | pr_allow;
1597	/*
1598	 * Persistent prisons get an extra reference, and prisons losing their
1599	 * persist flag lose that reference.  Only do this for existing prisons
1600	 * for now, so new ones will remain unseen until after the module
1601	 * handlers have completed.
1602	 */
1603	if (!created && (ch_flags & PR_PERSIST & (pr_flags ^ pr->pr_flags))) {
1604		if (pr_flags & PR_PERSIST) {
1605			pr->pr_ref++;
1606			pr->pr_uref++;
1607		} else {
1608			pr->pr_ref--;
1609			pr->pr_uref--;
1610		}
1611	}
1612	pr->pr_flags = (pr->pr_flags & ~ch_flags) | pr_flags;
1613	mtx_unlock(&pr->pr_mtx);
1614
1615	/* Locks may have prevented a complete restriction of child IP
1616	 * addresses.  If so, allocate some more memory and try again.
1617	 */
1618#ifdef INET
1619	while (redo_ip4) {
1620		ip4s = pr->pr_ip4s;
1621		ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK);
1622		mtx_lock(&pr->pr_mtx);
1623		redo_ip4 = 0;
1624		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1625#ifdef VIMAGE
1626			if (tpr->pr_flags & PR_VNET) {
1627				descend = 0;
1628				continue;
1629			}
1630#endif
1631			if (prison_restrict_ip4(tpr, ip4)) {
1632				if (ip4 != NULL)
1633					ip4 = NULL;
1634				else
1635					redo_ip4 = 1;
1636			}
1637		}
1638		mtx_unlock(&pr->pr_mtx);
1639	}
1640#endif
1641#ifdef INET6
1642	while (redo_ip6) {
1643		ip6s = pr->pr_ip6s;
1644		ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK);
1645		mtx_lock(&pr->pr_mtx);
1646		redo_ip6 = 0;
1647		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1648#ifdef VIMAGE
1649			if (tpr->pr_flags & PR_VNET) {
1650				descend = 0;
1651				continue;
1652			}
1653#endif
1654			if (prison_restrict_ip6(tpr, ip6)) {
1655				if (ip6 != NULL)
1656					ip6 = NULL;
1657				else
1658					redo_ip6 = 1;
1659			}
1660		}
1661		mtx_unlock(&pr->pr_mtx);
1662	}
1663#endif
1664
1665	/* Let the modules do their work. */
1666	sx_downgrade(&allprison_lock);
1667	if (created) {
1668		error = osd_jail_call(pr, PR_METHOD_CREATE, opts);
1669		if (error) {
1670			prison_deref(pr, PD_LIST_SLOCKED);
1671			goto done_errmsg;
1672		}
1673	}
1674	error = osd_jail_call(pr, PR_METHOD_SET, opts);
1675	if (error) {
1676		prison_deref(pr, created
1677		    ? PD_LIST_SLOCKED
1678		    : PD_DEREF | PD_LIST_SLOCKED);
1679		goto done_errmsg;
1680	}
1681
1682	/* Attach this process to the prison if requested. */
1683	if (flags & JAIL_ATTACH) {
1684		mtx_lock(&pr->pr_mtx);
1685		error = do_jail_attach(td, pr);
1686		if (error) {
1687			vfs_opterror(opts, "attach failed");
1688			if (!created)
1689				prison_deref(pr, PD_DEREF);
1690			goto done_errmsg;
1691		}
1692	}
1693
1694	/*
1695	 * Now that it is all there, drop the temporary reference from existing
1696	 * prisons.  Or add a reference to newly created persistent prisons
1697	 * (which was not done earlier so that the prison would not be publicly
1698	 * visible).
1699	 */
1700	if (!created) {
1701		prison_deref(pr, (flags & JAIL_ATTACH)
1702		    ? PD_DEREF
1703		    : PD_DEREF | PD_LIST_SLOCKED);
1704	} else {
1705		if (pr_flags & PR_PERSIST) {
1706			mtx_lock(&pr->pr_mtx);
1707			pr->pr_ref++;
1708			pr->pr_uref++;
1709			mtx_unlock(&pr->pr_mtx);
1710		}
1711		if (!(flags & JAIL_ATTACH))
1712			sx_sunlock(&allprison_lock);
1713	}
1714	td->td_retval[0] = pr->pr_id;
1715	goto done_errmsg;
1716
1717 done_deref_locked:
1718	prison_deref(pr, created
1719	    ? PD_LOCKED | PD_LIST_XLOCKED
1720	    : PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED);
1721	goto done_releroot;
1722 done_unlock_list:
1723	sx_xunlock(&allprison_lock);
1724 done_releroot:
1725	if (root != NULL) {
1726		vfslocked = VFS_LOCK_GIANT(root->v_mount);
1727		vrele(root);
1728		VFS_UNLOCK_GIANT(vfslocked);
1729	}
1730 done_errmsg:
1731	if (error) {
1732		vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len);
1733		if (errmsg_len > 0) {
1734			errmsg_pos = 2 * vfs_getopt_pos(opts, "errmsg") + 1;
1735			if (errmsg_pos > 0) {
1736				if (optuio->uio_segflg == UIO_SYSSPACE)
1737					bcopy(errmsg,
1738					   optuio->uio_iov[errmsg_pos].iov_base,
1739					   errmsg_len);
1740				else
1741					copyout(errmsg,
1742					   optuio->uio_iov[errmsg_pos].iov_base,
1743					   errmsg_len);
1744			}
1745		}
1746	}
1747 done_free:
1748#ifdef INET
1749	free(ip4, M_PRISON);
1750#endif
1751#ifdef INET6
1752	free(ip6, M_PRISON);
1753#endif
1754	vfs_freeopts(opts);
1755	return (error);
1756}
1757
1758
1759/*
1760 * struct jail_get_args {
1761 *	struct iovec *iovp;
1762 *	unsigned int iovcnt;
1763 *	int flags;
1764 * };
1765 */
1766int
1767jail_get(struct thread *td, struct jail_get_args *uap)
1768{
1769	struct uio *auio;
1770	int error;
1771
1772	/* Check that we have an even number of iovecs. */
1773	if (uap->iovcnt & 1)
1774		return (EINVAL);
1775
1776	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
1777	if (error)
1778		return (error);
1779	error = kern_jail_get(td, auio, uap->flags);
1780	if (error == 0)
1781		error = copyout(auio->uio_iov, uap->iovp,
1782		    uap->iovcnt * sizeof (struct iovec));
1783	free(auio, M_IOV);
1784	return (error);
1785}
1786
/*
 * Look up one jail and report its parameters through the option list
 * carried in optuio.
 *
 * The jail is selected by the first of these options that is present:
 * "lastjid" (the next jail with a larger id), a non-zero "jid", or "name".
 * A jail whose pr_uref is zero is only returned when JAIL_DYING is set in
 * flags.  Lookups are made relative to the calling thread's own prison.
 *
 * On success, td->td_retval[0] holds the jail id, every fetched value is
 * copied into the corresponding value buffer of optuio and that entry's
 * iov_len is updated, and 0 is returned.  On failure an errno value is
 * returned; if the caller supplied an "errmsg" option, the message text
 * is copied into its buffer.
 */
int
kern_jail_get(struct thread *td, struct uio *optuio, int flags)
{
	struct prison *pr, *mypr;
	struct vfsopt *opt;
	struct vfsoptlist *opts;
	char *errmsg, *name;
	int error, errmsg_len, errmsg_pos, fi, i, jid, len, locked, pos;

	/* Reject any flag bits outside JAIL_GET_MASK. */
	if (flags & ~JAIL_GET_MASK)
		return (EINVAL);

	/* Get the parameter list. */
	error = vfs_buildopts(optuio, &opts);
	if (error)
		return (error);
	/* Remember where "errmsg" sits so error text can be returned later. */
	errmsg_pos = vfs_getopt_pos(opts, "errmsg");
	mypr = td->td_ucred->cr_prison;

	/*
	 * Find the prison specified by one of: lastjid, jid, name.
	 */
	sx_slock(&allprison_lock);
	error = vfs_copyopt(opts, "lastjid", &jid, sizeof(jid));
	if (error == 0) {
		/*
		 * Walk the global list for the first descendant of mypr with
		 * an id above lastjid that is still referenced (and alive,
		 * unless JAIL_DYING was given).  The loop exits via break
		 * with that prison's mutex held.
		 */
		TAILQ_FOREACH(pr, &allprison, pr_list) {
			if (pr->pr_id > jid && prison_ischild(mypr, pr)) {
				mtx_lock(&pr->pr_mtx);
				if (pr->pr_ref > 0 &&
				    (pr->pr_uref > 0 || (flags & JAIL_DYING)))
					break;
				mtx_unlock(&pr->pr_mtx);
			}
		}
		if (pr != NULL)
			goto found_prison;
		error = ENOENT;
		vfs_opterror(opts, "no jail after %d", jid);
		goto done_unlock_list;
	} else if (error != ENOENT)
		goto done_unlock_list;

	/* No "lastjid" option was given; try "jid" (zero falls through). */
	error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
	if (error == 0) {
		if (jid != 0) {
			pr = prison_find_child(mypr, jid);
			if (pr != NULL) {
				/*
				 * The prison's mutex is held here; it is
				 * released before bailing out on a dying jail.
				 */
				if (pr->pr_uref == 0 && !(flags & JAIL_DYING)) {
					mtx_unlock(&pr->pr_mtx);
					error = ENOENT;
					vfs_opterror(opts, "jail %d is dying",
					    jid);
					goto done_unlock_list;
				}
				goto found_prison;
			}
			error = ENOENT;
			vfs_opterror(opts, "jail %d not found", jid);
			goto done_unlock_list;
		}
	} else if (error != ENOENT)
		goto done_unlock_list;

	/* Finally fall back to a lookup by "name". */
	error = vfs_getopt(opts, "name", (void **)&name, &len);
	if (error == 0) {
		/* The name must be a non-empty, NUL-terminated buffer. */
		if (len == 0 || name[len - 1] != '\0') {
			error = EINVAL;
			goto done_unlock_list;
		}
		pr = prison_find_name(mypr, name);
		if (pr != NULL) {
			/* As with the jid lookup, drop the mutex on failure. */
			if (pr->pr_uref == 0 && !(flags & JAIL_DYING)) {
				mtx_unlock(&pr->pr_mtx);
				error = ENOENT;
				vfs_opterror(opts, "jail \"%s\" is dying",
				    name);
				goto done_unlock_list;
			}
			goto found_prison;
		}
		error = ENOENT;
		vfs_opterror(opts, "jail \"%s\" not found", name);
		goto done_unlock_list;
	} else if (error != ENOENT)
		goto done_unlock_list;

	/* None of lastjid, jid or name identified a jail. */
	vfs_opterror(opts, "no jail specified");
	error = ENOENT;
	goto done_unlock_list;

 found_prison:
	/* Get the parameters of the prison. */
	/*
	 * Take a temporary reference for the duration of the call.  "locked"
	 * tracks whether pr_mtx is still held so that done_deref can pass
	 * the right flag to prison_deref().
	 *
	 * Each vfs_setopt*() call below treats ENOENT as harmless
	 * (presumably: the caller did not ask for that parameter); any
	 * other error aborts the request.
	 */
	pr->pr_ref++;
	locked = PD_LOCKED;
	td->td_retval[0] = pr->pr_id;
	error = vfs_setopt(opts, "jid", &pr->pr_id, sizeof(pr->pr_id));
	if (error != 0 && error != ENOENT)
		goto done_deref;
	/* A direct child of the caller's prison reports a parent id of 0. */
	i = (pr->pr_parent == mypr) ? 0 : pr->pr_parent->pr_id;
	error = vfs_setopt(opts, "parent", &i, sizeof(i));
	if (error != 0 && error != ENOENT)
		goto done_deref;
	error = vfs_setopts(opts, "name", prison_name(mypr, pr));
	if (error != 0 && error != ENOENT)
		goto done_deref;
	error = vfs_setopt(opts, "cpuset.id", &pr->pr_cpuset->cs_id,
	    sizeof(pr->pr_cpuset->cs_id));
	if (error != 0 && error != ENOENT)
		goto done_deref;
	error = vfs_setopts(opts, "path", prison_path(mypr, pr));
	if (error != 0 && error != ENOENT)
		goto done_deref;
#ifdef INET
	error = vfs_setopt_part(opts, "ip4.addr", pr->pr_ip4,
	    pr->pr_ip4s * sizeof(*pr->pr_ip4));
	if (error != 0 && error != ENOENT)
		goto done_deref;
#endif
#ifdef INET6
	error = vfs_setopt_part(opts, "ip6.addr", pr->pr_ip6,
	    pr->pr_ip6s * sizeof(*pr->pr_ip6));
	if (error != 0 && error != ENOENT)
		goto done_deref;
#endif
	error = vfs_setopt(opts, "securelevel", &pr->pr_securelevel,
	    sizeof(pr->pr_securelevel));
	if (error != 0 && error != ENOENT)
		goto done_deref;
	error = vfs_setopt(opts, "children.cur", &pr->pr_childcount,
	    sizeof(pr->pr_childcount));
	if (error != 0 && error != ENOENT)
		goto done_deref;
	error = vfs_setopt(opts, "children.max", &pr->pr_childmax,
	    sizeof(pr->pr_childmax));
	if (error != 0 && error != ENOENT)
		goto done_deref;
	error = vfs_setopts(opts, "host.hostname", pr->pr_hostname);
	if (error != 0 && error != ENOENT)
		goto done_deref;
	error = vfs_setopts(opts, "host.domainname", pr->pr_domainname);
	if (error != 0 && error != ENOENT)
		goto done_deref;
	error = vfs_setopts(opts, "host.hostuuid", pr->pr_hostuuid);
	if (error != 0 && error != ENOENT)
		goto done_deref;
#ifdef COMPAT_IA32
	/* Processes flagged SV_IA32 receive the host id as a uint32_t. */
	if (td->td_proc->p_sysent->sv_flags & SV_IA32) {
		uint32_t hid32 = pr->pr_hostid;

		error = vfs_setopt(opts, "host.hostid", &hid32, sizeof(hid32));
	} else
#endif
	error = vfs_setopt(opts, "host.hostid", &pr->pr_hostid,
	    sizeof(pr->pr_hostid));
	if (error != 0 && error != ENOENT)
		goto done_deref;
	error = vfs_setopt(opts, "enforce_statfs", &pr->pr_enforce_statfs,
	    sizeof(pr->pr_enforce_statfs));
	if (error != 0 && error != ENOENT)
		goto done_deref;
	/*
	 * Boolean flags: bit fi of pr_flags is reported under
	 * pr_flag_names[fi], and its negation under pr_flag_nonames[fi].
	 * Unnamed bits are skipped.
	 */
	for (fi = 0; fi < sizeof(pr_flag_names) / sizeof(pr_flag_names[0]);
	    fi++) {
		if (pr_flag_names[fi] == NULL)
			continue;
		i = (pr->pr_flags & (1 << fi)) ? 1 : 0;
		error = vfs_setopt(opts, pr_flag_names[fi], &i, sizeof(i));
		if (error != 0 && error != ENOENT)
			goto done_deref;
		i = !i;
		error = vfs_setopt(opts, pr_flag_nonames[fi], &i, sizeof(i));
		if (error != 0 && error != ENOENT)
			goto done_deref;
	}
	/*
	 * Three-state "jailsys" parameters: map the entry's disable/new
	 * flag bits onto JAIL_SYS_DISABLE, JAIL_SYS_NEW or JAIL_SYS_INHERIT.
	 * DISABLE is only reported for entries that define a disable bit.
	 */
	for (fi = 0; fi < sizeof(pr_flag_jailsys) / sizeof(pr_flag_jailsys[0]);
	    fi++) {
		i = pr->pr_flags &
		    (pr_flag_jailsys[fi].disable | pr_flag_jailsys[fi].new);
		i = pr_flag_jailsys[fi].disable &&
		      (i == pr_flag_jailsys[fi].disable) ? JAIL_SYS_DISABLE
		    : (i == pr_flag_jailsys[fi].new) ? JAIL_SYS_NEW
		    : JAIL_SYS_INHERIT;
		error =
		    vfs_setopt(opts, pr_flag_jailsys[fi].name, &i, sizeof(i));
		if (error != 0 && error != ENOENT)
			goto done_deref;
	}
	/* Permission bits, reported the same way as the boolean flags. */
	for (fi = 0; fi < sizeof(pr_allow_names) / sizeof(pr_allow_names[0]);
	    fi++) {
		if (pr_allow_names[fi] == NULL)
			continue;
		i = (pr->pr_allow & (1 << fi)) ? 1 : 0;
		error = vfs_setopt(opts, pr_allow_names[fi], &i, sizeof(i));
		if (error != 0 && error != ENOENT)
			goto done_deref;
		i = !i;
		error = vfs_setopt(opts, pr_allow_nonames[fi], &i, sizeof(i));
		if (error != 0 && error != ENOENT)
			goto done_deref;
	}
	/* A jail with no user references is reported as dying. */
	i = (pr->pr_uref == 0);
	error = vfs_setopt(opts, "dying", &i, sizeof(i));
	if (error != 0 && error != ENOENT)
		goto done_deref;
	i = !i;
	error = vfs_setopt(opts, "nodying", &i, sizeof(i));
	if (error != 0 && error != ENOENT)
		goto done_deref;

	/* Get the module parameters. */
	/* The mutex is dropped first; the reference taken above remains. */
	mtx_unlock(&pr->pr_mtx);
	locked = 0;
	error = osd_jail_call(pr, PR_METHOD_GET, opts);
	if (error)
		goto done_deref;
	/*
	 * Drop the temporary reference; PD_LIST_SLOCKED is passed because
	 * allprison_lock is still share-locked here.  Nothing below unlocks
	 * the list again.
	 */
	prison_deref(pr, PD_DEREF | PD_LIST_SLOCKED);

	/* By now, all parameters should have been noted. */
	TAILQ_FOREACH(opt, opts, link) {
		if (!opt->seen && strcmp(opt->name, "errmsg")) {
			error = EINVAL;
			vfs_opterror(opts, "unknown parameter: %s", opt->name);
			goto done_errmsg;
		}
	}

	/* Write the fetched parameters back to userspace. */
	error = 0;
	TAILQ_FOREACH(opt, opts, link) {
		if (opt->pos >= 0 && opt->pos != errmsg_pos) {
			/* Option N's value buffer is iovec entry 2N + 1. */
			pos = 2 * opt->pos + 1;
			optuio->uio_iov[pos].iov_len = opt->len;
			if (opt->value != NULL) {
				if (optuio->uio_segflg == UIO_SYSSPACE) {
					bcopy(opt->value,
					    optuio->uio_iov[pos].iov_base,
					    opt->len);
				} else {
					error = copyout(opt->value,
					    optuio->uio_iov[pos].iov_base,
					    opt->len);
					if (error)
						break;
				}
			}
		}
	}
	goto done_errmsg;

 done_deref:
	/*
	 * Error exit while still holding the temporary reference: "locked"
	 * says whether pr_mtx is held as well.
	 */
	prison_deref(pr, locked | PD_DEREF | PD_LIST_SLOCKED);
	goto done_errmsg;

 done_unlock_list:
	sx_sunlock(&allprison_lock);
 done_errmsg:
	/*
	 * Return the error text if the caller provided an "errmsg" option.
	 * The result of copyout() is ignored here; "error" is returned
	 * unchanged either way.
	 */
	if (error && errmsg_pos >= 0) {
		vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len);
		errmsg_pos = 2 * errmsg_pos + 1;
		if (errmsg_len > 0) {
			if (optuio->uio_segflg == UIO_SYSSPACE)
				bcopy(errmsg,
				    optuio->uio_iov[errmsg_pos].iov_base,
				    errmsg_len);
			else
				copyout(errmsg,
				    optuio->uio_iov[errmsg_pos].iov_base,
				    errmsg_len);
		}
	}
	vfs_freeopts(opts);
	return (error);
}
2059
2060
2061/*
2062 * struct jail_remove_args {
2063 *	int jid;
2064 * };
2065 */
2066int
2067jail_remove(struct thread *td, struct jail_remove_args *uap)
2068{
2069	struct prison *pr, *cpr, *lpr, *tpr;
2070	int descend, error;
2071
2072	error = priv_check(td, PRIV_JAIL_REMOVE);
2073	if (error)
2074		return (error);
2075
2076	sx_xlock(&allprison_lock);
2077	pr = prison_find_child(td->td_ucred->cr_prison, uap->jid);
2078	if (pr == NULL) {
2079		sx_xunlock(&allprison_lock);
2080		return (EINVAL);
2081	}
2082
2083	/* Remove all descendants of this prison, then remove this prison. */
2084	pr->pr_ref++;
2085	pr->pr_flags |= PR_REMOVE;
2086	if (!LIST_EMPTY(&pr->pr_children)) {
2087		mtx_unlock(&pr->pr_mtx);
2088		lpr = NULL;
2089		FOREACH_PRISON_DESCENDANT(pr, cpr, descend) {
2090			mtx_lock(&cpr->pr_mtx);
2091			if (cpr->pr_ref > 0) {
2092				tpr = cpr;
2093				cpr->pr_ref++;
2094				cpr->pr_flags |= PR_REMOVE;
2095			} else {
2096				/* Already removed - do not do it again. */
2097				tpr = NULL;
2098			}
2099			mtx_unlock(&cpr->pr_mtx);
2100			if (lpr != NULL) {
2101				mtx_lock(&lpr->pr_mtx);
2102				prison_remove_one(lpr);
2103				sx_xlock(&allprison_lock);
2104			}
2105			lpr = tpr;
2106		}
2107		if (lpr != NULL) {
2108			mtx_lock(&lpr->pr_mtx);
2109			prison_remove_one(lpr);
2110			sx_xlock(&allprison_lock);
2111		}
2112		mtx_lock(&pr->pr_mtx);
2113	}
2114	prison_remove_one(pr);
2115	return (0);
2116}
2117
2118static void
2119prison_remove_one(struct prison *pr)
2120{
2121	struct proc *p;
2122	int deuref;
2123
2124	/* If the prison was persistent, it is not anymore. */
2125	deuref = 0;
2126	if (pr->pr_flags & PR_PERSIST) {
2127		pr->pr_ref--;
2128		deuref = PD_DEUREF;
2129		pr->pr_flags &= ~PR_PERSIST;
2130	}
2131
2132	/*
2133	 * jail_remove added a reference.  If that's the only one, remove
2134	 * the prison now.
2135	 */
2136	KASSERT(pr->pr_ref > 0,
2137	    ("prison_remove_one removing a dead prison (jid=%d)", pr->pr_id));
2138	if (pr->pr_ref == 1) {
2139		prison_deref(pr,
2140		    deuref | PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED);
2141		return;
2142	}
2143
2144	mtx_unlock(&pr->pr_mtx);
2145	sx_xunlock(&allprison_lock);
2146	/*
2147	 * Kill all processes unfortunate enough to be attached to this prison.
2148	 */
2149	sx_slock(&allproc_lock);
2150	LIST_FOREACH(p, &allproc, p_list) {
2151		PROC_LOCK(p);
2152		if (p->p_state != PRS_NEW && p->p_ucred &&
2153		    p->p_ucred->cr_prison == pr)
2154			psignal(p, SIGKILL);
2155		PROC_UNLOCK(p);
2156	}
2157	sx_sunlock(&allproc_lock);
2158	/* Remove the temporary reference added by jail_remove. */
2159	prison_deref(pr, deuref | PD_DEREF);
2160}
2161
2162
2163/*
2164 * struct jail_attach_args {
2165 *	int jid;
2166 * };
2167 */
2168int
2169jail_attach(struct thread *td, struct jail_attach_args *uap)
2170{
2171	struct prison *pr;
2172	int error;
2173
2174	error = priv_check(td, PRIV_JAIL_ATTACH);
2175	if (error)
2176		return (error);
2177
2178	sx_slock(&allprison_lock);
2179	pr = prison_find_child(td->td_ucred->cr_prison, uap->jid);
2180	if (pr == NULL) {
2181		sx_sunlock(&allprison_lock);
2182		return (EINVAL);
2183	}
2184
2185	/*
2186	 * Do not allow a process to attach to a prison that is not
2187	 * considered to be "alive".
2188	 */
2189	if (pr->pr_uref == 0) {
2190		mtx_unlock(&pr->pr_mtx);
2191		sx_sunlock(&allprison_lock);
2192		return (EINVAL);
2193	}
2194
2195	return (do_jail_attach(td, pr));
2196}
2197
2198static int
2199do_jail_attach(struct thread *td, struct prison *pr)
2200{
2201	struct prison *ppr;
2202	struct proc *p;
2203	struct ucred *newcred, *oldcred;
2204	int vfslocked, error;
2205
2206	/*
2207	 * XXX: Note that there is a slight race here if two threads
2208	 * in the same privileged process attempt to attach to two
2209	 * different jails at the same time.  It is important for
2210	 * user processes not to do this, or they might end up with
2211	 * a process root from one prison, but attached to the jail
2212	 * of another.
2213	 */
2214	pr->pr_ref++;
2215	pr->pr_uref++;
2216	mtx_unlock(&pr->pr_mtx);
2217
2218	/* Let modules do whatever they need to prepare for attaching. */
2219	error = osd_jail_call(pr, PR_METHOD_ATTACH, td);
2220	if (error) {
2221		prison_deref(pr, PD_DEREF | PD_DEUREF | PD_LIST_SLOCKED);
2222		return (error);
2223	}
2224	sx_sunlock(&allprison_lock);
2225
2226	/*
2227	 * Reparent the newly attached process to this jail.
2228	 */
2229	ppr = td->td_ucred->cr_prison;
2230	p = td->td_proc;
2231	error = cpuset_setproc_update_set(p, pr->pr_cpuset);
2232	if (error)
2233		goto e_revert_osd;
2234
2235	vfslocked = VFS_LOCK_GIANT(pr->pr_root->v_mount);
2236	vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY);
2237	if ((error = change_dir(pr->pr_root, td)) != 0)
2238		goto e_unlock;
2239#ifdef MAC
2240	if ((error = mac_vnode_check_chroot(td->td_ucred, pr->pr_root)))
2241		goto e_unlock;
2242#endif
2243	VOP_UNLOCK(pr->pr_root, 0);
2244	if ((error = change_root(pr->pr_root, td)))
2245		goto e_unlock_giant;
2246	VFS_UNLOCK_GIANT(vfslocked);
2247
2248	newcred = crget();
2249	PROC_LOCK(p);
2250	oldcred = p->p_ucred;
2251	setsugid(p);
2252	crcopy(newcred, oldcred);
2253	newcred->cr_prison = pr;
2254	p->p_ucred = newcred;
2255	PROC_UNLOCK(p);
2256	crfree(oldcred);
2257	prison_deref(ppr, PD_DEREF | PD_DEUREF);
2258	return (0);
2259 e_unlock:
2260	VOP_UNLOCK(pr->pr_root, 0);
2261 e_unlock_giant:
2262	VFS_UNLOCK_GIANT(vfslocked);
2263 e_revert_osd:
2264	/* Tell modules this thread is still in its old jail after all. */
2265	(void)osd_jail_call(ppr, PR_METHOD_ATTACH, td);
2266	prison_deref(pr, PD_DEREF | PD_DEUREF);
2267	return (error);
2268}
2269
2270
2271/*
2272 * Returns a locked prison instance, or NULL on failure.
2273 */
2274struct prison *
2275prison_find(int prid)
2276{
2277	struct prison *pr;
2278
2279	sx_assert(&allprison_lock, SX_LOCKED);
2280	TAILQ_FOREACH(pr, &allprison, pr_list) {
2281		if (pr->pr_id == prid) {
2282			mtx_lock(&pr->pr_mtx);
2283			if (pr->pr_ref > 0)
2284				return (pr);
2285			mtx_unlock(&pr->pr_mtx);
2286		}
2287	}
2288	return (NULL);
2289}
2290
2291/*
2292 * Find a prison that is a descendant of mypr.  Returns a locked prison or NULL.
2293 */
2294struct prison *
2295prison_find_child(struct prison *mypr, int prid)
2296{
2297	struct prison *pr;
2298	int descend;
2299
2300	sx_assert(&allprison_lock, SX_LOCKED);
2301	FOREACH_PRISON_DESCENDANT(mypr, pr, descend) {
2302		if (pr->pr_id == prid) {
2303			mtx_lock(&pr->pr_mtx);
2304			if (pr->pr_ref > 0)
2305				return (pr);
2306			mtx_unlock(&pr->pr_mtx);
2307		}
2308	}
2309	return (NULL);
2310}
2311
2312/*
2313 * Look for the name relative to mypr.  Returns a locked prison or NULL.
2314 */
2315struct prison *
2316prison_find_name(struct prison *mypr, const char *name)
2317{
2318	struct prison *pr, *deadpr;
2319	size_t mylen;
2320	int descend;
2321
2322	sx_assert(&allprison_lock, SX_LOCKED);
2323	mylen = (mypr == &prison0) ? 0 : strlen(mypr->pr_name) + 1;
2324 again:
2325	deadpr = NULL;
2326	FOREACH_PRISON_DESCENDANT(mypr, pr, descend) {
2327		if (!strcmp(pr->pr_name + mylen, name)) {
2328			mtx_lock(&pr->pr_mtx);
2329			if (pr->pr_ref > 0) {
2330				if (pr->pr_uref > 0)
2331					return (pr);
2332				deadpr = pr;
2333			}
2334			mtx_unlock(&pr->pr_mtx);
2335		}
2336	}
2337	/* There was no valid prison - perhaps there was a dying one. */
2338	if (deadpr != NULL) {
2339		mtx_lock(&deadpr->pr_mtx);
2340		if (deadpr->pr_ref == 0) {
2341			mtx_unlock(&deadpr->pr_mtx);
2342			goto again;
2343		}
2344	}
2345	return (deadpr);
2346}
2347
2348/*
2349 * See if a prison has the specific flag set.
2350 */
2351int
2352prison_flag(struct ucred *cred, unsigned flag)
2353{
2354
2355	/* This is an atomic read, so no locking is necessary. */
2356	return (cred->cr_prison->pr_flags & flag);
2357}
2358
2359int
2360prison_allow(struct ucred *cred, unsigned flag)
2361{
2362
2363	/* This is an atomic read, so no locking is necessary. */
2364	return (cred->cr_prison->pr_allow & flag);
2365}
2366
2367/*
2368 * Remove a prison reference.  If that was the last reference, remove the
2369 * prison itself - but not in this context in case there are locks held.
2370 */
2371void
2372prison_free_locked(struct prison *pr)
2373{
2374
2375	mtx_assert(&pr->pr_mtx, MA_OWNED);
2376	pr->pr_ref--;
2377	if (pr->pr_ref == 0) {
2378		mtx_unlock(&pr->pr_mtx);
2379		TASK_INIT(&pr->pr_task, 0, prison_complete, pr);
2380		taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
2381		return;
2382	}
2383	mtx_unlock(&pr->pr_mtx);
2384}
2385
2386void
2387prison_free(struct prison *pr)
2388{
2389
2390	mtx_lock(&pr->pr_mtx);
2391	prison_free_locked(pr);
2392}
2393
/*
 * Task handler queued by prison_free_locked(): finish removing a prison
 * whose reference count has already reached zero.  The pending count is
 * not used.
 */
static void
prison_complete(void *context, int pending)
{
	struct prison *pr;

	pr = (struct prison *)context;
	/* No flags: the reference was already dropped, nothing is locked. */
	prison_deref(pr, 0);
}
2400
/*
 * Remove a prison reference (usually).  This internal version assumes no
 * mutexes are held, except perhaps the prison itself.  If there are no more
 * references, release and delist the prison.  On completion, the prison lock
 * and the allprison lock are both unlocked.
 *
 * The flags argument is a mask of:
 *   PD_DEREF        - decrement pr_ref.
 *   PD_DEUREF       - decrement pr_uref, continuing up through each parent
 *                     whose pr_uref also drops to zero.
 *   PD_LOCKED       - the caller already holds pr->pr_mtx.
 *   PD_LIST_SLOCKED - the caller holds allprison_lock shared.
 *   PD_LIST_XLOCKED - the caller holds allprison_lock exclusive.
 *
 * Freeing a prison drops a reference on its parent, so the removal loop may
 * continue up the hierarchy.
 */
static void
prison_deref(struct prison *pr, int flags)
{
	struct prison *ppr, *tpr;
	int vfslocked;

	if (!(flags & PD_LOCKED))
		mtx_lock(&pr->pr_mtx);
	/* Decrement the user references in a separate loop. */
	if (flags & PD_DEUREF) {
		/*
		 * Only one prison mutex is held at a time: each level is
		 * unlocked before moving on to its parent.  The loop exits
		 * with tpr locked and tpr->pr_uref still positive.
		 */
		for (tpr = pr;; tpr = tpr->pr_parent) {
			if (tpr != pr)
				mtx_lock(&tpr->pr_mtx);
			if (--tpr->pr_uref > 0)
				break;
			KASSERT(tpr != &prison0, ("prison0 pr_uref=0"));
			mtx_unlock(&tpr->pr_mtx);
		}
		/* Done if there were only user references to remove. */
		if (!(flags & PD_DEREF)) {
			mtx_unlock(&tpr->pr_mtx);
			if (flags & PD_LIST_SLOCKED)
				sx_sunlock(&allprison_lock);
			else if (flags & PD_LIST_XLOCKED)
				sx_xunlock(&allprison_lock);
			return;
		}
		/* Go back to holding the original prison's mutex. */
		if (tpr != pr) {
			mtx_unlock(&tpr->pr_mtx);
			mtx_lock(&pr->pr_mtx);
		}
	}

	/* Each iteration is entered with pr->pr_mtx held. */
	for (;;) {
		if (flags & PD_DEREF)
			pr->pr_ref--;
		/* If the prison still has references, nothing else to do. */
		if (pr->pr_ref > 0) {
			mtx_unlock(&pr->pr_mtx);
			if (flags & PD_LIST_SLOCKED)
				sx_sunlock(&allprison_lock);
			else if (flags & PD_LIST_XLOCKED)
				sx_xunlock(&allprison_lock);
			return;
		}

		/*
		 * No references remain.  Get allprison_lock exclusively,
		 * upgrading or re-acquiring it as needed, then unlink the
		 * prison from the global list and from its parent.
		 */
		mtx_unlock(&pr->pr_mtx);
		if (flags & PD_LIST_SLOCKED) {
			if (!sx_try_upgrade(&allprison_lock)) {
				sx_sunlock(&allprison_lock);
				sx_xlock(&allprison_lock);
			}
		} else if (!(flags & PD_LIST_XLOCKED))
			sx_xlock(&allprison_lock);

		TAILQ_REMOVE(&allprison, pr, pr_list);
		LIST_REMOVE(pr, pr_sibling);
		ppr = pr->pr_parent;
		/* Every ancestor has one descendant fewer. */
		for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent)
			tpr->pr_childcount--;
		sx_xunlock(&allprison_lock);

		/* Release everything the prison owns, then the prison. */
#ifdef VIMAGE
		/* Destroy the vnet only if it differs from the parent's. */
		if (pr->pr_vnet != ppr->pr_vnet)
			vnet_destroy(pr->pr_vnet);
#endif
		if (pr->pr_root != NULL) {
			vfslocked = VFS_LOCK_GIANT(pr->pr_root->v_mount);
			vrele(pr->pr_root);
			VFS_UNLOCK_GIANT(vfslocked);
		}
		mtx_destroy(&pr->pr_mtx);
#ifdef INET
		free(pr->pr_ip4, M_PRISON);
#endif
#ifdef INET6
		free(pr->pr_ip6, M_PRISON);
#endif
		if (pr->pr_cpuset != NULL)
			cpuset_rel(pr->pr_cpuset);
		osd_jail_exit(pr);
		free(pr, M_PRISON);

		/* Removing a prison frees a reference on its parent. */
		pr = ppr;
		mtx_lock(&pr->pr_mtx);
		/* The list lock was released above, so only PD_DEREF remains. */
		flags = PD_DEREF;
	}
}
2496
2497void
2498prison_hold_locked(struct prison *pr)
2499{
2500
2501	mtx_assert(&pr->pr_mtx, MA_OWNED);
2502	KASSERT(pr->pr_ref > 0,
2503	    ("Trying to hold dead prison (jid=%d).", pr->pr_id));
2504	pr->pr_ref++;
2505}
2506
2507void
2508prison_hold(struct prison *pr)
2509{
2510
2511	mtx_lock(&pr->pr_mtx);
2512	prison_hold_locked(pr);
2513	mtx_unlock(&pr->pr_mtx);
2514}
2515
2516void
2517prison_proc_hold(struct prison *pr)
2518{
2519
2520	mtx_lock(&pr->pr_mtx);
2521	KASSERT(pr->pr_uref > 0,
2522	    ("Cannot add a process to a non-alive prison (jid=%d)", pr->pr_id));
2523	pr->pr_uref++;
2524	mtx_unlock(&pr->pr_mtx);
2525}
2526
2527void
2528prison_proc_free(struct prison *pr)
2529{
2530
2531	mtx_lock(&pr->pr_mtx);
2532	KASSERT(pr->pr_uref > 0,
2533	    ("Trying to kill a process in a dead prison (jid=%d)", pr->pr_id));
2534	prison_deref(pr, PD_DEUREF | PD_LOCKED);
2535}
2536
2537
2538#ifdef INET
2539/*
2540 * Restrict a prison's IP address list with its parent's, possibly replacing
2541 * it.  Return true if the replacement buffer was used (or would have been).
2542 */
2543static int
2544prison_restrict_ip4(struct prison *pr, struct in_addr *newip4)
2545{
2546	int ii, ij, used;
2547	struct prison *ppr;
2548
2549	ppr = pr->pr_parent;
2550	if (!(pr->pr_flags & PR_IP4_USER)) {
2551		/* This has no user settings, so just copy the parent's list. */
2552		if (pr->pr_ip4s < ppr->pr_ip4s) {
2553			/*
2554			 * There's no room for the parent's list.  Use the
2555			 * new list buffer, which is assumed to be big enough
2556			 * (if it was passed).  If there's no buffer, try to
2557			 * allocate one.
2558			 */
2559			used = 1;
2560			if (newip4 == NULL) {
2561				newip4 = malloc(ppr->pr_ip4s * sizeof(*newip4),
2562				    M_PRISON, M_NOWAIT);
2563				if (newip4 != NULL)
2564					used = 0;
2565			}
2566			if (newip4 != NULL) {
2567				bcopy(ppr->pr_ip4, newip4,
2568				    ppr->pr_ip4s * sizeof(*newip4));
2569				free(pr->pr_ip4, M_PRISON);
2570				pr->pr_ip4 = newip4;
2571				pr->pr_ip4s = ppr->pr_ip4s;
2572			}
2573			return (used);
2574		}
2575		pr->pr_ip4s = ppr->pr_ip4s;
2576		if (pr->pr_ip4s > 0)
2577			bcopy(ppr->pr_ip4, pr->pr_ip4,
2578			    pr->pr_ip4s * sizeof(*newip4));
2579		else if (pr->pr_ip4 != NULL) {
2580			free(pr->pr_ip4, M_PRISON);
2581			pr->pr_ip4 = NULL;
2582		}
2583	} else if (pr->pr_ip4s > 0) {
2584		/* Remove addresses that aren't in the parent. */
2585		for (ij = 0; ij < ppr->pr_ip4s; ij++)
2586			if (pr->pr_ip4[0].s_addr == ppr->pr_ip4[ij].s_addr)
2587				break;
2588		if (ij < ppr->pr_ip4s)
2589			ii = 1;
2590		else {
2591			bcopy(pr->pr_ip4 + 1, pr->pr_ip4,
2592			    --pr->pr_ip4s * sizeof(*pr->pr_ip4));
2593			ii = 0;
2594		}
2595		for (ij = 1; ii < pr->pr_ip4s; ) {
2596			if (pr->pr_ip4[ii].s_addr == ppr->pr_ip4[0].s_addr) {
2597				ii++;
2598				continue;
2599			}
2600			switch (ij >= ppr->pr_ip4s ? -1 :
2601				qcmp_v4(&pr->pr_ip4[ii], &ppr->pr_ip4[ij])) {
2602			case -1:
2603				bcopy(pr->pr_ip4 + ii + 1, pr->pr_ip4 + ii,
2604				    (--pr->pr_ip4s - ii) * sizeof(*pr->pr_ip4));
2605				break;
2606			case 0:
2607				ii++;
2608				ij++;
2609				break;
2610			case 1:
2611				ij++;
2612				break;
2613			}
2614		}
2615		if (pr->pr_ip4s == 0) {
2616			pr->pr_flags |= PR_IP4_DISABLE;
2617			free(pr->pr_ip4, M_PRISON);
2618			pr->pr_ip4 = NULL;
2619		}
2620	}
2621	return (0);
2622}
2623
2624/*
2625 * Pass back primary IPv4 address of this jail.
2626 *
2627 * If not restricted return success but do not alter the address.  Caller has
2628 * to make sure to initialize it correctly (e.g. INADDR_ANY).
2629 *
2630 * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv4.
2631 * Address returned in NBO.
2632 */
2633int
2634prison_get_ip4(struct ucred *cred, struct in_addr *ia)
2635{
2636	struct prison *pr;
2637
2638	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
2639	KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
2640
2641	pr = cred->cr_prison;
2642	if (!(pr->pr_flags & PR_IP4))
2643		return (0);
2644	mtx_lock(&pr->pr_mtx);
2645	if (!(pr->pr_flags & PR_IP4)) {
2646		mtx_unlock(&pr->pr_mtx);
2647		return (0);
2648	}
2649	if (pr->pr_ip4 == NULL) {
2650		mtx_unlock(&pr->pr_mtx);
2651		return (EAFNOSUPPORT);
2652	}
2653
2654	ia->s_addr = pr->pr_ip4[0].s_addr;
2655	mtx_unlock(&pr->pr_mtx);
2656	return (0);
2657}
2658
2659/*
2660 * Return true if pr1 and pr2 have the same IPv4 address restrictions.
2661 */
2662int
2663prison_equal_ip4(struct prison *pr1, struct prison *pr2)
2664{
2665
2666	if (pr1 == pr2)
2667		return (1);
2668
2669	/*
2670	 * No need to lock since the PR_IP4_USER flag can't be altered for
2671	 * existing prisons.
2672	 */
2673	while (pr1 != &prison0 &&
2674#ifdef VIMAGE
2675	       !(pr1->pr_flags & PR_VNET) &&
2676#endif
2677	       !(pr1->pr_flags & PR_IP4_USER))
2678		pr1 = pr1->pr_parent;
2679	while (pr2 != &prison0 &&
2680#ifdef VIMAGE
2681	       !(pr2->pr_flags & PR_VNET) &&
2682#endif
2683	       !(pr2->pr_flags & PR_IP4_USER))
2684		pr2 = pr2->pr_parent;
2685	return (pr1 == pr2);
2686}
2687
2688/*
2689 * Make sure our (source) address is set to something meaningful to this
2690 * jail.
2691 *
2692 * Returns 0 if jail doesn't restrict IPv4 or if address belongs to jail,
2693 * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail
2694 * doesn't allow IPv4.  Address passed in in NBO and returned in NBO.
2695 */
2696int
2697prison_local_ip4(struct ucred *cred, struct in_addr *ia)
2698{
2699	struct prison *pr;
2700	struct in_addr ia0;
2701	int error;
2702
2703	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
2704	KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
2705
2706	pr = cred->cr_prison;
2707	if (!(pr->pr_flags & PR_IP4))
2708		return (0);
2709	mtx_lock(&pr->pr_mtx);
2710	if (!(pr->pr_flags & PR_IP4)) {
2711		mtx_unlock(&pr->pr_mtx);
2712		return (0);
2713	}
2714	if (pr->pr_ip4 == NULL) {
2715		mtx_unlock(&pr->pr_mtx);
2716		return (EAFNOSUPPORT);
2717	}
2718
2719	ia0.s_addr = ntohl(ia->s_addr);
2720	if (ia0.s_addr == INADDR_LOOPBACK) {
2721		ia->s_addr = pr->pr_ip4[0].s_addr;
2722		mtx_unlock(&pr->pr_mtx);
2723		return (0);
2724	}
2725
2726	if (ia0.s_addr == INADDR_ANY) {
2727		/*
2728		 * In case there is only 1 IPv4 address, bind directly.
2729		 */
2730		if (pr->pr_ip4s == 1)
2731			ia->s_addr = pr->pr_ip4[0].s_addr;
2732		mtx_unlock(&pr->pr_mtx);
2733		return (0);
2734	}
2735
2736	error = _prison_check_ip4(pr, ia);
2737	mtx_unlock(&pr->pr_mtx);
2738	return (error);
2739}
2740
2741/*
2742 * Rewrite destination address in case we will connect to loopback address.
2743 *
2744 * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv4.
2745 * Address passed in in NBO and returned in NBO.
2746 */
2747int
2748prison_remote_ip4(struct ucred *cred, struct in_addr *ia)
2749{
2750	struct prison *pr;
2751
2752	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
2753	KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
2754
2755	pr = cred->cr_prison;
2756	if (!(pr->pr_flags & PR_IP4))
2757		return (0);
2758	mtx_lock(&pr->pr_mtx);
2759	if (!(pr->pr_flags & PR_IP4)) {
2760		mtx_unlock(&pr->pr_mtx);
2761		return (0);
2762	}
2763	if (pr->pr_ip4 == NULL) {
2764		mtx_unlock(&pr->pr_mtx);
2765		return (EAFNOSUPPORT);
2766	}
2767
2768	if (ntohl(ia->s_addr) == INADDR_LOOPBACK) {
2769		ia->s_addr = pr->pr_ip4[0].s_addr;
2770		mtx_unlock(&pr->pr_mtx);
2771		return (0);
2772	}
2773
2774	/*
2775	 * Return success because nothing had to be changed.
2776	 */
2777	mtx_unlock(&pr->pr_mtx);
2778	return (0);
2779}
2780
2781/*
2782 * Check if given address belongs to the jail referenced by cred/prison.
2783 *
2784 * Returns 0 if jail doesn't restrict IPv4 or if address belongs to jail,
2785 * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail
2786 * doesn't allow IPv4.  Address passed in in NBO.
2787 */
2788static int
2789_prison_check_ip4(struct prison *pr, struct in_addr *ia)
2790{
2791	int i, a, z, d;
2792
2793	/*
2794	 * Check the primary IP.
2795	 */
2796	if (pr->pr_ip4[0].s_addr == ia->s_addr)
2797		return (0);
2798
2799	/*
2800	 * All the other IPs are sorted so we can do a binary search.
2801	 */
2802	a = 0;
2803	z = pr->pr_ip4s - 2;
2804	while (a <= z) {
2805		i = (a + z) / 2;
2806		d = qcmp_v4(&pr->pr_ip4[i+1], ia);
2807		if (d > 0)
2808			z = i - 1;
2809		else if (d < 0)
2810			a = i + 1;
2811		else
2812			return (0);
2813	}
2814
2815	return (EADDRNOTAVAIL);
2816}
2817
2818int
2819prison_check_ip4(struct ucred *cred, struct in_addr *ia)
2820{
2821	struct prison *pr;
2822	int error;
2823
2824	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
2825	KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
2826
2827	pr = cred->cr_prison;
2828	if (!(pr->pr_flags & PR_IP4))
2829		return (0);
2830	mtx_lock(&pr->pr_mtx);
2831	if (!(pr->pr_flags & PR_IP4)) {
2832		mtx_unlock(&pr->pr_mtx);
2833		return (0);
2834	}
2835	if (pr->pr_ip4 == NULL) {
2836		mtx_unlock(&pr->pr_mtx);
2837		return (EAFNOSUPPORT);
2838	}
2839
2840	error = _prison_check_ip4(pr, ia);
2841	mtx_unlock(&pr->pr_mtx);
2842	return (error);
2843}
2844#endif
2845
2846#ifdef INET6
2847static int
2848prison_restrict_ip6(struct prison *pr, struct in6_addr *newip6)
2849{
2850	int ii, ij, used;
2851	struct prison *ppr;
2852
2853	ppr = pr->pr_parent;
2854	if (!(pr->pr_flags & PR_IP6_USER)) {
2855		/* This has no user settings, so just copy the parent's list. */
2856		if (pr->pr_ip6s < ppr->pr_ip6s) {
2857			/*
2858			 * There's no room for the parent's list.  Use the
2859			 * new list buffer, which is assumed to be big enough
2860			 * (if it was passed).  If there's no buffer, try to
2861			 * allocate one.
2862			 */
2863			used = 1;
2864			if (newip6 == NULL) {
2865				newip6 = malloc(ppr->pr_ip6s * sizeof(*newip6),
2866				    M_PRISON, M_NOWAIT);
2867				if (newip6 != NULL)
2868					used = 0;
2869			}
2870			if (newip6 != NULL) {
2871				bcopy(ppr->pr_ip6, newip6,
2872				    ppr->pr_ip6s * sizeof(*newip6));
2873				free(pr->pr_ip6, M_PRISON);
2874				pr->pr_ip6 = newip6;
2875				pr->pr_ip6s = ppr->pr_ip6s;
2876			}
2877			return (used);
2878		}
2879		pr->pr_ip6s = ppr->pr_ip6s;
2880		if (pr->pr_ip6s > 0)
2881			bcopy(ppr->pr_ip6, pr->pr_ip6,
2882			    pr->pr_ip6s * sizeof(*newip6));
2883		else if (pr->pr_ip6 != NULL) {
2884			free(pr->pr_ip6, M_PRISON);
2885			pr->pr_ip6 = NULL;
2886		}
2887	} else if (pr->pr_ip6s > 0) {
2888		/* Remove addresses that aren't in the parent. */
2889		for (ij = 0; ij < ppr->pr_ip6s; ij++)
2890			if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[0],
2891			    &ppr->pr_ip6[ij]))
2892				break;
2893		if (ij < ppr->pr_ip6s)
2894			ii = 1;
2895		else {
2896			bcopy(pr->pr_ip6 + 1, pr->pr_ip6,
2897			    --pr->pr_ip6s * sizeof(*pr->pr_ip6));
2898			ii = 0;
2899		}
2900		for (ij = 1; ii < pr->pr_ip6s; ) {
2901			if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[ii],
2902			    &ppr->pr_ip6[0])) {
2903				ii++;
2904				continue;
2905			}
2906			switch (ij >= ppr->pr_ip4s ? -1 :
2907				qcmp_v6(&pr->pr_ip6[ii], &ppr->pr_ip6[ij])) {
2908			case -1:
2909				bcopy(pr->pr_ip6 + ii + 1, pr->pr_ip6 + ii,
2910				    (--pr->pr_ip6s - ii) * sizeof(*pr->pr_ip6));
2911				break;
2912			case 0:
2913				ii++;
2914				ij++;
2915				break;
2916			case 1:
2917				ij++;
2918				break;
2919			}
2920		}
2921		if (pr->pr_ip6s == 0) {
2922			pr->pr_flags |= PR_IP6_DISABLE;
2923			free(pr->pr_ip6, M_PRISON);
2924			pr->pr_ip6 = NULL;
2925		}
2926	}
2927	return 0;
2928}
2929
2930/*
2931 * Pass back primary IPv6 address for this jail.
2932 *
2933 * If not restricted return success but do not alter the address.  Caller has
2934 * to make sure to initialize it correctly (e.g. IN6ADDR_ANY_INIT).
2935 *
2936 * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv6.
2937 */
2938int
2939prison_get_ip6(struct ucred *cred, struct in6_addr *ia6)
2940{
2941	struct prison *pr;
2942
2943	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
2944	KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
2945
2946	pr = cred->cr_prison;
2947	if (!(pr->pr_flags & PR_IP6))
2948		return (0);
2949	mtx_lock(&pr->pr_mtx);
2950	if (!(pr->pr_flags & PR_IP6)) {
2951		mtx_unlock(&pr->pr_mtx);
2952		return (0);
2953	}
2954	if (pr->pr_ip6 == NULL) {
2955		mtx_unlock(&pr->pr_mtx);
2956		return (EAFNOSUPPORT);
2957	}
2958
2959	bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
2960	mtx_unlock(&pr->pr_mtx);
2961	return (0);
2962}
2963
2964/*
2965 * Return true if pr1 and pr2 have the same IPv6 address restrictions.
2966 */
2967int
2968prison_equal_ip6(struct prison *pr1, struct prison *pr2)
2969{
2970
2971	if (pr1 == pr2)
2972		return (1);
2973
2974	while (pr1 != &prison0 &&
2975#ifdef VIMAGE
2976	       !(pr1->pr_flags & PR_VNET) &&
2977#endif
2978	       !(pr1->pr_flags & PR_IP6_USER))
2979		pr1 = pr1->pr_parent;
2980	while (pr2 != &prison0 &&
2981#ifdef VIMAGE
2982	       !(pr2->pr_flags & PR_VNET) &&
2983#endif
2984	       !(pr2->pr_flags & PR_IP6_USER))
2985		pr2 = pr2->pr_parent;
2986	return (pr1 == pr2);
2987}
2988
2989/*
2990 * Make sure our (source) address is set to something meaningful to this jail.
2991 *
2992 * v6only should be set based on (inp->inp_flags & IN6P_IPV6_V6ONLY != 0)
2993 * when needed while binding.
2994 *
2995 * Returns 0 if jail doesn't restrict IPv6 or if address belongs to jail,
2996 * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail
2997 * doesn't allow IPv6.
2998 */
2999int
3000prison_local_ip6(struct ucred *cred, struct in6_addr *ia6, int v6only)
3001{
3002	struct prison *pr;
3003	int error;
3004
3005	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3006	KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
3007
3008	pr = cred->cr_prison;
3009	if (!(pr->pr_flags & PR_IP6))
3010		return (0);
3011	mtx_lock(&pr->pr_mtx);
3012	if (!(pr->pr_flags & PR_IP6)) {
3013		mtx_unlock(&pr->pr_mtx);
3014		return (0);
3015	}
3016	if (pr->pr_ip6 == NULL) {
3017		mtx_unlock(&pr->pr_mtx);
3018		return (EAFNOSUPPORT);
3019	}
3020
3021	if (IN6_IS_ADDR_LOOPBACK(ia6)) {
3022		bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
3023		mtx_unlock(&pr->pr_mtx);
3024		return (0);
3025	}
3026
3027	if (IN6_IS_ADDR_UNSPECIFIED(ia6)) {
3028		/*
3029		 * In case there is only 1 IPv6 address, and v6only is true,
3030		 * then bind directly.
3031		 */
3032		if (v6only != 0 && pr->pr_ip6s == 1)
3033			bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
3034		mtx_unlock(&pr->pr_mtx);
3035		return (0);
3036	}
3037
3038	error = _prison_check_ip6(pr, ia6);
3039	mtx_unlock(&pr->pr_mtx);
3040	return (error);
3041}
3042
3043/*
3044 * Rewrite destination address in case we will connect to loopback address.
3045 *
3046 * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv6.
3047 */
3048int
3049prison_remote_ip6(struct ucred *cred, struct in6_addr *ia6)
3050{
3051	struct prison *pr;
3052
3053	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3054	KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
3055
3056	pr = cred->cr_prison;
3057	if (!(pr->pr_flags & PR_IP6))
3058		return (0);
3059	mtx_lock(&pr->pr_mtx);
3060	if (!(pr->pr_flags & PR_IP6)) {
3061		mtx_unlock(&pr->pr_mtx);
3062		return (0);
3063	}
3064	if (pr->pr_ip6 == NULL) {
3065		mtx_unlock(&pr->pr_mtx);
3066		return (EAFNOSUPPORT);
3067	}
3068
3069	if (IN6_IS_ADDR_LOOPBACK(ia6)) {
3070		bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
3071		mtx_unlock(&pr->pr_mtx);
3072		return (0);
3073	}
3074
3075	/*
3076	 * Return success because nothing had to be changed.
3077	 */
3078	mtx_unlock(&pr->pr_mtx);
3079	return (0);
3080}
3081
3082/*
3083 * Check if given address belongs to the jail referenced by cred/prison.
3084 *
3085 * Returns 0 if jail doesn't restrict IPv6 or if address belongs to jail,
3086 * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail
3087 * doesn't allow IPv6.
3088 */
3089static int
3090_prison_check_ip6(struct prison *pr, struct in6_addr *ia6)
3091{
3092	int i, a, z, d;
3093
3094	/*
3095	 * Check the primary IP.
3096	 */
3097	if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[0], ia6))
3098		return (0);
3099
3100	/*
3101	 * All the other IPs are sorted so we can do a binary search.
3102	 */
3103	a = 0;
3104	z = pr->pr_ip6s - 2;
3105	while (a <= z) {
3106		i = (a + z) / 2;
3107		d = qcmp_v6(&pr->pr_ip6[i+1], ia6);
3108		if (d > 0)
3109			z = i - 1;
3110		else if (d < 0)
3111			a = i + 1;
3112		else
3113			return (0);
3114	}
3115
3116	return (EADDRNOTAVAIL);
3117}
3118
3119int
3120prison_check_ip6(struct ucred *cred, struct in6_addr *ia6)
3121{
3122	struct prison *pr;
3123	int error;
3124
3125	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3126	KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
3127
3128	pr = cred->cr_prison;
3129	if (!(pr->pr_flags & PR_IP6))
3130		return (0);
3131	mtx_lock(&pr->pr_mtx);
3132	if (!(pr->pr_flags & PR_IP6)) {
3133		mtx_unlock(&pr->pr_mtx);
3134		return (0);
3135	}
3136	if (pr->pr_ip6 == NULL) {
3137		mtx_unlock(&pr->pr_mtx);
3138		return (EAFNOSUPPORT);
3139	}
3140
3141	error = _prison_check_ip6(pr, ia6);
3142	mtx_unlock(&pr->pr_mtx);
3143	return (error);
3144}
3145#endif
3146
3147/*
3148 * Check if a jail supports the given address family.
3149 *
3150 * Returns 0 if not jailed or the address family is supported, EAFNOSUPPORT
3151 * if not.
3152 */
3153int
3154prison_check_af(struct ucred *cred, int af)
3155{
3156	struct prison *pr;
3157	int error;
3158
3159	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3160
3161	pr = cred->cr_prison;
3162#ifdef VIMAGE
3163	/* Prisons with their own network stack are not limited. */
3164	if (pr->pr_flags & PR_VNET)
3165		return (0);
3166#endif
3167
3168	error = 0;
3169	switch (af)
3170	{
3171#ifdef INET
3172	case AF_INET:
3173		if (pr->pr_flags & PR_IP4)
3174		{
3175			mtx_lock(&pr->pr_mtx);
3176			if ((pr->pr_flags & PR_IP4) && pr->pr_ip4 == NULL)
3177				error = EAFNOSUPPORT;
3178			mtx_unlock(&pr->pr_mtx);
3179		}
3180		break;
3181#endif
3182#ifdef INET6
3183	case AF_INET6:
3184		if (pr->pr_flags & PR_IP6)
3185		{
3186			mtx_lock(&pr->pr_mtx);
3187			if ((pr->pr_flags & PR_IP6) && pr->pr_ip6 == NULL)
3188				error = EAFNOSUPPORT;
3189			mtx_unlock(&pr->pr_mtx);
3190		}
3191		break;
3192#endif
3193	case AF_LOCAL:
3194	case AF_ROUTE:
3195		break;
3196	default:
3197		if (!(pr->pr_allow & PR_ALLOW_SOCKET_AF))
3198			error = EAFNOSUPPORT;
3199	}
3200	return (error);
3201}
3202
3203/*
3204 * Check if given address belongs to the jail referenced by cred (wrapper to
3205 * prison_check_ip[46]).
3206 *
3207 * Returns 0 if jail doesn't restrict the address family or if address belongs
3208 * to jail, EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if
3209 * the jail doesn't allow the address family.  IPv4 Address passed in in NBO.
3210 */
3211int
3212prison_if(struct ucred *cred, struct sockaddr *sa)
3213{
3214#ifdef INET
3215	struct sockaddr_in *sai;
3216#endif
3217#ifdef INET6
3218	struct sockaddr_in6 *sai6;
3219#endif
3220	int error;
3221
3222	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3223	KASSERT(sa != NULL, ("%s: sa is NULL", __func__));
3224
3225	error = 0;
3226	switch (sa->sa_family)
3227	{
3228#ifdef INET
3229	case AF_INET:
3230		sai = (struct sockaddr_in *)sa;
3231		error = prison_check_ip4(cred, &sai->sin_addr);
3232		break;
3233#endif
3234#ifdef INET6
3235	case AF_INET6:
3236		sai6 = (struct sockaddr_in6 *)sa;
3237		error = prison_check_ip6(cred, &sai6->sin6_addr);
3238		break;
3239#endif
3240	default:
3241		if (!(cred->cr_prison->pr_allow & PR_ALLOW_SOCKET_AF))
3242			error = EAFNOSUPPORT;
3243	}
3244	return (error);
3245}
3246
3247/*
3248 * Return 0 if jails permit p1 to frob p2, otherwise ESRCH.
3249 */
3250int
3251prison_check(struct ucred *cred1, struct ucred *cred2)
3252{
3253
3254	return ((cred1->cr_prison == cred2->cr_prison ||
3255	    prison_ischild(cred1->cr_prison, cred2->cr_prison)) ? 0 : ESRCH);
3256}
3257
3258/*
3259 * Return 1 if p2 is a child of p1, otherwise 0.
3260 */
3261int
3262prison_ischild(struct prison *pr1, struct prison *pr2)
3263{
3264
3265	for (pr2 = pr2->pr_parent; pr2 != NULL; pr2 = pr2->pr_parent)
3266		if (pr1 == pr2)
3267			return (1);
3268	return (0);
3269}
3270
3271/*
3272 * Return 1 if the passed credential is in a jail, otherwise 0.
3273 */
3274int
3275jailed(struct ucred *cred)
3276{
3277
3278	return (cred->cr_prison != &prison0);
3279}
3280
3281/*
3282 * Return the correct hostname (domainname, et al) for the passed credential.
3283 */
3284void
3285getcredhostname(struct ucred *cred, char *buf, size_t size)
3286{
3287	struct prison *pr;
3288
3289	/*
3290	 * A NULL credential can be used to shortcut to the physical
3291	 * system's hostname.
3292	 */
3293	pr = (cred != NULL) ? cred->cr_prison : &prison0;
3294	mtx_lock(&pr->pr_mtx);
3295	strlcpy(buf, pr->pr_hostname, size);
3296	mtx_unlock(&pr->pr_mtx);
3297}
3298
3299void
3300getcreddomainname(struct ucred *cred, char *buf, size_t size)
3301{
3302
3303	mtx_lock(&cred->cr_prison->pr_mtx);
3304	strlcpy(buf, cred->cr_prison->pr_domainname, size);
3305	mtx_unlock(&cred->cr_prison->pr_mtx);
3306}
3307
3308void
3309getcredhostuuid(struct ucred *cred, char *buf, size_t size)
3310{
3311
3312	mtx_lock(&cred->cr_prison->pr_mtx);
3313	strlcpy(buf, cred->cr_prison->pr_hostuuid, size);
3314	mtx_unlock(&cred->cr_prison->pr_mtx);
3315}
3316
3317void
3318getcredhostid(struct ucred *cred, unsigned long *hostid)
3319{
3320
3321	mtx_lock(&cred->cr_prison->pr_mtx);
3322	*hostid = cred->cr_prison->pr_hostid;
3323	mtx_unlock(&cred->cr_prison->pr_mtx);
3324}
3325
3326#ifdef VIMAGE
3327/*
3328 * Determine whether the prison represented by cred owns
3329 * its vnet rather than having it inherited.
3330 *
3331 * Returns 1 in case the prison owns the vnet, 0 otherwise.
3332 */
3333int
3334prison_owns_vnet(struct ucred *cred)
3335{
3336
3337	/*
3338	 * vnets cannot be added/removed after jail creation,
3339	 * so no need to lock here.
3340	 */
3341	return (cred->cr_prison->pr_flags & PR_VNET ? 1 : 0);
3342}
3343#endif
3344
3345/*
3346 * Determine whether the subject represented by cred can "see"
3347 * status of a mount point.
3348 * Returns: 0 for permitted, ENOENT otherwise.
3349 * XXX: This function should be called cr_canseemount() and should be
3350 *      placed in kern_prot.c.
3351 */
3352int
3353prison_canseemount(struct ucred *cred, struct mount *mp)
3354{
3355	struct prison *pr;
3356	struct statfs *sp;
3357	size_t len;
3358
3359	pr = cred->cr_prison;
3360	if (pr->pr_enforce_statfs == 0)
3361		return (0);
3362	if (pr->pr_root->v_mount == mp)
3363		return (0);
3364	if (pr->pr_enforce_statfs == 2)
3365		return (ENOENT);
3366	/*
3367	 * If jail's chroot directory is set to "/" we should be able to see
3368	 * all mount-points from inside a jail.
3369	 * This is ugly check, but this is the only situation when jail's
3370	 * directory ends with '/'.
3371	 */
3372	if (strcmp(pr->pr_path, "/") == 0)
3373		return (0);
3374	len = strlen(pr->pr_path);
3375	sp = &mp->mnt_stat;
3376	if (strncmp(pr->pr_path, sp->f_mntonname, len) != 0)
3377		return (ENOENT);
3378	/*
3379	 * Be sure that we don't have situation where jail's root directory
3380	 * is "/some/path" and mount point is "/some/pathpath".
3381	 */
3382	if (sp->f_mntonname[len] != '\0' && sp->f_mntonname[len] != '/')
3383		return (ENOENT);
3384	return (0);
3385}
3386
3387void
3388prison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp)
3389{
3390	char jpath[MAXPATHLEN];
3391	struct prison *pr;
3392	size_t len;
3393
3394	pr = cred->cr_prison;
3395	if (pr->pr_enforce_statfs == 0)
3396		return;
3397	if (prison_canseemount(cred, mp) != 0) {
3398		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
3399		strlcpy(sp->f_mntonname, "[restricted]",
3400		    sizeof(sp->f_mntonname));
3401		return;
3402	}
3403	if (pr->pr_root->v_mount == mp) {
3404		/*
3405		 * Clear current buffer data, so we are sure nothing from
3406		 * the valid path left there.
3407		 */
3408		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
3409		*sp->f_mntonname = '/';
3410		return;
3411	}
3412	/*
3413	 * If jail's chroot directory is set to "/" we should be able to see
3414	 * all mount-points from inside a jail.
3415	 */
3416	if (strcmp(pr->pr_path, "/") == 0)
3417		return;
3418	len = strlen(pr->pr_path);
3419	strlcpy(jpath, sp->f_mntonname + len, sizeof(jpath));
3420	/*
3421	 * Clear current buffer data, so we are sure nothing from
3422	 * the valid path left there.
3423	 */
3424	bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
3425	if (*jpath == '\0') {
3426		/* Should never happen. */
3427		*sp->f_mntonname = '/';
3428	} else {
3429		strlcpy(sp->f_mntonname, jpath, sizeof(sp->f_mntonname));
3430	}
3431}
3432
3433/*
3434 * Check whether permission for a specific privilege is granted within jail.
3435 * We have a specific list of accepted privileges; the rest are denied.
 *
 * Returns 0 when cred is not jailed or when the privilege is granted to the
 * jail, and EPERM when a jailed credential is denied the privilege.
3436 */
3437int
3438prison_priv_check(struct ucred *cred, int priv)
3439{
3440
	/* Credentials outside of any jail are not restricted by this check. */
3441	if (!jailed(cred))
3442		return (0);
3443
3444#ifdef VIMAGE
3445	/*
3446	 * Privileges specific to prisons with a virtual network stack.
3447	 * There might be a duplicate entry here in case the privilege
3448	 * is only granted conditionally in the legacy jail case.
3449	 */
3450	switch (priv) {
3451#ifdef notyet
3452		/*
3453		 * NFS-specific privileges.
3454		 */
3455	case PRIV_NFS_DAEMON:
3456	case PRIV_NFS_LOCKD:
3457#endif
3458		/*
3459		 * Network stack privileges.
3460		 */
3461	case PRIV_NET_BRIDGE:
3462	case PRIV_NET_GRE:
3463	case PRIV_NET_BPF:
3464	case PRIV_NET_RAW:		/* Dup, cond. in legacy jail case. */
3465	case PRIV_NET_ROUTE:
3466	case PRIV_NET_TAP:
3467	case PRIV_NET_SETIFMTU:
3468	case PRIV_NET_SETIFFLAGS:
3469	case PRIV_NET_SETIFCAP:
3470	case PRIV_NET_SETIFNAME	:
3471	case PRIV_NET_SETIFMETRIC:
3472	case PRIV_NET_SETIFPHYS:
3473	case PRIV_NET_SETIFMAC:
3474	case PRIV_NET_ADDMULTI:
3475	case PRIV_NET_DELMULTI:
3476	case PRIV_NET_HWIOCTL:
3477	case PRIV_NET_SETLLADDR:
3478	case PRIV_NET_ADDIFGROUP:
3479	case PRIV_NET_DELIFGROUP:
3480	case PRIV_NET_IFCREATE:
3481	case PRIV_NET_IFDESTROY:
3482	case PRIV_NET_ADDIFADDR:
3483	case PRIV_NET_DELIFADDR:
3484	case PRIV_NET_LAGG:
3485	case PRIV_NET_GIF:
3486	case PRIV_NET_SETIFVNET:
3487
3488		/*
3489		 * 802.11-related privileges.
3490		 */
3491	case PRIV_NET80211_GETKEY:
3492#ifdef notyet
3493	case PRIV_NET80211_MANAGE:		/* XXX-BZ discuss with sam@ */
3494#endif
3495
3496#ifdef notyet
3497		/*
3498		 * AppleTalk privileges.
3499		 */
3500	case PRIV_NETATALK_RESERVEDPORT:
3501
3502		/*
3503		 * ATM privileges.
3504		 */
3505	case PRIV_NETATM_CFG:
3506	case PRIV_NETATM_ADD:
3507	case PRIV_NETATM_DEL:
3508	case PRIV_NETATM_SET:
3509
3510		/*
3511		 * Bluetooth privileges.
3512		 */
3513	case PRIV_NETBLUETOOTH_RAW:
3514#endif
3515
3516		/*
3517		 * Netgraph and netgraph module privileges.
3518		 */
3519	case PRIV_NETGRAPH_CONTROL:
3520#ifdef notyet
3521	case PRIV_NETGRAPH_TTY:
3522#endif
3523
3524		/*
3525		 * IPv4 and IPv6 privileges.
3526		 */
3527	case PRIV_NETINET_IPFW:
3528	case PRIV_NETINET_DIVERT:
3529	case PRIV_NETINET_PF:
3530	case PRIV_NETINET_DUMMYNET:
3531	case PRIV_NETINET_CARP:
3532	case PRIV_NETINET_MROUTE:
3533	case PRIV_NETINET_RAW:
3534	case PRIV_NETINET_ADDRCTRL6:
3535	case PRIV_NETINET_ND6:
3536	case PRIV_NETINET_SCOPE6:
3537	case PRIV_NETINET_ALIFETIME6:
3538	case PRIV_NETINET_IPSEC:
3539	case PRIV_NETINET_BINDANY:
3540
3541#ifdef notyet
3542		/*
3543		 * IPX/SPX privileges.
3544		 */
3545	case PRIV_NETIPX_RESERVEDPORT:
3546	case PRIV_NETIPX_RAW:
3547
3548		/*
3549		 * NCP privileges.
3550		 */
3551	case PRIV_NETNCP:
3552
3553		/*
3554		 * SMB privileges.
3555		 */
3556	case PRIV_NETSMB:
3557#endif
3558
3559	/*
3560	 * No default: or deny here.
3561	 * In case of no permit fall through to next switch().
3562	 */
		/*
		 * Every case label above shares this single body: the
		 * privilege is granted only when the prison has PR_VNET set.
		 */
3563		if (cred->cr_prison->pr_flags & PR_VNET)
3564			return (0);
3565	}
3566#endif /* VIMAGE */
3567
	/*
	 * Privileges decided for every jail.  Case labels without a body of
	 * their own fall through to the next return statement.
	 */
3568	switch (priv) {
3569
3570		/*
3571		 * Allow ktrace privileges for root in jail.
3572		 */
3573	case PRIV_KTRACE:
3574
3575#if 0
3576		/*
3577		 * Allow jailed processes to configure audit identity and
3578		 * submit audit records (login, etc).  In the future we may
3579		 * want to further refine the relationship between audit and
3580		 * jail.
3581		 */
3582	case PRIV_AUDIT_GETAUDIT:
3583	case PRIV_AUDIT_SETAUDIT:
3584	case PRIV_AUDIT_SUBMIT:
3585#endif
3586
3587		/*
3588		 * Allow jailed processes to manipulate process UNIX
3589		 * credentials in any way they see fit.
3590		 */
3591	case PRIV_CRED_SETUID:
3592	case PRIV_CRED_SETEUID:
3593	case PRIV_CRED_SETGID:
3594	case PRIV_CRED_SETEGID:
3595	case PRIV_CRED_SETGROUPS:
3596	case PRIV_CRED_SETREUID:
3597	case PRIV_CRED_SETREGID:
3598	case PRIV_CRED_SETRESUID:
3599	case PRIV_CRED_SETRESGID:
3600
3601		/*
3602		 * Jail implements visibility constraints already, so allow
3603		 * jailed root to override uid/gid-based constraints.
3604		 */
3605	case PRIV_SEEOTHERGIDS:
3606	case PRIV_SEEOTHERUIDS:
3607
3608		/*
3609		 * Jail implements inter-process debugging limits already, so
3610		 * allow jailed root various debugging privileges.
3611		 */
3612	case PRIV_DEBUG_DIFFCRED:
3613	case PRIV_DEBUG_SUGID:
3614	case PRIV_DEBUG_UNPRIV:
3615
3616		/*
3617		 * Allow jail to set various resource limits and login
3618		 * properties, and for now, exceed process resource limits.
3619		 */
3620	case PRIV_PROC_LIMIT:
3621	case PRIV_PROC_SETLOGIN:
3622	case PRIV_PROC_SETRLIMIT:
3623
3624		/*
3625		 * System V and POSIX IPC privileges are granted in jail.
3626		 */
3627	case PRIV_IPC_READ:
3628	case PRIV_IPC_WRITE:
3629	case PRIV_IPC_ADMIN:
3630	case PRIV_IPC_MSGSIZE:
3631	case PRIV_MQ_ADMIN:
3632
3633		/*
3634		 * Jail operations within a jail work on child jails.
3635		 */
3636	case PRIV_JAIL_ATTACH:
3637	case PRIV_JAIL_SET:
3638	case PRIV_JAIL_REMOVE:
3639
3640		/*
3641		 * Jail implements its own inter-process limits, so allow
3642		 * root processes in jail to change scheduling on other
3643		 * processes in the same jail.  Likewise for signalling.
3644		 */
3645	case PRIV_SCHED_DIFFCRED:
3646	case PRIV_SCHED_CPUSET:
3647	case PRIV_SIGNAL_DIFFCRED:
3648	case PRIV_SIGNAL_SUGID:
3649
3650		/*
3651		 * Allow jailed processes to write to sysctls marked as jail
3652		 * writable.
3653		 */
3654	case PRIV_SYSCTL_WRITEJAIL:
3655
3656		/*
3657		 * Allow root in jail to manage a variety of quota
3658		 * properties.  These should likely be conditional on a
3659		 * configuration option.
3660		 */
3661	case PRIV_VFS_GETQUOTA:
3662	case PRIV_VFS_SETQUOTA:
3663
3664		/*
3665		 * Since Jail relies on chroot() to implement file system
3666		 * protections, grant many VFS privileges to root in jail.
3667		 * Be careful to exclude mount-related and NFS-related
3668		 * privileges.
3669		 */
3670	case PRIV_VFS_READ:
3671	case PRIV_VFS_WRITE:
3672	case PRIV_VFS_ADMIN:
3673	case PRIV_VFS_EXEC:
3674	case PRIV_VFS_LOOKUP:
3675	case PRIV_VFS_BLOCKRESERVE:	/* XXXRW: Slightly surprising. */
3676	case PRIV_VFS_CHFLAGS_DEV:
3677	case PRIV_VFS_CHOWN:
3678	case PRIV_VFS_CHROOT:
3679	case PRIV_VFS_RETAINSUGID:
3680	case PRIV_VFS_FCHROOT:
3681	case PRIV_VFS_LINK:
3682	case PRIV_VFS_SETGID:
3683	case PRIV_VFS_STAT:
3684	case PRIV_VFS_STICKYFILE:
3685		return (0);
3686
3687		/*
3688		 * Depending on the global setting, allow privilege of
3689		 * setting system flags.
3690		 */
3691	case PRIV_VFS_SYSFLAGS:
3692		if (cred->cr_prison->pr_allow & PR_ALLOW_CHFLAGS)
3693			return (0);
3694		else
3695			return (EPERM);
3696
3697		/*
3698		 * Depending on the global setting, allow privilege of
3699		 * mounting/unmounting file systems.
3700		 */
3701	case PRIV_VFS_MOUNT:
3702	case PRIV_VFS_UNMOUNT:
3703	case PRIV_VFS_MOUNT_NONUSER:
3704	case PRIV_VFS_MOUNT_OWNER:
3705		if (cred->cr_prison->pr_allow & PR_ALLOW_MOUNT)
3706			return (0);
3707		else
3708			return (EPERM);
3709
3710		/*
3711		 * Allow jailed root to bind reserved ports and reuse in-use
3712		 * ports.
3713		 */
3714	case PRIV_NETINET_RESERVEDPORT:
3715	case PRIV_NETINET_REUSEPORT:
3716		return (0);
3717
3718		/*
3719		 * Allow jailed root to set certain IPv4/6 (option) headers.
3720		 */
3721	case PRIV_NETINET_SETHDROPTS:
3722		return (0);
3723
3724		/*
3725		 * Conditionally allow creating raw sockets in jail.
3726		 */
3727	case PRIV_NETINET_RAW:
3728		if (cred->cr_prison->pr_allow & PR_ALLOW_RAW_SOCKETS)
3729			return (0);
3730		else
3731			return (EPERM);
3732
3733		/*
3734		 * Since jail implements its own visibility limits on netstat
3735		 * sysctls, allow getcred.  This allows identd to work in
3736		 * jail.
3737		 */
3738	case PRIV_NETINET_GETCRED:
3739		return (0);
3740
3741	default:
3742		/*
3743		 * In all remaining cases, deny the privilege request.  This
3744		 * includes almost all network privileges, many system
3745		 * configuration privileges.
3746		 */
3747		return (EPERM);
3748	}
3749}
3750
3751/*
3752 * Return the part of pr2's name that is relative to pr1, or the whole name
3753 * if it does not directly follow.
3754 */
3755
3756char *
3757prison_name(struct prison *pr1, struct prison *pr2)
3758{
3759	char *name;
3760
3761	/* Jails see themselves as "0" (if they see themselves at all). */
3762	if (pr1 == pr2)
3763		return "0";
3764	name = pr2->pr_name;
3765	if (prison_ischild(pr1, pr2)) {
3766		/*
3767		 * pr1 isn't locked (and allprison_lock may not be either)
3768		 * so its length can't be counted on.  But the number of dots
3769		 * can be counted on - and counted.
3770		 */
3771		for (; pr1 != &prison0; pr1 = pr1->pr_parent)
3772			name = strchr(name, '.') + 1;
3773	}
3774	return (name);
3775}
3776
3777/*
3778 * Return the part of pr2's path that is relative to pr1, or the whole path
3779 * if it does not directly follow.
3780 */
3781static char *
3782prison_path(struct prison *pr1, struct prison *pr2)
3783{
3784	char *path1, *path2;
3785	int len1;
3786
3787	path1 = pr1->pr_path;
3788	path2 = pr2->pr_path;
3789	if (!strcmp(path1, "/"))
3790		return (path2);
3791	len1 = strlen(path1);
3792	if (strncmp(path1, path2, len1))
3793		return (path2);
3794	if (path2[len1] == '\0')
3795		return "/";
3796	if (path2[len1] == '/')
3797		return (path2 + len1);
3798	return (path2);
3799}
3800
3801
3802/*
3803 * Jail-related sysctls.
3804 */
/* Declares the "jail" node under _security; the OIDs below use _security_jail as parent. */
3805SYSCTL_NODE(_security, OID_AUTO, jail, CTLFLAG_RW, 0,
3806    "Jails");
3807
3808static int
3809sysctl_jail_list(SYSCTL_HANDLER_ARGS)
3810{
3811	struct xprison *xp;
3812	struct prison *pr, *cpr;
3813#ifdef INET
3814	struct in_addr *ip4 = NULL;
3815	int ip4s = 0;
3816#endif
3817#ifdef INET6
3818	struct in_addr *ip6 = NULL;
3819	int ip6s = 0;
3820#endif
3821	int descend, error;
3822
3823	xp = malloc(sizeof(*xp), M_TEMP, M_WAITOK);
3824	pr = req->td->td_ucred->cr_prison;
3825	error = 0;
3826	sx_slock(&allprison_lock);
3827	FOREACH_PRISON_DESCENDANT(pr, cpr, descend) {
3828#if defined(INET) || defined(INET6)
3829 again:
3830#endif
3831		mtx_lock(&cpr->pr_mtx);
3832#ifdef INET
3833		if (cpr->pr_ip4s > 0) {
3834			if (ip4s < cpr->pr_ip4s) {
3835				ip4s = cpr->pr_ip4s;
3836				mtx_unlock(&cpr->pr_mtx);
3837				ip4 = realloc(ip4, ip4s *
3838				    sizeof(struct in_addr), M_TEMP, M_WAITOK);
3839				goto again;
3840			}
3841			bcopy(cpr->pr_ip4, ip4,
3842			    cpr->pr_ip4s * sizeof(struct in_addr));
3843		}
3844#endif
3845#ifdef INET6
3846		if (cpr->pr_ip6s > 0) {
3847			if (ip6s < cpr->pr_ip6s) {
3848				ip6s = cpr->pr_ip6s;
3849				mtx_unlock(&cpr->pr_mtx);
3850				ip6 = realloc(ip6, ip6s *
3851				    sizeof(struct in6_addr), M_TEMP, M_WAITOK);
3852				goto again;
3853			}
3854			bcopy(cpr->pr_ip6, ip6,
3855			    cpr->pr_ip6s * sizeof(struct in6_addr));
3856		}
3857#endif
3858		if (cpr->pr_ref == 0) {
3859			mtx_unlock(&cpr->pr_mtx);
3860			continue;
3861		}
3862		bzero(xp, sizeof(*xp));
3863		xp->pr_version = XPRISON_VERSION;
3864		xp->pr_id = cpr->pr_id;
3865		xp->pr_state = cpr->pr_uref > 0
3866		    ? PRISON_STATE_ALIVE : PRISON_STATE_DYING;
3867		strlcpy(xp->pr_path, prison_path(pr, cpr), sizeof(xp->pr_path));
3868		strlcpy(xp->pr_host, cpr->pr_hostname, sizeof(xp->pr_host));
3869		strlcpy(xp->pr_name, prison_name(pr, cpr), sizeof(xp->pr_name));
3870#ifdef INET
3871		xp->pr_ip4s = cpr->pr_ip4s;
3872#endif
3873#ifdef INET6
3874		xp->pr_ip6s = cpr->pr_ip6s;
3875#endif
3876		mtx_unlock(&cpr->pr_mtx);
3877		error = SYSCTL_OUT(req, xp, sizeof(*xp));
3878		if (error)
3879			break;
3880#ifdef INET
3881		if (xp->pr_ip4s > 0) {
3882			error = SYSCTL_OUT(req, ip4,
3883			    xp->pr_ip4s * sizeof(struct in_addr));
3884			if (error)
3885				break;
3886		}
3887#endif
3888#ifdef INET6
3889		if (xp->pr_ip6s > 0) {
3890			error = SYSCTL_OUT(req, ip6,
3891			    xp->pr_ip6s * sizeof(struct in6_addr));
3892			if (error)
3893				break;
3894		}
3895#endif
3896	}
3897	sx_sunlock(&allprison_lock);
3898	free(xp, M_TEMP);
3899#ifdef INET
3900	free(ip4, M_TEMP);
3901#endif
3902#ifdef INET6
3903	free(ip6, M_TEMP);
3904#endif
3905	return (error);
3906}
3907
/*
 * security.jail.list: struct-typed OID flagged CTLFLAG_RD and
 * CTLFLAG_MPSAFE, with sysctl_jail_list() as its handler.
 */
3908SYSCTL_OID(_security_jail, OID_AUTO, list,
3909    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
3910    sysctl_jail_list, "S", "List of active jails");
3911
3912static int
3913sysctl_jail_jailed(SYSCTL_HANDLER_ARGS)
3914{
3915	int error, injail;
3916
3917	injail = jailed(req->td->td_ucred);
3918	error = SYSCTL_OUT(req, &injail, sizeof(injail));
3919
3920	return (error);
3921}
3922
/* security.jail.jailed: int OID flagged CTLFLAG_RD, handled by sysctl_jail_jailed(). */
3923SYSCTL_PROC(_security_jail, OID_AUTO, jailed,
3924    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
3925    sysctl_jail_jailed, "I", "Process in jail?");
3926
/*
 * security.jail.jail_max_af_ips: CTLFLAG_RW knob backed directly by the
 * jail_max_af_ips variable; only declared with INET or INET6.
 */
3927#if defined(INET) || defined(INET6)
3928SYSCTL_UINT(_security_jail, OID_AUTO, jail_max_af_ips, CTLFLAG_RW,
3929    &jail_max_af_ips, 0,
3930    "Number of IP addresses a jail may have at most per address family");
3931#endif
3932
3933/*
3934 * Default parameters for jail(2) compatability.  For historical reasons,
3935 * the sysctl names have varying similarity to the parameter names.  Prisons
3936 * just see their own parameters, and can't change them.
3937 */
3938static int
3939sysctl_jail_default_allow(SYSCTL_HANDLER_ARGS)
3940{
3941	struct prison *pr;
3942	int allow, error, i;
3943
3944	pr = req->td->td_ucred->cr_prison;
3945	allow = (pr == &prison0) ? jail_default_allow : pr->pr_allow;
3946
3947	/* Get the current flag value, and convert it to a boolean. */
3948	i = (allow & arg2) ? 1 : 0;
3949	if (arg1 != NULL)
3950		i = !i;
3951	error = sysctl_handle_int(oidp, &i, 0, req);
3952	if (error || !req->newptr)
3953		return (error);
3954	i = i ? arg2 : 0;
3955	if (arg1 != NULL)
3956		i ^= arg2;
3957	/*
3958	 * The sysctls don't have CTLFLAGS_PRISON, so assume prison0
3959	 * for writing.
3960	 */
3961	mtx_lock(&prison0.pr_mtx);
3962	jail_default_allow = (jail_default_allow & ~arg2) | i;
3963	mtx_unlock(&prison0.pr_mtx);
3964	return (0);
3965}
3966
/*
 * Legacy security.jail.* permission knobs, all served by
 * sysctl_jail_default_allow().  Each one passes a single PR_ALLOW_* bit.
 * socket_unixiproute_only additionally passes a non-NULL pointer argument,
 * which that handler treats as a request to report the inverse of the bit.
 */
3967SYSCTL_PROC(_security_jail, OID_AUTO, set_hostname_allowed,
3968    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
3969    NULL, PR_ALLOW_SET_HOSTNAME, sysctl_jail_default_allow, "I",
3970    "Processes in jail can set their hostnames");
3971SYSCTL_PROC(_security_jail, OID_AUTO, socket_unixiproute_only,
3972    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
3973    (void *)1, PR_ALLOW_SOCKET_AF, sysctl_jail_default_allow, "I",
3974    "Processes in jail are limited to creating UNIX/IP/route sockets only");
3975SYSCTL_PROC(_security_jail, OID_AUTO, sysvipc_allowed,
3976    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
3977    NULL, PR_ALLOW_SYSVIPC, sysctl_jail_default_allow, "I",
3978    "Processes in jail can use System V IPC primitives");
3979SYSCTL_PROC(_security_jail, OID_AUTO, allow_raw_sockets,
3980    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
3981    NULL, PR_ALLOW_RAW_SOCKETS, sysctl_jail_default_allow, "I",
3982    "Prison root can create raw sockets");
3983SYSCTL_PROC(_security_jail, OID_AUTO, chflags_allowed,
3984    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
3985    NULL, PR_ALLOW_CHFLAGS, sysctl_jail_default_allow, "I",
3986    "Processes in jail can alter system file flags");
3987SYSCTL_PROC(_security_jail, OID_AUTO, mount_allowed,
3988    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
3989    NULL, PR_ALLOW_MOUNT, sysctl_jail_default_allow, "I",
3990    "Processes in jail can mount/unmount jail-friendly file systems");
3991
3992static int
3993sysctl_jail_default_level(SYSCTL_HANDLER_ARGS)
3994{
3995	struct prison *pr;
3996	int level, error;
3997
3998	pr = req->td->td_ucred->cr_prison;
3999	level = (pr == &prison0) ? *(int *)arg1 : *(int *)((char *)pr + arg2);
4000	error = sysctl_handle_int(oidp, &level, 0, req);
4001	if (error || !req->newptr)
4002		return (error);
4003	*(int *)arg1 = level;
4004	return (0);
4005}
4006
/*
 * security.jail.enforce_statfs: served by sysctl_jail_default_level(), which
 * is handed the address of jail_default_enforce_statfs and the offset of
 * pr_enforce_statfs within struct prison.
 */
4007SYSCTL_PROC(_security_jail, OID_AUTO, enforce_statfs,
4008    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4009    &jail_default_enforce_statfs, offsetof(struct prison, pr_enforce_statfs),
4010    sysctl_jail_default_level, "I",
4011    "Processes in jail cannot see all mounted file systems");
4012
4013/*
4014 * Nodes to describe jail parameters.  Maximum length of string parameters
4015 * is returned in the string itself, and the other parameters exist merely
4016 * to make themselves and their types known.
4017 */
/* Declares the "param" node under _security_jail, described as "Jail parameters". */
4018SYSCTL_NODE(_security_jail, OID_AUTO, param, CTLFLAG_RW, 0,
4019    "Jail parameters");
4020
4021int
4022sysctl_jail_param(SYSCTL_HANDLER_ARGS)
4023{
4024	int i;
4025	long l;
4026	size_t s;
4027	char numbuf[12];
4028
4029	switch (oidp->oid_kind & CTLTYPE)
4030	{
4031	case CTLTYPE_LONG:
4032	case CTLTYPE_ULONG:
4033		l = 0;
4034#ifdef SCTL_MASK32
4035		if (!(req->flags & SCTL_MASK32))
4036#endif
4037			return (SYSCTL_OUT(req, &l, sizeof(l)));
4038	case CTLTYPE_INT:
4039	case CTLTYPE_UINT:
4040		i = 0;
4041		return (SYSCTL_OUT(req, &i, sizeof(i)));
4042	case CTLTYPE_STRING:
4043		snprintf(numbuf, sizeof(numbuf), "%d", arg2);
4044		return
4045		    (sysctl_handle_string(oidp, numbuf, sizeof(numbuf), req));
4046	case CTLTYPE_STRUCT:
4047		s = (size_t)arg2;
4048		return (SYSCTL_OUT(req, &s, sizeof(s)));
4049	}
4050	return (0);
4051}
4052
/*
 * Jail parameter declarations.  The SYSCTL_JAIL_PARAM* macros are defined
 * outside this part of the file; presumably each one registers an OID below
 * security.jail.param -- confirm against their definition.  The first
 * argument is empty for top-level parameters and names the sub-node
 * (_children, _host, ...) for the others.
 */

/* Top-level parameters. */
4053SYSCTL_JAIL_PARAM(, jid, CTLTYPE_INT | CTLFLAG_RDTUN, "I", "Jail ID");
4054SYSCTL_JAIL_PARAM(, parent, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail parent ID");
4055SYSCTL_JAIL_PARAM_STRING(, name, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail name");
4056SYSCTL_JAIL_PARAM_STRING(, path, CTLFLAG_RDTUN, MAXPATHLEN, "Jail root path");
4057SYSCTL_JAIL_PARAM(, securelevel, CTLTYPE_INT | CTLFLAG_RW,
4058    "I", "Jail secure level");
4059SYSCTL_JAIL_PARAM(, enforce_statfs, CTLTYPE_INT | CTLFLAG_RW,
4060    "I", "Jail cannot see all mounted file systems");
4061SYSCTL_JAIL_PARAM(, persist, CTLTYPE_INT | CTLFLAG_RW,
4062    "B", "Jail persistence");
4063#ifdef VIMAGE
4064SYSCTL_JAIL_PARAM(, vnet, CTLTYPE_INT | CTLFLAG_RDTUN,
4065    "E,jailsys", "Virtual network stack");
4066#endif
4067SYSCTL_JAIL_PARAM(, dying, CTLTYPE_INT | CTLFLAG_RD,
4068    "B", "Jail is in the process of shutting down");
4069
/* children.*: current and maximum number of child jails. */
4070SYSCTL_JAIL_PARAM_NODE(children, "Number of child jails");
4071SYSCTL_JAIL_PARAM(_children, cur, CTLTYPE_INT | CTLFLAG_RD,
4072    "I", "Current number of child jails");
4073SYSCTL_JAIL_PARAM(_children, max, CTLTYPE_INT | CTLFLAG_RW,
4074    "I", "Maximum number of child jails");
4075
/* host.*: host identity as seen by the jail. */
4076SYSCTL_JAIL_PARAM_SYS_NODE(host, CTLFLAG_RW, "Jail host info");
4077SYSCTL_JAIL_PARAM_STRING(_host, hostname, CTLFLAG_RW, MAXHOSTNAMELEN,
4078    "Jail hostname");
4079SYSCTL_JAIL_PARAM_STRING(_host, domainname, CTLFLAG_RW, MAXHOSTNAMELEN,
4080    "Jail NIS domainname");
4081SYSCTL_JAIL_PARAM_STRING(_host, hostuuid, CTLFLAG_RW, HOSTUUIDLEN,
4082    "Jail host UUID");
4083SYSCTL_JAIL_PARAM(_host, hostid, CTLTYPE_ULONG | CTLFLAG_RW,
4084    "LU", "Jail host ID");
4085
/* cpuset.*: read-only cpuset ID. */
4086SYSCTL_JAIL_PARAM_NODE(cpuset, "Jail cpuset");
4087SYSCTL_JAIL_PARAM(_cpuset, id, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail cpuset ID");
4088
/* ip4.* and ip6.*: address lists, present only with INET / INET6. */
4089#ifdef INET
4090SYSCTL_JAIL_PARAM_SYS_NODE(ip4, CTLFLAG_RDTUN,
4091    "Jail IPv4 address virtualization");
4092SYSCTL_JAIL_PARAM_STRUCT(_ip4, addr, CTLFLAG_RW, sizeof(struct in_addr),
4093    "S,in_addr,a", "Jail IPv4 addresses");
4094#endif
4095#ifdef INET6
4096SYSCTL_JAIL_PARAM_SYS_NODE(ip6, CTLFLAG_RDTUN,
4097    "Jail IPv6 address virtualization");
4098SYSCTL_JAIL_PARAM_STRUCT(_ip6, addr, CTLFLAG_RW, sizeof(struct in6_addr),
4099    "S,in6_addr,a", "Jail IPv6 addresses");
4100#endif
4101
/* allow.*: boolean permission flags. */
4102SYSCTL_JAIL_PARAM_NODE(allow, "Jail permission flags");
4103SYSCTL_JAIL_PARAM(_allow, set_hostname, CTLTYPE_INT | CTLFLAG_RW,
4104    "B", "Jail may set hostname");
4105SYSCTL_JAIL_PARAM(_allow, sysvipc, CTLTYPE_INT | CTLFLAG_RW,
4106    "B", "Jail may use SYSV IPC");
4107SYSCTL_JAIL_PARAM(_allow, raw_sockets, CTLTYPE_INT | CTLFLAG_RW,
4108    "B", "Jail may create raw sockets");
4109SYSCTL_JAIL_PARAM(_allow, chflags, CTLTYPE_INT | CTLFLAG_RW,
4110    "B", "Jail may alter system file flags");
4111SYSCTL_JAIL_PARAM(_allow, mount, CTLTYPE_INT | CTLFLAG_RW,
4112    "B", "Jail may mount/unmount jail-friendly file systems");
4113SYSCTL_JAIL_PARAM(_allow, quotas, CTLTYPE_INT | CTLFLAG_RW,
4114    "B", "Jail may set file quotas");
4115SYSCTL_JAIL_PARAM(_allow, socket_af, CTLTYPE_INT | CTLFLAG_RW,
4116    "B", "Jail may create sockets other than just UNIX/IPv4/IPv6/route");
4117
4118
4119#ifdef DDB
4120
4121static void
4122db_show_prison(struct prison *pr)
4123{
4124	int fi;
4125#if defined(INET) || defined(INET6)
4126	int ii;
4127#endif
4128	unsigned jsf;
4129#ifdef INET6
4130	char ip6buf[INET6_ADDRSTRLEN];
4131#endif
4132
4133	db_printf("prison %p:\n", pr);
4134	db_printf(" jid             = %d\n", pr->pr_id);
4135	db_printf(" name            = %s\n", pr->pr_name);
4136	db_printf(" parent          = %p\n", pr->pr_parent);
4137	db_printf(" ref             = %d\n", pr->pr_ref);
4138	db_printf(" uref            = %d\n", pr->pr_uref);
4139	db_printf(" path            = %s\n", pr->pr_path);
4140	db_printf(" cpuset          = %d\n", pr->pr_cpuset
4141	    ? pr->pr_cpuset->cs_id : -1);
4142#ifdef VIMAGE
4143	db_printf(" vnet            = %p\n", pr->pr_vnet);
4144#endif
4145	db_printf(" root            = %p\n", pr->pr_root);
4146	db_printf(" securelevel     = %d\n", pr->pr_securelevel);
4147	db_printf(" childcount      = %d\n", pr->pr_childcount);
4148	db_printf(" child           = %p\n", LIST_FIRST(&pr->pr_children));
4149	db_printf(" sibling         = %p\n", LIST_NEXT(pr, pr_sibling));
4150	db_printf(" flags           = %x", pr->pr_flags);
4151	for (fi = 0; fi < sizeof(pr_flag_names) / sizeof(pr_flag_names[0]);
4152	    fi++)
4153		if (pr_flag_names[fi] != NULL && (pr->pr_flags & (1 << fi)))
4154			db_printf(" %s", pr_flag_names[fi]);
4155	for (fi = 0; fi < sizeof(pr_flag_jailsys) / sizeof(pr_flag_jailsys[0]);
4156	    fi++) {
4157		jsf = pr->pr_flags &
4158		    (pr_flag_jailsys[fi].disable | pr_flag_jailsys[fi].new);
4159		db_printf(" %-16s= %s\n", pr_flag_jailsys[fi].name,
4160		    pr_flag_jailsys[fi].disable &&
4161		      (jsf == pr_flag_jailsys[fi].disable) ? "disable"
4162		    : (jsf == pr_flag_jailsys[fi].new) ? "new"
4163		    : "inherit");
4164	}
4165	db_printf(" allow           = %x", pr->pr_allow);
4166	for (fi = 0; fi < sizeof(pr_allow_names) / sizeof(pr_allow_names[0]);
4167	    fi++)
4168		if (pr_allow_names[fi] != NULL && (pr->pr_allow & (1 << fi)))
4169			db_printf(" %s", pr_allow_names[fi]);
4170	db_printf("\n");
4171	db_printf(" enforce_statfs  = %d\n", pr->pr_enforce_statfs);
4172	db_printf(" host.hostname   = %s\n", pr->pr_hostname);
4173	db_printf(" host.domainname = %s\n", pr->pr_domainname);
4174	db_printf(" host.hostuuid   = %s\n", pr->pr_hostuuid);
4175	db_printf(" host.hostid     = %lu\n", pr->pr_hostid);
4176#ifdef INET
4177	db_printf(" ip4s            = %d\n", pr->pr_ip4s);
4178	for (ii = 0; ii < pr->pr_ip4s; ii++)
4179		db_printf(" %s %s\n",
4180		    ii == 0 ? "ip4             =" : "                 ",
4181		    inet_ntoa(pr->pr_ip4[ii]));
4182#endif
4183#ifdef INET6
4184	db_printf(" ip6s            = %d\n", pr->pr_ip6s);
4185	for (ii = 0; ii < pr->pr_ip6s; ii++)
4186		db_printf(" %s %s\n",
4187		    ii == 0 ? "ip6             =" : "                 ",
4188		    ip6_sprintf(ip6buf, &pr->pr_ip6[ii]));
4189#endif
4190}
4191
4192DB_SHOW_COMMAND(prison, db_show_prison_command)
4193{
4194	struct prison *pr;
4195
4196	if (!have_addr) {
4197		/*
4198		 * Show all prisons in the list, and prison0 which is not
4199		 * listed.
4200		 */
4201		db_show_prison(&prison0);
4202		if (!db_pager_quit) {
4203			TAILQ_FOREACH(pr, &allprison, pr_list) {
4204				db_show_prison(pr);
4205				if (db_pager_quit)
4206					break;
4207			}
4208		}
4209		return;
4210	}
4211
4212	if (addr == 0)
4213		pr = &prison0;
4214	else {
4215		/* Look for a prison with the ID and with references. */
4216		TAILQ_FOREACH(pr, &allprison, pr_list)
4217			if (pr->pr_id == addr && pr->pr_ref > 0)
4218				break;
4219		if (pr == NULL)
4220			/* Look again, without requiring a reference. */
4221			TAILQ_FOREACH(pr, &allprison, pr_list)
4222				if (pr->pr_id == addr)
4223					break;
4224		if (pr == NULL)
4225			/* Assume address points to a valid prison. */
4226			pr = (struct prison *)addr;
4227	}
4228	db_show_prison(pr);
4229}
4230
4231#endif /* DDB */
4232