kern_jail.c revision 196176
1/*-
2 * Copyright (c) 1999 Poul-Henning Kamp.
3 * Copyright (c) 2008 Bjoern A. Zeeb.
4 * Copyright (c) 2009 James Gritton.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29#include <sys/cdefs.h>
30__FBSDID("$FreeBSD: head/sys/kern/kern_jail.c 196176 2009-08-13 10:26:34Z bz $");
31
32#include "opt_compat.h"
33#include "opt_ddb.h"
34#include "opt_inet.h"
35#include "opt_inet6.h"
36
37#include <sys/param.h>
38#include <sys/types.h>
39#include <sys/kernel.h>
40#include <sys/systm.h>
41#include <sys/errno.h>
42#include <sys/sysproto.h>
43#include <sys/malloc.h>
44#include <sys/osd.h>
45#include <sys/priv.h>
46#include <sys/proc.h>
47#include <sys/taskqueue.h>
48#include <sys/fcntl.h>
49#include <sys/jail.h>
50#include <sys/lock.h>
51#include <sys/mutex.h>
52#include <sys/sx.h>
53#include <sys/sysent.h>
54#include <sys/namei.h>
55#include <sys/mount.h>
56#include <sys/queue.h>
57#include <sys/socket.h>
58#include <sys/syscallsubr.h>
59#include <sys/sysctl.h>
60#include <sys/vnode.h>
61
62#include <net/if.h>
63#include <net/vnet.h>
64
65#include <netinet/in.h>
66
67#ifdef DDB
68#include <ddb/ddb.h>
69#ifdef INET6
70#include <netinet6/in6_var.h>
71#endif /* INET6 */
72#endif /* DDB */
73
74#include <security/mac/mac_framework.h>
75
76#define	DEFAULT_HOSTUUID	"00000000-0000-0000-0000-000000000000"
77
78MALLOC_DEFINE(M_PRISON, "prison", "Prison structures");
79
80/* prison0 describes what is "real" about the system. */
81struct prison prison0 = {
82	.pr_id		= 0,
83	.pr_name	= "0",
84	.pr_ref		= 1,
85	.pr_uref	= 1,
86	.pr_path	= "/",
87	.pr_securelevel	= -1,
88	.pr_childmax	= JAIL_MAX,
89	.pr_hostuuid	= DEFAULT_HOSTUUID,
90	.pr_children	= LIST_HEAD_INITIALIZER(&prison0.pr_children),
91#ifdef VIMAGE
92	.pr_flags	= PR_HOST|PR_VNET,
93#else
94	.pr_flags	= PR_HOST,
95#endif
96	.pr_allow	= PR_ALLOW_ALL,
97};
98MTX_SYSINIT(prison0, &prison0.pr_mtx, "jail mutex", MTX_DEF);
99
100/* allprison and lastprid are protected by allprison_lock. */
101struct	sx allprison_lock;
102SX_SYSINIT(allprison_lock, &allprison_lock, "allprison");
103struct	prisonlist allprison = TAILQ_HEAD_INITIALIZER(allprison);
104int	lastprid = 0;
105
/* Prototypes for this file's static helper functions. */
static int	 do_jail_attach(struct thread *td, struct prison *pr);
static void	 prison_complete(void *context, int pending);
static void	 prison_deref(struct prison *pr, int flags);
static char	*prison_path(struct prison *pr1, struct prison *pr2);
static void	 prison_remove_one(struct prison *pr);
#ifdef INET
/* IPv4-only helpers. */
static int	 _prison_check_ip4(struct prison *pr, struct in_addr *ia);
static int	 prison_restrict_ip4(struct prison *pr, struct in_addr *newip4);
#endif
#ifdef INET6
/* IPv6-only helpers. */
static int	 _prison_check_ip6(struct prison *pr, struct in6_addr *ia6);
static int	 prison_restrict_ip6(struct prison *pr, struct in6_addr *newip6);
#endif
119
/*
 * Flags for prison_deref.  Each flag occupies its own bit so they can be
 * OR'ed together in the "flags" argument.
 *
 * NOTE(review): the per-flag meanings below are inferred from the names and
 * from callers; confirm against prison_deref() itself.
 */
#define	PD_DEREF	(1 << 0)	/* presumably: drop a pr_ref */
#define	PD_DEUREF	(1 << 1)	/* presumably: drop a pr_uref */
#define	PD_LOCKED	(1 << 2)	/* presumably: pr_mtx already held */
#define	PD_LIST_SLOCKED	(1 << 3)	/* presumably: allprison_lock held shared */
#define	PD_LIST_XLOCKED	(1 << 4)	/* presumably: allprison_lock held exclusive */
126
/*
 * Parameter names corresponding to PR_* flag values.
 *
 * The array index is the bit number of the flag: kern_jail_set() maps the
 * name at index N to the bit (1 << N).  A name from pr_flag_names requests
 * that the bit be set; the matching name from pr_flag_nonames only marks the
 * bit as changed, so it ends up cleared.  Unused bit positions may be left
 * NULL.
 */
static char *pr_flag_names[] = { [0] = "persist" };

static char *pr_flag_nonames[] = { [0] = "nopersist" };
137
138struct jailsys_flags {
139	const char	*name;
140	unsigned	 disable;
141	unsigned	 new;
142} pr_flag_jailsys[] = {
143	{ "host", 0, PR_HOST },
144#ifdef VIMAGE
145	{ "vnet", 0, PR_VNET },
146#endif
147#ifdef INET
148	{ "ip4", PR_IP4_USER | PR_IP4_DISABLE, PR_IP4_USER },
149#endif
150#ifdef INET6
151	{ "ip6", PR_IP6_USER | PR_IP6_DISABLE, PR_IP6_USER },
152#endif
153};
154
/*
 * Parameter names for the per-jail permission bits.  The array index is the
 * bit number: the name at index N corresponds to the permission (1 << N).
 * pr_allow_names[N] grants the permission; pr_allow_nonames[N] is the
 * matching name that withholds it.  Both tables must stay the same length
 * and in the same order.
 */
static char *pr_allow_names[] = {
	[0] = "allow.set_hostname",
	[1] = "allow.sysvipc",
	[2] = "allow.raw_sockets",
	[3] = "allow.chflags",
	[4] = "allow.mount",
	[5] = "allow.quotas",
	[6] = "allow.socket_af",
};

static char *pr_allow_nonames[] = {
	[0] = "allow.noset_hostname",
	[1] = "allow.nosysvipc",
	[2] = "allow.noraw_sockets",
	[3] = "allow.nochflags",
	[4] = "allow.nomount",
	[5] = "allow.noquotas",
	[6] = "allow.nosocket_af",
};
174
175#define	JAIL_DEFAULT_ALLOW		PR_ALLOW_SET_HOSTNAME
176#define	JAIL_DEFAULT_ENFORCE_STATFS	2
177static unsigned jail_default_allow = JAIL_DEFAULT_ALLOW;
178static int jail_default_enforce_statfs = JAIL_DEFAULT_ENFORCE_STATFS;
179#if defined(INET) || defined(INET6)
180static unsigned jail_max_af_ips = 255;
181#endif
182
183#ifdef INET
/*
 * qsort(3) comparison callback for arrays of struct in_addr.
 *
 * Addresses are stored in network byte order, but they are ordered by their
 * host byte order value; comparing the raw network-order words would give a
 * different ordering on little-endian machines.
 *
 * Returns -1, 0 or 1 when *ip1 sorts before, equal to, or after *ip2.
 */
static int
qcmp_v4(const void *ip1, const void *ip2)
{
	const struct in_addr *lhs, *rhs;
	in_addr_t lhs_hbo, rhs_hbo;

	lhs = ip1;
	rhs = ip2;
	lhs_hbo = ntohl(lhs->s_addr);
	rhs_hbo = ntohl(rhs->s_addr);

	/*
	 * Compare explicitly instead of subtracting: the difference of two
	 * 32-bit unsigned values does not fit in an int.
	 */
	if (lhs_hbo == rhs_hbo)
		return (0);
	return ((lhs_hbo < rhs_hbo) ? -1 : 1);
}
208#endif
209
210#ifdef INET6
/*
 * qsort(3) comparison callback for arrays of struct in6_addr.
 *
 * The two addresses are compared byte by byte, starting at s6_addr[0]; the
 * first differing byte decides the result.
 *
 * Returns -1, 0 or 1 when *ip1 sorts before, equal to, or after *ip2.
 */
static int
qcmp_v6(const void *ip1, const void *ip2)
{
	const uint8_t *lhs, *rhs;
	size_t idx;

	lhs = ((const struct in6_addr *)ip1)->s6_addr;
	rhs = ((const struct in6_addr *)ip2)->s6_addr;

	for (idx = 0; idx < sizeof(struct in6_addr); idx++) {
		if (lhs[idx] != rhs[idx])
			return ((lhs[idx] > rhs[idx]) ? 1 : -1);
	}
	return (0);
}
229#endif
230
231/*
232 * struct jail_args {
233 *	struct jail *jail;
234 * };
235 */
236int
237jail(struct thread *td, struct jail_args *uap)
238{
239	uint32_t version;
240	int error;
241	struct jail j;
242
243	error = copyin(uap->jail, &version, sizeof(uint32_t));
244	if (error)
245		return (error);
246
247	switch (version) {
248	case 0:
249	{
250		struct jail_v0 j0;
251
252		/* FreeBSD single IPv4 jails. */
253		bzero(&j, sizeof(struct jail));
254		error = copyin(uap->jail, &j0, sizeof(struct jail_v0));
255		if (error)
256			return (error);
257		j.version = j0.version;
258		j.path = j0.path;
259		j.hostname = j0.hostname;
260		j.ip4s = j0.ip_number;
261		break;
262	}
263
264	case 1:
265		/*
266		 * Version 1 was used by multi-IPv4 jail implementations
267		 * that never made it into the official kernel.
268		 */
269		return (EINVAL);
270
271	case 2:	/* JAIL_API_VERSION */
272		/* FreeBSD multi-IPv4/IPv6,noIP jails. */
273		error = copyin(uap->jail, &j, sizeof(struct jail));
274		if (error)
275			return (error);
276		break;
277
278	default:
279		/* Sci-Fi jails are not supported, sorry. */
280		return (EINVAL);
281	}
282	return (kern_jail(td, &j));
283}
284
285int
286kern_jail(struct thread *td, struct jail *j)
287{
288	struct iovec optiov[2 * (4
289			    + sizeof(pr_allow_names) / sizeof(pr_allow_names[0])
290#ifdef INET
291			    + 1
292#endif
293#ifdef INET6
294			    + 1
295#endif
296			    )];
297	struct uio opt;
298	char *u_path, *u_hostname, *u_name;
299#ifdef INET
300	uint32_t ip4s;
301	struct in_addr *u_ip4;
302#endif
303#ifdef INET6
304	struct in6_addr *u_ip6;
305#endif
306	size_t tmplen;
307	int error, enforce_statfs, fi;
308
309	bzero(&optiov, sizeof(optiov));
310	opt.uio_iov = optiov;
311	opt.uio_iovcnt = 0;
312	opt.uio_offset = -1;
313	opt.uio_resid = -1;
314	opt.uio_segflg = UIO_SYSSPACE;
315	opt.uio_rw = UIO_READ;
316	opt.uio_td = td;
317
318	/* Set permissions for top-level jails from sysctls. */
319	if (!jailed(td->td_ucred)) {
320		for (fi = 0; fi < sizeof(pr_allow_names) /
321		     sizeof(pr_allow_names[0]); fi++) {
322			optiov[opt.uio_iovcnt].iov_base =
323			    (jail_default_allow & (1 << fi))
324			    ? pr_allow_names[fi] : pr_allow_nonames[fi];
325			optiov[opt.uio_iovcnt].iov_len =
326			    strlen(optiov[opt.uio_iovcnt].iov_base) + 1;
327			opt.uio_iovcnt += 2;
328		}
329		optiov[opt.uio_iovcnt].iov_base = "enforce_statfs";
330		optiov[opt.uio_iovcnt].iov_len = sizeof("enforce_statfs");
331		opt.uio_iovcnt++;
332		enforce_statfs = jail_default_enforce_statfs;
333		optiov[opt.uio_iovcnt].iov_base = &enforce_statfs;
334		optiov[opt.uio_iovcnt].iov_len = sizeof(enforce_statfs);
335		opt.uio_iovcnt++;
336	}
337
338	tmplen = MAXPATHLEN + MAXHOSTNAMELEN + MAXHOSTNAMELEN;
339#ifdef INET
340	ip4s = (j->version == 0) ? 1 : j->ip4s;
341	if (ip4s > jail_max_af_ips)
342		return (EINVAL);
343	tmplen += ip4s * sizeof(struct in_addr);
344#else
345	if (j->ip4s > 0)
346		return (EINVAL);
347#endif
348#ifdef INET6
349	if (j->ip6s > jail_max_af_ips)
350		return (EINVAL);
351	tmplen += j->ip6s * sizeof(struct in6_addr);
352#else
353	if (j->ip6s > 0)
354		return (EINVAL);
355#endif
356	u_path = malloc(tmplen, M_TEMP, M_WAITOK);
357	u_hostname = u_path + MAXPATHLEN;
358	u_name = u_hostname + MAXHOSTNAMELEN;
359#ifdef INET
360	u_ip4 = (struct in_addr *)(u_name + MAXHOSTNAMELEN);
361#endif
362#ifdef INET6
363#ifdef INET
364	u_ip6 = (struct in6_addr *)(u_ip4 + ip4s);
365#else
366	u_ip6 = (struct in6_addr *)(u_name + MAXHOSTNAMELEN);
367#endif
368#endif
369	optiov[opt.uio_iovcnt].iov_base = "path";
370	optiov[opt.uio_iovcnt].iov_len = sizeof("path");
371	opt.uio_iovcnt++;
372	optiov[opt.uio_iovcnt].iov_base = u_path;
373	error = copyinstr(j->path, u_path, MAXPATHLEN,
374	    &optiov[opt.uio_iovcnt].iov_len);
375	if (error) {
376		free(u_path, M_TEMP);
377		return (error);
378	}
379	opt.uio_iovcnt++;
380	optiov[opt.uio_iovcnt].iov_base = "host.hostname";
381	optiov[opt.uio_iovcnt].iov_len = sizeof("host.hostname");
382	opt.uio_iovcnt++;
383	optiov[opt.uio_iovcnt].iov_base = u_hostname;
384	error = copyinstr(j->hostname, u_hostname, MAXHOSTNAMELEN,
385	    &optiov[opt.uio_iovcnt].iov_len);
386	if (error) {
387		free(u_path, M_TEMP);
388		return (error);
389	}
390	opt.uio_iovcnt++;
391	if (j->jailname != NULL) {
392		optiov[opt.uio_iovcnt].iov_base = "name";
393		optiov[opt.uio_iovcnt].iov_len = sizeof("name");
394		opt.uio_iovcnt++;
395		optiov[opt.uio_iovcnt].iov_base = u_name;
396		error = copyinstr(j->jailname, u_name, MAXHOSTNAMELEN,
397		    &optiov[opt.uio_iovcnt].iov_len);
398		if (error) {
399			free(u_path, M_TEMP);
400			return (error);
401		}
402		opt.uio_iovcnt++;
403	}
404#ifdef INET
405	optiov[opt.uio_iovcnt].iov_base = "ip4.addr";
406	optiov[opt.uio_iovcnt].iov_len = sizeof("ip4.addr");
407	opt.uio_iovcnt++;
408	optiov[opt.uio_iovcnt].iov_base = u_ip4;
409	optiov[opt.uio_iovcnt].iov_len = ip4s * sizeof(struct in_addr);
410	if (j->version == 0)
411		u_ip4->s_addr = j->ip4s;
412	else {
413		error = copyin(j->ip4, u_ip4, optiov[opt.uio_iovcnt].iov_len);
414		if (error) {
415			free(u_path, M_TEMP);
416			return (error);
417		}
418	}
419	opt.uio_iovcnt++;
420#endif
421#ifdef INET6
422	optiov[opt.uio_iovcnt].iov_base = "ip6.addr";
423	optiov[opt.uio_iovcnt].iov_len = sizeof("ip6.addr");
424	opt.uio_iovcnt++;
425	optiov[opt.uio_iovcnt].iov_base = u_ip6;
426	optiov[opt.uio_iovcnt].iov_len = j->ip6s * sizeof(struct in6_addr);
427	error = copyin(j->ip6, u_ip6, optiov[opt.uio_iovcnt].iov_len);
428	if (error) {
429		free(u_path, M_TEMP);
430		return (error);
431	}
432	opt.uio_iovcnt++;
433#endif
434	KASSERT(opt.uio_iovcnt <= sizeof(optiov) / sizeof(optiov[0]),
435	    ("kern_jail: too many iovecs (%d)", opt.uio_iovcnt));
436	error = kern_jail_set(td, &opt, JAIL_CREATE | JAIL_ATTACH);
437	free(u_path, M_TEMP);
438	return (error);
439}
440
441
442/*
443 * struct jail_set_args {
444 *	struct iovec *iovp;
445 *	unsigned int iovcnt;
446 *	int flags;
447 * };
448 */
449int
450jail_set(struct thread *td, struct jail_set_args *uap)
451{
452	struct uio *auio;
453	int error;
454
455	/* Check that we have an even number of iovecs. */
456	if (uap->iovcnt & 1)
457		return (EINVAL);
458
459	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
460	if (error)
461		return (error);
462	error = kern_jail_set(td, auio, uap->flags);
463	free(auio, M_IOV);
464	return (error);
465}
466
467int
468kern_jail_set(struct thread *td, struct uio *optuio, int flags)
469{
470	struct nameidata nd;
471#ifdef INET
472	struct in_addr *ip4;
473#endif
474#ifdef INET6
475	struct in6_addr *ip6;
476#endif
477	struct vfsopt *opt;
478	struct vfsoptlist *opts;
479	struct prison *pr, *deadpr, *mypr, *ppr, *tpr;
480	struct vnode *root;
481	char *domain, *errmsg, *host, *name, *p, *path, *uuid;
482#if defined(INET) || defined(INET6)
483	struct prison *tppr;
484	void *op;
485#endif
486	unsigned long hid;
487	size_t namelen, onamelen;
488	int created, cuflags, descend, enforce, error, errmsg_len, errmsg_pos;
489	int gotchildmax, gotenforce, gothid, gotslevel;
490	int fi, jid, jsys, len, level;
491	int childmax, slevel, vfslocked;
492#if defined(INET) || defined(INET6)
493	int ii, ij;
494#endif
495#ifdef INET
496	int ip4s, redo_ip4;
497#endif
498#ifdef INET6
499	int ip6s, redo_ip6;
500#endif
501	unsigned pr_flags, ch_flags;
502	unsigned pr_allow, ch_allow, tallow;
503	char numbuf[12];
504
505	error = priv_check(td, PRIV_JAIL_SET);
506	if (!error && (flags & JAIL_ATTACH))
507		error = priv_check(td, PRIV_JAIL_ATTACH);
508	if (error)
509		return (error);
510	mypr = ppr = td->td_ucred->cr_prison;
511	if ((flags & JAIL_CREATE) && mypr->pr_childmax == 0)
512		return (EPERM);
513	if (flags & ~JAIL_SET_MASK)
514		return (EINVAL);
515
516	/*
517	 * Check all the parameters before committing to anything.  Not all
518	 * errors can be caught early, but we may as well try.  Also, this
519	 * takes care of some expensive stuff (path lookup) before getting
520	 * the allprison lock.
521	 *
522	 * XXX Jails are not filesystems, and jail parameters are not mount
523	 *     options.  But it makes more sense to re-use the vfsopt code
524	 *     than duplicate it under a different name.
525	 */
526	error = vfs_buildopts(optuio, &opts);
527	if (error)
528		return (error);
529#ifdef INET
530	ip4 = NULL;
531#endif
532#ifdef INET6
533	ip6 = NULL;
534#endif
535
536	error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
537	if (error == ENOENT)
538		jid = 0;
539	else if (error != 0)
540		goto done_free;
541
542	error = vfs_copyopt(opts, "securelevel", &slevel, sizeof(slevel));
543	if (error == ENOENT)
544		gotslevel = 0;
545	else if (error != 0)
546		goto done_free;
547	else
548		gotslevel = 1;
549
550	error =
551	    vfs_copyopt(opts, "children.max", &childmax, sizeof(childmax));
552	if (error == ENOENT)
553		gotchildmax = 0;
554	else if (error != 0)
555		goto done_free;
556	else
557		gotchildmax = 1;
558
559	error = vfs_copyopt(opts, "enforce_statfs", &enforce, sizeof(enforce));
560	gotenforce = (error == 0);
561	if (gotenforce) {
562		if (enforce < 0 || enforce > 2)
563			return (EINVAL);
564	} else if (error != ENOENT)
565		goto done_free;
566
567	pr_flags = ch_flags = 0;
568	for (fi = 0; fi < sizeof(pr_flag_names) / sizeof(pr_flag_names[0]);
569	    fi++) {
570		if (pr_flag_names[fi] == NULL)
571			continue;
572		vfs_flagopt(opts, pr_flag_names[fi], &pr_flags, 1 << fi);
573		vfs_flagopt(opts, pr_flag_nonames[fi], &ch_flags, 1 << fi);
574	}
575	ch_flags |= pr_flags;
576	for (fi = 0; fi < sizeof(pr_flag_jailsys) / sizeof(pr_flag_jailsys[0]);
577	    fi++) {
578		error = vfs_copyopt(opts, pr_flag_jailsys[fi].name, &jsys,
579		    sizeof(jsys));
580		if (error == ENOENT)
581			continue;
582		if (error != 0)
583			goto done_free;
584		switch (jsys) {
585		case JAIL_SYS_DISABLE:
586			if (!pr_flag_jailsys[fi].disable) {
587				error = EINVAL;
588				goto done_free;
589			}
590			pr_flags |= pr_flag_jailsys[fi].disable;
591			break;
592		case JAIL_SYS_NEW:
593			pr_flags |= pr_flag_jailsys[fi].new;
594			break;
595		case JAIL_SYS_INHERIT:
596			break;
597		default:
598			error = EINVAL;
599			goto done_free;
600		}
601		ch_flags |=
602		    pr_flag_jailsys[fi].new | pr_flag_jailsys[fi].disable;
603	}
604	if ((flags & (JAIL_CREATE | JAIL_UPDATE | JAIL_ATTACH)) == JAIL_CREATE
605	    && !(pr_flags & PR_PERSIST)) {
606		error = EINVAL;
607		vfs_opterror(opts, "new jail must persist or attach");
608		goto done_errmsg;
609	}
610#ifdef VIMAGE
611	if ((flags & JAIL_UPDATE) && (ch_flags & PR_VNET)) {
612		error = EINVAL;
613		vfs_opterror(opts, "vnet cannot be changed after creation");
614		goto done_errmsg;
615	}
616#endif
617#ifdef INET
618	if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP4_USER)) {
619		error = EINVAL;
620		vfs_opterror(opts, "ip4 cannot be changed after creation");
621		goto done_errmsg;
622	}
623#endif
624#ifdef INET6
625	if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP6_USER)) {
626		error = EINVAL;
627		vfs_opterror(opts, "ip6 cannot be changed after creation");
628		goto done_errmsg;
629	}
630#endif
631
632	pr_allow = ch_allow = 0;
633	for (fi = 0; fi < sizeof(pr_allow_names) / sizeof(pr_allow_names[0]);
634	    fi++) {
635		vfs_flagopt(opts, pr_allow_names[fi], &pr_allow, 1 << fi);
636		vfs_flagopt(opts, pr_allow_nonames[fi], &ch_allow, 1 << fi);
637	}
638	ch_allow |= pr_allow;
639
640	error = vfs_getopt(opts, "name", (void **)&name, &len);
641	if (error == ENOENT)
642		name = NULL;
643	else if (error != 0)
644		goto done_free;
645	else {
646		if (len == 0 || name[len - 1] != '\0') {
647			error = EINVAL;
648			goto done_free;
649		}
650		if (len > MAXHOSTNAMELEN) {
651			error = ENAMETOOLONG;
652			goto done_free;
653		}
654	}
655
656	error = vfs_getopt(opts, "host.hostname", (void **)&host, &len);
657	if (error == ENOENT)
658		host = NULL;
659	else if (error != 0)
660		goto done_free;
661	else {
662		ch_flags |= PR_HOST;
663		pr_flags |= PR_HOST;
664		if (len == 0 || host[len - 1] != '\0') {
665			error = EINVAL;
666			goto done_free;
667		}
668		if (len > MAXHOSTNAMELEN) {
669			error = ENAMETOOLONG;
670			goto done_free;
671		}
672	}
673
674	error = vfs_getopt(opts, "host.domainname", (void **)&domain, &len);
675	if (error == ENOENT)
676		domain = NULL;
677	else if (error != 0)
678		goto done_free;
679	else {
680		ch_flags |= PR_HOST;
681		pr_flags |= PR_HOST;
682		if (len == 0 || domain[len - 1] != '\0') {
683			error = EINVAL;
684			goto done_free;
685		}
686		if (len > MAXHOSTNAMELEN) {
687			error = ENAMETOOLONG;
688			goto done_free;
689		}
690	}
691
692	error = vfs_getopt(opts, "host.hostuuid", (void **)&uuid, &len);
693	if (error == ENOENT)
694		uuid = NULL;
695	else if (error != 0)
696		goto done_free;
697	else {
698		ch_flags |= PR_HOST;
699		pr_flags |= PR_HOST;
700		if (len == 0 || uuid[len - 1] != '\0') {
701			error = EINVAL;
702			goto done_free;
703		}
704		if (len > HOSTUUIDLEN) {
705			error = ENAMETOOLONG;
706			goto done_free;
707		}
708	}
709
710#ifdef COMPAT_IA32
711	if (td->td_proc->p_sysent->sv_flags & SV_IA32) {
712		uint32_t hid32;
713
714		error = vfs_copyopt(opts, "host.hostid", &hid32, sizeof(hid32));
715		hid = hid32;
716	} else
717#endif
718		error = vfs_copyopt(opts, "host.hostid", &hid, sizeof(hid));
719	if (error == ENOENT)
720		gothid = 0;
721	else if (error != 0)
722		goto done_free;
723	else {
724		gothid = 1;
725		ch_flags |= PR_HOST;
726		pr_flags |= PR_HOST;
727	}
728
729#ifdef INET
730	error = vfs_getopt(opts, "ip4.addr", &op, &ip4s);
731	if (error == ENOENT)
732		ip4s = (pr_flags & PR_IP4_DISABLE) ? 0 : -1;
733	else if (error != 0)
734		goto done_free;
735	else if (ip4s & (sizeof(*ip4) - 1)) {
736		error = EINVAL;
737		goto done_free;
738	} else {
739		ch_flags |= PR_IP4_USER | PR_IP4_DISABLE;
740		if (ip4s == 0)
741			pr_flags |= PR_IP4_USER | PR_IP4_DISABLE;
742		else {
743			pr_flags = (pr_flags & ~PR_IP4_DISABLE) | PR_IP4_USER;
744			ip4s /= sizeof(*ip4);
745			if (ip4s > jail_max_af_ips) {
746				error = EINVAL;
747				vfs_opterror(opts, "too many IPv4 addresses");
748				goto done_errmsg;
749			}
750			ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK);
751			bcopy(op, ip4, ip4s * sizeof(*ip4));
752			/*
753			 * IP addresses are all sorted but ip[0] to preserve
754			 * the primary IP address as given from userland.
755			 * This special IP is used for unbound outgoing
756			 * connections as well for "loopback" traffic.
757			 */
758			if (ip4s > 1)
759				qsort(ip4 + 1, ip4s - 1, sizeof(*ip4), qcmp_v4);
760			/*
761			 * Check for duplicate addresses and do some simple
762			 * zero and broadcast checks. If users give other bogus
763			 * addresses it is their problem.
764			 *
765			 * We do not have to care about byte order for these
766			 * checks so we will do them in NBO.
767			 */
768			for (ii = 0; ii < ip4s; ii++) {
769				if (ip4[ii].s_addr == INADDR_ANY ||
770				    ip4[ii].s_addr == INADDR_BROADCAST) {
771					error = EINVAL;
772					goto done_free;
773				}
774				if ((ii+1) < ip4s &&
775				    (ip4[0].s_addr == ip4[ii+1].s_addr ||
776				     ip4[ii].s_addr == ip4[ii+1].s_addr)) {
777					error = EINVAL;
778					goto done_free;
779				}
780			}
781		}
782	}
783#endif
784
785#ifdef INET6
786	error = vfs_getopt(opts, "ip6.addr", &op, &ip6s);
787	if (error == ENOENT)
788		ip6s = (pr_flags & PR_IP6_DISABLE) ? 0 : -1;
789	else if (error != 0)
790		goto done_free;
791	else if (ip6s & (sizeof(*ip6) - 1)) {
792		error = EINVAL;
793		goto done_free;
794	} else {
795		ch_flags |= PR_IP6_USER | PR_IP6_DISABLE;
796		if (ip6s == 0)
797			pr_flags |= PR_IP6_USER | PR_IP6_DISABLE;
798		else {
799			pr_flags = (pr_flags & ~PR_IP6_DISABLE) | PR_IP6_USER;
800			ip6s /= sizeof(*ip6);
801			if (ip6s > jail_max_af_ips) {
802				error = EINVAL;
803				vfs_opterror(opts, "too many IPv6 addresses");
804				goto done_errmsg;
805			}
806			ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK);
807			bcopy(op, ip6, ip6s * sizeof(*ip6));
808			if (ip6s > 1)
809				qsort(ip6 + 1, ip6s - 1, sizeof(*ip6), qcmp_v6);
810			for (ii = 0; ii < ip6s; ii++) {
811				if (IN6_IS_ADDR_UNSPECIFIED(&ip6[ii])) {
812					error = EINVAL;
813					goto done_free;
814				}
815				if ((ii+1) < ip6s &&
816				    (IN6_ARE_ADDR_EQUAL(&ip6[0], &ip6[ii+1]) ||
817				     IN6_ARE_ADDR_EQUAL(&ip6[ii], &ip6[ii+1])))
818				{
819					error = EINVAL;
820					goto done_free;
821				}
822			}
823		}
824	}
825#endif
826
827#if defined(VIMAGE) && (defined(INET) || defined(INET6))
828	if ((ch_flags & PR_VNET) && (ch_flags & (PR_IP4_USER | PR_IP6_USER))) {
829		error = EINVAL;
830		vfs_opterror(opts,
831		    "vnet jails cannot have IP address restrictions");
832		goto done_errmsg;
833	}
834#endif
835
836	root = NULL;
837	error = vfs_getopt(opts, "path", (void **)&path, &len);
838	if (error == ENOENT)
839		path = NULL;
840	else if (error != 0)
841		goto done_free;
842	else {
843		if (flags & JAIL_UPDATE) {
844			error = EINVAL;
845			vfs_opterror(opts,
846			    "path cannot be changed after creation");
847			goto done_errmsg;
848		}
849		if (len == 0 || path[len - 1] != '\0') {
850			error = EINVAL;
851			goto done_free;
852		}
853		if (len < 2 || (len == 2 && path[0] == '/'))
854			path = NULL;
855		else {
856			/* Leave room for a real-root full pathname. */
857			if (len + (path[0] == '/' && strcmp(mypr->pr_path, "/")
858			    ? strlen(mypr->pr_path) : 0) > MAXPATHLEN) {
859				error = ENAMETOOLONG;
860				goto done_free;
861			}
862			NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW, UIO_SYSSPACE,
863			    path, td);
864			error = namei(&nd);
865			if (error)
866				goto done_free;
867			vfslocked = NDHASGIANT(&nd);
868			root = nd.ni_vp;
869			NDFREE(&nd, NDF_ONLY_PNBUF);
870			if (root->v_type != VDIR) {
871				error = ENOTDIR;
872				vrele(root);
873				VFS_UNLOCK_GIANT(vfslocked);
874				goto done_free;
875			}
876			VFS_UNLOCK_GIANT(vfslocked);
877		}
878	}
879
880	/*
881	 * Grab the allprison lock before letting modules check their
882	 * parameters.  Once we have it, do not let go so we'll have a
883	 * consistent view of the OSD list.
884	 */
885	sx_xlock(&allprison_lock);
886	error = osd_jail_call(NULL, PR_METHOD_CHECK, opts);
887	if (error)
888		goto done_unlock_list;
889
890	/* By now, all parameters should have been noted. */
891	TAILQ_FOREACH(opt, opts, link) {
892		if (!opt->seen && strcmp(opt->name, "errmsg")) {
893			error = EINVAL;
894			vfs_opterror(opts, "unknown parameter: %s", opt->name);
895			goto done_unlock_list;
896		}
897	}
898
899	/*
900	 * See if we are creating a new record or updating an existing one.
901	 * This abuses the file error codes ENOENT and EEXIST.
902	 */
903	cuflags = flags & (JAIL_CREATE | JAIL_UPDATE);
904	if (!cuflags) {
905		error = EINVAL;
906		vfs_opterror(opts, "no valid operation (create or update)");
907		goto done_unlock_list;
908	}
909	pr = NULL;
910	if (jid != 0) {
911		/*
912		 * See if a requested jid already exists.  There is an
913		 * information leak here if the jid exists but is not within
914		 * the caller's jail hierarchy.  Jail creators will get EEXIST
915		 * even though they cannot see the jail, and CREATE | UPDATE
916		 * will return ENOENT which is not normally a valid error.
917		 */
918		if (jid < 0) {
919			error = EINVAL;
920			vfs_opterror(opts, "negative jid");
921			goto done_unlock_list;
922		}
923		pr = prison_find(jid);
924		if (pr != NULL) {
925			ppr = pr->pr_parent;
926			/* Create: jid must not exist. */
927			if (cuflags == JAIL_CREATE) {
928				mtx_unlock(&pr->pr_mtx);
929				error = EEXIST;
930				vfs_opterror(opts, "jail %d already exists",
931				    jid);
932				goto done_unlock_list;
933			}
934			if (!prison_ischild(mypr, pr)) {
935				mtx_unlock(&pr->pr_mtx);
936				pr = NULL;
937			} else if (pr->pr_uref == 0) {
938				if (!(flags & JAIL_DYING)) {
939					mtx_unlock(&pr->pr_mtx);
940					error = ENOENT;
941					vfs_opterror(opts, "jail %d is dying",
942					    jid);
943					goto done_unlock_list;
944				} else if ((flags & JAIL_ATTACH) ||
945				    (pr_flags & PR_PERSIST)) {
946					/*
947					 * A dying jail might be resurrected
948					 * (via attach or persist), but first
949					 * it must determine if another jail
950					 * has claimed its name.  Accomplish
951					 * this by implicitly re-setting the
952					 * name.
953					 */
954					if (name == NULL)
955						name = prison_name(mypr, pr);
956				}
957			}
958		}
959		if (pr == NULL) {
960			/* Update: jid must exist. */
961			if (cuflags == JAIL_UPDATE) {
962				error = ENOENT;
963				vfs_opterror(opts, "jail %d not found", jid);
964				goto done_unlock_list;
965			}
966		}
967	}
968	/*
969	 * If the caller provided a name, look for a jail by that name.
970	 * This has different semantics for creates and updates keyed by jid
971	 * (where the name must not already exist in a different jail),
972	 * and updates keyed by the name itself (where the name must exist
973	 * because that is the jail being updated).
974	 */
975	if (name != NULL) {
976		p = strrchr(name, '.');
977		if (p != NULL) {
978			/*
979			 * This is a hierarchical name.  Split it into the
980			 * parent and child names, and make sure the parent
981			 * exists or matches an already found jail.
982			 */
983			*p = '\0';
984			if (pr != NULL) {
985				if (strncmp(name, ppr->pr_name, p - name) ||
986				    ppr->pr_name[p - name] != '\0') {
987					mtx_unlock(&pr->pr_mtx);
988					error = EINVAL;
989					vfs_opterror(opts,
990					    "cannot change jail's parent");
991					goto done_unlock_list;
992				}
993			} else {
994				ppr = prison_find_name(mypr, name);
995				if (ppr == NULL) {
996					error = ENOENT;
997					vfs_opterror(opts,
998					    "jail \"%s\" not found", name);
999					goto done_unlock_list;
1000				}
1001				mtx_unlock(&ppr->pr_mtx);
1002			}
1003			name = p + 1;
1004		}
1005		if (name[0] != '\0') {
1006			namelen =
1007			    (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1;
1008 name_again:
1009			deadpr = NULL;
1010			FOREACH_PRISON_CHILD(ppr, tpr) {
1011				if (tpr != pr && tpr->pr_ref > 0 &&
1012				    !strcmp(tpr->pr_name + namelen, name)) {
1013					if (pr == NULL &&
1014					    cuflags != JAIL_CREATE) {
1015						mtx_lock(&tpr->pr_mtx);
1016						if (tpr->pr_ref > 0) {
1017							/*
1018							 * Use this jail
1019							 * for updates.
1020							 */
1021							if (tpr->pr_uref > 0) {
1022								pr = tpr;
1023								break;
1024							}
1025							deadpr = tpr;
1026						}
1027						mtx_unlock(&tpr->pr_mtx);
1028					} else if (tpr->pr_uref > 0) {
1029						/*
1030						 * Create, or update(jid):
1031						 * name must not exist in an
1032						 * active sibling jail.
1033						 */
1034						error = EEXIST;
1035						if (pr != NULL)
1036							mtx_unlock(&pr->pr_mtx);
1037						vfs_opterror(opts,
1038						   "jail \"%s\" already exists",
1039						   name);
1040						goto done_unlock_list;
1041					}
1042				}
1043			}
1044			/* If no active jail is found, use a dying one. */
1045			if (deadpr != NULL && pr == NULL) {
1046				if (flags & JAIL_DYING) {
1047					mtx_lock(&deadpr->pr_mtx);
1048					if (deadpr->pr_ref == 0) {
1049						mtx_unlock(&deadpr->pr_mtx);
1050						goto name_again;
1051					}
1052					pr = deadpr;
1053				} else if (cuflags == JAIL_UPDATE) {
1054					error = ENOENT;
1055					vfs_opterror(opts,
1056					    "jail \"%s\" is dying", name);
1057					goto done_unlock_list;
1058				}
1059			}
1060			/* Update: name must exist if no jid. */
1061			else if (cuflags == JAIL_UPDATE && pr == NULL) {
1062				error = ENOENT;
1063				vfs_opterror(opts, "jail \"%s\" not found",
1064				    name);
1065				goto done_unlock_list;
1066			}
1067		}
1068	}
1069	/* Update: must provide a jid or name. */
1070	else if (cuflags == JAIL_UPDATE && pr == NULL) {
1071		error = ENOENT;
1072		vfs_opterror(opts, "update specified no jail");
1073		goto done_unlock_list;
1074	}
1075
1076	/* If there's no prison to update, create a new one and link it in. */
1077	if (pr == NULL) {
1078		for (tpr = mypr; tpr != NULL; tpr = tpr->pr_parent)
1079			if (tpr->pr_childcount >= tpr->pr_childmax) {
1080				error = EPERM;
1081				vfs_opterror(opts, "prison limit exceeded");
1082				goto done_unlock_list;
1083			}
1084		created = 1;
1085		mtx_lock(&ppr->pr_mtx);
1086		if (ppr->pr_ref == 0 || (ppr->pr_flags & PR_REMOVE)) {
1087			mtx_unlock(&ppr->pr_mtx);
1088			error = ENOENT;
1089			vfs_opterror(opts, "parent jail went away!");
1090			goto done_unlock_list;
1091		}
1092		ppr->pr_ref++;
1093		ppr->pr_uref++;
1094		mtx_unlock(&ppr->pr_mtx);
1095		pr = malloc(sizeof(*pr), M_PRISON, M_WAITOK | M_ZERO);
1096		if (jid == 0) {
1097			/* Find the next free jid. */
1098			jid = lastprid + 1;
1099 findnext:
1100			if (jid == JAIL_MAX)
1101				jid = 1;
1102			TAILQ_FOREACH(tpr, &allprison, pr_list) {
1103				if (tpr->pr_id < jid)
1104					continue;
1105				if (tpr->pr_id > jid || tpr->pr_ref == 0) {
1106					TAILQ_INSERT_BEFORE(tpr, pr, pr_list);
1107					break;
1108				}
1109				if (jid == lastprid) {
1110					error = EAGAIN;
1111					vfs_opterror(opts,
1112					    "no available jail IDs");
1113					free(pr, M_PRISON);
1114					prison_deref(ppr, PD_DEREF |
1115					    PD_DEUREF | PD_LIST_XLOCKED);
1116					goto done_releroot;
1117				}
1118				jid++;
1119				goto findnext;
1120			}
1121			lastprid = jid;
1122		} else {
1123			/*
1124			 * The jail already has a jid (that did not yet exist),
1125			 * so just find where to insert it.
1126			 */
1127			TAILQ_FOREACH(tpr, &allprison, pr_list)
1128				if (tpr->pr_id >= jid) {
1129					TAILQ_INSERT_BEFORE(tpr, pr, pr_list);
1130					break;
1131				}
1132		}
1133		if (tpr == NULL)
1134			TAILQ_INSERT_TAIL(&allprison, pr, pr_list);
1135		LIST_INSERT_HEAD(&ppr->pr_children, pr, pr_sibling);
1136		for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent)
1137			tpr->pr_childcount++;
1138
1139		pr->pr_parent = ppr;
1140		pr->pr_id = jid;
1141
1142		/* Set some default values, and inherit some from the parent. */
1143		if (name == NULL)
1144			name = "";
1145		if (path == NULL) {
1146			path = "/";
1147			root = mypr->pr_root;
1148			vref(root);
1149		}
1150		strlcpy(pr->pr_hostuuid, DEFAULT_HOSTUUID, HOSTUUIDLEN);
1151		pr->pr_flags |= PR_HOST;
1152#if defined(INET) || defined(INET6)
1153#ifdef VIMAGE
1154		if (!(pr_flags & PR_VNET))
1155#endif
1156		{
1157#ifdef INET
1158			if (!(ch_flags & PR_IP4_USER))
1159				pr->pr_flags |=
1160				    PR_IP4 | PR_IP4_USER | PR_IP4_DISABLE;
1161			else if (!(pr_flags & PR_IP4_USER)) {
1162				pr->pr_flags |= ppr->pr_flags & PR_IP4;
1163				if (ppr->pr_ip4 != NULL) {
1164					pr->pr_ip4s = ppr->pr_ip4s;
1165					pr->pr_ip4 = malloc(pr->pr_ip4s *
1166					    sizeof(struct in_addr), M_PRISON,
1167					    M_WAITOK);
1168					bcopy(ppr->pr_ip4, pr->pr_ip4,
1169					    pr->pr_ip4s * sizeof(*pr->pr_ip4));
1170				}
1171			}
1172#endif
1173#ifdef INET6
1174			if (!(ch_flags & PR_IP6_USER))
1175				pr->pr_flags |=
1176				    PR_IP6 | PR_IP6_USER | PR_IP6_DISABLE;
1177			else if (!(pr_flags & PR_IP6_USER)) {
1178				pr->pr_flags |= ppr->pr_flags & PR_IP6;
1179				if (ppr->pr_ip6 != NULL) {
1180					pr->pr_ip6s = ppr->pr_ip6s;
1181					pr->pr_ip6 = malloc(pr->pr_ip6s *
1182					    sizeof(struct in6_addr), M_PRISON,
1183					    M_WAITOK);
1184					bcopy(ppr->pr_ip6, pr->pr_ip6,
1185					    pr->pr_ip6s * sizeof(*pr->pr_ip6));
1186				}
1187			}
1188#endif
1189		}
1190#endif
1191		pr->pr_securelevel = ppr->pr_securelevel;
1192		pr->pr_allow = JAIL_DEFAULT_ALLOW & ppr->pr_allow;
1193		pr->pr_enforce_statfs = JAIL_DEFAULT_ENFORCE_STATFS;
1194
1195		LIST_INIT(&pr->pr_children);
1196		mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF | MTX_DUPOK);
1197
1198#ifdef VIMAGE
1199		/* Allocate a new vnet if specified. */
1200		pr->pr_vnet = (pr_flags & PR_VNET)
1201		    ? vnet_alloc() : ppr->pr_vnet;
1202#endif
1203		/*
1204		 * Allocate a dedicated cpuset for each jail.
1205		 * Unlike other initial settings, this may return an erorr.
1206		 */
1207		error = cpuset_create_root(ppr, &pr->pr_cpuset);
1208		if (error) {
1209			prison_deref(pr, PD_LIST_XLOCKED);
1210			goto done_releroot;
1211		}
1212
1213		mtx_lock(&pr->pr_mtx);
1214		/*
1215		 * New prisons do not yet have a reference, because we do not
1216		 * want other to see the incomplete prison once the
1217		 * allprison_lock is downgraded.
1218		 */
1219	} else {
1220		created = 0;
1221		/*
1222		 * Grab a reference for existing prisons, to ensure they
1223		 * continue to exist for the duration of the call.
1224		 */
1225		pr->pr_ref++;
1226#if defined(VIMAGE) && (defined(INET) || defined(INET6))
1227		if ((pr->pr_flags & PR_VNET) &&
1228		    (ch_flags & (PR_IP4_USER | PR_IP6_USER))) {
1229			error = EINVAL;
1230			vfs_opterror(opts,
1231			    "vnet jails cannot have IP address restrictions");
1232			goto done_deref_locked;
1233		}
1234#endif
1235#ifdef INET
1236		if (PR_IP4_USER & ch_flags & (pr_flags ^ pr->pr_flags)) {
1237			error = EINVAL;
1238			vfs_opterror(opts,
1239			    "ip4 cannot be changed after creation");
1240			goto done_deref_locked;
1241		}
1242#endif
1243#ifdef INET6
1244		if (PR_IP6_USER & ch_flags & (pr_flags ^ pr->pr_flags)) {
1245			error = EINVAL;
1246			vfs_opterror(opts,
1247			    "ip6 cannot be changed after creation");
1248			goto done_deref_locked;
1249		}
1250#endif
1251	}
1252
1253	/* Do final error checking before setting anything. */
1254	if (gotslevel) {
1255		if (slevel < ppr->pr_securelevel) {
1256			error = EPERM;
1257			goto done_deref_locked;
1258		}
1259	}
1260	if (gotchildmax) {
1261		if (childmax >= ppr->pr_childmax) {
1262			error = EPERM;
1263			goto done_deref_locked;
1264		}
1265	}
1266	if (gotenforce) {
1267		if (enforce < ppr->pr_enforce_statfs) {
1268			error = EPERM;
1269			goto done_deref_locked;
1270		}
1271	}
1272#ifdef INET
1273	if (ip4s > 0) {
1274		if (ppr->pr_flags & PR_IP4) {
1275			/*
1276			 * Make sure the new set of IP addresses is a
1277			 * subset of the parent's list.  Don't worry
1278			 * about the parent being unlocked, as any
1279			 * setting is done with allprison_lock held.
1280			 */
1281			for (ij = 0; ij < ppr->pr_ip4s; ij++)
1282				if (ip4[0].s_addr == ppr->pr_ip4[ij].s_addr)
1283					break;
1284			if (ij == ppr->pr_ip4s) {
1285				error = EPERM;
1286				goto done_deref_locked;
1287			}
1288			if (ip4s > 1) {
1289				for (ii = ij = 1; ii < ip4s; ii++) {
1290					if (ip4[ii].s_addr ==
1291					    ppr->pr_ip4[0].s_addr)
1292						continue;
1293					for (; ij < ppr->pr_ip4s; ij++)
1294						if (ip4[ii].s_addr ==
1295						    ppr->pr_ip4[ij].s_addr)
1296							break;
1297					if (ij == ppr->pr_ip4s)
1298						break;
1299				}
1300				if (ij == ppr->pr_ip4s) {
1301					error = EPERM;
1302					goto done_deref_locked;
1303				}
1304			}
1305		}
1306		/*
1307		 * Check for conflicting IP addresses.  We permit them
1308		 * if there is no more than one IP on each jail.  If
1309		 * there is a duplicate on a jail with more than one
1310		 * IP stop checking and return error.
1311		 */
1312		tppr = ppr;
1313#ifdef VIMAGE
1314		for (; tppr != &prison0; tppr = tppr->pr_parent)
1315			if (tppr->pr_flags & PR_VNET)
1316				break;
1317#endif
1318		FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) {
1319			if (tpr == pr ||
1320#ifdef VIMAGE
1321			    (tpr != tppr && (tpr->pr_flags & PR_VNET)) ||
1322#endif
1323			    tpr->pr_uref == 0) {
1324				descend = 0;
1325				continue;
1326			}
1327			if (!(tpr->pr_flags & PR_IP4_USER))
1328				continue;
1329			descend = 0;
1330			if (tpr->pr_ip4 == NULL ||
1331			    (ip4s == 1 && tpr->pr_ip4s == 1))
1332				continue;
1333			for (ii = 0; ii < ip4s; ii++) {
1334				if (_prison_check_ip4(tpr, &ip4[ii]) == 0) {
1335					error = EADDRINUSE;
1336					vfs_opterror(opts,
1337					    "IPv4 addresses clash");
1338					goto done_deref_locked;
1339				}
1340			}
1341		}
1342	}
1343#endif
1344#ifdef INET6
1345	if (ip6s > 0) {
1346		if (ppr->pr_flags & PR_IP6) {
1347			/*
1348			 * Make sure the new set of IP addresses is a
1349			 * subset of the parent's list.
1350			 */
1351			for (ij = 0; ij < ppr->pr_ip6s; ij++)
1352				if (IN6_ARE_ADDR_EQUAL(&ip6[0],
1353				    &ppr->pr_ip6[ij]))
1354					break;
1355			if (ij == ppr->pr_ip6s) {
1356				error = EPERM;
1357				goto done_deref_locked;
1358			}
1359			if (ip6s > 1) {
1360				for (ii = ij = 1; ii < ip6s; ii++) {
1361					if (IN6_ARE_ADDR_EQUAL(&ip6[ii],
1362					     &ppr->pr_ip6[0]))
1363						continue;
1364					for (; ij < ppr->pr_ip6s; ij++)
1365						if (IN6_ARE_ADDR_EQUAL(
1366						    &ip6[ii], &ppr->pr_ip6[ij]))
1367							break;
1368					if (ij == ppr->pr_ip6s)
1369						break;
1370				}
1371				if (ij == ppr->pr_ip6s) {
1372					error = EPERM;
1373					goto done_deref_locked;
1374				}
1375			}
1376		}
1377		/* Check for conflicting IP addresses. */
1378		tppr = ppr;
1379#ifdef VIMAGE
1380		for (; tppr != &prison0; tppr = tppr->pr_parent)
1381			if (tppr->pr_flags & PR_VNET)
1382				break;
1383#endif
1384		FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) {
1385			if (tpr == pr ||
1386#ifdef VIMAGE
1387			    (tpr != tppr && (tpr->pr_flags & PR_VNET)) ||
1388#endif
1389			    tpr->pr_uref == 0) {
1390				descend = 0;
1391				continue;
1392			}
1393			if (!(tpr->pr_flags & PR_IP6_USER))
1394				continue;
1395			descend = 0;
1396			if (tpr->pr_ip6 == NULL ||
1397			    (ip6s == 1 && tpr->pr_ip6s == 1))
1398				continue;
1399			for (ii = 0; ii < ip6s; ii++) {
1400				if (_prison_check_ip6(tpr, &ip6[ii]) == 0) {
1401					error = EADDRINUSE;
1402					vfs_opterror(opts,
1403					    "IPv6 addresses clash");
1404					goto done_deref_locked;
1405				}
1406			}
1407		}
1408	}
1409#endif
1410	onamelen = namelen = 0;
1411	if (name != NULL) {
1412		/* Give a default name of the jid. */
1413		if (name[0] == '\0')
1414			snprintf(name = numbuf, sizeof(numbuf), "%d", jid);
1415		else if (strtoul(name, &p, 10) != jid && *p == '\0') {
1416			error = EINVAL;
1417			vfs_opterror(opts, "name cannot be numeric");
1418			goto done_deref_locked;
1419		}
1420		/*
1421		 * Make sure the name isn't too long for the prison or its
1422		 * children.
1423		 */
1424		onamelen = strlen(pr->pr_name);
1425		namelen = strlen(name);
1426		if (strlen(ppr->pr_name) + namelen + 2 > sizeof(pr->pr_name)) {
1427			error = ENAMETOOLONG;
1428			goto done_deref_locked;
1429		}
1430		FOREACH_PRISON_DESCENDANT(pr, tpr, descend) {
1431			if (strlen(tpr->pr_name) + (namelen - onamelen) >=
1432			    sizeof(pr->pr_name)) {
1433				error = ENAMETOOLONG;
1434				goto done_deref_locked;
1435			}
1436		}
1437	}
1438	if (pr_allow & ~ppr->pr_allow) {
1439		error = EPERM;
1440		goto done_deref_locked;
1441	}
1442
1443	/* Set the parameters of the prison. */
1444#ifdef INET
1445	redo_ip4 = 0;
1446	if (pr_flags & PR_IP4_USER) {
1447		pr->pr_flags |= PR_IP4;
1448		free(pr->pr_ip4, M_PRISON);
1449		pr->pr_ip4s = ip4s;
1450		pr->pr_ip4 = ip4;
1451		ip4 = NULL;
1452		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1453#ifdef VIMAGE
1454			if (tpr->pr_flags & PR_VNET) {
1455				descend = 0;
1456				continue;
1457			}
1458#endif
1459			if (prison_restrict_ip4(tpr, NULL)) {
1460				redo_ip4 = 1;
1461				descend = 0;
1462			}
1463		}
1464	}
1465#endif
1466#ifdef INET6
1467	redo_ip6 = 0;
1468	if (pr_flags & PR_IP6_USER) {
1469		pr->pr_flags |= PR_IP6;
1470		free(pr->pr_ip6, M_PRISON);
1471		pr->pr_ip6s = ip6s;
1472		pr->pr_ip6 = ip6;
1473		ip6 = NULL;
1474		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1475#ifdef VIMAGE
1476			if (tpr->pr_flags & PR_VNET) {
1477				descend = 0;
1478				continue;
1479			}
1480#endif
1481			if (prison_restrict_ip6(tpr, NULL)) {
1482				redo_ip6 = 1;
1483				descend = 0;
1484			}
1485		}
1486	}
1487#endif
1488	if (gotslevel) {
1489		pr->pr_securelevel = slevel;
1490		/* Set all child jails to be at least this level. */
1491		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
1492			if (tpr->pr_securelevel < slevel)
1493				tpr->pr_securelevel = slevel;
1494	}
1495	if (gotchildmax) {
1496		pr->pr_childmax = childmax;
1497		/* Set all child jails to under this limit. */
1498		FOREACH_PRISON_DESCENDANT_LOCKED_LEVEL(pr, tpr, descend, level)
1499			if (tpr->pr_childmax > childmax - level)
1500				tpr->pr_childmax = childmax > level
1501				    ? childmax - level : 0;
1502	}
1503	if (gotenforce) {
1504		pr->pr_enforce_statfs = enforce;
1505		/* Pass this restriction on to the children. */
1506		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
1507			if (tpr->pr_enforce_statfs < enforce)
1508				tpr->pr_enforce_statfs = enforce;
1509	}
1510	if (name != NULL) {
1511		if (ppr == &prison0)
1512			strlcpy(pr->pr_name, name, sizeof(pr->pr_name));
1513		else
1514			snprintf(pr->pr_name, sizeof(pr->pr_name), "%s.%s",
1515			    ppr->pr_name, name);
1516		/* Change this component of child names. */
1517		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1518			bcopy(tpr->pr_name + onamelen, tpr->pr_name + namelen,
1519			    strlen(tpr->pr_name + onamelen) + 1);
1520			bcopy(pr->pr_name, tpr->pr_name, namelen);
1521		}
1522	}
1523	if (path != NULL) {
1524		/* Try to keep a real-rooted full pathname. */
1525		if (path[0] == '/' && strcmp(mypr->pr_path, "/"))
1526			snprintf(pr->pr_path, sizeof(pr->pr_path), "%s%s",
1527			    mypr->pr_path, path);
1528		else
1529			strlcpy(pr->pr_path, path, sizeof(pr->pr_path));
1530		pr->pr_root = root;
1531	}
1532	if (PR_HOST & ch_flags & ~pr_flags) {
1533		if (pr->pr_flags & PR_HOST) {
1534			/*
1535			 * Copy the parent's host info.  As with pr_ip4 above,
1536			 * the lack of a lock on the parent is not a problem;
1537			 * it is always set with allprison_lock at least
1538			 * shared, and is held exclusively here.
1539			 */
1540			strlcpy(pr->pr_hostname, pr->pr_parent->pr_hostname,
1541			    sizeof(pr->pr_hostname));
1542			strlcpy(pr->pr_domainname, pr->pr_parent->pr_domainname,
1543			    sizeof(pr->pr_domainname));
1544			strlcpy(pr->pr_hostuuid, pr->pr_parent->pr_hostuuid,
1545			    sizeof(pr->pr_hostuuid));
1546			pr->pr_hostid = pr->pr_parent->pr_hostid;
1547		}
1548	} else if (host != NULL || domain != NULL || uuid != NULL || gothid) {
1549		/* Set this prison, and any descendants without PR_HOST. */
1550		if (host != NULL)
1551			strlcpy(pr->pr_hostname, host, sizeof(pr->pr_hostname));
1552		if (domain != NULL)
1553			strlcpy(pr->pr_domainname, domain,
1554			    sizeof(pr->pr_domainname));
1555		if (uuid != NULL)
1556			strlcpy(pr->pr_hostuuid, uuid, sizeof(pr->pr_hostuuid));
1557		if (gothid)
1558			pr->pr_hostid = hid;
1559		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1560			if (tpr->pr_flags & PR_HOST)
1561				descend = 0;
1562			else {
1563				if (host != NULL)
1564					strlcpy(tpr->pr_hostname,
1565					    pr->pr_hostname,
1566					    sizeof(tpr->pr_hostname));
1567				if (domain != NULL)
1568					strlcpy(tpr->pr_domainname,
1569					    pr->pr_domainname,
1570					    sizeof(tpr->pr_domainname));
1571				if (uuid != NULL)
1572					strlcpy(tpr->pr_hostuuid,
1573					    pr->pr_hostuuid,
1574					    sizeof(tpr->pr_hostuuid));
1575				if (gothid)
1576					tpr->pr_hostid = hid;
1577			}
1578		}
1579	}
1580	if ((tallow = ch_allow & ~pr_allow)) {
1581		/* Clear allow bits in all children. */
1582		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
1583			tpr->pr_allow &= ~tallow;
1584	}
1585	pr->pr_allow = (pr->pr_allow & ~ch_allow) | pr_allow;
1586	/*
1587	 * Persistent prisons get an extra reference, and prisons losing their
1588	 * persist flag lose that reference.  Only do this for existing prisons
1589	 * for now, so new ones will remain unseen until after the module
1590	 * handlers have completed.
1591	 */
1592	if (!created && (ch_flags & PR_PERSIST & (pr_flags ^ pr->pr_flags))) {
1593		if (pr_flags & PR_PERSIST) {
1594			pr->pr_ref++;
1595			pr->pr_uref++;
1596		} else {
1597			pr->pr_ref--;
1598			pr->pr_uref--;
1599		}
1600	}
1601	pr->pr_flags = (pr->pr_flags & ~ch_flags) | pr_flags;
1602	mtx_unlock(&pr->pr_mtx);
1603
1604	/* Locks may have prevented a complete restriction of child IP
1605	 * addresses.  If so, allocate some more memory and try again.
1606	 */
1607#ifdef INET
1608	while (redo_ip4) {
1609		ip4s = pr->pr_ip4s;
1610		ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK);
1611		mtx_lock(&pr->pr_mtx);
1612		redo_ip4 = 0;
1613		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1614#ifdef VIMAGE
1615			if (tpr->pr_flags & PR_VNET) {
1616				descend = 0;
1617				continue;
1618			}
1619#endif
1620			if (prison_restrict_ip4(tpr, ip4)) {
1621				if (ip4 != NULL)
1622					ip4 = NULL;
1623				else
1624					redo_ip4 = 1;
1625			}
1626		}
1627		mtx_unlock(&pr->pr_mtx);
1628	}
1629#endif
1630#ifdef INET6
1631	while (redo_ip6) {
1632		ip6s = pr->pr_ip6s;
1633		ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK);
1634		mtx_lock(&pr->pr_mtx);
1635		redo_ip6 = 0;
1636		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1637#ifdef VIMAGE
1638			if (tpr->pr_flags & PR_VNET) {
1639				descend = 0;
1640				continue;
1641			}
1642#endif
1643			if (prison_restrict_ip6(tpr, ip6)) {
1644				if (ip6 != NULL)
1645					ip6 = NULL;
1646				else
1647					redo_ip6 = 1;
1648			}
1649		}
1650		mtx_unlock(&pr->pr_mtx);
1651	}
1652#endif
1653
1654	/* Let the modules do their work. */
1655	sx_downgrade(&allprison_lock);
1656	if (created) {
1657		error = osd_jail_call(pr, PR_METHOD_CREATE, opts);
1658		if (error) {
1659			prison_deref(pr, PD_LIST_SLOCKED);
1660			goto done_errmsg;
1661		}
1662	}
1663	error = osd_jail_call(pr, PR_METHOD_SET, opts);
1664	if (error) {
1665		prison_deref(pr, created
1666		    ? PD_LIST_SLOCKED
1667		    : PD_DEREF | PD_LIST_SLOCKED);
1668		goto done_errmsg;
1669	}
1670
1671	/* Attach this process to the prison if requested. */
1672	if (flags & JAIL_ATTACH) {
1673		mtx_lock(&pr->pr_mtx);
1674		error = do_jail_attach(td, pr);
1675		if (error) {
1676			vfs_opterror(opts, "attach failed");
1677			if (!created)
1678				prison_deref(pr, PD_DEREF);
1679			goto done_errmsg;
1680		}
1681	}
1682
1683	/*
1684	 * Now that it is all there, drop the temporary reference from existing
1685	 * prisons.  Or add a reference to newly created persistent prisons
1686	 * (which was not done earlier so that the prison would not be publicly
1687	 * visible).
1688	 */
1689	if (!created) {
1690		prison_deref(pr, (flags & JAIL_ATTACH)
1691		    ? PD_DEREF
1692		    : PD_DEREF | PD_LIST_SLOCKED);
1693	} else {
1694		if (pr_flags & PR_PERSIST) {
1695			mtx_lock(&pr->pr_mtx);
1696			pr->pr_ref++;
1697			pr->pr_uref++;
1698			mtx_unlock(&pr->pr_mtx);
1699		}
1700		if (!(flags & JAIL_ATTACH))
1701			sx_sunlock(&allprison_lock);
1702	}
1703	td->td_retval[0] = pr->pr_id;
1704	goto done_errmsg;
1705
1706 done_deref_locked:
1707	prison_deref(pr, created
1708	    ? PD_LOCKED | PD_LIST_XLOCKED
1709	    : PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED);
1710	goto done_releroot;
1711 done_unlock_list:
1712	sx_xunlock(&allprison_lock);
1713 done_releroot:
1714	if (root != NULL) {
1715		vfslocked = VFS_LOCK_GIANT(root->v_mount);
1716		vrele(root);
1717		VFS_UNLOCK_GIANT(vfslocked);
1718	}
1719 done_errmsg:
1720	if (error) {
1721		vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len);
1722		if (errmsg_len > 0) {
1723			errmsg_pos = 2 * vfs_getopt_pos(opts, "errmsg") + 1;
1724			if (errmsg_pos > 0) {
1725				if (optuio->uio_segflg == UIO_SYSSPACE)
1726					bcopy(errmsg,
1727					   optuio->uio_iov[errmsg_pos].iov_base,
1728					   errmsg_len);
1729				else
1730					copyout(errmsg,
1731					   optuio->uio_iov[errmsg_pos].iov_base,
1732					   errmsg_len);
1733			}
1734		}
1735	}
1736 done_free:
1737#ifdef INET
1738	free(ip4, M_PRISON);
1739#endif
1740#ifdef INET6
1741	free(ip6, M_PRISON);
1742#endif
1743	vfs_freeopts(opts);
1744	return (error);
1745}
1746
1747
1748/*
1749 * struct jail_get_args {
1750 *	struct iovec *iovp;
1751 *	unsigned int iovcnt;
1752 *	int flags;
1753 * };
1754 */
1755int
1756jail_get(struct thread *td, struct jail_get_args *uap)
1757{
1758	struct uio *auio;
1759	int error;
1760
1761	/* Check that we have an even number of iovecs. */
1762	if (uap->iovcnt & 1)
1763		return (EINVAL);
1764
1765	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
1766	if (error)
1767		return (error);
1768	error = kern_jail_get(td, auio, uap->flags);
1769	if (error == 0)
1770		error = copyout(auio->uio_iov, uap->iovp,
1771		    uap->iovcnt * sizeof (struct iovec));
1772	free(auio, M_IOV);
1773	return (error);
1774}
1775
1776int
1777kern_jail_get(struct thread *td, struct uio *optuio, int flags)
1778{
1779	struct prison *pr, *mypr;
1780	struct vfsopt *opt;
1781	struct vfsoptlist *opts;
1782	char *errmsg, *name;
1783	int error, errmsg_len, errmsg_pos, fi, i, jid, len, locked, pos;
1784
1785	if (flags & ~JAIL_GET_MASK)
1786		return (EINVAL);
1787
1788	/* Get the parameter list. */
1789	error = vfs_buildopts(optuio, &opts);
1790	if (error)
1791		return (error);
1792	errmsg_pos = vfs_getopt_pos(opts, "errmsg");
1793	mypr = td->td_ucred->cr_prison;
1794
1795	/*
1796	 * Find the prison specified by one of: lastjid, jid, name.
1797	 */
1798	sx_slock(&allprison_lock);
1799	error = vfs_copyopt(opts, "lastjid", &jid, sizeof(jid));
1800	if (error == 0) {
1801		TAILQ_FOREACH(pr, &allprison, pr_list) {
1802			if (pr->pr_id > jid && prison_ischild(mypr, pr)) {
1803				mtx_lock(&pr->pr_mtx);
1804				if (pr->pr_ref > 0 &&
1805				    (pr->pr_uref > 0 || (flags & JAIL_DYING)))
1806					break;
1807				mtx_unlock(&pr->pr_mtx);
1808			}
1809		}
1810		if (pr != NULL)
1811			goto found_prison;
1812		error = ENOENT;
1813		vfs_opterror(opts, "no jail after %d", jid);
1814		goto done_unlock_list;
1815	} else if (error != ENOENT)
1816		goto done_unlock_list;
1817
1818	error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
1819	if (error == 0) {
1820		if (jid != 0) {
1821			pr = prison_find_child(mypr, jid);
1822			if (pr != NULL) {
1823				if (pr->pr_uref == 0 && !(flags & JAIL_DYING)) {
1824					mtx_unlock(&pr->pr_mtx);
1825					error = ENOENT;
1826					vfs_opterror(opts, "jail %d is dying",
1827					    jid);
1828					goto done_unlock_list;
1829				}
1830				goto found_prison;
1831			}
1832			error = ENOENT;
1833			vfs_opterror(opts, "jail %d not found", jid);
1834			goto done_unlock_list;
1835		}
1836	} else if (error != ENOENT)
1837		goto done_unlock_list;
1838
1839	error = vfs_getopt(opts, "name", (void **)&name, &len);
1840	if (error == 0) {
1841		if (len == 0 || name[len - 1] != '\0') {
1842			error = EINVAL;
1843			goto done_unlock_list;
1844		}
1845		pr = prison_find_name(mypr, name);
1846		if (pr != NULL) {
1847			if (pr->pr_uref == 0 && !(flags & JAIL_DYING)) {
1848				mtx_unlock(&pr->pr_mtx);
1849				error = ENOENT;
1850				vfs_opterror(opts, "jail \"%s\" is dying",
1851				    name);
1852				goto done_unlock_list;
1853			}
1854			goto found_prison;
1855		}
1856		error = ENOENT;
1857		vfs_opterror(opts, "jail \"%s\" not found", name);
1858		goto done_unlock_list;
1859	} else if (error != ENOENT)
1860		goto done_unlock_list;
1861
1862	vfs_opterror(opts, "no jail specified");
1863	error = ENOENT;
1864	goto done_unlock_list;
1865
1866 found_prison:
1867	/* Get the parameters of the prison. */
1868	pr->pr_ref++;
1869	locked = PD_LOCKED;
1870	td->td_retval[0] = pr->pr_id;
1871	error = vfs_setopt(opts, "jid", &pr->pr_id, sizeof(pr->pr_id));
1872	if (error != 0 && error != ENOENT)
1873		goto done_deref;
1874	i = (pr->pr_parent == mypr) ? 0 : pr->pr_parent->pr_id;
1875	error = vfs_setopt(opts, "parent", &i, sizeof(i));
1876	if (error != 0 && error != ENOENT)
1877		goto done_deref;
1878	error = vfs_setopts(opts, "name", prison_name(mypr, pr));
1879	if (error != 0 && error != ENOENT)
1880		goto done_deref;
1881	error = vfs_setopt(opts, "cpuset.id", &pr->pr_cpuset->cs_id,
1882	    sizeof(pr->pr_cpuset->cs_id));
1883	if (error != 0 && error != ENOENT)
1884		goto done_deref;
1885	error = vfs_setopts(opts, "path", prison_path(mypr, pr));
1886	if (error != 0 && error != ENOENT)
1887		goto done_deref;
1888#ifdef INET
1889	error = vfs_setopt_part(opts, "ip4.addr", pr->pr_ip4,
1890	    pr->pr_ip4s * sizeof(*pr->pr_ip4));
1891	if (error != 0 && error != ENOENT)
1892		goto done_deref;
1893#endif
1894#ifdef INET6
1895	error = vfs_setopt_part(opts, "ip6.addr", pr->pr_ip6,
1896	    pr->pr_ip6s * sizeof(*pr->pr_ip6));
1897	if (error != 0 && error != ENOENT)
1898		goto done_deref;
1899#endif
1900	error = vfs_setopt(opts, "securelevel", &pr->pr_securelevel,
1901	    sizeof(pr->pr_securelevel));
1902	if (error != 0 && error != ENOENT)
1903		goto done_deref;
1904	error = vfs_setopt(opts, "children.cur", &pr->pr_childcount,
1905	    sizeof(pr->pr_childcount));
1906	if (error != 0 && error != ENOENT)
1907		goto done_deref;
1908	error = vfs_setopt(opts, "children.max", &pr->pr_childmax,
1909	    sizeof(pr->pr_childmax));
1910	if (error != 0 && error != ENOENT)
1911		goto done_deref;
1912	error = vfs_setopts(opts, "host.hostname", pr->pr_hostname);
1913	if (error != 0 && error != ENOENT)
1914		goto done_deref;
1915	error = vfs_setopts(opts, "host.domainname", pr->pr_domainname);
1916	if (error != 0 && error != ENOENT)
1917		goto done_deref;
1918	error = vfs_setopts(opts, "host.hostuuid", pr->pr_hostuuid);
1919	if (error != 0 && error != ENOENT)
1920		goto done_deref;
1921#ifdef COMPAT_IA32
1922	if (td->td_proc->p_sysent->sv_flags & SV_IA32) {
1923		uint32_t hid32 = pr->pr_hostid;
1924
1925		error = vfs_setopt(opts, "host.hostid", &hid32, sizeof(hid32));
1926	} else
1927#endif
1928	error = vfs_setopt(opts, "host.hostid", &pr->pr_hostid,
1929	    sizeof(pr->pr_hostid));
1930	if (error != 0 && error != ENOENT)
1931		goto done_deref;
1932	error = vfs_setopt(opts, "enforce_statfs", &pr->pr_enforce_statfs,
1933	    sizeof(pr->pr_enforce_statfs));
1934	if (error != 0 && error != ENOENT)
1935		goto done_deref;
1936	for (fi = 0; fi < sizeof(pr_flag_names) / sizeof(pr_flag_names[0]);
1937	    fi++) {
1938		if (pr_flag_names[fi] == NULL)
1939			continue;
1940		i = (pr->pr_flags & (1 << fi)) ? 1 : 0;
1941		error = vfs_setopt(opts, pr_flag_names[fi], &i, sizeof(i));
1942		if (error != 0 && error != ENOENT)
1943			goto done_deref;
1944		i = !i;
1945		error = vfs_setopt(opts, pr_flag_nonames[fi], &i, sizeof(i));
1946		if (error != 0 && error != ENOENT)
1947			goto done_deref;
1948	}
1949	for (fi = 0; fi < sizeof(pr_flag_jailsys) / sizeof(pr_flag_jailsys[0]);
1950	    fi++) {
1951		i = pr->pr_flags &
1952		    (pr_flag_jailsys[fi].disable | pr_flag_jailsys[fi].new);
1953		i = pr_flag_jailsys[fi].disable &&
1954		      (i == pr_flag_jailsys[fi].disable) ? JAIL_SYS_DISABLE
1955		    : (i == pr_flag_jailsys[fi].new) ? JAIL_SYS_NEW
1956		    : JAIL_SYS_INHERIT;
1957		error =
1958		    vfs_setopt(opts, pr_flag_jailsys[fi].name, &i, sizeof(i));
1959		if (error != 0 && error != ENOENT)
1960			goto done_deref;
1961	}
1962	for (fi = 0; fi < sizeof(pr_allow_names) / sizeof(pr_allow_names[0]);
1963	    fi++) {
1964		if (pr_allow_names[fi] == NULL)
1965			continue;
1966		i = (pr->pr_allow & (1 << fi)) ? 1 : 0;
1967		error = vfs_setopt(opts, pr_allow_names[fi], &i, sizeof(i));
1968		if (error != 0 && error != ENOENT)
1969			goto done_deref;
1970		i = !i;
1971		error = vfs_setopt(opts, pr_allow_nonames[fi], &i, sizeof(i));
1972		if (error != 0 && error != ENOENT)
1973			goto done_deref;
1974	}
1975	i = (pr->pr_uref == 0);
1976	error = vfs_setopt(opts, "dying", &i, sizeof(i));
1977	if (error != 0 && error != ENOENT)
1978		goto done_deref;
1979	i = !i;
1980	error = vfs_setopt(opts, "nodying", &i, sizeof(i));
1981	if (error != 0 && error != ENOENT)
1982		goto done_deref;
1983
1984	/* Get the module parameters. */
1985	mtx_unlock(&pr->pr_mtx);
1986	locked = 0;
1987	error = osd_jail_call(pr, PR_METHOD_GET, opts);
1988	if (error)
1989		goto done_deref;
1990	prison_deref(pr, PD_DEREF | PD_LIST_SLOCKED);
1991
1992	/* By now, all parameters should have been noted. */
1993	TAILQ_FOREACH(opt, opts, link) {
1994		if (!opt->seen && strcmp(opt->name, "errmsg")) {
1995			error = EINVAL;
1996			vfs_opterror(opts, "unknown parameter: %s", opt->name);
1997			goto done_errmsg;
1998		}
1999	}
2000
2001	/* Write the fetched parameters back to userspace. */
2002	error = 0;
2003	TAILQ_FOREACH(opt, opts, link) {
2004		if (opt->pos >= 0 && opt->pos != errmsg_pos) {
2005			pos = 2 * opt->pos + 1;
2006			optuio->uio_iov[pos].iov_len = opt->len;
2007			if (opt->value != NULL) {
2008				if (optuio->uio_segflg == UIO_SYSSPACE) {
2009					bcopy(opt->value,
2010					    optuio->uio_iov[pos].iov_base,
2011					    opt->len);
2012				} else {
2013					error = copyout(opt->value,
2014					    optuio->uio_iov[pos].iov_base,
2015					    opt->len);
2016					if (error)
2017						break;
2018				}
2019			}
2020		}
2021	}
2022	goto done_errmsg;
2023
2024 done_deref:
2025	prison_deref(pr, locked | PD_DEREF | PD_LIST_SLOCKED);
2026	goto done_errmsg;
2027
2028 done_unlock_list:
2029	sx_sunlock(&allprison_lock);
2030 done_errmsg:
2031	if (error && errmsg_pos >= 0) {
2032		vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len);
2033		errmsg_pos = 2 * errmsg_pos + 1;
2034		if (errmsg_len > 0) {
2035			if (optuio->uio_segflg == UIO_SYSSPACE)
2036				bcopy(errmsg,
2037				    optuio->uio_iov[errmsg_pos].iov_base,
2038				    errmsg_len);
2039			else
2040				copyout(errmsg,
2041				    optuio->uio_iov[errmsg_pos].iov_base,
2042				    errmsg_len);
2043		}
2044	}
2045	vfs_freeopts(opts);
2046	return (error);
2047}
2048
2049
2050/*
2051 * struct jail_remove_args {
2052 *	int jid;
2053 * };
2054 */
2055int
2056jail_remove(struct thread *td, struct jail_remove_args *uap)
2057{
2058	struct prison *pr, *cpr, *lpr, *tpr;
2059	int descend, error;
2060
2061	error = priv_check(td, PRIV_JAIL_REMOVE);
2062	if (error)
2063		return (error);
2064
2065	sx_xlock(&allprison_lock);
2066	pr = prison_find_child(td->td_ucred->cr_prison, uap->jid);
2067	if (pr == NULL) {
2068		sx_xunlock(&allprison_lock);
2069		return (EINVAL);
2070	}
2071
2072	/* Remove all descendants of this prison, then remove this prison. */
2073	pr->pr_ref++;
2074	pr->pr_flags |= PR_REMOVE;
2075	if (!LIST_EMPTY(&pr->pr_children)) {
2076		mtx_unlock(&pr->pr_mtx);
2077		lpr = NULL;
2078		FOREACH_PRISON_DESCENDANT(pr, cpr, descend) {
2079			mtx_lock(&cpr->pr_mtx);
2080			if (cpr->pr_ref > 0) {
2081				tpr = cpr;
2082				cpr->pr_ref++;
2083				cpr->pr_flags |= PR_REMOVE;
2084			} else {
2085				/* Already removed - do not do it again. */
2086				tpr = NULL;
2087			}
2088			mtx_unlock(&cpr->pr_mtx);
2089			if (lpr != NULL) {
2090				mtx_lock(&lpr->pr_mtx);
2091				prison_remove_one(lpr);
2092				sx_xlock(&allprison_lock);
2093			}
2094			lpr = tpr;
2095		}
2096		if (lpr != NULL) {
2097			mtx_lock(&lpr->pr_mtx);
2098			prison_remove_one(lpr);
2099			sx_xlock(&allprison_lock);
2100		}
2101		mtx_lock(&pr->pr_mtx);
2102	}
2103	prison_remove_one(pr);
2104	return (0);
2105}
2106
2107static void
2108prison_remove_one(struct prison *pr)
2109{
2110	struct proc *p;
2111	int deuref;
2112
2113	/* If the prison was persistent, it is not anymore. */
2114	deuref = 0;
2115	if (pr->pr_flags & PR_PERSIST) {
2116		pr->pr_ref--;
2117		deuref = PD_DEUREF;
2118		pr->pr_flags &= ~PR_PERSIST;
2119	}
2120
2121	/*
2122	 * jail_remove added a reference.  If that's the only one, remove
2123	 * the prison now.
2124	 */
2125	KASSERT(pr->pr_ref > 0,
2126	    ("prison_remove_one removing a dead prison (jid=%d)", pr->pr_id));
2127	if (pr->pr_ref == 1) {
2128		prison_deref(pr,
2129		    deuref | PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED);
2130		return;
2131	}
2132
2133	mtx_unlock(&pr->pr_mtx);
2134	sx_xunlock(&allprison_lock);
2135	/*
2136	 * Kill all processes unfortunate enough to be attached to this prison.
2137	 */
2138	sx_slock(&allproc_lock);
2139	LIST_FOREACH(p, &allproc, p_list) {
2140		PROC_LOCK(p);
2141		if (p->p_state != PRS_NEW && p->p_ucred &&
2142		    p->p_ucred->cr_prison == pr)
2143			psignal(p, SIGKILL);
2144		PROC_UNLOCK(p);
2145	}
2146	sx_sunlock(&allproc_lock);
2147	/* Remove the temporary reference added by jail_remove. */
2148	prison_deref(pr, deuref | PD_DEREF);
2149}
2150
2151
2152/*
2153 * struct jail_attach_args {
2154 *	int jid;
2155 * };
2156 */
2157int
2158jail_attach(struct thread *td, struct jail_attach_args *uap)
2159{
2160	struct prison *pr;
2161	int error;
2162
2163	error = priv_check(td, PRIV_JAIL_ATTACH);
2164	if (error)
2165		return (error);
2166
2167	sx_slock(&allprison_lock);
2168	pr = prison_find_child(td->td_ucred->cr_prison, uap->jid);
2169	if (pr == NULL) {
2170		sx_sunlock(&allprison_lock);
2171		return (EINVAL);
2172	}
2173
2174	/*
2175	 * Do not allow a process to attach to a prison that is not
2176	 * considered to be "alive".
2177	 */
2178	if (pr->pr_uref == 0) {
2179		mtx_unlock(&pr->pr_mtx);
2180		sx_sunlock(&allprison_lock);
2181		return (EINVAL);
2182	}
2183
2184	return (do_jail_attach(td, pr));
2185}
2186
2187static int
2188do_jail_attach(struct thread *td, struct prison *pr)
2189{
2190	struct prison *ppr;
2191	struct proc *p;
2192	struct ucred *newcred, *oldcred;
2193	int vfslocked, error;
2194
2195	/*
2196	 * XXX: Note that there is a slight race here if two threads
2197	 * in the same privileged process attempt to attach to two
2198	 * different jails at the same time.  It is important for
2199	 * user processes not to do this, or they might end up with
2200	 * a process root from one prison, but attached to the jail
2201	 * of another.
2202	 */
2203	pr->pr_ref++;
2204	pr->pr_uref++;
2205	mtx_unlock(&pr->pr_mtx);
2206
2207	/* Let modules do whatever they need to prepare for attaching. */
2208	error = osd_jail_call(pr, PR_METHOD_ATTACH, td);
2209	if (error) {
2210		prison_deref(pr, PD_DEREF | PD_DEUREF | PD_LIST_SLOCKED);
2211		return (error);
2212	}
2213	sx_sunlock(&allprison_lock);
2214
2215	/*
2216	 * Reparent the newly attached process to this jail.
2217	 */
2218	ppr = td->td_ucred->cr_prison;
2219	p = td->td_proc;
2220	error = cpuset_setproc_update_set(p, pr->pr_cpuset);
2221	if (error)
2222		goto e_revert_osd;
2223
2224	vfslocked = VFS_LOCK_GIANT(pr->pr_root->v_mount);
2225	vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY);
2226	if ((error = change_dir(pr->pr_root, td)) != 0)
2227		goto e_unlock;
2228#ifdef MAC
2229	if ((error = mac_vnode_check_chroot(td->td_ucred, pr->pr_root)))
2230		goto e_unlock;
2231#endif
2232	VOP_UNLOCK(pr->pr_root, 0);
2233	if ((error = change_root(pr->pr_root, td)))
2234		goto e_unlock_giant;
2235	VFS_UNLOCK_GIANT(vfslocked);
2236
2237	newcred = crget();
2238	PROC_LOCK(p);
2239	oldcred = p->p_ucred;
2240	setsugid(p);
2241	crcopy(newcred, oldcred);
2242	newcred->cr_prison = pr;
2243	p->p_ucred = newcred;
2244	PROC_UNLOCK(p);
2245	crfree(oldcred);
2246	prison_deref(ppr, PD_DEREF | PD_DEUREF);
2247	return (0);
2248 e_unlock:
2249	VOP_UNLOCK(pr->pr_root, 0);
2250 e_unlock_giant:
2251	VFS_UNLOCK_GIANT(vfslocked);
2252 e_revert_osd:
2253	/* Tell modules this thread is still in its old jail after all. */
2254	(void)osd_jail_call(ppr, PR_METHOD_ATTACH, td);
2255	prison_deref(pr, PD_DEREF | PD_DEUREF);
2256	return (error);
2257}
2258
2259
2260/*
2261 * Returns a locked prison instance, or NULL on failure.
2262 */
2263struct prison *
2264prison_find(int prid)
2265{
2266	struct prison *pr;
2267
2268	sx_assert(&allprison_lock, SX_LOCKED);
2269	TAILQ_FOREACH(pr, &allprison, pr_list) {
2270		if (pr->pr_id == prid) {
2271			mtx_lock(&pr->pr_mtx);
2272			if (pr->pr_ref > 0)
2273				return (pr);
2274			mtx_unlock(&pr->pr_mtx);
2275		}
2276	}
2277	return (NULL);
2278}
2279
2280/*
2281 * Find a prison that is a descendant of mypr.  Returns a locked prison or NULL.
2282 */
2283struct prison *
2284prison_find_child(struct prison *mypr, int prid)
2285{
2286	struct prison *pr;
2287	int descend;
2288
2289	sx_assert(&allprison_lock, SX_LOCKED);
2290	FOREACH_PRISON_DESCENDANT(mypr, pr, descend) {
2291		if (pr->pr_id == prid) {
2292			mtx_lock(&pr->pr_mtx);
2293			if (pr->pr_ref > 0)
2294				return (pr);
2295			mtx_unlock(&pr->pr_mtx);
2296		}
2297	}
2298	return (NULL);
2299}
2300
2301/*
2302 * Look for the name relative to mypr.  Returns a locked prison or NULL.
2303 */
2304struct prison *
2305prison_find_name(struct prison *mypr, const char *name)
2306{
2307	struct prison *pr, *deadpr;
2308	size_t mylen;
2309	int descend;
2310
2311	sx_assert(&allprison_lock, SX_LOCKED);
2312	mylen = (mypr == &prison0) ? 0 : strlen(mypr->pr_name) + 1;
2313 again:
2314	deadpr = NULL;
2315	FOREACH_PRISON_DESCENDANT(mypr, pr, descend) {
2316		if (!strcmp(pr->pr_name + mylen, name)) {
2317			mtx_lock(&pr->pr_mtx);
2318			if (pr->pr_ref > 0) {
2319				if (pr->pr_uref > 0)
2320					return (pr);
2321				deadpr = pr;
2322			}
2323			mtx_unlock(&pr->pr_mtx);
2324		}
2325	}
2326	/* There was no valid prison - perhaps there was a dying one. */
2327	if (deadpr != NULL) {
2328		mtx_lock(&deadpr->pr_mtx);
2329		if (deadpr->pr_ref == 0) {
2330			mtx_unlock(&deadpr->pr_mtx);
2331			goto again;
2332		}
2333	}
2334	return (deadpr);
2335}
2336
2337/*
2338 * See if a prison has the specific flag set.
2339 */
2340int
2341prison_flag(struct ucred *cred, unsigned flag)
2342{
2343
2344	/* This is an atomic read, so no locking is necessary. */
2345	return (cred->cr_prison->pr_flags & flag);
2346}
2347
2348int
2349prison_allow(struct ucred *cred, unsigned flag)
2350{
2351
2352	/* This is an atomic read, so no locking is necessary. */
2353	return (cred->cr_prison->pr_allow & flag);
2354}
2355
2356/*
2357 * Remove a prison reference.  If that was the last reference, remove the
2358 * prison itself - but not in this context in case there are locks held.
2359 */
2360void
2361prison_free_locked(struct prison *pr)
2362{
2363
2364	mtx_assert(&pr->pr_mtx, MA_OWNED);
2365	pr->pr_ref--;
2366	if (pr->pr_ref == 0) {
2367		mtx_unlock(&pr->pr_mtx);
2368		TASK_INIT(&pr->pr_task, 0, prison_complete, pr);
2369		taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
2370		return;
2371	}
2372	mtx_unlock(&pr->pr_mtx);
2373}
2374
2375void
2376prison_free(struct prison *pr)
2377{
2378
2379	mtx_lock(&pr->pr_mtx);
2380	prison_free_locked(pr);
2381}
2382
/*
 * Task handler queued by prison_free_locked(): context is the prison whose
 * last reference was dropped.  Calls prison_deref() with no flags, i.e.
 * without removing any further reference.  The pending count is unused.
 */
static void
prison_complete(void *context, int pending)
{
	struct prison *pr;

	pr = context;
	prison_deref(pr, 0);
}
2389
2390/*
2391 * Remove a prison reference (usually).  This internal version assumes no
2392 * mutexes are held, except perhaps the prison itself.  If there are no more
2393 * references, release and delist the prison.  On completion, the prison lock
2394 * and the allprison lock are both unlocked.
2395 */
2396static void
2397prison_deref(struct prison *pr, int flags)
2398{
2399	struct prison *ppr, *tpr;
2400	int vfslocked;
2401
2402	if (!(flags & PD_LOCKED))
2403		mtx_lock(&pr->pr_mtx);
2404	/* Decrement the user references in a separate loop. */
2405	if (flags & PD_DEUREF) {
2406		for (tpr = pr;; tpr = tpr->pr_parent) {
2407			if (tpr != pr)
2408				mtx_lock(&tpr->pr_mtx);
2409			if (--tpr->pr_uref > 0)
2410				break;
2411			KASSERT(tpr != &prison0, ("prison0 pr_uref=0"));
2412			mtx_unlock(&tpr->pr_mtx);
2413		}
2414		/* Done if there were only user references to remove. */
2415		if (!(flags & PD_DEREF)) {
2416			mtx_unlock(&tpr->pr_mtx);
2417			if (flags & PD_LIST_SLOCKED)
2418				sx_sunlock(&allprison_lock);
2419			else if (flags & PD_LIST_XLOCKED)
2420				sx_xunlock(&allprison_lock);
2421			return;
2422		}
2423		if (tpr != pr) {
2424			mtx_unlock(&tpr->pr_mtx);
2425			mtx_lock(&pr->pr_mtx);
2426		}
2427	}
2428
2429	for (;;) {
2430		if (flags & PD_DEREF)
2431			pr->pr_ref--;
2432		/* If the prison still has references, nothing else to do. */
2433		if (pr->pr_ref > 0) {
2434			mtx_unlock(&pr->pr_mtx);
2435			if (flags & PD_LIST_SLOCKED)
2436				sx_sunlock(&allprison_lock);
2437			else if (flags & PD_LIST_XLOCKED)
2438				sx_xunlock(&allprison_lock);
2439			return;
2440		}
2441
2442		mtx_unlock(&pr->pr_mtx);
2443		if (flags & PD_LIST_SLOCKED) {
2444			if (!sx_try_upgrade(&allprison_lock)) {
2445				sx_sunlock(&allprison_lock);
2446				sx_xlock(&allprison_lock);
2447			}
2448		} else if (!(flags & PD_LIST_XLOCKED))
2449			sx_xlock(&allprison_lock);
2450
2451		TAILQ_REMOVE(&allprison, pr, pr_list);
2452		LIST_REMOVE(pr, pr_sibling);
2453		ppr = pr->pr_parent;
2454		for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent)
2455			tpr->pr_childcount--;
2456		sx_downgrade(&allprison_lock);
2457
2458#ifdef VIMAGE
2459		if (pr->pr_flags & PR_VNET)
2460			vnet_destroy(pr->pr_vnet);
2461#endif
2462		if (pr->pr_root != NULL) {
2463			vfslocked = VFS_LOCK_GIANT(pr->pr_root->v_mount);
2464			vrele(pr->pr_root);
2465			VFS_UNLOCK_GIANT(vfslocked);
2466		}
2467		mtx_destroy(&pr->pr_mtx);
2468#ifdef INET
2469		free(pr->pr_ip4, M_PRISON);
2470#endif
2471#ifdef INET6
2472		free(pr->pr_ip6, M_PRISON);
2473#endif
2474		if (pr->pr_cpuset != NULL)
2475			cpuset_rel(pr->pr_cpuset);
2476		osd_jail_exit(pr);
2477		free(pr, M_PRISON);
2478
2479		/* Removing a prison frees a reference on its parent. */
2480		pr = ppr;
2481		mtx_lock(&pr->pr_mtx);
2482		flags = PD_DEREF | PD_LIST_SLOCKED;
2483	}
2484}
2485
2486void
2487prison_hold_locked(struct prison *pr)
2488{
2489
2490	mtx_assert(&pr->pr_mtx, MA_OWNED);
2491	KASSERT(pr->pr_ref > 0,
2492	    ("Trying to hold dead prison (jid=%d).", pr->pr_id));
2493	pr->pr_ref++;
2494}
2495
2496void
2497prison_hold(struct prison *pr)
2498{
2499
2500	mtx_lock(&pr->pr_mtx);
2501	prison_hold_locked(pr);
2502	mtx_unlock(&pr->pr_mtx);
2503}
2504
2505void
2506prison_proc_hold(struct prison *pr)
2507{
2508
2509	mtx_lock(&pr->pr_mtx);
2510	KASSERT(pr->pr_uref > 0,
2511	    ("Cannot add a process to a non-alive prison (jid=%d)", pr->pr_id));
2512	pr->pr_uref++;
2513	mtx_unlock(&pr->pr_mtx);
2514}
2515
2516void
2517prison_proc_free(struct prison *pr)
2518{
2519
2520	mtx_lock(&pr->pr_mtx);
2521	KASSERT(pr->pr_uref > 0,
2522	    ("Trying to kill a process in a dead prison (jid=%d)", pr->pr_id));
2523	prison_deref(pr, PD_DEUREF | PD_LOCKED);
2524}
2525
2526
2527#ifdef INET
2528/*
2529 * Restrict a prison's IP address list with its parent's, possibly replacing
2530 * it.  Return true if the replacement buffer was used (or would have been).
2531 */
2532static int
2533prison_restrict_ip4(struct prison *pr, struct in_addr *newip4)
2534{
2535	int ii, ij, used;
2536	struct prison *ppr;
2537
2538	ppr = pr->pr_parent;
2539	if (!(pr->pr_flags & PR_IP4_USER)) {
2540		/* This has no user settings, so just copy the parent's list. */
2541		if (pr->pr_ip4s < ppr->pr_ip4s) {
2542			/*
2543			 * There's no room for the parent's list.  Use the
2544			 * new list buffer, which is assumed to be big enough
2545			 * (if it was passed).  If there's no buffer, try to
2546			 * allocate one.
2547			 */
2548			used = 1;
2549			if (newip4 == NULL) {
2550				newip4 = malloc(ppr->pr_ip4s * sizeof(*newip4),
2551				    M_PRISON, M_NOWAIT);
2552				if (newip4 != NULL)
2553					used = 0;
2554			}
2555			if (newip4 != NULL) {
2556				bcopy(ppr->pr_ip4, newip4,
2557				    ppr->pr_ip4s * sizeof(*newip4));
2558				free(pr->pr_ip4, M_PRISON);
2559				pr->pr_ip4 = newip4;
2560				pr->pr_ip4s = ppr->pr_ip4s;
2561			}
2562			return (used);
2563		}
2564		pr->pr_ip4s = ppr->pr_ip4s;
2565		if (pr->pr_ip4s > 0)
2566			bcopy(ppr->pr_ip4, pr->pr_ip4,
2567			    pr->pr_ip4s * sizeof(*newip4));
2568		else if (pr->pr_ip4 != NULL) {
2569			free(pr->pr_ip4, M_PRISON);
2570			pr->pr_ip4 = NULL;
2571		}
2572	} else if (pr->pr_ip4s > 0) {
2573		/* Remove addresses that aren't in the parent. */
2574		for (ij = 0; ij < ppr->pr_ip4s; ij++)
2575			if (pr->pr_ip4[0].s_addr == ppr->pr_ip4[ij].s_addr)
2576				break;
2577		if (ij < ppr->pr_ip4s)
2578			ii = 1;
2579		else {
2580			bcopy(pr->pr_ip4 + 1, pr->pr_ip4,
2581			    --pr->pr_ip4s * sizeof(*pr->pr_ip4));
2582			ii = 0;
2583		}
2584		for (ij = 1; ii < pr->pr_ip4s; ) {
2585			if (pr->pr_ip4[ii].s_addr == ppr->pr_ip4[0].s_addr) {
2586				ii++;
2587				continue;
2588			}
2589			switch (ij >= ppr->pr_ip4s ? -1 :
2590				qcmp_v4(&pr->pr_ip4[ii], &ppr->pr_ip4[ij])) {
2591			case -1:
2592				bcopy(pr->pr_ip4 + ii + 1, pr->pr_ip4 + ii,
2593				    (--pr->pr_ip4s - ii) * sizeof(*pr->pr_ip4));
2594				break;
2595			case 0:
2596				ii++;
2597				ij++;
2598				break;
2599			case 1:
2600				ij++;
2601				break;
2602			}
2603		}
2604		if (pr->pr_ip4s == 0) {
2605			pr->pr_flags |= PR_IP4_DISABLE;
2606			free(pr->pr_ip4, M_PRISON);
2607			pr->pr_ip4 = NULL;
2608		}
2609	}
2610	return (0);
2611}
2612
2613/*
2614 * Pass back primary IPv4 address of this jail.
2615 *
2616 * If not restricted return success but do not alter the address.  Caller has
2617 * to make sure to initialize it correctly (e.g. INADDR_ANY).
2618 *
2619 * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv4.
2620 * Address returned in NBO.
2621 */
2622int
2623prison_get_ip4(struct ucred *cred, struct in_addr *ia)
2624{
2625	struct prison *pr;
2626
2627	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
2628	KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
2629
2630	pr = cred->cr_prison;
2631	if (!(pr->pr_flags & PR_IP4))
2632		return (0);
2633	mtx_lock(&pr->pr_mtx);
2634	if (!(pr->pr_flags & PR_IP4)) {
2635		mtx_unlock(&pr->pr_mtx);
2636		return (0);
2637	}
2638	if (pr->pr_ip4 == NULL) {
2639		mtx_unlock(&pr->pr_mtx);
2640		return (EAFNOSUPPORT);
2641	}
2642
2643	ia->s_addr = pr->pr_ip4[0].s_addr;
2644	mtx_unlock(&pr->pr_mtx);
2645	return (0);
2646}
2647
2648/*
2649 * Return true if pr1 and pr2 have the same IPv4 address restrictions.
2650 */
2651int
2652prison_equal_ip4(struct prison *pr1, struct prison *pr2)
2653{
2654
2655	if (pr1 == pr2)
2656		return (1);
2657
2658	/*
2659	 * No need to lock since the PR_IP4_USER flag can't be altered for
2660	 * existing prisons.
2661	 */
2662	while (pr1 != &prison0 &&
2663#ifdef VIMAGE
2664	       !(pr1->pr_flags & PR_VNET) &&
2665#endif
2666	       !(pr1->pr_flags & PR_IP4_USER))
2667		pr1 = pr1->pr_parent;
2668	while (pr2 != &prison0 &&
2669#ifdef VIMAGE
2670	       !(pr2->pr_flags & PR_VNET) &&
2671#endif
2672	       !(pr2->pr_flags & PR_IP4_USER))
2673		pr2 = pr2->pr_parent;
2674	return (pr1 == pr2);
2675}
2676
2677/*
2678 * Make sure our (source) address is set to something meaningful to this
2679 * jail.
2680 *
2681 * Returns 0 if jail doesn't restrict IPv4 or if address belongs to jail,
2682 * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail
2683 * doesn't allow IPv4.  Address passed in in NBO and returned in NBO.
2684 */
2685int
2686prison_local_ip4(struct ucred *cred, struct in_addr *ia)
2687{
2688	struct prison *pr;
2689	struct in_addr ia0;
2690	int error;
2691
2692	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
2693	KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
2694
2695	pr = cred->cr_prison;
2696	if (!(pr->pr_flags & PR_IP4))
2697		return (0);
2698	mtx_lock(&pr->pr_mtx);
2699	if (!(pr->pr_flags & PR_IP4)) {
2700		mtx_unlock(&pr->pr_mtx);
2701		return (0);
2702	}
2703	if (pr->pr_ip4 == NULL) {
2704		mtx_unlock(&pr->pr_mtx);
2705		return (EAFNOSUPPORT);
2706	}
2707
2708	ia0.s_addr = ntohl(ia->s_addr);
2709	if (ia0.s_addr == INADDR_LOOPBACK) {
2710		ia->s_addr = pr->pr_ip4[0].s_addr;
2711		mtx_unlock(&pr->pr_mtx);
2712		return (0);
2713	}
2714
2715	if (ia0.s_addr == INADDR_ANY) {
2716		/*
2717		 * In case there is only 1 IPv4 address, bind directly.
2718		 */
2719		if (pr->pr_ip4s == 1)
2720			ia->s_addr = pr->pr_ip4[0].s_addr;
2721		mtx_unlock(&pr->pr_mtx);
2722		return (0);
2723	}
2724
2725	error = _prison_check_ip4(pr, ia);
2726	mtx_unlock(&pr->pr_mtx);
2727	return (error);
2728}
2729
2730/*
2731 * Rewrite destination address in case we will connect to loopback address.
2732 *
2733 * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv4.
2734 * Address passed in in NBO and returned in NBO.
2735 */
2736int
2737prison_remote_ip4(struct ucred *cred, struct in_addr *ia)
2738{
2739	struct prison *pr;
2740
2741	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
2742	KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
2743
2744	pr = cred->cr_prison;
2745	if (!(pr->pr_flags & PR_IP4))
2746		return (0);
2747	mtx_lock(&pr->pr_mtx);
2748	if (!(pr->pr_flags & PR_IP4)) {
2749		mtx_unlock(&pr->pr_mtx);
2750		return (0);
2751	}
2752	if (pr->pr_ip4 == NULL) {
2753		mtx_unlock(&pr->pr_mtx);
2754		return (EAFNOSUPPORT);
2755	}
2756
2757	if (ntohl(ia->s_addr) == INADDR_LOOPBACK) {
2758		ia->s_addr = pr->pr_ip4[0].s_addr;
2759		mtx_unlock(&pr->pr_mtx);
2760		return (0);
2761	}
2762
2763	/*
2764	 * Return success because nothing had to be changed.
2765	 */
2766	mtx_unlock(&pr->pr_mtx);
2767	return (0);
2768}
2769
2770/*
2771 * Check if given address belongs to the jail referenced by cred/prison.
2772 *
2773 * Returns 0 if jail doesn't restrict IPv4 or if address belongs to jail,
2774 * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail
2775 * doesn't allow IPv4.  Address passed in in NBO.
2776 */
2777static int
2778_prison_check_ip4(struct prison *pr, struct in_addr *ia)
2779{
2780	int i, a, z, d;
2781
2782	/*
2783	 * Check the primary IP.
2784	 */
2785	if (pr->pr_ip4[0].s_addr == ia->s_addr)
2786		return (0);
2787
2788	/*
2789	 * All the other IPs are sorted so we can do a binary search.
2790	 */
2791	a = 0;
2792	z = pr->pr_ip4s - 2;
2793	while (a <= z) {
2794		i = (a + z) / 2;
2795		d = qcmp_v4(&pr->pr_ip4[i+1], ia);
2796		if (d > 0)
2797			z = i - 1;
2798		else if (d < 0)
2799			a = i + 1;
2800		else
2801			return (0);
2802	}
2803
2804	return (EADDRNOTAVAIL);
2805}
2806
2807int
2808prison_check_ip4(struct ucred *cred, struct in_addr *ia)
2809{
2810	struct prison *pr;
2811	int error;
2812
2813	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
2814	KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
2815
2816	pr = cred->cr_prison;
2817	if (!(pr->pr_flags & PR_IP4))
2818		return (0);
2819	mtx_lock(&pr->pr_mtx);
2820	if (!(pr->pr_flags & PR_IP4)) {
2821		mtx_unlock(&pr->pr_mtx);
2822		return (0);
2823	}
2824	if (pr->pr_ip4 == NULL) {
2825		mtx_unlock(&pr->pr_mtx);
2826		return (EAFNOSUPPORT);
2827	}
2828
2829	error = _prison_check_ip4(pr, ia);
2830	mtx_unlock(&pr->pr_mtx);
2831	return (error);
2832}
2833#endif
2834
2835#ifdef INET6
2836static int
2837prison_restrict_ip6(struct prison *pr, struct in6_addr *newip6)
2838{
2839	int ii, ij, used;
2840	struct prison *ppr;
2841
2842	ppr = pr->pr_parent;
2843	if (!(pr->pr_flags & PR_IP6_USER)) {
2844		/* This has no user settings, so just copy the parent's list. */
2845		if (pr->pr_ip6s < ppr->pr_ip6s) {
2846			/*
2847			 * There's no room for the parent's list.  Use the
2848			 * new list buffer, which is assumed to be big enough
2849			 * (if it was passed).  If there's no buffer, try to
2850			 * allocate one.
2851			 */
2852			used = 1;
2853			if (newip6 == NULL) {
2854				newip6 = malloc(ppr->pr_ip6s * sizeof(*newip6),
2855				    M_PRISON, M_NOWAIT);
2856				if (newip6 != NULL)
2857					used = 0;
2858			}
2859			if (newip6 != NULL) {
2860				bcopy(ppr->pr_ip6, newip6,
2861				    ppr->pr_ip6s * sizeof(*newip6));
2862				free(pr->pr_ip6, M_PRISON);
2863				pr->pr_ip6 = newip6;
2864				pr->pr_ip6s = ppr->pr_ip6s;
2865			}
2866			return (used);
2867		}
2868		pr->pr_ip6s = ppr->pr_ip6s;
2869		if (pr->pr_ip6s > 0)
2870			bcopy(ppr->pr_ip6, pr->pr_ip6,
2871			    pr->pr_ip6s * sizeof(*newip6));
2872		else if (pr->pr_ip6 != NULL) {
2873			free(pr->pr_ip6, M_PRISON);
2874			pr->pr_ip6 = NULL;
2875		}
2876	} else if (pr->pr_ip6s > 0) {
2877		/* Remove addresses that aren't in the parent. */
2878		for (ij = 0; ij < ppr->pr_ip6s; ij++)
2879			if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[0],
2880			    &ppr->pr_ip6[ij]))
2881				break;
2882		if (ij < ppr->pr_ip6s)
2883			ii = 1;
2884		else {
2885			bcopy(pr->pr_ip6 + 1, pr->pr_ip6,
2886			    --pr->pr_ip6s * sizeof(*pr->pr_ip6));
2887			ii = 0;
2888		}
2889		for (ij = 1; ii < pr->pr_ip6s; ) {
2890			if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[ii],
2891			    &ppr->pr_ip6[0])) {
2892				ii++;
2893				continue;
2894			}
2895			switch (ij >= ppr->pr_ip4s ? -1 :
2896				qcmp_v6(&pr->pr_ip6[ii], &ppr->pr_ip6[ij])) {
2897			case -1:
2898				bcopy(pr->pr_ip6 + ii + 1, pr->pr_ip6 + ii,
2899				    (--pr->pr_ip6s - ii) * sizeof(*pr->pr_ip6));
2900				break;
2901			case 0:
2902				ii++;
2903				ij++;
2904				break;
2905			case 1:
2906				ij++;
2907				break;
2908			}
2909		}
2910		if (pr->pr_ip6s == 0) {
2911			pr->pr_flags |= PR_IP6_DISABLE;
2912			free(pr->pr_ip6, M_PRISON);
2913			pr->pr_ip6 = NULL;
2914		}
2915	}
2916	return 0;
2917}
2918
2919/*
2920 * Pass back primary IPv6 address for this jail.
2921 *
2922 * If not restricted return success but do not alter the address.  Caller has
2923 * to make sure to initialize it correctly (e.g. IN6ADDR_ANY_INIT).
2924 *
2925 * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv6.
2926 */
2927int
2928prison_get_ip6(struct ucred *cred, struct in6_addr *ia6)
2929{
2930	struct prison *pr;
2931
2932	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
2933	KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
2934
2935	pr = cred->cr_prison;
2936	if (!(pr->pr_flags & PR_IP6))
2937		return (0);
2938	mtx_lock(&pr->pr_mtx);
2939	if (!(pr->pr_flags & PR_IP6)) {
2940		mtx_unlock(&pr->pr_mtx);
2941		return (0);
2942	}
2943	if (pr->pr_ip6 == NULL) {
2944		mtx_unlock(&pr->pr_mtx);
2945		return (EAFNOSUPPORT);
2946	}
2947
2948	bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
2949	mtx_unlock(&pr->pr_mtx);
2950	return (0);
2951}
2952
2953/*
2954 * Return true if pr1 and pr2 have the same IPv6 address restrictions.
2955 */
2956int
2957prison_equal_ip6(struct prison *pr1, struct prison *pr2)
2958{
2959
2960	if (pr1 == pr2)
2961		return (1);
2962
2963	while (pr1 != &prison0 &&
2964#ifdef VIMAGE
2965	       !(pr1->pr_flags & PR_VNET) &&
2966#endif
2967	       !(pr1->pr_flags & PR_IP6_USER))
2968		pr1 = pr1->pr_parent;
2969	while (pr2 != &prison0 &&
2970#ifdef VIMAGE
2971	       !(pr2->pr_flags & PR_VNET) &&
2972#endif
2973	       !(pr2->pr_flags & PR_IP6_USER))
2974		pr2 = pr2->pr_parent;
2975	return (pr1 == pr2);
2976}
2977
2978/*
2979 * Make sure our (source) address is set to something meaningful to this jail.
2980 *
2981 * v6only should be set based on (inp->inp_flags & IN6P_IPV6_V6ONLY != 0)
2982 * when needed while binding.
2983 *
2984 * Returns 0 if jail doesn't restrict IPv6 or if address belongs to jail,
2985 * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail
2986 * doesn't allow IPv6.
2987 */
2988int
2989prison_local_ip6(struct ucred *cred, struct in6_addr *ia6, int v6only)
2990{
2991	struct prison *pr;
2992	int error;
2993
2994	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
2995	KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
2996
2997	pr = cred->cr_prison;
2998	if (!(pr->pr_flags & PR_IP6))
2999		return (0);
3000	mtx_lock(&pr->pr_mtx);
3001	if (!(pr->pr_flags & PR_IP6)) {
3002		mtx_unlock(&pr->pr_mtx);
3003		return (0);
3004	}
3005	if (pr->pr_ip6 == NULL) {
3006		mtx_unlock(&pr->pr_mtx);
3007		return (EAFNOSUPPORT);
3008	}
3009
3010	if (IN6_IS_ADDR_LOOPBACK(ia6)) {
3011		bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
3012		mtx_unlock(&pr->pr_mtx);
3013		return (0);
3014	}
3015
3016	if (IN6_IS_ADDR_UNSPECIFIED(ia6)) {
3017		/*
3018		 * In case there is only 1 IPv6 address, and v6only is true,
3019		 * then bind directly.
3020		 */
3021		if (v6only != 0 && pr->pr_ip6s == 1)
3022			bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
3023		mtx_unlock(&pr->pr_mtx);
3024		return (0);
3025	}
3026
3027	error = _prison_check_ip6(pr, ia6);
3028	mtx_unlock(&pr->pr_mtx);
3029	return (error);
3030}
3031
3032/*
3033 * Rewrite destination address in case we will connect to loopback address.
3034 *
3035 * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv6.
3036 */
3037int
3038prison_remote_ip6(struct ucred *cred, struct in6_addr *ia6)
3039{
3040	struct prison *pr;
3041
3042	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3043	KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
3044
3045	pr = cred->cr_prison;
3046	if (!(pr->pr_flags & PR_IP6))
3047		return (0);
3048	mtx_lock(&pr->pr_mtx);
3049	if (!(pr->pr_flags & PR_IP6)) {
3050		mtx_unlock(&pr->pr_mtx);
3051		return (0);
3052	}
3053	if (pr->pr_ip6 == NULL) {
3054		mtx_unlock(&pr->pr_mtx);
3055		return (EAFNOSUPPORT);
3056	}
3057
3058	if (IN6_IS_ADDR_LOOPBACK(ia6)) {
3059		bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
3060		mtx_unlock(&pr->pr_mtx);
3061		return (0);
3062	}
3063
3064	/*
3065	 * Return success because nothing had to be changed.
3066	 */
3067	mtx_unlock(&pr->pr_mtx);
3068	return (0);
3069}
3070
3071/*
3072 * Check if given address belongs to the jail referenced by cred/prison.
3073 *
3074 * Returns 0 if jail doesn't restrict IPv6 or if address belongs to jail,
3075 * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail
3076 * doesn't allow IPv6.
3077 */
3078static int
3079_prison_check_ip6(struct prison *pr, struct in6_addr *ia6)
3080{
3081	int i, a, z, d;
3082
3083	/*
3084	 * Check the primary IP.
3085	 */
3086	if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[0], ia6))
3087		return (0);
3088
3089	/*
3090	 * All the other IPs are sorted so we can do a binary search.
3091	 */
3092	a = 0;
3093	z = pr->pr_ip6s - 2;
3094	while (a <= z) {
3095		i = (a + z) / 2;
3096		d = qcmp_v6(&pr->pr_ip6[i+1], ia6);
3097		if (d > 0)
3098			z = i - 1;
3099		else if (d < 0)
3100			a = i + 1;
3101		else
3102			return (0);
3103	}
3104
3105	return (EADDRNOTAVAIL);
3106}
3107
3108int
3109prison_check_ip6(struct ucred *cred, struct in6_addr *ia6)
3110{
3111	struct prison *pr;
3112	int error;
3113
3114	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3115	KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
3116
3117	pr = cred->cr_prison;
3118	if (!(pr->pr_flags & PR_IP6))
3119		return (0);
3120	mtx_lock(&pr->pr_mtx);
3121	if (!(pr->pr_flags & PR_IP6)) {
3122		mtx_unlock(&pr->pr_mtx);
3123		return (0);
3124	}
3125	if (pr->pr_ip6 == NULL) {
3126		mtx_unlock(&pr->pr_mtx);
3127		return (EAFNOSUPPORT);
3128	}
3129
3130	error = _prison_check_ip6(pr, ia6);
3131	mtx_unlock(&pr->pr_mtx);
3132	return (error);
3133}
3134#endif
3135
3136/*
3137 * Check if a jail supports the given address family.
3138 *
3139 * Returns 0 if not jailed or the address family is supported, EAFNOSUPPORT
3140 * if not.
3141 */
3142int
3143prison_check_af(struct ucred *cred, int af)
3144{
3145	struct prison *pr;
3146	int error;
3147
3148	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3149
3150	pr = cred->cr_prison;
3151#ifdef VIMAGE
3152	/* Prisons with their own network stack are not limited. */
3153	if (pr->pr_flags & PR_VNET)
3154		return (0);
3155#endif
3156
3157	error = 0;
3158	switch (af)
3159	{
3160#ifdef INET
3161	case AF_INET:
3162		if (pr->pr_flags & PR_IP4)
3163		{
3164			mtx_lock(&pr->pr_mtx);
3165			if ((pr->pr_flags & PR_IP4) && pr->pr_ip4 == NULL)
3166				error = EAFNOSUPPORT;
3167			mtx_unlock(&pr->pr_mtx);
3168		}
3169		break;
3170#endif
3171#ifdef INET6
3172	case AF_INET6:
3173		if (pr->pr_flags & PR_IP6)
3174		{
3175			mtx_lock(&pr->pr_mtx);
3176			if ((pr->pr_flags & PR_IP6) && pr->pr_ip6 == NULL)
3177				error = EAFNOSUPPORT;
3178			mtx_unlock(&pr->pr_mtx);
3179		}
3180		break;
3181#endif
3182	case AF_LOCAL:
3183	case AF_ROUTE:
3184		break;
3185	default:
3186		if (!(pr->pr_allow & PR_ALLOW_SOCKET_AF))
3187			error = EAFNOSUPPORT;
3188	}
3189	return (error);
3190}
3191
3192/*
3193 * Check if given address belongs to the jail referenced by cred (wrapper to
3194 * prison_check_ip[46]).
3195 *
3196 * Returns 0 if jail doesn't restrict the address family or if address belongs
3197 * to jail, EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if
3198 * the jail doesn't allow the address family.  IPv4 Address passed in in NBO.
3199 */
3200int
3201prison_if(struct ucred *cred, struct sockaddr *sa)
3202{
3203#ifdef INET
3204	struct sockaddr_in *sai;
3205#endif
3206#ifdef INET6
3207	struct sockaddr_in6 *sai6;
3208#endif
3209	int error;
3210
3211	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3212	KASSERT(sa != NULL, ("%s: sa is NULL", __func__));
3213
3214	error = 0;
3215	switch (sa->sa_family)
3216	{
3217#ifdef INET
3218	case AF_INET:
3219		sai = (struct sockaddr_in *)sa;
3220		error = prison_check_ip4(cred, &sai->sin_addr);
3221		break;
3222#endif
3223#ifdef INET6
3224	case AF_INET6:
3225		sai6 = (struct sockaddr_in6 *)sa;
3226		error = prison_check_ip6(cred, &sai6->sin6_addr);
3227		break;
3228#endif
3229	default:
3230		if (!(cred->cr_prison->pr_allow & PR_ALLOW_SOCKET_AF))
3231			error = EAFNOSUPPORT;
3232	}
3233	return (error);
3234}
3235
3236/*
3237 * Return 0 if jails permit p1 to frob p2, otherwise ESRCH.
3238 */
3239int
3240prison_check(struct ucred *cred1, struct ucred *cred2)
3241{
3242
3243	return ((cred1->cr_prison == cred2->cr_prison ||
3244	    prison_ischild(cred1->cr_prison, cred2->cr_prison)) ? 0 : ESRCH);
3245}
3246
3247/*
3248 * Return 1 if p2 is a child of p1, otherwise 0.
3249 */
3250int
3251prison_ischild(struct prison *pr1, struct prison *pr2)
3252{
3253
3254	for (pr2 = pr2->pr_parent; pr2 != NULL; pr2 = pr2->pr_parent)
3255		if (pr1 == pr2)
3256			return (1);
3257	return (0);
3258}
3259
3260/*
3261 * Return 1 if the passed credential is in a jail, otherwise 0.
3262 */
3263int
3264jailed(struct ucred *cred)
3265{
3266
3267	return (cred->cr_prison != &prison0);
3268}
3269
3270/*
3271 * Return the correct hostname (domainname, et al) for the passed credential.
3272 */
3273void
3274getcredhostname(struct ucred *cred, char *buf, size_t size)
3275{
3276	struct prison *pr;
3277
3278	/*
3279	 * A NULL credential can be used to shortcut to the physical
3280	 * system's hostname.
3281	 */
3282	pr = (cred != NULL) ? cred->cr_prison : &prison0;
3283	mtx_lock(&pr->pr_mtx);
3284	strlcpy(buf, pr->pr_hostname, size);
3285	mtx_unlock(&pr->pr_mtx);
3286}
3287
3288void
3289getcreddomainname(struct ucred *cred, char *buf, size_t size)
3290{
3291
3292	mtx_lock(&cred->cr_prison->pr_mtx);
3293	strlcpy(buf, cred->cr_prison->pr_domainname, size);
3294	mtx_unlock(&cred->cr_prison->pr_mtx);
3295}
3296
3297void
3298getcredhostuuid(struct ucred *cred, char *buf, size_t size)
3299{
3300
3301	mtx_lock(&cred->cr_prison->pr_mtx);
3302	strlcpy(buf, cred->cr_prison->pr_hostuuid, size);
3303	mtx_unlock(&cred->cr_prison->pr_mtx);
3304}
3305
3306void
3307getcredhostid(struct ucred *cred, unsigned long *hostid)
3308{
3309
3310	mtx_lock(&cred->cr_prison->pr_mtx);
3311	*hostid = cred->cr_prison->pr_hostid;
3312	mtx_unlock(&cred->cr_prison->pr_mtx);
3313}
3314
3315#ifdef VIMAGE
3316/*
3317 * Determine whether the prison represented by cred owns
3318 * its vnet rather than having it inherited.
3319 *
3320 * Returns 1 in case the prison owns the vnet, 0 otherwise.
3321 */
3322int
3323prison_owns_vnet(struct ucred *cred)
3324{
3325
3326	/*
3327	 * vnets cannot be added/removed after jail creation,
3328	 * so no need to lock here.
3329	 */
3330	return (cred->cr_prison->pr_flags & PR_VNET ? 1 : 0);
3331}
3332#endif
3333
3334/*
3335 * Determine whether the subject represented by cred can "see"
3336 * status of a mount point.
3337 * Returns: 0 for permitted, ENOENT otherwise.
3338 * XXX: This function should be called cr_canseemount() and should be
3339 *      placed in kern_prot.c.
3340 */
3341int
3342prison_canseemount(struct ucred *cred, struct mount *mp)
3343{
3344	struct prison *pr;
3345	struct statfs *sp;
3346	size_t len;
3347
3348	pr = cred->cr_prison;
3349	if (pr->pr_enforce_statfs == 0)
3350		return (0);
3351	if (pr->pr_root->v_mount == mp)
3352		return (0);
3353	if (pr->pr_enforce_statfs == 2)
3354		return (ENOENT);
3355	/*
3356	 * If jail's chroot directory is set to "/" we should be able to see
3357	 * all mount-points from inside a jail.
3358	 * This is ugly check, but this is the only situation when jail's
3359	 * directory ends with '/'.
3360	 */
3361	if (strcmp(pr->pr_path, "/") == 0)
3362		return (0);
3363	len = strlen(pr->pr_path);
3364	sp = &mp->mnt_stat;
3365	if (strncmp(pr->pr_path, sp->f_mntonname, len) != 0)
3366		return (ENOENT);
3367	/*
3368	 * Be sure that we don't have situation where jail's root directory
3369	 * is "/some/path" and mount point is "/some/pathpath".
3370	 */
3371	if (sp->f_mntonname[len] != '\0' && sp->f_mntonname[len] != '/')
3372		return (ENOENT);
3373	return (0);
3374}
3375
3376void
3377prison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp)
3378{
3379	char jpath[MAXPATHLEN];
3380	struct prison *pr;
3381	size_t len;
3382
3383	pr = cred->cr_prison;
3384	if (pr->pr_enforce_statfs == 0)
3385		return;
3386	if (prison_canseemount(cred, mp) != 0) {
3387		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
3388		strlcpy(sp->f_mntonname, "[restricted]",
3389		    sizeof(sp->f_mntonname));
3390		return;
3391	}
3392	if (pr->pr_root->v_mount == mp) {
3393		/*
3394		 * Clear current buffer data, so we are sure nothing from
3395		 * the valid path left there.
3396		 */
3397		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
3398		*sp->f_mntonname = '/';
3399		return;
3400	}
3401	/*
3402	 * If jail's chroot directory is set to "/" we should be able to see
3403	 * all mount-points from inside a jail.
3404	 */
3405	if (strcmp(pr->pr_path, "/") == 0)
3406		return;
3407	len = strlen(pr->pr_path);
3408	strlcpy(jpath, sp->f_mntonname + len, sizeof(jpath));
3409	/*
3410	 * Clear current buffer data, so we are sure nothing from
3411	 * the valid path left there.
3412	 */
3413	bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
3414	if (*jpath == '\0') {
3415		/* Should never happen. */
3416		*sp->f_mntonname = '/';
3417	} else {
3418		strlcpy(sp->f_mntonname, jpath, sizeof(sp->f_mntonname));
3419	}
3420}
3421
3422/*
3423 * Check with permission for a specific privilege is granted within jail.  We
3424 * have a specific list of accepted privileges; the rest are denied.
3425 */
3426int
3427prison_priv_check(struct ucred *cred, int priv)
3428{
3429
3430	if (!jailed(cred))
3431		return (0);
3432
3433#ifdef VIMAGE
3434	/*
3435	 * Privileges specific to prisons with a virtual network stack.
3436	 * There might be a duplicate entry here in case the privilege
3437	 * is only granted conditionally in the legacy jail case.
3438	 */
3439	switch (priv) {
3440#ifdef notyet
3441		/*
3442		 * NFS-specific privileges.
3443		 */
3444	case PRIV_NFS_DAEMON:
3445	case PRIV_NFS_LOCKD:
3446#endif
3447		/*
3448		 * Network stack privileges.
3449		 */
3450	case PRIV_NET_BRIDGE:
3451	case PRIV_NET_GRE:
3452	case PRIV_NET_BPF:
3453	case PRIV_NET_RAW:		/* Dup, cond. in legacy jail case. */
3454	case PRIV_NET_ROUTE:
3455	case PRIV_NET_TAP:
3456	case PRIV_NET_SETIFMTU:
3457	case PRIV_NET_SETIFFLAGS:
3458	case PRIV_NET_SETIFCAP:
3459	case PRIV_NET_SETIFNAME	:
3460	case PRIV_NET_SETIFMETRIC:
3461	case PRIV_NET_SETIFPHYS:
3462	case PRIV_NET_SETIFMAC:
3463	case PRIV_NET_ADDMULTI:
3464	case PRIV_NET_DELMULTI:
3465	case PRIV_NET_HWIOCTL:
3466	case PRIV_NET_SETLLADDR:
3467	case PRIV_NET_ADDIFGROUP:
3468	case PRIV_NET_DELIFGROUP:
3469	case PRIV_NET_IFCREATE:
3470	case PRIV_NET_IFDESTROY:
3471	case PRIV_NET_ADDIFADDR:
3472	case PRIV_NET_DELIFADDR:
3473	case PRIV_NET_LAGG:
3474	case PRIV_NET_GIF:
3475	case PRIV_NET_SETIFVNET:
3476
3477		/*
3478		 * 802.11-related privileges.
3479		 */
3480	case PRIV_NET80211_GETKEY:
3481#ifdef notyet
3482	case PRIV_NET80211_MANAGE:		/* XXX-BZ discuss with sam@ */
3483#endif
3484
3485#ifdef notyet
3486		/*
3487		 * AppleTalk privileges.
3488		 */
3489	case PRIV_NETATALK_RESERVEDPORT:
3490
3491		/*
3492		 * ATM privileges.
3493		 */
3494	case PRIV_NETATM_CFG:
3495	case PRIV_NETATM_ADD:
3496	case PRIV_NETATM_DEL:
3497	case PRIV_NETATM_SET:
3498
3499		/*
3500		 * Bluetooth privileges.
3501		 */
3502	case PRIV_NETBLUETOOTH_RAW:
3503#endif
3504
3505		/*
3506		 * Netgraph and netgraph module privileges.
3507		 */
3508	case PRIV_NETGRAPH_CONTROL:
3509#ifdef notyet
3510	case PRIV_NETGRAPH_TTY:
3511#endif
3512
3513		/*
3514		 * IPv4 and IPv6 privileges.
3515		 */
3516	case PRIV_NETINET_IPFW:
3517	case PRIV_NETINET_DIVERT:
3518	case PRIV_NETINET_PF:
3519	case PRIV_NETINET_DUMMYNET:
3520	case PRIV_NETINET_CARP:
3521	case PRIV_NETINET_MROUTE:
3522	case PRIV_NETINET_RAW:
3523	case PRIV_NETINET_ADDRCTRL6:
3524	case PRIV_NETINET_ND6:
3525	case PRIV_NETINET_SCOPE6:
3526	case PRIV_NETINET_ALIFETIME6:
3527	case PRIV_NETINET_IPSEC:
3528	case PRIV_NETINET_BINDANY:
3529
3530#ifdef notyet
3531		/*
3532		 * IPX/SPX privileges.
3533		 */
3534	case PRIV_NETIPX_RESERVEDPORT:
3535	case PRIV_NETIPX_RAW:
3536
3537		/*
3538		 * NCP privileges.
3539		 */
3540	case PRIV_NETNCP:
3541
3542		/*
3543		 * SMB privileges.
3544		 */
3545	case PRIV_NETSMB:
3546#endif
3547
3548	/*
3549	 * No default: or deny here.
3550	 * In case of no permit fall through to next switch().
3551	 */
3552		if (cred->cr_prison->pr_flags & PR_VNET)
3553			return (0);
3554	}
3555#endif /* VIMAGE */
3556
3557	switch (priv) {
3558
3559		/*
3560		 * Allow ktrace privileges for root in jail.
3561		 */
3562	case PRIV_KTRACE:
3563
3564#if 0
3565		/*
3566		 * Allow jailed processes to configure audit identity and
3567		 * submit audit records (login, etc).  In the future we may
3568		 * want to further refine the relationship between audit and
3569		 * jail.
3570		 */
3571	case PRIV_AUDIT_GETAUDIT:
3572	case PRIV_AUDIT_SETAUDIT:
3573	case PRIV_AUDIT_SUBMIT:
3574#endif
3575
3576		/*
3577		 * Allow jailed processes to manipulate process UNIX
3578		 * credentials in any way they see fit.
3579		 */
3580	case PRIV_CRED_SETUID:
3581	case PRIV_CRED_SETEUID:
3582	case PRIV_CRED_SETGID:
3583	case PRIV_CRED_SETEGID:
3584	case PRIV_CRED_SETGROUPS:
3585	case PRIV_CRED_SETREUID:
3586	case PRIV_CRED_SETREGID:
3587	case PRIV_CRED_SETRESUID:
3588	case PRIV_CRED_SETRESGID:
3589
3590		/*
3591		 * Jail implements visibility constraints already, so allow
3592		 * jailed root to override uid/gid-based constraints.
3593		 */
3594	case PRIV_SEEOTHERGIDS:
3595	case PRIV_SEEOTHERUIDS:
3596
3597		/*
3598		 * Jail implements inter-process debugging limits already, so
3599		 * allow jailed root various debugging privileges.
3600		 */
3601	case PRIV_DEBUG_DIFFCRED:
3602	case PRIV_DEBUG_SUGID:
3603	case PRIV_DEBUG_UNPRIV:
3604
3605		/*
3606		 * Allow jail to set various resource limits and login
3607		 * properties, and for now, exceed process resource limits.
3608		 */
3609	case PRIV_PROC_LIMIT:
3610	case PRIV_PROC_SETLOGIN:
3611	case PRIV_PROC_SETRLIMIT:
3612
3613		/*
3614		 * System V and POSIX IPC privileges are granted in jail.
3615		 */
3616	case PRIV_IPC_READ:
3617	case PRIV_IPC_WRITE:
3618	case PRIV_IPC_ADMIN:
3619	case PRIV_IPC_MSGSIZE:
3620	case PRIV_MQ_ADMIN:
3621
3622		/*
3623		 * Jail operations within a jail work on child jails.
3624		 */
3625	case PRIV_JAIL_ATTACH:
3626	case PRIV_JAIL_SET:
3627	case PRIV_JAIL_REMOVE:
3628
3629		/*
3630		 * Jail implements its own inter-process limits, so allow
3631		 * root processes in jail to change scheduling on other
3632		 * processes in the same jail.  Likewise for signalling.
3633		 */
3634	case PRIV_SCHED_DIFFCRED:
3635	case PRIV_SCHED_CPUSET:
3636	case PRIV_SIGNAL_DIFFCRED:
3637	case PRIV_SIGNAL_SUGID:
3638
3639		/*
3640		 * Allow jailed processes to write to sysctls marked as jail
3641		 * writable.
3642		 */
3643	case PRIV_SYSCTL_WRITEJAIL:
3644
3645		/*
3646		 * Allow root in jail to manage a variety of quota
3647		 * properties.  These should likely be conditional on a
3648		 * configuration option.
3649		 */
3650	case PRIV_VFS_GETQUOTA:
3651	case PRIV_VFS_SETQUOTA:
3652
3653		/*
3654		 * Since Jail relies on chroot() to implement file system
3655		 * protections, grant many VFS privileges to root in jail.
3656		 * Be careful to exclude mount-related and NFS-related
3657		 * privileges.
3658		 */
3659	case PRIV_VFS_READ:
3660	case PRIV_VFS_WRITE:
3661	case PRIV_VFS_ADMIN:
3662	case PRIV_VFS_EXEC:
3663	case PRIV_VFS_LOOKUP:
3664	case PRIV_VFS_BLOCKRESERVE:	/* XXXRW: Slightly surprising. */
3665	case PRIV_VFS_CHFLAGS_DEV:
3666	case PRIV_VFS_CHOWN:
3667	case PRIV_VFS_CHROOT:
3668	case PRIV_VFS_RETAINSUGID:
3669	case PRIV_VFS_FCHROOT:
3670	case PRIV_VFS_LINK:
3671	case PRIV_VFS_SETGID:
3672	case PRIV_VFS_STAT:
3673	case PRIV_VFS_STICKYFILE:
3674		return (0);
3675
3676		/*
3677		 * Depending on the global setting, allow privilege of
3678		 * setting system flags.
3679		 */
3680	case PRIV_VFS_SYSFLAGS:
3681		if (cred->cr_prison->pr_allow & PR_ALLOW_CHFLAGS)
3682			return (0);
3683		else
3684			return (EPERM);
3685
3686		/*
3687		 * Depending on the global setting, allow privilege of
3688		 * mounting/unmounting file systems.
3689		 */
3690	case PRIV_VFS_MOUNT:
3691	case PRIV_VFS_UNMOUNT:
3692	case PRIV_VFS_MOUNT_NONUSER:
3693	case PRIV_VFS_MOUNT_OWNER:
3694		if (cred->cr_prison->pr_allow & PR_ALLOW_MOUNT)
3695			return (0);
3696		else
3697			return (EPERM);
3698
3699		/*
3700		 * Allow jailed root to bind reserved ports and reuse in-use
3701		 * ports.
3702		 */
3703	case PRIV_NETINET_RESERVEDPORT:
3704	case PRIV_NETINET_REUSEPORT:
3705		return (0);
3706
3707		/*
3708		 * Allow jailed root to set certian IPv4/6 (option) headers.
3709		 */
3710	case PRIV_NETINET_SETHDROPTS:
3711		return (0);
3712
3713		/*
3714		 * Conditionally allow creating raw sockets in jail.
3715		 */
3716	case PRIV_NETINET_RAW:
3717		if (cred->cr_prison->pr_allow & PR_ALLOW_RAW_SOCKETS)
3718			return (0);
3719		else
3720			return (EPERM);
3721
3722		/*
3723		 * Since jail implements its own visibility limits on netstat
3724		 * sysctls, allow getcred.  This allows identd to work in
3725		 * jail.
3726		 */
3727	case PRIV_NETINET_GETCRED:
3728		return (0);
3729
3730	default:
3731		/*
3732		 * In all remaining cases, deny the privilege request.  This
3733		 * includes almost all network privileges, many system
3734		 * configuration privileges.
3735		 */
3736		return (EPERM);
3737	}
3738}
3739
3740/*
3741 * Return the part of pr2's name that is relative to pr1, or the whole name
3742 * if it does not directly follow.
3743 */
3744
3745char *
3746prison_name(struct prison *pr1, struct prison *pr2)
3747{
3748	char *name;
3749
3750	/* Jails see themselves as "0" (if they see themselves at all). */
3751	if (pr1 == pr2)
3752		return "0";
3753	name = pr2->pr_name;
3754	if (prison_ischild(pr1, pr2)) {
3755		/*
3756		 * pr1 isn't locked (and allprison_lock may not be either)
3757		 * so its length can't be counted on.  But the number of dots
3758		 * can be counted on - and counted.
3759		 */
3760		for (; pr1 != &prison0; pr1 = pr1->pr_parent)
3761			name = strchr(name, '.') + 1;
3762	}
3763	return (name);
3764}
3765
3766/*
3767 * Return the part of pr2's path that is relative to pr1, or the whole path
3768 * if it does not directly follow.
3769 */
3770static char *
3771prison_path(struct prison *pr1, struct prison *pr2)
3772{
3773	char *path1, *path2;
3774	int len1;
3775
3776	path1 = pr1->pr_path;
3777	path2 = pr2->pr_path;
3778	if (!strcmp(path1, "/"))
3779		return (path2);
3780	len1 = strlen(path1);
3781	if (strncmp(path1, path2, len1))
3782		return (path2);
3783	if (path2[len1] == '\0')
3784		return "/";
3785	if (path2[len1] == '/')
3786		return (path2 + len1);
3787	return (path2);
3788}
3789
3790
3791/*
3792 * Jail-related sysctls.
3793 */
3794SYSCTL_NODE(_security, OID_AUTO, jail, CTLFLAG_RW, 0,
3795    "Jails");
3796
3797static int
3798sysctl_jail_list(SYSCTL_HANDLER_ARGS)
3799{
3800	struct xprison *xp;
3801	struct prison *pr, *cpr;
3802#ifdef INET
3803	struct in_addr *ip4 = NULL;
3804	int ip4s = 0;
3805#endif
3806#ifdef INET6
3807	struct in_addr *ip6 = NULL;
3808	int ip6s = 0;
3809#endif
3810	int descend, error;
3811
3812	xp = malloc(sizeof(*xp), M_TEMP, M_WAITOK);
3813	pr = req->td->td_ucred->cr_prison;
3814	error = 0;
3815	sx_slock(&allprison_lock);
3816	FOREACH_PRISON_DESCENDANT(pr, cpr, descend) {
3817#if defined(INET) || defined(INET6)
3818 again:
3819#endif
3820		mtx_lock(&cpr->pr_mtx);
3821#ifdef INET
3822		if (cpr->pr_ip4s > 0) {
3823			if (ip4s < cpr->pr_ip4s) {
3824				ip4s = cpr->pr_ip4s;
3825				mtx_unlock(&cpr->pr_mtx);
3826				ip4 = realloc(ip4, ip4s *
3827				    sizeof(struct in_addr), M_TEMP, M_WAITOK);
3828				goto again;
3829			}
3830			bcopy(cpr->pr_ip4, ip4,
3831			    cpr->pr_ip4s * sizeof(struct in_addr));
3832		}
3833#endif
3834#ifdef INET6
3835		if (cpr->pr_ip6s > 0) {
3836			if (ip6s < cpr->pr_ip6s) {
3837				ip6s = cpr->pr_ip6s;
3838				mtx_unlock(&cpr->pr_mtx);
3839				ip6 = realloc(ip6, ip6s *
3840				    sizeof(struct in6_addr), M_TEMP, M_WAITOK);
3841				goto again;
3842			}
3843			bcopy(cpr->pr_ip6, ip6,
3844			    cpr->pr_ip6s * sizeof(struct in6_addr));
3845		}
3846#endif
3847		if (cpr->pr_ref == 0) {
3848			mtx_unlock(&cpr->pr_mtx);
3849			continue;
3850		}
3851		bzero(xp, sizeof(*xp));
3852		xp->pr_version = XPRISON_VERSION;
3853		xp->pr_id = cpr->pr_id;
3854		xp->pr_state = cpr->pr_uref > 0
3855		    ? PRISON_STATE_ALIVE : PRISON_STATE_DYING;
3856		strlcpy(xp->pr_path, prison_path(pr, cpr), sizeof(xp->pr_path));
3857		strlcpy(xp->pr_host, cpr->pr_hostname, sizeof(xp->pr_host));
3858		strlcpy(xp->pr_name, prison_name(pr, cpr), sizeof(xp->pr_name));
3859#ifdef INET
3860		xp->pr_ip4s = cpr->pr_ip4s;
3861#endif
3862#ifdef INET6
3863		xp->pr_ip6s = cpr->pr_ip6s;
3864#endif
3865		mtx_unlock(&cpr->pr_mtx);
3866		error = SYSCTL_OUT(req, xp, sizeof(*xp));
3867		if (error)
3868			break;
3869#ifdef INET
3870		if (xp->pr_ip4s > 0) {
3871			error = SYSCTL_OUT(req, ip4,
3872			    xp->pr_ip4s * sizeof(struct in_addr));
3873			if (error)
3874				break;
3875		}
3876#endif
3877#ifdef INET6
3878		if (xp->pr_ip6s > 0) {
3879			error = SYSCTL_OUT(req, ip6,
3880			    xp->pr_ip6s * sizeof(struct in6_addr));
3881			if (error)
3882				break;
3883		}
3884#endif
3885	}
3886	sx_sunlock(&allprison_lock);
3887	free(xp, M_TEMP);
3888#ifdef INET
3889	free(ip4, M_TEMP);
3890#endif
3891#ifdef INET6
3892	free(ip6, M_TEMP);
3893#endif
3894	return (error);
3895}
3896
3897SYSCTL_OID(_security_jail, OID_AUTO, list,
3898    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
3899    sysctl_jail_list, "S", "List of active jails");
3900
3901static int
3902sysctl_jail_jailed(SYSCTL_HANDLER_ARGS)
3903{
3904	int error, injail;
3905
3906	injail = jailed(req->td->td_ucred);
3907	error = SYSCTL_OUT(req, &injail, sizeof(injail));
3908
3909	return (error);
3910}
3911
3912SYSCTL_PROC(_security_jail, OID_AUTO, jailed,
3913    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
3914    sysctl_jail_jailed, "I", "Process in jail?");
3915
3916#if defined(INET) || defined(INET6)
3917SYSCTL_UINT(_security_jail, OID_AUTO, jail_max_af_ips, CTLFLAG_RW,
3918    &jail_max_af_ips, 0,
3919    "Number of IP addresses a jail may have at most per address family");
3920#endif
3921
3922/*
3923 * Default parameters for jail(2) compatability.  For historical reasons,
3924 * the sysctl names have varying similarity to the parameter names.  Prisons
3925 * just see their own parameters, and can't change them.
3926 */
3927static int
3928sysctl_jail_default_allow(SYSCTL_HANDLER_ARGS)
3929{
3930	struct prison *pr;
3931	int allow, error, i;
3932
3933	pr = req->td->td_ucred->cr_prison;
3934	allow = (pr == &prison0) ? jail_default_allow : pr->pr_allow;
3935
3936	/* Get the current flag value, and convert it to a boolean. */
3937	i = (allow & arg2) ? 1 : 0;
3938	if (arg1 != NULL)
3939		i = !i;
3940	error = sysctl_handle_int(oidp, &i, 0, req);
3941	if (error || !req->newptr)
3942		return (error);
3943	i = i ? arg2 : 0;
3944	if (arg1 != NULL)
3945		i ^= arg2;
3946	/*
3947	 * The sysctls don't have CTLFLAGS_PRISON, so assume prison0
3948	 * for writing.
3949	 */
3950	mtx_lock(&prison0.pr_mtx);
3951	jail_default_allow = (jail_default_allow & ~arg2) | i;
3952	mtx_unlock(&prison0.pr_mtx);
3953	return (0);
3954}
3955
3956SYSCTL_PROC(_security_jail, OID_AUTO, set_hostname_allowed,
3957    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
3958    NULL, PR_ALLOW_SET_HOSTNAME, sysctl_jail_default_allow, "I",
3959    "Processes in jail can set their hostnames");
3960SYSCTL_PROC(_security_jail, OID_AUTO, socket_unixiproute_only,
3961    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
3962    (void *)1, PR_ALLOW_SOCKET_AF, sysctl_jail_default_allow, "I",
3963    "Processes in jail are limited to creating UNIX/IP/route sockets only");
3964SYSCTL_PROC(_security_jail, OID_AUTO, sysvipc_allowed,
3965    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
3966    NULL, PR_ALLOW_SYSVIPC, sysctl_jail_default_allow, "I",
3967    "Processes in jail can use System V IPC primitives");
3968SYSCTL_PROC(_security_jail, OID_AUTO, allow_raw_sockets,
3969    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
3970    NULL, PR_ALLOW_RAW_SOCKETS, sysctl_jail_default_allow, "I",
3971    "Prison root can create raw sockets");
3972SYSCTL_PROC(_security_jail, OID_AUTO, chflags_allowed,
3973    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
3974    NULL, PR_ALLOW_CHFLAGS, sysctl_jail_default_allow, "I",
3975    "Processes in jail can alter system file flags");
3976SYSCTL_PROC(_security_jail, OID_AUTO, mount_allowed,
3977    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
3978    NULL, PR_ALLOW_MOUNT, sysctl_jail_default_allow, "I",
3979    "Processes in jail can mount/unmount jail-friendly file systems");
3980
3981static int
3982sysctl_jail_default_level(SYSCTL_HANDLER_ARGS)
3983{
3984	struct prison *pr;
3985	int level, error;
3986
3987	pr = req->td->td_ucred->cr_prison;
3988	level = (pr == &prison0) ? *(int *)arg1 : *(int *)((char *)pr + arg2);
3989	error = sysctl_handle_int(oidp, &level, 0, req);
3990	if (error || !req->newptr)
3991		return (error);
3992	*(int *)arg1 = level;
3993	return (0);
3994}
3995
3996SYSCTL_PROC(_security_jail, OID_AUTO, enforce_statfs,
3997    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
3998    &jail_default_enforce_statfs, offsetof(struct prison, pr_enforce_statfs),
3999    sysctl_jail_default_level, "I",
4000    "Processes in jail cannot see all mounted file systems");
4001
4002/*
4003 * Nodes to describe jail parameters.  Maximum length of string parameters
4004 * is returned in the string itself, and the other parameters exist merely
4005 * to make themselves and their types known.
4006 */
4007SYSCTL_NODE(_security_jail, OID_AUTO, param, CTLFLAG_RW, 0,
4008    "Jail parameters");
4009
4010int
4011sysctl_jail_param(SYSCTL_HANDLER_ARGS)
4012{
4013	int i;
4014	long l;
4015	size_t s;
4016	char numbuf[12];
4017
4018	switch (oidp->oid_kind & CTLTYPE)
4019	{
4020	case CTLTYPE_LONG:
4021	case CTLTYPE_ULONG:
4022		l = 0;
4023#ifdef SCTL_MASK32
4024		if (!(req->flags & SCTL_MASK32))
4025#endif
4026			return (SYSCTL_OUT(req, &l, sizeof(l)));
4027	case CTLTYPE_INT:
4028	case CTLTYPE_UINT:
4029		i = 0;
4030		return (SYSCTL_OUT(req, &i, sizeof(i)));
4031	case CTLTYPE_STRING:
4032		snprintf(numbuf, sizeof(numbuf), "%d", arg2);
4033		return
4034		    (sysctl_handle_string(oidp, numbuf, sizeof(numbuf), req));
4035	case CTLTYPE_STRUCT:
4036		s = (size_t)arg2;
4037		return (SYSCTL_OUT(req, &s, sizeof(s)));
4038	}
4039	return (0);
4040}
4041
4042SYSCTL_JAIL_PARAM(, jid, CTLTYPE_INT | CTLFLAG_RDTUN, "I", "Jail ID");
4043SYSCTL_JAIL_PARAM(, parent, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail parent ID");
4044SYSCTL_JAIL_PARAM_STRING(, name, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail name");
4045SYSCTL_JAIL_PARAM_STRING(, path, CTLFLAG_RDTUN, MAXPATHLEN, "Jail root path");
4046SYSCTL_JAIL_PARAM(, securelevel, CTLTYPE_INT | CTLFLAG_RW,
4047    "I", "Jail secure level");
4048SYSCTL_JAIL_PARAM(, enforce_statfs, CTLTYPE_INT | CTLFLAG_RW,
4049    "I", "Jail cannot see all mounted file systems");
4050SYSCTL_JAIL_PARAM(, persist, CTLTYPE_INT | CTLFLAG_RW,
4051    "B", "Jail persistence");
4052#ifdef VIMAGE
4053SYSCTL_JAIL_PARAM(, vnet, CTLTYPE_INT | CTLFLAG_RDTUN,
4054    "E,jailsys", "Virtual network stack");
4055#endif
4056SYSCTL_JAIL_PARAM(, dying, CTLTYPE_INT | CTLFLAG_RD,
4057    "B", "Jail is in the process of shutting down");
4058
4059SYSCTL_JAIL_PARAM_NODE(children, "Number of child jails");
4060SYSCTL_JAIL_PARAM(_children, cur, CTLTYPE_INT | CTLFLAG_RD,
4061    "I", "Current number of child jails");
4062SYSCTL_JAIL_PARAM(_children, max, CTLTYPE_INT | CTLFLAG_RW,
4063    "I", "Maximum number of child jails");
4064
4065SYSCTL_JAIL_PARAM_SYS_NODE(host, CTLFLAG_RW, "Jail host info");
4066SYSCTL_JAIL_PARAM_STRING(_host, hostname, CTLFLAG_RW, MAXHOSTNAMELEN,
4067    "Jail hostname");
4068SYSCTL_JAIL_PARAM_STRING(_host, domainname, CTLFLAG_RW, MAXHOSTNAMELEN,
4069    "Jail NIS domainname");
4070SYSCTL_JAIL_PARAM_STRING(_host, hostuuid, CTLFLAG_RW, HOSTUUIDLEN,
4071    "Jail host UUID");
4072SYSCTL_JAIL_PARAM(_host, hostid, CTLTYPE_ULONG | CTLFLAG_RW,
4073    "LU", "Jail host ID");
4074
4075SYSCTL_JAIL_PARAM_NODE(cpuset, "Jail cpuset");
4076SYSCTL_JAIL_PARAM(_cpuset, id, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail cpuset ID");
4077
4078#ifdef INET
4079SYSCTL_JAIL_PARAM_SYS_NODE(ip4, CTLFLAG_RDTUN,
4080    "Jail IPv4 address virtualization");
4081SYSCTL_JAIL_PARAM_STRUCT(_ip4, addr, CTLFLAG_RW, sizeof(struct in_addr),
4082    "S,in_addr,a", "Jail IPv4 addresses");
4083#endif
4084#ifdef INET6
4085SYSCTL_JAIL_PARAM_SYS_NODE(ip6, CTLFLAG_RDTUN,
4086    "Jail IPv6 address virtualization");
4087SYSCTL_JAIL_PARAM_STRUCT(_ip6, addr, CTLFLAG_RW, sizeof(struct in6_addr),
4088    "S,in6_addr,a", "Jail IPv6 addresses");
4089#endif
4090
4091SYSCTL_JAIL_PARAM_NODE(allow, "Jail permission flags");
4092SYSCTL_JAIL_PARAM(_allow, set_hostname, CTLTYPE_INT | CTLFLAG_RW,
4093    "B", "Jail may set hostname");
4094SYSCTL_JAIL_PARAM(_allow, sysvipc, CTLTYPE_INT | CTLFLAG_RW,
4095    "B", "Jail may use SYSV IPC");
4096SYSCTL_JAIL_PARAM(_allow, raw_sockets, CTLTYPE_INT | CTLFLAG_RW,
4097    "B", "Jail may create raw sockets");
4098SYSCTL_JAIL_PARAM(_allow, chflags, CTLTYPE_INT | CTLFLAG_RW,
4099    "B", "Jail may alter system file flags");
4100SYSCTL_JAIL_PARAM(_allow, mount, CTLTYPE_INT | CTLFLAG_RW,
4101    "B", "Jail may mount/unmount jail-friendly file systems");
4102SYSCTL_JAIL_PARAM(_allow, quotas, CTLTYPE_INT | CTLFLAG_RW,
4103    "B", "Jail may set file quotas");
4104SYSCTL_JAIL_PARAM(_allow, socket_af, CTLTYPE_INT | CTLFLAG_RW,
4105    "B", "Jail may create sockets other than just UNIX/IPv4/IPv6/route");
4106
4107
4108#ifdef DDB
4109
4110static void
4111db_show_prison(struct prison *pr)
4112{
4113	int fi;
4114#if defined(INET) || defined(INET6)
4115	int ii;
4116#endif
4117	unsigned jsf;
4118#ifdef INET6
4119	char ip6buf[INET6_ADDRSTRLEN];
4120#endif
4121
4122	db_printf("prison %p:\n", pr);
4123	db_printf(" jid             = %d\n", pr->pr_id);
4124	db_printf(" name            = %s\n", pr->pr_name);
4125	db_printf(" parent          = %p\n", pr->pr_parent);
4126	db_printf(" ref             = %d\n", pr->pr_ref);
4127	db_printf(" uref            = %d\n", pr->pr_uref);
4128	db_printf(" path            = %s\n", pr->pr_path);
4129	db_printf(" cpuset          = %d\n", pr->pr_cpuset
4130	    ? pr->pr_cpuset->cs_id : -1);
4131#ifdef VIMAGE
4132	db_printf(" vnet            = %p\n", pr->pr_vnet);
4133#endif
4134	db_printf(" root            = %p\n", pr->pr_root);
4135	db_printf(" securelevel     = %d\n", pr->pr_securelevel);
4136	db_printf(" childcount      = %d\n", pr->pr_childcount);
4137	db_printf(" child           = %p\n", LIST_FIRST(&pr->pr_children));
4138	db_printf(" sibling         = %p\n", LIST_NEXT(pr, pr_sibling));
4139	db_printf(" flags           = %x", pr->pr_flags);
4140	for (fi = 0; fi < sizeof(pr_flag_names) / sizeof(pr_flag_names[0]);
4141	    fi++)
4142		if (pr_flag_names[fi] != NULL && (pr->pr_flags & (1 << fi)))
4143			db_printf(" %s", pr_flag_names[fi]);
4144	for (fi = 0; fi < sizeof(pr_flag_jailsys) / sizeof(pr_flag_jailsys[0]);
4145	    fi++) {
4146		jsf = pr->pr_flags &
4147		    (pr_flag_jailsys[fi].disable | pr_flag_jailsys[fi].new);
4148		db_printf(" %-16s= %s\n", pr_flag_jailsys[fi].name,
4149		    pr_flag_jailsys[fi].disable &&
4150		      (jsf == pr_flag_jailsys[fi].disable) ? "disable"
4151		    : (jsf == pr_flag_jailsys[fi].new) ? "new"
4152		    : "inherit");
4153	}
4154	db_printf(" allow           = %x", pr->pr_allow);
4155	for (fi = 0; fi < sizeof(pr_allow_names) / sizeof(pr_allow_names[0]);
4156	    fi++)
4157		if (pr_allow_names[fi] != NULL && (pr->pr_allow & (1 << fi)))
4158			db_printf(" %s", pr_allow_names[fi]);
4159	db_printf("\n");
4160	db_printf(" enforce_statfs  = %d\n", pr->pr_enforce_statfs);
4161	db_printf(" host.hostname   = %s\n", pr->pr_hostname);
4162	db_printf(" host.domainname = %s\n", pr->pr_domainname);
4163	db_printf(" host.hostuuid   = %s\n", pr->pr_hostuuid);
4164	db_printf(" host.hostid     = %lu\n", pr->pr_hostid);
4165#ifdef INET
4166	db_printf(" ip4s            = %d\n", pr->pr_ip4s);
4167	for (ii = 0; ii < pr->pr_ip4s; ii++)
4168		db_printf(" %s %s\n",
4169		    ii == 0 ? "ip4             =" : "                 ",
4170		    inet_ntoa(pr->pr_ip4[ii]));
4171#endif
4172#ifdef INET6
4173	db_printf(" ip6s            = %d\n", pr->pr_ip6s);
4174	for (ii = 0; ii < pr->pr_ip6s; ii++)
4175		db_printf(" %s %s\n",
4176		    ii == 0 ? "ip6             =" : "                 ",
4177		    ip6_sprintf(ip6buf, &pr->pr_ip6[ii]));
4178#endif
4179}
4180
4181DB_SHOW_COMMAND(prison, db_show_prison_command)
4182{
4183	struct prison *pr;
4184
4185	if (!have_addr) {
4186		/*
4187		 * Show all prisons in the list, and prison0 which is not
4188		 * listed.
4189		 */
4190		db_show_prison(&prison0);
4191		if (!db_pager_quit) {
4192			TAILQ_FOREACH(pr, &allprison, pr_list) {
4193				db_show_prison(pr);
4194				if (db_pager_quit)
4195					break;
4196			}
4197		}
4198		return;
4199	}
4200
4201	if (addr == 0)
4202		pr = &prison0;
4203	else {
4204		/* Look for a prison with the ID and with references. */
4205		TAILQ_FOREACH(pr, &allprison, pr_list)
4206			if (pr->pr_id == addr && pr->pr_ref > 0)
4207				break;
4208		if (pr == NULL)
4209			/* Look again, without requiring a reference. */
4210			TAILQ_FOREACH(pr, &allprison, pr_list)
4211				if (pr->pr_id == addr)
4212					break;
4213		if (pr == NULL)
4214			/* Assume address points to a valid prison. */
4215			pr = (struct prison *)addr;
4216	}
4217	db_show_prison(pr);
4218}
4219
4220#endif /* DDB */
4221