kern_jail.c revision 196135
1274955Ssvnmir/*-
2274955Ssvnmir * Copyright (c) 1999 Poul-Henning Kamp.
3353358Sdim * Copyright (c) 2008 Bjoern A. Zeeb.
4353358Sdim * Copyright (c) 2009 James Gritton.
5353358Sdim * All rights reserved.
6274955Ssvnmir *
7274955Ssvnmir * Redistribution and use in source and binary forms, with or without
8274955Ssvnmir * modification, are permitted provided that the following conditions
9280031Sdim * are met:
10280031Sdim * 1. Redistributions of source code must retain the above copyright
11274955Ssvnmir *    notice, this list of conditions and the following disclaimer.
12274955Ssvnmir * 2. Redistributions in binary form must reproduce the above copyright
13360784Sdim *    notice, this list of conditions and the following disclaimer in the
14274955Ssvnmir *    documentation and/or other materials provided with the distribution.
15274955Ssvnmir *
16274955Ssvnmir * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17360784Sdim * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18360784Sdim * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19274955Ssvnmir * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20274955Ssvnmir * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21274955Ssvnmir * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22274955Ssvnmir * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23274955Ssvnmir * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24274955Ssvnmir * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25327952Sdim * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26327952Sdim * SUCH DAMAGE.
27341825Sdim */
28327952Sdim
29327952Sdim#include <sys/cdefs.h>
30274955Ssvnmir__FBSDID("$FreeBSD: head/sys/kern/kern_jail.c 196135 2009-08-12 12:12:23Z bz $");
31274955Ssvnmir
32274955Ssvnmir#include "opt_compat.h"
33#include "opt_ddb.h"
34#include "opt_inet.h"
35#include "opt_inet6.h"
36
37#include <sys/param.h>
38#include <sys/types.h>
39#include <sys/kernel.h>
40#include <sys/systm.h>
41#include <sys/errno.h>
42#include <sys/sysproto.h>
43#include <sys/malloc.h>
44#include <sys/osd.h>
45#include <sys/priv.h>
46#include <sys/proc.h>
47#include <sys/taskqueue.h>
48#include <sys/fcntl.h>
49#include <sys/jail.h>
50#include <sys/lock.h>
51#include <sys/mutex.h>
52#include <sys/sx.h>
53#include <sys/sysent.h>
54#include <sys/namei.h>
55#include <sys/mount.h>
56#include <sys/queue.h>
57#include <sys/socket.h>
58#include <sys/syscallsubr.h>
59#include <sys/sysctl.h>
60#include <sys/vnode.h>
61
62#include <net/if.h>
63#include <net/vnet.h>
64
65#include <netinet/in.h>
66
67#ifdef DDB
68#include <ddb/ddb.h>
69#ifdef INET6
70#include <netinet6/in6_var.h>
71#endif /* INET6 */
72#endif /* DDB */
73
74#include <security/mac/mac_framework.h>
75
/*
 * Host UUID assigned to prison0 below; kern_jail_set() also copies it into
 * every newly created prison as that jail's initial host UUID.
 */
#define	DEFAULT_HOSTUUID	"00000000-0000-0000-0000-000000000000"

/* Allocation type behind the M_PRISON tag used by malloc()/free() here. */
MALLOC_DEFINE(M_PRISON, "prison", "Prison structures");

/*
 * prison0 describes what is "real" about the system.
 *
 * It is statically initialized with jail ID 0, name "0" and path "/",
 * starts with one reference and one user reference, has PR_HOST set and
 * every allow bit granted (PR_ALLOW_ALL).  kern_jail_set() special-cases
 * a parent equal to &prison0: such a parent contributes no name prefix
 * when sibling jail names are compared.
 */
struct prison prison0 = {
	.pr_id		= 0,
	.pr_name	= "0",
	.pr_ref		= 1,
	.pr_uref	= 1,
	.pr_path	= "/",
	.pr_securelevel	= -1,
	.pr_childmax	= JAIL_MAX,
	.pr_hostuuid	= DEFAULT_HOSTUUID,
	.pr_children	= LIST_HEAD_INITIALIZER(&prison0.pr_children),
	.pr_flags	= PR_HOST,
	.pr_allow	= PR_ALLOW_ALL,
};
/* Register initialization of prison0.pr_mtx as an MTX_DEF "jail mutex". */
MTX_SYSINIT(prison0, &prison0.pr_mtx, "jail mutex", MTX_DEF);
95
/*
 * allprison and lastprid are protected by allprison_lock.
 *
 * kern_jail_set() links a new prison into allprison in front of the first
 * entry whose pr_id is not smaller than the new jid (or at the tail when
 * there is none), so the list stays sorted by jail ID.  lastprid records
 * the most recent automatically chosen jid; the search for the next free
 * ID starts at lastprid + 1 and wraps from JAIL_MAX back to 1.
 */
struct	sx allprison_lock;
SX_SYSINIT(allprison_lock, &allprison_lock, "allprison");
struct	prisonlist allprison = TAILQ_HEAD_INITIALIZER(allprison);
int	lastprid = 0;
101
/*
 * Forward declarations of file-local (static) helpers.  The address
 * checking/restricting helpers are only declared when the corresponding
 * address family (INET / INET6) is compiled into the kernel.
 */
static int do_jail_attach(struct thread *td, struct prison *pr);
static void prison_complete(void *context, int pending);
static void prison_deref(struct prison *pr, int flags);
static char *prison_path(struct prison *pr1, struct prison *pr2);
static void prison_remove_one(struct prison *pr);
#ifdef INET
static int _prison_check_ip4(struct prison *pr, struct in_addr *ia);
static int prison_restrict_ip4(struct prison *pr, struct in_addr *newip4);
#endif
#ifdef INET6
static int _prison_check_ip6(struct prison *pr, struct in6_addr *ia6);
static int prison_restrict_ip6(struct prison *pr, struct in6_addr *newip6);
#endif
115
/*
 * Flags for prison_deref.  The values are distinct bits and may be OR'ed
 * together.
 *
 * NOTE(review): prison_deref() itself is not shown next to these
 * definitions.  The one caller visible nearby, kern_jail_set(), passes
 * PD_DEREF | PD_DEUREF | PD_LIST_XLOCKED to undo a pr_ref++/pr_uref++ pair
 * while it holds allprison_lock exclusively; the per-flag meanings are
 * presumably "drop pr_ref", "drop pr_uref", "pr_mtx already held" and
 * "allprison_lock held shared/exclusive" -- confirm against the function.
 */
#define	PD_DEREF	0x01
#define	PD_DEUREF	0x02
#define	PD_LOCKED	0x04
#define	PD_LIST_SLOCKED	0x08
#define	PD_LIST_XLOCKED	0x10
122
/*
 * Parameter names corresponding to PR_* flag values
 *
 * The array index is the bit position: kern_jail_set() maps entry fi to
 * the mask (1 << fi) and skips NULL entries.  A name from pr_flag_names
 * requests the bit to be set; the matching name from pr_flag_nonames only
 * marks the bit as changed, i.e. requests it to be cleared.  Both arrays
 * are indexed by the same loop and therefore must stay the same length.
 */
static char *pr_flag_names[] = {
	[0] = "persist",
};

static char *pr_flag_nonames[] = {
	[0] = "nopersist",
};
133
/*
 * Jail subsystems that are selected with an integer JAIL_SYS_* value
 * rather than a boolean parameter.
 *
 *  name     parameter name looked up by kern_jail_set()
 *  disable  PR_* bits to set for JAIL_SYS_DISABLE; 0 means the subsystem
 *           cannot be disabled and such a request fails with EINVAL
 *  new      PR_* bits to set for JAIL_SYS_NEW
 *
 * JAIL_SYS_INHERIT sets neither.  Whenever the parameter is present, both
 * masks are recorded as changed.  Entries exist only for features that are
 * compiled in (VIMAGE, INET, INET6).
 */
struct jailsys_flags {
	const char	*name;
	unsigned	 disable;
	unsigned	 new;
} pr_flag_jailsys[] = {
	{ "host", 0, PR_HOST },
#ifdef VIMAGE
	{ "vnet", 0, PR_VNET },
#endif
#ifdef INET
	{ "ip4", PR_IP4_USER | PR_IP4_DISABLE, PR_IP4_USER },
#endif
#ifdef INET6
	{ "ip6", PR_IP6_USER | PR_IP6_DISABLE, PR_IP6_USER },
#endif
};
150
/*
 * Names of the "allow.*" permission parameters.  The array index is the
 * bit position in the allow mask: both kern_jail() and kern_jail_set()
 * pair entry fi with the mask (1 << fi), so the order here must match the
 * bit assignment of the permission flags.
 */
static char *pr_allow_names[] = {
	"allow.set_hostname",
	"allow.sysvipc",
	"allow.raw_sockets",
	"allow.chflags",
	"allow.mount",
	"allow.quotas",
	"allow.socket_af",
};

/*
 * Negated spellings of the names above, in the same order.  The loops that
 * walk these tables are bounded by the size of pr_allow_names only, so this
 * array must have at least as many entries.
 */
static char *pr_allow_nonames[] = {
	"allow.noset_hostname",
	"allow.nosysvipc",
	"allow.noraw_sockets",
	"allow.nochflags",
	"allow.nomount",
	"allow.noquotas",
	"allow.nosocket_af",
};
170
/*
 * Defaults that kern_jail() applies when a non-jailed caller creates a jail
 * through the legacy interface: the permission mask decides between the
 * "allow.X" and "allow.noX" spelling of every permission, and the statfs
 * value is passed as the "enforce_statfs" parameter.
 * NOTE(review): a comment in kern_jail() says these come "from sysctls";
 * the sysctl declarations are not next to these definitions -- confirm.
 */
#define	JAIL_DEFAULT_ALLOW		PR_ALLOW_SET_HOSTNAME
#define	JAIL_DEFAULT_ENFORCE_STATFS	2
static unsigned jail_default_allow = JAIL_DEFAULT_ALLOW;
static int jail_default_enforce_statfs = JAIL_DEFAULT_ENFORCE_STATFS;
#if defined(INET) || defined(INET6)
/* Upper bound on the number of addresses per address family in one jail. */
static unsigned jail_max_af_ips = 255;
#endif
178
179#ifdef INET
/*
 * qsort-style comparator for two struct in_addr values.
 *
 * Returns exactly -1, 0 or 1 when the first address is numerically lower
 * than, equal to, or higher than the second one.
 */
static int
qcmp_v4(const void *ip1, const void *ip2)
{
	const struct in_addr *lhs = ip1;
	const struct in_addr *rhs = ip2;
	/*
	 * The stored addresses are in network byte order; ordering them as
	 * raw integers would give a sort order that depends on the host's
	 * endianness, so convert to host byte order first.
	 */
	in_addr_t lhs_hbo = ntohl(lhs->s_addr);
	in_addr_t rhs_hbo = ntohl(rhs->s_addr);

	/*
	 * Compare explicitly; the difference of two 32-bit unsigned values
	 * does not fit into the int return value.
	 */
	if (lhs_hbo == rhs_hbo)
		return (0);
	return ((lhs_hbo < rhs_hbo) ? -1 : 1);
}
204#endif
205
206#ifdef INET6
/*
 * qsort-style comparator for two struct in6_addr values.
 *
 * The addresses are compared byte by byte starting at s6_addr[0]; the
 * first differing byte decides.  Returns exactly -1, 0 or 1.
 */
static int
qcmp_v6(const void *ip1, const void *ip2)
{
	const struct in6_addr *lhs, *rhs;
	size_t idx;

	lhs = (const struct in6_addr *)ip1;
	rhs = (const struct in6_addr *)ip2;

	for (idx = 0; idx < sizeof(struct in6_addr); idx++) {
		if (lhs->s6_addr[idx] == rhs->s6_addr[idx])
			continue;
		return ((lhs->s6_addr[idx] > rhs->s6_addr[idx]) ? 1 : -1);
	}
	return (0);
}
225#endif
226
227/*
228 * struct jail_args {
229 *	struct jail *jail;
230 * };
231 */
232int
233jail(struct thread *td, struct jail_args *uap)
234{
235	uint32_t version;
236	int error;
237	struct jail j;
238
239	error = copyin(uap->jail, &version, sizeof(uint32_t));
240	if (error)
241		return (error);
242
243	switch (version) {
244	case 0:
245	{
246		struct jail_v0 j0;
247
248		/* FreeBSD single IPv4 jails. */
249		bzero(&j, sizeof(struct jail));
250		error = copyin(uap->jail, &j0, sizeof(struct jail_v0));
251		if (error)
252			return (error);
253		j.version = j0.version;
254		j.path = j0.path;
255		j.hostname = j0.hostname;
256		j.ip4s = j0.ip_number;
257		break;
258	}
259
260	case 1:
261		/*
262		 * Version 1 was used by multi-IPv4 jail implementations
263		 * that never made it into the official kernel.
264		 */
265		return (EINVAL);
266
267	case 2:	/* JAIL_API_VERSION */
268		/* FreeBSD multi-IPv4/IPv6,noIP jails. */
269		error = copyin(uap->jail, &j, sizeof(struct jail));
270		if (error)
271			return (error);
272		break;
273
274	default:
275		/* Sci-Fi jails are not supported, sorry. */
276		return (EINVAL);
277	}
278	return (kern_jail(td, &j));
279}
280
281int
282kern_jail(struct thread *td, struct jail *j)
283{
284	struct iovec optiov[2 * (4
285			    + sizeof(pr_allow_names) / sizeof(pr_allow_names[0])
286#ifdef INET
287			    + 1
288#endif
289#ifdef INET6
290			    + 1
291#endif
292			    )];
293	struct uio opt;
294	char *u_path, *u_hostname, *u_name;
295#ifdef INET
296	uint32_t ip4s;
297	struct in_addr *u_ip4;
298#endif
299#ifdef INET6
300	struct in6_addr *u_ip6;
301#endif
302	size_t tmplen;
303	int error, enforce_statfs, fi;
304
305	bzero(&optiov, sizeof(optiov));
306	opt.uio_iov = optiov;
307	opt.uio_iovcnt = 0;
308	opt.uio_offset = -1;
309	opt.uio_resid = -1;
310	opt.uio_segflg = UIO_SYSSPACE;
311	opt.uio_rw = UIO_READ;
312	opt.uio_td = td;
313
314	/* Set permissions for top-level jails from sysctls. */
315	if (!jailed(td->td_ucred)) {
316		for (fi = 0; fi < sizeof(pr_allow_names) /
317		     sizeof(pr_allow_names[0]); fi++) {
318			optiov[opt.uio_iovcnt].iov_base =
319			    (jail_default_allow & (1 << fi))
320			    ? pr_allow_names[fi] : pr_allow_nonames[fi];
321			optiov[opt.uio_iovcnt].iov_len =
322			    strlen(optiov[opt.uio_iovcnt].iov_base) + 1;
323			opt.uio_iovcnt += 2;
324		}
325		optiov[opt.uio_iovcnt].iov_base = "enforce_statfs";
326		optiov[opt.uio_iovcnt].iov_len = sizeof("enforce_statfs");
327		opt.uio_iovcnt++;
328		enforce_statfs = jail_default_enforce_statfs;
329		optiov[opt.uio_iovcnt].iov_base = &enforce_statfs;
330		optiov[opt.uio_iovcnt].iov_len = sizeof(enforce_statfs);
331		opt.uio_iovcnt++;
332	}
333
334	tmplen = MAXPATHLEN + MAXHOSTNAMELEN + MAXHOSTNAMELEN;
335#ifdef INET
336	ip4s = (j->version == 0) ? 1 : j->ip4s;
337	if (ip4s > jail_max_af_ips)
338		return (EINVAL);
339	tmplen += ip4s * sizeof(struct in_addr);
340#else
341	if (j->ip4s > 0)
342		return (EINVAL);
343#endif
344#ifdef INET6
345	if (j->ip6s > jail_max_af_ips)
346		return (EINVAL);
347	tmplen += j->ip6s * sizeof(struct in6_addr);
348#else
349	if (j->ip6s > 0)
350		return (EINVAL);
351#endif
352	u_path = malloc(tmplen, M_TEMP, M_WAITOK);
353	u_hostname = u_path + MAXPATHLEN;
354	u_name = u_hostname + MAXHOSTNAMELEN;
355#ifdef INET
356	u_ip4 = (struct in_addr *)(u_name + MAXHOSTNAMELEN);
357#endif
358#ifdef INET6
359#ifdef INET
360	u_ip6 = (struct in6_addr *)(u_ip4 + ip4s);
361#else
362	u_ip6 = (struct in6_addr *)(u_name + MAXHOSTNAMELEN);
363#endif
364#endif
365	optiov[opt.uio_iovcnt].iov_base = "path";
366	optiov[opt.uio_iovcnt].iov_len = sizeof("path");
367	opt.uio_iovcnt++;
368	optiov[opt.uio_iovcnt].iov_base = u_path;
369	error = copyinstr(j->path, u_path, MAXPATHLEN,
370	    &optiov[opt.uio_iovcnt].iov_len);
371	if (error) {
372		free(u_path, M_TEMP);
373		return (error);
374	}
375	opt.uio_iovcnt++;
376	optiov[opt.uio_iovcnt].iov_base = "host.hostname";
377	optiov[opt.uio_iovcnt].iov_len = sizeof("host.hostname");
378	opt.uio_iovcnt++;
379	optiov[opt.uio_iovcnt].iov_base = u_hostname;
380	error = copyinstr(j->hostname, u_hostname, MAXHOSTNAMELEN,
381	    &optiov[opt.uio_iovcnt].iov_len);
382	if (error) {
383		free(u_path, M_TEMP);
384		return (error);
385	}
386	opt.uio_iovcnt++;
387	if (j->jailname != NULL) {
388		optiov[opt.uio_iovcnt].iov_base = "name";
389		optiov[opt.uio_iovcnt].iov_len = sizeof("name");
390		opt.uio_iovcnt++;
391		optiov[opt.uio_iovcnt].iov_base = u_name;
392		error = copyinstr(j->jailname, u_name, MAXHOSTNAMELEN,
393		    &optiov[opt.uio_iovcnt].iov_len);
394		if (error) {
395			free(u_path, M_TEMP);
396			return (error);
397		}
398		opt.uio_iovcnt++;
399	}
400#ifdef INET
401	optiov[opt.uio_iovcnt].iov_base = "ip4.addr";
402	optiov[opt.uio_iovcnt].iov_len = sizeof("ip4.addr");
403	opt.uio_iovcnt++;
404	optiov[opt.uio_iovcnt].iov_base = u_ip4;
405	optiov[opt.uio_iovcnt].iov_len = ip4s * sizeof(struct in_addr);
406	if (j->version == 0)
407		u_ip4->s_addr = j->ip4s;
408	else {
409		error = copyin(j->ip4, u_ip4, optiov[opt.uio_iovcnt].iov_len);
410		if (error) {
411			free(u_path, M_TEMP);
412			return (error);
413		}
414	}
415	opt.uio_iovcnt++;
416#endif
417#ifdef INET6
418	optiov[opt.uio_iovcnt].iov_base = "ip6.addr";
419	optiov[opt.uio_iovcnt].iov_len = sizeof("ip6.addr");
420	opt.uio_iovcnt++;
421	optiov[opt.uio_iovcnt].iov_base = u_ip6;
422	optiov[opt.uio_iovcnt].iov_len = j->ip6s * sizeof(struct in6_addr);
423	error = copyin(j->ip6, u_ip6, optiov[opt.uio_iovcnt].iov_len);
424	if (error) {
425		free(u_path, M_TEMP);
426		return (error);
427	}
428	opt.uio_iovcnt++;
429#endif
430	KASSERT(opt.uio_iovcnt <= sizeof(optiov) / sizeof(optiov[0]),
431	    ("kern_jail: too many iovecs (%d)", opt.uio_iovcnt));
432	error = kern_jail_set(td, &opt, JAIL_CREATE | JAIL_ATTACH);
433	free(u_path, M_TEMP);
434	return (error);
435}
436
437
438/*
439 * struct jail_set_args {
440 *	struct iovec *iovp;
441 *	unsigned int iovcnt;
442 *	int flags;
443 * };
444 */
445int
446jail_set(struct thread *td, struct jail_set_args *uap)
447{
448	struct uio *auio;
449	int error;
450
451	/* Check that we have an even number of iovecs. */
452	if (uap->iovcnt & 1)
453		return (EINVAL);
454
455	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
456	if (error)
457		return (error);
458	error = kern_jail_set(td, auio, uap->flags);
459	free(auio, M_IOV);
460	return (error);
461}
462
463int
464kern_jail_set(struct thread *td, struct uio *optuio, int flags)
465{
466	struct nameidata nd;
467#ifdef INET
468	struct in_addr *ip4;
469#endif
470#ifdef INET6
471	struct in6_addr *ip6;
472#endif
473	struct vfsopt *opt;
474	struct vfsoptlist *opts;
475	struct prison *pr, *deadpr, *mypr, *ppr, *tpr;
476	struct vnode *root;
477	char *domain, *errmsg, *host, *name, *p, *path, *uuid;
478#if defined(INET) || defined(INET6)
479	struct prison *tppr;
480	void *op;
481#endif
482	unsigned long hid;
483	size_t namelen, onamelen;
484	int created, cuflags, descend, enforce, error, errmsg_len, errmsg_pos;
485	int gotchildmax, gotenforce, gothid, gotslevel;
486	int fi, jid, jsys, len, level;
487	int childmax, slevel, vfslocked;
488#if defined(INET) || defined(INET6)
489	int ii, ij;
490#endif
491#ifdef INET
492	int ip4s, redo_ip4;
493#endif
494#ifdef INET6
495	int ip6s, redo_ip6;
496#endif
497	unsigned pr_flags, ch_flags;
498	unsigned pr_allow, ch_allow, tallow;
499	char numbuf[12];
500
501	error = priv_check(td, PRIV_JAIL_SET);
502	if (!error && (flags & JAIL_ATTACH))
503		error = priv_check(td, PRIV_JAIL_ATTACH);
504	if (error)
505		return (error);
506	mypr = ppr = td->td_ucred->cr_prison;
507	if ((flags & JAIL_CREATE) && mypr->pr_childmax == 0)
508		return (EPERM);
509	if (flags & ~JAIL_SET_MASK)
510		return (EINVAL);
511
512	/*
513	 * Check all the parameters before committing to anything.  Not all
514	 * errors can be caught early, but we may as well try.  Also, this
515	 * takes care of some expensive stuff (path lookup) before getting
516	 * the allprison lock.
517	 *
518	 * XXX Jails are not filesystems, and jail parameters are not mount
519	 *     options.  But it makes more sense to re-use the vfsopt code
520	 *     than duplicate it under a different name.
521	 */
522	error = vfs_buildopts(optuio, &opts);
523	if (error)
524		return (error);
525#ifdef INET
526	ip4 = NULL;
527#endif
528#ifdef INET6
529	ip6 = NULL;
530#endif
531
532	error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
533	if (error == ENOENT)
534		jid = 0;
535	else if (error != 0)
536		goto done_free;
537
538	error = vfs_copyopt(opts, "securelevel", &slevel, sizeof(slevel));
539	if (error == ENOENT)
540		gotslevel = 0;
541	else if (error != 0)
542		goto done_free;
543	else
544		gotslevel = 1;
545
546	error =
547	    vfs_copyopt(opts, "children.max", &childmax, sizeof(childmax));
548	if (error == ENOENT)
549		gotchildmax = 0;
550	else if (error != 0)
551		goto done_free;
552	else
553		gotchildmax = 1;
554
555	error = vfs_copyopt(opts, "enforce_statfs", &enforce, sizeof(enforce));
556	gotenforce = (error == 0);
557	if (gotenforce) {
558		if (enforce < 0 || enforce > 2)
559			return (EINVAL);
560	} else if (error != ENOENT)
561		goto done_free;
562
563	pr_flags = ch_flags = 0;
564	for (fi = 0; fi < sizeof(pr_flag_names) / sizeof(pr_flag_names[0]);
565	    fi++) {
566		if (pr_flag_names[fi] == NULL)
567			continue;
568		vfs_flagopt(opts, pr_flag_names[fi], &pr_flags, 1 << fi);
569		vfs_flagopt(opts, pr_flag_nonames[fi], &ch_flags, 1 << fi);
570	}
571	ch_flags |= pr_flags;
572	for (fi = 0; fi < sizeof(pr_flag_jailsys) / sizeof(pr_flag_jailsys[0]);
573	    fi++) {
574		error = vfs_copyopt(opts, pr_flag_jailsys[fi].name, &jsys,
575		    sizeof(jsys));
576		if (error == ENOENT)
577			continue;
578		if (error != 0)
579			goto done_free;
580		switch (jsys) {
581		case JAIL_SYS_DISABLE:
582			if (!pr_flag_jailsys[fi].disable) {
583				error = EINVAL;
584				goto done_free;
585			}
586			pr_flags |= pr_flag_jailsys[fi].disable;
587			break;
588		case JAIL_SYS_NEW:
589			pr_flags |= pr_flag_jailsys[fi].new;
590			break;
591		case JAIL_SYS_INHERIT:
592			break;
593		default:
594			error = EINVAL;
595			goto done_free;
596		}
597		ch_flags |=
598		    pr_flag_jailsys[fi].new | pr_flag_jailsys[fi].disable;
599	}
600	if ((flags & (JAIL_CREATE | JAIL_UPDATE | JAIL_ATTACH)) == JAIL_CREATE
601	    && !(pr_flags & PR_PERSIST)) {
602		error = EINVAL;
603		vfs_opterror(opts, "new jail must persist or attach");
604		goto done_errmsg;
605	}
606#ifdef VIMAGE
607	if ((flags & JAIL_UPDATE) && (ch_flags & PR_VNET)) {
608		error = EINVAL;
609		vfs_opterror(opts, "vnet cannot be changed after creation");
610		goto done_errmsg;
611	}
612#endif
613#ifdef INET
614	if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP4_USER)) {
615		error = EINVAL;
616		vfs_opterror(opts, "ip4 cannot be changed after creation");
617		goto done_errmsg;
618	}
619#endif
620#ifdef INET6
621	if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP6_USER)) {
622		error = EINVAL;
623		vfs_opterror(opts, "ip6 cannot be changed after creation");
624		goto done_errmsg;
625	}
626#endif
627
628	pr_allow = ch_allow = 0;
629	for (fi = 0; fi < sizeof(pr_allow_names) / sizeof(pr_allow_names[0]);
630	    fi++) {
631		vfs_flagopt(opts, pr_allow_names[fi], &pr_allow, 1 << fi);
632		vfs_flagopt(opts, pr_allow_nonames[fi], &ch_allow, 1 << fi);
633	}
634	ch_allow |= pr_allow;
635
636	error = vfs_getopt(opts, "name", (void **)&name, &len);
637	if (error == ENOENT)
638		name = NULL;
639	else if (error != 0)
640		goto done_free;
641	else {
642		if (len == 0 || name[len - 1] != '\0') {
643			error = EINVAL;
644			goto done_free;
645		}
646		if (len > MAXHOSTNAMELEN) {
647			error = ENAMETOOLONG;
648			goto done_free;
649		}
650	}
651
652	error = vfs_getopt(opts, "host.hostname", (void **)&host, &len);
653	if (error == ENOENT)
654		host = NULL;
655	else if (error != 0)
656		goto done_free;
657	else {
658		ch_flags |= PR_HOST;
659		pr_flags |= PR_HOST;
660		if (len == 0 || host[len - 1] != '\0') {
661			error = EINVAL;
662			goto done_free;
663		}
664		if (len > MAXHOSTNAMELEN) {
665			error = ENAMETOOLONG;
666			goto done_free;
667		}
668	}
669
670	error = vfs_getopt(opts, "host.domainname", (void **)&domain, &len);
671	if (error == ENOENT)
672		domain = NULL;
673	else if (error != 0)
674		goto done_free;
675	else {
676		ch_flags |= PR_HOST;
677		pr_flags |= PR_HOST;
678		if (len == 0 || domain[len - 1] != '\0') {
679			error = EINVAL;
680			goto done_free;
681		}
682		if (len > MAXHOSTNAMELEN) {
683			error = ENAMETOOLONG;
684			goto done_free;
685		}
686	}
687
688	error = vfs_getopt(opts, "host.hostuuid", (void **)&uuid, &len);
689	if (error == ENOENT)
690		uuid = NULL;
691	else if (error != 0)
692		goto done_free;
693	else {
694		ch_flags |= PR_HOST;
695		pr_flags |= PR_HOST;
696		if (len == 0 || uuid[len - 1] != '\0') {
697			error = EINVAL;
698			goto done_free;
699		}
700		if (len > HOSTUUIDLEN) {
701			error = ENAMETOOLONG;
702			goto done_free;
703		}
704	}
705
706#ifdef COMPAT_IA32
707	if (td->td_proc->p_sysent->sv_flags & SV_IA32) {
708		uint32_t hid32;
709
710		error = vfs_copyopt(opts, "host.hostid", &hid32, sizeof(hid32));
711		hid = hid32;
712	} else
713#endif
714		error = vfs_copyopt(opts, "host.hostid", &hid, sizeof(hid));
715	if (error == ENOENT)
716		gothid = 0;
717	else if (error != 0)
718		goto done_free;
719	else {
720		gothid = 1;
721		ch_flags |= PR_HOST;
722		pr_flags |= PR_HOST;
723	}
724
725#ifdef INET
726	error = vfs_getopt(opts, "ip4.addr", &op, &ip4s);
727	if (error == ENOENT)
728		ip4s = (pr_flags & PR_IP4_DISABLE) ? 0 : -1;
729	else if (error != 0)
730		goto done_free;
731	else if (ip4s & (sizeof(*ip4) - 1)) {
732		error = EINVAL;
733		goto done_free;
734	} else {
735		ch_flags |= PR_IP4_USER | PR_IP4_DISABLE;
736		if (ip4s == 0)
737			pr_flags |= PR_IP4_USER | PR_IP4_DISABLE;
738		else {
739			pr_flags = (pr_flags & ~PR_IP4_DISABLE) | PR_IP4_USER;
740			ip4s /= sizeof(*ip4);
741			if (ip4s > jail_max_af_ips) {
742				error = EINVAL;
743				vfs_opterror(opts, "too many IPv4 addresses");
744				goto done_errmsg;
745			}
746			ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK);
747			bcopy(op, ip4, ip4s * sizeof(*ip4));
748			/*
749			 * IP addresses are all sorted but ip[0] to preserve
750			 * the primary IP address as given from userland.
751			 * This special IP is used for unbound outgoing
752			 * connections as well for "loopback" traffic.
753			 */
754			if (ip4s > 1)
755				qsort(ip4 + 1, ip4s - 1, sizeof(*ip4), qcmp_v4);
756			/*
757			 * Check for duplicate addresses and do some simple
758			 * zero and broadcast checks. If users give other bogus
759			 * addresses it is their problem.
760			 *
761			 * We do not have to care about byte order for these
762			 * checks so we will do them in NBO.
763			 */
764			for (ii = 0; ii < ip4s; ii++) {
765				if (ip4[ii].s_addr == INADDR_ANY ||
766				    ip4[ii].s_addr == INADDR_BROADCAST) {
767					error = EINVAL;
768					goto done_free;
769				}
770				if ((ii+1) < ip4s &&
771				    (ip4[0].s_addr == ip4[ii+1].s_addr ||
772				     ip4[ii].s_addr == ip4[ii+1].s_addr)) {
773					error = EINVAL;
774					goto done_free;
775				}
776			}
777		}
778	}
779#endif
780
781#ifdef INET6
782	error = vfs_getopt(opts, "ip6.addr", &op, &ip6s);
783	if (error == ENOENT)
784		ip6s = (pr_flags & PR_IP6_DISABLE) ? 0 : -1;
785	else if (error != 0)
786		goto done_free;
787	else if (ip6s & (sizeof(*ip6) - 1)) {
788		error = EINVAL;
789		goto done_free;
790	} else {
791		ch_flags |= PR_IP6_USER | PR_IP6_DISABLE;
792		if (ip6s == 0)
793			pr_flags |= PR_IP6_USER | PR_IP6_DISABLE;
794		else {
795			pr_flags = (pr_flags & ~PR_IP6_DISABLE) | PR_IP6_USER;
796			ip6s /= sizeof(*ip6);
797			if (ip6s > jail_max_af_ips) {
798				error = EINVAL;
799				vfs_opterror(opts, "too many IPv6 addresses");
800				goto done_errmsg;
801			}
802			ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK);
803			bcopy(op, ip6, ip6s * sizeof(*ip6));
804			if (ip6s > 1)
805				qsort(ip6 + 1, ip6s - 1, sizeof(*ip6), qcmp_v6);
806			for (ii = 0; ii < ip6s; ii++) {
807				if (IN6_IS_ADDR_UNSPECIFIED(&ip6[ii])) {
808					error = EINVAL;
809					goto done_free;
810				}
811				if ((ii+1) < ip6s &&
812				    (IN6_ARE_ADDR_EQUAL(&ip6[0], &ip6[ii+1]) ||
813				     IN6_ARE_ADDR_EQUAL(&ip6[ii], &ip6[ii+1])))
814				{
815					error = EINVAL;
816					goto done_free;
817				}
818			}
819		}
820	}
821#endif
822
823#if defined(VIMAGE) && (defined(INET) || defined(INET6))
824	if ((ch_flags & PR_VNET) && (ch_flags & (PR_IP4_USER | PR_IP6_USER))) {
825		error = EINVAL;
826		vfs_opterror(opts,
827		    "vnet jails cannot have IP address restrictions");
828		goto done_errmsg;
829	}
830#endif
831
832	root = NULL;
833	error = vfs_getopt(opts, "path", (void **)&path, &len);
834	if (error == ENOENT)
835		path = NULL;
836	else if (error != 0)
837		goto done_free;
838	else {
839		if (flags & JAIL_UPDATE) {
840			error = EINVAL;
841			vfs_opterror(opts,
842			    "path cannot be changed after creation");
843			goto done_errmsg;
844		}
845		if (len == 0 || path[len - 1] != '\0') {
846			error = EINVAL;
847			goto done_free;
848		}
849		if (len < 2 || (len == 2 && path[0] == '/'))
850			path = NULL;
851		else {
852			/* Leave room for a real-root full pathname. */
853			if (len + (path[0] == '/' && strcmp(mypr->pr_path, "/")
854			    ? strlen(mypr->pr_path) : 0) > MAXPATHLEN) {
855				error = ENAMETOOLONG;
856				goto done_free;
857			}
858			NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW, UIO_SYSSPACE,
859			    path, td);
860			error = namei(&nd);
861			if (error)
862				goto done_free;
863			vfslocked = NDHASGIANT(&nd);
864			root = nd.ni_vp;
865			NDFREE(&nd, NDF_ONLY_PNBUF);
866			if (root->v_type != VDIR) {
867				error = ENOTDIR;
868				vrele(root);
869				VFS_UNLOCK_GIANT(vfslocked);
870				goto done_free;
871			}
872			VFS_UNLOCK_GIANT(vfslocked);
873		}
874	}
875
876	/*
877	 * Grab the allprison lock before letting modules check their
878	 * parameters.  Once we have it, do not let go so we'll have a
879	 * consistent view of the OSD list.
880	 */
881	sx_xlock(&allprison_lock);
882	error = osd_jail_call(NULL, PR_METHOD_CHECK, opts);
883	if (error)
884		goto done_unlock_list;
885
886	/* By now, all parameters should have been noted. */
887	TAILQ_FOREACH(opt, opts, link) {
888		if (!opt->seen && strcmp(opt->name, "errmsg")) {
889			error = EINVAL;
890			vfs_opterror(opts, "unknown parameter: %s", opt->name);
891			goto done_unlock_list;
892		}
893	}
894
895	/*
896	 * See if we are creating a new record or updating an existing one.
897	 * This abuses the file error codes ENOENT and EEXIST.
898	 */
899	cuflags = flags & (JAIL_CREATE | JAIL_UPDATE);
900	if (!cuflags) {
901		error = EINVAL;
902		vfs_opterror(opts, "no valid operation (create or update)");
903		goto done_unlock_list;
904	}
905	pr = NULL;
906	if (jid != 0) {
907		/*
908		 * See if a requested jid already exists.  There is an
909		 * information leak here if the jid exists but is not within
910		 * the caller's jail hierarchy.  Jail creators will get EEXIST
911		 * even though they cannot see the jail, and CREATE | UPDATE
912		 * will return ENOENT which is not normally a valid error.
913		 */
914		if (jid < 0) {
915			error = EINVAL;
916			vfs_opterror(opts, "negative jid");
917			goto done_unlock_list;
918		}
919		pr = prison_find(jid);
920		if (pr != NULL) {
921			ppr = pr->pr_parent;
922			/* Create: jid must not exist. */
923			if (cuflags == JAIL_CREATE) {
924				mtx_unlock(&pr->pr_mtx);
925				error = EEXIST;
926				vfs_opterror(opts, "jail %d already exists",
927				    jid);
928				goto done_unlock_list;
929			}
930			if (!prison_ischild(mypr, pr)) {
931				mtx_unlock(&pr->pr_mtx);
932				pr = NULL;
933			} else if (pr->pr_uref == 0) {
934				if (!(flags & JAIL_DYING)) {
935					mtx_unlock(&pr->pr_mtx);
936					error = ENOENT;
937					vfs_opterror(opts, "jail %d is dying",
938					    jid);
939					goto done_unlock_list;
940				} else if ((flags & JAIL_ATTACH) ||
941				    (pr_flags & PR_PERSIST)) {
942					/*
943					 * A dying jail might be resurrected
944					 * (via attach or persist), but first
945					 * it must determine if another jail
946					 * has claimed its name.  Accomplish
947					 * this by implicitly re-setting the
948					 * name.
949					 */
950					if (name == NULL)
951						name = prison_name(mypr, pr);
952				}
953			}
954		}
955		if (pr == NULL) {
956			/* Update: jid must exist. */
957			if (cuflags == JAIL_UPDATE) {
958				error = ENOENT;
959				vfs_opterror(opts, "jail %d not found", jid);
960				goto done_unlock_list;
961			}
962		}
963	}
964	/*
965	 * If the caller provided a name, look for a jail by that name.
966	 * This has different semantics for creates and updates keyed by jid
967	 * (where the name must not already exist in a different jail),
968	 * and updates keyed by the name itself (where the name must exist
969	 * because that is the jail being updated).
970	 */
971	if (name != NULL) {
972		p = strrchr(name, '.');
973		if (p != NULL) {
974			/*
975			 * This is a hierarchical name.  Split it into the
976			 * parent and child names, and make sure the parent
977			 * exists or matches an already found jail.
978			 */
979			*p = '\0';
980			if (pr != NULL) {
981				if (strncmp(name, ppr->pr_name, p - name) ||
982				    ppr->pr_name[p - name] != '\0') {
983					mtx_unlock(&pr->pr_mtx);
984					error = EINVAL;
985					vfs_opterror(opts,
986					    "cannot change jail's parent");
987					goto done_unlock_list;
988				}
989			} else {
990				ppr = prison_find_name(mypr, name);
991				if (ppr == NULL) {
992					error = ENOENT;
993					vfs_opterror(opts,
994					    "jail \"%s\" not found", name);
995					goto done_unlock_list;
996				}
997				mtx_unlock(&ppr->pr_mtx);
998			}
999			name = p + 1;
1000		}
1001		if (name[0] != '\0') {
1002			namelen =
1003			    (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1;
1004 name_again:
1005			deadpr = NULL;
1006			FOREACH_PRISON_CHILD(ppr, tpr) {
1007				if (tpr != pr && tpr->pr_ref > 0 &&
1008				    !strcmp(tpr->pr_name + namelen, name)) {
1009					if (pr == NULL &&
1010					    cuflags != JAIL_CREATE) {
1011						mtx_lock(&tpr->pr_mtx);
1012						if (tpr->pr_ref > 0) {
1013							/*
1014							 * Use this jail
1015							 * for updates.
1016							 */
1017							if (tpr->pr_uref > 0) {
1018								pr = tpr;
1019								break;
1020							}
1021							deadpr = tpr;
1022						}
1023						mtx_unlock(&tpr->pr_mtx);
1024					} else if (tpr->pr_uref > 0) {
1025						/*
1026						 * Create, or update(jid):
1027						 * name must not exist in an
1028						 * active sibling jail.
1029						 */
1030						error = EEXIST;
1031						if (pr != NULL)
1032							mtx_unlock(&pr->pr_mtx);
1033						vfs_opterror(opts,
1034						   "jail \"%s\" already exists",
1035						   name);
1036						goto done_unlock_list;
1037					}
1038				}
1039			}
1040			/* If no active jail is found, use a dying one. */
1041			if (deadpr != NULL && pr == NULL) {
1042				if (flags & JAIL_DYING) {
1043					mtx_lock(&deadpr->pr_mtx);
1044					if (deadpr->pr_ref == 0) {
1045						mtx_unlock(&deadpr->pr_mtx);
1046						goto name_again;
1047					}
1048					pr = deadpr;
1049				} else if (cuflags == JAIL_UPDATE) {
1050					error = ENOENT;
1051					vfs_opterror(opts,
1052					    "jail \"%s\" is dying", name);
1053					goto done_unlock_list;
1054				}
1055			}
1056			/* Update: name must exist if no jid. */
1057			else if (cuflags == JAIL_UPDATE && pr == NULL) {
1058				error = ENOENT;
1059				vfs_opterror(opts, "jail \"%s\" not found",
1060				    name);
1061				goto done_unlock_list;
1062			}
1063		}
1064	}
1065	/* Update: must provide a jid or name. */
1066	else if (cuflags == JAIL_UPDATE && pr == NULL) {
1067		error = ENOENT;
1068		vfs_opterror(opts, "update specified no jail");
1069		goto done_unlock_list;
1070	}
1071
1072	/* If there's no prison to update, create a new one and link it in. */
1073	if (pr == NULL) {
1074		for (tpr = mypr; tpr != NULL; tpr = tpr->pr_parent)
1075			if (tpr->pr_childcount >= tpr->pr_childmax) {
1076				error = EPERM;
1077				vfs_opterror(opts, "prison limit exceeded");
1078				goto done_unlock_list;
1079			}
1080		created = 1;
1081		mtx_lock(&ppr->pr_mtx);
1082		if (ppr->pr_ref == 0 || (ppr->pr_flags & PR_REMOVE)) {
1083			mtx_unlock(&ppr->pr_mtx);
1084			error = ENOENT;
1085			vfs_opterror(opts, "parent jail went away!");
1086			goto done_unlock_list;
1087		}
1088		ppr->pr_ref++;
1089		ppr->pr_uref++;
1090		mtx_unlock(&ppr->pr_mtx);
1091		pr = malloc(sizeof(*pr), M_PRISON, M_WAITOK | M_ZERO);
1092		if (jid == 0) {
1093			/* Find the next free jid. */
1094			jid = lastprid + 1;
1095 findnext:
1096			if (jid == JAIL_MAX)
1097				jid = 1;
1098			TAILQ_FOREACH(tpr, &allprison, pr_list) {
1099				if (tpr->pr_id < jid)
1100					continue;
1101				if (tpr->pr_id > jid || tpr->pr_ref == 0) {
1102					TAILQ_INSERT_BEFORE(tpr, pr, pr_list);
1103					break;
1104				}
1105				if (jid == lastprid) {
1106					error = EAGAIN;
1107					vfs_opterror(opts,
1108					    "no available jail IDs");
1109					free(pr, M_PRISON);
1110					prison_deref(ppr, PD_DEREF |
1111					    PD_DEUREF | PD_LIST_XLOCKED);
1112					goto done_releroot;
1113				}
1114				jid++;
1115				goto findnext;
1116			}
1117			lastprid = jid;
1118		} else {
1119			/*
1120			 * The jail already has a jid (that did not yet exist),
1121			 * so just find where to insert it.
1122			 */
1123			TAILQ_FOREACH(tpr, &allprison, pr_list)
1124				if (tpr->pr_id >= jid) {
1125					TAILQ_INSERT_BEFORE(tpr, pr, pr_list);
1126					break;
1127				}
1128		}
1129		if (tpr == NULL)
1130			TAILQ_INSERT_TAIL(&allprison, pr, pr_list);
1131		LIST_INSERT_HEAD(&ppr->pr_children, pr, pr_sibling);
1132		for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent)
1133			tpr->pr_childcount++;
1134
1135		pr->pr_parent = ppr;
1136		pr->pr_id = jid;
1137
1138		/* Set some default values, and inherit some from the parent. */
1139		if (name == NULL)
1140			name = "";
1141		if (path == NULL) {
1142			path = "/";
1143			root = mypr->pr_root;
1144			vref(root);
1145		}
1146		strlcpy(pr->pr_hostuuid, DEFAULT_HOSTUUID, HOSTUUIDLEN);
1147		pr->pr_flags |= PR_HOST;
1148#if defined(INET) || defined(INET6)
1149#ifdef VIMAGE
1150		if (!(pr_flags & PR_VNET))
1151#endif
1152		{
1153#ifdef INET
1154			if (!(ch_flags & PR_IP4_USER))
1155				pr->pr_flags |=
1156				    PR_IP4 | PR_IP4_USER | PR_IP4_DISABLE;
1157			else if (!(pr_flags & PR_IP4_USER)) {
1158				pr->pr_flags |= ppr->pr_flags & PR_IP4;
1159				if (ppr->pr_ip4 != NULL) {
1160					pr->pr_ip4s = ppr->pr_ip4s;
1161					pr->pr_ip4 = malloc(pr->pr_ip4s *
1162					    sizeof(struct in_addr), M_PRISON,
1163					    M_WAITOK);
1164					bcopy(ppr->pr_ip4, pr->pr_ip4,
1165					    pr->pr_ip4s * sizeof(*pr->pr_ip4));
1166				}
1167			}
1168#endif
1169#ifdef INET6
1170			if (!(ch_flags & PR_IP6_USER))
1171				pr->pr_flags |=
1172				    PR_IP6 | PR_IP6_USER | PR_IP6_DISABLE;
1173			else if (!(pr_flags & PR_IP6_USER)) {
1174				pr->pr_flags |= ppr->pr_flags & PR_IP6;
1175				if (ppr->pr_ip6 != NULL) {
1176					pr->pr_ip6s = ppr->pr_ip6s;
1177					pr->pr_ip6 = malloc(pr->pr_ip6s *
1178					    sizeof(struct in6_addr), M_PRISON,
1179					    M_WAITOK);
1180					bcopy(ppr->pr_ip6, pr->pr_ip6,
1181					    pr->pr_ip6s * sizeof(*pr->pr_ip6));
1182				}
1183			}
1184#endif
1185		}
1186#endif
1187		pr->pr_securelevel = ppr->pr_securelevel;
1188		pr->pr_allow = JAIL_DEFAULT_ALLOW & ppr->pr_allow;
1189		pr->pr_enforce_statfs = JAIL_DEFAULT_ENFORCE_STATFS;
1190
1191		LIST_INIT(&pr->pr_children);
1192		mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF | MTX_DUPOK);
1193
1194#ifdef VIMAGE
1195		/* Allocate a new vnet if specified. */
1196		pr->pr_vnet = (pr_flags & PR_VNET)
1197		    ? vnet_alloc() : ppr->pr_vnet;
1198#endif
1199		/*
1200		 * Allocate a dedicated cpuset for each jail.
		 * Unlike other initial settings, this may return an error.
1202		 */
1203		error = cpuset_create_root(ppr, &pr->pr_cpuset);
1204		if (error) {
1205			prison_deref(pr, PD_LIST_XLOCKED);
1206			goto done_releroot;
1207		}
1208
1209		mtx_lock(&pr->pr_mtx);
1210		/*
1211		 * New prisons do not yet have a reference, because we do not
		 * want others to see the incomplete prison once the
1213		 * allprison_lock is downgraded.
1214		 */
1215	} else {
1216		created = 0;
1217		/*
1218		 * Grab a reference for existing prisons, to ensure they
1219		 * continue to exist for the duration of the call.
1220		 */
1221		pr->pr_ref++;
1222#if defined(VIMAGE) && (defined(INET) || defined(INET6))
1223		if ((pr->pr_flags & PR_VNET) &&
1224		    (ch_flags & (PR_IP4_USER | PR_IP6_USER))) {
1225			error = EINVAL;
1226			vfs_opterror(opts,
1227			    "vnet jails cannot have IP address restrictions");
1228			goto done_deref_locked;
1229		}
1230#endif
1231#ifdef INET
1232		if (PR_IP4_USER & ch_flags & (pr_flags ^ pr->pr_flags)) {
1233			error = EINVAL;
1234			vfs_opterror(opts,
1235			    "ip4 cannot be changed after creation");
1236			goto done_deref_locked;
1237		}
1238#endif
1239#ifdef INET6
1240		if (PR_IP6_USER & ch_flags & (pr_flags ^ pr->pr_flags)) {
1241			error = EINVAL;
1242			vfs_opterror(opts,
1243			    "ip6 cannot be changed after creation");
1244			goto done_deref_locked;
1245		}
1246#endif
1247	}
1248
1249	/* Do final error checking before setting anything. */
1250	if (gotslevel) {
1251		if (slevel < ppr->pr_securelevel) {
1252			error = EPERM;
1253			goto done_deref_locked;
1254		}
1255	}
1256	if (gotchildmax) {
1257		if (childmax >= ppr->pr_childmax) {
1258			error = EPERM;
1259			goto done_deref_locked;
1260		}
1261	}
1262	if (gotenforce) {
1263		if (enforce < ppr->pr_enforce_statfs) {
1264			error = EPERM;
1265			goto done_deref_locked;
1266		}
1267	}
1268#ifdef INET
1269	if (ip4s > 0) {
1270		if (ppr->pr_flags & PR_IP4) {
1271			/*
1272			 * Make sure the new set of IP addresses is a
1273			 * subset of the parent's list.  Don't worry
1274			 * about the parent being unlocked, as any
1275			 * setting is done with allprison_lock held.
1276			 */
1277			for (ij = 0; ij < ppr->pr_ip4s; ij++)
1278				if (ip4[0].s_addr == ppr->pr_ip4[ij].s_addr)
1279					break;
1280			if (ij == ppr->pr_ip4s) {
1281				error = EPERM;
1282				goto done_deref_locked;
1283			}
1284			if (ip4s > 1) {
1285				for (ii = ij = 1; ii < ip4s; ii++) {
1286					if (ip4[ii].s_addr ==
1287					    ppr->pr_ip4[0].s_addr)
1288						continue;
1289					for (; ij < ppr->pr_ip4s; ij++)
1290						if (ip4[ii].s_addr ==
1291						    ppr->pr_ip4[ij].s_addr)
1292							break;
1293					if (ij == ppr->pr_ip4s)
1294						break;
1295				}
1296				if (ij == ppr->pr_ip4s) {
1297					error = EPERM;
1298					goto done_deref_locked;
1299				}
1300			}
1301		}
1302		/*
1303		 * Check for conflicting IP addresses.  We permit them
1304		 * if there is no more than one IP on each jail.  If
1305		 * there is a duplicate on a jail with more than one
1306		 * IP stop checking and return error.
1307		 */
1308		tppr = ppr;
1309#ifdef VIMAGE
1310		for (; tppr != &prison0; tppr = tppr->pr_parent)
1311			if (tppr->pr_flags & PR_VNET)
1312				break;
1313#endif
1314		FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) {
1315			if (tpr == pr ||
1316#ifdef VIMAGE
1317			    (tpr != tppr && (tpr->pr_flags & PR_VNET)) ||
1318#endif
1319			    tpr->pr_uref == 0) {
1320				descend = 0;
1321				continue;
1322			}
1323			if (!(tpr->pr_flags & PR_IP4_USER))
1324				continue;
1325			descend = 0;
1326			if (tpr->pr_ip4 == NULL ||
1327			    (ip4s == 1 && tpr->pr_ip4s == 1))
1328				continue;
1329			for (ii = 0; ii < ip4s; ii++) {
1330				if (_prison_check_ip4(tpr, &ip4[ii]) == 0) {
1331					error = EADDRINUSE;
1332					vfs_opterror(opts,
1333					    "IPv4 addresses clash");
1334					goto done_deref_locked;
1335				}
1336			}
1337		}
1338	}
1339#endif
1340#ifdef INET6
1341	if (ip6s > 0) {
1342		if (ppr->pr_flags & PR_IP6) {
1343			/*
1344			 * Make sure the new set of IP addresses is a
1345			 * subset of the parent's list.
1346			 */
1347			for (ij = 0; ij < ppr->pr_ip6s; ij++)
1348				if (IN6_ARE_ADDR_EQUAL(&ip6[0],
1349				    &ppr->pr_ip6[ij]))
1350					break;
1351			if (ij == ppr->pr_ip6s) {
1352				error = EPERM;
1353				goto done_deref_locked;
1354			}
1355			if (ip6s > 1) {
1356				for (ii = ij = 1; ii < ip6s; ii++) {
1357					if (IN6_ARE_ADDR_EQUAL(&ip6[ii],
1358					     &ppr->pr_ip6[0]))
1359						continue;
1360					for (; ij < ppr->pr_ip6s; ij++)
1361						if (IN6_ARE_ADDR_EQUAL(
1362						    &ip6[ii], &ppr->pr_ip6[ij]))
1363							break;
1364					if (ij == ppr->pr_ip6s)
1365						break;
1366				}
1367				if (ij == ppr->pr_ip6s) {
1368					error = EPERM;
1369					goto done_deref_locked;
1370				}
1371			}
1372		}
1373		/* Check for conflicting IP addresses. */
1374		tppr = ppr;
1375#ifdef VIMAGE
1376		for (; tppr != &prison0; tppr = tppr->pr_parent)
1377			if (tppr->pr_flags & PR_VNET)
1378				break;
1379#endif
1380		FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) {
1381			if (tpr == pr ||
1382#ifdef VIMAGE
1383			    (tpr != tppr && (tpr->pr_flags & PR_VNET)) ||
1384#endif
1385			    tpr->pr_uref == 0) {
1386				descend = 0;
1387				continue;
1388			}
1389			if (!(tpr->pr_flags & PR_IP6_USER))
1390				continue;
1391			descend = 0;
1392			if (tpr->pr_ip6 == NULL ||
1393			    (ip6s == 1 && tpr->pr_ip6s == 1))
1394				continue;
1395			for (ii = 0; ii < ip6s; ii++) {
1396				if (_prison_check_ip6(tpr, &ip6[ii]) == 0) {
1397					error = EADDRINUSE;
1398					vfs_opterror(opts,
1399					    "IPv6 addresses clash");
1400					goto done_deref_locked;
1401				}
1402			}
1403		}
1404	}
1405#endif
1406	onamelen = namelen = 0;
1407	if (name != NULL) {
1408		/* Give a default name of the jid. */
1409		if (name[0] == '\0')
1410			snprintf(name = numbuf, sizeof(numbuf), "%d", jid);
1411		else if (strtoul(name, &p, 10) != jid && *p == '\0') {
1412			error = EINVAL;
1413			vfs_opterror(opts, "name cannot be numeric");
1414			goto done_deref_locked;
1415		}
1416		/*
1417		 * Make sure the name isn't too long for the prison or its
1418		 * children.
1419		 */
1420		onamelen = strlen(pr->pr_name);
1421		namelen = strlen(name);
1422		if (strlen(ppr->pr_name) + namelen + 2 > sizeof(pr->pr_name)) {
1423			error = ENAMETOOLONG;
1424			goto done_deref_locked;
1425		}
1426		FOREACH_PRISON_DESCENDANT(pr, tpr, descend) {
1427			if (strlen(tpr->pr_name) + (namelen - onamelen) >=
1428			    sizeof(pr->pr_name)) {
1429				error = ENAMETOOLONG;
1430				goto done_deref_locked;
1431			}
1432		}
1433	}
1434	if (pr_allow & ~ppr->pr_allow) {
1435		error = EPERM;
1436		goto done_deref_locked;
1437	}
1438
1439	/* Set the parameters of the prison. */
1440#ifdef INET
1441	redo_ip4 = 0;
1442	if (pr_flags & PR_IP4_USER) {
1443		pr->pr_flags |= PR_IP4;
1444		free(pr->pr_ip4, M_PRISON);
1445		pr->pr_ip4s = ip4s;
1446		pr->pr_ip4 = ip4;
1447		ip4 = NULL;
1448		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1449#ifdef VIMAGE
1450			if (tpr->pr_flags & PR_VNET) {
1451				descend = 0;
1452				continue;
1453			}
1454#endif
1455			if (prison_restrict_ip4(tpr, NULL)) {
1456				redo_ip4 = 1;
1457				descend = 0;
1458			}
1459		}
1460	}
1461#endif
1462#ifdef INET6
1463	redo_ip6 = 0;
1464	if (pr_flags & PR_IP6_USER) {
1465		pr->pr_flags |= PR_IP6;
1466		free(pr->pr_ip6, M_PRISON);
1467		pr->pr_ip6s = ip6s;
1468		pr->pr_ip6 = ip6;
1469		ip6 = NULL;
1470		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1471#ifdef VIMAGE
1472			if (tpr->pr_flags & PR_VNET) {
1473				descend = 0;
1474				continue;
1475			}
1476#endif
1477			if (prison_restrict_ip6(tpr, NULL)) {
1478				redo_ip6 = 1;
1479				descend = 0;
1480			}
1481		}
1482	}
1483#endif
1484	if (gotslevel) {
1485		pr->pr_securelevel = slevel;
1486		/* Set all child jails to be at least this level. */
1487		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
1488			if (tpr->pr_securelevel < slevel)
1489				tpr->pr_securelevel = slevel;
1490	}
1491	if (gotchildmax) {
1492		pr->pr_childmax = childmax;
1493		/* Set all child jails to under this limit. */
1494		FOREACH_PRISON_DESCENDANT_LOCKED_LEVEL(pr, tpr, descend, level)
1495			if (tpr->pr_childmax > childmax - level)
1496				tpr->pr_childmax = childmax > level
1497				    ? childmax - level : 0;
1498	}
1499	if (gotenforce) {
1500		pr->pr_enforce_statfs = enforce;
1501		/* Pass this restriction on to the children. */
1502		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
1503			if (tpr->pr_enforce_statfs < enforce)
1504				tpr->pr_enforce_statfs = enforce;
1505	}
1506	if (name != NULL) {
1507		if (ppr == &prison0)
1508			strlcpy(pr->pr_name, name, sizeof(pr->pr_name));
1509		else
1510			snprintf(pr->pr_name, sizeof(pr->pr_name), "%s.%s",
1511			    ppr->pr_name, name);
1512		/* Change this component of child names. */
1513		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1514			bcopy(tpr->pr_name + onamelen, tpr->pr_name + namelen,
1515			    strlen(tpr->pr_name + onamelen) + 1);
1516			bcopy(pr->pr_name, tpr->pr_name, namelen);
1517		}
1518	}
1519	if (path != NULL) {
1520		/* Try to keep a real-rooted full pathname. */
1521		if (path[0] == '/' && strcmp(mypr->pr_path, "/"))
1522			snprintf(pr->pr_path, sizeof(pr->pr_path), "%s%s",
1523			    mypr->pr_path, path);
1524		else
1525			strlcpy(pr->pr_path, path, sizeof(pr->pr_path));
1526		pr->pr_root = root;
1527	}
1528	if (PR_HOST & ch_flags & ~pr_flags) {
1529		if (pr->pr_flags & PR_HOST) {
1530			/*
1531			 * Copy the parent's host info.  As with pr_ip4 above,
1532			 * the lack of a lock on the parent is not a problem;
1533			 * it is always set with allprison_lock at least
1534			 * shared, and is held exclusively here.
1535			 */
1536			strlcpy(pr->pr_hostname, pr->pr_parent->pr_hostname,
1537			    sizeof(pr->pr_hostname));
1538			strlcpy(pr->pr_domainname, pr->pr_parent->pr_domainname,
1539			    sizeof(pr->pr_domainname));
1540			strlcpy(pr->pr_hostuuid, pr->pr_parent->pr_hostuuid,
1541			    sizeof(pr->pr_hostuuid));
1542			pr->pr_hostid = pr->pr_parent->pr_hostid;
1543		}
1544	} else if (host != NULL || domain != NULL || uuid != NULL || gothid) {
1545		/* Set this prison, and any descendants without PR_HOST. */
1546		if (host != NULL)
1547			strlcpy(pr->pr_hostname, host, sizeof(pr->pr_hostname));
1548		if (domain != NULL)
1549			strlcpy(pr->pr_domainname, domain,
1550			    sizeof(pr->pr_domainname));
1551		if (uuid != NULL)
1552			strlcpy(pr->pr_hostuuid, uuid, sizeof(pr->pr_hostuuid));
1553		if (gothid)
1554			pr->pr_hostid = hid;
1555		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1556			if (tpr->pr_flags & PR_HOST)
1557				descend = 0;
1558			else {
1559				if (host != NULL)
1560					strlcpy(tpr->pr_hostname,
1561					    pr->pr_hostname,
1562					    sizeof(tpr->pr_hostname));
1563				if (domain != NULL)
1564					strlcpy(tpr->pr_domainname,
1565					    pr->pr_domainname,
1566					    sizeof(tpr->pr_domainname));
1567				if (uuid != NULL)
1568					strlcpy(tpr->pr_hostuuid,
1569					    pr->pr_hostuuid,
1570					    sizeof(tpr->pr_hostuuid));
1571				if (gothid)
1572					tpr->pr_hostid = hid;
1573			}
1574		}
1575	}
1576	if ((tallow = ch_allow & ~pr_allow)) {
1577		/* Clear allow bits in all children. */
1578		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
1579			tpr->pr_allow &= ~tallow;
1580	}
1581	pr->pr_allow = (pr->pr_allow & ~ch_allow) | pr_allow;
1582	/*
1583	 * Persistent prisons get an extra reference, and prisons losing their
1584	 * persist flag lose that reference.  Only do this for existing prisons
1585	 * for now, so new ones will remain unseen until after the module
1586	 * handlers have completed.
1587	 */
1588	if (!created && (ch_flags & PR_PERSIST & (pr_flags ^ pr->pr_flags))) {
1589		if (pr_flags & PR_PERSIST) {
1590			pr->pr_ref++;
1591			pr->pr_uref++;
1592		} else {
1593			pr->pr_ref--;
1594			pr->pr_uref--;
1595		}
1596	}
1597	pr->pr_flags = (pr->pr_flags & ~ch_flags) | pr_flags;
1598	mtx_unlock(&pr->pr_mtx);
1599
1600	/* Locks may have prevented a complete restriction of child IP
1601	 * addresses.  If so, allocate some more memory and try again.
1602	 */
1603#ifdef INET
1604	while (redo_ip4) {
1605		ip4s = pr->pr_ip4s;
1606		ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK);
1607		mtx_lock(&pr->pr_mtx);
1608		redo_ip4 = 0;
1609		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1610#ifdef VIMAGE
1611			if (tpr->pr_flags & PR_VNET) {
1612				descend = 0;
1613				continue;
1614			}
1615#endif
1616			if (prison_restrict_ip4(tpr, ip4)) {
1617				if (ip4 != NULL)
1618					ip4 = NULL;
1619				else
1620					redo_ip4 = 1;
1621			}
1622		}
1623		mtx_unlock(&pr->pr_mtx);
1624	}
1625#endif
1626#ifdef INET6
1627	while (redo_ip6) {
1628		ip6s = pr->pr_ip6s;
1629		ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK);
1630		mtx_lock(&pr->pr_mtx);
1631		redo_ip6 = 0;
1632		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1633#ifdef VIMAGE
1634			if (tpr->pr_flags & PR_VNET) {
1635				descend = 0;
1636				continue;
1637			}
1638#endif
1639			if (prison_restrict_ip6(tpr, ip6)) {
1640				if (ip6 != NULL)
1641					ip6 = NULL;
1642				else
1643					redo_ip6 = 1;
1644			}
1645		}
1646		mtx_unlock(&pr->pr_mtx);
1647	}
1648#endif
1649
1650	/* Let the modules do their work. */
1651	sx_downgrade(&allprison_lock);
1652	if (created) {
1653		error = osd_jail_call(pr, PR_METHOD_CREATE, opts);
1654		if (error) {
1655			prison_deref(pr, PD_LIST_SLOCKED);
1656			goto done_errmsg;
1657		}
1658	}
1659	error = osd_jail_call(pr, PR_METHOD_SET, opts);
1660	if (error) {
1661		prison_deref(pr, created
1662		    ? PD_LIST_SLOCKED
1663		    : PD_DEREF | PD_LIST_SLOCKED);
1664		goto done_errmsg;
1665	}
1666
1667	/* Attach this process to the prison if requested. */
1668	if (flags & JAIL_ATTACH) {
1669		mtx_lock(&pr->pr_mtx);
1670		error = do_jail_attach(td, pr);
1671		if (error) {
1672			vfs_opterror(opts, "attach failed");
1673			if (!created)
1674				prison_deref(pr, PD_DEREF);
1675			goto done_errmsg;
1676		}
1677	}
1678
1679	/*
1680	 * Now that it is all there, drop the temporary reference from existing
1681	 * prisons.  Or add a reference to newly created persistent prisons
1682	 * (which was not done earlier so that the prison would not be publicly
1683	 * visible).
1684	 */
1685	if (!created) {
1686		prison_deref(pr, (flags & JAIL_ATTACH)
1687		    ? PD_DEREF
1688		    : PD_DEREF | PD_LIST_SLOCKED);
1689	} else {
1690		if (pr_flags & PR_PERSIST) {
1691			mtx_lock(&pr->pr_mtx);
1692			pr->pr_ref++;
1693			pr->pr_uref++;
1694			mtx_unlock(&pr->pr_mtx);
1695		}
1696		if (!(flags & JAIL_ATTACH))
1697			sx_sunlock(&allprison_lock);
1698	}
1699	td->td_retval[0] = pr->pr_id;
1700	goto done_errmsg;
1701
1702 done_deref_locked:
1703	prison_deref(pr, created
1704	    ? PD_LOCKED | PD_LIST_XLOCKED
1705	    : PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED);
1706	goto done_releroot;
1707 done_unlock_list:
1708	sx_xunlock(&allprison_lock);
1709 done_releroot:
1710	if (root != NULL) {
1711		vfslocked = VFS_LOCK_GIANT(root->v_mount);
1712		vrele(root);
1713		VFS_UNLOCK_GIANT(vfslocked);
1714	}
1715 done_errmsg:
1716	if (error) {
1717		vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len);
1718		if (errmsg_len > 0) {
1719			errmsg_pos = 2 * vfs_getopt_pos(opts, "errmsg") + 1;
1720			if (errmsg_pos > 0) {
1721				if (optuio->uio_segflg == UIO_SYSSPACE)
1722					bcopy(errmsg,
1723					   optuio->uio_iov[errmsg_pos].iov_base,
1724					   errmsg_len);
1725				else
1726					copyout(errmsg,
1727					   optuio->uio_iov[errmsg_pos].iov_base,
1728					   errmsg_len);
1729			}
1730		}
1731	}
1732 done_free:
1733#ifdef INET
1734	free(ip4, M_PRISON);
1735#endif
1736#ifdef INET6
1737	free(ip6, M_PRISON);
1738#endif
1739	vfs_freeopts(opts);
1740	return (error);
1741}
1742
1743
1744/*
1745 * struct jail_get_args {
1746 *	struct iovec *iovp;
1747 *	unsigned int iovcnt;
1748 *	int flags;
1749 * };
1750 */
1751int
1752jail_get(struct thread *td, struct jail_get_args *uap)
1753{
1754	struct uio *auio;
1755	int error;
1756
1757	/* Check that we have an even number of iovecs. */
1758	if (uap->iovcnt & 1)
1759		return (EINVAL);
1760
1761	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
1762	if (error)
1763		return (error);
1764	error = kern_jail_get(td, auio, uap->flags);
1765	if (error == 0)
1766		error = copyout(auio->uio_iov, uap->iovp,
1767		    uap->iovcnt * sizeof (struct iovec));
1768	free(auio, M_IOV);
1769	return (error);
1770}
1771
1772int
1773kern_jail_get(struct thread *td, struct uio *optuio, int flags)
1774{
1775	struct prison *pr, *mypr;
1776	struct vfsopt *opt;
1777	struct vfsoptlist *opts;
1778	char *errmsg, *name;
1779	int error, errmsg_len, errmsg_pos, fi, i, jid, len, locked, pos;
1780
1781	if (flags & ~JAIL_GET_MASK)
1782		return (EINVAL);
1783
1784	/* Get the parameter list. */
1785	error = vfs_buildopts(optuio, &opts);
1786	if (error)
1787		return (error);
1788	errmsg_pos = vfs_getopt_pos(opts, "errmsg");
1789	mypr = td->td_ucred->cr_prison;
1790
1791	/*
1792	 * Find the prison specified by one of: lastjid, jid, name.
1793	 */
1794	sx_slock(&allprison_lock);
1795	error = vfs_copyopt(opts, "lastjid", &jid, sizeof(jid));
1796	if (error == 0) {
1797		TAILQ_FOREACH(pr, &allprison, pr_list) {
1798			if (pr->pr_id > jid && prison_ischild(mypr, pr)) {
1799				mtx_lock(&pr->pr_mtx);
1800				if (pr->pr_ref > 0 &&
1801				    (pr->pr_uref > 0 || (flags & JAIL_DYING)))
1802					break;
1803				mtx_unlock(&pr->pr_mtx);
1804			}
1805		}
1806		if (pr != NULL)
1807			goto found_prison;
1808		error = ENOENT;
1809		vfs_opterror(opts, "no jail after %d", jid);
1810		goto done_unlock_list;
1811	} else if (error != ENOENT)
1812		goto done_unlock_list;
1813
1814	error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
1815	if (error == 0) {
1816		if (jid != 0) {
1817			pr = prison_find_child(mypr, jid);
1818			if (pr != NULL) {
1819				if (pr->pr_uref == 0 && !(flags & JAIL_DYING)) {
1820					mtx_unlock(&pr->pr_mtx);
1821					error = ENOENT;
1822					vfs_opterror(opts, "jail %d is dying",
1823					    jid);
1824					goto done_unlock_list;
1825				}
1826				goto found_prison;
1827			}
1828			error = ENOENT;
1829			vfs_opterror(opts, "jail %d not found", jid);
1830			goto done_unlock_list;
1831		}
1832	} else if (error != ENOENT)
1833		goto done_unlock_list;
1834
1835	error = vfs_getopt(opts, "name", (void **)&name, &len);
1836	if (error == 0) {
1837		if (len == 0 || name[len - 1] != '\0') {
1838			error = EINVAL;
1839			goto done_unlock_list;
1840		}
1841		pr = prison_find_name(mypr, name);
1842		if (pr != NULL) {
1843			if (pr->pr_uref == 0 && !(flags & JAIL_DYING)) {
1844				mtx_unlock(&pr->pr_mtx);
1845				error = ENOENT;
1846				vfs_opterror(opts, "jail \"%s\" is dying",
1847				    name);
1848				goto done_unlock_list;
1849			}
1850			goto found_prison;
1851		}
1852		error = ENOENT;
1853		vfs_opterror(opts, "jail \"%s\" not found", name);
1854		goto done_unlock_list;
1855	} else if (error != ENOENT)
1856		goto done_unlock_list;
1857
1858	vfs_opterror(opts, "no jail specified");
1859	error = ENOENT;
1860	goto done_unlock_list;
1861
1862 found_prison:
1863	/* Get the parameters of the prison. */
1864	pr->pr_ref++;
1865	locked = PD_LOCKED;
1866	td->td_retval[0] = pr->pr_id;
1867	error = vfs_setopt(opts, "jid", &pr->pr_id, sizeof(pr->pr_id));
1868	if (error != 0 && error != ENOENT)
1869		goto done_deref;
1870	i = (pr->pr_parent == mypr) ? 0 : pr->pr_parent->pr_id;
1871	error = vfs_setopt(opts, "parent", &i, sizeof(i));
1872	if (error != 0 && error != ENOENT)
1873		goto done_deref;
1874	error = vfs_setopts(opts, "name", prison_name(mypr, pr));
1875	if (error != 0 && error != ENOENT)
1876		goto done_deref;
1877	error = vfs_setopt(opts, "cpuset.id", &pr->pr_cpuset->cs_id,
1878	    sizeof(pr->pr_cpuset->cs_id));
1879	if (error != 0 && error != ENOENT)
1880		goto done_deref;
1881	error = vfs_setopts(opts, "path", prison_path(mypr, pr));
1882	if (error != 0 && error != ENOENT)
1883		goto done_deref;
1884#ifdef INET
1885	error = vfs_setopt_part(opts, "ip4.addr", pr->pr_ip4,
1886	    pr->pr_ip4s * sizeof(*pr->pr_ip4));
1887	if (error != 0 && error != ENOENT)
1888		goto done_deref;
1889#endif
1890#ifdef INET6
1891	error = vfs_setopt_part(opts, "ip6.addr", pr->pr_ip6,
1892	    pr->pr_ip6s * sizeof(*pr->pr_ip6));
1893	if (error != 0 && error != ENOENT)
1894		goto done_deref;
1895#endif
1896	error = vfs_setopt(opts, "securelevel", &pr->pr_securelevel,
1897	    sizeof(pr->pr_securelevel));
1898	if (error != 0 && error != ENOENT)
1899		goto done_deref;
1900	error = vfs_setopt(opts, "children.cur", &pr->pr_childcount,
1901	    sizeof(pr->pr_childcount));
1902	if (error != 0 && error != ENOENT)
1903		goto done_deref;
1904	error = vfs_setopt(opts, "children.max", &pr->pr_childmax,
1905	    sizeof(pr->pr_childmax));
1906	if (error != 0 && error != ENOENT)
1907		goto done_deref;
1908	error = vfs_setopts(opts, "host.hostname", pr->pr_hostname);
1909	if (error != 0 && error != ENOENT)
1910		goto done_deref;
1911	error = vfs_setopts(opts, "host.domainname", pr->pr_domainname);
1912	if (error != 0 && error != ENOENT)
1913		goto done_deref;
1914	error = vfs_setopts(opts, "host.hostuuid", pr->pr_hostuuid);
1915	if (error != 0 && error != ENOENT)
1916		goto done_deref;
1917#ifdef COMPAT_IA32
1918	if (td->td_proc->p_sysent->sv_flags & SV_IA32) {
1919		uint32_t hid32 = pr->pr_hostid;
1920
1921		error = vfs_setopt(opts, "host.hostid", &hid32, sizeof(hid32));
1922	} else
1923#endif
1924	error = vfs_setopt(opts, "host.hostid", &pr->pr_hostid,
1925	    sizeof(pr->pr_hostid));
1926	if (error != 0 && error != ENOENT)
1927		goto done_deref;
1928	error = vfs_setopt(opts, "enforce_statfs", &pr->pr_enforce_statfs,
1929	    sizeof(pr->pr_enforce_statfs));
1930	if (error != 0 && error != ENOENT)
1931		goto done_deref;
1932	for (fi = 0; fi < sizeof(pr_flag_names) / sizeof(pr_flag_names[0]);
1933	    fi++) {
1934		if (pr_flag_names[fi] == NULL)
1935			continue;
1936		i = (pr->pr_flags & (1 << fi)) ? 1 : 0;
1937		error = vfs_setopt(opts, pr_flag_names[fi], &i, sizeof(i));
1938		if (error != 0 && error != ENOENT)
1939			goto done_deref;
1940		i = !i;
1941		error = vfs_setopt(opts, pr_flag_nonames[fi], &i, sizeof(i));
1942		if (error != 0 && error != ENOENT)
1943			goto done_deref;
1944	}
1945	for (fi = 0; fi < sizeof(pr_flag_jailsys) / sizeof(pr_flag_jailsys[0]);
1946	    fi++) {
1947		i = pr->pr_flags &
1948		    (pr_flag_jailsys[fi].disable | pr_flag_jailsys[fi].new);
1949		i = pr_flag_jailsys[fi].disable &&
1950		      (i == pr_flag_jailsys[fi].disable) ? JAIL_SYS_DISABLE
1951		    : (i == pr_flag_jailsys[fi].new) ? JAIL_SYS_NEW
1952		    : JAIL_SYS_INHERIT;
1953		error =
1954		    vfs_setopt(opts, pr_flag_jailsys[fi].name, &i, sizeof(i));
1955		if (error != 0 && error != ENOENT)
1956			goto done_deref;
1957	}
1958	for (fi = 0; fi < sizeof(pr_allow_names) / sizeof(pr_allow_names[0]);
1959	    fi++) {
1960		if (pr_allow_names[fi] == NULL)
1961			continue;
1962		i = (pr->pr_allow & (1 << fi)) ? 1 : 0;
1963		error = vfs_setopt(opts, pr_allow_names[fi], &i, sizeof(i));
1964		if (error != 0 && error != ENOENT)
1965			goto done_deref;
1966		i = !i;
1967		error = vfs_setopt(opts, pr_allow_nonames[fi], &i, sizeof(i));
1968		if (error != 0 && error != ENOENT)
1969			goto done_deref;
1970	}
1971	i = (pr->pr_uref == 0);
1972	error = vfs_setopt(opts, "dying", &i, sizeof(i));
1973	if (error != 0 && error != ENOENT)
1974		goto done_deref;
1975	i = !i;
1976	error = vfs_setopt(opts, "nodying", &i, sizeof(i));
1977	if (error != 0 && error != ENOENT)
1978		goto done_deref;
1979
1980	/* Get the module parameters. */
1981	mtx_unlock(&pr->pr_mtx);
1982	locked = 0;
1983	error = osd_jail_call(pr, PR_METHOD_GET, opts);
1984	if (error)
1985		goto done_deref;
1986	prison_deref(pr, PD_DEREF | PD_LIST_SLOCKED);
1987
1988	/* By now, all parameters should have been noted. */
1989	TAILQ_FOREACH(opt, opts, link) {
1990		if (!opt->seen && strcmp(opt->name, "errmsg")) {
1991			error = EINVAL;
1992			vfs_opterror(opts, "unknown parameter: %s", opt->name);
1993			goto done_errmsg;
1994		}
1995	}
1996
1997	/* Write the fetched parameters back to userspace. */
1998	error = 0;
1999	TAILQ_FOREACH(opt, opts, link) {
2000		if (opt->pos >= 0 && opt->pos != errmsg_pos) {
2001			pos = 2 * opt->pos + 1;
2002			optuio->uio_iov[pos].iov_len = opt->len;
2003			if (opt->value != NULL) {
2004				if (optuio->uio_segflg == UIO_SYSSPACE) {
2005					bcopy(opt->value,
2006					    optuio->uio_iov[pos].iov_base,
2007					    opt->len);
2008				} else {
2009					error = copyout(opt->value,
2010					    optuio->uio_iov[pos].iov_base,
2011					    opt->len);
2012					if (error)
2013						break;
2014				}
2015			}
2016		}
2017	}
2018	goto done_errmsg;
2019
2020 done_deref:
2021	prison_deref(pr, locked | PD_DEREF | PD_LIST_SLOCKED);
2022	goto done_errmsg;
2023
2024 done_unlock_list:
2025	sx_sunlock(&allprison_lock);
2026 done_errmsg:
2027	if (error && errmsg_pos >= 0) {
2028		vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len);
2029		errmsg_pos = 2 * errmsg_pos + 1;
2030		if (errmsg_len > 0) {
2031			if (optuio->uio_segflg == UIO_SYSSPACE)
2032				bcopy(errmsg,
2033				    optuio->uio_iov[errmsg_pos].iov_base,
2034				    errmsg_len);
2035			else
2036				copyout(errmsg,
2037				    optuio->uio_iov[errmsg_pos].iov_base,
2038				    errmsg_len);
2039		}
2040	}
2041	vfs_freeopts(opts);
2042	return (error);
2043}
2044
2045
2046/*
2047 * struct jail_remove_args {
2048 *	int jid;
2049 * };
2050 */
2051int
2052jail_remove(struct thread *td, struct jail_remove_args *uap)
2053{
2054	struct prison *pr, *cpr, *lpr, *tpr;
2055	int descend, error;
2056
2057	error = priv_check(td, PRIV_JAIL_REMOVE);
2058	if (error)
2059		return (error);
2060
2061	sx_xlock(&allprison_lock);
2062	pr = prison_find_child(td->td_ucred->cr_prison, uap->jid);
2063	if (pr == NULL) {
2064		sx_xunlock(&allprison_lock);
2065		return (EINVAL);
2066	}
2067
2068	/* Remove all descendants of this prison, then remove this prison. */
2069	pr->pr_ref++;
2070	pr->pr_flags |= PR_REMOVE;
2071	if (!LIST_EMPTY(&pr->pr_children)) {
2072		mtx_unlock(&pr->pr_mtx);
2073		lpr = NULL;
2074		FOREACH_PRISON_DESCENDANT(pr, cpr, descend) {
2075			mtx_lock(&cpr->pr_mtx);
2076			if (cpr->pr_ref > 0) {
2077				tpr = cpr;
2078				cpr->pr_ref++;
2079				cpr->pr_flags |= PR_REMOVE;
2080			} else {
2081				/* Already removed - do not do it again. */
2082				tpr = NULL;
2083			}
2084			mtx_unlock(&cpr->pr_mtx);
2085			if (lpr != NULL) {
2086				mtx_lock(&lpr->pr_mtx);
2087				prison_remove_one(lpr);
2088				sx_xlock(&allprison_lock);
2089			}
2090			lpr = tpr;
2091		}
2092		if (lpr != NULL) {
2093			mtx_lock(&lpr->pr_mtx);
2094			prison_remove_one(lpr);
2095			sx_xlock(&allprison_lock);
2096		}
2097		mtx_lock(&pr->pr_mtx);
2098	}
2099	prison_remove_one(pr);
2100	return (0);
2101}
2102
2103static void
2104prison_remove_one(struct prison *pr)
2105{
2106	struct proc *p;
2107	int deuref;
2108
2109	/* If the prison was persistent, it is not anymore. */
2110	deuref = 0;
2111	if (pr->pr_flags & PR_PERSIST) {
2112		pr->pr_ref--;
2113		deuref = PD_DEUREF;
2114		pr->pr_flags &= ~PR_PERSIST;
2115	}
2116
2117	/*
2118	 * jail_remove added a reference.  If that's the only one, remove
2119	 * the prison now.
2120	 */
2121	KASSERT(pr->pr_ref > 0,
2122	    ("prison_remove_one removing a dead prison (jid=%d)", pr->pr_id));
2123	if (pr->pr_ref == 1) {
2124		prison_deref(pr,
2125		    deuref | PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED);
2126		return;
2127	}
2128
2129	mtx_unlock(&pr->pr_mtx);
2130	sx_xunlock(&allprison_lock);
2131	/*
2132	 * Kill all processes unfortunate enough to be attached to this prison.
2133	 */
2134	sx_slock(&allproc_lock);
2135	LIST_FOREACH(p, &allproc, p_list) {
2136		PROC_LOCK(p);
2137		if (p->p_state != PRS_NEW && p->p_ucred &&
2138		    p->p_ucred->cr_prison == pr)
2139			psignal(p, SIGKILL);
2140		PROC_UNLOCK(p);
2141	}
2142	sx_sunlock(&allproc_lock);
2143	/* Remove the temporary reference added by jail_remove. */
2144	prison_deref(pr, deuref | PD_DEREF);
2145}
2146
2147
2148/*
2149 * struct jail_attach_args {
2150 *	int jid;
2151 * };
2152 */
2153int
2154jail_attach(struct thread *td, struct jail_attach_args *uap)
2155{
2156	struct prison *pr;
2157	int error;
2158
2159	error = priv_check(td, PRIV_JAIL_ATTACH);
2160	if (error)
2161		return (error);
2162
2163	sx_slock(&allprison_lock);
2164	pr = prison_find_child(td->td_ucred->cr_prison, uap->jid);
2165	if (pr == NULL) {
2166		sx_sunlock(&allprison_lock);
2167		return (EINVAL);
2168	}
2169
2170	/*
2171	 * Do not allow a process to attach to a prison that is not
2172	 * considered to be "alive".
2173	 */
2174	if (pr->pr_uref == 0) {
2175		mtx_unlock(&pr->pr_mtx);
2176		sx_sunlock(&allprison_lock);
2177		return (EINVAL);
2178	}
2179
2180	return (do_jail_attach(td, pr));
2181}
2182
2183static int
2184do_jail_attach(struct thread *td, struct prison *pr)
2185{
2186	struct prison *ppr;
2187	struct proc *p;
2188	struct ucred *newcred, *oldcred;
2189	int vfslocked, error;
2190
2191	/*
2192	 * XXX: Note that there is a slight race here if two threads
2193	 * in the same privileged process attempt to attach to two
2194	 * different jails at the same time.  It is important for
2195	 * user processes not to do this, or they might end up with
2196	 * a process root from one prison, but attached to the jail
2197	 * of another.
2198	 */
2199	pr->pr_ref++;
2200	pr->pr_uref++;
2201	mtx_unlock(&pr->pr_mtx);
2202
2203	/* Let modules do whatever they need to prepare for attaching. */
2204	error = osd_jail_call(pr, PR_METHOD_ATTACH, td);
2205	if (error) {
2206		prison_deref(pr, PD_DEREF | PD_DEUREF | PD_LIST_SLOCKED);
2207		return (error);
2208	}
2209	sx_sunlock(&allprison_lock);
2210
2211	/*
2212	 * Reparent the newly attached process to this jail.
2213	 */
2214	ppr = td->td_ucred->cr_prison;
2215	p = td->td_proc;
2216	error = cpuset_setproc_update_set(p, pr->pr_cpuset);
2217	if (error)
2218		goto e_revert_osd;
2219
2220	vfslocked = VFS_LOCK_GIANT(pr->pr_root->v_mount);
2221	vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY);
2222	if ((error = change_dir(pr->pr_root, td)) != 0)
2223		goto e_unlock;
2224#ifdef MAC
2225	if ((error = mac_vnode_check_chroot(td->td_ucred, pr->pr_root)))
2226		goto e_unlock;
2227#endif
2228	VOP_UNLOCK(pr->pr_root, 0);
2229	if ((error = change_root(pr->pr_root, td)))
2230		goto e_unlock_giant;
2231	VFS_UNLOCK_GIANT(vfslocked);
2232
2233	newcred = crget();
2234	PROC_LOCK(p);
2235	oldcred = p->p_ucred;
2236	setsugid(p);
2237	crcopy(newcred, oldcred);
2238	newcred->cr_prison = pr;
2239	p->p_ucred = newcred;
2240	PROC_UNLOCK(p);
2241	crfree(oldcred);
2242	prison_deref(ppr, PD_DEREF | PD_DEUREF);
2243	return (0);
2244 e_unlock:
2245	VOP_UNLOCK(pr->pr_root, 0);
2246 e_unlock_giant:
2247	VFS_UNLOCK_GIANT(vfslocked);
2248 e_revert_osd:
2249	/* Tell modules this thread is still in its old jail after all. */
2250	(void)osd_jail_call(ppr, PR_METHOD_ATTACH, td);
2251	prison_deref(pr, PD_DEREF | PD_DEUREF);
2252	return (error);
2253}
2254
2255
2256/*
2257 * Returns a locked prison instance, or NULL on failure.
2258 */
2259struct prison *
2260prison_find(int prid)
2261{
2262	struct prison *pr;
2263
2264	sx_assert(&allprison_lock, SX_LOCKED);
2265	TAILQ_FOREACH(pr, &allprison, pr_list) {
2266		if (pr->pr_id == prid) {
2267			mtx_lock(&pr->pr_mtx);
2268			if (pr->pr_ref > 0)
2269				return (pr);
2270			mtx_unlock(&pr->pr_mtx);
2271		}
2272	}
2273	return (NULL);
2274}
2275
2276/*
2277 * Find a prison that is a descendant of mypr.  Returns a locked prison or NULL.
2278 */
2279struct prison *
2280prison_find_child(struct prison *mypr, int prid)
2281{
2282	struct prison *pr;
2283	int descend;
2284
2285	sx_assert(&allprison_lock, SX_LOCKED);
2286	FOREACH_PRISON_DESCENDANT(mypr, pr, descend) {
2287		if (pr->pr_id == prid) {
2288			mtx_lock(&pr->pr_mtx);
2289			if (pr->pr_ref > 0)
2290				return (pr);
2291			mtx_unlock(&pr->pr_mtx);
2292		}
2293	}
2294	return (NULL);
2295}
2296
2297/*
2298 * Look for the name relative to mypr.  Returns a locked prison or NULL.
2299 */
2300struct prison *
2301prison_find_name(struct prison *mypr, const char *name)
2302{
2303	struct prison *pr, *deadpr;
2304	size_t mylen;
2305	int descend;
2306
2307	sx_assert(&allprison_lock, SX_LOCKED);
2308	mylen = (mypr == &prison0) ? 0 : strlen(mypr->pr_name) + 1;
2309 again:
2310	deadpr = NULL;
2311	FOREACH_PRISON_DESCENDANT(mypr, pr, descend) {
2312		if (!strcmp(pr->pr_name + mylen, name)) {
2313			mtx_lock(&pr->pr_mtx);
2314			if (pr->pr_ref > 0) {
2315				if (pr->pr_uref > 0)
2316					return (pr);
2317				deadpr = pr;
2318			}
2319			mtx_unlock(&pr->pr_mtx);
2320		}
2321	}
2322	/* There was no valid prison - perhaps there was a dying one. */
2323	if (deadpr != NULL) {
2324		mtx_lock(&deadpr->pr_mtx);
2325		if (deadpr->pr_ref == 0) {
2326			mtx_unlock(&deadpr->pr_mtx);
2327			goto again;
2328		}
2329	}
2330	return (deadpr);
2331}
2332
2333/*
2334 * See if a prison has the specific flag set.
2335 */
2336int
2337prison_flag(struct ucred *cred, unsigned flag)
2338{
2339
2340	/* This is an atomic read, so no locking is necessary. */
2341	return (cred->cr_prison->pr_flags & flag);
2342}
2343
2344int
2345prison_allow(struct ucred *cred, unsigned flag)
2346{
2347
2348	/* This is an atomic read, so no locking is necessary. */
2349	return (cred->cr_prison->pr_allow & flag);
2350}
2351
2352/*
2353 * Remove a prison reference.  If that was the last reference, remove the
2354 * prison itself - but not in this context in case there are locks held.
2355 */
2356void
2357prison_free_locked(struct prison *pr)
2358{
2359
2360	mtx_assert(&pr->pr_mtx, MA_OWNED);
2361	pr->pr_ref--;
2362	if (pr->pr_ref == 0) {
2363		mtx_unlock(&pr->pr_mtx);
2364		TASK_INIT(&pr->pr_task, 0, prison_complete, pr);
2365		taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
2366		return;
2367	}
2368	mtx_unlock(&pr->pr_mtx);
2369}
2370
2371void
2372prison_free(struct prison *pr)
2373{
2374
2375	mtx_lock(&pr->pr_mtx);
2376	prison_free_locked(pr);
2377}
2378
/*
 * Task handler queued by prison_free_locked(): run prison_deref() on the
 * prison passed as context without dropping any further reference.  The
 * pending count is not used.
 */
static void
prison_complete(void *context, int pending)
{
	struct prison *pr;

	pr = (struct prison *)context;
	prison_deref(pr, 0);
}
2385
2386/*
2387 * Remove a prison reference (usually).  This internal version assumes no
2388 * mutexes are held, except perhaps the prison itself.  If there are no more
2389 * references, release and delist the prison.  On completion, the prison lock
2390 * and the allprison lock are both unlocked.
2391 */
2392static void
2393prison_deref(struct prison *pr, int flags)
2394{
2395	struct prison *ppr, *tpr;
2396	int vfslocked;
2397
2398	if (!(flags & PD_LOCKED))
2399		mtx_lock(&pr->pr_mtx);
2400	/* Decrement the user references in a separate loop. */
2401	if (flags & PD_DEUREF) {
2402		for (tpr = pr;; tpr = tpr->pr_parent) {
2403			if (tpr != pr)
2404				mtx_lock(&tpr->pr_mtx);
2405			if (--tpr->pr_uref > 0)
2406				break;
2407			KASSERT(tpr != &prison0, ("prison0 pr_uref=0"));
2408			mtx_unlock(&tpr->pr_mtx);
2409		}
2410		/* Done if there were only user references to remove. */
2411		if (!(flags & PD_DEREF)) {
2412			mtx_unlock(&tpr->pr_mtx);
2413			if (flags & PD_LIST_SLOCKED)
2414				sx_sunlock(&allprison_lock);
2415			else if (flags & PD_LIST_XLOCKED)
2416				sx_xunlock(&allprison_lock);
2417			return;
2418		}
2419		if (tpr != pr) {
2420			mtx_unlock(&tpr->pr_mtx);
2421			mtx_lock(&pr->pr_mtx);
2422		}
2423	}
2424
2425	for (;;) {
2426		if (flags & PD_DEREF)
2427			pr->pr_ref--;
2428		/* If the prison still has references, nothing else to do. */
2429		if (pr->pr_ref > 0) {
2430			mtx_unlock(&pr->pr_mtx);
2431			if (flags & PD_LIST_SLOCKED)
2432				sx_sunlock(&allprison_lock);
2433			else if (flags & PD_LIST_XLOCKED)
2434				sx_xunlock(&allprison_lock);
2435			return;
2436		}
2437
2438		mtx_unlock(&pr->pr_mtx);
2439		if (flags & PD_LIST_SLOCKED) {
2440			if (!sx_try_upgrade(&allprison_lock)) {
2441				sx_sunlock(&allprison_lock);
2442				sx_xlock(&allprison_lock);
2443			}
2444		} else if (!(flags & PD_LIST_XLOCKED))
2445			sx_xlock(&allprison_lock);
2446
2447		TAILQ_REMOVE(&allprison, pr, pr_list);
2448		LIST_REMOVE(pr, pr_sibling);
2449		ppr = pr->pr_parent;
2450		for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent)
2451			tpr->pr_childcount--;
2452		sx_downgrade(&allprison_lock);
2453
2454#ifdef VIMAGE
2455		if (pr->pr_flags & PR_VNET)
2456			vnet_destroy(pr->pr_vnet);
2457#endif
2458		if (pr->pr_root != NULL) {
2459			vfslocked = VFS_LOCK_GIANT(pr->pr_root->v_mount);
2460			vrele(pr->pr_root);
2461			VFS_UNLOCK_GIANT(vfslocked);
2462		}
2463		mtx_destroy(&pr->pr_mtx);
2464#ifdef INET
2465		free(pr->pr_ip4, M_PRISON);
2466#endif
2467#ifdef INET6
2468		free(pr->pr_ip6, M_PRISON);
2469#endif
2470		if (pr->pr_cpuset != NULL)
2471			cpuset_rel(pr->pr_cpuset);
2472		osd_jail_exit(pr);
2473		free(pr, M_PRISON);
2474
2475		/* Removing a prison frees a reference on its parent. */
2476		pr = ppr;
2477		mtx_lock(&pr->pr_mtx);
2478		flags = PD_DEREF | PD_LIST_SLOCKED;
2479	}
2480}
2481
2482void
2483prison_hold_locked(struct prison *pr)
2484{
2485
2486	mtx_assert(&pr->pr_mtx, MA_OWNED);
2487	KASSERT(pr->pr_ref > 0,
2488	    ("Trying to hold dead prison (jid=%d).", pr->pr_id));
2489	pr->pr_ref++;
2490}
2491
2492void
2493prison_hold(struct prison *pr)
2494{
2495
2496	mtx_lock(&pr->pr_mtx);
2497	prison_hold_locked(pr);
2498	mtx_unlock(&pr->pr_mtx);
2499}
2500
2501void
2502prison_proc_hold(struct prison *pr)
2503{
2504
2505	mtx_lock(&pr->pr_mtx);
2506	KASSERT(pr->pr_uref > 0,
2507	    ("Cannot add a process to a non-alive prison (jid=%d)", pr->pr_id));
2508	pr->pr_uref++;
2509	mtx_unlock(&pr->pr_mtx);
2510}
2511
2512void
2513prison_proc_free(struct prison *pr)
2514{
2515
2516	mtx_lock(&pr->pr_mtx);
2517	KASSERT(pr->pr_uref > 0,
2518	    ("Trying to kill a process in a dead prison (jid=%d)", pr->pr_id));
2519	prison_deref(pr, PD_DEUREF | PD_LOCKED);
2520}
2521
2522
2523#ifdef INET
2524/*
2525 * Restrict a prison's IP address list with its parent's, possibly replacing
2526 * it.  Return true if the replacement buffer was used (or would have been).
2527 */
2528static int
2529prison_restrict_ip4(struct prison *pr, struct in_addr *newip4)
2530{
2531	int ii, ij, used;
2532	struct prison *ppr;
2533
2534	ppr = pr->pr_parent;
2535	if (!(pr->pr_flags & PR_IP4_USER)) {
2536		/* This has no user settings, so just copy the parent's list. */
2537		if (pr->pr_ip4s < ppr->pr_ip4s) {
2538			/*
2539			 * There's no room for the parent's list.  Use the
2540			 * new list buffer, which is assumed to be big enough
2541			 * (if it was passed).  If there's no buffer, try to
2542			 * allocate one.
2543			 */
2544			used = 1;
2545			if (newip4 == NULL) {
2546				newip4 = malloc(ppr->pr_ip4s * sizeof(*newip4),
2547				    M_PRISON, M_NOWAIT);
2548				if (newip4 != NULL)
2549					used = 0;
2550			}
2551			if (newip4 != NULL) {
2552				bcopy(ppr->pr_ip4, newip4,
2553				    ppr->pr_ip4s * sizeof(*newip4));
2554				free(pr->pr_ip4, M_PRISON);
2555				pr->pr_ip4 = newip4;
2556				pr->pr_ip4s = ppr->pr_ip4s;
2557			}
2558			return (used);
2559		}
2560		pr->pr_ip4s = ppr->pr_ip4s;
2561		if (pr->pr_ip4s > 0)
2562			bcopy(ppr->pr_ip4, pr->pr_ip4,
2563			    pr->pr_ip4s * sizeof(*newip4));
2564		else if (pr->pr_ip4 != NULL) {
2565			free(pr->pr_ip4, M_PRISON);
2566			pr->pr_ip4 = NULL;
2567		}
2568	} else if (pr->pr_ip4s > 0) {
2569		/* Remove addresses that aren't in the parent. */
2570		for (ij = 0; ij < ppr->pr_ip4s; ij++)
2571			if (pr->pr_ip4[0].s_addr == ppr->pr_ip4[ij].s_addr)
2572				break;
2573		if (ij < ppr->pr_ip4s)
2574			ii = 1;
2575		else {
2576			bcopy(pr->pr_ip4 + 1, pr->pr_ip4,
2577			    --pr->pr_ip4s * sizeof(*pr->pr_ip4));
2578			ii = 0;
2579		}
2580		for (ij = 1; ii < pr->pr_ip4s; ) {
2581			if (pr->pr_ip4[ii].s_addr == ppr->pr_ip4[0].s_addr) {
2582				ii++;
2583				continue;
2584			}
2585			switch (ij >= ppr->pr_ip4s ? -1 :
2586				qcmp_v4(&pr->pr_ip4[ii], &ppr->pr_ip4[ij])) {
2587			case -1:
2588				bcopy(pr->pr_ip4 + ii + 1, pr->pr_ip4 + ii,
2589				    (--pr->pr_ip4s - ii) * sizeof(*pr->pr_ip4));
2590				break;
2591			case 0:
2592				ii++;
2593				ij++;
2594				break;
2595			case 1:
2596				ij++;
2597				break;
2598			}
2599		}
2600		if (pr->pr_ip4s == 0) {
2601			pr->pr_flags |= PR_IP4_DISABLE;
2602			free(pr->pr_ip4, M_PRISON);
2603			pr->pr_ip4 = NULL;
2604		}
2605	}
2606	return (0);
2607}
2608
2609/*
2610 * Pass back primary IPv4 address of this jail.
2611 *
2612 * If not restricted return success but do not alter the address.  Caller has
2613 * to make sure to initialize it correctly (e.g. INADDR_ANY).
2614 *
2615 * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv4.
2616 * Address returned in NBO.
2617 */
2618int
2619prison_get_ip4(struct ucred *cred, struct in_addr *ia)
2620{
2621	struct prison *pr;
2622
2623	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
2624	KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
2625
2626	pr = cred->cr_prison;
2627	if (!(pr->pr_flags & PR_IP4))
2628		return (0);
2629	mtx_lock(&pr->pr_mtx);
2630	if (!(pr->pr_flags & PR_IP4)) {
2631		mtx_unlock(&pr->pr_mtx);
2632		return (0);
2633	}
2634	if (pr->pr_ip4 == NULL) {
2635		mtx_unlock(&pr->pr_mtx);
2636		return (EAFNOSUPPORT);
2637	}
2638
2639	ia->s_addr = pr->pr_ip4[0].s_addr;
2640	mtx_unlock(&pr->pr_mtx);
2641	return (0);
2642}
2643
2644/*
2645 * Return true if pr1 and pr2 have the same IPv4 address restrictions.
2646 */
2647int
2648prison_equal_ip4(struct prison *pr1, struct prison *pr2)
2649{
2650
2651	if (pr1 == pr2)
2652		return (1);
2653
2654	/*
2655	 * No need to lock since the PR_IP4_USER flag can't be altered for
2656	 * existing prisons.
2657	 */
2658	while (pr1 != &prison0 &&
2659#ifdef VIMAGE
2660	       !(pr1->pr_flags & PR_VNET) &&
2661#endif
2662	       !(pr1->pr_flags & PR_IP4_USER))
2663		pr1 = pr1->pr_parent;
2664	while (pr2 != &prison0 &&
2665#ifdef VIMAGE
2666	       !(pr2->pr_flags & PR_VNET) &&
2667#endif
2668	       !(pr2->pr_flags & PR_IP4_USER))
2669		pr2 = pr2->pr_parent;
2670	return (pr1 == pr2);
2671}
2672
2673/*
2674 * Make sure our (source) address is set to something meaningful to this
2675 * jail.
2676 *
2677 * Returns 0 if jail doesn't restrict IPv4 or if address belongs to jail,
2678 * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail
2679 * doesn't allow IPv4.  Address passed in in NBO and returned in NBO.
2680 */
2681int
2682prison_local_ip4(struct ucred *cred, struct in_addr *ia)
2683{
2684	struct prison *pr;
2685	struct in_addr ia0;
2686	int error;
2687
2688	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
2689	KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
2690
2691	pr = cred->cr_prison;
2692	if (!(pr->pr_flags & PR_IP4))
2693		return (0);
2694	mtx_lock(&pr->pr_mtx);
2695	if (!(pr->pr_flags & PR_IP4)) {
2696		mtx_unlock(&pr->pr_mtx);
2697		return (0);
2698	}
2699	if (pr->pr_ip4 == NULL) {
2700		mtx_unlock(&pr->pr_mtx);
2701		return (EAFNOSUPPORT);
2702	}
2703
2704	ia0.s_addr = ntohl(ia->s_addr);
2705	if (ia0.s_addr == INADDR_LOOPBACK) {
2706		ia->s_addr = pr->pr_ip4[0].s_addr;
2707		mtx_unlock(&pr->pr_mtx);
2708		return (0);
2709	}
2710
2711	if (ia0.s_addr == INADDR_ANY) {
2712		/*
2713		 * In case there is only 1 IPv4 address, bind directly.
2714		 */
2715		if (pr->pr_ip4s == 1)
2716			ia->s_addr = pr->pr_ip4[0].s_addr;
2717		mtx_unlock(&pr->pr_mtx);
2718		return (0);
2719	}
2720
2721	error = _prison_check_ip4(pr, ia);
2722	mtx_unlock(&pr->pr_mtx);
2723	return (error);
2724}
2725
2726/*
2727 * Rewrite destination address in case we will connect to loopback address.
2728 *
2729 * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv4.
2730 * Address passed in in NBO and returned in NBO.
2731 */
2732int
2733prison_remote_ip4(struct ucred *cred, struct in_addr *ia)
2734{
2735	struct prison *pr;
2736
2737	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
2738	KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
2739
2740	pr = cred->cr_prison;
2741	if (!(pr->pr_flags & PR_IP4))
2742		return (0);
2743	mtx_lock(&pr->pr_mtx);
2744	if (!(pr->pr_flags & PR_IP4)) {
2745		mtx_unlock(&pr->pr_mtx);
2746		return (0);
2747	}
2748	if (pr->pr_ip4 == NULL) {
2749		mtx_unlock(&pr->pr_mtx);
2750		return (EAFNOSUPPORT);
2751	}
2752
2753	if (ntohl(ia->s_addr) == INADDR_LOOPBACK) {
2754		ia->s_addr = pr->pr_ip4[0].s_addr;
2755		mtx_unlock(&pr->pr_mtx);
2756		return (0);
2757	}
2758
2759	/*
2760	 * Return success because nothing had to be changed.
2761	 */
2762	mtx_unlock(&pr->pr_mtx);
2763	return (0);
2764}
2765
2766/*
2767 * Check if given address belongs to the jail referenced by cred/prison.
2768 *
2769 * Returns 0 if jail doesn't restrict IPv4 or if address belongs to jail,
2770 * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail
2771 * doesn't allow IPv4.  Address passed in in NBO.
2772 */
2773static int
2774_prison_check_ip4(struct prison *pr, struct in_addr *ia)
2775{
2776	int i, a, z, d;
2777
2778	/*
2779	 * Check the primary IP.
2780	 */
2781	if (pr->pr_ip4[0].s_addr == ia->s_addr)
2782		return (0);
2783
2784	/*
2785	 * All the other IPs are sorted so we can do a binary search.
2786	 */
2787	a = 0;
2788	z = pr->pr_ip4s - 2;
2789	while (a <= z) {
2790		i = (a + z) / 2;
2791		d = qcmp_v4(&pr->pr_ip4[i+1], ia);
2792		if (d > 0)
2793			z = i - 1;
2794		else if (d < 0)
2795			a = i + 1;
2796		else
2797			return (0);
2798	}
2799
2800	return (EADDRNOTAVAIL);
2801}
2802
2803int
2804prison_check_ip4(struct ucred *cred, struct in_addr *ia)
2805{
2806	struct prison *pr;
2807	int error;
2808
2809	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
2810	KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
2811
2812	pr = cred->cr_prison;
2813	if (!(pr->pr_flags & PR_IP4))
2814		return (0);
2815	mtx_lock(&pr->pr_mtx);
2816	if (!(pr->pr_flags & PR_IP4)) {
2817		mtx_unlock(&pr->pr_mtx);
2818		return (0);
2819	}
2820	if (pr->pr_ip4 == NULL) {
2821		mtx_unlock(&pr->pr_mtx);
2822		return (EAFNOSUPPORT);
2823	}
2824
2825	error = _prison_check_ip4(pr, ia);
2826	mtx_unlock(&pr->pr_mtx);
2827	return (error);
2828}
2829#endif
2830
2831#ifdef INET6
2832static int
2833prison_restrict_ip6(struct prison *pr, struct in6_addr *newip6)
2834{
2835	int ii, ij, used;
2836	struct prison *ppr;
2837
2838	ppr = pr->pr_parent;
2839	if (!(pr->pr_flags & PR_IP6_USER)) {
2840		/* This has no user settings, so just copy the parent's list. */
2841		if (pr->pr_ip6s < ppr->pr_ip6s) {
2842			/*
2843			 * There's no room for the parent's list.  Use the
2844			 * new list buffer, which is assumed to be big enough
2845			 * (if it was passed).  If there's no buffer, try to
2846			 * allocate one.
2847			 */
2848			used = 1;
2849			if (newip6 == NULL) {
2850				newip6 = malloc(ppr->pr_ip6s * sizeof(*newip6),
2851				    M_PRISON, M_NOWAIT);
2852				if (newip6 != NULL)
2853					used = 0;
2854			}
2855			if (newip6 != NULL) {
2856				bcopy(ppr->pr_ip6, newip6,
2857				    ppr->pr_ip6s * sizeof(*newip6));
2858				free(pr->pr_ip6, M_PRISON);
2859				pr->pr_ip6 = newip6;
2860				pr->pr_ip6s = ppr->pr_ip6s;
2861			}
2862			return (used);
2863		}
2864		pr->pr_ip6s = ppr->pr_ip6s;
2865		if (pr->pr_ip6s > 0)
2866			bcopy(ppr->pr_ip6, pr->pr_ip6,
2867			    pr->pr_ip6s * sizeof(*newip6));
2868		else if (pr->pr_ip6 != NULL) {
2869			free(pr->pr_ip6, M_PRISON);
2870			pr->pr_ip6 = NULL;
2871		}
2872	} else if (pr->pr_ip6s > 0) {
2873		/* Remove addresses that aren't in the parent. */
2874		for (ij = 0; ij < ppr->pr_ip6s; ij++)
2875			if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[0],
2876			    &ppr->pr_ip6[ij]))
2877				break;
2878		if (ij < ppr->pr_ip6s)
2879			ii = 1;
2880		else {
2881			bcopy(pr->pr_ip6 + 1, pr->pr_ip6,
2882			    --pr->pr_ip6s * sizeof(*pr->pr_ip6));
2883			ii = 0;
2884		}
2885		for (ij = 1; ii < pr->pr_ip6s; ) {
2886			if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[ii],
2887			    &ppr->pr_ip6[0])) {
2888				ii++;
2889				continue;
2890			}
2891			switch (ij >= ppr->pr_ip4s ? -1 :
2892				qcmp_v6(&pr->pr_ip6[ii], &ppr->pr_ip6[ij])) {
2893			case -1:
2894				bcopy(pr->pr_ip6 + ii + 1, pr->pr_ip6 + ii,
2895				    (--pr->pr_ip6s - ii) * sizeof(*pr->pr_ip6));
2896				break;
2897			case 0:
2898				ii++;
2899				ij++;
2900				break;
2901			case 1:
2902				ij++;
2903				break;
2904			}
2905		}
2906		if (pr->pr_ip6s == 0) {
2907			pr->pr_flags |= PR_IP6_DISABLE;
2908			free(pr->pr_ip6, M_PRISON);
2909			pr->pr_ip6 = NULL;
2910		}
2911	}
2912	return 0;
2913}
2914
2915/*
2916 * Pass back primary IPv6 address for this jail.
2917 *
2918 * If not restricted return success but do not alter the address.  Caller has
2919 * to make sure to initialize it correctly (e.g. IN6ADDR_ANY_INIT).
2920 *
2921 * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv6.
2922 */
2923int
2924prison_get_ip6(struct ucred *cred, struct in6_addr *ia6)
2925{
2926	struct prison *pr;
2927
2928	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
2929	KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
2930
2931	pr = cred->cr_prison;
2932	if (!(pr->pr_flags & PR_IP6))
2933		return (0);
2934	mtx_lock(&pr->pr_mtx);
2935	if (!(pr->pr_flags & PR_IP6)) {
2936		mtx_unlock(&pr->pr_mtx);
2937		return (0);
2938	}
2939	if (pr->pr_ip6 == NULL) {
2940		mtx_unlock(&pr->pr_mtx);
2941		return (EAFNOSUPPORT);
2942	}
2943
2944	bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
2945	mtx_unlock(&pr->pr_mtx);
2946	return (0);
2947}
2948
2949/*
2950 * Return true if pr1 and pr2 have the same IPv6 address restrictions.
2951 */
2952int
2953prison_equal_ip6(struct prison *pr1, struct prison *pr2)
2954{
2955
2956	if (pr1 == pr2)
2957		return (1);
2958
2959	while (pr1 != &prison0 &&
2960#ifdef VIMAGE
2961	       !(pr1->pr_flags & PR_VNET) &&
2962#endif
2963	       !(pr1->pr_flags & PR_IP6_USER))
2964		pr1 = pr1->pr_parent;
2965	while (pr2 != &prison0 &&
2966#ifdef VIMAGE
2967	       !(pr2->pr_flags & PR_VNET) &&
2968#endif
2969	       !(pr2->pr_flags & PR_IP6_USER))
2970		pr2 = pr2->pr_parent;
2971	return (pr1 == pr2);
2972}
2973
2974/*
2975 * Make sure our (source) address is set to something meaningful to this jail.
2976 *
2977 * v6only should be set based on (inp->inp_flags & IN6P_IPV6_V6ONLY != 0)
2978 * when needed while binding.
2979 *
2980 * Returns 0 if jail doesn't restrict IPv6 or if address belongs to jail,
2981 * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail
2982 * doesn't allow IPv6.
2983 */
2984int
2985prison_local_ip6(struct ucred *cred, struct in6_addr *ia6, int v6only)
2986{
2987	struct prison *pr;
2988	int error;
2989
2990	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
2991	KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
2992
2993	pr = cred->cr_prison;
2994	if (!(pr->pr_flags & PR_IP6))
2995		return (0);
2996	mtx_lock(&pr->pr_mtx);
2997	if (!(pr->pr_flags & PR_IP6)) {
2998		mtx_unlock(&pr->pr_mtx);
2999		return (0);
3000	}
3001	if (pr->pr_ip6 == NULL) {
3002		mtx_unlock(&pr->pr_mtx);
3003		return (EAFNOSUPPORT);
3004	}
3005
3006	if (IN6_IS_ADDR_LOOPBACK(ia6)) {
3007		bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
3008		mtx_unlock(&pr->pr_mtx);
3009		return (0);
3010	}
3011
3012	if (IN6_IS_ADDR_UNSPECIFIED(ia6)) {
3013		/*
3014		 * In case there is only 1 IPv6 address, and v6only is true,
3015		 * then bind directly.
3016		 */
3017		if (v6only != 0 && pr->pr_ip6s == 1)
3018			bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
3019		mtx_unlock(&pr->pr_mtx);
3020		return (0);
3021	}
3022
3023	error = _prison_check_ip6(pr, ia6);
3024	mtx_unlock(&pr->pr_mtx);
3025	return (error);
3026}
3027
3028/*
3029 * Rewrite destination address in case we will connect to loopback address.
3030 *
3031 * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv6.
3032 */
3033int
3034prison_remote_ip6(struct ucred *cred, struct in6_addr *ia6)
3035{
3036	struct prison *pr;
3037
3038	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3039	KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
3040
3041	pr = cred->cr_prison;
3042	if (!(pr->pr_flags & PR_IP6))
3043		return (0);
3044	mtx_lock(&pr->pr_mtx);
3045	if (!(pr->pr_flags & PR_IP6)) {
3046		mtx_unlock(&pr->pr_mtx);
3047		return (0);
3048	}
3049	if (pr->pr_ip6 == NULL) {
3050		mtx_unlock(&pr->pr_mtx);
3051		return (EAFNOSUPPORT);
3052	}
3053
3054	if (IN6_IS_ADDR_LOOPBACK(ia6)) {
3055		bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
3056		mtx_unlock(&pr->pr_mtx);
3057		return (0);
3058	}
3059
3060	/*
3061	 * Return success because nothing had to be changed.
3062	 */
3063	mtx_unlock(&pr->pr_mtx);
3064	return (0);
3065}
3066
3067/*
3068 * Check if given address belongs to the jail referenced by cred/prison.
3069 *
3070 * Returns 0 if jail doesn't restrict IPv6 or if address belongs to jail,
3071 * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail
3072 * doesn't allow IPv6.
3073 */
3074static int
3075_prison_check_ip6(struct prison *pr, struct in6_addr *ia6)
3076{
3077	int i, a, z, d;
3078
3079	/*
3080	 * Check the primary IP.
3081	 */
3082	if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[0], ia6))
3083		return (0);
3084
3085	/*
3086	 * All the other IPs are sorted so we can do a binary search.
3087	 */
3088	a = 0;
3089	z = pr->pr_ip6s - 2;
3090	while (a <= z) {
3091		i = (a + z) / 2;
3092		d = qcmp_v6(&pr->pr_ip6[i+1], ia6);
3093		if (d > 0)
3094			z = i - 1;
3095		else if (d < 0)
3096			a = i + 1;
3097		else
3098			return (0);
3099	}
3100
3101	return (EADDRNOTAVAIL);
3102}
3103
3104int
3105prison_check_ip6(struct ucred *cred, struct in6_addr *ia6)
3106{
3107	struct prison *pr;
3108	int error;
3109
3110	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3111	KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
3112
3113	pr = cred->cr_prison;
3114	if (!(pr->pr_flags & PR_IP6))
3115		return (0);
3116	mtx_lock(&pr->pr_mtx);
3117	if (!(pr->pr_flags & PR_IP6)) {
3118		mtx_unlock(&pr->pr_mtx);
3119		return (0);
3120	}
3121	if (pr->pr_ip6 == NULL) {
3122		mtx_unlock(&pr->pr_mtx);
3123		return (EAFNOSUPPORT);
3124	}
3125
3126	error = _prison_check_ip6(pr, ia6);
3127	mtx_unlock(&pr->pr_mtx);
3128	return (error);
3129}
3130#endif
3131
3132/*
3133 * Check if a jail supports the given address family.
3134 *
3135 * Returns 0 if not jailed or the address family is supported, EAFNOSUPPORT
3136 * if not.
3137 */
3138int
3139prison_check_af(struct ucred *cred, int af)
3140{
3141	struct prison *pr;
3142	int error;
3143
3144	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3145
3146	pr = cred->cr_prison;
3147#ifdef VIMAGE
3148	/* Prisons with their own network stack are not limited. */
3149	if (pr->pr_flags & PR_VNET)
3150		return (0);
3151#endif
3152
3153	error = 0;
3154	switch (af)
3155	{
3156#ifdef INET
3157	case AF_INET:
3158		if (pr->pr_flags & PR_IP4)
3159		{
3160			mtx_lock(&pr->pr_mtx);
3161			if ((pr->pr_flags & PR_IP4) && pr->pr_ip4 == NULL)
3162				error = EAFNOSUPPORT;
3163			mtx_unlock(&pr->pr_mtx);
3164		}
3165		break;
3166#endif
3167#ifdef INET6
3168	case AF_INET6:
3169		if (pr->pr_flags & PR_IP6)
3170		{
3171			mtx_lock(&pr->pr_mtx);
3172			if ((pr->pr_flags & PR_IP6) && pr->pr_ip6 == NULL)
3173				error = EAFNOSUPPORT;
3174			mtx_unlock(&pr->pr_mtx);
3175		}
3176		break;
3177#endif
3178	case AF_LOCAL:
3179	case AF_ROUTE:
3180		break;
3181	default:
3182		if (!(pr->pr_allow & PR_ALLOW_SOCKET_AF))
3183			error = EAFNOSUPPORT;
3184	}
3185	return (error);
3186}
3187
3188/*
3189 * Check if given address belongs to the jail referenced by cred (wrapper to
3190 * prison_check_ip[46]).
3191 *
3192 * Returns 0 if jail doesn't restrict the address family or if address belongs
3193 * to jail, EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if
3194 * the jail doesn't allow the address family.  IPv4 Address passed in in NBO.
3195 */
3196int
3197prison_if(struct ucred *cred, struct sockaddr *sa)
3198{
3199#ifdef INET
3200	struct sockaddr_in *sai;
3201#endif
3202#ifdef INET6
3203	struct sockaddr_in6 *sai6;
3204#endif
3205	int error;
3206
3207	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3208	KASSERT(sa != NULL, ("%s: sa is NULL", __func__));
3209
3210	error = 0;
3211	switch (sa->sa_family)
3212	{
3213#ifdef INET
3214	case AF_INET:
3215		sai = (struct sockaddr_in *)sa;
3216		error = prison_check_ip4(cred, &sai->sin_addr);
3217		break;
3218#endif
3219#ifdef INET6
3220	case AF_INET6:
3221		sai6 = (struct sockaddr_in6 *)sa;
3222		error = prison_check_ip6(cred, &sai6->sin6_addr);
3223		break;
3224#endif
3225	default:
3226		if (!(cred->cr_prison->pr_allow & PR_ALLOW_SOCKET_AF))
3227			error = EAFNOSUPPORT;
3228	}
3229	return (error);
3230}
3231
3232/*
3233 * Return 0 if jails permit p1 to frob p2, otherwise ESRCH.
3234 */
3235int
3236prison_check(struct ucred *cred1, struct ucred *cred2)
3237{
3238
3239	return ((cred1->cr_prison == cred2->cr_prison ||
3240	    prison_ischild(cred1->cr_prison, cred2->cr_prison)) ? 0 : ESRCH);
3241}
3242
3243/*
3244 * Return 1 if p2 is a child of p1, otherwise 0.
3245 */
3246int
3247prison_ischild(struct prison *pr1, struct prison *pr2)
3248{
3249
3250	for (pr2 = pr2->pr_parent; pr2 != NULL; pr2 = pr2->pr_parent)
3251		if (pr1 == pr2)
3252			return (1);
3253	return (0);
3254}
3255
3256/*
3257 * Return 1 if the passed credential is in a jail, otherwise 0.
3258 */
3259int
3260jailed(struct ucred *cred)
3261{
3262
3263	return (cred->cr_prison != &prison0);
3264}
3265
3266/*
3267 * Return the correct hostname (domainname, et al) for the passed credential.
3268 */
3269void
3270getcredhostname(struct ucred *cred, char *buf, size_t size)
3271{
3272	struct prison *pr;
3273
3274	/*
3275	 * A NULL credential can be used to shortcut to the physical
3276	 * system's hostname.
3277	 */
3278	pr = (cred != NULL) ? cred->cr_prison : &prison0;
3279	mtx_lock(&pr->pr_mtx);
3280	strlcpy(buf, pr->pr_hostname, size);
3281	mtx_unlock(&pr->pr_mtx);
3282}
3283
3284void
3285getcreddomainname(struct ucred *cred, char *buf, size_t size)
3286{
3287
3288	mtx_lock(&cred->cr_prison->pr_mtx);
3289	strlcpy(buf, cred->cr_prison->pr_domainname, size);
3290	mtx_unlock(&cred->cr_prison->pr_mtx);
3291}
3292
3293void
3294getcredhostuuid(struct ucred *cred, char *buf, size_t size)
3295{
3296
3297	mtx_lock(&cred->cr_prison->pr_mtx);
3298	strlcpy(buf, cred->cr_prison->pr_hostuuid, size);
3299	mtx_unlock(&cred->cr_prison->pr_mtx);
3300}
3301
3302void
3303getcredhostid(struct ucred *cred, unsigned long *hostid)
3304{
3305
3306	mtx_lock(&cred->cr_prison->pr_mtx);
3307	*hostid = cred->cr_prison->pr_hostid;
3308	mtx_unlock(&cred->cr_prison->pr_mtx);
3309}
3310
3311/*
3312 * Determine whether the subject represented by cred can "see"
3313 * status of a mount point.
3314 * Returns: 0 for permitted, ENOENT otherwise.
3315 * XXX: This function should be called cr_canseemount() and should be
3316 *      placed in kern_prot.c.
3317 */
3318int
3319prison_canseemount(struct ucred *cred, struct mount *mp)
3320{
3321	struct prison *pr;
3322	struct statfs *sp;
3323	size_t len;
3324
3325	pr = cred->cr_prison;
3326	if (pr->pr_enforce_statfs == 0)
3327		return (0);
3328	if (pr->pr_root->v_mount == mp)
3329		return (0);
3330	if (pr->pr_enforce_statfs == 2)
3331		return (ENOENT);
3332	/*
3333	 * If jail's chroot directory is set to "/" we should be able to see
3334	 * all mount-points from inside a jail.
3335	 * This is ugly check, but this is the only situation when jail's
3336	 * directory ends with '/'.
3337	 */
3338	if (strcmp(pr->pr_path, "/") == 0)
3339		return (0);
3340	len = strlen(pr->pr_path);
3341	sp = &mp->mnt_stat;
3342	if (strncmp(pr->pr_path, sp->f_mntonname, len) != 0)
3343		return (ENOENT);
3344	/*
3345	 * Be sure that we don't have situation where jail's root directory
3346	 * is "/some/path" and mount point is "/some/pathpath".
3347	 */
3348	if (sp->f_mntonname[len] != '\0' && sp->f_mntonname[len] != '/')
3349		return (ENOENT);
3350	return (0);
3351}
3352
3353void
3354prison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp)
3355{
3356	char jpath[MAXPATHLEN];
3357	struct prison *pr;
3358	size_t len;
3359
3360	pr = cred->cr_prison;
3361	if (pr->pr_enforce_statfs == 0)
3362		return;
3363	if (prison_canseemount(cred, mp) != 0) {
3364		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
3365		strlcpy(sp->f_mntonname, "[restricted]",
3366		    sizeof(sp->f_mntonname));
3367		return;
3368	}
3369	if (pr->pr_root->v_mount == mp) {
3370		/*
3371		 * Clear current buffer data, so we are sure nothing from
3372		 * the valid path left there.
3373		 */
3374		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
3375		*sp->f_mntonname = '/';
3376		return;
3377	}
3378	/*
3379	 * If jail's chroot directory is set to "/" we should be able to see
3380	 * all mount-points from inside a jail.
3381	 */
3382	if (strcmp(pr->pr_path, "/") == 0)
3383		return;
3384	len = strlen(pr->pr_path);
3385	strlcpy(jpath, sp->f_mntonname + len, sizeof(jpath));
3386	/*
3387	 * Clear current buffer data, so we are sure nothing from
3388	 * the valid path left there.
3389	 */
3390	bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
3391	if (*jpath == '\0') {
3392		/* Should never happen. */
3393		*sp->f_mntonname = '/';
3394	} else {
3395		strlcpy(sp->f_mntonname, jpath, sizeof(sp->f_mntonname));
3396	}
3397}
3398
3399/*
3400 * Check with permission for a specific privilege is granted within jail.  We
3401 * have a specific list of accepted privileges; the rest are denied.
3402 */
3403int
3404prison_priv_check(struct ucred *cred, int priv)
3405{
3406
3407	if (!jailed(cred))
3408		return (0);
3409
3410#ifdef VIMAGE
3411	/*
3412	 * Privileges specific to prisons with a virtual network stack.
3413	 * There might be a duplicate entry here in case the privilege
3414	 * is only granted conditionally in the legacy jail case.
3415	 */
3416	switch (priv) {
3417#ifdef notyet
3418		/*
3419		 * NFS-specific privileges.
3420		 */
3421	case PRIV_NFS_DAEMON:
3422	case PRIV_NFS_LOCKD:
3423#endif
3424		/*
3425		 * Network stack privileges.
3426		 */
3427	case PRIV_NET_BRIDGE:
3428	case PRIV_NET_GRE:
3429	case PRIV_NET_BPF:
3430	case PRIV_NET_RAW:		/* Dup, cond. in legacy jail case. */
3431	case PRIV_NET_ROUTE:
3432	case PRIV_NET_TAP:
3433	case PRIV_NET_SETIFMTU:
3434	case PRIV_NET_SETIFFLAGS:
3435	case PRIV_NET_SETIFCAP:
3436	case PRIV_NET_SETIFNAME	:
3437	case PRIV_NET_SETIFMETRIC:
3438	case PRIV_NET_SETIFPHYS:
3439	case PRIV_NET_SETIFMAC:
3440	case PRIV_NET_ADDMULTI:
3441	case PRIV_NET_DELMULTI:
3442	case PRIV_NET_HWIOCTL:
3443	case PRIV_NET_SETLLADDR:
3444	case PRIV_NET_ADDIFGROUP:
3445	case PRIV_NET_DELIFGROUP:
3446	case PRIV_NET_IFCREATE:
3447	case PRIV_NET_IFDESTROY:
3448	case PRIV_NET_ADDIFADDR:
3449	case PRIV_NET_DELIFADDR:
3450	case PRIV_NET_LAGG:
3451	case PRIV_NET_GIF:
3452	case PRIV_NET_SETIFVNET:
3453
3454		/*
3455		 * 802.11-related privileges.
3456		 */
3457	case PRIV_NET80211_GETKEY:
3458#ifdef notyet
3459	case PRIV_NET80211_MANAGE:		/* XXX-BZ discuss with sam@ */
3460#endif
3461
3462#ifdef notyet
3463		/*
3464		 * AppleTalk privileges.
3465		 */
3466	case PRIV_NETATALK_RESERVEDPORT:
3467
3468		/*
3469		 * ATM privileges.
3470		 */
3471	case PRIV_NETATM_CFG:
3472	case PRIV_NETATM_ADD:
3473	case PRIV_NETATM_DEL:
3474	case PRIV_NETATM_SET:
3475
3476		/*
3477		 * Bluetooth privileges.
3478		 */
3479	case PRIV_NETBLUETOOTH_RAW:
3480#endif
3481
3482		/*
3483		 * Netgraph and netgraph module privileges.
3484		 */
3485	case PRIV_NETGRAPH_CONTROL:
3486#ifdef notyet
3487	case PRIV_NETGRAPH_TTY:
3488#endif
3489
3490		/*
3491		 * IPv4 and IPv6 privileges.
3492		 */
3493	case PRIV_NETINET_IPFW:
3494	case PRIV_NETINET_DIVERT:
3495	case PRIV_NETINET_PF:
3496	case PRIV_NETINET_DUMMYNET:
3497	case PRIV_NETINET_CARP:
3498	case PRIV_NETINET_MROUTE:
3499	case PRIV_NETINET_RAW:
3500	case PRIV_NETINET_ADDRCTRL6:
3501	case PRIV_NETINET_ND6:
3502	case PRIV_NETINET_SCOPE6:
3503	case PRIV_NETINET_ALIFETIME6:
3504	case PRIV_NETINET_IPSEC:
3505	case PRIV_NETINET_BINDANY:
3506
3507#ifdef notyet
3508		/*
3509		 * IPX/SPX privileges.
3510		 */
3511	case PRIV_NETIPX_RESERVEDPORT:
3512	case PRIV_NETIPX_RAW:
3513
3514		/*
3515		 * NCP privileges.
3516		 */
3517	case PRIV_NETNCP:
3518
3519		/*
3520		 * SMB privileges.
3521		 */
3522	case PRIV_NETSMB:
3523#endif
3524
3525	/*
3526	 * No default: or deny here.
3527	 * In case of no permit fall through to next switch().
3528	 */
3529		if (cred->cr_prison->pr_flags & PR_VNET)
3530			return (0);
3531	}
3532#endif /* VIMAGE */
3533
3534	switch (priv) {
3535
3536		/*
3537		 * Allow ktrace privileges for root in jail.
3538		 */
3539	case PRIV_KTRACE:
3540
3541#if 0
3542		/*
3543		 * Allow jailed processes to configure audit identity and
3544		 * submit audit records (login, etc).  In the future we may
3545		 * want to further refine the relationship between audit and
3546		 * jail.
3547		 */
3548	case PRIV_AUDIT_GETAUDIT:
3549	case PRIV_AUDIT_SETAUDIT:
3550	case PRIV_AUDIT_SUBMIT:
3551#endif
3552
3553		/*
3554		 * Allow jailed processes to manipulate process UNIX
3555		 * credentials in any way they see fit.
3556		 */
3557	case PRIV_CRED_SETUID:
3558	case PRIV_CRED_SETEUID:
3559	case PRIV_CRED_SETGID:
3560	case PRIV_CRED_SETEGID:
3561	case PRIV_CRED_SETGROUPS:
3562	case PRIV_CRED_SETREUID:
3563	case PRIV_CRED_SETREGID:
3564	case PRIV_CRED_SETRESUID:
3565	case PRIV_CRED_SETRESGID:
3566
3567		/*
3568		 * Jail implements visibility constraints already, so allow
3569		 * jailed root to override uid/gid-based constraints.
3570		 */
3571	case PRIV_SEEOTHERGIDS:
3572	case PRIV_SEEOTHERUIDS:
3573
3574		/*
3575		 * Jail implements inter-process debugging limits already, so
3576		 * allow jailed root various debugging privileges.
3577		 */
3578	case PRIV_DEBUG_DIFFCRED:
3579	case PRIV_DEBUG_SUGID:
3580	case PRIV_DEBUG_UNPRIV:
3581
3582		/*
3583		 * Allow jail to set various resource limits and login
3584		 * properties, and for now, exceed process resource limits.
3585		 */
3586	case PRIV_PROC_LIMIT:
3587	case PRIV_PROC_SETLOGIN:
3588	case PRIV_PROC_SETRLIMIT:
3589
3590		/*
3591		 * System V and POSIX IPC privileges are granted in jail.
3592		 */
3593	case PRIV_IPC_READ:
3594	case PRIV_IPC_WRITE:
3595	case PRIV_IPC_ADMIN:
3596	case PRIV_IPC_MSGSIZE:
3597	case PRIV_MQ_ADMIN:
3598
3599		/*
3600		 * Jail operations within a jail work on child jails.
3601		 */
3602	case PRIV_JAIL_ATTACH:
3603	case PRIV_JAIL_SET:
3604	case PRIV_JAIL_REMOVE:
3605
3606		/*
3607		 * Jail implements its own inter-process limits, so allow
3608		 * root processes in jail to change scheduling on other
3609		 * processes in the same jail.  Likewise for signalling.
3610		 */
3611	case PRIV_SCHED_DIFFCRED:
3612	case PRIV_SCHED_CPUSET:
3613	case PRIV_SIGNAL_DIFFCRED:
3614	case PRIV_SIGNAL_SUGID:
3615
3616		/*
3617		 * Allow jailed processes to write to sysctls marked as jail
3618		 * writable.
3619		 */
3620	case PRIV_SYSCTL_WRITEJAIL:
3621
3622		/*
3623		 * Allow root in jail to manage a variety of quota
3624		 * properties.  These should likely be conditional on a
3625		 * configuration option.
3626		 */
3627	case PRIV_VFS_GETQUOTA:
3628	case PRIV_VFS_SETQUOTA:
3629
3630		/*
3631		 * Since Jail relies on chroot() to implement file system
3632		 * protections, grant many VFS privileges to root in jail.
3633		 * Be careful to exclude mount-related and NFS-related
3634		 * privileges.
3635		 */
3636	case PRIV_VFS_READ:
3637	case PRIV_VFS_WRITE:
3638	case PRIV_VFS_ADMIN:
3639	case PRIV_VFS_EXEC:
3640	case PRIV_VFS_LOOKUP:
3641	case PRIV_VFS_BLOCKRESERVE:	/* XXXRW: Slightly surprising. */
3642	case PRIV_VFS_CHFLAGS_DEV:
3643	case PRIV_VFS_CHOWN:
3644	case PRIV_VFS_CHROOT:
3645	case PRIV_VFS_RETAINSUGID:
3646	case PRIV_VFS_FCHROOT:
3647	case PRIV_VFS_LINK:
3648	case PRIV_VFS_SETGID:
3649	case PRIV_VFS_STAT:
3650	case PRIV_VFS_STICKYFILE:
3651		return (0);
3652
3653		/*
3654		 * Depending on the global setting, allow privilege of
3655		 * setting system flags.
3656		 */
3657	case PRIV_VFS_SYSFLAGS:
3658		if (cred->cr_prison->pr_allow & PR_ALLOW_CHFLAGS)
3659			return (0);
3660		else
3661			return (EPERM);
3662
3663		/*
3664		 * Depending on the global setting, allow privilege of
3665		 * mounting/unmounting file systems.
3666		 */
3667	case PRIV_VFS_MOUNT:
3668	case PRIV_VFS_UNMOUNT:
3669	case PRIV_VFS_MOUNT_NONUSER:
3670	case PRIV_VFS_MOUNT_OWNER:
3671		if (cred->cr_prison->pr_allow & PR_ALLOW_MOUNT)
3672			return (0);
3673		else
3674			return (EPERM);
3675
3676		/*
3677		 * Allow jailed root to bind reserved ports and reuse in-use
3678		 * ports.
3679		 */
3680	case PRIV_NETINET_RESERVEDPORT:
3681	case PRIV_NETINET_REUSEPORT:
3682		return (0);
3683
3684		/*
3685		 * Allow jailed root to set certian IPv4/6 (option) headers.
3686		 */
3687	case PRIV_NETINET_SETHDROPTS:
3688		return (0);
3689
3690		/*
3691		 * Conditionally allow creating raw sockets in jail.
3692		 */
3693	case PRIV_NETINET_RAW:
3694		if (cred->cr_prison->pr_allow & PR_ALLOW_RAW_SOCKETS)
3695			return (0);
3696		else
3697			return (EPERM);
3698
3699		/*
3700		 * Since jail implements its own visibility limits on netstat
3701		 * sysctls, allow getcred.  This allows identd to work in
3702		 * jail.
3703		 */
3704	case PRIV_NETINET_GETCRED:
3705		return (0);
3706
3707	default:
3708		/*
3709		 * In all remaining cases, deny the privilege request.  This
3710		 * includes almost all network privileges, many system
3711		 * configuration privileges.
3712		 */
3713		return (EPERM);
3714	}
3715}
3716
3717/*
3718 * Return the part of pr2's name that is relative to pr1, or the whole name
3719 * if it does not directly follow.
3720 */
3721
3722char *
3723prison_name(struct prison *pr1, struct prison *pr2)
3724{
3725	char *name;
3726
3727	/* Jails see themselves as "0" (if they see themselves at all). */
3728	if (pr1 == pr2)
3729		return "0";
3730	name = pr2->pr_name;
3731	if (prison_ischild(pr1, pr2)) {
3732		/*
3733		 * pr1 isn't locked (and allprison_lock may not be either)
3734		 * so its length can't be counted on.  But the number of dots
3735		 * can be counted on - and counted.
3736		 */
3737		for (; pr1 != &prison0; pr1 = pr1->pr_parent)
3738			name = strchr(name, '.') + 1;
3739	}
3740	return (name);
3741}
3742
3743/*
3744 * Return the part of pr2's path that is relative to pr1, or the whole path
3745 * if it does not directly follow.
3746 */
3747static char *
3748prison_path(struct prison *pr1, struct prison *pr2)
3749{
3750	char *path1, *path2;
3751	int len1;
3752
3753	path1 = pr1->pr_path;
3754	path2 = pr2->pr_path;
3755	if (!strcmp(path1, "/"))
3756		return (path2);
3757	len1 = strlen(path1);
3758	if (strncmp(path1, path2, len1))
3759		return (path2);
3760	if (path2[len1] == '\0')
3761		return "/";
3762	if (path2[len1] == '/')
3763		return (path2 + len1);
3764	return (path2);
3765}
3766
3767
3768/*
3769 * Jail-related sysctls.
3770 */
3771SYSCTL_NODE(_security, OID_AUTO, jail, CTLFLAG_RW, 0,
3772    "Jails");
3773
3774static int
3775sysctl_jail_list(SYSCTL_HANDLER_ARGS)
3776{
3777	struct xprison *xp;
3778	struct prison *pr, *cpr;
3779#ifdef INET
3780	struct in_addr *ip4 = NULL;
3781	int ip4s = 0;
3782#endif
3783#ifdef INET6
3784	struct in_addr *ip6 = NULL;
3785	int ip6s = 0;
3786#endif
3787	int descend, error;
3788
3789	xp = malloc(sizeof(*xp), M_TEMP, M_WAITOK);
3790	pr = req->td->td_ucred->cr_prison;
3791	error = 0;
3792	sx_slock(&allprison_lock);
3793	FOREACH_PRISON_DESCENDANT(pr, cpr, descend) {
3794#if defined(INET) || defined(INET6)
3795 again:
3796#endif
3797		mtx_lock(&cpr->pr_mtx);
3798#ifdef INET
3799		if (cpr->pr_ip4s > 0) {
3800			if (ip4s < cpr->pr_ip4s) {
3801				ip4s = cpr->pr_ip4s;
3802				mtx_unlock(&cpr->pr_mtx);
3803				ip4 = realloc(ip4, ip4s *
3804				    sizeof(struct in_addr), M_TEMP, M_WAITOK);
3805				goto again;
3806			}
3807			bcopy(cpr->pr_ip4, ip4,
3808			    cpr->pr_ip4s * sizeof(struct in_addr));
3809		}
3810#endif
3811#ifdef INET6
3812		if (cpr->pr_ip6s > 0) {
3813			if (ip6s < cpr->pr_ip6s) {
3814				ip6s = cpr->pr_ip6s;
3815				mtx_unlock(&cpr->pr_mtx);
3816				ip6 = realloc(ip6, ip6s *
3817				    sizeof(struct in6_addr), M_TEMP, M_WAITOK);
3818				goto again;
3819			}
3820			bcopy(cpr->pr_ip6, ip6,
3821			    cpr->pr_ip6s * sizeof(struct in6_addr));
3822		}
3823#endif
3824		if (cpr->pr_ref == 0) {
3825			mtx_unlock(&cpr->pr_mtx);
3826			continue;
3827		}
3828		bzero(xp, sizeof(*xp));
3829		xp->pr_version = XPRISON_VERSION;
3830		xp->pr_id = cpr->pr_id;
3831		xp->pr_state = cpr->pr_uref > 0
3832		    ? PRISON_STATE_ALIVE : PRISON_STATE_DYING;
3833		strlcpy(xp->pr_path, prison_path(pr, cpr), sizeof(xp->pr_path));
3834		strlcpy(xp->pr_host, cpr->pr_hostname, sizeof(xp->pr_host));
3835		strlcpy(xp->pr_name, prison_name(pr, cpr), sizeof(xp->pr_name));
3836#ifdef INET
3837		xp->pr_ip4s = cpr->pr_ip4s;
3838#endif
3839#ifdef INET6
3840		xp->pr_ip6s = cpr->pr_ip6s;
3841#endif
3842		mtx_unlock(&cpr->pr_mtx);
3843		error = SYSCTL_OUT(req, xp, sizeof(*xp));
3844		if (error)
3845			break;
3846#ifdef INET
3847		if (xp->pr_ip4s > 0) {
3848			error = SYSCTL_OUT(req, ip4,
3849			    xp->pr_ip4s * sizeof(struct in_addr));
3850			if (error)
3851				break;
3852		}
3853#endif
3854#ifdef INET6
3855		if (xp->pr_ip6s > 0) {
3856			error = SYSCTL_OUT(req, ip6,
3857			    xp->pr_ip6s * sizeof(struct in6_addr));
3858			if (error)
3859				break;
3860		}
3861#endif
3862	}
3863	sx_sunlock(&allprison_lock);
3864	free(xp, M_TEMP);
3865#ifdef INET
3866	free(ip4, M_TEMP);
3867#endif
3868#ifdef INET6
3869	free(ip6, M_TEMP);
3870#endif
3871	return (error);
3872}
3873
3874SYSCTL_OID(_security_jail, OID_AUTO, list,
3875    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
3876    sysctl_jail_list, "S", "List of active jails");
3877
3878static int
3879sysctl_jail_jailed(SYSCTL_HANDLER_ARGS)
3880{
3881	int error, injail;
3882
3883	injail = jailed(req->td->td_ucred);
3884	error = SYSCTL_OUT(req, &injail, sizeof(injail));
3885
3886	return (error);
3887}
3888
3889SYSCTL_PROC(_security_jail, OID_AUTO, jailed,
3890    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
3891    sysctl_jail_jailed, "I", "Process in jail?");
3892
#if defined(INET) || defined(INET6)
/* Read/write access to jail_max_af_ips; only present with INET or INET6. */
SYSCTL_UINT(_security_jail, OID_AUTO, jail_max_af_ips,
    CTLFLAG_RW, &jail_max_af_ips, 0,
    "Number of IP addresses a jail may have at most per address family");
#endif
3898
3899/*
3900 * Default parameters for jail(2) compatability.  For historical reasons,
3901 * the sysctl names have varying similarity to the parameter names.  Prisons
3902 * just see their own parameters, and can't change them.
3903 */
3904static int
3905sysctl_jail_default_allow(SYSCTL_HANDLER_ARGS)
3906{
3907	struct prison *pr;
3908	int allow, error, i;
3909
3910	pr = req->td->td_ucred->cr_prison;
3911	allow = (pr == &prison0) ? jail_default_allow : pr->pr_allow;
3912
3913	/* Get the current flag value, and convert it to a boolean. */
3914	i = (allow & arg2) ? 1 : 0;
3915	if (arg1 != NULL)
3916		i = !i;
3917	error = sysctl_handle_int(oidp, &i, 0, req);
3918	if (error || !req->newptr)
3919		return (error);
3920	i = i ? arg2 : 0;
3921	if (arg1 != NULL)
3922		i ^= arg2;
3923	/*
3924	 * The sysctls don't have CTLFLAGS_PRISON, so assume prison0
3925	 * for writing.
3926	 */
3927	mtx_lock(&prison0.pr_mtx);
3928	jail_default_allow = (jail_default_allow & ~arg2) | i;
3929	mtx_unlock(&prison0.pr_mtx);
3930	return (0);
3931}
3932
3933SYSCTL_PROC(_security_jail, OID_AUTO, set_hostname_allowed,
3934    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
3935    NULL, PR_ALLOW_SET_HOSTNAME, sysctl_jail_default_allow, "I",
3936    "Processes in jail can set their hostnames");
3937SYSCTL_PROC(_security_jail, OID_AUTO, socket_unixiproute_only,
3938    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
3939    (void *)1, PR_ALLOW_SOCKET_AF, sysctl_jail_default_allow, "I",
3940    "Processes in jail are limited to creating UNIX/IP/route sockets only");
3941SYSCTL_PROC(_security_jail, OID_AUTO, sysvipc_allowed,
3942    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
3943    NULL, PR_ALLOW_SYSVIPC, sysctl_jail_default_allow, "I",
3944    "Processes in jail can use System V IPC primitives");
3945SYSCTL_PROC(_security_jail, OID_AUTO, allow_raw_sockets,
3946    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
3947    NULL, PR_ALLOW_RAW_SOCKETS, sysctl_jail_default_allow, "I",
3948    "Prison root can create raw sockets");
3949SYSCTL_PROC(_security_jail, OID_AUTO, chflags_allowed,
3950    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
3951    NULL, PR_ALLOW_CHFLAGS, sysctl_jail_default_allow, "I",
3952    "Processes in jail can alter system file flags");
3953SYSCTL_PROC(_security_jail, OID_AUTO, mount_allowed,
3954    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
3955    NULL, PR_ALLOW_MOUNT, sysctl_jail_default_allow, "I",
3956    "Processes in jail can mount/unmount jail-friendly file systems");
3957
3958static int
3959sysctl_jail_default_level(SYSCTL_HANDLER_ARGS)
3960{
3961	struct prison *pr;
3962	int level, error;
3963
3964	pr = req->td->td_ucred->cr_prison;
3965	level = (pr == &prison0) ? *(int *)arg1 : *(int *)((char *)pr + arg2);
3966	error = sysctl_handle_int(oidp, &level, 0, req);
3967	if (error || !req->newptr)
3968		return (error);
3969	*(int *)arg1 = level;
3970	return (0);
3971}
3972
3973SYSCTL_PROC(_security_jail, OID_AUTO, enforce_statfs,
3974    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
3975    &jail_default_enforce_statfs, offsetof(struct prison, pr_enforce_statfs),
3976    sysctl_jail_default_level, "I",
3977    "Processes in jail cannot see all mounted file systems");
3978
3979/*
3980 * Nodes to describe jail parameters.  Maximum length of string parameters
3981 * is returned in the string itself, and the other parameters exist merely
3982 * to make themselves and their types known.
3983 */
3984SYSCTL_NODE(_security_jail, OID_AUTO, param, CTLFLAG_RW, 0,
3985    "Jail parameters");
3986
3987int
3988sysctl_jail_param(SYSCTL_HANDLER_ARGS)
3989{
3990	int i;
3991	long l;
3992	size_t s;
3993	char numbuf[12];
3994
3995	switch (oidp->oid_kind & CTLTYPE)
3996	{
3997	case CTLTYPE_LONG:
3998	case CTLTYPE_ULONG:
3999		l = 0;
4000#ifdef SCTL_MASK32
4001		if (!(req->flags & SCTL_MASK32))
4002#endif
4003			return (SYSCTL_OUT(req, &l, sizeof(l)));
4004	case CTLTYPE_INT:
4005	case CTLTYPE_UINT:
4006		i = 0;
4007		return (SYSCTL_OUT(req, &i, sizeof(i)));
4008	case CTLTYPE_STRING:
4009		snprintf(numbuf, sizeof(numbuf), "%d", arg2);
4010		return
4011		    (sysctl_handle_string(oidp, numbuf, sizeof(numbuf), req));
4012	case CTLTYPE_STRUCT:
4013		s = (size_t)arg2;
4014		return (SYSCTL_OUT(req, &s, sizeof(s)));
4015	}
4016	return (0);
4017}
4018
4019SYSCTL_JAIL_PARAM(, jid, CTLTYPE_INT | CTLFLAG_RDTUN, "I", "Jail ID");
4020SYSCTL_JAIL_PARAM(, parent, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail parent ID");
4021SYSCTL_JAIL_PARAM_STRING(, name, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail name");
4022SYSCTL_JAIL_PARAM_STRING(, path, CTLFLAG_RDTUN, MAXPATHLEN, "Jail root path");
4023SYSCTL_JAIL_PARAM(, securelevel, CTLTYPE_INT | CTLFLAG_RW,
4024    "I", "Jail secure level");
4025SYSCTL_JAIL_PARAM(, enforce_statfs, CTLTYPE_INT | CTLFLAG_RW,
4026    "I", "Jail cannot see all mounted file systems");
4027SYSCTL_JAIL_PARAM(, persist, CTLTYPE_INT | CTLFLAG_RW,
4028    "B", "Jail persistence");
4029#ifdef VIMAGE
4030SYSCTL_JAIL_PARAM(, vnet, CTLTYPE_INT | CTLFLAG_RDTUN,
4031    "E,jailsys", "Virtual network stack");
4032#endif
4033SYSCTL_JAIL_PARAM(, dying, CTLTYPE_INT | CTLFLAG_RD,
4034    "B", "Jail is in the process of shutting down");
4035
4036SYSCTL_JAIL_PARAM_NODE(children, "Number of child jails");
4037SYSCTL_JAIL_PARAM(_children, cur, CTLTYPE_INT | CTLFLAG_RD,
4038    "I", "Current number of child jails");
4039SYSCTL_JAIL_PARAM(_children, max, CTLTYPE_INT | CTLFLAG_RW,
4040    "I", "Maximum number of child jails");
4041
4042SYSCTL_JAIL_PARAM_SYS_NODE(host, CTLFLAG_RW, "Jail host info");
4043SYSCTL_JAIL_PARAM_STRING(_host, hostname, CTLFLAG_RW, MAXHOSTNAMELEN,
4044    "Jail hostname");
4045SYSCTL_JAIL_PARAM_STRING(_host, domainname, CTLFLAG_RW, MAXHOSTNAMELEN,
4046    "Jail NIS domainname");
4047SYSCTL_JAIL_PARAM_STRING(_host, hostuuid, CTLFLAG_RW, HOSTUUIDLEN,
4048    "Jail host UUID");
4049SYSCTL_JAIL_PARAM(_host, hostid, CTLTYPE_ULONG | CTLFLAG_RW,
4050    "LU", "Jail host ID");
4051
4052SYSCTL_JAIL_PARAM_NODE(cpuset, "Jail cpuset");
4053SYSCTL_JAIL_PARAM(_cpuset, id, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail cpuset ID");
4054
4055#ifdef INET
4056SYSCTL_JAIL_PARAM_SYS_NODE(ip4, CTLFLAG_RDTUN,
4057    "Jail IPv4 address virtualization");
4058SYSCTL_JAIL_PARAM_STRUCT(_ip4, addr, CTLFLAG_RW, sizeof(struct in_addr),
4059    "S,in_addr,a", "Jail IPv4 addresses");
4060#endif
4061#ifdef INET6
4062SYSCTL_JAIL_PARAM_SYS_NODE(ip6, CTLFLAG_RDTUN,
4063    "Jail IPv6 address virtualization");
4064SYSCTL_JAIL_PARAM_STRUCT(_ip6, addr, CTLFLAG_RW, sizeof(struct in6_addr),
4065    "S,in6_addr,a", "Jail IPv6 addresses");
4066#endif
4067
4068SYSCTL_JAIL_PARAM_NODE(allow, "Jail permission flags");
4069SYSCTL_JAIL_PARAM(_allow, set_hostname, CTLTYPE_INT | CTLFLAG_RW,
4070    "B", "Jail may set hostname");
4071SYSCTL_JAIL_PARAM(_allow, sysvipc, CTLTYPE_INT | CTLFLAG_RW,
4072    "B", "Jail may use SYSV IPC");
4073SYSCTL_JAIL_PARAM(_allow, raw_sockets, CTLTYPE_INT | CTLFLAG_RW,
4074    "B", "Jail may create raw sockets");
4075SYSCTL_JAIL_PARAM(_allow, chflags, CTLTYPE_INT | CTLFLAG_RW,
4076    "B", "Jail may alter system file flags");
4077SYSCTL_JAIL_PARAM(_allow, mount, CTLTYPE_INT | CTLFLAG_RW,
4078    "B", "Jail may mount/unmount jail-friendly file systems");
4079SYSCTL_JAIL_PARAM(_allow, quotas, CTLTYPE_INT | CTLFLAG_RW,
4080    "B", "Jail may set file quotas");
4081SYSCTL_JAIL_PARAM(_allow, socket_af, CTLTYPE_INT | CTLFLAG_RW,
4082    "B", "Jail may create sockets other than just UNIX/IPv4/IPv6/route");
4083
4084
4085#ifdef DDB
4086
4087static void
4088db_show_prison(struct prison *pr)
4089{
4090	int fi;
4091#if defined(INET) || defined(INET6)
4092	int ii;
4093#endif
4094	unsigned jsf;
4095#ifdef INET6
4096	char ip6buf[INET6_ADDRSTRLEN];
4097#endif
4098
4099	db_printf("prison %p:\n", pr);
4100	db_printf(" jid             = %d\n", pr->pr_id);
4101	db_printf(" name            = %s\n", pr->pr_name);
4102	db_printf(" parent          = %p\n", pr->pr_parent);
4103	db_printf(" ref             = %d\n", pr->pr_ref);
4104	db_printf(" uref            = %d\n", pr->pr_uref);
4105	db_printf(" path            = %s\n", pr->pr_path);
4106	db_printf(" cpuset          = %d\n", pr->pr_cpuset
4107	    ? pr->pr_cpuset->cs_id : -1);
4108#ifdef VIMAGE
4109	db_printf(" vnet            = %p\n", pr->pr_vnet);
4110#endif
4111	db_printf(" root            = %p\n", pr->pr_root);
4112	db_printf(" securelevel     = %d\n", pr->pr_securelevel);
4113	db_printf(" childcount      = %d\n", pr->pr_childcount);
4114	db_printf(" child           = %p\n", LIST_FIRST(&pr->pr_children));
4115	db_printf(" sibling         = %p\n", LIST_NEXT(pr, pr_sibling));
4116	db_printf(" flags           = %x", pr->pr_flags);
4117	for (fi = 0; fi < sizeof(pr_flag_names) / sizeof(pr_flag_names[0]);
4118	    fi++)
4119		if (pr_flag_names[fi] != NULL && (pr->pr_flags & (1 << fi)))
4120			db_printf(" %s", pr_flag_names[fi]);
4121	for (fi = 0; fi < sizeof(pr_flag_jailsys) / sizeof(pr_flag_jailsys[0]);
4122	    fi++) {
4123		jsf = pr->pr_flags &
4124		    (pr_flag_jailsys[fi].disable | pr_flag_jailsys[fi].new);
4125		db_printf(" %-16s= %s\n", pr_flag_jailsys[fi].name,
4126		    pr_flag_jailsys[fi].disable &&
4127		      (jsf == pr_flag_jailsys[fi].disable) ? "disable"
4128		    : (jsf == pr_flag_jailsys[fi].new) ? "new"
4129		    : "inherit");
4130	}
4131	db_printf(" allow           = %x", pr->pr_allow);
4132	for (fi = 0; fi < sizeof(pr_allow_names) / sizeof(pr_allow_names[0]);
4133	    fi++)
4134		if (pr_allow_names[fi] != NULL && (pr->pr_allow & (1 << fi)))
4135			db_printf(" %s", pr_allow_names[fi]);
4136	db_printf("\n");
4137	db_printf(" enforce_statfs  = %d\n", pr->pr_enforce_statfs);
4138	db_printf(" host.hostname   = %s\n", pr->pr_hostname);
4139	db_printf(" host.domainname = %s\n", pr->pr_domainname);
4140	db_printf(" host.hostuuid   = %s\n", pr->pr_hostuuid);
4141	db_printf(" host.hostid     = %lu\n", pr->pr_hostid);
4142#ifdef INET
4143	db_printf(" ip4s            = %d\n", pr->pr_ip4s);
4144	for (ii = 0; ii < pr->pr_ip4s; ii++)
4145		db_printf(" %s %s\n",
4146		    ii == 0 ? "ip4             =" : "                 ",
4147		    inet_ntoa(pr->pr_ip4[ii]));
4148#endif
4149#ifdef INET6
4150	db_printf(" ip6s            = %d\n", pr->pr_ip6s);
4151	for (ii = 0; ii < pr->pr_ip6s; ii++)
4152		db_printf(" %s %s\n",
4153		    ii == 0 ? "ip6             =" : "                 ",
4154		    ip6_sprintf(ip6buf, &pr->pr_ip6[ii]));
4155#endif
4156}
4157
4158DB_SHOW_COMMAND(prison, db_show_prison_command)
4159{
4160	struct prison *pr;
4161
4162	if (!have_addr) {
4163		/*
4164		 * Show all prisons in the list, and prison0 which is not
4165		 * listed.
4166		 */
4167		db_show_prison(&prison0);
4168		if (!db_pager_quit) {
4169			TAILQ_FOREACH(pr, &allprison, pr_list) {
4170				db_show_prison(pr);
4171				if (db_pager_quit)
4172					break;
4173			}
4174		}
4175		return;
4176	}
4177
4178	if (addr == 0)
4179		pr = &prison0;
4180	else {
4181		/* Look for a prison with the ID and with references. */
4182		TAILQ_FOREACH(pr, &allprison, pr_list)
4183			if (pr->pr_id == addr && pr->pr_ref > 0)
4184				break;
4185		if (pr == NULL)
4186			/* Look again, without requiring a reference. */
4187			TAILQ_FOREACH(pr, &allprison, pr_list)
4188				if (pr->pr_id == addr)
4189					break;
4190		if (pr == NULL)
4191			/* Assume address points to a valid prison. */
4192			pr = (struct prison *)addr;
4193	}
4194	db_show_prison(pr);
4195}
4196
4197#endif /* DDB */
4198