kern_jail.c revision 301907
1/*-
2 * Copyright (c) 1999 Poul-Henning Kamp.
3 * Copyright (c) 2008 Bjoern A. Zeeb.
4 * Copyright (c) 2009 James Gritton.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29#include <sys/cdefs.h>
30__FBSDID("$FreeBSD: stable/10/sys/kern/kern_jail.c 301907 2016-06-15 01:56:20Z jamie $");
31
32#include "opt_compat.h"
33#include "opt_ddb.h"
34#include "opt_inet.h"
35#include "opt_inet6.h"
36
37#include <sys/param.h>
38#include <sys/types.h>
39#include <sys/kernel.h>
40#include <sys/systm.h>
41#include <sys/errno.h>
42#include <sys/sysproto.h>
43#include <sys/malloc.h>
44#include <sys/osd.h>
45#include <sys/priv.h>
46#include <sys/proc.h>
47#include <sys/taskqueue.h>
48#include <sys/fcntl.h>
49#include <sys/jail.h>
50#include <sys/lock.h>
51#include <sys/mutex.h>
52#include <sys/racct.h>
53#include <sys/refcount.h>
54#include <sys/sx.h>
55#include <sys/sysent.h>
56#include <sys/namei.h>
57#include <sys/mount.h>
58#include <sys/queue.h>
59#include <sys/socket.h>
60#include <sys/syscallsubr.h>
61#include <sys/sysctl.h>
62#include <sys/vnode.h>
63
64#include <net/if.h>
65#include <net/vnet.h>
66
67#include <netinet/in.h>
68
69#ifdef DDB
70#include <ddb/ddb.h>
71#ifdef INET6
72#include <netinet6/in6_var.h>
73#endif /* INET6 */
74#endif /* DDB */
75
76#include <security/mac/mac_framework.h>
77
/* All-zero UUID string; used below as prison0's static .pr_hostuuid value. */
78#define	DEFAULT_HOSTUUID	"00000000-0000-0000-0000-000000000000"
79
/*
 * malloc(9) type tags for this file.  M_PRISON is declared without "static"
 * and so has external linkage; M_PRISON_RACCT is static and private to this
 * file.
 */
80MALLOC_DEFINE(M_PRISON, "prison", "Prison structures");
81static MALLOC_DEFINE(M_PRISON_RACCT, "prison_racct", "Prison racct structures");
82
/*
 * Keep struct prison prison0 and some code in kern_jail_set() readable.
 *
 * _PR_IP_SADDRSEL is the union of the source-address-selection flags for
 * every address family compiled into this kernel, or 0 when neither INET
 * nor INET6 is configured.  The two-flag form is parenthesized so that the
 * macro always behaves as a single operand, even next to operators that
 * bind tighter than '|' (for example '~' or '&').
 */
#ifdef INET
#ifdef INET6
#define	_PR_IP_SADDRSEL	(PR_IP4_SADDRSEL|PR_IP6_SADDRSEL)
#else
#define	_PR_IP_SADDRSEL	PR_IP4_SADDRSEL
#endif
#else /* !INET */
#ifdef INET6
#define	_PR_IP_SADDRSEL	PR_IP6_SADDRSEL
#else
#define	_PR_IP_SADDRSEL	0
#endif
#endif
97
98/*
 * prison0 describes what is "real" about the system.
 *
 * Only fields that can take a compile-time constant are set here;
 * pr_cpuset, pr_osreldate and pr_osrelease are filled in at run time by
 * prison0_init().  prison0 starts with one reference and one user
 * reference, is rooted at "/", and is granted PR_ALLOW_ALL.
 */
99struct prison prison0 = {
100	.pr_id		= 0,
101	.pr_name	= "0",
102	.pr_ref		= 1,
103	.pr_uref	= 1,
104	.pr_path	= "/",
105	.pr_securelevel	= -1,
106	.pr_devfs_rsnum = 0,
107	.pr_childmax	= JAIL_MAX,
108	.pr_hostuuid	= DEFAULT_HOSTUUID,
109	.pr_children	= LIST_HEAD_INITIALIZER(prison0.pr_children),
	/* A VIMAGE kernel additionally marks prison0 with PR_VNET. */
110#ifdef VIMAGE
111	.pr_flags	= PR_HOST|PR_VNET|_PR_IP_SADDRSEL,
112#else
113	.pr_flags	= PR_HOST|_PR_IP_SADDRSEL,
114#endif
115	.pr_allow	= PR_ALLOW_ALL,
116};
/* Have prison0.pr_mtx set up as an MTX_DEF mutex named "jail mutex". */
117MTX_SYSINIT(prison0, &prison0.pr_mtx, "jail mutex", MTX_DEF);
118
119/*
 * allprison, allprison_racct and lastprid are protected by allprison_lock.
 * kern_jail_set() takes the lock exclusively (sx_xlock) before it looks up
 * the jail it is going to create or update.
 */
120struct	sx allprison_lock;
121SX_SYSINIT(allprison_lock, &allprison_lock, "allprison");
/* Global tail queue of prisons, statically initialized empty. */
122struct	prisonlist allprison = TAILQ_HEAD_INITIALIZER(allprison);
/* List head for struct prison_racct entries. */
123LIST_HEAD(, prison_racct) allprison_racct;
/* NOTE(review): presumably the most recently assigned jail ID -- confirm. */
124int	lastprid = 0;
125
/*
 * Forward declarations of file-local (static) helpers.  The RACCT, INET and
 * INET6 groups only exist when the matching kernel option is configured.
 */
126static int do_jail_attach(struct thread *td, struct prison *pr);
127static void prison_complete(void *context, int pending);
128static void prison_deref(struct prison *pr, int flags);
129static char *prison_path(struct prison *pr1, struct prison *pr2);
130static void prison_remove_one(struct prison *pr);
131#ifdef RACCT
132static void prison_racct_attach(struct prison *pr);
133static void prison_racct_modify(struct prison *pr);
134static void prison_racct_detach(struct prison *pr);
135#endif
136#ifdef INET
137static int _prison_check_ip4(struct prison *pr, struct in_addr *ia);
138static int prison_restrict_ip4(struct prison *pr, struct in_addr *newip4);
139#endif
140#ifdef INET6
141static int _prison_check_ip6(struct prison *pr, struct in6_addr *ia6);
142static int prison_restrict_ip6(struct prison *pr, struct in6_addr *newip6);
143#endif
144
145/*
 * Flags for prison_deref (bit values, may be OR'ed together in its "flags"
 * argument).
 *
 * NOTE(review): meanings below are inferred from the names only -- confirm
 * against prison_deref(): PD_DEREF / PD_DEUREF presumably drop a pr_ref /
 * pr_uref reference, PD_LOCKED presumably says the prison mutex is already
 * held, and PD_LIST_SLOCKED / PD_LIST_XLOCKED presumably say allprison_lock
 * is already held shared / exclusive.
 */
146#define	PD_DEREF	0x01
147#define	PD_DEUREF	0x02
148#define	PD_LOCKED	0x04
149#define	PD_LIST_SLOCKED	0x08
150#define	PD_LIST_XLOCKED	0x10
151
/*
 * Parameter names corresponding to PR_* flag values.  Entry N names the
 * flag whose bit is (1 << N); unused bit positions are left NULL.
 * kern_jail_set() walks pr_flag_names, skips the NULL slots, and uses the
 * same index into pr_flag_nonames for the negated spelling, so the two
 * tables must always have the same layout.
 *
 * Size values are for kvm as we cannot figure out the size of a sparse
 * array, or an array without a terminating entry.
 */
static char *pr_flag_names[] = {
	[0] = "persist",
#ifdef INET
	[7] = "ip4.saddrsel",
#endif
#ifdef INET6
	[8] = "ip6.saddrsel",
#endif
};
const size_t pr_flag_names_size = sizeof(pr_flag_names);

static char *pr_flag_nonames[] = {
	[0] = "nopersist",
#ifdef INET
	[7] = "ip4.nosaddrsel",
#endif
#ifdef INET6
	[8] = "ip6.nosaddrsel",
#endif
};
const size_t pr_flag_nonames_size = sizeof(pr_flag_nonames);

/* The tables are indexed in lock-step; catch a lopsided edit at build time. */
_Static_assert(sizeof(pr_flag_names) == sizeof(pr_flag_nonames),
    "pr_flag_names and pr_flag_nonames must have the same number of entries");
178
179struct jailsys_flags {
180	const char	*name;
181	unsigned	 disable;
182	unsigned	 new;
183} pr_flag_jailsys[] = {
184	{ "host", 0, PR_HOST },
185#ifdef VIMAGE
186	{ "vnet", 0, PR_VNET },
187#endif
188#ifdef INET
189	{ "ip4", PR_IP4_USER | PR_IP4_DISABLE, PR_IP4_USER },
190#endif
191#ifdef INET6
192	{ "ip6", PR_IP6_USER | PR_IP6_DISABLE, PR_IP6_USER },
193#endif
194};
195const size_t pr_flag_jailsys_size = sizeof(pr_flag_jailsys);
196
/*
 * Names of the allow.* permissions.  Entry N corresponds to bit (1 << N)
 * of the allow mask: kern_jail() picks pr_allow_names[N] or
 * pr_allow_nonames[N] depending on that bit of jail_default_allow, and
 * kern_jail_set() maps both spellings back to the same bit.  The two
 * tables are therefore indexed in lock-step and must stay the same length.
 */
static char *pr_allow_names[] = {
	"allow.set_hostname",
	"allow.sysvipc",
	"allow.raw_sockets",
	"allow.chflags",
	"allow.mount",
	"allow.quotas",
	"allow.socket_af",
	"allow.mount.devfs",
	"allow.mount.nullfs",
	"allow.mount.zfs",
	"allow.mount.procfs",
	"allow.mount.tmpfs",
	"allow.mount.fdescfs",
	"allow.mount.linprocfs",
	"allow.mount.linsysfs",
};
const size_t pr_allow_names_size = sizeof(pr_allow_names);

/* Negated spellings, in exactly the same order as pr_allow_names. */
static char *pr_allow_nonames[] = {
	"allow.noset_hostname",
	"allow.nosysvipc",
	"allow.noraw_sockets",
	"allow.nochflags",
	"allow.nomount",
	"allow.noquotas",
	"allow.nosocket_af",
	"allow.mount.nodevfs",
	"allow.mount.nonullfs",
	"allow.mount.nozfs",
	"allow.mount.noprocfs",
	"allow.mount.notmpfs",
	"allow.mount.nofdescfs",
	"allow.mount.nolinprocfs",
	"allow.mount.nolinsysfs",
};
const size_t pr_allow_nonames_size = sizeof(pr_allow_nonames);

/*
 * Callers bound their loops by pr_allow_names only and then index
 * pr_allow_nonames with the same counter; fail the build if an entry is
 * ever added to one table but not the other.
 */
_Static_assert(sizeof(pr_allow_names) == sizeof(pr_allow_nonames),
    "pr_allow_names and pr_allow_nonames must have the same number of entries");
234
/*
 * Defaults used by kern_jail() when a non-jailed caller creates a jail
 * through the legacy interface: jail_default_allow selects, bit by bit,
 * whether the allow.* or allow.no* name is passed on, and
 * jail_default_enforce_statfs supplies the "enforce_statfs" value.
 * (The comment in kern_jail() describes these as coming from sysctls.)
 * NOTE(review): jail_default_devfs_rsnum is not referenced in the part of
 * the file reviewed here -- confirm where it is consumed.
 */
235#define	JAIL_DEFAULT_ALLOW		PR_ALLOW_SET_HOSTNAME
236#define	JAIL_DEFAULT_ENFORCE_STATFS	2
237#define	JAIL_DEFAULT_DEVFS_RSNUM	0
238static unsigned jail_default_allow = JAIL_DEFAULT_ALLOW;
239static int jail_default_enforce_statfs = JAIL_DEFAULT_ENFORCE_STATFS;
240static int jail_default_devfs_rsnum = JAIL_DEFAULT_DEVFS_RSNUM;
241#if defined(INET) || defined(INET6)
/*
 * Upper bound on the number of addresses per address family; both
 * kern_jail() and kern_jail_set() reject larger counts with EINVAL.
 */
242static unsigned jail_max_af_ips = 255;
243#endif
244
245/*
246 * Initialize the parts of prison0 that can't be static-initialized with
247 * constants.  This is called from proc0_init() after creating thread0 cpuset.
248 */
249void
250prison0_init(void)
251{
252
253	prison0.pr_cpuset = cpuset_ref(thread0.td_cpuset);
254	prison0.pr_osreldate = osreldate;
255	strlcpy(prison0.pr_osrelease, osrelease, sizeof(prison0.pr_osrelease));
256}
257
258#ifdef INET
/*
 * qsort(3)-style comparator for two struct in_addr objects.
 * Returns -1, 0 or 1 according to the numeric order of the addresses.
 */
static int
qcmp_v4(const void *ip1, const void *ip2)
{
	const struct in_addr *lhs, *rhs;
	in_addr_t lhs_hbo, rhs_hbo;

	lhs = ip1;
	rhs = ip2;

	/*
	 * The addresses are stored in network byte order; convert to host
	 * byte order first so that the resulting sort order is the numeric
	 * one the rest of the code expects.
	 */
	lhs_hbo = ntohl(lhs->s_addr);
	rhs_hbo = ntohl(rhs->s_addr);

	/*
	 * Compare explicitly rather than subtracting: the difference of two
	 * 32-bit unsigned values does not fit in an int.
	 */
	if (lhs_hbo == rhs_hbo)
		return (0);
	return (lhs_hbo < rhs_hbo ? -1 : 1);
}
283#endif
284
285#ifdef INET6
/*
 * qsort(3)-style comparator for two struct in6_addr objects.
 * The 16 address bytes are compared from first to last as unsigned values;
 * the first differing byte decides.  Returns -1, 0 or 1.
 */
static int
qcmp_v6(const void *ip1, const void *ip2)
{
	const uint8_t *lhs, *rhs;
	size_t pos;

	lhs = ((const struct in6_addr *)ip1)->s6_addr;
	rhs = ((const struct in6_addr *)ip2)->s6_addr;

	for (pos = 0; pos < sizeof(struct in6_addr); pos++) {
		if (lhs[pos] != rhs[pos])
			return (lhs[pos] > rhs[pos] ? 1 : -1);
	}
	return (0);
}
304#endif
305
306/*
307 * struct jail_args {
308 *	struct jail *jail;
309 * };
310 */
311int
312sys_jail(struct thread *td, struct jail_args *uap)
313{
314	uint32_t version;
315	int error;
316	struct jail j;
317
318	error = copyin(uap->jail, &version, sizeof(uint32_t));
319	if (error)
320		return (error);
321
322	switch (version) {
323	case 0:
324	{
325		struct jail_v0 j0;
326
327		/* FreeBSD single IPv4 jails. */
328		bzero(&j, sizeof(struct jail));
329		error = copyin(uap->jail, &j0, sizeof(struct jail_v0));
330		if (error)
331			return (error);
332		j.version = j0.version;
333		j.path = j0.path;
334		j.hostname = j0.hostname;
335		j.ip4s = htonl(j0.ip_number);	/* jail_v0 is host order */
336		break;
337	}
338
339	case 1:
340		/*
341		 * Version 1 was used by multi-IPv4 jail implementations
342		 * that never made it into the official kernel.
343		 */
344		return (EINVAL);
345
346	case 2:	/* JAIL_API_VERSION */
347		/* FreeBSD multi-IPv4/IPv6,noIP jails. */
348		error = copyin(uap->jail, &j, sizeof(struct jail));
349		if (error)
350			return (error);
351		break;
352
353	default:
354		/* Sci-Fi jails are not supported, sorry. */
355		return (EINVAL);
356	}
357	return (kern_jail(td, &j));
358}
359
360int
361kern_jail(struct thread *td, struct jail *j)
362{
363	struct iovec optiov[2 * (4
364			    + sizeof(pr_allow_names) / sizeof(pr_allow_names[0])
365#ifdef INET
366			    + 1
367#endif
368#ifdef INET6
369			    + 1
370#endif
371			    )];
372	struct uio opt;
373	char *u_path, *u_hostname, *u_name;
374#ifdef INET
375	uint32_t ip4s;
376	struct in_addr *u_ip4;
377#endif
378#ifdef INET6
379	struct in6_addr *u_ip6;
380#endif
381	size_t tmplen;
382	int error, enforce_statfs, fi;
383
384	bzero(&optiov, sizeof(optiov));
385	opt.uio_iov = optiov;
386	opt.uio_iovcnt = 0;
387	opt.uio_offset = -1;
388	opt.uio_resid = -1;
389	opt.uio_segflg = UIO_SYSSPACE;
390	opt.uio_rw = UIO_READ;
391	opt.uio_td = td;
392
393	/* Set permissions for top-level jails from sysctls. */
394	if (!jailed(td->td_ucred)) {
395		for (fi = 0; fi < sizeof(pr_allow_names) /
396		     sizeof(pr_allow_names[0]); fi++) {
397			optiov[opt.uio_iovcnt].iov_base =
398			    (jail_default_allow & (1 << fi))
399			    ? pr_allow_names[fi] : pr_allow_nonames[fi];
400			optiov[opt.uio_iovcnt].iov_len =
401			    strlen(optiov[opt.uio_iovcnt].iov_base) + 1;
402			opt.uio_iovcnt += 2;
403		}
404		optiov[opt.uio_iovcnt].iov_base = "enforce_statfs";
405		optiov[opt.uio_iovcnt].iov_len = sizeof("enforce_statfs");
406		opt.uio_iovcnt++;
407		enforce_statfs = jail_default_enforce_statfs;
408		optiov[opt.uio_iovcnt].iov_base = &enforce_statfs;
409		optiov[opt.uio_iovcnt].iov_len = sizeof(enforce_statfs);
410		opt.uio_iovcnt++;
411	}
412
413	tmplen = MAXPATHLEN + MAXHOSTNAMELEN + MAXHOSTNAMELEN;
414#ifdef INET
415	ip4s = (j->version == 0) ? 1 : j->ip4s;
416	if (ip4s > jail_max_af_ips)
417		return (EINVAL);
418	tmplen += ip4s * sizeof(struct in_addr);
419#else
420	if (j->ip4s > 0)
421		return (EINVAL);
422#endif
423#ifdef INET6
424	if (j->ip6s > jail_max_af_ips)
425		return (EINVAL);
426	tmplen += j->ip6s * sizeof(struct in6_addr);
427#else
428	if (j->ip6s > 0)
429		return (EINVAL);
430#endif
431	u_path = malloc(tmplen, M_TEMP, M_WAITOK);
432	u_hostname = u_path + MAXPATHLEN;
433	u_name = u_hostname + MAXHOSTNAMELEN;
434#ifdef INET
435	u_ip4 = (struct in_addr *)(u_name + MAXHOSTNAMELEN);
436#endif
437#ifdef INET6
438#ifdef INET
439	u_ip6 = (struct in6_addr *)(u_ip4 + ip4s);
440#else
441	u_ip6 = (struct in6_addr *)(u_name + MAXHOSTNAMELEN);
442#endif
443#endif
444	optiov[opt.uio_iovcnt].iov_base = "path";
445	optiov[opt.uio_iovcnt].iov_len = sizeof("path");
446	opt.uio_iovcnt++;
447	optiov[opt.uio_iovcnt].iov_base = u_path;
448	error = copyinstr(j->path, u_path, MAXPATHLEN,
449	    &optiov[opt.uio_iovcnt].iov_len);
450	if (error) {
451		free(u_path, M_TEMP);
452		return (error);
453	}
454	opt.uio_iovcnt++;
455	optiov[opt.uio_iovcnt].iov_base = "host.hostname";
456	optiov[opt.uio_iovcnt].iov_len = sizeof("host.hostname");
457	opt.uio_iovcnt++;
458	optiov[opt.uio_iovcnt].iov_base = u_hostname;
459	error = copyinstr(j->hostname, u_hostname, MAXHOSTNAMELEN,
460	    &optiov[opt.uio_iovcnt].iov_len);
461	if (error) {
462		free(u_path, M_TEMP);
463		return (error);
464	}
465	opt.uio_iovcnt++;
466	if (j->jailname != NULL) {
467		optiov[opt.uio_iovcnt].iov_base = "name";
468		optiov[opt.uio_iovcnt].iov_len = sizeof("name");
469		opt.uio_iovcnt++;
470		optiov[opt.uio_iovcnt].iov_base = u_name;
471		error = copyinstr(j->jailname, u_name, MAXHOSTNAMELEN,
472		    &optiov[opt.uio_iovcnt].iov_len);
473		if (error) {
474			free(u_path, M_TEMP);
475			return (error);
476		}
477		opt.uio_iovcnt++;
478	}
479#ifdef INET
480	optiov[opt.uio_iovcnt].iov_base = "ip4.addr";
481	optiov[opt.uio_iovcnt].iov_len = sizeof("ip4.addr");
482	opt.uio_iovcnt++;
483	optiov[opt.uio_iovcnt].iov_base = u_ip4;
484	optiov[opt.uio_iovcnt].iov_len = ip4s * sizeof(struct in_addr);
485	if (j->version == 0)
486		u_ip4->s_addr = j->ip4s;
487	else {
488		error = copyin(j->ip4, u_ip4, optiov[opt.uio_iovcnt].iov_len);
489		if (error) {
490			free(u_path, M_TEMP);
491			return (error);
492		}
493	}
494	opt.uio_iovcnt++;
495#endif
496#ifdef INET6
497	optiov[opt.uio_iovcnt].iov_base = "ip6.addr";
498	optiov[opt.uio_iovcnt].iov_len = sizeof("ip6.addr");
499	opt.uio_iovcnt++;
500	optiov[opt.uio_iovcnt].iov_base = u_ip6;
501	optiov[opt.uio_iovcnt].iov_len = j->ip6s * sizeof(struct in6_addr);
502	error = copyin(j->ip6, u_ip6, optiov[opt.uio_iovcnt].iov_len);
503	if (error) {
504		free(u_path, M_TEMP);
505		return (error);
506	}
507	opt.uio_iovcnt++;
508#endif
509	KASSERT(opt.uio_iovcnt <= sizeof(optiov) / sizeof(optiov[0]),
510	    ("kern_jail: too many iovecs (%d)", opt.uio_iovcnt));
511	error = kern_jail_set(td, &opt, JAIL_CREATE | JAIL_ATTACH);
512	free(u_path, M_TEMP);
513	return (error);
514}
515
516
517/*
518 * struct jail_set_args {
519 *	struct iovec *iovp;
520 *	unsigned int iovcnt;
521 *	int flags;
522 * };
523 */
524int
525sys_jail_set(struct thread *td, struct jail_set_args *uap)
526{
527	struct uio *auio;
528	int error;
529
530	/* Check that we have an even number of iovecs. */
531	if (uap->iovcnt & 1)
532		return (EINVAL);
533
534	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
535	if (error)
536		return (error);
537	error = kern_jail_set(td, auio, uap->flags);
538	free(auio, M_IOV);
539	return (error);
540}
541
542int
543kern_jail_set(struct thread *td, struct uio *optuio, int flags)
544{
545	struct nameidata nd;
546#ifdef INET
547	struct in_addr *ip4;
548#endif
549#ifdef INET6
550	struct in6_addr *ip6;
551#endif
552	struct vfsopt *opt;
553	struct vfsoptlist *opts;
554	struct prison *pr, *deadpr, *mypr, *ppr, *tpr;
555	struct vnode *root;
556	char *domain, *errmsg, *host, *name, *namelc, *p, *path, *uuid;
557	char *g_path, *osrelstr;
558#if defined(INET) || defined(INET6)
559	struct prison *tppr;
560	void *op;
561#endif
562	unsigned long hid;
563	size_t namelen, onamelen, pnamelen;
564	int born, created, cuflags, descend, enforce;
565	int error, errmsg_len, errmsg_pos;
566	int gotchildmax, gotenforce, gothid, gotrsnum, gotslevel;
567	int fi, jid, jsys, len, level;
568	int childmax, osreldt, rsnum, slevel;
569	int fullpath_disabled;
570#if defined(INET) || defined(INET6)
571	int ii, ij;
572#endif
573#ifdef INET
574	int ip4s, redo_ip4;
575#endif
576#ifdef INET6
577	int ip6s, redo_ip6;
578#endif
579	uint64_t pr_allow, ch_allow, pr_flags, ch_flags;
580	unsigned tallow;
581	char numbuf[12];
582
583	error = priv_check(td, PRIV_JAIL_SET);
584	if (!error && (flags & JAIL_ATTACH))
585		error = priv_check(td, PRIV_JAIL_ATTACH);
586	if (error)
587		return (error);
588	mypr = td->td_ucred->cr_prison;
589	if ((flags & JAIL_CREATE) && mypr->pr_childmax == 0)
590		return (EPERM);
591	if (flags & ~JAIL_SET_MASK)
592		return (EINVAL);
593
594	/*
595	 * Check all the parameters before committing to anything.  Not all
596	 * errors can be caught early, but we may as well try.  Also, this
597	 * takes care of some expensive stuff (path lookup) before getting
598	 * the allprison lock.
599	 *
600	 * XXX Jails are not filesystems, and jail parameters are not mount
601	 *     options.  But it makes more sense to re-use the vfsopt code
602	 *     than duplicate it under a different name.
603	 */
604	error = vfs_buildopts(optuio, &opts);
605	if (error)
606		return (error);
607#ifdef INET
608	ip4 = NULL;
609#endif
610#ifdef INET6
611	ip6 = NULL;
612#endif
613	g_path = NULL;
614
615	cuflags = flags & (JAIL_CREATE | JAIL_UPDATE);
616	if (!cuflags) {
617		error = EINVAL;
618		vfs_opterror(opts, "no valid operation (create or update)");
619		goto done_errmsg;
620	}
621
622	error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
623	if (error == ENOENT)
624		jid = 0;
625	else if (error != 0)
626		goto done_free;
627
628	error = vfs_copyopt(opts, "securelevel", &slevel, sizeof(slevel));
629	if (error == ENOENT)
630		gotslevel = 0;
631	else if (error != 0)
632		goto done_free;
633	else
634		gotslevel = 1;
635
636	error =
637	    vfs_copyopt(opts, "children.max", &childmax, sizeof(childmax));
638	if (error == ENOENT)
639		gotchildmax = 0;
640	else if (error != 0)
641		goto done_free;
642	else
643		gotchildmax = 1;
644
645	error = vfs_copyopt(opts, "enforce_statfs", &enforce, sizeof(enforce));
646	if (error == ENOENT)
647		gotenforce = 0;
648	else if (error != 0)
649		goto done_free;
650	else if (enforce < 0 || enforce > 2) {
651		error = EINVAL;
652		goto done_free;
653	} else
654		gotenforce = 1;
655
656	error = vfs_copyopt(opts, "devfs_ruleset", &rsnum, sizeof(rsnum));
657	if (error == ENOENT)
658		gotrsnum = 0;
659	else if (error != 0)
660		goto done_free;
661	else
662		gotrsnum = 1;
663
664	pr_flags = ch_flags = 0;
665	for (fi = 0; fi < sizeof(pr_flag_names) / sizeof(pr_flag_names[0]);
666	    fi++) {
667		if (pr_flag_names[fi] == NULL)
668			continue;
669		vfs_flagopt(opts, pr_flag_names[fi], &pr_flags, 1 << fi);
670		vfs_flagopt(opts, pr_flag_nonames[fi], &ch_flags, 1 << fi);
671	}
672	ch_flags |= pr_flags;
673	for (fi = 0; fi < sizeof(pr_flag_jailsys) / sizeof(pr_flag_jailsys[0]);
674	    fi++) {
675		error = vfs_copyopt(opts, pr_flag_jailsys[fi].name, &jsys,
676		    sizeof(jsys));
677		if (error == ENOENT)
678			continue;
679		if (error != 0)
680			goto done_free;
681		switch (jsys) {
682		case JAIL_SYS_DISABLE:
683			if (!pr_flag_jailsys[fi].disable) {
684				error = EINVAL;
685				goto done_free;
686			}
687			pr_flags |= pr_flag_jailsys[fi].disable;
688			break;
689		case JAIL_SYS_NEW:
690			pr_flags |= pr_flag_jailsys[fi].new;
691			break;
692		case JAIL_SYS_INHERIT:
693			break;
694		default:
695			error = EINVAL;
696			goto done_free;
697		}
698		ch_flags |=
699		    pr_flag_jailsys[fi].new | pr_flag_jailsys[fi].disable;
700	}
701	if ((flags & (JAIL_CREATE | JAIL_UPDATE | JAIL_ATTACH)) == JAIL_CREATE
702	    && !(pr_flags & PR_PERSIST)) {
703		error = EINVAL;
704		vfs_opterror(opts, "new jail must persist or attach");
705		goto done_errmsg;
706	}
707#ifdef VIMAGE
708	if ((flags & JAIL_UPDATE) && (ch_flags & PR_VNET)) {
709		error = EINVAL;
710		vfs_opterror(opts, "vnet cannot be changed after creation");
711		goto done_errmsg;
712	}
713#endif
714#ifdef INET
715	if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP4_USER)) {
716		error = EINVAL;
717		vfs_opterror(opts, "ip4 cannot be changed after creation");
718		goto done_errmsg;
719	}
720#endif
721#ifdef INET6
722	if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP6_USER)) {
723		error = EINVAL;
724		vfs_opterror(opts, "ip6 cannot be changed after creation");
725		goto done_errmsg;
726	}
727#endif
728
729	pr_allow = ch_allow = 0;
730	for (fi = 0; fi < sizeof(pr_allow_names) / sizeof(pr_allow_names[0]);
731	    fi++) {
732		vfs_flagopt(opts, pr_allow_names[fi], &pr_allow, 1 << fi);
733		vfs_flagopt(opts, pr_allow_nonames[fi], &ch_allow, 1 << fi);
734	}
735	ch_allow |= pr_allow;
736
737	error = vfs_getopt(opts, "name", (void **)&name, &len);
738	if (error == ENOENT)
739		name = NULL;
740	else if (error != 0)
741		goto done_free;
742	else {
743		if (len == 0 || name[len - 1] != '\0') {
744			error = EINVAL;
745			goto done_free;
746		}
747		if (len > MAXHOSTNAMELEN) {
748			error = ENAMETOOLONG;
749			goto done_free;
750		}
751	}
752
753	error = vfs_getopt(opts, "host.hostname", (void **)&host, &len);
754	if (error == ENOENT)
755		host = NULL;
756	else if (error != 0)
757		goto done_free;
758	else {
759		ch_flags |= PR_HOST;
760		pr_flags |= PR_HOST;
761		if (len == 0 || host[len - 1] != '\0') {
762			error = EINVAL;
763			goto done_free;
764		}
765		if (len > MAXHOSTNAMELEN) {
766			error = ENAMETOOLONG;
767			goto done_free;
768		}
769	}
770
771	error = vfs_getopt(opts, "host.domainname", (void **)&domain, &len);
772	if (error == ENOENT)
773		domain = NULL;
774	else if (error != 0)
775		goto done_free;
776	else {
777		ch_flags |= PR_HOST;
778		pr_flags |= PR_HOST;
779		if (len == 0 || domain[len - 1] != '\0') {
780			error = EINVAL;
781			goto done_free;
782		}
783		if (len > MAXHOSTNAMELEN) {
784			error = ENAMETOOLONG;
785			goto done_free;
786		}
787	}
788
789	error = vfs_getopt(opts, "host.hostuuid", (void **)&uuid, &len);
790	if (error == ENOENT)
791		uuid = NULL;
792	else if (error != 0)
793		goto done_free;
794	else {
795		ch_flags |= PR_HOST;
796		pr_flags |= PR_HOST;
797		if (len == 0 || uuid[len - 1] != '\0') {
798			error = EINVAL;
799			goto done_free;
800		}
801		if (len > HOSTUUIDLEN) {
802			error = ENAMETOOLONG;
803			goto done_free;
804		}
805	}
806
807#ifdef COMPAT_FREEBSD32
808	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
809		uint32_t hid32;
810
811		error = vfs_copyopt(opts, "host.hostid", &hid32, sizeof(hid32));
812		hid = hid32;
813	} else
814#endif
815		error = vfs_copyopt(opts, "host.hostid", &hid, sizeof(hid));
816	if (error == ENOENT)
817		gothid = 0;
818	else if (error != 0)
819		goto done_free;
820	else {
821		gothid = 1;
822		ch_flags |= PR_HOST;
823		pr_flags |= PR_HOST;
824	}
825
826#ifdef INET
827	error = vfs_getopt(opts, "ip4.addr", &op, &ip4s);
828	if (error == ENOENT)
829		ip4s = 0;
830	else if (error != 0)
831		goto done_free;
832	else if (ip4s & (sizeof(*ip4) - 1)) {
833		error = EINVAL;
834		goto done_free;
835	} else {
836		ch_flags |= PR_IP4_USER | PR_IP4_DISABLE;
837		if (ip4s == 0)
838			pr_flags |= PR_IP4_USER | PR_IP4_DISABLE;
839		else {
840			pr_flags = (pr_flags & ~PR_IP4_DISABLE) | PR_IP4_USER;
841			ip4s /= sizeof(*ip4);
842			if (ip4s > jail_max_af_ips) {
843				error = EINVAL;
844				vfs_opterror(opts, "too many IPv4 addresses");
845				goto done_errmsg;
846			}
847			ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK);
848			bcopy(op, ip4, ip4s * sizeof(*ip4));
849			/*
850			 * IP addresses are all sorted but ip[0] to preserve
851			 * the primary IP address as given from userland.
852			 * This special IP is used for unbound outgoing
853			 * connections as well for "loopback" traffic in case
854			 * source address selection cannot find any more fitting
855			 * address to connect from.
856			 */
857			if (ip4s > 1)
858				qsort(ip4 + 1, ip4s - 1, sizeof(*ip4), qcmp_v4);
859			/*
860			 * Check for duplicate addresses and do some simple
861			 * zero and broadcast checks. If users give other bogus
862			 * addresses it is their problem.
863			 *
864			 * We do not have to care about byte order for these
865			 * checks so we will do them in NBO.
866			 */
867			for (ii = 0; ii < ip4s; ii++) {
868				if (ip4[ii].s_addr == INADDR_ANY ||
869				    ip4[ii].s_addr == INADDR_BROADCAST) {
870					error = EINVAL;
871					goto done_free;
872				}
873				if ((ii+1) < ip4s &&
874				    (ip4[0].s_addr == ip4[ii+1].s_addr ||
875				     ip4[ii].s_addr == ip4[ii+1].s_addr)) {
876					error = EINVAL;
877					goto done_free;
878				}
879			}
880		}
881	}
882#endif
883
884#ifdef INET6
885	error = vfs_getopt(opts, "ip6.addr", &op, &ip6s);
886	if (error == ENOENT)
887		ip6s = 0;
888	else if (error != 0)
889		goto done_free;
890	else if (ip6s & (sizeof(*ip6) - 1)) {
891		error = EINVAL;
892		goto done_free;
893	} else {
894		ch_flags |= PR_IP6_USER | PR_IP6_DISABLE;
895		if (ip6s == 0)
896			pr_flags |= PR_IP6_USER | PR_IP6_DISABLE;
897		else {
898			pr_flags = (pr_flags & ~PR_IP6_DISABLE) | PR_IP6_USER;
899			ip6s /= sizeof(*ip6);
900			if (ip6s > jail_max_af_ips) {
901				error = EINVAL;
902				vfs_opterror(opts, "too many IPv6 addresses");
903				goto done_errmsg;
904			}
905			ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK);
906			bcopy(op, ip6, ip6s * sizeof(*ip6));
907			if (ip6s > 1)
908				qsort(ip6 + 1, ip6s - 1, sizeof(*ip6), qcmp_v6);
909			for (ii = 0; ii < ip6s; ii++) {
910				if (IN6_IS_ADDR_UNSPECIFIED(&ip6[ii])) {
911					error = EINVAL;
912					goto done_free;
913				}
914				if ((ii+1) < ip6s &&
915				    (IN6_ARE_ADDR_EQUAL(&ip6[0], &ip6[ii+1]) ||
916				     IN6_ARE_ADDR_EQUAL(&ip6[ii], &ip6[ii+1])))
917				{
918					error = EINVAL;
919					goto done_free;
920				}
921			}
922		}
923	}
924#endif
925
926#if defined(VIMAGE) && (defined(INET) || defined(INET6))
927	if ((ch_flags & PR_VNET) && (ch_flags & (PR_IP4_USER | PR_IP6_USER))) {
928		error = EINVAL;
929		vfs_opterror(opts,
930		    "vnet jails cannot have IP address restrictions");
931		goto done_errmsg;
932	}
933#endif
934
935	fullpath_disabled = 0;
936	root = NULL;
937	error = vfs_getopt(opts, "path", (void **)&path, &len);
938	if (error == ENOENT)
939		path = NULL;
940	else if (error != 0)
941		goto done_free;
942	else {
943		if (flags & JAIL_UPDATE) {
944			error = EINVAL;
945			vfs_opterror(opts,
946			    "path cannot be changed after creation");
947			goto done_errmsg;
948		}
949		if (len == 0 || path[len - 1] != '\0') {
950			error = EINVAL;
951			goto done_free;
952		}
953		NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE,
954		    path, td);
955		error = namei(&nd);
956		if (error)
957			goto done_free;
958		root = nd.ni_vp;
959		NDFREE(&nd, NDF_ONLY_PNBUF);
960		g_path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
961		strlcpy(g_path, path, MAXPATHLEN);
962		error = vn_path_to_global_path(td, root, g_path, MAXPATHLEN);
963		if (error == 0)
964			path = g_path;
965		else if (error == ENODEV) {
966			/* proceed if sysctl debug.disablefullpath == 1 */
967			fullpath_disabled = 1;
968			if (len < 2 || (len == 2 && path[0] == '/'))
969				path = NULL;
970		} else {
971			/* exit on other errors */
972			goto done_free;
973		}
974		if (root->v_type != VDIR) {
975			error = ENOTDIR;
976			vput(root);
977			goto done_free;
978		}
979		VOP_UNLOCK(root, 0);
980		if (fullpath_disabled) {
981			/* Leave room for a real-root full pathname. */
982			if (len + (path[0] == '/' && strcmp(mypr->pr_path, "/")
983			    ? strlen(mypr->pr_path) : 0) > MAXPATHLEN) {
984				error = ENAMETOOLONG;
985				goto done_free;
986			}
987		}
988	}
989
990	error = vfs_getopt(opts, "osrelease", (void **)&osrelstr, &len);
991	if (error == ENOENT)
992		osrelstr = NULL;
993	else if (error != 0)
994		goto done_free;
995	else {
996		if (flags & JAIL_UPDATE) {
997			error = EINVAL;
998			vfs_opterror(opts,
999			    "osrelease cannot be changed after creation");
1000			goto done_errmsg;
1001		}
1002		if (len == 0 || len >= OSRELEASELEN) {
1003			error = EINVAL;
1004			vfs_opterror(opts,
1005			    "osrelease string must be 1-%d bytes long",
1006			    OSRELEASELEN - 1);
1007			goto done_errmsg;
1008		}
1009	}
1010
1011	error = vfs_copyopt(opts, "osreldate", &osreldt, sizeof(osreldt));
1012	if (error == ENOENT)
1013		osreldt = 0;
1014	else if (error != 0)
1015		goto done_free;
1016	else {
1017		if (flags & JAIL_UPDATE) {
1018			error = EINVAL;
1019			vfs_opterror(opts,
1020			    "osreldate cannot be changed after creation");
1021			goto done_errmsg;
1022		}
1023		if (osreldt == 0) {
1024			error = EINVAL;
1025			vfs_opterror(opts, "osreldate cannot be 0");
1026			goto done_errmsg;
1027		}
1028	}
1029
1030	/*
1031	 * Find the specified jail, or at least its parent.
1032	 * This abuses the file error codes ENOENT and EEXIST.
1033	 */
1034	pr = NULL;
1035	ppr = mypr;
1036	if (cuflags == JAIL_CREATE && jid == 0 && name != NULL) {
1037		namelc = strrchr(name, '.');
1038		jid = strtoul(namelc != NULL ? namelc + 1 : name, &p, 10);
1039		if (*p != '\0')
1040			jid = 0;
1041	}
1042	sx_xlock(&allprison_lock);
1043	if (jid != 0) {
1044		/*
1045		 * See if a requested jid already exists.  There is an
1046		 * information leak here if the jid exists but is not within
1047		 * the caller's jail hierarchy.  Jail creators will get EEXIST
1048		 * even though they cannot see the jail, and CREATE | UPDATE
1049		 * will return ENOENT which is not normally a valid error.
1050		 */
1051		if (jid < 0) {
1052			error = EINVAL;
1053			vfs_opterror(opts, "negative jid");
1054			goto done_unlock_list;
1055		}
1056		pr = prison_find(jid);
1057		if (pr != NULL) {
1058			ppr = pr->pr_parent;
1059			/* Create: jid must not exist. */
1060			if (cuflags == JAIL_CREATE) {
1061				mtx_unlock(&pr->pr_mtx);
1062				error = EEXIST;
1063				vfs_opterror(opts, "jail %d already exists",
1064				    jid);
1065				goto done_unlock_list;
1066			}
1067			if (!prison_ischild(mypr, pr)) {
1068				mtx_unlock(&pr->pr_mtx);
1069				pr = NULL;
1070			} else if (pr->pr_uref == 0) {
1071				if (!(flags & JAIL_DYING)) {
1072					mtx_unlock(&pr->pr_mtx);
1073					error = ENOENT;
1074					vfs_opterror(opts, "jail %d is dying",
1075					    jid);
1076					goto done_unlock_list;
1077				} else if ((flags & JAIL_ATTACH) ||
1078				    (pr_flags & PR_PERSIST)) {
1079					/*
1080					 * A dying jail might be resurrected
1081					 * (via attach or persist), but first
1082					 * it must determine if another jail
1083					 * has claimed its name.  Accomplish
1084					 * this by implicitly re-setting the
1085					 * name.
1086					 */
1087					if (name == NULL)
1088						name = prison_name(mypr, pr);
1089				}
1090			}
1091		}
1092		if (pr == NULL) {
1093			/* Update: jid must exist. */
1094			if (cuflags == JAIL_UPDATE) {
1095				error = ENOENT;
1096				vfs_opterror(opts, "jail %d not found", jid);
1097				goto done_unlock_list;
1098			}
1099		}
1100	}
1101	/*
1102	 * If the caller provided a name, look for a jail by that name.
1103	 * This has different semantics for creates and updates keyed by jid
1104	 * (where the name must not already exist in a different jail),
1105	 * and updates keyed by the name itself (where the name must exist
1106	 * because that is the jail being updated).
1107	 */
1108	namelc = NULL;
1109	if (name != NULL) {
1110		namelc = strrchr(name, '.');
1111		if (namelc == NULL)
1112			namelc = name;
1113		else {
1114			/*
1115			 * This is a hierarchical name.  Split it into the
1116			 * parent and child names, and make sure the parent
1117			 * exists or matches an already found jail.
1118			 */
1119			if (pr != NULL) {
1120				if (strncmp(name, ppr->pr_name, namelc - name)
1121				    || ppr->pr_name[namelc - name] != '\0') {
1122					mtx_unlock(&pr->pr_mtx);
1123					error = EINVAL;
1124					vfs_opterror(opts,
1125					    "cannot change jail's parent");
1126					goto done_unlock_list;
1127				}
1128			} else {
1129				*namelc = '\0';
1130				ppr = prison_find_name(mypr, name);
1131				if (ppr == NULL) {
1132					error = ENOENT;
1133					vfs_opterror(opts,
1134					    "jail \"%s\" not found", name);
1135					goto done_unlock_list;
1136				}
1137				mtx_unlock(&ppr->pr_mtx);
1138				*namelc = '.';
1139			}
1140			namelc++;
1141		}
1142		if (namelc[0] != '\0') {
1143			pnamelen =
1144			    (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1;
1145 name_again:
1146			deadpr = NULL;
1147			FOREACH_PRISON_CHILD(ppr, tpr) {
1148				if (tpr != pr && tpr->pr_ref > 0 &&
1149				    !strcmp(tpr->pr_name + pnamelen, namelc)) {
1150					if (pr == NULL &&
1151					    cuflags != JAIL_CREATE) {
1152						mtx_lock(&tpr->pr_mtx);
1153						if (tpr->pr_ref > 0) {
1154							/*
1155							 * Use this jail
1156							 * for updates.
1157							 */
1158							if (tpr->pr_uref > 0) {
1159								pr = tpr;
1160								break;
1161							}
1162							deadpr = tpr;
1163						}
1164						mtx_unlock(&tpr->pr_mtx);
1165					} else if (tpr->pr_uref > 0) {
1166						/*
1167						 * Create, or update(jid):
1168						 * name must not exist in an
1169						 * active sibling jail.
1170						 */
1171						error = EEXIST;
1172						if (pr != NULL)
1173							mtx_unlock(&pr->pr_mtx);
1174						vfs_opterror(opts,
1175						   "jail \"%s\" already exists",
1176						   name);
1177						goto done_unlock_list;
1178					}
1179				}
1180			}
1181			/* If no active jail is found, use a dying one. */
1182			if (deadpr != NULL && pr == NULL) {
1183				if (flags & JAIL_DYING) {
1184					mtx_lock(&deadpr->pr_mtx);
1185					if (deadpr->pr_ref == 0) {
1186						mtx_unlock(&deadpr->pr_mtx);
1187						goto name_again;
1188					}
1189					pr = deadpr;
1190				} else if (cuflags == JAIL_UPDATE) {
1191					error = ENOENT;
1192					vfs_opterror(opts,
1193					    "jail \"%s\" is dying", name);
1194					goto done_unlock_list;
1195				}
1196			}
1197			/* Update: name must exist if no jid. */
1198			else if (cuflags == JAIL_UPDATE && pr == NULL) {
1199				error = ENOENT;
1200				vfs_opterror(opts, "jail \"%s\" not found",
1201				    name);
1202				goto done_unlock_list;
1203			}
1204		}
1205	}
1206	/* Update: must provide a jid or name. */
1207	else if (cuflags == JAIL_UPDATE && pr == NULL) {
1208		error = ENOENT;
1209		vfs_opterror(opts, "update specified no jail");
1210		goto done_unlock_list;
1211	}
1212
1213	/* If there's no prison to update, create a new one and link it in. */
1214	if (pr == NULL) {
1215		for (tpr = mypr; tpr != NULL; tpr = tpr->pr_parent)
1216			if (tpr->pr_childcount >= tpr->pr_childmax) {
1217				error = EPERM;
1218				vfs_opterror(opts, "prison limit exceeded");
1219				goto done_unlock_list;
1220			}
1221		created = 1;
1222		mtx_lock(&ppr->pr_mtx);
1223		if (ppr->pr_ref == 0) {
1224			mtx_unlock(&ppr->pr_mtx);
1225			error = ENOENT;
1226			vfs_opterror(opts, "jail \"%s\" not found",
1227			    prison_name(mypr, ppr));
1228			goto done_unlock_list;
1229		}
1230		ppr->pr_ref++;
1231		ppr->pr_uref++;
1232		mtx_unlock(&ppr->pr_mtx);
1233		pr = malloc(sizeof(*pr), M_PRISON, M_WAITOK | M_ZERO);
1234		if (jid == 0) {
1235			/* Find the next free jid. */
1236			jid = lastprid + 1;
1237 findnext:
1238			if (jid == JAIL_MAX)
1239				jid = 1;
1240			TAILQ_FOREACH(tpr, &allprison, pr_list) {
1241				if (tpr->pr_id < jid)
1242					continue;
1243				if (tpr->pr_id > jid || tpr->pr_ref == 0) {
1244					TAILQ_INSERT_BEFORE(tpr, pr, pr_list);
1245					break;
1246				}
1247				if (jid == lastprid) {
1248					error = EAGAIN;
1249					vfs_opterror(opts,
1250					    "no available jail IDs");
1251					free(pr, M_PRISON);
1252					prison_deref(ppr, PD_DEREF |
1253					    PD_DEUREF | PD_LIST_XLOCKED);
1254					goto done_releroot;
1255				}
1256				jid++;
1257				goto findnext;
1258			}
1259			lastprid = jid;
1260		} else {
1261			/*
1262			 * The jail already has a jid (that did not yet exist),
1263			 * so just find where to insert it.
1264			 */
1265			TAILQ_FOREACH(tpr, &allprison, pr_list)
1266				if (tpr->pr_id >= jid) {
1267					TAILQ_INSERT_BEFORE(tpr, pr, pr_list);
1268					break;
1269				}
1270		}
1271		if (tpr == NULL)
1272			TAILQ_INSERT_TAIL(&allprison, pr, pr_list);
1273		LIST_INSERT_HEAD(&ppr->pr_children, pr, pr_sibling);
1274		for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent)
1275			tpr->pr_childcount++;
1276
1277		pr->pr_parent = ppr;
1278		pr->pr_id = jid;
1279
1280		/* Set some default values, and inherit some from the parent. */
1281		if (namelc == NULL)
1282			namelc = "";
1283		if (path == NULL) {
1284			path = "/";
1285			root = mypr->pr_root;
1286			vref(root);
1287		}
1288		strlcpy(pr->pr_hostuuid, DEFAULT_HOSTUUID, HOSTUUIDLEN);
1289		pr->pr_flags |= PR_HOST;
1290#if defined(INET) || defined(INET6)
1291#ifdef VIMAGE
1292		if (!(pr_flags & PR_VNET))
1293#endif
1294		{
1295#ifdef INET
1296			if (!(ch_flags & PR_IP4_USER))
1297				pr->pr_flags |=
1298				    PR_IP4 | PR_IP4_USER | PR_IP4_DISABLE;
1299			else if (!(pr_flags & PR_IP4_USER)) {
1300				pr->pr_flags |= ppr->pr_flags & PR_IP4;
1301				if (ppr->pr_ip4 != NULL) {
1302					pr->pr_ip4s = ppr->pr_ip4s;
1303					pr->pr_ip4 = malloc(pr->pr_ip4s *
1304					    sizeof(struct in_addr), M_PRISON,
1305					    M_WAITOK);
1306					bcopy(ppr->pr_ip4, pr->pr_ip4,
1307					    pr->pr_ip4s * sizeof(*pr->pr_ip4));
1308				}
1309			}
1310#endif
1311#ifdef INET6
1312			if (!(ch_flags & PR_IP6_USER))
1313				pr->pr_flags |=
1314				    PR_IP6 | PR_IP6_USER | PR_IP6_DISABLE;
1315			else if (!(pr_flags & PR_IP6_USER)) {
1316				pr->pr_flags |= ppr->pr_flags & PR_IP6;
1317				if (ppr->pr_ip6 != NULL) {
1318					pr->pr_ip6s = ppr->pr_ip6s;
1319					pr->pr_ip6 = malloc(pr->pr_ip6s *
1320					    sizeof(struct in6_addr), M_PRISON,
1321					    M_WAITOK);
1322					bcopy(ppr->pr_ip6, pr->pr_ip6,
1323					    pr->pr_ip6s * sizeof(*pr->pr_ip6));
1324				}
1325			}
1326#endif
1327		}
1328#endif
1329		/* Source address selection is always on by default. */
1330		pr->pr_flags |= _PR_IP_SADDRSEL;
1331
1332		pr->pr_securelevel = ppr->pr_securelevel;
1333		pr->pr_allow = JAIL_DEFAULT_ALLOW & ppr->pr_allow;
1334		pr->pr_enforce_statfs = JAIL_DEFAULT_ENFORCE_STATFS;
1335		pr->pr_devfs_rsnum = ppr->pr_devfs_rsnum;
1336
1337		pr->pr_osreldate = osreldt ? osreldt : ppr->pr_osreldate;
1338		if (osrelstr == NULL)
1339		    strcpy(pr->pr_osrelease, ppr->pr_osrelease);
1340		else
1341		    strcpy(pr->pr_osrelease, osrelstr);
1342
1343		LIST_INIT(&pr->pr_children);
1344		mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF | MTX_DUPOK);
1345		TASK_INIT(&pr->pr_task, 0, prison_complete, pr);
1346
1347#ifdef VIMAGE
1348		/* Allocate a new vnet if specified. */
1349		pr->pr_vnet = (pr_flags & PR_VNET)
1350		    ? vnet_alloc() : ppr->pr_vnet;
1351#endif
1352		/*
1353		 * Allocate a dedicated cpuset for each jail.
1354		 * Unlike other initial settings, this may return an error.
1355		 */
1356		error = cpuset_create_root(ppr, &pr->pr_cpuset);
1357		if (error) {
1358			prison_deref(pr, PD_LIST_XLOCKED);
1359			goto done_releroot;
1360		}
1361
1362		mtx_lock(&pr->pr_mtx);
1363		/*
1364		 * New prisons do not yet have a reference, because we do not
1365		 * want others to see the incomplete prison once the
1366		 * allprison_lock is downgraded.
1367		 */
1368	} else {
1369		created = 0;
1370		/*
1371		 * Grab a reference for existing prisons, to ensure they
1372		 * continue to exist for the duration of the call.
1373		 */
1374		pr->pr_ref++;
1375#if defined(VIMAGE) && (defined(INET) || defined(INET6))
1376		if ((pr->pr_flags & PR_VNET) &&
1377		    (ch_flags & (PR_IP4_USER | PR_IP6_USER))) {
1378			error = EINVAL;
1379			vfs_opterror(opts,
1380			    "vnet jails cannot have IP address restrictions");
1381			goto done_deref_locked;
1382		}
1383#endif
1384#ifdef INET
1385		if (PR_IP4_USER & ch_flags & (pr_flags ^ pr->pr_flags)) {
1386			error = EINVAL;
1387			vfs_opterror(opts,
1388			    "ip4 cannot be changed after creation");
1389			goto done_deref_locked;
1390		}
1391#endif
1392#ifdef INET6
1393		if (PR_IP6_USER & ch_flags & (pr_flags ^ pr->pr_flags)) {
1394			error = EINVAL;
1395			vfs_opterror(opts,
1396			    "ip6 cannot be changed after creation");
1397			goto done_deref_locked;
1398		}
1399#endif
1400	}
1401
1402	/* Do final error checking before setting anything. */
1403	if (gotslevel) {
1404		if (slevel < ppr->pr_securelevel) {
1405			error = EPERM;
1406			goto done_deref_locked;
1407		}
1408	}
1409	if (gotchildmax) {
1410		if (childmax >= ppr->pr_childmax) {
1411			error = EPERM;
1412			goto done_deref_locked;
1413		}
1414	}
1415	if (gotenforce) {
1416		if (enforce < ppr->pr_enforce_statfs) {
1417			error = EPERM;
1418			goto done_deref_locked;
1419		}
1420	}
1421	if (gotrsnum) {
1422		/*
1423		 * devfs_rsnum is a uint16_t
1424		 */
1425		if (rsnum < 0 || rsnum > 65535) {
1426			error = EINVAL;
1427			goto done_deref_locked;
1428		}
1429		/*
1430		 * Nested jails always inherit parent's devfs ruleset
1431		 */
1432		if (jailed(td->td_ucred)) {
1433			if (rsnum > 0 && rsnum != ppr->pr_devfs_rsnum) {
1434				error = EPERM;
1435				goto done_deref_locked;
1436			} else
1437				rsnum = ppr->pr_devfs_rsnum;
1438		}
1439	}
1440#ifdef INET
1441	if (ip4s > 0) {
1442		if (ppr->pr_flags & PR_IP4) {
1443			/*
1444			 * Make sure the new set of IP addresses is a
1445			 * subset of the parent's list.  Don't worry
1446			 * about the parent being unlocked, as any
1447			 * setting is done with allprison_lock held.
1448			 */
1449			for (ij = 0; ij < ppr->pr_ip4s; ij++)
1450				if (ip4[0].s_addr == ppr->pr_ip4[ij].s_addr)
1451					break;
1452			if (ij == ppr->pr_ip4s) {
1453				error = EPERM;
1454				goto done_deref_locked;
1455			}
1456			if (ip4s > 1) {
1457				for (ii = ij = 1; ii < ip4s; ii++) {
1458					if (ip4[ii].s_addr ==
1459					    ppr->pr_ip4[0].s_addr)
1460						continue;
1461					for (; ij < ppr->pr_ip4s; ij++)
1462						if (ip4[ii].s_addr ==
1463						    ppr->pr_ip4[ij].s_addr)
1464							break;
1465					if (ij == ppr->pr_ip4s)
1466						break;
1467				}
1468				if (ij == ppr->pr_ip4s) {
1469					error = EPERM;
1470					goto done_deref_locked;
1471				}
1472			}
1473		}
1474		/*
1475		 * Check for conflicting IP addresses.  We permit them
1476		 * if there is no more than one IP on each jail.  If
1477		 * there is a duplicate on a jail with more than one
1478		 * IP stop checking and return error.
1479		 */
1480		tppr = ppr;
1481#ifdef VIMAGE
1482		for (; tppr != &prison0; tppr = tppr->pr_parent)
1483			if (tppr->pr_flags & PR_VNET)
1484				break;
1485#endif
1486		FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) {
1487			if (tpr == pr ||
1488#ifdef VIMAGE
1489			    (tpr != tppr && (tpr->pr_flags & PR_VNET)) ||
1490#endif
1491			    tpr->pr_uref == 0) {
1492				descend = 0;
1493				continue;
1494			}
1495			if (!(tpr->pr_flags & PR_IP4_USER))
1496				continue;
1497			descend = 0;
1498			if (tpr->pr_ip4 == NULL ||
1499			    (ip4s == 1 && tpr->pr_ip4s == 1))
1500				continue;
1501			for (ii = 0; ii < ip4s; ii++) {
1502				if (_prison_check_ip4(tpr, &ip4[ii]) == 0) {
1503					error = EADDRINUSE;
1504					vfs_opterror(opts,
1505					    "IPv4 addresses clash");
1506					goto done_deref_locked;
1507				}
1508			}
1509		}
1510	}
1511#endif
1512#ifdef INET6
1513	if (ip6s > 0) {
1514		if (ppr->pr_flags & PR_IP6) {
1515			/*
1516			 * Make sure the new set of IP addresses is a
1517			 * subset of the parent's list.
1518			 */
1519			for (ij = 0; ij < ppr->pr_ip6s; ij++)
1520				if (IN6_ARE_ADDR_EQUAL(&ip6[0],
1521				    &ppr->pr_ip6[ij]))
1522					break;
1523			if (ij == ppr->pr_ip6s) {
1524				error = EPERM;
1525				goto done_deref_locked;
1526			}
1527			if (ip6s > 1) {
1528				for (ii = ij = 1; ii < ip6s; ii++) {
1529					if (IN6_ARE_ADDR_EQUAL(&ip6[ii],
1530					     &ppr->pr_ip6[0]))
1531						continue;
1532					for (; ij < ppr->pr_ip6s; ij++)
1533						if (IN6_ARE_ADDR_EQUAL(
1534						    &ip6[ii], &ppr->pr_ip6[ij]))
1535							break;
1536					if (ij == ppr->pr_ip6s)
1537						break;
1538				}
1539				if (ij == ppr->pr_ip6s) {
1540					error = EPERM;
1541					goto done_deref_locked;
1542				}
1543			}
1544		}
1545		/* Check for conflicting IP addresses. */
1546		tppr = ppr;
1547#ifdef VIMAGE
1548		for (; tppr != &prison0; tppr = tppr->pr_parent)
1549			if (tppr->pr_flags & PR_VNET)
1550				break;
1551#endif
1552		FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) {
1553			if (tpr == pr ||
1554#ifdef VIMAGE
1555			    (tpr != tppr && (tpr->pr_flags & PR_VNET)) ||
1556#endif
1557			    tpr->pr_uref == 0) {
1558				descend = 0;
1559				continue;
1560			}
1561			if (!(tpr->pr_flags & PR_IP6_USER))
1562				continue;
1563			descend = 0;
1564			if (tpr->pr_ip6 == NULL ||
1565			    (ip6s == 1 && tpr->pr_ip6s == 1))
1566				continue;
1567			for (ii = 0; ii < ip6s; ii++) {
1568				if (_prison_check_ip6(tpr, &ip6[ii]) == 0) {
1569					error = EADDRINUSE;
1570					vfs_opterror(opts,
1571					    "IPv6 addresses clash");
1572					goto done_deref_locked;
1573				}
1574			}
1575		}
1576	}
1577#endif
1578	onamelen = namelen = 0;
1579	if (namelc != NULL) {
1580		/* Give a default name of the jid.  Also allow the name to be
1581		 * explicitly the jid - but not any other number, and only in
1582		 * normal form (no leading zero/etc).
1583		 */
1584		if (namelc[0] == '\0')
1585			snprintf(namelc = numbuf, sizeof(numbuf), "%d", jid);
1586		else if ((strtoul(namelc, &p, 10) != jid ||
1587			  namelc[0] < '1' || namelc[0] > '9') && *p == '\0') {
1588			error = EINVAL;
1589			vfs_opterror(opts,
1590			    "name cannot be numeric (unless it is the jid)");
1591			goto done_deref_locked;
1592		}
1593		/*
1594		 * Make sure the name isn't too long for the prison or its
1595		 * children.
1596		 */
1597		pnamelen = (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1;
1598		onamelen = strlen(pr->pr_name + pnamelen);
1599		namelen = strlen(namelc);
1600		if (pnamelen + namelen + 1 > sizeof(pr->pr_name)) {
1601			error = ENAMETOOLONG;
1602			goto done_deref_locked;
1603		}
1604		FOREACH_PRISON_DESCENDANT(pr, tpr, descend) {
1605			if (strlen(tpr->pr_name) + (namelen - onamelen) >=
1606			    sizeof(pr->pr_name)) {
1607				error = ENAMETOOLONG;
1608				goto done_deref_locked;
1609			}
1610		}
1611	}
1612	if (pr_allow & ~ppr->pr_allow) {
1613		error = EPERM;
1614		goto done_deref_locked;
1615	}
1616
1617	/*
1618	 * Let modules check their parameters.  This requires unlocking and
1619	 * then re-locking the prison, but this is still a valid state as long
1620	 * as allprison_lock remains xlocked.
1621	 */
1622	mtx_unlock(&pr->pr_mtx);
1623	error = osd_jail_call(pr, PR_METHOD_CHECK, opts);
1624	if (error != 0) {
1625		prison_deref(pr, created
1626		    ? PD_LIST_XLOCKED
1627		    : PD_DEREF | PD_LIST_XLOCKED);
1628		goto done_releroot;
1629	}
1630	mtx_lock(&pr->pr_mtx);
1631
1632	/* At this point, all valid parameters should have been noted. */
1633	TAILQ_FOREACH(opt, opts, link) {
1634		if (!opt->seen && strcmp(opt->name, "errmsg")) {
1635			error = EINVAL;
1636			vfs_opterror(opts, "unknown parameter: %s", opt->name);
1637			goto done_deref_locked;
1638		}
1639	}
1640
1641	/* Set the parameters of the prison. */
1642#ifdef INET
1643	redo_ip4 = 0;
1644	if (pr_flags & PR_IP4_USER) {
1645		pr->pr_flags |= PR_IP4;
1646		free(pr->pr_ip4, M_PRISON);
1647		pr->pr_ip4s = ip4s;
1648		pr->pr_ip4 = ip4;
1649		ip4 = NULL;
1650		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1651#ifdef VIMAGE
1652			if (tpr->pr_flags & PR_VNET) {
1653				descend = 0;
1654				continue;
1655			}
1656#endif
1657			if (prison_restrict_ip4(tpr, NULL)) {
1658				redo_ip4 = 1;
1659				descend = 0;
1660			}
1661		}
1662	}
1663#endif
1664#ifdef INET6
1665	redo_ip6 = 0;
1666	if (pr_flags & PR_IP6_USER) {
1667		pr->pr_flags |= PR_IP6;
1668		free(pr->pr_ip6, M_PRISON);
1669		pr->pr_ip6s = ip6s;
1670		pr->pr_ip6 = ip6;
1671		ip6 = NULL;
1672		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1673#ifdef VIMAGE
1674			if (tpr->pr_flags & PR_VNET) {
1675				descend = 0;
1676				continue;
1677			}
1678#endif
1679			if (prison_restrict_ip6(tpr, NULL)) {
1680				redo_ip6 = 1;
1681				descend = 0;
1682			}
1683		}
1684	}
1685#endif
1686	if (gotslevel) {
1687		pr->pr_securelevel = slevel;
1688		/* Set all child jails to be at least this level. */
1689		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
1690			if (tpr->pr_securelevel < slevel)
1691				tpr->pr_securelevel = slevel;
1692	}
1693	if (gotchildmax) {
1694		pr->pr_childmax = childmax;
1695		/* Set all child jails to under this limit. */
1696		FOREACH_PRISON_DESCENDANT_LOCKED_LEVEL(pr, tpr, descend, level)
1697			if (tpr->pr_childmax > childmax - level)
1698				tpr->pr_childmax = childmax > level
1699				    ? childmax - level : 0;
1700	}
1701	if (gotenforce) {
1702		pr->pr_enforce_statfs = enforce;
1703		/* Pass this restriction on to the children. */
1704		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
1705			if (tpr->pr_enforce_statfs < enforce)
1706				tpr->pr_enforce_statfs = enforce;
1707	}
1708	if (gotrsnum) {
1709		pr->pr_devfs_rsnum = rsnum;
1710		/* Pass this restriction on to the children. */
1711		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
1712			tpr->pr_devfs_rsnum = rsnum;
1713	}
1714	if (namelc != NULL) {
1715		if (ppr == &prison0)
1716			strlcpy(pr->pr_name, namelc, sizeof(pr->pr_name));
1717		else
1718			snprintf(pr->pr_name, sizeof(pr->pr_name), "%s.%s",
1719			    ppr->pr_name, namelc);
1720		/* Change this component of child names. */
1721		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1722			bcopy(tpr->pr_name + onamelen, tpr->pr_name + namelen,
1723			    strlen(tpr->pr_name + onamelen) + 1);
1724			bcopy(pr->pr_name, tpr->pr_name, namelen);
1725		}
1726	}
1727	if (path != NULL) {
1728		/* Try to keep a real-rooted full pathname. */
1729		if (fullpath_disabled && path[0] == '/' &&
1730		    strcmp(mypr->pr_path, "/"))
1731			snprintf(pr->pr_path, sizeof(pr->pr_path), "%s%s",
1732			    mypr->pr_path, path);
1733		else
1734			strlcpy(pr->pr_path, path, sizeof(pr->pr_path));
1735		pr->pr_root = root;
1736	}
1737	if (PR_HOST & ch_flags & ~pr_flags) {
1738		if (pr->pr_flags & PR_HOST) {
1739			/*
1740			 * Copy the parent's host info.  As with pr_ip4 above,
1741			 * the lack of a lock on the parent is not a problem;
1742			 * it is always set with allprison_lock at least
1743			 * shared, and is held exclusively here.
1744			 */
1745			strlcpy(pr->pr_hostname, pr->pr_parent->pr_hostname,
1746			    sizeof(pr->pr_hostname));
1747			strlcpy(pr->pr_domainname, pr->pr_parent->pr_domainname,
1748			    sizeof(pr->pr_domainname));
1749			strlcpy(pr->pr_hostuuid, pr->pr_parent->pr_hostuuid,
1750			    sizeof(pr->pr_hostuuid));
1751			pr->pr_hostid = pr->pr_parent->pr_hostid;
1752		}
1753	} else if (host != NULL || domain != NULL || uuid != NULL || gothid) {
1754		/* Set this prison, and any descendants without PR_HOST. */
1755		if (host != NULL)
1756			strlcpy(pr->pr_hostname, host, sizeof(pr->pr_hostname));
1757		if (domain != NULL)
1758			strlcpy(pr->pr_domainname, domain,
1759			    sizeof(pr->pr_domainname));
1760		if (uuid != NULL)
1761			strlcpy(pr->pr_hostuuid, uuid, sizeof(pr->pr_hostuuid));
1762		if (gothid)
1763			pr->pr_hostid = hid;
1764		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1765			if (tpr->pr_flags & PR_HOST)
1766				descend = 0;
1767			else {
1768				if (host != NULL)
1769					strlcpy(tpr->pr_hostname,
1770					    pr->pr_hostname,
1771					    sizeof(tpr->pr_hostname));
1772				if (domain != NULL)
1773					strlcpy(tpr->pr_domainname,
1774					    pr->pr_domainname,
1775					    sizeof(tpr->pr_domainname));
1776				if (uuid != NULL)
1777					strlcpy(tpr->pr_hostuuid,
1778					    pr->pr_hostuuid,
1779					    sizeof(tpr->pr_hostuuid));
1780				if (gothid)
1781					tpr->pr_hostid = hid;
1782			}
1783		}
1784	}
1785	if ((tallow = ch_allow & ~pr_allow)) {
1786		/* Clear allow bits in all children. */
1787		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
1788			tpr->pr_allow &= ~tallow;
1789	}
1790	pr->pr_allow = (pr->pr_allow & ~ch_allow) | pr_allow;
1791	/*
1792	 * Persistent prisons get an extra reference, and prisons losing their
1793	 * persist flag lose that reference.  Only do this for existing prisons
1794	 * for now, so new ones will remain unseen until after the module
1795	 * handlers have completed.
1796	 */
1797	born = pr->pr_uref == 0;
1798	if (!created && (ch_flags & PR_PERSIST & (pr_flags ^ pr->pr_flags))) {
1799		if (pr_flags & PR_PERSIST) {
1800			pr->pr_ref++;
1801			pr->pr_uref++;
1802		} else {
1803			pr->pr_ref--;
1804			pr->pr_uref--;
1805		}
1806	}
1807	pr->pr_flags = (pr->pr_flags & ~ch_flags) | pr_flags;
1808	mtx_unlock(&pr->pr_mtx);
1809
1810#ifdef RACCT
1811	if (racct_enable && created)
1812		prison_racct_attach(pr);
1813#endif
1814
1815	/* Locks may have prevented a complete restriction of child IP
1816	 * addresses.  If so, allocate some more memory and try again.
1817	 */
1818#ifdef INET
1819	while (redo_ip4) {
1820		ip4s = pr->pr_ip4s;
1821		ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK);
1822		mtx_lock(&pr->pr_mtx);
1823		redo_ip4 = 0;
1824		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1825#ifdef VIMAGE
1826			if (tpr->pr_flags & PR_VNET) {
1827				descend = 0;
1828				continue;
1829			}
1830#endif
1831			if (prison_restrict_ip4(tpr, ip4)) {
1832				if (ip4 != NULL)
1833					ip4 = NULL;
1834				else
1835					redo_ip4 = 1;
1836			}
1837		}
1838		mtx_unlock(&pr->pr_mtx);
1839	}
1840#endif
1841#ifdef INET6
1842	while (redo_ip6) {
1843		ip6s = pr->pr_ip6s;
1844		ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK);
1845		mtx_lock(&pr->pr_mtx);
1846		redo_ip6 = 0;
1847		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1848#ifdef VIMAGE
1849			if (tpr->pr_flags & PR_VNET) {
1850				descend = 0;
1851				continue;
1852			}
1853#endif
1854			if (prison_restrict_ip6(tpr, ip6)) {
1855				if (ip6 != NULL)
1856					ip6 = NULL;
1857				else
1858					redo_ip6 = 1;
1859			}
1860		}
1861		mtx_unlock(&pr->pr_mtx);
1862	}
1863#endif
1864
1865	/* Let the modules do their work. */
1866	sx_downgrade(&allprison_lock);
1867	if (born) {
1868		error = osd_jail_call(pr, PR_METHOD_CREATE, opts);
1869		if (error) {
1870			(void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL);
1871			prison_deref(pr, created
1872			    ? PD_LIST_SLOCKED
1873			    : PD_DEREF | PD_LIST_SLOCKED);
1874			goto done_errmsg;
1875		}
1876	}
1877	error = osd_jail_call(pr, PR_METHOD_SET, opts);
1878	if (error) {
1879		if (born)
1880			(void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL);
1881		prison_deref(pr, created
1882		    ? PD_LIST_SLOCKED
1883		    : PD_DEREF | PD_LIST_SLOCKED);
1884		goto done_errmsg;
1885	}
1886
1887	/* Attach this process to the prison if requested. */
1888	if (flags & JAIL_ATTACH) {
1889		mtx_lock(&pr->pr_mtx);
1890		error = do_jail_attach(td, pr);
1891		if (error) {
1892			vfs_opterror(opts, "attach failed");
1893			if (!created)
1894				prison_deref(pr, PD_DEREF);
1895			goto done_errmsg;
1896		}
1897	}
1898
1899#ifdef RACCT
1900	if (racct_enable && !created) {
1901		if (!(flags & JAIL_ATTACH))
1902			sx_sunlock(&allprison_lock);
1903		prison_racct_modify(pr);
1904		if (!(flags & JAIL_ATTACH))
1905			sx_slock(&allprison_lock);
1906	}
1907#endif
1908
1909	td->td_retval[0] = pr->pr_id;
1910
1911	/*
1912	 * Now that it is all there, drop the temporary reference from existing
1913	 * prisons.  Or add a reference to newly created persistent prisons
1914	 * (which was not done earlier so that the prison would not be publicly
1915	 * visible).
1916	 */
1917	if (!created) {
1918		prison_deref(pr, (flags & JAIL_ATTACH)
1919		    ? PD_DEREF
1920		    : PD_DEREF | PD_LIST_SLOCKED);
1921	} else {
1922		if (pr_flags & PR_PERSIST) {
1923			mtx_lock(&pr->pr_mtx);
1924			pr->pr_ref++;
1925			pr->pr_uref++;
1926			mtx_unlock(&pr->pr_mtx);
1927		}
1928		if (!(flags & JAIL_ATTACH))
1929			sx_sunlock(&allprison_lock);
1930	}
1931
1932	goto done_free;
1933
1934 done_deref_locked:
1935	prison_deref(pr, created
1936	    ? PD_LOCKED | PD_LIST_XLOCKED
1937	    : PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED);
1938	goto done_releroot;
1939 done_unlock_list:
1940	sx_xunlock(&allprison_lock);
1941 done_releroot:
1942	if (root != NULL)
1943		vrele(root);
1944 done_errmsg:
1945	if (error) {
1946		vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len);
1947		if (errmsg_len > 0) {
1948			errmsg_pos = 2 * vfs_getopt_pos(opts, "errmsg") + 1;
1949			if (errmsg_pos > 0) {
1950				if (optuio->uio_segflg == UIO_SYSSPACE)
1951					bcopy(errmsg,
1952					   optuio->uio_iov[errmsg_pos].iov_base,
1953					   errmsg_len);
1954				else
1955					copyout(errmsg,
1956					   optuio->uio_iov[errmsg_pos].iov_base,
1957					   errmsg_len);
1958			}
1959		}
1960	}
1961 done_free:
1962#ifdef INET
1963	free(ip4, M_PRISON);
1964#endif
1965#ifdef INET6
1966	free(ip6, M_PRISON);
1967#endif
1968	if (g_path != NULL)
1969		free(g_path, M_TEMP);
1970	vfs_freeopts(opts);
1971	return (error);
1972}
1973
1974
1975/*
1976 * struct jail_get_args {
1977 *	struct iovec *iovp;
1978 *	unsigned int iovcnt;
1979 *	int flags;
1980 * };
1981 */
1982int
1983sys_jail_get(struct thread *td, struct jail_get_args *uap)
1984{
1985	struct uio *auio;
1986	int error;
1987
1988	/* Check that we have an even number of iovecs. */
1989	if (uap->iovcnt & 1)
1990		return (EINVAL);
1991
1992	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
1993	if (error)
1994		return (error);
1995	error = kern_jail_get(td, auio, uap->flags);
1996	if (error == 0)
1997		error = copyout(auio->uio_iov, uap->iovp,
1998		    uap->iovcnt * sizeof (struct iovec));
1999	free(auio, M_IOV);
2000	return (error);
2001}
2002
2003int
2004kern_jail_get(struct thread *td, struct uio *optuio, int flags)
2005{
2006	struct prison *pr, *mypr;
2007	struct vfsopt *opt;
2008	struct vfsoptlist *opts;
2009	char *errmsg, *name;
2010	int error, errmsg_len, errmsg_pos, fi, i, jid, len, locked, pos;
2011
2012	if (flags & ~JAIL_GET_MASK)
2013		return (EINVAL);
2014
2015	/* Get the parameter list. */
2016	error = vfs_buildopts(optuio, &opts);
2017	if (error)
2018		return (error);
2019	errmsg_pos = vfs_getopt_pos(opts, "errmsg");
2020	mypr = td->td_ucred->cr_prison;
2021
2022	/*
2023	 * Find the prison specified by one of: lastjid, jid, name.
2024	 */
2025	sx_slock(&allprison_lock);
2026	error = vfs_copyopt(opts, "lastjid", &jid, sizeof(jid));
2027	if (error == 0) {
2028		TAILQ_FOREACH(pr, &allprison, pr_list) {
2029			if (pr->pr_id > jid && prison_ischild(mypr, pr)) {
2030				mtx_lock(&pr->pr_mtx);
2031				if (pr->pr_ref > 0 &&
2032				    (pr->pr_uref > 0 || (flags & JAIL_DYING)))
2033					break;
2034				mtx_unlock(&pr->pr_mtx);
2035			}
2036		}
2037		if (pr != NULL)
2038			goto found_prison;
2039		error = ENOENT;
2040		vfs_opterror(opts, "no jail after %d", jid);
2041		goto done_unlock_list;
2042	} else if (error != ENOENT)
2043		goto done_unlock_list;
2044
2045	error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
2046	if (error == 0) {
2047		if (jid != 0) {
2048			pr = prison_find_child(mypr, jid);
2049			if (pr != NULL) {
2050				if (pr->pr_uref == 0 && !(flags & JAIL_DYING)) {
2051					mtx_unlock(&pr->pr_mtx);
2052					error = ENOENT;
2053					vfs_opterror(opts, "jail %d is dying",
2054					    jid);
2055					goto done_unlock_list;
2056				}
2057				goto found_prison;
2058			}
2059			error = ENOENT;
2060			vfs_opterror(opts, "jail %d not found", jid);
2061			goto done_unlock_list;
2062		}
2063	} else if (error != ENOENT)
2064		goto done_unlock_list;
2065
2066	error = vfs_getopt(opts, "name", (void **)&name, &len);
2067	if (error == 0) {
2068		if (len == 0 || name[len - 1] != '\0') {
2069			error = EINVAL;
2070			goto done_unlock_list;
2071		}
2072		pr = prison_find_name(mypr, name);
2073		if (pr != NULL) {
2074			if (pr->pr_uref == 0 && !(flags & JAIL_DYING)) {
2075				mtx_unlock(&pr->pr_mtx);
2076				error = ENOENT;
2077				vfs_opterror(opts, "jail \"%s\" is dying",
2078				    name);
2079				goto done_unlock_list;
2080			}
2081			goto found_prison;
2082		}
2083		error = ENOENT;
2084		vfs_opterror(opts, "jail \"%s\" not found", name);
2085		goto done_unlock_list;
2086	} else if (error != ENOENT)
2087		goto done_unlock_list;
2088
2089	vfs_opterror(opts, "no jail specified");
2090	error = ENOENT;
2091	goto done_unlock_list;
2092
2093 found_prison:
2094	/* Get the parameters of the prison. */
2095	pr->pr_ref++;
2096	locked = PD_LOCKED;
2097	td->td_retval[0] = pr->pr_id;
2098	error = vfs_setopt(opts, "jid", &pr->pr_id, sizeof(pr->pr_id));
2099	if (error != 0 && error != ENOENT)
2100		goto done_deref;
2101	i = (pr->pr_parent == mypr) ? 0 : pr->pr_parent->pr_id;
2102	error = vfs_setopt(opts, "parent", &i, sizeof(i));
2103	if (error != 0 && error != ENOENT)
2104		goto done_deref;
2105	error = vfs_setopts(opts, "name", prison_name(mypr, pr));
2106	if (error != 0 && error != ENOENT)
2107		goto done_deref;
2108	error = vfs_setopt(opts, "cpuset.id", &pr->pr_cpuset->cs_id,
2109	    sizeof(pr->pr_cpuset->cs_id));
2110	if (error != 0 && error != ENOENT)
2111		goto done_deref;
2112	error = vfs_setopts(opts, "path", prison_path(mypr, pr));
2113	if (error != 0 && error != ENOENT)
2114		goto done_deref;
2115#ifdef INET
2116	error = vfs_setopt_part(opts, "ip4.addr", pr->pr_ip4,
2117	    pr->pr_ip4s * sizeof(*pr->pr_ip4));
2118	if (error != 0 && error != ENOENT)
2119		goto done_deref;
2120#endif
2121#ifdef INET6
2122	error = vfs_setopt_part(opts, "ip6.addr", pr->pr_ip6,
2123	    pr->pr_ip6s * sizeof(*pr->pr_ip6));
2124	if (error != 0 && error != ENOENT)
2125		goto done_deref;
2126#endif
2127	error = vfs_setopt(opts, "securelevel", &pr->pr_securelevel,
2128	    sizeof(pr->pr_securelevel));
2129	if (error != 0 && error != ENOENT)
2130		goto done_deref;
2131	error = vfs_setopt(opts, "children.cur", &pr->pr_childcount,
2132	    sizeof(pr->pr_childcount));
2133	if (error != 0 && error != ENOENT)
2134		goto done_deref;
2135	error = vfs_setopt(opts, "children.max", &pr->pr_childmax,
2136	    sizeof(pr->pr_childmax));
2137	if (error != 0 && error != ENOENT)
2138		goto done_deref;
2139	error = vfs_setopts(opts, "host.hostname", pr->pr_hostname);
2140	if (error != 0 && error != ENOENT)
2141		goto done_deref;
2142	error = vfs_setopts(opts, "host.domainname", pr->pr_domainname);
2143	if (error != 0 && error != ENOENT)
2144		goto done_deref;
2145	error = vfs_setopts(opts, "host.hostuuid", pr->pr_hostuuid);
2146	if (error != 0 && error != ENOENT)
2147		goto done_deref;
2148#ifdef COMPAT_FREEBSD32
2149	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
2150		uint32_t hid32 = pr->pr_hostid;
2151
2152		error = vfs_setopt(opts, "host.hostid", &hid32, sizeof(hid32));
2153	} else
2154#endif
2155	error = vfs_setopt(opts, "host.hostid", &pr->pr_hostid,
2156	    sizeof(pr->pr_hostid));
2157	if (error != 0 && error != ENOENT)
2158		goto done_deref;
2159	error = vfs_setopt(opts, "enforce_statfs", &pr->pr_enforce_statfs,
2160	    sizeof(pr->pr_enforce_statfs));
2161	if (error != 0 && error != ENOENT)
2162		goto done_deref;
2163	error = vfs_setopt(opts, "devfs_ruleset", &pr->pr_devfs_rsnum,
2164	    sizeof(pr->pr_devfs_rsnum));
2165	if (error != 0 && error != ENOENT)
2166		goto done_deref;
2167	for (fi = 0; fi < sizeof(pr_flag_names) / sizeof(pr_flag_names[0]);
2168	    fi++) {
2169		if (pr_flag_names[fi] == NULL)
2170			continue;
2171		i = (pr->pr_flags & (1 << fi)) ? 1 : 0;
2172		error = vfs_setopt(opts, pr_flag_names[fi], &i, sizeof(i));
2173		if (error != 0 && error != ENOENT)
2174			goto done_deref;
2175		i = !i;
2176		error = vfs_setopt(opts, pr_flag_nonames[fi], &i, sizeof(i));
2177		if (error != 0 && error != ENOENT)
2178			goto done_deref;
2179	}
2180	for (fi = 0; fi < sizeof(pr_flag_jailsys) / sizeof(pr_flag_jailsys[0]);
2181	    fi++) {
2182		i = pr->pr_flags &
2183		    (pr_flag_jailsys[fi].disable | pr_flag_jailsys[fi].new);
2184		i = pr_flag_jailsys[fi].disable &&
2185		      (i == pr_flag_jailsys[fi].disable) ? JAIL_SYS_DISABLE
2186		    : (i == pr_flag_jailsys[fi].new) ? JAIL_SYS_NEW
2187		    : JAIL_SYS_INHERIT;
2188		error =
2189		    vfs_setopt(opts, pr_flag_jailsys[fi].name, &i, sizeof(i));
2190		if (error != 0 && error != ENOENT)
2191			goto done_deref;
2192	}
2193	for (fi = 0; fi < sizeof(pr_allow_names) / sizeof(pr_allow_names[0]);
2194	    fi++) {
2195		if (pr_allow_names[fi] == NULL)
2196			continue;
2197		i = (pr->pr_allow & (1 << fi)) ? 1 : 0;
2198		error = vfs_setopt(opts, pr_allow_names[fi], &i, sizeof(i));
2199		if (error != 0 && error != ENOENT)
2200			goto done_deref;
2201		i = !i;
2202		error = vfs_setopt(opts, pr_allow_nonames[fi], &i, sizeof(i));
2203		if (error != 0 && error != ENOENT)
2204			goto done_deref;
2205	}
2206	i = (pr->pr_uref == 0);
2207	error = vfs_setopt(opts, "dying", &i, sizeof(i));
2208	if (error != 0 && error != ENOENT)
2209		goto done_deref;
2210	i = !i;
2211	error = vfs_setopt(opts, "nodying", &i, sizeof(i));
2212	if (error != 0 && error != ENOENT)
2213		goto done_deref;
2214	error = vfs_setopt(opts, "osreldate", &pr->pr_osreldate,
2215	    sizeof(pr->pr_osreldate));
2216	if (error != 0 && error != ENOENT)
2217		goto done_deref;
2218	error = vfs_setopts(opts, "osrelease", pr->pr_osrelease);
2219	if (error != 0 && error != ENOENT)
2220		goto done_deref;
2221
2222	/* Get the module parameters. */
2223	mtx_unlock(&pr->pr_mtx);
2224	locked = 0;
2225	error = osd_jail_call(pr, PR_METHOD_GET, opts);
2226	if (error)
2227		goto done_deref;
2228	prison_deref(pr, PD_DEREF | PD_LIST_SLOCKED);
2229
2230	/* By now, all parameters should have been noted. */
2231	TAILQ_FOREACH(opt, opts, link) {
2232		if (!opt->seen && strcmp(opt->name, "errmsg")) {
2233			error = EINVAL;
2234			vfs_opterror(opts, "unknown parameter: %s", opt->name);
2235			goto done_errmsg;
2236		}
2237	}
2238
2239	/* Write the fetched parameters back to userspace. */
2240	error = 0;
2241	TAILQ_FOREACH(opt, opts, link) {
2242		if (opt->pos >= 0 && opt->pos != errmsg_pos) {
2243			pos = 2 * opt->pos + 1;
2244			optuio->uio_iov[pos].iov_len = opt->len;
2245			if (opt->value != NULL) {
2246				if (optuio->uio_segflg == UIO_SYSSPACE) {
2247					bcopy(opt->value,
2248					    optuio->uio_iov[pos].iov_base,
2249					    opt->len);
2250				} else {
2251					error = copyout(opt->value,
2252					    optuio->uio_iov[pos].iov_base,
2253					    opt->len);
2254					if (error)
2255						break;
2256				}
2257			}
2258		}
2259	}
2260	goto done_errmsg;
2261
2262 done_deref:
2263	prison_deref(pr, locked | PD_DEREF | PD_LIST_SLOCKED);
2264	goto done_errmsg;
2265
2266 done_unlock_list:
2267	sx_sunlock(&allprison_lock);
2268 done_errmsg:
2269	if (error && errmsg_pos >= 0) {
2270		vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len);
2271		errmsg_pos = 2 * errmsg_pos + 1;
2272		if (errmsg_len > 0) {
2273			if (optuio->uio_segflg == UIO_SYSSPACE)
2274				bcopy(errmsg,
2275				    optuio->uio_iov[errmsg_pos].iov_base,
2276				    errmsg_len);
2277			else
2278				copyout(errmsg,
2279				    optuio->uio_iov[errmsg_pos].iov_base,
2280				    errmsg_len);
2281		}
2282	}
2283	vfs_freeopts(opts);
2284	return (error);
2285}
2286
2287
2288/*
2289 * struct jail_remove_args {
2290 *	int jid;
2291 * };
2292 */
2293int
2294sys_jail_remove(struct thread *td, struct jail_remove_args *uap)
2295{
2296	struct prison *pr, *cpr, *lpr, *tpr;
2297	int descend, error;
2298
2299	error = priv_check(td, PRIV_JAIL_REMOVE);
2300	if (error)
2301		return (error);
2302
2303	sx_xlock(&allprison_lock);
2304	pr = prison_find_child(td->td_ucred->cr_prison, uap->jid);
2305	if (pr == NULL) {
2306		sx_xunlock(&allprison_lock);
2307		return (EINVAL);
2308	}
2309
2310	/* Remove all descendants of this prison, then remove this prison. */
2311	pr->pr_ref++;
2312	if (!LIST_EMPTY(&pr->pr_children)) {
2313		mtx_unlock(&pr->pr_mtx);
2314		lpr = NULL;
2315		FOREACH_PRISON_DESCENDANT(pr, cpr, descend) {
2316			mtx_lock(&cpr->pr_mtx);
2317			if (cpr->pr_ref > 0) {
2318				tpr = cpr;
2319				cpr->pr_ref++;
2320			} else {
2321				/* Already removed - do not do it again. */
2322				tpr = NULL;
2323			}
2324			mtx_unlock(&cpr->pr_mtx);
2325			if (lpr != NULL) {
2326				mtx_lock(&lpr->pr_mtx);
2327				prison_remove_one(lpr);
2328				sx_xlock(&allprison_lock);
2329			}
2330			lpr = tpr;
2331		}
2332		if (lpr != NULL) {
2333			mtx_lock(&lpr->pr_mtx);
2334			prison_remove_one(lpr);
2335			sx_xlock(&allprison_lock);
2336		}
2337		mtx_lock(&pr->pr_mtx);
2338	}
2339	prison_remove_one(pr);
2340	return (0);
2341}
2342
2343static void
2344prison_remove_one(struct prison *pr)
2345{
2346	struct proc *p;
2347	int deuref;
2348
2349	/* If the prison was persistent, it is not anymore. */
2350	deuref = 0;
2351	if (pr->pr_flags & PR_PERSIST) {
2352		pr->pr_ref--;
2353		deuref = PD_DEUREF;
2354		pr->pr_flags &= ~PR_PERSIST;
2355	}
2356
2357	/*
2358	 * jail_remove added a reference.  If that's the only one, remove
2359	 * the prison now.
2360	 */
2361	KASSERT(pr->pr_ref > 0,
2362	    ("prison_remove_one removing a dead prison (jid=%d)", pr->pr_id));
2363	if (pr->pr_ref == 1) {
2364		prison_deref(pr,
2365		    deuref | PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED);
2366		return;
2367	}
2368
2369	mtx_unlock(&pr->pr_mtx);
2370	sx_xunlock(&allprison_lock);
2371	/*
2372	 * Kill all processes unfortunate enough to be attached to this prison.
2373	 */
2374	sx_slock(&allproc_lock);
2375	LIST_FOREACH(p, &allproc, p_list) {
2376		PROC_LOCK(p);
2377		if (p->p_state != PRS_NEW && p->p_ucred &&
2378		    p->p_ucred->cr_prison == pr)
2379			kern_psignal(p, SIGKILL);
2380		PROC_UNLOCK(p);
2381	}
2382	sx_sunlock(&allproc_lock);
2383	/* Remove the temporary reference added by jail_remove. */
2384	prison_deref(pr, deuref | PD_DEREF);
2385}
2386
2387
2388/*
2389 * struct jail_attach_args {
2390 *	int jid;
2391 * };
2392 */
2393int
2394sys_jail_attach(struct thread *td, struct jail_attach_args *uap)
2395{
2396	struct prison *pr;
2397	int error;
2398
2399	error = priv_check(td, PRIV_JAIL_ATTACH);
2400	if (error)
2401		return (error);
2402
2403	/*
2404	 * Start with exclusive hold on allprison_lock to ensure that a possible
2405	 * PR_METHOD_REMOVE call isn't concurrent with jail_set or jail_remove.
2406	 * But then immediately downgrade it since we don't need to stop
2407	 * readers.
2408	 */
2409	sx_xlock(&allprison_lock);
2410	sx_downgrade(&allprison_lock);
2411	pr = prison_find_child(td->td_ucred->cr_prison, uap->jid);
2412	if (pr == NULL) {
2413		sx_sunlock(&allprison_lock);
2414		return (EINVAL);
2415	}
2416
2417	/*
2418	 * Do not allow a process to attach to a prison that is not
2419	 * considered to be "alive".
2420	 */
2421	if (pr->pr_uref == 0) {
2422		mtx_unlock(&pr->pr_mtx);
2423		sx_sunlock(&allprison_lock);
2424		return (EINVAL);
2425	}
2426
2427	return (do_jail_attach(td, pr));
2428}
2429
2430static int
2431do_jail_attach(struct thread *td, struct prison *pr)
2432{
2433	struct proc *p;
2434	struct ucred *newcred, *oldcred;
2435	int error;
2436
2437	/*
2438	 * XXX: Note that there is a slight race here if two threads
2439	 * in the same privileged process attempt to attach to two
2440	 * different jails at the same time.  It is important for
2441	 * user processes not to do this, or they might end up with
2442	 * a process root from one prison, but attached to the jail
2443	 * of another.
2444	 */
2445	pr->pr_ref++;
2446	pr->pr_uref++;
2447	mtx_unlock(&pr->pr_mtx);
2448
2449	/* Let modules do whatever they need to prepare for attaching. */
2450	error = osd_jail_call(pr, PR_METHOD_ATTACH, td);
2451	if (error) {
2452		prison_deref(pr, PD_DEREF | PD_DEUREF | PD_LIST_SLOCKED);
2453		return (error);
2454	}
2455	sx_sunlock(&allprison_lock);
2456
2457	/*
2458	 * Reparent the newly attached process to this jail.
2459	 */
2460	p = td->td_proc;
2461	error = cpuset_setproc_update_set(p, pr->pr_cpuset);
2462	if (error)
2463		goto e_revert_osd;
2464
2465	vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY);
2466	if ((error = change_dir(pr->pr_root, td)) != 0)
2467		goto e_unlock;
2468#ifdef MAC
2469	if ((error = mac_vnode_check_chroot(td->td_ucred, pr->pr_root)))
2470		goto e_unlock;
2471#endif
2472	VOP_UNLOCK(pr->pr_root, 0);
2473	if ((error = change_root(pr->pr_root, td)))
2474		goto e_revert_osd;
2475
2476	newcred = crget();
2477	PROC_LOCK(p);
2478	oldcred = crcopysafe(p, newcred);
2479	newcred->cr_prison = pr;
2480	p->p_ucred = newcred;
2481	setsugid(p);
2482	PROC_UNLOCK(p);
2483#ifdef RACCT
2484	racct_proc_ucred_changed(p, oldcred, newcred);
2485#endif
2486	prison_deref(oldcred->cr_prison, PD_DEREF | PD_DEUREF);
2487	crfree(oldcred);
2488	return (0);
2489
2490 e_unlock:
2491	VOP_UNLOCK(pr->pr_root, 0);
2492 e_revert_osd:
2493	/* Tell modules this thread is still in its old jail after all. */
2494	(void)osd_jail_call(td->td_ucred->cr_prison, PR_METHOD_ATTACH, td);
2495	prison_deref(pr, PD_DEREF | PD_DEUREF);
2496	return (error);
2497}
2498
2499
2500/*
2501 * Returns a locked prison instance, or NULL on failure.
2502 */
2503struct prison *
2504prison_find(int prid)
2505{
2506	struct prison *pr;
2507
2508	sx_assert(&allprison_lock, SX_LOCKED);
2509	TAILQ_FOREACH(pr, &allprison, pr_list) {
2510		if (pr->pr_id == prid) {
2511			mtx_lock(&pr->pr_mtx);
2512			if (pr->pr_ref > 0)
2513				return (pr);
2514			mtx_unlock(&pr->pr_mtx);
2515		}
2516	}
2517	return (NULL);
2518}
2519
2520/*
2521 * Find a prison that is a descendant of mypr.  Returns a locked prison or NULL.
2522 */
2523struct prison *
2524prison_find_child(struct prison *mypr, int prid)
2525{
2526	struct prison *pr;
2527	int descend;
2528
2529	sx_assert(&allprison_lock, SX_LOCKED);
2530	FOREACH_PRISON_DESCENDANT(mypr, pr, descend) {
2531		if (pr->pr_id == prid) {
2532			mtx_lock(&pr->pr_mtx);
2533			if (pr->pr_ref > 0)
2534				return (pr);
2535			mtx_unlock(&pr->pr_mtx);
2536		}
2537	}
2538	return (NULL);
2539}
2540
2541/*
2542 * Look for the name relative to mypr.  Returns a locked prison or NULL.
2543 */
2544struct prison *
2545prison_find_name(struct prison *mypr, const char *name)
2546{
2547	struct prison *pr, *deadpr;
2548	size_t mylen;
2549	int descend;
2550
2551	sx_assert(&allprison_lock, SX_LOCKED);
2552	mylen = (mypr == &prison0) ? 0 : strlen(mypr->pr_name) + 1;
2553 again:
2554	deadpr = NULL;
2555	FOREACH_PRISON_DESCENDANT(mypr, pr, descend) {
2556		if (!strcmp(pr->pr_name + mylen, name)) {
2557			mtx_lock(&pr->pr_mtx);
2558			if (pr->pr_ref > 0) {
2559				if (pr->pr_uref > 0)
2560					return (pr);
2561				deadpr = pr;
2562			}
2563			mtx_unlock(&pr->pr_mtx);
2564		}
2565	}
2566	/* There was no valid prison - perhaps there was a dying one. */
2567	if (deadpr != NULL) {
2568		mtx_lock(&deadpr->pr_mtx);
2569		if (deadpr->pr_ref == 0) {
2570			mtx_unlock(&deadpr->pr_mtx);
2571			goto again;
2572		}
2573	}
2574	return (deadpr);
2575}
2576
2577/*
2578 * See if a prison has the specific flag set.
2579 */
2580int
2581prison_flag(struct ucred *cred, unsigned flag)
2582{
2583
2584	/* This is an atomic read, so no locking is necessary. */
2585	return (cred->cr_prison->pr_flags & flag);
2586}
2587
2588int
2589prison_allow(struct ucred *cred, unsigned flag)
2590{
2591
2592	/* This is an atomic read, so no locking is necessary. */
2593	return (cred->cr_prison->pr_allow & flag);
2594}
2595
2596/*
2597 * Remove a prison reference.  If that was the last reference, remove the
2598 * prison itself - but not in this context in case there are locks held.
2599 */
2600void
2601prison_free_locked(struct prison *pr)
2602{
2603	int ref;
2604
2605	mtx_assert(&pr->pr_mtx, MA_OWNED);
2606	ref = --pr->pr_ref;
2607	mtx_unlock(&pr->pr_mtx);
2608	if (ref == 0)
2609		taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
2610}
2611
2612void
2613prison_free(struct prison *pr)
2614{
2615
2616	mtx_lock(&pr->pr_mtx);
2617	prison_free_locked(pr);
2618}
2619
2620/*
2621 * Complete a call to either prison_free or prison_proc_free.
2622 */
2623static void
2624prison_complete(void *context, int pending)
2625{
2626	struct prison *pr = context;
2627
2628	sx_xlock(&allprison_lock);
2629	mtx_lock(&pr->pr_mtx);
2630	prison_deref(pr, pr->pr_uref
2631	    ? PD_DEREF | PD_DEUREF | PD_LOCKED | PD_LIST_XLOCKED
2632	    : PD_LOCKED | PD_LIST_XLOCKED);
2633}
2634
2635/*
2636 * Remove a prison reference (usually).  This internal version assumes no
2637 * mutexes are held, except perhaps the prison itself.  If there are no more
2638 * references, release and delist the prison.  On completion, the prison lock
2639 * and the allprison lock are both unlocked.
2640 */
2641static void
2642prison_deref(struct prison *pr, int flags)
2643{
2644	struct prison *ppr, *tpr;
2645	int ref, lasturef;
2646
2647	if (!(flags & PD_LOCKED))
2648		mtx_lock(&pr->pr_mtx);
2649	for (;;) {
2650		if (flags & PD_DEUREF) {
2651			KASSERT(pr->pr_uref > 0,
2652			    ("prison_deref PD_DEUREF on a dead prison (jid=%d)",
2653			     pr->pr_id));
2654			pr->pr_uref--;
2655			lasturef = pr->pr_uref == 0;
2656			if (lasturef)
2657				pr->pr_ref++;
2658			KASSERT(prison0.pr_uref != 0, ("prison0 pr_uref=0"));
2659		} else
2660			lasturef = 0;
2661		if (flags & PD_DEREF) {
2662			KASSERT(pr->pr_ref > 0,
2663			    ("prison_deref PD_DEREF on a dead prison (jid=%d)",
2664			     pr->pr_id));
2665			pr->pr_ref--;
2666		}
2667		ref = pr->pr_ref;
2668		mtx_unlock(&pr->pr_mtx);
2669
2670		/*
2671		 * Tell the modules if the last user reference was removed
2672		 * (even it sticks around in dying state).
2673		 */
2674		if (lasturef) {
2675			if (!(flags & (PD_LIST_SLOCKED | PD_LIST_XLOCKED))) {
2676				sx_xlock(&allprison_lock);
2677				flags |= PD_LIST_XLOCKED;
2678			}
2679			(void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL);
2680			mtx_lock(&pr->pr_mtx);
2681			ref = --pr->pr_ref;
2682			mtx_unlock(&pr->pr_mtx);
2683		}
2684
2685		/* If the prison still has references, nothing else to do. */
2686		if (ref > 0) {
2687			if (flags & PD_LIST_SLOCKED)
2688				sx_sunlock(&allprison_lock);
2689			else if (flags & PD_LIST_XLOCKED)
2690				sx_xunlock(&allprison_lock);
2691			return;
2692		}
2693
2694		if (flags & PD_LIST_SLOCKED) {
2695			if (!sx_try_upgrade(&allprison_lock)) {
2696				sx_sunlock(&allprison_lock);
2697				sx_xlock(&allprison_lock);
2698			}
2699		} else if (!(flags & PD_LIST_XLOCKED))
2700			sx_xlock(&allprison_lock);
2701
2702		TAILQ_REMOVE(&allprison, pr, pr_list);
2703		LIST_REMOVE(pr, pr_sibling);
2704		ppr = pr->pr_parent;
2705		for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent)
2706			tpr->pr_childcount--;
2707		sx_xunlock(&allprison_lock);
2708
2709#ifdef VIMAGE
2710		if (pr->pr_vnet != ppr->pr_vnet)
2711			vnet_destroy(pr->pr_vnet);
2712#endif
2713		if (pr->pr_root != NULL)
2714			vrele(pr->pr_root);
2715		mtx_destroy(&pr->pr_mtx);
2716#ifdef INET
2717		free(pr->pr_ip4, M_PRISON);
2718#endif
2719#ifdef INET6
2720		free(pr->pr_ip6, M_PRISON);
2721#endif
2722		if (pr->pr_cpuset != NULL)
2723			cpuset_rel(pr->pr_cpuset);
2724		osd_jail_exit(pr);
2725#ifdef RACCT
2726		if (racct_enable)
2727			prison_racct_detach(pr);
2728#endif
2729		free(pr, M_PRISON);
2730
2731		/* Removing a prison frees a reference on its parent. */
2732		pr = ppr;
2733		mtx_lock(&pr->pr_mtx);
2734		flags = PD_DEREF | PD_DEUREF;
2735	}
2736}
2737
2738void
2739prison_hold_locked(struct prison *pr)
2740{
2741
2742	mtx_assert(&pr->pr_mtx, MA_OWNED);
2743	KASSERT(pr->pr_ref > 0,
2744	    ("Trying to hold dead prison (jid=%d).", pr->pr_id));
2745	pr->pr_ref++;
2746}
2747
2748void
2749prison_hold(struct prison *pr)
2750{
2751
2752	mtx_lock(&pr->pr_mtx);
2753	prison_hold_locked(pr);
2754	mtx_unlock(&pr->pr_mtx);
2755}
2756
2757void
2758prison_proc_hold(struct prison *pr)
2759{
2760
2761	mtx_lock(&pr->pr_mtx);
2762	KASSERT(pr->pr_uref > 0,
2763	    ("Cannot add a process to a non-alive prison (jid=%d)", pr->pr_id));
2764	pr->pr_uref++;
2765	mtx_unlock(&pr->pr_mtx);
2766}
2767
2768void
2769prison_proc_free(struct prison *pr)
2770{
2771
2772	mtx_lock(&pr->pr_mtx);
2773	KASSERT(pr->pr_uref > 0,
2774	    ("Trying to kill a process in a dead prison (jid=%d)", pr->pr_id));
2775	if (pr->pr_uref > 1)
2776		pr->pr_uref--;
2777	else {
2778		/*
2779		 * Don't remove the last user reference in this context, which
2780		 * is expected to be a process that is not only locked, but
2781		 * also half dead.
2782		 */
2783		pr->pr_ref++;
2784		mtx_unlock(&pr->pr_mtx);
2785		taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
2786		return;
2787	}
2788	mtx_unlock(&pr->pr_mtx);
2789}
2790
2791
2792#ifdef INET
2793/*
2794 * Restrict a prison's IP address list with its parent's, possibly replacing
2795 * it.  Return true if the replacement buffer was used (or would have been).
2796 */
2797static int
2798prison_restrict_ip4(struct prison *pr, struct in_addr *newip4)
2799{
2800	int ii, ij, used;
2801	struct prison *ppr;
2802
2803	ppr = pr->pr_parent;
2804	if (!(pr->pr_flags & PR_IP4_USER)) {
2805		/* This has no user settings, so just copy the parent's list. */
2806		if (pr->pr_ip4s < ppr->pr_ip4s) {
2807			/*
2808			 * There's no room for the parent's list.  Use the
2809			 * new list buffer, which is assumed to be big enough
2810			 * (if it was passed).  If there's no buffer, try to
2811			 * allocate one.
2812			 */
2813			used = 1;
2814			if (newip4 == NULL) {
2815				newip4 = malloc(ppr->pr_ip4s * sizeof(*newip4),
2816				    M_PRISON, M_NOWAIT);
2817				if (newip4 != NULL)
2818					used = 0;
2819			}
2820			if (newip4 != NULL) {
2821				bcopy(ppr->pr_ip4, newip4,
2822				    ppr->pr_ip4s * sizeof(*newip4));
2823				free(pr->pr_ip4, M_PRISON);
2824				pr->pr_ip4 = newip4;
2825				pr->pr_ip4s = ppr->pr_ip4s;
2826			}
2827			return (used);
2828		}
2829		pr->pr_ip4s = ppr->pr_ip4s;
2830		if (pr->pr_ip4s > 0)
2831			bcopy(ppr->pr_ip4, pr->pr_ip4,
2832			    pr->pr_ip4s * sizeof(*newip4));
2833		else if (pr->pr_ip4 != NULL) {
2834			free(pr->pr_ip4, M_PRISON);
2835			pr->pr_ip4 = NULL;
2836		}
2837	} else if (pr->pr_ip4s > 0) {
2838		/* Remove addresses that aren't in the parent. */
2839		for (ij = 0; ij < ppr->pr_ip4s; ij++)
2840			if (pr->pr_ip4[0].s_addr == ppr->pr_ip4[ij].s_addr)
2841				break;
2842		if (ij < ppr->pr_ip4s)
2843			ii = 1;
2844		else {
2845			bcopy(pr->pr_ip4 + 1, pr->pr_ip4,
2846			    --pr->pr_ip4s * sizeof(*pr->pr_ip4));
2847			ii = 0;
2848		}
2849		for (ij = 1; ii < pr->pr_ip4s; ) {
2850			if (pr->pr_ip4[ii].s_addr == ppr->pr_ip4[0].s_addr) {
2851				ii++;
2852				continue;
2853			}
2854			switch (ij >= ppr->pr_ip4s ? -1 :
2855				qcmp_v4(&pr->pr_ip4[ii], &ppr->pr_ip4[ij])) {
2856			case -1:
2857				bcopy(pr->pr_ip4 + ii + 1, pr->pr_ip4 + ii,
2858				    (--pr->pr_ip4s - ii) * sizeof(*pr->pr_ip4));
2859				break;
2860			case 0:
2861				ii++;
2862				ij++;
2863				break;
2864			case 1:
2865				ij++;
2866				break;
2867			}
2868		}
2869		if (pr->pr_ip4s == 0) {
2870			pr->pr_flags |= PR_IP4_DISABLE;
2871			free(pr->pr_ip4, M_PRISON);
2872			pr->pr_ip4 = NULL;
2873		}
2874	}
2875	return (0);
2876}
2877
2878/*
2879 * Pass back primary IPv4 address of this jail.
2880 *
2881 * If not restricted return success but do not alter the address.  Caller has
2882 * to make sure to initialize it correctly (e.g. INADDR_ANY).
2883 *
2884 * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv4.
2885 * Address returned in NBO.
2886 */
2887int
2888prison_get_ip4(struct ucred *cred, struct in_addr *ia)
2889{
2890	struct prison *pr;
2891
2892	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
2893	KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
2894
2895	pr = cred->cr_prison;
2896	if (!(pr->pr_flags & PR_IP4))
2897		return (0);
2898	mtx_lock(&pr->pr_mtx);
2899	if (!(pr->pr_flags & PR_IP4)) {
2900		mtx_unlock(&pr->pr_mtx);
2901		return (0);
2902	}
2903	if (pr->pr_ip4 == NULL) {
2904		mtx_unlock(&pr->pr_mtx);
2905		return (EAFNOSUPPORT);
2906	}
2907
2908	ia->s_addr = pr->pr_ip4[0].s_addr;
2909	mtx_unlock(&pr->pr_mtx);
2910	return (0);
2911}
2912
2913/*
2914 * Return 1 if we should do proper source address selection or are not jailed.
2915 * We will return 0 if we should bypass source address selection in favour
2916 * of the primary jail IPv4 address. Only in this case *ia will be updated and
2917 * returned in NBO.
2918 * Return EAFNOSUPPORT, in case this jail does not allow IPv4.
2919 */
2920int
2921prison_saddrsel_ip4(struct ucred *cred, struct in_addr *ia)
2922{
2923	struct prison *pr;
2924	struct in_addr lia;
2925	int error;
2926
2927	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
2928	KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
2929
2930	if (!jailed(cred))
2931		return (1);
2932
2933	pr = cred->cr_prison;
2934	if (pr->pr_flags & PR_IP4_SADDRSEL)
2935		return (1);
2936
2937	lia.s_addr = INADDR_ANY;
2938	error = prison_get_ip4(cred, &lia);
2939	if (error)
2940		return (error);
2941	if (lia.s_addr == INADDR_ANY)
2942		return (1);
2943
2944	ia->s_addr = lia.s_addr;
2945	return (0);
2946}
2947
2948/*
2949 * Return true if pr1 and pr2 have the same IPv4 address restrictions.
2950 */
2951int
2952prison_equal_ip4(struct prison *pr1, struct prison *pr2)
2953{
2954
2955	if (pr1 == pr2)
2956		return (1);
2957
2958	/*
2959	 * No need to lock since the PR_IP4_USER flag can't be altered for
2960	 * existing prisons.
2961	 */
2962	while (pr1 != &prison0 &&
2963#ifdef VIMAGE
2964	       !(pr1->pr_flags & PR_VNET) &&
2965#endif
2966	       !(pr1->pr_flags & PR_IP4_USER))
2967		pr1 = pr1->pr_parent;
2968	while (pr2 != &prison0 &&
2969#ifdef VIMAGE
2970	       !(pr2->pr_flags & PR_VNET) &&
2971#endif
2972	       !(pr2->pr_flags & PR_IP4_USER))
2973		pr2 = pr2->pr_parent;
2974	return (pr1 == pr2);
2975}
2976
2977/*
2978 * Make sure our (source) address is set to something meaningful to this
2979 * jail.
2980 *
2981 * Returns 0 if jail doesn't restrict IPv4 or if address belongs to jail,
2982 * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail
2983 * doesn't allow IPv4.  Address passed in in NBO and returned in NBO.
2984 */
2985int
2986prison_local_ip4(struct ucred *cred, struct in_addr *ia)
2987{
2988	struct prison *pr;
2989	struct in_addr ia0;
2990	int error;
2991
2992	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
2993	KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
2994
2995	pr = cred->cr_prison;
2996	if (!(pr->pr_flags & PR_IP4))
2997		return (0);
2998	mtx_lock(&pr->pr_mtx);
2999	if (!(pr->pr_flags & PR_IP4)) {
3000		mtx_unlock(&pr->pr_mtx);
3001		return (0);
3002	}
3003	if (pr->pr_ip4 == NULL) {
3004		mtx_unlock(&pr->pr_mtx);
3005		return (EAFNOSUPPORT);
3006	}
3007
3008	ia0.s_addr = ntohl(ia->s_addr);
3009	if (ia0.s_addr == INADDR_LOOPBACK) {
3010		ia->s_addr = pr->pr_ip4[0].s_addr;
3011		mtx_unlock(&pr->pr_mtx);
3012		return (0);
3013	}
3014
3015	if (ia0.s_addr == INADDR_ANY) {
3016		/*
3017		 * In case there is only 1 IPv4 address, bind directly.
3018		 */
3019		if (pr->pr_ip4s == 1)
3020			ia->s_addr = pr->pr_ip4[0].s_addr;
3021		mtx_unlock(&pr->pr_mtx);
3022		return (0);
3023	}
3024
3025	error = _prison_check_ip4(pr, ia);
3026	mtx_unlock(&pr->pr_mtx);
3027	return (error);
3028}
3029
3030/*
3031 * Rewrite destination address in case we will connect to loopback address.
3032 *
3033 * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv4.
3034 * Address passed in in NBO and returned in NBO.
3035 */
3036int
3037prison_remote_ip4(struct ucred *cred, struct in_addr *ia)
3038{
3039	struct prison *pr;
3040
3041	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3042	KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
3043
3044	pr = cred->cr_prison;
3045	if (!(pr->pr_flags & PR_IP4))
3046		return (0);
3047	mtx_lock(&pr->pr_mtx);
3048	if (!(pr->pr_flags & PR_IP4)) {
3049		mtx_unlock(&pr->pr_mtx);
3050		return (0);
3051	}
3052	if (pr->pr_ip4 == NULL) {
3053		mtx_unlock(&pr->pr_mtx);
3054		return (EAFNOSUPPORT);
3055	}
3056
3057	if (ntohl(ia->s_addr) == INADDR_LOOPBACK) {
3058		ia->s_addr = pr->pr_ip4[0].s_addr;
3059		mtx_unlock(&pr->pr_mtx);
3060		return (0);
3061	}
3062
3063	/*
3064	 * Return success because nothing had to be changed.
3065	 */
3066	mtx_unlock(&pr->pr_mtx);
3067	return (0);
3068}
3069
3070/*
3071 * Check if given address belongs to the jail referenced by cred/prison.
3072 *
3073 * Returns 0 if jail doesn't restrict IPv4 or if address belongs to jail,
3074 * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail
3075 * doesn't allow IPv4.  Address passed in in NBO.
3076 */
3077static int
3078_prison_check_ip4(struct prison *pr, struct in_addr *ia)
3079{
3080	int i, a, z, d;
3081
3082	/*
3083	 * Check the primary IP.
3084	 */
3085	if (pr->pr_ip4[0].s_addr == ia->s_addr)
3086		return (0);
3087
3088	/*
3089	 * All the other IPs are sorted so we can do a binary search.
3090	 */
3091	a = 0;
3092	z = pr->pr_ip4s - 2;
3093	while (a <= z) {
3094		i = (a + z) / 2;
3095		d = qcmp_v4(&pr->pr_ip4[i+1], ia);
3096		if (d > 0)
3097			z = i - 1;
3098		else if (d < 0)
3099			a = i + 1;
3100		else
3101			return (0);
3102	}
3103
3104	return (EADDRNOTAVAIL);
3105}
3106
3107int
3108prison_check_ip4(struct ucred *cred, struct in_addr *ia)
3109{
3110	struct prison *pr;
3111	int error;
3112
3113	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3114	KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
3115
3116	pr = cred->cr_prison;
3117	if (!(pr->pr_flags & PR_IP4))
3118		return (0);
3119	mtx_lock(&pr->pr_mtx);
3120	if (!(pr->pr_flags & PR_IP4)) {
3121		mtx_unlock(&pr->pr_mtx);
3122		return (0);
3123	}
3124	if (pr->pr_ip4 == NULL) {
3125		mtx_unlock(&pr->pr_mtx);
3126		return (EAFNOSUPPORT);
3127	}
3128
3129	error = _prison_check_ip4(pr, ia);
3130	mtx_unlock(&pr->pr_mtx);
3131	return (error);
3132}
3133#endif
3134
3135#ifdef INET6
3136static int
3137prison_restrict_ip6(struct prison *pr, struct in6_addr *newip6)
3138{
3139	int ii, ij, used;
3140	struct prison *ppr;
3141
3142	ppr = pr->pr_parent;
3143	if (!(pr->pr_flags & PR_IP6_USER)) {
3144		/* This has no user settings, so just copy the parent's list. */
3145		if (pr->pr_ip6s < ppr->pr_ip6s) {
3146			/*
3147			 * There's no room for the parent's list.  Use the
3148			 * new list buffer, which is assumed to be big enough
3149			 * (if it was passed).  If there's no buffer, try to
3150			 * allocate one.
3151			 */
3152			used = 1;
3153			if (newip6 == NULL) {
3154				newip6 = malloc(ppr->pr_ip6s * sizeof(*newip6),
3155				    M_PRISON, M_NOWAIT);
3156				if (newip6 != NULL)
3157					used = 0;
3158			}
3159			if (newip6 != NULL) {
3160				bcopy(ppr->pr_ip6, newip6,
3161				    ppr->pr_ip6s * sizeof(*newip6));
3162				free(pr->pr_ip6, M_PRISON);
3163				pr->pr_ip6 = newip6;
3164				pr->pr_ip6s = ppr->pr_ip6s;
3165			}
3166			return (used);
3167		}
3168		pr->pr_ip6s = ppr->pr_ip6s;
3169		if (pr->pr_ip6s > 0)
3170			bcopy(ppr->pr_ip6, pr->pr_ip6,
3171			    pr->pr_ip6s * sizeof(*newip6));
3172		else if (pr->pr_ip6 != NULL) {
3173			free(pr->pr_ip6, M_PRISON);
3174			pr->pr_ip6 = NULL;
3175		}
3176	} else if (pr->pr_ip6s > 0) {
3177		/* Remove addresses that aren't in the parent. */
3178		for (ij = 0; ij < ppr->pr_ip6s; ij++)
3179			if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[0],
3180			    &ppr->pr_ip6[ij]))
3181				break;
3182		if (ij < ppr->pr_ip6s)
3183			ii = 1;
3184		else {
3185			bcopy(pr->pr_ip6 + 1, pr->pr_ip6,
3186			    --pr->pr_ip6s * sizeof(*pr->pr_ip6));
3187			ii = 0;
3188		}
3189		for (ij = 1; ii < pr->pr_ip6s; ) {
3190			if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[ii],
3191			    &ppr->pr_ip6[0])) {
3192				ii++;
3193				continue;
3194			}
3195			switch (ij >= ppr->pr_ip6s ? -1 :
3196				qcmp_v6(&pr->pr_ip6[ii], &ppr->pr_ip6[ij])) {
3197			case -1:
3198				bcopy(pr->pr_ip6 + ii + 1, pr->pr_ip6 + ii,
3199				    (--pr->pr_ip6s - ii) * sizeof(*pr->pr_ip6));
3200				break;
3201			case 0:
3202				ii++;
3203				ij++;
3204				break;
3205			case 1:
3206				ij++;
3207				break;
3208			}
3209		}
3210		if (pr->pr_ip6s == 0) {
3211			pr->pr_flags |= PR_IP6_DISABLE;
3212			free(pr->pr_ip6, M_PRISON);
3213			pr->pr_ip6 = NULL;
3214		}
3215	}
3216	return 0;
3217}
3218
3219/*
3220 * Pass back primary IPv6 address for this jail.
3221 *
3222 * If not restricted return success but do not alter the address.  Caller has
3223 * to make sure to initialize it correctly (e.g. IN6ADDR_ANY_INIT).
3224 *
3225 * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv6.
3226 */
3227int
3228prison_get_ip6(struct ucred *cred, struct in6_addr *ia6)
3229{
3230	struct prison *pr;
3231
3232	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3233	KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
3234
3235	pr = cred->cr_prison;
3236	if (!(pr->pr_flags & PR_IP6))
3237		return (0);
3238	mtx_lock(&pr->pr_mtx);
3239	if (!(pr->pr_flags & PR_IP6)) {
3240		mtx_unlock(&pr->pr_mtx);
3241		return (0);
3242	}
3243	if (pr->pr_ip6 == NULL) {
3244		mtx_unlock(&pr->pr_mtx);
3245		return (EAFNOSUPPORT);
3246	}
3247
3248	bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
3249	mtx_unlock(&pr->pr_mtx);
3250	return (0);
3251}
3252
3253/*
3254 * Return 1 if we should do proper source address selection or are not jailed.
3255 * We will return 0 if we should bypass source address selection in favour
3256 * of the primary jail IPv6 address. Only in this case *ia will be updated and
3257 * returned in NBO.
3258 * Return EAFNOSUPPORT, in case this jail does not allow IPv6.
3259 */
3260int
3261prison_saddrsel_ip6(struct ucred *cred, struct in6_addr *ia6)
3262{
3263	struct prison *pr;
3264	struct in6_addr lia6;
3265	int error;
3266
3267	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3268	KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
3269
3270	if (!jailed(cred))
3271		return (1);
3272
3273	pr = cred->cr_prison;
3274	if (pr->pr_flags & PR_IP6_SADDRSEL)
3275		return (1);
3276
3277	lia6 = in6addr_any;
3278	error = prison_get_ip6(cred, &lia6);
3279	if (error)
3280		return (error);
3281	if (IN6_IS_ADDR_UNSPECIFIED(&lia6))
3282		return (1);
3283
3284	bcopy(&lia6, ia6, sizeof(struct in6_addr));
3285	return (0);
3286}
3287
3288/*
3289 * Return true if pr1 and pr2 have the same IPv6 address restrictions.
3290 */
3291int
3292prison_equal_ip6(struct prison *pr1, struct prison *pr2)
3293{
3294
3295	if (pr1 == pr2)
3296		return (1);
3297
3298	while (pr1 != &prison0 &&
3299#ifdef VIMAGE
3300	       !(pr1->pr_flags & PR_VNET) &&
3301#endif
3302	       !(pr1->pr_flags & PR_IP6_USER))
3303		pr1 = pr1->pr_parent;
3304	while (pr2 != &prison0 &&
3305#ifdef VIMAGE
3306	       !(pr2->pr_flags & PR_VNET) &&
3307#endif
3308	       !(pr2->pr_flags & PR_IP6_USER))
3309		pr2 = pr2->pr_parent;
3310	return (pr1 == pr2);
3311}
3312
3313/*
3314 * Make sure our (source) address is set to something meaningful to this jail.
3315 *
3316 * v6only should be set based on (inp->inp_flags & IN6P_IPV6_V6ONLY != 0)
3317 * when needed while binding.
3318 *
3319 * Returns 0 if jail doesn't restrict IPv6 or if address belongs to jail,
3320 * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail
3321 * doesn't allow IPv6.
3322 */
3323int
3324prison_local_ip6(struct ucred *cred, struct in6_addr *ia6, int v6only)
3325{
3326	struct prison *pr;
3327	int error;
3328
3329	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3330	KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
3331
3332	pr = cred->cr_prison;
3333	if (!(pr->pr_flags & PR_IP6))
3334		return (0);
3335	mtx_lock(&pr->pr_mtx);
3336	if (!(pr->pr_flags & PR_IP6)) {
3337		mtx_unlock(&pr->pr_mtx);
3338		return (0);
3339	}
3340	if (pr->pr_ip6 == NULL) {
3341		mtx_unlock(&pr->pr_mtx);
3342		return (EAFNOSUPPORT);
3343	}
3344
3345	if (IN6_IS_ADDR_LOOPBACK(ia6)) {
3346		bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
3347		mtx_unlock(&pr->pr_mtx);
3348		return (0);
3349	}
3350
3351	if (IN6_IS_ADDR_UNSPECIFIED(ia6)) {
3352		/*
3353		 * In case there is only 1 IPv6 address, and v6only is true,
3354		 * then bind directly.
3355		 */
3356		if (v6only != 0 && pr->pr_ip6s == 1)
3357			bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
3358		mtx_unlock(&pr->pr_mtx);
3359		return (0);
3360	}
3361
3362	error = _prison_check_ip6(pr, ia6);
3363	mtx_unlock(&pr->pr_mtx);
3364	return (error);
3365}
3366
3367/*
3368 * Rewrite destination address in case we will connect to loopback address.
3369 *
3370 * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv6.
3371 */
3372int
3373prison_remote_ip6(struct ucred *cred, struct in6_addr *ia6)
3374{
3375	struct prison *pr;
3376
3377	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3378	KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
3379
3380	pr = cred->cr_prison;
3381	if (!(pr->pr_flags & PR_IP6))
3382		return (0);
3383	mtx_lock(&pr->pr_mtx);
3384	if (!(pr->pr_flags & PR_IP6)) {
3385		mtx_unlock(&pr->pr_mtx);
3386		return (0);
3387	}
3388	if (pr->pr_ip6 == NULL) {
3389		mtx_unlock(&pr->pr_mtx);
3390		return (EAFNOSUPPORT);
3391	}
3392
3393	if (IN6_IS_ADDR_LOOPBACK(ia6)) {
3394		bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
3395		mtx_unlock(&pr->pr_mtx);
3396		return (0);
3397	}
3398
3399	/*
3400	 * Return success because nothing had to be changed.
3401	 */
3402	mtx_unlock(&pr->pr_mtx);
3403	return (0);
3404}
3405
3406/*
3407 * Check if given address belongs to the jail referenced by cred/prison.
3408 *
3409 * Returns 0 if jail doesn't restrict IPv6 or if address belongs to jail,
3410 * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail
3411 * doesn't allow IPv6.
3412 */
3413static int
3414_prison_check_ip6(struct prison *pr, struct in6_addr *ia6)
3415{
3416	int i, a, z, d;
3417
3418	/*
3419	 * Check the primary IP.
3420	 */
3421	if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[0], ia6))
3422		return (0);
3423
3424	/*
3425	 * All the other IPs are sorted so we can do a binary search.
3426	 */
3427	a = 0;
3428	z = pr->pr_ip6s - 2;
3429	while (a <= z) {
3430		i = (a + z) / 2;
3431		d = qcmp_v6(&pr->pr_ip6[i+1], ia6);
3432		if (d > 0)
3433			z = i - 1;
3434		else if (d < 0)
3435			a = i + 1;
3436		else
3437			return (0);
3438	}
3439
3440	return (EADDRNOTAVAIL);
3441}
3442
3443int
3444prison_check_ip6(struct ucred *cred, struct in6_addr *ia6)
3445{
3446	struct prison *pr;
3447	int error;
3448
3449	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3450	KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
3451
3452	pr = cred->cr_prison;
3453	if (!(pr->pr_flags & PR_IP6))
3454		return (0);
3455	mtx_lock(&pr->pr_mtx);
3456	if (!(pr->pr_flags & PR_IP6)) {
3457		mtx_unlock(&pr->pr_mtx);
3458		return (0);
3459	}
3460	if (pr->pr_ip6 == NULL) {
3461		mtx_unlock(&pr->pr_mtx);
3462		return (EAFNOSUPPORT);
3463	}
3464
3465	error = _prison_check_ip6(pr, ia6);
3466	mtx_unlock(&pr->pr_mtx);
3467	return (error);
3468}
3469#endif
3470
3471/*
3472 * Check if a jail supports the given address family.
3473 *
3474 * Returns 0 if not jailed or the address family is supported, EAFNOSUPPORT
3475 * if not.
3476 */
3477int
3478prison_check_af(struct ucred *cred, int af)
3479{
3480	struct prison *pr;
3481	int error;
3482
3483	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3484
3485	pr = cred->cr_prison;
3486#ifdef VIMAGE
3487	/* Prisons with their own network stack are not limited. */
3488	if (prison_owns_vnet(cred))
3489		return (0);
3490#endif
3491
3492	error = 0;
3493	switch (af)
3494	{
3495#ifdef INET
3496	case AF_INET:
3497		if (pr->pr_flags & PR_IP4)
3498		{
3499			mtx_lock(&pr->pr_mtx);
3500			if ((pr->pr_flags & PR_IP4) && pr->pr_ip4 == NULL)
3501				error = EAFNOSUPPORT;
3502			mtx_unlock(&pr->pr_mtx);
3503		}
3504		break;
3505#endif
3506#ifdef INET6
3507	case AF_INET6:
3508		if (pr->pr_flags & PR_IP6)
3509		{
3510			mtx_lock(&pr->pr_mtx);
3511			if ((pr->pr_flags & PR_IP6) && pr->pr_ip6 == NULL)
3512				error = EAFNOSUPPORT;
3513			mtx_unlock(&pr->pr_mtx);
3514		}
3515		break;
3516#endif
3517	case AF_LOCAL:
3518	case AF_ROUTE:
3519		break;
3520	default:
3521		if (!(pr->pr_allow & PR_ALLOW_SOCKET_AF))
3522			error = EAFNOSUPPORT;
3523	}
3524	return (error);
3525}
3526
3527/*
3528 * Check if given address belongs to the jail referenced by cred (wrapper to
3529 * prison_check_ip[46]).
3530 *
3531 * Returns 0 if jail doesn't restrict the address family or if address belongs
3532 * to jail, EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if
3533 * the jail doesn't allow the address family.  IPv4 Address passed in in NBO.
3534 */
3535int
3536prison_if(struct ucred *cred, struct sockaddr *sa)
3537{
3538#ifdef INET
3539	struct sockaddr_in *sai;
3540#endif
3541#ifdef INET6
3542	struct sockaddr_in6 *sai6;
3543#endif
3544	int error;
3545
3546	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3547	KASSERT(sa != NULL, ("%s: sa is NULL", __func__));
3548
3549#ifdef VIMAGE
3550	if (prison_owns_vnet(cred))
3551		return (0);
3552#endif
3553
3554	error = 0;
3555	switch (sa->sa_family)
3556	{
3557#ifdef INET
3558	case AF_INET:
3559		sai = (struct sockaddr_in *)sa;
3560		error = prison_check_ip4(cred, &sai->sin_addr);
3561		break;
3562#endif
3563#ifdef INET6
3564	case AF_INET6:
3565		sai6 = (struct sockaddr_in6 *)sa;
3566		error = prison_check_ip6(cred, &sai6->sin6_addr);
3567		break;
3568#endif
3569	default:
3570		if (!(cred->cr_prison->pr_allow & PR_ALLOW_SOCKET_AF))
3571			error = EAFNOSUPPORT;
3572	}
3573	return (error);
3574}
3575
3576/*
3577 * Return 0 if jails permit p1 to frob p2, otherwise ESRCH.
3578 */
3579int
3580prison_check(struct ucred *cred1, struct ucred *cred2)
3581{
3582
3583	return ((cred1->cr_prison == cred2->cr_prison ||
3584	    prison_ischild(cred1->cr_prison, cred2->cr_prison)) ? 0 : ESRCH);
3585}
3586
3587/*
3588 * Return 1 if p2 is a child of p1, otherwise 0.
3589 */
3590int
3591prison_ischild(struct prison *pr1, struct prison *pr2)
3592{
3593
3594	for (pr2 = pr2->pr_parent; pr2 != NULL; pr2 = pr2->pr_parent)
3595		if (pr1 == pr2)
3596			return (1);
3597	return (0);
3598}
3599
3600/*
3601 * Return 1 if the passed credential is in a jail, otherwise 0.
3602 */
3603int
3604jailed(struct ucred *cred)
3605{
3606
3607	return (cred->cr_prison != &prison0);
3608}
3609
/*
 * Return 1 if the passed credential is in a jail and that jail does not
 * have its own virtual network stack, otherwise 0.  Without VIMAGE this is
 * the same answer as jailed().
 */
int
jailed_without_vnet(struct ucred *cred)
{

#ifdef VIMAGE
	return (jailed(cred) && !prison_owns_vnet(cred));
#else
	return (jailed(cred) ? 1 : 0);
#endif
}
3627
3628/*
3629 * Return the correct hostname (domainname, et al) for the passed credential.
3630 */
3631void
3632getcredhostname(struct ucred *cred, char *buf, size_t size)
3633{
3634	struct prison *pr;
3635
3636	/*
3637	 * A NULL credential can be used to shortcut to the physical
3638	 * system's hostname.
3639	 */
3640	pr = (cred != NULL) ? cred->cr_prison : &prison0;
3641	mtx_lock(&pr->pr_mtx);
3642	strlcpy(buf, pr->pr_hostname, size);
3643	mtx_unlock(&pr->pr_mtx);
3644}
3645
3646void
3647getcreddomainname(struct ucred *cred, char *buf, size_t size)
3648{
3649
3650	mtx_lock(&cred->cr_prison->pr_mtx);
3651	strlcpy(buf, cred->cr_prison->pr_domainname, size);
3652	mtx_unlock(&cred->cr_prison->pr_mtx);
3653}
3654
3655void
3656getcredhostuuid(struct ucred *cred, char *buf, size_t size)
3657{
3658
3659	mtx_lock(&cred->cr_prison->pr_mtx);
3660	strlcpy(buf, cred->cr_prison->pr_hostuuid, size);
3661	mtx_unlock(&cred->cr_prison->pr_mtx);
3662}
3663
3664void
3665getcredhostid(struct ucred *cred, unsigned long *hostid)
3666{
3667
3668	mtx_lock(&cred->cr_prison->pr_mtx);
3669	*hostid = cred->cr_prison->pr_hostid;
3670	mtx_unlock(&cred->cr_prison->pr_mtx);
3671}
3672
3673#ifdef VIMAGE
3674/*
3675 * Determine whether the prison represented by cred owns
3676 * its vnet rather than having it inherited.
3677 *
3678 * Returns 1 in case the prison owns the vnet, 0 otherwise.
3679 */
3680int
3681prison_owns_vnet(struct ucred *cred)
3682{
3683
3684	/*
3685	 * vnets cannot be added/removed after jail creation,
3686	 * so no need to lock here.
3687	 */
3688	return (cred->cr_prison->pr_flags & PR_VNET ? 1 : 0);
3689}
3690#endif
3691
3692/*
3693 * Determine whether the subject represented by cred can "see"
3694 * status of a mount point.
3695 * Returns: 0 for permitted, ENOENT otherwise.
3696 * XXX: This function should be called cr_canseemount() and should be
3697 *      placed in kern_prot.c.
3698 */
3699int
3700prison_canseemount(struct ucred *cred, struct mount *mp)
3701{
3702	struct prison *pr;
3703	struct statfs *sp;
3704	size_t len;
3705
3706	pr = cred->cr_prison;
3707	if (pr->pr_enforce_statfs == 0)
3708		return (0);
3709	if (pr->pr_root->v_mount == mp)
3710		return (0);
3711	if (pr->pr_enforce_statfs == 2)
3712		return (ENOENT);
3713	/*
3714	 * If jail's chroot directory is set to "/" we should be able to see
3715	 * all mount-points from inside a jail.
3716	 * This is ugly check, but this is the only situation when jail's
3717	 * directory ends with '/'.
3718	 */
3719	if (strcmp(pr->pr_path, "/") == 0)
3720		return (0);
3721	len = strlen(pr->pr_path);
3722	sp = &mp->mnt_stat;
3723	if (strncmp(pr->pr_path, sp->f_mntonname, len) != 0)
3724		return (ENOENT);
3725	/*
3726	 * Be sure that we don't have situation where jail's root directory
3727	 * is "/some/path" and mount point is "/some/pathpath".
3728	 */
3729	if (sp->f_mntonname[len] != '\0' && sp->f_mntonname[len] != '/')
3730		return (ENOENT);
3731	return (0);
3732}
3733
3734void
3735prison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp)
3736{
3737	char jpath[MAXPATHLEN];
3738	struct prison *pr;
3739	size_t len;
3740
3741	pr = cred->cr_prison;
3742	if (pr->pr_enforce_statfs == 0)
3743		return;
3744	if (prison_canseemount(cred, mp) != 0) {
3745		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
3746		strlcpy(sp->f_mntonname, "[restricted]",
3747		    sizeof(sp->f_mntonname));
3748		return;
3749	}
3750	if (pr->pr_root->v_mount == mp) {
3751		/*
3752		 * Clear current buffer data, so we are sure nothing from
3753		 * the valid path left there.
3754		 */
3755		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
3756		*sp->f_mntonname = '/';
3757		return;
3758	}
3759	/*
3760	 * If jail's chroot directory is set to "/" we should be able to see
3761	 * all mount-points from inside a jail.
3762	 */
3763	if (strcmp(pr->pr_path, "/") == 0)
3764		return;
3765	len = strlen(pr->pr_path);
3766	strlcpy(jpath, sp->f_mntonname + len, sizeof(jpath));
3767	/*
3768	 * Clear current buffer data, so we are sure nothing from
3769	 * the valid path left there.
3770	 */
3771	bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
3772	if (*jpath == '\0') {
3773		/* Should never happen. */
3774		*sp->f_mntonname = '/';
3775	} else {
3776		strlcpy(sp->f_mntonname, jpath, sizeof(sp->f_mntonname));
3777	}
3778}
3779
3780/*
3781 * Check with permission for a specific privilege is granted within jail.  We
3782 * have a specific list of accepted privileges; the rest are denied.
3783 */
3784int
3785prison_priv_check(struct ucred *cred, int priv)
3786{
3787
3788	if (!jailed(cred))
3789		return (0);
3790
3791#ifdef VIMAGE
3792	/*
3793	 * Privileges specific to prisons with a virtual network stack.
3794	 * There might be a duplicate entry here in case the privilege
3795	 * is only granted conditionally in the legacy jail case.
3796	 */
3797	switch (priv) {
3798#ifdef notyet
3799		/*
3800		 * NFS-specific privileges.
3801		 */
3802	case PRIV_NFS_DAEMON:
3803	case PRIV_NFS_LOCKD:
3804#endif
3805		/*
3806		 * Network stack privileges.
3807		 */
3808	case PRIV_NET_BRIDGE:
3809	case PRIV_NET_GRE:
3810	case PRIV_NET_BPF:
3811	case PRIV_NET_RAW:		/* Dup, cond. in legacy jail case. */
3812	case PRIV_NET_ROUTE:
3813	case PRIV_NET_TAP:
3814	case PRIV_NET_SETIFMTU:
3815	case PRIV_NET_SETIFFLAGS:
3816	case PRIV_NET_SETIFCAP:
3817	case PRIV_NET_SETIFDESCR:
3818	case PRIV_NET_SETIFNAME	:
3819	case PRIV_NET_SETIFMETRIC:
3820	case PRIV_NET_SETIFPHYS:
3821	case PRIV_NET_SETIFMAC:
3822	case PRIV_NET_ADDMULTI:
3823	case PRIV_NET_DELMULTI:
3824	case PRIV_NET_HWIOCTL:
3825	case PRIV_NET_SETLLADDR:
3826	case PRIV_NET_ADDIFGROUP:
3827	case PRIV_NET_DELIFGROUP:
3828	case PRIV_NET_IFCREATE:
3829	case PRIV_NET_IFDESTROY:
3830	case PRIV_NET_ADDIFADDR:
3831	case PRIV_NET_DELIFADDR:
3832	case PRIV_NET_LAGG:
3833	case PRIV_NET_GIF:
3834	case PRIV_NET_SETIFVNET:
3835	case PRIV_NET_SETIFFIB:
3836
3837		/*
3838		 * 802.11-related privileges.
3839		 */
3840	case PRIV_NET80211_GETKEY:
3841#ifdef notyet
3842	case PRIV_NET80211_MANAGE:		/* XXX-BZ discuss with sam@ */
3843#endif
3844
3845#ifdef notyet
3846		/*
3847		 * AppleTalk privileges.
3848		 */
3849	case PRIV_NETATALK_RESERVEDPORT:
3850
3851		/*
3852		 * ATM privileges.
3853		 */
3854	case PRIV_NETATM_CFG:
3855	case PRIV_NETATM_ADD:
3856	case PRIV_NETATM_DEL:
3857	case PRIV_NETATM_SET:
3858
3859		/*
3860		 * Bluetooth privileges.
3861		 */
3862	case PRIV_NETBLUETOOTH_RAW:
3863#endif
3864
3865		/*
3866		 * Netgraph and netgraph module privileges.
3867		 */
3868	case PRIV_NETGRAPH_CONTROL:
3869#ifdef notyet
3870	case PRIV_NETGRAPH_TTY:
3871#endif
3872
3873		/*
3874		 * IPv4 and IPv6 privileges.
3875		 */
3876	case PRIV_NETINET_IPFW:
3877	case PRIV_NETINET_DIVERT:
3878	case PRIV_NETINET_PF:
3879	case PRIV_NETINET_DUMMYNET:
3880	case PRIV_NETINET_CARP:
3881	case PRIV_NETINET_MROUTE:
3882	case PRIV_NETINET_RAW:
3883	case PRIV_NETINET_ADDRCTRL6:
3884	case PRIV_NETINET_ND6:
3885	case PRIV_NETINET_SCOPE6:
3886	case PRIV_NETINET_ALIFETIME6:
3887	case PRIV_NETINET_IPSEC:
3888	case PRIV_NETINET_BINDANY:
3889
3890#ifdef notyet
3891		/*
3892		 * IPX/SPX privileges.
3893		 */
3894	case PRIV_NETIPX_RESERVEDPORT:
3895	case PRIV_NETIPX_RAW:
3896
3897		/*
3898		 * NCP privileges.
3899		 */
3900	case PRIV_NETNCP:
3901
3902		/*
3903		 * SMB privileges.
3904		 */
3905	case PRIV_NETSMB:
3906#endif
3907
3908	/*
3909	 * No default: or deny here.
3910	 * In case of no permit fall through to next switch().
3911	 */
3912		if (cred->cr_prison->pr_flags & PR_VNET)
3913			return (0);
3914	}
3915#endif /* VIMAGE */
3916
3917	switch (priv) {
3918
3919		/*
3920		 * Allow ktrace privileges for root in jail.
3921		 */
3922	case PRIV_KTRACE:
3923
3924#if 0
3925		/*
3926		 * Allow jailed processes to configure audit identity and
3927		 * submit audit records (login, etc).  In the future we may
3928		 * want to further refine the relationship between audit and
3929		 * jail.
3930		 */
3931	case PRIV_AUDIT_GETAUDIT:
3932	case PRIV_AUDIT_SETAUDIT:
3933	case PRIV_AUDIT_SUBMIT:
3934#endif
3935
3936		/*
3937		 * Allow jailed processes to manipulate process UNIX
3938		 * credentials in any way they see fit.
3939		 */
3940	case PRIV_CRED_SETUID:
3941	case PRIV_CRED_SETEUID:
3942	case PRIV_CRED_SETGID:
3943	case PRIV_CRED_SETEGID:
3944	case PRIV_CRED_SETGROUPS:
3945	case PRIV_CRED_SETREUID:
3946	case PRIV_CRED_SETREGID:
3947	case PRIV_CRED_SETRESUID:
3948	case PRIV_CRED_SETRESGID:
3949
3950		/*
3951		 * Jail implements visibility constraints already, so allow
3952		 * jailed root to override uid/gid-based constraints.
3953		 */
3954	case PRIV_SEEOTHERGIDS:
3955	case PRIV_SEEOTHERUIDS:
3956
3957		/*
3958		 * Jail implements inter-process debugging limits already, so
3959		 * allow jailed root various debugging privileges.
3960		 */
3961	case PRIV_DEBUG_DIFFCRED:
3962	case PRIV_DEBUG_SUGID:
3963	case PRIV_DEBUG_UNPRIV:
3964
3965		/*
3966		 * Allow jail to set various resource limits and login
3967		 * properties, and for now, exceed process resource limits.
3968		 */
3969	case PRIV_PROC_LIMIT:
3970	case PRIV_PROC_SETLOGIN:
3971	case PRIV_PROC_SETRLIMIT:
3972
3973		/*
3974		 * System V and POSIX IPC privileges are granted in jail.
3975		 */
3976	case PRIV_IPC_READ:
3977	case PRIV_IPC_WRITE:
3978	case PRIV_IPC_ADMIN:
3979	case PRIV_IPC_MSGSIZE:
3980	case PRIV_MQ_ADMIN:
3981
3982		/*
3983		 * Jail operations within a jail work on child jails.
3984		 */
3985	case PRIV_JAIL_ATTACH:
3986	case PRIV_JAIL_SET:
3987	case PRIV_JAIL_REMOVE:
3988
3989		/*
3990		 * Jail implements its own inter-process limits, so allow
3991		 * root processes in jail to change scheduling on other
3992		 * processes in the same jail.  Likewise for signalling.
3993		 */
3994	case PRIV_SCHED_DIFFCRED:
3995	case PRIV_SCHED_CPUSET:
3996	case PRIV_SIGNAL_DIFFCRED:
3997	case PRIV_SIGNAL_SUGID:
3998
3999		/*
4000		 * Allow jailed processes to write to sysctls marked as jail
4001		 * writable.
4002		 */
4003	case PRIV_SYSCTL_WRITEJAIL:
4004
4005		/*
4006		 * Allow root in jail to manage a variety of quota
4007		 * properties.  These should likely be conditional on a
4008		 * configuration option.
4009		 */
4010	case PRIV_VFS_GETQUOTA:
4011	case PRIV_VFS_SETQUOTA:
4012
4013		/*
4014		 * Since Jail relies on chroot() to implement file system
4015		 * protections, grant many VFS privileges to root in jail.
4016		 * Be careful to exclude mount-related and NFS-related
4017		 * privileges.
4018		 */
4019	case PRIV_VFS_READ:
4020	case PRIV_VFS_WRITE:
4021	case PRIV_VFS_ADMIN:
4022	case PRIV_VFS_EXEC:
4023	case PRIV_VFS_LOOKUP:
4024	case PRIV_VFS_BLOCKRESERVE:	/* XXXRW: Slightly surprising. */
4025	case PRIV_VFS_CHFLAGS_DEV:
4026	case PRIV_VFS_CHOWN:
4027	case PRIV_VFS_CHROOT:
4028	case PRIV_VFS_RETAINSUGID:
4029	case PRIV_VFS_FCHROOT:
4030	case PRIV_VFS_LINK:
4031	case PRIV_VFS_SETGID:
4032	case PRIV_VFS_STAT:
4033	case PRIV_VFS_STICKYFILE:
4034
4035		/*
4036		 * As in the non-jail case, non-root users are expected to be
4037		 * able to read kernel/phyiscal memory (provided /dev/[k]mem
4038		 * exists in the jail and they have permission to access it).
4039		 */
4040	case PRIV_KMEM_READ:
4041		return (0);
4042
4043		/*
4044		 * Depending on the global setting, allow privilege of
4045		 * setting system flags.
4046		 */
4047	case PRIV_VFS_SYSFLAGS:
4048		if (cred->cr_prison->pr_allow & PR_ALLOW_CHFLAGS)
4049			return (0);
4050		else
4051			return (EPERM);
4052
4053		/*
4054		 * Depending on the global setting, allow privilege of
4055		 * mounting/unmounting file systems.
4056		 */
4057	case PRIV_VFS_MOUNT:
4058	case PRIV_VFS_UNMOUNT:
4059	case PRIV_VFS_MOUNT_NONUSER:
4060	case PRIV_VFS_MOUNT_OWNER:
4061		if (cred->cr_prison->pr_allow & PR_ALLOW_MOUNT &&
4062		    cred->cr_prison->pr_enforce_statfs < 2)
4063			return (0);
4064		else
4065			return (EPERM);
4066
4067		/*
4068		 * Allow jailed root to bind reserved ports and reuse in-use
4069		 * ports.
4070		 */
4071	case PRIV_NETINET_RESERVEDPORT:
4072	case PRIV_NETINET_REUSEPORT:
4073		return (0);
4074
4075		/*
4076		 * Allow jailed root to set certian IPv4/6 (option) headers.
4077		 */
4078	case PRIV_NETINET_SETHDROPTS:
4079		return (0);
4080
4081		/*
4082		 * Conditionally allow creating raw sockets in jail.
4083		 */
4084	case PRIV_NETINET_RAW:
4085		if (cred->cr_prison->pr_allow & PR_ALLOW_RAW_SOCKETS)
4086			return (0);
4087		else
4088			return (EPERM);
4089
4090		/*
4091		 * Since jail implements its own visibility limits on netstat
4092		 * sysctls, allow getcred.  This allows identd to work in
4093		 * jail.
4094		 */
4095	case PRIV_NETINET_GETCRED:
4096		return (0);
4097
4098		/*
4099		 * Allow jailed root to set loginclass.
4100		 */
4101	case PRIV_PROC_SETLOGINCLASS:
4102		return (0);
4103
4104	default:
4105		/*
4106		 * In all remaining cases, deny the privilege request.  This
4107		 * includes almost all network privileges, many system
4108		 * configuration privileges.
4109		 */
4110		return (EPERM);
4111	}
4112}
4113
4114/*
4115 * Return the part of pr2's name that is relative to pr1, or the whole name
4116 * if it does not directly follow.
4117 */
4118
4119char *
4120prison_name(struct prison *pr1, struct prison *pr2)
4121{
4122	char *name;
4123
4124	/* Jails see themselves as "0" (if they see themselves at all). */
4125	if (pr1 == pr2)
4126		return "0";
4127	name = pr2->pr_name;
4128	if (prison_ischild(pr1, pr2)) {
4129		/*
4130		 * pr1 isn't locked (and allprison_lock may not be either)
4131		 * so its length can't be counted on.  But the number of dots
4132		 * can be counted on - and counted.
4133		 */
4134		for (; pr1 != &prison0; pr1 = pr1->pr_parent)
4135			name = strchr(name, '.') + 1;
4136	}
4137	return (name);
4138}
4139
4140/*
4141 * Return the part of pr2's path that is relative to pr1, or the whole path
4142 * if it does not directly follow.
4143 */
4144static char *
4145prison_path(struct prison *pr1, struct prison *pr2)
4146{
4147	char *path1, *path2;
4148	int len1;
4149
4150	path1 = pr1->pr_path;
4151	path2 = pr2->pr_path;
4152	if (!strcmp(path1, "/"))
4153		return (path2);
4154	len1 = strlen(path1);
4155	if (strncmp(path1, path2, len1))
4156		return (path2);
4157	if (path2[len1] == '\0')
4158		return "/";
4159	if (path2[len1] == '/')
4160		return (path2 + len1);
4161	return (path2);
4162}
4163
4164
4165/*
4166 * Jail-related sysctls.
4167 */
4168static SYSCTL_NODE(_security, OID_AUTO, jail, CTLFLAG_RW, 0,
4169    "Jails");
4170
4171static int
4172sysctl_jail_list(SYSCTL_HANDLER_ARGS)
4173{
4174	struct xprison *xp;
4175	struct prison *pr, *cpr;
4176#ifdef INET
4177	struct in_addr *ip4 = NULL;
4178	int ip4s = 0;
4179#endif
4180#ifdef INET6
4181	struct in6_addr *ip6 = NULL;
4182	int ip6s = 0;
4183#endif
4184	int descend, error;
4185
4186	xp = malloc(sizeof(*xp), M_TEMP, M_WAITOK);
4187	pr = req->td->td_ucred->cr_prison;
4188	error = 0;
4189	sx_slock(&allprison_lock);
4190	FOREACH_PRISON_DESCENDANT(pr, cpr, descend) {
4191#if defined(INET) || defined(INET6)
4192 again:
4193#endif
4194		mtx_lock(&cpr->pr_mtx);
4195#ifdef INET
4196		if (cpr->pr_ip4s > 0) {
4197			if (ip4s < cpr->pr_ip4s) {
4198				ip4s = cpr->pr_ip4s;
4199				mtx_unlock(&cpr->pr_mtx);
4200				ip4 = realloc(ip4, ip4s *
4201				    sizeof(struct in_addr), M_TEMP, M_WAITOK);
4202				goto again;
4203			}
4204			bcopy(cpr->pr_ip4, ip4,
4205			    cpr->pr_ip4s * sizeof(struct in_addr));
4206		}
4207#endif
4208#ifdef INET6
4209		if (cpr->pr_ip6s > 0) {
4210			if (ip6s < cpr->pr_ip6s) {
4211				ip6s = cpr->pr_ip6s;
4212				mtx_unlock(&cpr->pr_mtx);
4213				ip6 = realloc(ip6, ip6s *
4214				    sizeof(struct in6_addr), M_TEMP, M_WAITOK);
4215				goto again;
4216			}
4217			bcopy(cpr->pr_ip6, ip6,
4218			    cpr->pr_ip6s * sizeof(struct in6_addr));
4219		}
4220#endif
4221		if (cpr->pr_ref == 0) {
4222			mtx_unlock(&cpr->pr_mtx);
4223			continue;
4224		}
4225		bzero(xp, sizeof(*xp));
4226		xp->pr_version = XPRISON_VERSION;
4227		xp->pr_id = cpr->pr_id;
4228		xp->pr_state = cpr->pr_uref > 0
4229		    ? PRISON_STATE_ALIVE : PRISON_STATE_DYING;
4230		strlcpy(xp->pr_path, prison_path(pr, cpr), sizeof(xp->pr_path));
4231		strlcpy(xp->pr_host, cpr->pr_hostname, sizeof(xp->pr_host));
4232		strlcpy(xp->pr_name, prison_name(pr, cpr), sizeof(xp->pr_name));
4233#ifdef INET
4234		xp->pr_ip4s = cpr->pr_ip4s;
4235#endif
4236#ifdef INET6
4237		xp->pr_ip6s = cpr->pr_ip6s;
4238#endif
4239		mtx_unlock(&cpr->pr_mtx);
4240		error = SYSCTL_OUT(req, xp, sizeof(*xp));
4241		if (error)
4242			break;
4243#ifdef INET
4244		if (xp->pr_ip4s > 0) {
4245			error = SYSCTL_OUT(req, ip4,
4246			    xp->pr_ip4s * sizeof(struct in_addr));
4247			if (error)
4248				break;
4249		}
4250#endif
4251#ifdef INET6
4252		if (xp->pr_ip6s > 0) {
4253			error = SYSCTL_OUT(req, ip6,
4254			    xp->pr_ip6s * sizeof(struct in6_addr));
4255			if (error)
4256				break;
4257		}
4258#endif
4259	}
4260	sx_sunlock(&allprison_lock);
4261	free(xp, M_TEMP);
4262#ifdef INET
4263	free(ip4, M_TEMP);
4264#endif
4265#ifdef INET6
4266	free(ip6, M_TEMP);
4267#endif
4268	return (error);
4269}
4270
4271SYSCTL_OID(_security_jail, OID_AUTO, list,
4272    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
4273    sysctl_jail_list, "S", "List of active jails");
4274
4275static int
4276sysctl_jail_jailed(SYSCTL_HANDLER_ARGS)
4277{
4278	int error, injail;
4279
4280	injail = jailed(req->td->td_ucred);
4281	error = SYSCTL_OUT(req, &injail, sizeof(injail));
4282
4283	return (error);
4284}
4285
4286SYSCTL_PROC(_security_jail, OID_AUTO, jailed,
4287    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
4288    sysctl_jail_jailed, "I", "Process in jail?");
4289
4290static int
4291sysctl_jail_vnet(SYSCTL_HANDLER_ARGS)
4292{
4293	int error, havevnet;
4294#ifdef VIMAGE
4295	struct ucred *cred = req->td->td_ucred;
4296
4297	havevnet = jailed(cred) && prison_owns_vnet(cred);
4298#else
4299	havevnet = 0;
4300#endif
4301	error = SYSCTL_OUT(req, &havevnet, sizeof(havevnet));
4302
4303	return (error);
4304}
4305
4306SYSCTL_PROC(_security_jail, OID_AUTO, vnet,
4307    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
4308    sysctl_jail_vnet, "I", "Jail owns VNET?");
4309
#if defined(INET) || defined(INET6)
/* Read/write knob backed directly by the jail_max_af_ips variable. */
SYSCTL_UINT(_security_jail, OID_AUTO, jail_max_af_ips,
    CTLFLAG_RW, &jail_max_af_ips, 0,
    "Number of IP addresses a jail may have at most per address family (deprecated)");
#endif
4315
4316/*
4317 * Default parameters for jail(2) compatability.  For historical reasons,
4318 * the sysctl names have varying similarity to the parameter names.  Prisons
4319 * just see their own parameters, and can't change them.
4320 */
4321static int
4322sysctl_jail_default_allow(SYSCTL_HANDLER_ARGS)
4323{
4324	struct prison *pr;
4325	int allow, error, i;
4326
4327	pr = req->td->td_ucred->cr_prison;
4328	allow = (pr == &prison0) ? jail_default_allow : pr->pr_allow;
4329
4330	/* Get the current flag value, and convert it to a boolean. */
4331	i = (allow & arg2) ? 1 : 0;
4332	if (arg1 != NULL)
4333		i = !i;
4334	error = sysctl_handle_int(oidp, &i, 0, req);
4335	if (error || !req->newptr)
4336		return (error);
4337	i = i ? arg2 : 0;
4338	if (arg1 != NULL)
4339		i ^= arg2;
4340	/*
4341	 * The sysctls don't have CTLFLAGS_PRISON, so assume prison0
4342	 * for writing.
4343	 */
4344	mtx_lock(&prison0.pr_mtx);
4345	jail_default_allow = (jail_default_allow & ~arg2) | i;
4346	mtx_unlock(&prison0.pr_mtx);
4347	return (0);
4348}
4349
4350SYSCTL_PROC(_security_jail, OID_AUTO, set_hostname_allowed,
4351    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4352    NULL, PR_ALLOW_SET_HOSTNAME, sysctl_jail_default_allow, "I",
4353    "Processes in jail can set their hostnames (deprecated)");
4354SYSCTL_PROC(_security_jail, OID_AUTO, socket_unixiproute_only,
4355    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4356    (void *)1, PR_ALLOW_SOCKET_AF, sysctl_jail_default_allow, "I",
4357    "Processes in jail are limited to creating UNIX/IP/route sockets only (deprecated)");
4358SYSCTL_PROC(_security_jail, OID_AUTO, sysvipc_allowed,
4359    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4360    NULL, PR_ALLOW_SYSVIPC, sysctl_jail_default_allow, "I",
4361    "Processes in jail can use System V IPC primitives (deprecated)");
4362SYSCTL_PROC(_security_jail, OID_AUTO, allow_raw_sockets,
4363    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4364    NULL, PR_ALLOW_RAW_SOCKETS, sysctl_jail_default_allow, "I",
4365    "Prison root can create raw sockets (deprecated)");
4366SYSCTL_PROC(_security_jail, OID_AUTO, chflags_allowed,
4367    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4368    NULL, PR_ALLOW_CHFLAGS, sysctl_jail_default_allow, "I",
4369    "Processes in jail can alter system file flags (deprecated)");
4370SYSCTL_PROC(_security_jail, OID_AUTO, mount_allowed,
4371    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4372    NULL, PR_ALLOW_MOUNT, sysctl_jail_default_allow, "I",
4373    "Processes in jail can mount/unmount jail-friendly file systems (deprecated)");
4374SYSCTL_PROC(_security_jail, OID_AUTO, mount_devfs_allowed,
4375    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4376    NULL, PR_ALLOW_MOUNT_DEVFS, sysctl_jail_default_allow, "I",
4377    "Processes in jail can mount the devfs file system (deprecated)");
4378SYSCTL_PROC(_security_jail, OID_AUTO, mount_fdescfs_allowed,
4379    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4380    NULL, PR_ALLOW_MOUNT_FDESCFS, sysctl_jail_default_allow, "I",
4381    "Processes in jail can mount the fdescfs file system (deprecated)");
4382SYSCTL_PROC(_security_jail, OID_AUTO, mount_nullfs_allowed,
4383    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4384    NULL, PR_ALLOW_MOUNT_NULLFS, sysctl_jail_default_allow, "I",
4385    "Processes in jail can mount the nullfs file system (deprecated)");
4386SYSCTL_PROC(_security_jail, OID_AUTO, mount_procfs_allowed,
4387    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4388    NULL, PR_ALLOW_MOUNT_PROCFS, sysctl_jail_default_allow, "I",
4389    "Processes in jail can mount the procfs file system (deprecated)");
4390SYSCTL_PROC(_security_jail, OID_AUTO, mount_linprocfs_allowed,
4391    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4392    NULL, PR_ALLOW_MOUNT_LINPROCFS, sysctl_jail_default_allow, "I",
4393    "Processes in jail can mount the linprocfs file system (deprecated)");
4394SYSCTL_PROC(_security_jail, OID_AUTO, mount_linsysfs_allowed,
4395    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4396    NULL, PR_ALLOW_MOUNT_LINSYSFS, sysctl_jail_default_allow, "I",
4397    "Processes in jail can mount the linsysfs file system (deprecated)");
4398SYSCTL_PROC(_security_jail, OID_AUTO, mount_tmpfs_allowed,
4399    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4400    NULL, PR_ALLOW_MOUNT_TMPFS, sysctl_jail_default_allow, "I",
4401    "Processes in jail can mount the tmpfs file system (deprecated)");
4402SYSCTL_PROC(_security_jail, OID_AUTO, mount_zfs_allowed,
4403    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4404    NULL, PR_ALLOW_MOUNT_ZFS, sysctl_jail_default_allow, "I",
4405    "Processes in jail can mount the zfs file system (deprecated)");
4406
4407static int
4408sysctl_jail_default_level(SYSCTL_HANDLER_ARGS)
4409{
4410	struct prison *pr;
4411	int level, error;
4412
4413	pr = req->td->td_ucred->cr_prison;
4414	level = (pr == &prison0) ? *(int *)arg1 : *(int *)((char *)pr + arg2);
4415	error = sysctl_handle_int(oidp, &level, 0, req);
4416	if (error || !req->newptr)
4417		return (error);
4418	*(int *)arg1 = level;
4419	return (0);
4420}
4421
4422SYSCTL_PROC(_security_jail, OID_AUTO, enforce_statfs,
4423    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4424    &jail_default_enforce_statfs, offsetof(struct prison, pr_enforce_statfs),
4425    sysctl_jail_default_level, "I",
4426    "Processes in jail cannot see all mounted file systems (deprecated)");
4427
4428SYSCTL_PROC(_security_jail, OID_AUTO, devfs_ruleset,
4429    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
4430    &jail_default_devfs_rsnum, offsetof(struct prison, pr_devfs_rsnum),
4431    sysctl_jail_default_level, "I",
4432    "Ruleset for the devfs filesystem in jail (deprecated)");
4433
4434/*
4435 * Nodes to describe jail parameters.  Maximum length of string parameters
4436 * is returned in the string itself, and the other parameters exist merely
4437 * to make themselves and their types known.
4438 */
4439SYSCTL_NODE(_security_jail, OID_AUTO, param, CTLFLAG_RW, 0,
4440    "Jail parameters");
4441
4442int
4443sysctl_jail_param(SYSCTL_HANDLER_ARGS)
4444{
4445	int i;
4446	long l;
4447	size_t s;
4448	char numbuf[12];
4449
4450	switch (oidp->oid_kind & CTLTYPE)
4451	{
4452	case CTLTYPE_LONG:
4453	case CTLTYPE_ULONG:
4454		l = 0;
4455#ifdef SCTL_MASK32
4456		if (!(req->flags & SCTL_MASK32))
4457#endif
4458			return (SYSCTL_OUT(req, &l, sizeof(l)));
4459	case CTLTYPE_INT:
4460	case CTLTYPE_UINT:
4461		i = 0;
4462		return (SYSCTL_OUT(req, &i, sizeof(i)));
4463	case CTLTYPE_STRING:
4464		snprintf(numbuf, sizeof(numbuf), "%jd", (intmax_t)arg2);
4465		return
4466		    (sysctl_handle_string(oidp, numbuf, sizeof(numbuf), req));
4467	case CTLTYPE_STRUCT:
4468		s = (size_t)arg2;
4469		return (SYSCTL_OUT(req, &s, sizeof(s)));
4470	}
4471	return (0);
4472}
4473
4474/*
4475 * CTLFLAG_RDTUN in the following indicates jail parameters that can be set at
4476 * jail creation time but cannot be changed in an existing jail.
4477 */
4478SYSCTL_JAIL_PARAM(, jid, CTLTYPE_INT | CTLFLAG_RDTUN, "I", "Jail ID");
4479SYSCTL_JAIL_PARAM(, parent, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail parent ID");
4480SYSCTL_JAIL_PARAM_STRING(, name, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail name");
4481SYSCTL_JAIL_PARAM_STRING(, path, CTLFLAG_RDTUN, MAXPATHLEN, "Jail root path");
4482SYSCTL_JAIL_PARAM(, securelevel, CTLTYPE_INT | CTLFLAG_RW,
4483    "I", "Jail secure level");
4484SYSCTL_JAIL_PARAM(, osreldate, CTLTYPE_INT | CTLFLAG_RDTUN, "I",
4485    "Jail value for kern.osreldate and uname -K");
4486SYSCTL_JAIL_PARAM_STRING(, osrelease, CTLFLAG_RDTUN, OSRELEASELEN,
4487    "Jail value for kern.osrelease and uname -r");
4488SYSCTL_JAIL_PARAM(, enforce_statfs, CTLTYPE_INT | CTLFLAG_RW,
4489    "I", "Jail cannot see all mounted file systems");
4490SYSCTL_JAIL_PARAM(, devfs_ruleset, CTLTYPE_INT | CTLFLAG_RW,
4491    "I", "Ruleset for in-jail devfs mounts");
4492SYSCTL_JAIL_PARAM(, persist, CTLTYPE_INT | CTLFLAG_RW,
4493    "B", "Jail persistence");
4494#ifdef VIMAGE
4495SYSCTL_JAIL_PARAM(, vnet, CTLTYPE_INT | CTLFLAG_RDTUN,
4496    "E,jailsys", "Virtual network stack");
4497#endif
4498SYSCTL_JAIL_PARAM(, dying, CTLTYPE_INT | CTLFLAG_RD,
4499    "B", "Jail is in the process of shutting down");
4500
4501SYSCTL_JAIL_PARAM_NODE(children, "Number of child jails");
4502SYSCTL_JAIL_PARAM(_children, cur, CTLTYPE_INT | CTLFLAG_RD,
4503    "I", "Current number of child jails");
4504SYSCTL_JAIL_PARAM(_children, max, CTLTYPE_INT | CTLFLAG_RW,
4505    "I", "Maximum number of child jails");
4506
4507SYSCTL_JAIL_PARAM_SYS_NODE(host, CTLFLAG_RW, "Jail host info");
4508SYSCTL_JAIL_PARAM_STRING(_host, hostname, CTLFLAG_RW, MAXHOSTNAMELEN,
4509    "Jail hostname");
4510SYSCTL_JAIL_PARAM_STRING(_host, domainname, CTLFLAG_RW, MAXHOSTNAMELEN,
4511    "Jail NIS domainname");
4512SYSCTL_JAIL_PARAM_STRING(_host, hostuuid, CTLFLAG_RW, HOSTUUIDLEN,
4513    "Jail host UUID");
4514SYSCTL_JAIL_PARAM(_host, hostid, CTLTYPE_ULONG | CTLFLAG_RW,
4515    "LU", "Jail host ID");
4516
4517SYSCTL_JAIL_PARAM_NODE(cpuset, "Jail cpuset");
4518SYSCTL_JAIL_PARAM(_cpuset, id, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail cpuset ID");
4519
#ifdef INET
/* ip4.*: IPv4 address list and source address selection switch. */
SYSCTL_JAIL_PARAM_SYS_NODE(ip4, CTLFLAG_RDTUN,
    "Jail IPv4 address virtualization");
SYSCTL_JAIL_PARAM_STRUCT(_ip4, addr, CTLFLAG_RW, sizeof(struct in_addr),
    "S,in_addr,a", "Jail IPv4 addresses");
SYSCTL_JAIL_PARAM(_ip4, saddrsel, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Do (not) use IPv4 source address selection "
    "rather than the primary jail IPv4 address.");
#endif
#ifdef INET6
/* ip6.*: IPv6 address list and source address selection switch. */
SYSCTL_JAIL_PARAM_SYS_NODE(ip6, CTLFLAG_RDTUN,
    "Jail IPv6 address virtualization");
SYSCTL_JAIL_PARAM_STRUCT(_ip6, addr, CTLFLAG_RW, sizeof(struct in6_addr),
    "S,in6_addr,a", "Jail IPv6 addresses");
SYSCTL_JAIL_PARAM(_ip6, saddrsel, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Do (not) use IPv6 source address selection "
    "rather than the primary jail IPv6 address.");
#endif
4538
4539SYSCTL_JAIL_PARAM_NODE(allow, "Jail permission flags");
4540SYSCTL_JAIL_PARAM(_allow, set_hostname, CTLTYPE_INT | CTLFLAG_RW,
4541    "B", "Jail may set hostname");
4542SYSCTL_JAIL_PARAM(_allow, sysvipc, CTLTYPE_INT | CTLFLAG_RW,
4543    "B", "Jail may use SYSV IPC");
4544SYSCTL_JAIL_PARAM(_allow, raw_sockets, CTLTYPE_INT | CTLFLAG_RW,
4545    "B", "Jail may create raw sockets");
4546SYSCTL_JAIL_PARAM(_allow, chflags, CTLTYPE_INT | CTLFLAG_RW,
4547    "B", "Jail may alter system file flags");
4548SYSCTL_JAIL_PARAM(_allow, quotas, CTLTYPE_INT | CTLFLAG_RW,
4549    "B", "Jail may set file quotas");
4550SYSCTL_JAIL_PARAM(_allow, socket_af, CTLTYPE_INT | CTLFLAG_RW,
4551    "B", "Jail may create sockets other than just UNIX/IPv4/IPv6/route");
4552
4553SYSCTL_JAIL_PARAM_SUBNODE(allow, mount, "Jail mount/unmount permission flags");
4554SYSCTL_JAIL_PARAM(_allow_mount, , CTLTYPE_INT | CTLFLAG_RW,
4555    "B", "Jail may mount/unmount jail-friendly file systems in general");
4556SYSCTL_JAIL_PARAM(_allow_mount, devfs, CTLTYPE_INT | CTLFLAG_RW,
4557    "B", "Jail may mount the devfs file system");
4558SYSCTL_JAIL_PARAM(_allow_mount, fdescfs, CTLTYPE_INT | CTLFLAG_RW,
4559    "B", "Jail may mount the fdescfs file system");
4560SYSCTL_JAIL_PARAM(_allow_mount, nullfs, CTLTYPE_INT | CTLFLAG_RW,
4561    "B", "Jail may mount the nullfs file system");
4562SYSCTL_JAIL_PARAM(_allow_mount, procfs, CTLTYPE_INT | CTLFLAG_RW,
4563    "B", "Jail may mount the procfs file system");
4564SYSCTL_JAIL_PARAM(_allow_mount, linprocfs, CTLTYPE_INT | CTLFLAG_RW,
4565    "B", "Jail may mount the linprocfs file system");
4566SYSCTL_JAIL_PARAM(_allow_mount, linsysfs, CTLTYPE_INT | CTLFLAG_RW,
4567    "B", "Jail may mount the linsysfs file system");
4568SYSCTL_JAIL_PARAM(_allow_mount, tmpfs, CTLTYPE_INT | CTLFLAG_RW,
4569    "B", "Jail may mount the tmpfs file system");
4570SYSCTL_JAIL_PARAM(_allow_mount, zfs, CTLTYPE_INT | CTLFLAG_RW,
4571    "B", "Jail may mount the zfs file system");
4572
4573#ifdef RACCT
4574void
4575prison_racct_foreach(void (*callback)(struct racct *racct,
4576    void *arg2, void *arg3), void *arg2, void *arg3)
4577{
4578	struct prison_racct *prr;
4579
4580	ASSERT_RACCT_ENABLED();
4581
4582	sx_slock(&allprison_lock);
4583	LIST_FOREACH(prr, &allprison_racct, prr_next)
4584		(callback)(prr->prr_racct, arg2, arg3);
4585	sx_sunlock(&allprison_lock);
4586}
4587
4588static struct prison_racct *
4589prison_racct_find_locked(const char *name)
4590{
4591	struct prison_racct *prr;
4592
4593	ASSERT_RACCT_ENABLED();
4594	sx_assert(&allprison_lock, SA_XLOCKED);
4595
4596	if (name[0] == '\0' || strlen(name) >= MAXHOSTNAMELEN)
4597		return (NULL);
4598
4599	LIST_FOREACH(prr, &allprison_racct, prr_next) {
4600		if (strcmp(name, prr->prr_name) != 0)
4601			continue;
4602
4603		/* Found prison_racct with a matching name? */
4604		prison_racct_hold(prr);
4605		return (prr);
4606	}
4607
4608	/* Add new prison_racct. */
4609	prr = malloc(sizeof(*prr), M_PRISON_RACCT, M_ZERO | M_WAITOK);
4610	racct_create(&prr->prr_racct);
4611
4612	strcpy(prr->prr_name, name);
4613	refcount_init(&prr->prr_refcount, 1);
4614	LIST_INSERT_HEAD(&allprison_racct, prr, prr_next);
4615
4616	return (prr);
4617}
4618
4619struct prison_racct *
4620prison_racct_find(const char *name)
4621{
4622	struct prison_racct *prr;
4623
4624	ASSERT_RACCT_ENABLED();
4625
4626	sx_xlock(&allprison_lock);
4627	prr = prison_racct_find_locked(name);
4628	sx_xunlock(&allprison_lock);
4629	return (prr);
4630}
4631
4632void
4633prison_racct_hold(struct prison_racct *prr)
4634{
4635
4636	ASSERT_RACCT_ENABLED();
4637
4638	refcount_acquire(&prr->prr_refcount);
4639}
4640
4641static void
4642prison_racct_free_locked(struct prison_racct *prr)
4643{
4644
4645	ASSERT_RACCT_ENABLED();
4646	sx_assert(&allprison_lock, SA_XLOCKED);
4647
4648	if (refcount_release(&prr->prr_refcount)) {
4649		racct_destroy(&prr->prr_racct);
4650		LIST_REMOVE(prr, prr_next);
4651		free(prr, M_PRISON_RACCT);
4652	}
4653}
4654
4655void
4656prison_racct_free(struct prison_racct *prr)
4657{
4658	int old;
4659
4660	ASSERT_RACCT_ENABLED();
4661	sx_assert(&allprison_lock, SA_UNLOCKED);
4662
4663	old = prr->prr_refcount;
4664	if (old > 1 && atomic_cmpset_int(&prr->prr_refcount, old, old - 1))
4665		return;
4666
4667	sx_xlock(&allprison_lock);
4668	prison_racct_free_locked(prr);
4669	sx_xunlock(&allprison_lock);
4670}
4671
4672static void
4673prison_racct_attach(struct prison *pr)
4674{
4675	struct prison_racct *prr;
4676
4677	ASSERT_RACCT_ENABLED();
4678	sx_assert(&allprison_lock, SA_XLOCKED);
4679
4680	prr = prison_racct_find_locked(pr->pr_name);
4681	KASSERT(prr != NULL, ("cannot find prison_racct"));
4682
4683	pr->pr_prison_racct = prr;
4684}
4685
4686/*
4687 * Handle jail renaming.  From the racct point of view, renaming means
4688 * moving from one prison_racct to another.
4689 */
4690static void
4691prison_racct_modify(struct prison *pr)
4692{
4693	struct proc *p;
4694	struct ucred *cred;
4695	struct prison_racct *oldprr;
4696
4697	ASSERT_RACCT_ENABLED();
4698
4699	sx_slock(&allproc_lock);
4700	sx_xlock(&allprison_lock);
4701
4702	if (strcmp(pr->pr_name, pr->pr_prison_racct->prr_name) == 0) {
4703		sx_xunlock(&allprison_lock);
4704		sx_sunlock(&allproc_lock);
4705		return;
4706	}
4707
4708	oldprr = pr->pr_prison_racct;
4709	pr->pr_prison_racct = NULL;
4710
4711	prison_racct_attach(pr);
4712
4713	/*
4714	 * Move resource utilisation records.
4715	 */
4716	racct_move(pr->pr_prison_racct->prr_racct, oldprr->prr_racct);
4717
4718	/*
4719	 * Force rctl to reattach rules to processes.
4720	 */
4721	FOREACH_PROC_IN_SYSTEM(p) {
4722		PROC_LOCK(p);
4723		cred = crhold(p->p_ucred);
4724		PROC_UNLOCK(p);
4725		racct_proc_ucred_changed(p, cred, cred);
4726		crfree(cred);
4727	}
4728
4729	sx_sunlock(&allproc_lock);
4730	prison_racct_free_locked(oldprr);
4731	sx_xunlock(&allprison_lock);
4732}
4733
4734static void
4735prison_racct_detach(struct prison *pr)
4736{
4737
4738	ASSERT_RACCT_ENABLED();
4739	sx_assert(&allprison_lock, SA_UNLOCKED);
4740
4741	if (pr->pr_prison_racct == NULL)
4742		return;
4743	prison_racct_free(pr->pr_prison_racct);
4744	pr->pr_prison_racct = NULL;
4745}
4746#endif /* RACCT */
4747
4748#ifdef DDB
4749
4750static void
4751db_show_prison(struct prison *pr)
4752{
4753	int fi;
4754#if defined(INET) || defined(INET6)
4755	int ii;
4756#endif
4757	unsigned jsf;
4758#ifdef INET6
4759	char ip6buf[INET6_ADDRSTRLEN];
4760#endif
4761
4762	db_printf("prison %p:\n", pr);
4763	db_printf(" jid             = %d\n", pr->pr_id);
4764	db_printf(" name            = %s\n", pr->pr_name);
4765	db_printf(" parent          = %p\n", pr->pr_parent);
4766	db_printf(" ref             = %d\n", pr->pr_ref);
4767	db_printf(" uref            = %d\n", pr->pr_uref);
4768	db_printf(" path            = %s\n", pr->pr_path);
4769	db_printf(" cpuset          = %d\n", pr->pr_cpuset
4770	    ? pr->pr_cpuset->cs_id : -1);
4771#ifdef VIMAGE
4772	db_printf(" vnet            = %p\n", pr->pr_vnet);
4773#endif
4774	db_printf(" root            = %p\n", pr->pr_root);
4775	db_printf(" securelevel     = %d\n", pr->pr_securelevel);
4776	db_printf(" devfs_rsnum     = %d\n", pr->pr_devfs_rsnum);
4777	db_printf(" children.max    = %d\n", pr->pr_childmax);
4778	db_printf(" children.cur    = %d\n", pr->pr_childcount);
4779	db_printf(" child           = %p\n", LIST_FIRST(&pr->pr_children));
4780	db_printf(" sibling         = %p\n", LIST_NEXT(pr, pr_sibling));
4781	db_printf(" flags           = 0x%x", pr->pr_flags);
4782	for (fi = 0; fi < sizeof(pr_flag_names) / sizeof(pr_flag_names[0]);
4783	    fi++)
4784		if (pr_flag_names[fi] != NULL && (pr->pr_flags & (1 << fi)))
4785			db_printf(" %s", pr_flag_names[fi]);
4786	for (fi = 0; fi < sizeof(pr_flag_jailsys) / sizeof(pr_flag_jailsys[0]);
4787	    fi++) {
4788		jsf = pr->pr_flags &
4789		    (pr_flag_jailsys[fi].disable | pr_flag_jailsys[fi].new);
4790		db_printf(" %-16s= %s\n", pr_flag_jailsys[fi].name,
4791		    pr_flag_jailsys[fi].disable &&
4792		      (jsf == pr_flag_jailsys[fi].disable) ? "disable"
4793		    : (jsf == pr_flag_jailsys[fi].new) ? "new"
4794		    : "inherit");
4795	}
4796	db_printf(" allow           = 0x%x", pr->pr_allow);
4797	for (fi = 0; fi < sizeof(pr_allow_names) / sizeof(pr_allow_names[0]);
4798	    fi++)
4799		if (pr_allow_names[fi] != NULL && (pr->pr_allow & (1 << fi)))
4800			db_printf(" %s", pr_allow_names[fi]);
4801	db_printf("\n");
4802	db_printf(" enforce_statfs  = %d\n", pr->pr_enforce_statfs);
4803	db_printf(" host.hostname   = %s\n", pr->pr_hostname);
4804	db_printf(" host.domainname = %s\n", pr->pr_domainname);
4805	db_printf(" host.hostuuid   = %s\n", pr->pr_hostuuid);
4806	db_printf(" host.hostid     = %lu\n", pr->pr_hostid);
4807#ifdef INET
4808	db_printf(" ip4s            = %d\n", pr->pr_ip4s);
4809	for (ii = 0; ii < pr->pr_ip4s; ii++)
4810		db_printf(" %s %s\n",
4811		    ii == 0 ? "ip4.addr        =" : "                 ",
4812		    inet_ntoa(pr->pr_ip4[ii]));
4813#endif
4814#ifdef INET6
4815	db_printf(" ip6s            = %d\n", pr->pr_ip6s);
4816	for (ii = 0; ii < pr->pr_ip6s; ii++)
4817		db_printf(" %s %s\n",
4818		    ii == 0 ? "ip6.addr        =" : "                 ",
4819		    ip6_sprintf(ip6buf, &pr->pr_ip6[ii]));
4820#endif
4821}
4822
4823DB_SHOW_COMMAND(prison, db_show_prison_command)
4824{
4825	struct prison *pr;
4826
4827	if (!have_addr) {
4828		/*
4829		 * Show all prisons in the list, and prison0 which is not
4830		 * listed.
4831		 */
4832		db_show_prison(&prison0);
4833		if (!db_pager_quit) {
4834			TAILQ_FOREACH(pr, &allprison, pr_list) {
4835				db_show_prison(pr);
4836				if (db_pager_quit)
4837					break;
4838			}
4839		}
4840		return;
4841	}
4842
4843	if (addr == 0)
4844		pr = &prison0;
4845	else {
4846		/* Look for a prison with the ID and with references. */
4847		TAILQ_FOREACH(pr, &allprison, pr_list)
4848			if (pr->pr_id == addr && pr->pr_ref > 0)
4849				break;
4850		if (pr == NULL)
4851			/* Look again, without requiring a reference. */
4852			TAILQ_FOREACH(pr, &allprison, pr_list)
4853				if (pr->pr_id == addr)
4854					break;
4855		if (pr == NULL)
4856			/* Assume address points to a valid prison. */
4857			pr = (struct prison *)addr;
4858	}
4859	db_show_prison(pr);
4860}
4861
4862#endif /* DDB */
4863