kern_jail.c revision 191673
1139804Simp/*-
2185435Sbz * Copyright (c) 1999 Poul-Henning Kamp.
3185435Sbz * Copyright (c) 2008 Bjoern A. Zeeb.
4191673Sjamie * Copyright (c) 2009 James Gritton.
5185435Sbz * All rights reserved.
6190466Sjamie *
7185404Sbz * Redistribution and use in source and binary forms, with or without
8185404Sbz * modification, are permitted provided that the following conditions
9185404Sbz * are met:
10185404Sbz * 1. Redistributions of source code must retain the above copyright
11185404Sbz *    notice, this list of conditions and the following disclaimer.
12185404Sbz * 2. Redistributions in binary form must reproduce the above copyright
13185404Sbz *    notice, this list of conditions and the following disclaimer in the
14185404Sbz *    documentation and/or other materials provided with the distribution.
15185404Sbz *
16185404Sbz * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17185404Sbz * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18185404Sbz * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19185404Sbz * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20185404Sbz * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21185404Sbz * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22185404Sbz * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23185404Sbz * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24185404Sbz * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25185404Sbz * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26185404Sbz * SUCH DAMAGE.
2746197Sphk */
2846155Sphk
29116182Sobrien#include <sys/cdefs.h>
30116182Sobrien__FBSDID("$FreeBSD: head/sys/kern/kern_jail.c 191673 2009-04-29 21:14:15Z jamie $");
31116182Sobrien
32185435Sbz#include "opt_ddb.h"
33185435Sbz#include "opt_inet.h"
34185435Sbz#include "opt_inet6.h"
35131177Spjd#include "opt_mac.h"
36131177Spjd
3746155Sphk#include <sys/param.h>
3846155Sphk#include <sys/types.h>
3946155Sphk#include <sys/kernel.h>
4046155Sphk#include <sys/systm.h>
4146155Sphk#include <sys/errno.h>
4246155Sphk#include <sys/sysproto.h>
4346155Sphk#include <sys/malloc.h>
44164032Srwatson#include <sys/priv.h>
4546155Sphk#include <sys/proc.h>
46124882Srwatson#include <sys/taskqueue.h>
47177785Skib#include <sys/fcntl.h>
4846155Sphk#include <sys/jail.h>
4987275Srwatson#include <sys/lock.h>
5087275Srwatson#include <sys/mutex.h>
51191673Sjamie#include <sys/osd.h>
52168401Spjd#include <sys/sx.h>
53113275Smike#include <sys/namei.h>
54147185Spjd#include <sys/mount.h>
55113275Smike#include <sys/queue.h>
5646155Sphk#include <sys/socket.h>
57113275Smike#include <sys/syscallsubr.h>
5857163Srwatson#include <sys/sysctl.h>
59113275Smike#include <sys/vnode.h>
60181803Sbz#include <sys/vimage.h>
6146155Sphk#include <net/if.h>
6246155Sphk#include <netinet/in.h>
63185435Sbz#ifdef DDB
64185435Sbz#include <ddb/ddb.h>
65185435Sbz#ifdef INET6
66185435Sbz#include <netinet6/in6_var.h>
67185435Sbz#endif /* INET6 */
68185435Sbz#endif /* DDB */
6946155Sphk
70163606Srwatson#include <security/mac/mac_framework.h>
71163606Srwatson
/* Malloc type used for the M_PRISON allocations made in this file. */
7246155SphkMALLOC_DEFINE(M_PRISON, "prison", "Prison structures");
7346155Sphk
/*
 * The security.jail sysctl subtree.  Each knob below is an int registered
 * with CTLFLAG_RW; the description string passed to SYSCTL_INT states what
 * the knob controls, and the initializer on the variable is its default.
 */
7489414SarrSYSCTL_NODE(_security, OID_AUTO, jail, CTLFLAG_RW, 0,
7557163Srwatson    "Jail rules");
7657163Srwatson
7757163Srwatsonint	jail_set_hostname_allowed = 1;
7889414SarrSYSCTL_INT(_security_jail, OID_AUTO, set_hostname_allowed, CTLFLAG_RW,
7957163Srwatson    &jail_set_hostname_allowed, 0,
8057163Srwatson    "Processes in jail can set their hostnames");
8157163Srwatson
8261235Srwatsonint	jail_socket_unixiproute_only = 1;
8389414SarrSYSCTL_INT(_security_jail, OID_AUTO, socket_unixiproute_only, CTLFLAG_RW,
8461235Srwatson    &jail_socket_unixiproute_only, 0,
85185435Sbz    "Processes in jail are limited to creating UNIX/IP/route sockets only");
8661235Srwatson
8768024Srwatsonint	jail_sysvipc_allowed = 0;
8889414SarrSYSCTL_INT(_security_jail, OID_AUTO, sysvipc_allowed, CTLFLAG_RW,
8968024Srwatson    &jail_sysvipc_allowed, 0,
9068024Srwatson    "Processes in jail can use System V IPC primitives");
9168024Srwatson
/*
 * Unlike the other knobs this one is file-local (static).
 * NOTE(review): the meaning of the individual levels (default 2) is not
 * visible in this part of the file -- confirm against the statfs checks.
 */
92147185Spjdstatic int jail_enforce_statfs = 2;
93147185SpjdSYSCTL_INT(_security_jail, OID_AUTO, enforce_statfs, CTLFLAG_RW,
94147185Spjd    &jail_enforce_statfs, 0,
95147185Spjd    "Processes in jail cannot see all mounted file systems");
96125804Srwatson
97128664Sbmilekicint	jail_allow_raw_sockets = 0;
98128664SbmilekicSYSCTL_INT(_security_jail, OID_AUTO, allow_raw_sockets, CTLFLAG_RW,
99128664Sbmilekic    &jail_allow_raw_sockets, 0,
100128664Sbmilekic    "Prison root can create raw sockets");
101128664Sbmilekic
102141543Scpercivaint	jail_chflags_allowed = 0;
103141543ScpercivaSYSCTL_INT(_security_jail, OID_AUTO, chflags_allowed, CTLFLAG_RW,
104141543Scperciva    &jail_chflags_allowed, 0,
105141543Scperciva    "Processes in jail can alter system file flags");
106141543Scperciva
107168396Spjdint	jail_mount_allowed = 0;
108168396SpjdSYSCTL_INT(_security_jail, OID_AUTO, mount_allowed, CTLFLAG_RW,
109168396Spjd    &jail_mount_allowed, 0,
110168396Spjd    "Processes in jail can mount/unmount jail-friendly file systems");
111168396Spjd
/*
 * Upper bound on the number of addresses per address family.  jail() and
 * kern_jail_set() reject requests whose IPv4 or IPv6 address count exceeds
 * it.  The OID name keeps the "jail_" prefix, so the knob is exported as
 * security.jail.jail_max_af_ips.
 */
112185435Sbzint	jail_max_af_ips = 255;
113185435SbzSYSCTL_INT(_security_jail, OID_AUTO, jail_max_af_ips, CTLFLAG_RW,
114185435Sbz    &jail_max_af_ips, 0,
115185435Sbz    "Number of IP addresses a jail may have at most per address family");
116185435Sbz
117179881Sdelphij/* allprison, lastprid, and prisoncount are protected by allprison_lock. */
/* The sx lock itself is initialized at boot through SX_SYSINIT. */
118168401Spjdstruct	sx allprison_lock;
119191673SjamieSX_SYSINIT(allprison_lock, &allprison_lock, "allprison");
/* Tail queue of all prisons; kern_jail_set() inserts into it by pr_id. */
120191673Sjamiestruct	prisonlist allprison = TAILQ_HEAD_INITIALIZER(allprison);
/* Last jid handed out by the automatic "next free jid" search. */
121179881Sdelphijint	lastprid = 0;
122113275Smikeint	prisoncount = 0;
123113275Smike
/*
 * Forward declarations of helpers private to this file.  The per-family
 * address checks are only declared when the matching address family
 * (INET / INET6) is compiled in.
 */
124191673Sjamiestatic int do_jail_attach(struct thread *td, struct prison *pr);
125190466Sjamiestatic void prison_complete(void *context, int pending);
126191673Sjamiestatic void prison_deref(struct prison *pr, int flags);
127185435Sbz#ifdef INET
128190466Sjamiestatic int _prison_check_ip4(struct prison *pr, struct in_addr *ia);
129185435Sbz#endif
130185435Sbz#ifdef INET6
131190466Sjamiestatic int _prison_check_ip6(struct prison *pr, struct in6_addr *ia6);
132185435Sbz#endif
133191671Sjamiestatic int sysctl_jail_list(SYSCTL_HANDLER_ARGS);
134113275Smike
135191673Sjamie/* Flags for prison_deref */
/*
 * The values are distinct single bits, so they can be OR-ed together in
 * the "flags" argument.
 * NOTE(review): prison_deref() is defined outside this part of the file;
 * the names suggest PD_DEREF/PD_DEUREF select which reference count to
 * drop and the PD_*LOCKED bits describe locks the caller already holds --
 * confirm against the definition.
 */
136191673Sjamie#define	PD_DEREF	0x01
137191673Sjamie#define	PD_DEUREF	0x02
138191673Sjamie#define	PD_LOCKED	0x04
139191673Sjamie#define	PD_LIST_SLOCKED	0x08
140191673Sjamie#define	PD_LIST_XLOCKED	0x10
141113275Smike
142185435Sbz#ifdef INET
/*
 * qsort() comparison callback for arrays of struct in_addr.
 *
 * Both arguments point at a struct in_addr.  The addresses are converted
 * to host byte order before they are compared, so the resulting order is
 * the numeric order of the addresses rather than the order of their
 * network-byte-order representation.
 *
 * Returns -1, 0 or 1 when the first address is numerically lower than,
 * equal to, or higher than the second one.
 */
static int
qcmp_v4(const void *ip1, const void *ip2)
{
	const struct in_addr *lhs, *rhs;
	in_addr_t lhs_host, rhs_host;

	lhs = ip1;
	rhs = ip2;
	lhs_host = ntohl(lhs->s_addr);
	rhs_host = ntohl(rhs->s_addr);

	/*
	 * Subtracting the two values would not fit into an int; combine the
	 * two relational results instead, which yields exactly -1, 0 or 1.
	 */
	return ((lhs_host > rhs_host) - (lhs_host < rhs_host));
}
167185435Sbz#endif
168185435Sbz
169185435Sbz#ifdef INET6
/*
 * qsort() comparison callback for arrays of struct in6_addr.
 *
 * Both arguments point at a struct in6_addr.  The addresses are compared
 * byte by byte, starting with s6_addr[0]; the first differing byte
 * decides the result.
 *
 * Returns -1, 0 or 1 when the first address sorts before, equal to, or
 * after the second one.
 */
static int
qcmp_v6(const void *ip1, const void *ip2)
{
	const struct in6_addr *lhs, *rhs;
	size_t idx;

	lhs = ip1;
	rhs = ip2;
	for (idx = 0; idx < sizeof(struct in6_addr); idx++) {
		if (lhs->s6_addr[idx] == rhs->s6_addr[idx])
			continue;
		return (lhs->s6_addr[idx] > rhs->s6_addr[idx] ? 1 : -1);
	}
	return (0);
}
188185435Sbz#endif
189185435Sbz
190191673Sjamie/*
191191673Sjamie * struct jail_args {
192191673Sjamie *	struct jail *jail;
193191673Sjamie * };
194191673Sjamie */
195191673Sjamieint
196191673Sjamiejail(struct thread *td, struct jail_args *uap)
197185435Sbz{
198191673Sjamie	struct iovec optiov[10];
199191673Sjamie	struct uio opt;
200191673Sjamie	char *u_path, *u_hostname, *u_name;
201191673Sjamie#ifdef INET
202191673Sjamie	struct in_addr *u_ip4;
203191673Sjamie#endif
204191673Sjamie#ifdef INET6
205191673Sjamie	struct in6_addr *u_ip6;
206191673Sjamie#endif
207191673Sjamie	uint32_t version;
208191673Sjamie	int error;
209185435Sbz
210191673Sjamie	error = copyin(uap->jail, &version, sizeof(uint32_t));
211191673Sjamie	if (error)
212191673Sjamie		return (error);
213185435Sbz
214191673Sjamie	switch (version) {
215191673Sjamie	case 0:
216191673Sjamie	{
217191673Sjamie		/* FreeBSD single IPv4 jails. */
218191673Sjamie		struct jail_v0 j0;
219185435Sbz
220191673Sjamie		error = copyin(uap->jail, &j0, sizeof(struct jail_v0));
221191673Sjamie		if (error)
222191673Sjamie			return (error);
223191673Sjamie		u_path = malloc(MAXPATHLEN + MAXHOSTNAMELEN, M_TEMP, M_WAITOK);
224191673Sjamie		u_hostname = u_path + MAXPATHLEN;
225191673Sjamie		opt.uio_iov = optiov;
226191673Sjamie		opt.uio_iovcnt = 4;
227191673Sjamie		opt.uio_offset = -1;
228191673Sjamie		opt.uio_resid = -1;
229191673Sjamie		opt.uio_segflg = UIO_SYSSPACE;
230191673Sjamie		opt.uio_rw = UIO_READ;
231191673Sjamie		opt.uio_td = td;
232191673Sjamie		optiov[0].iov_base = "path";
233191673Sjamie		optiov[0].iov_len = sizeof("path");
234191673Sjamie		optiov[1].iov_base = u_path;
235191673Sjamie		error =
236191673Sjamie		    copyinstr(j0.path, u_path, MAXPATHLEN, &optiov[1].iov_len);
237191673Sjamie		if (error) {
238191673Sjamie			free(u_path, M_TEMP);
239191673Sjamie			return (error);
240191673Sjamie		}
241191673Sjamie		optiov[2].iov_base = "host.hostname";
242191673Sjamie		optiov[2].iov_len = sizeof("host.hostname");
243191673Sjamie		optiov[3].iov_base = u_hostname;
244191673Sjamie		error = copyinstr(j0.hostname, u_hostname, MAXHOSTNAMELEN,
245191673Sjamie		    &optiov[3].iov_len);
246191673Sjamie		if (error) {
247191673Sjamie			free(u_path, M_TEMP);
248191673Sjamie			return (error);
249191673Sjamie		}
250191673Sjamie#ifdef INET
251191673Sjamie		optiov[opt.uio_iovcnt].iov_base = "ip4.addr";
252191673Sjamie		optiov[opt.uio_iovcnt].iov_len = sizeof("ip4.addr");
253191673Sjamie		opt.uio_iovcnt++;
254191673Sjamie		optiov[opt.uio_iovcnt].iov_base = &j0.ip_number;
255191673Sjamie		j0.ip_number = htonl(j0.ip_number);
256191673Sjamie		optiov[opt.uio_iovcnt].iov_len = sizeof(j0.ip_number);
257191673Sjamie		opt.uio_iovcnt++;
258191673Sjamie#endif
259191673Sjamie		break;
260191673Sjamie	}
261191673Sjamie
262191673Sjamie	case 1:
263185435Sbz		/*
264191673Sjamie		 * Version 1 was used by multi-IPv4 jail implementations
265191673Sjamie		 * that never made it into the official kernel.
266185435Sbz		 */
267191673Sjamie		return (EINVAL);
268185435Sbz
269191673Sjamie	case 2:	/* JAIL_API_VERSION */
270191673Sjamie	{
271191673Sjamie		/* FreeBSD multi-IPv4/IPv6,noIP jails. */
272191673Sjamie		struct jail j;
273191673Sjamie		size_t tmplen;
274191673Sjamie
275191673Sjamie		error = copyin(uap->jail, &j, sizeof(struct jail));
276191673Sjamie		if (error)
277191673Sjamie			return (error);
278191673Sjamie		tmplen = MAXPATHLEN + MAXHOSTNAMELEN + MAXHOSTNAMELEN;
279185435Sbz#ifdef INET
280191673Sjamie		if (j.ip4s > jail_max_af_ips)
281191673Sjamie			return (EINVAL);
282191673Sjamie		tmplen += j.ip4s * sizeof(struct in_addr);
283191673Sjamie#else
284191673Sjamie		if (j.ip4s > 0)
285191673Sjamie			return (EINVAL);
286191673Sjamie#endif
287191673Sjamie#ifdef INET6
288191673Sjamie		if (j.ip6s > jail_max_af_ips)
289191673Sjamie			return (EINVAL);
290191673Sjamie		tmplen += j.ip6s * sizeof(struct in6_addr);
291191673Sjamie#else
292191673Sjamie		if (j.ip6s > 0)
293191673Sjamie			return (EINVAL);
294191673Sjamie#endif
295191673Sjamie		u_path = malloc(tmplen, M_TEMP, M_WAITOK);
296191673Sjamie		u_hostname = u_path + MAXPATHLEN;
297191673Sjamie		u_name = u_hostname + MAXHOSTNAMELEN;
298191673Sjamie#ifdef INET
299191673Sjamie		u_ip4 = (struct in_addr *)(u_name + MAXHOSTNAMELEN);
300191673Sjamie#endif
301191673Sjamie#ifdef INET6
302191673Sjamie#ifdef INET
303191673Sjamie		u_ip6 = (struct in6_addr *)(u_ip4 + j.ip4s);
304191673Sjamie#else
305191673Sjamie		u_ip6 = (struct in6_addr *)(u_name + MAXHOSTNAMELEN);
306191673Sjamie#endif
307191673Sjamie#endif
308191673Sjamie		opt.uio_iov = optiov;
309191673Sjamie		opt.uio_iovcnt = 4;
310191673Sjamie		opt.uio_offset = -1;
311191673Sjamie		opt.uio_resid = -1;
312191673Sjamie		opt.uio_segflg = UIO_SYSSPACE;
313191673Sjamie		opt.uio_rw = UIO_READ;
314191673Sjamie		opt.uio_td = td;
315191673Sjamie		optiov[0].iov_base = "path";
316191673Sjamie		optiov[0].iov_len = sizeof("path");
317191673Sjamie		optiov[1].iov_base = u_path;
318191673Sjamie		error =
319191673Sjamie		    copyinstr(j.path, u_path, MAXPATHLEN, &optiov[1].iov_len);
320191673Sjamie		if (error) {
321191673Sjamie			free(u_path, M_TEMP);
322191673Sjamie			return (error);
323191673Sjamie		}
324191673Sjamie		optiov[2].iov_base = "host.hostname";
325191673Sjamie		optiov[2].iov_len = sizeof("host.hostname");
326191673Sjamie		optiov[3].iov_base = u_hostname;
327191673Sjamie		error = copyinstr(j.hostname, u_hostname, MAXHOSTNAMELEN,
328191673Sjamie		    &optiov[3].iov_len);
329191673Sjamie		if (error) {
330191673Sjamie			free(u_path, M_TEMP);
331191673Sjamie			return (error);
332191673Sjamie		}
333191673Sjamie		if (j.jailname != NULL) {
334191673Sjamie			optiov[opt.uio_iovcnt].iov_base = "name";
335191673Sjamie			optiov[opt.uio_iovcnt].iov_len = sizeof("name");
336191673Sjamie			opt.uio_iovcnt++;
337191673Sjamie			optiov[opt.uio_iovcnt].iov_base = u_name;
338191673Sjamie			error = copyinstr(j.jailname, u_name, MAXHOSTNAMELEN,
339191673Sjamie			    &optiov[opt.uio_iovcnt].iov_len);
340191673Sjamie			if (error) {
341191673Sjamie				free(u_path, M_TEMP);
342191673Sjamie				return (error);
343185435Sbz			}
344191673Sjamie			opt.uio_iovcnt++;
345185435Sbz		}
346191673Sjamie#ifdef INET
347191673Sjamie		optiov[opt.uio_iovcnt].iov_base = "ip4.addr";
348191673Sjamie		optiov[opt.uio_iovcnt].iov_len = sizeof("ip4.addr");
349191673Sjamie		opt.uio_iovcnt++;
350191673Sjamie		optiov[opt.uio_iovcnt].iov_base = u_ip4;
351191673Sjamie		optiov[opt.uio_iovcnt].iov_len =
352191673Sjamie		    j.ip4s * sizeof(struct in_addr);
353191673Sjamie		error = copyin(j.ip4, u_ip4, optiov[opt.uio_iovcnt].iov_len);
354191673Sjamie		if (error) {
355191673Sjamie			free(u_path, M_TEMP);
356191673Sjamie			return (error);
357191673Sjamie		}
358191673Sjamie		opt.uio_iovcnt++;
359185435Sbz#endif
360185435Sbz#ifdef INET6
361191673Sjamie		optiov[opt.uio_iovcnt].iov_base = "ip6.addr";
362191673Sjamie		optiov[opt.uio_iovcnt].iov_len = sizeof("ip6.addr");
363191673Sjamie		opt.uio_iovcnt++;
364191673Sjamie		optiov[opt.uio_iovcnt].iov_base = u_ip6;
365191673Sjamie		optiov[opt.uio_iovcnt].iov_len =
366191673Sjamie		    j.ip6s * sizeof(struct in6_addr);
367191673Sjamie		error = copyin(j.ip6, u_ip6, optiov[opt.uio_iovcnt].iov_len);
368191673Sjamie		if (error) {
369191673Sjamie			free(u_path, M_TEMP);
370191673Sjamie			return (error);
371185435Sbz		}
372191673Sjamie		opt.uio_iovcnt++;
373185435Sbz#endif
374191673Sjamie		break;
375185435Sbz	}
376185435Sbz
377191673Sjamie	default:
378191673Sjamie		/* Sci-Fi jails are not supported, sorry. */
379191673Sjamie		return (EINVAL);
380191673Sjamie	}
381191673Sjamie	error = kern_jail_set(td, &opt, JAIL_CREATE | JAIL_ATTACH);
382191673Sjamie	free(u_path, M_TEMP);
383191673Sjamie	return (error);
384185435Sbz}
385185435Sbz
386191673Sjamie/*
387191673Sjamie * struct jail_set_args {
388191673Sjamie *	struct iovec *iovp;
389191673Sjamie *	unsigned int iovcnt;
390191673Sjamie *	int flags;
391191673Sjamie * };
392191673Sjamie */
393191673Sjamieint
394191673Sjamiejail_set(struct thread *td, struct jail_set_args *uap)
395185435Sbz{
396191673Sjamie	struct uio *auio;
397191673Sjamie	int error;
398191673Sjamie
399191673Sjamie	/* Check that we have an even number of iovecs. */
400191673Sjamie	if (uap->iovcnt & 1)
401191673Sjamie		return (EINVAL);
402191673Sjamie
403191673Sjamie	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
404191673Sjamie	if (error)
405191673Sjamie		return (error);
406191673Sjamie	error = kern_jail_set(td, auio, uap->flags);
407191673Sjamie	free(auio, M_IOV);
408191673Sjamie	return (error);
409191673Sjamie}
410191673Sjamie
411191673Sjamieint
412191673Sjamiekern_jail_set(struct thread *td, struct uio *optuio, int flags)
413191673Sjamie{
414191673Sjamie	struct nameidata nd;
415185435Sbz#ifdef INET
416190466Sjamie	struct in_addr *ip4;
417185435Sbz#endif
418185435Sbz#ifdef INET6
419185435Sbz	struct in6_addr *ip6;
420185435Sbz#endif
421191673Sjamie	struct vfsopt *opt;
422191673Sjamie	struct vfsoptlist *opts;
423191673Sjamie	struct prison *pr, *deadpr, *tpr;
424191673Sjamie	struct vnode *root;
425191673Sjamie	char *errmsg, *host, *name, *p, *path;
426191673Sjamie	void *op;
427191673Sjamie	int created, cuflags, error, errmsg_len, errmsg_pos;
428191673Sjamie	int gotslevel, jid, len;
429191673Sjamie	int slevel, vfslocked;
430191673Sjamie#if defined(INET) || defined(INET6)
431191673Sjamie	int ii;
432191673Sjamie#endif
433191673Sjamie#ifdef INET
434191673Sjamie	int ip4s;
435191673Sjamie#endif
436191673Sjamie#ifdef INET6
437191673Sjamie	int ip6s;
438191673Sjamie#endif
439191673Sjamie	unsigned pr_flags, ch_flags;
440191673Sjamie	char numbuf[12];
441185435Sbz
442191673Sjamie	error = priv_check(td, PRIV_JAIL_SET);
443191673Sjamie	if (!error && (flags & JAIL_ATTACH))
444191673Sjamie		error = priv_check(td, PRIV_JAIL_ATTACH);
445191673Sjamie	if (error)
446191673Sjamie		return (error);
447191673Sjamie	if (flags & ~JAIL_SET_MASK)
448191673Sjamie		return (EINVAL);
449191673Sjamie
450185435Sbz	/*
451191673Sjamie	 * Check all the parameters before committing to anything.  Not all
452191673Sjamie	 * errors can be caught early, but we may as well try.  Also, this
453191673Sjamie	 * takes care of some expensive stuff (path lookup) before getting
454191673Sjamie	 * the allprison lock.
455185435Sbz	 *
456191673Sjamie	 * XXX Jails are not filesystems, and jail parameters are not mount
457191673Sjamie	 *     options.  But it makes more sense to re-use the vfsopt code
458191673Sjamie	 *     than duplicate it under a different name.
459185435Sbz	 */
460191673Sjamie	error = vfs_buildopts(optuio, &opts);
461191673Sjamie	if (error)
462191673Sjamie		return (error);
463185435Sbz#ifdef INET
464185435Sbz	ip4 = NULL;
465185435Sbz#endif
466185435Sbz#ifdef INET6
467185435Sbz	ip6 = NULL;
468185435Sbz#endif
469191673Sjamie
470191673Sjamie	error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
471191673Sjamie	if (error == ENOENT)
472191673Sjamie		jid = 0;
473191673Sjamie	else if (error != 0)
474191673Sjamie		goto done_free;
475191673Sjamie
476191673Sjamie	error = vfs_copyopt(opts, "securelevel", &slevel, sizeof(slevel));
477191673Sjamie	if (error == ENOENT)
478191673Sjamie		gotslevel = 0;
479191673Sjamie	else if (error != 0)
480191673Sjamie		goto done_free;
481191673Sjamie	else
482191673Sjamie		gotslevel = 1;
483191673Sjamie
484191673Sjamie	pr_flags = ch_flags = 0;
485191673Sjamie	vfs_flagopt(opts, "persist", &pr_flags, PR_PERSIST);
486191673Sjamie	vfs_flagopt(opts, "nopersist", &ch_flags, PR_PERSIST);
487191673Sjamie	ch_flags |= pr_flags;
488191673Sjamie	if ((flags & (JAIL_CREATE | JAIL_UPDATE | JAIL_ATTACH)) == JAIL_CREATE
489191673Sjamie	    && !(pr_flags & PR_PERSIST)) {
490191673Sjamie		error = EINVAL;
491191673Sjamie		vfs_opterror(opts, "new jail must persist or attach");
492191673Sjamie		goto done_errmsg;
493191673Sjamie	}
494191673Sjamie
495191673Sjamie	error = vfs_getopt(opts, "name", (void **)&name, &len);
496191673Sjamie	if (error == ENOENT)
497191673Sjamie		name = NULL;
498191673Sjamie	else if (error != 0)
499191673Sjamie		goto done_free;
500191673Sjamie	else {
501191673Sjamie		if (len == 0 || name[len - 1] != '\0') {
502191673Sjamie			error = EINVAL;
503191673Sjamie			goto done_free;
504191673Sjamie		}
505191673Sjamie		if (len > MAXHOSTNAMELEN) {
506191673Sjamie			error = ENAMETOOLONG;
507191673Sjamie			goto done_free;
508191673Sjamie		}
509191673Sjamie	}
510191673Sjamie
511191673Sjamie	error = vfs_getopt(opts, "host.hostname", (void **)&host, &len);
512191673Sjamie	if (error == ENOENT)
513191673Sjamie		host = NULL;
514191673Sjamie	else if (error != 0)
515191673Sjamie		goto done_free;
516191673Sjamie	else {
517191673Sjamie		if (len == 0 || host[len - 1] != '\0') {
518191673Sjamie			error = EINVAL;
519191673Sjamie			goto done_free;
520191673Sjamie		}
521191673Sjamie		if (len > MAXHOSTNAMELEN) {
522191673Sjamie			error = ENAMETOOLONG;
523191673Sjamie			goto done_free;
524191673Sjamie		}
525191673Sjamie	}
526191673Sjamie
527185435Sbz#ifdef INET
528191673Sjamie	error = vfs_getopt(opts, "ip4.addr", &op, &ip4s);
529191673Sjamie	if (error == ENOENT)
530191673Sjamie		ip4s = -1;
531191673Sjamie	else if (error != 0)
532191673Sjamie		goto done_free;
533191673Sjamie	else if (ip4s & (sizeof(*ip4) - 1)) {
534191673Sjamie		error = EINVAL;
535191673Sjamie		goto done_free;
536191673Sjamie	} else if (ip4s > 0) {
537191673Sjamie		ip4s /= sizeof(*ip4);
538191673Sjamie		if (ip4s > jail_max_af_ips) {
539191673Sjamie			error = EINVAL;
540191673Sjamie			vfs_opterror(opts, "too many IPv4 addresses");
541191673Sjamie			goto done_errmsg;
542191673Sjamie		}
543191673Sjamie		ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK);
544191673Sjamie		bcopy(op, ip4, ip4s * sizeof(*ip4));
545185435Sbz		/*
546191673Sjamie		 * IP addresses are all sorted but ip[0] to preserve the
547191673Sjamie		 * primary IP address as given from userland.  This special IP
548191673Sjamie		 * is used for unbound outgoing connections as well for
549191673Sjamie		 * "loopback" traffic.
550185435Sbz		 */
551191673Sjamie		if (ip4s > 1)
552191673Sjamie			qsort(ip4 + 1, ip4s - 1, sizeof(*ip4), qcmp_v4);
553191673Sjamie		/*
554191673Sjamie		 * Check for duplicate addresses and do some simple zero and
555191673Sjamie		 * broadcast checks. If users give other bogus addresses it is
556191673Sjamie		 * their problem.
557191673Sjamie		 *
558191673Sjamie		 * We do not have to care about byte order for these checks so
559191673Sjamie		 * we will do them in NBO.
560191673Sjamie		 */
561191673Sjamie		for (ii = 0; ii < ip4s; ii++) {
562191673Sjamie			if (ip4[ii].s_addr == INADDR_ANY ||
563191673Sjamie			    ip4[ii].s_addr == INADDR_BROADCAST) {
564185435Sbz				error = EINVAL;
565191673Sjamie				goto done_free;
566185435Sbz			}
567191673Sjamie			if ((ii+1) < ip4s &&
568191673Sjamie			    (ip4[0].s_addr == ip4[ii+1].s_addr ||
569191673Sjamie			     ip4[ii].s_addr == ip4[ii+1].s_addr)) {
570185435Sbz				error = EINVAL;
571191673Sjamie				goto done_free;
572185435Sbz			}
573185435Sbz		}
574191673Sjamie	}
575191673Sjamie#endif
576185435Sbz
577185435Sbz#ifdef INET6
578191673Sjamie	error = vfs_getopt(opts, "ip6.addr", &op, &ip6s);
579191673Sjamie	if (error == ENOENT)
580191673Sjamie		ip6s = -1;
581191673Sjamie	else if (error != 0)
582191673Sjamie		goto done_free;
583191673Sjamie	else if (ip6s & (sizeof(*ip6) - 1)) {
584191673Sjamie		error = EINVAL;
585191673Sjamie		goto done_free;
586191673Sjamie	} else if (ip6s > 0) {
587191673Sjamie		ip6s /= sizeof(*ip6);
588191673Sjamie		if (ip6s > jail_max_af_ips) {
589191673Sjamie			error = EINVAL;
590191673Sjamie			vfs_opterror(opts, "too many IPv6 addresses");
591191673Sjamie			goto done_errmsg;
592191673Sjamie		}
593191673Sjamie		ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK);
594191673Sjamie		bcopy(op, ip6, ip6s * sizeof(*ip6));
595191673Sjamie		if (ip6s > 1)
596191673Sjamie			qsort(ip6 + 1, ip6s - 1, sizeof(*ip6), qcmp_v6);
597191673Sjamie		for (ii = 0; ii < ip6s; ii++) {
598191673Sjamie			if (IN6_IS_ADDR_UNSPECIFIED(&ip6[0])) {
599185435Sbz				error = EINVAL;
600191673Sjamie				goto done_free;
601185435Sbz			}
602191673Sjamie			if ((ii+1) < ip6s &&
603191673Sjamie			    (IN6_ARE_ADDR_EQUAL(&ip6[0], &ip6[ii+1]) ||
604191673Sjamie			     IN6_ARE_ADDR_EQUAL(&ip6[ii], &ip6[ii+1])))
605191673Sjamie			{
606185435Sbz				error = EINVAL;
607191673Sjamie				goto done_free;
608185435Sbz			}
609185435Sbz		}
610191673Sjamie	}
611185435Sbz#endif
612185435Sbz
613191673Sjamie	root = NULL;
614191673Sjamie	error = vfs_getopt(opts, "path", (void **)&path, &len);
615191673Sjamie	if (error == ENOENT)
616191673Sjamie		path = NULL;
617191673Sjamie	else if (error != 0)
618191673Sjamie		goto done_free;
619191673Sjamie	else {
620191673Sjamie		if (flags & JAIL_UPDATE) {
621191673Sjamie			error = EINVAL;
622191673Sjamie			vfs_opterror(opts,
623191673Sjamie			    "path cannot be changed after creation");
624191673Sjamie			goto done_errmsg;
625191673Sjamie		}
626191673Sjamie		if (len == 0 || path[len - 1] != '\0') {
627191673Sjamie			error = EINVAL;
628191673Sjamie			goto done_free;
629191673Sjamie		}
630191673Sjamie		if (len > MAXPATHLEN) {
631191673Sjamie			error = ENAMETOOLONG;
632191673Sjamie			goto done_free;
633191673Sjamie		}
634191673Sjamie		if (len < 2 || (len == 2 && path[0] == '/'))
635191673Sjamie			path = NULL;
636191673Sjamie		else {
637191673Sjamie			NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW, UIO_SYSSPACE,
638191673Sjamie			    path, td);
639191673Sjamie			error = namei(&nd);
640191673Sjamie			if (error)
641191673Sjamie				goto done_free;
642191673Sjamie			vfslocked = NDHASGIANT(&nd);
643191673Sjamie			root = nd.ni_vp;
644191673Sjamie			NDFREE(&nd, NDF_ONLY_PNBUF);
645191673Sjamie			if (root->v_type != VDIR) {
646191673Sjamie				error = ENOTDIR;
647191673Sjamie				vrele(root);
648191673Sjamie				VFS_UNLOCK_GIANT(vfslocked);
649191673Sjamie				goto done_free;
650191673Sjamie			}
651191673Sjamie			VFS_UNLOCK_GIANT(vfslocked);
652191673Sjamie		}
653191673Sjamie	}
654185435Sbz
655191673Sjamie	/*
656191673Sjamie	 * Grab the allprison lock before letting modules check their
657191673Sjamie	 * parameters.  Once we have it, do not let go so we'll have a
658191673Sjamie	 * consistent view of the OSD list.
659191673Sjamie	 */
660191673Sjamie	sx_xlock(&allprison_lock);
661191673Sjamie	error = osd_jail_call(NULL, PR_METHOD_CHECK, opts);
662191673Sjamie	if (error)
663191673Sjamie		goto done_unlock_list;
664185435Sbz
665191673Sjamie	/* By now, all parameters should have been noted. */
666191673Sjamie	TAILQ_FOREACH(opt, opts, link) {
667191673Sjamie		if (!opt->seen && strcmp(opt->name, "errmsg")) {
668191673Sjamie			error = EINVAL;
669191673Sjamie			vfs_opterror(opts, "unknown parameter: %s", opt->name);
670191673Sjamie			goto done_unlock_list;
671191673Sjamie		}
672191673Sjamie	}
673191673Sjamie
674185435Sbz	/*
675191673Sjamie	 * See if we are creating a new record or updating an existing one.
676191673Sjamie	 * This abuses the file error codes ENOENT and EEXIST.
677185435Sbz	 */
678191673Sjamie	cuflags = flags & (JAIL_CREATE | JAIL_UPDATE);
679191673Sjamie	if (!cuflags) {
680191673Sjamie		error = EINVAL;
681191673Sjamie		vfs_opterror(opts, "no valid operation (create or update)");
682191673Sjamie		goto done_unlock_list;
683191673Sjamie	}
684191673Sjamie	pr = NULL;
685191673Sjamie	if (jid != 0) {
686191673Sjamie		/* See if a requested jid already exists. */
687191673Sjamie		if (jid < 0) {
688191673Sjamie			error = EINVAL;
689191673Sjamie			vfs_opterror(opts, "negative jid");
690191673Sjamie			goto done_unlock_list;
691191673Sjamie		}
692191673Sjamie		pr = prison_find(jid);
693191673Sjamie		if (pr != NULL) {
694191673Sjamie			/* Create: jid must not exist. */
695191673Sjamie			if (cuflags == JAIL_CREATE) {
696191673Sjamie				mtx_unlock(&pr->pr_mtx);
697191673Sjamie				error = EEXIST;
698191673Sjamie				vfs_opterror(opts, "jail %d already exists",
699191673Sjamie				    jid);
700191673Sjamie				goto done_unlock_list;
701191673Sjamie			}
702191673Sjamie			if (pr->pr_uref == 0) {
703191673Sjamie				if (!(flags & JAIL_DYING)) {
704191673Sjamie					mtx_unlock(&pr->pr_mtx);
705191673Sjamie					error = ENOENT;
706191673Sjamie					vfs_opterror(opts, "jail %d is dying",
707191673Sjamie					    jid);
708191673Sjamie					goto done_unlock_list;
709191673Sjamie				} else if ((flags & JAIL_ATTACH) ||
710191673Sjamie				    (pr_flags & PR_PERSIST)) {
711191673Sjamie					/*
712191673Sjamie					 * A dying jail might be resurrected
713191673Sjamie					 * (via attach or persist), but first
714191673Sjamie					 * it must determine if another jail
715191673Sjamie					 * has claimed its name.  Accomplish
716191673Sjamie					 * this by implicitly re-setting the
717191673Sjamie					 * name.
718191673Sjamie					 */
719191673Sjamie					if (name == NULL)
720191673Sjamie						name = pr->pr_name;
721191673Sjamie				}
722191673Sjamie			}
723191673Sjamie		}
724191673Sjamie		if (pr == NULL) {
725191673Sjamie			/* Update: jid must exist. */
726191673Sjamie			if (cuflags == JAIL_UPDATE) {
727191673Sjamie				error = ENOENT;
728191673Sjamie				vfs_opterror(opts, "jail %d not found", jid);
729191673Sjamie				goto done_unlock_list;
730191673Sjamie			}
731191673Sjamie		}
732191673Sjamie	}
733191673Sjamie	/*
734191673Sjamie	 * If the caller provided a name, look for a jail by that name.
735191673Sjamie	 * This has different semantics for creates and updates keyed by jid
736191673Sjamie	 * (where the name must not already exist in a different jail),
737191673Sjamie	 * and updates keyed by the name itself (where the name must exist
738191673Sjamie	 * because that is the jail being updated).
739191673Sjamie	 */
740191673Sjamie	if (name != NULL) {
741191673Sjamie		if (name[0] != '\0') {
742191673Sjamie			deadpr = NULL;
743191673Sjamie name_again:
744191673Sjamie			TAILQ_FOREACH(tpr, &allprison, pr_list) {
745191673Sjamie				if (tpr != pr && tpr->pr_ref > 0 &&
746191673Sjamie				    !strcmp(tpr->pr_name, name)) {
747191673Sjamie					if (pr == NULL &&
748191673Sjamie					    cuflags != JAIL_CREATE) {
749191673Sjamie						mtx_lock(&tpr->pr_mtx);
750191673Sjamie						if (tpr->pr_ref > 0) {
751191673Sjamie							/*
752191673Sjamie							 * Use this jail
753191673Sjamie							 * for updates.
754191673Sjamie							 */
755191673Sjamie							if (tpr->pr_uref > 0) {
756191673Sjamie								pr = tpr;
757191673Sjamie								break;
758191673Sjamie							}
759191673Sjamie							deadpr = tpr;
760191673Sjamie						}
761191673Sjamie						mtx_unlock(&tpr->pr_mtx);
762191673Sjamie					} else if (tpr->pr_uref > 0) {
763191673Sjamie						/*
764191673Sjamie						 * Create, or update(jid):
765191673Sjamie						 * name must not exist in an
766191673Sjamie						 * active jail.
767191673Sjamie						 */
768191673Sjamie						error = EEXIST;
769191673Sjamie						if (pr != NULL)
770191673Sjamie							mtx_unlock(&pr->pr_mtx);
771191673Sjamie						vfs_opterror(opts,
772191673Sjamie						   "jail \"%s\" already exists",
773191673Sjamie						   name);
774191673Sjamie						goto done_unlock_list;
775191673Sjamie					}
776191673Sjamie				}
777191673Sjamie			}
778191673Sjamie			/* If no active jail is found, use a dying one. */
779191673Sjamie			if (deadpr != NULL && pr == NULL) {
780191673Sjamie				if (flags & JAIL_DYING) {
781191673Sjamie					mtx_lock(&deadpr->pr_mtx);
782191673Sjamie					if (deadpr->pr_ref == 0) {
783191673Sjamie						mtx_unlock(&deadpr->pr_mtx);
784191673Sjamie						goto name_again;
785191673Sjamie					}
786191673Sjamie					pr = deadpr;
787191673Sjamie				} else if (cuflags == JAIL_UPDATE) {
788191673Sjamie					error = ENOENT;
789191673Sjamie					vfs_opterror(opts,
790191673Sjamie					    "jail \"%s\" is dying", name);
791191673Sjamie					goto done_unlock_list;
792191673Sjamie				}
793191673Sjamie			}
794191673Sjamie			/* Update: name must exist if no jid. */
795191673Sjamie			else if (cuflags == JAIL_UPDATE && pr == NULL) {
796191673Sjamie				error = ENOENT;
797191673Sjamie				vfs_opterror(opts, "jail \"%s\" not found",
798191673Sjamie				    name);
799191673Sjamie				goto done_unlock_list;
800191673Sjamie			}
801191673Sjamie		}
802191673Sjamie	}
803191673Sjamie	/* Update: must provide a jid or name. */
804191673Sjamie	else if (cuflags == JAIL_UPDATE && pr == NULL) {
805191673Sjamie		error = ENOENT;
806191673Sjamie		vfs_opterror(opts, "update specified no jail");
807191673Sjamie		goto done_unlock_list;
808191673Sjamie	}
809185435Sbz
810191673Sjamie	/* If there's no prison to update, create a new one and link it in. */
811191673Sjamie	if (pr == NULL) {
812191673Sjamie		created = 1;
813191673Sjamie		pr = malloc(sizeof(*pr), M_PRISON, M_WAITOK | M_ZERO);
814191673Sjamie		if (jid == 0) {
815191673Sjamie			/* Find the next free jid. */
816191673Sjamie			jid = lastprid + 1;
817191673Sjamie findnext:
818191673Sjamie			if (jid == JAIL_MAX)
819191673Sjamie				jid = 1;
820191673Sjamie			TAILQ_FOREACH(tpr, &allprison, pr_list) {
821191673Sjamie				if (tpr->pr_id < jid)
822191673Sjamie					continue;
823191673Sjamie				if (tpr->pr_id > jid || tpr->pr_ref == 0) {
824191673Sjamie					TAILQ_INSERT_BEFORE(tpr, pr, pr_list);
825191673Sjamie					break;
826191673Sjamie				}
827191673Sjamie				if (jid == lastprid) {
828191673Sjamie					error = EAGAIN;
829191673Sjamie					vfs_opterror(opts,
830191673Sjamie					    "no available jail IDs");
831191673Sjamie					free(pr, M_PRISON);
832191673Sjamie					goto done_unlock_list;
833191673Sjamie				}
834191673Sjamie				jid++;
835191673Sjamie				goto findnext;
836191673Sjamie			}
837191673Sjamie			lastprid = jid;
838191673Sjamie		} else {
839191673Sjamie			/*
840191673Sjamie			 * The jail already has a jid (that did not yet exist),
841191673Sjamie			 * so just find where to insert it.
842191673Sjamie			 */
843191673Sjamie			TAILQ_FOREACH(tpr, &allprison, pr_list)
844191673Sjamie				if (tpr->pr_id >= jid) {
845191673Sjamie					TAILQ_INSERT_BEFORE(tpr, pr, pr_list);
846191673Sjamie					break;
847191673Sjamie				}
848191673Sjamie		}
849191673Sjamie		if (tpr == NULL)
850191673Sjamie			TAILQ_INSERT_TAIL(&allprison, pr, pr_list);
851191673Sjamie		prisoncount++;
852185435Sbz
853191673Sjamie		pr->pr_id = jid;
854191673Sjamie		if (name == NULL)
855191673Sjamie			name = "";
856191673Sjamie		if (path == NULL) {
857191673Sjamie			path = "/";
858191673Sjamie			root = rootvnode;
859191673Sjamie			vref(root);
860191673Sjamie		}
861191673Sjamie
862191673Sjamie		mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF);
863191673Sjamie
864185435Sbz		/*
865191673Sjamie		 * Allocate a dedicated cpuset for each jail.
866191673Sjamie		 * Unlike other initial settings, this may return an erorr.
867185435Sbz		 */
868191673Sjamie		error = cpuset_create_root(td, &pr->pr_cpuset);
869191673Sjamie		if (error) {
870191673Sjamie			prison_deref(pr, PD_LIST_XLOCKED);
871191673Sjamie			goto done_releroot;
872191673Sjamie		}
873185435Sbz
874191673Sjamie		mtx_lock(&pr->pr_mtx);
875185435Sbz		/*
876191673Sjamie		 * New prisons do not yet have a reference, because we do not
877191673Sjamie		 * want other to see the incomplete prison once the
878191673Sjamie		 * allprison_lock is downgraded.
879185435Sbz		 */
880191673Sjamie	} else {
881191673Sjamie		created = 0;
882191673Sjamie		/*
883191673Sjamie		 * Grab a reference for existing prisons, to ensure they
884191673Sjamie		 * continue to exist for the duration of the call.
885191673Sjamie		 */
886191673Sjamie		pr->pr_ref++;
887191673Sjamie	}
888185435Sbz
889191673Sjamie	/* Do final error checking before setting anything. */
890191673Sjamie	error = 0;
891185435Sbz#if defined(INET) || defined(INET6)
892191673Sjamie	if (
893185435Sbz#ifdef INET
894191673Sjamie	    ip4s > 0
895191673Sjamie#ifdef INET6
896191673Sjamie	    ||
897185435Sbz#endif
898191673Sjamie#endif
899185435Sbz#ifdef INET6
900191673Sjamie	    ip6s > 0
901185435Sbz#endif
902191673Sjamie	    )
903191673Sjamie		/*
904191673Sjamie		 * Check for conflicting IP addresses.  We permit them if there
905191673Sjamie		 * is no more than 1 IP on each jail.  If there is a duplicate
906191673Sjamie		 * on a jail with more than one IP stop checking and return
907191673Sjamie		 * error.
908191673Sjamie		 */
909191673Sjamie		TAILQ_FOREACH(tpr, &allprison, pr_list) {
910191673Sjamie			if (tpr == pr || tpr->pr_uref == 0)
911191673Sjamie				continue;
912191673Sjamie#ifdef INET
913191673Sjamie			if ((ip4s > 0 && tpr->pr_ip4s > 1) ||
914191673Sjamie			    (ip4s > 1 && tpr->pr_ip4s > 0))
915191673Sjamie				for (ii = 0; ii < ip4s; ii++)
916191673Sjamie					if (_prison_check_ip4(tpr,
917191673Sjamie					    &ip4[ii]) == 0) {
918191673Sjamie						error = EINVAL;
919191673Sjamie						vfs_opterror(opts,
920191673Sjamie						    "IPv4 addresses clash");
921191673Sjamie						goto done_deref_locked;
922191673Sjamie					}
923185435Sbz#endif
924191673Sjamie#ifdef INET6
925191673Sjamie			if ((ip6s > 0 && tpr->pr_ip6s > 1) ||
926191673Sjamie			    (ip6s > 1 && tpr->pr_ip6s > 0))
927191673Sjamie				for (ii = 0; ii < ip6s; ii++)
928191673Sjamie					if (_prison_check_ip6(tpr,
929191673Sjamie					    &ip6[ii]) == 0) {
930191673Sjamie						error = EINVAL;
931191673Sjamie						vfs_opterror(opts,
932191673Sjamie						    "IPv6 addresses clash");
933191673Sjamie						goto done_deref_locked;
934191673Sjamie					}
935191673Sjamie#endif
936191673Sjamie		}
937191673Sjamie#endif
938191673Sjamie	if (error == 0 && name != NULL) {
939191673Sjamie		/* Give a default name of the jid. */
940191673Sjamie		if (name[0] == '\0')
941191673Sjamie			snprintf(name = numbuf, sizeof(numbuf), "%d", jid);
942191673Sjamie		else if (strtoul(name, &p, 10) != jid && *p == '\0') {
943191673Sjamie			error = EINVAL;
944191673Sjamie			vfs_opterror(opts, "name cannot be numeric");
945191673Sjamie		}
946191673Sjamie	}
947191673Sjamie	if (error) {
948191673Sjamie done_deref_locked:
949191673Sjamie		/*
950191673Sjamie		 * Some parameter had an error so do not set anything.
951191673Sjamie		 * If this is a new jail, it will go away without ever
952191673Sjamie		 * having been seen.
953191673Sjamie		 */
954191673Sjamie		prison_deref(pr, created
955191673Sjamie		    ? PD_LOCKED | PD_LIST_XLOCKED
956191673Sjamie		    : PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED);
957191673Sjamie		goto done_releroot;
958191673Sjamie	}
959185435Sbz
960191673Sjamie	/* Set the parameters of the prison. */
961191673Sjamie#ifdef INET
962191673Sjamie	if (ip4s >= 0) {
963191673Sjamie		pr->pr_ip4s = ip4s;
964191673Sjamie		free(pr->pr_ip4, M_PRISON);
965191673Sjamie		pr->pr_ip4 = ip4;
966191673Sjamie		ip4 = NULL;
967185435Sbz	}
968191673Sjamie#endif
969191673Sjamie#ifdef INET6
970191673Sjamie	if (ip6s >= 0) {
971191673Sjamie		pr->pr_ip6s = ip6s;
972191673Sjamie		free(pr->pr_ip6, M_PRISON);
973191673Sjamie		pr->pr_ip6 = ip6;
974191673Sjamie		ip6 = NULL;
975191673Sjamie	}
976191673Sjamie#endif
977191673Sjamie	if (gotslevel)
978191673Sjamie		pr->pr_securelevel = slevel;
979191673Sjamie	if (name != NULL)
980191673Sjamie		strlcpy(pr->pr_name, name, sizeof(pr->pr_name));
981191673Sjamie	if (path != NULL) {
982191673Sjamie		strlcpy(pr->pr_path, path, sizeof(pr->pr_path));
983191673Sjamie		pr->pr_root = root;
984191673Sjamie	}
985191673Sjamie	if (host != NULL)
986191673Sjamie		strlcpy(pr->pr_host, host, sizeof(pr->pr_host));
987191673Sjamie	/*
988191673Sjamie	 * Persistent prisons get an extra reference, and prisons losing their
989191673Sjamie	 * persist flag lose that reference.  Only do this for existing prisons
990191673Sjamie	 * for now, so new ones will remain unseen until after the module
991191673Sjamie	 * handlers have completed.
992191673Sjamie	 */
993191673Sjamie	if (!created && (ch_flags & PR_PERSIST & (pr_flags ^ pr->pr_flags))) {
994191673Sjamie		if (pr_flags & PR_PERSIST) {
995191673Sjamie			pr->pr_ref++;
996191673Sjamie			pr->pr_uref++;
997191673Sjamie		} else {
998191673Sjamie			pr->pr_ref--;
999191673Sjamie			pr->pr_uref--;
1000191673Sjamie		}
1001191673Sjamie	}
1002191673Sjamie	pr->pr_flags = (pr->pr_flags & ~ch_flags) | pr_flags;
1003191673Sjamie	mtx_unlock(&pr->pr_mtx);
1004185435Sbz
1005191673Sjamie	/* Let the modules do their work. */
1006191673Sjamie	sx_downgrade(&allprison_lock);
1007191673Sjamie	if (created) {
1008191673Sjamie		error = osd_jail_call(pr, PR_METHOD_CREATE, opts);
1009191673Sjamie		if (error) {
1010191673Sjamie			prison_deref(pr, PD_LIST_SLOCKED);
1011191673Sjamie			goto done_errmsg;
1012191673Sjamie		}
1013191673Sjamie	}
1014191673Sjamie	error = osd_jail_call(pr, PR_METHOD_SET, opts);
1015191673Sjamie	if (error) {
1016191673Sjamie		prison_deref(pr, created
1017191673Sjamie		    ? PD_LIST_SLOCKED
1018191673Sjamie		    : PD_DEREF | PD_LIST_SLOCKED);
1019191673Sjamie		goto done_errmsg;
1020191673Sjamie	}
1021191673Sjamie
1022191673Sjamie	/* Attach this process to the prison if requested. */
1023191673Sjamie	if (flags & JAIL_ATTACH) {
1024191673Sjamie		mtx_lock(&pr->pr_mtx);
1025191673Sjamie		error = do_jail_attach(td, pr);
1026191673Sjamie		if (error) {
1027191673Sjamie			vfs_opterror(opts, "attach failed");
1028191673Sjamie			if (!created)
1029191673Sjamie				prison_deref(pr, PD_DEREF);
1030191673Sjamie			goto done_errmsg;
1031191673Sjamie		}
1032191673Sjamie	}
1033191673Sjamie
1034191673Sjamie	/*
1035191673Sjamie	 * Now that it is all there, drop the temporary reference from existing
1036191673Sjamie	 * prisons.  Or add a reference to newly created persistent prisons
1037191673Sjamie	 * (which was not done earlier so that the prison would not be publicly
1038191673Sjamie	 * visible).
1039191673Sjamie	 */
1040191673Sjamie	if (!created) {
1041191673Sjamie		prison_deref(pr, (flags & JAIL_ATTACH)
1042191673Sjamie		    ? PD_DEREF
1043191673Sjamie		    : PD_DEREF | PD_LIST_SLOCKED);
1044191673Sjamie	} else {
1045191673Sjamie		if (pr_flags & PR_PERSIST) {
1046191673Sjamie			mtx_lock(&pr->pr_mtx);
1047191673Sjamie			pr->pr_ref++;
1048191673Sjamie			pr->pr_uref++;
1049191673Sjamie			mtx_unlock(&pr->pr_mtx);
1050191673Sjamie		}
1051191673Sjamie		if (!(flags & JAIL_ATTACH))
1052191673Sjamie			sx_sunlock(&allprison_lock);
1053191673Sjamie	}
1054191673Sjamie	td->td_retval[0] = pr->pr_id;
1055191673Sjamie	goto done_errmsg;
1056191673Sjamie
1057191673Sjamie done_unlock_list:
1058191673Sjamie	sx_xunlock(&allprison_lock);
1059191673Sjamie done_releroot:
1060191673Sjamie	if (root != NULL) {
1061191673Sjamie		vfslocked = VFS_LOCK_GIANT(root->v_mount);
1062191673Sjamie		vrele(root);
1063191673Sjamie		VFS_UNLOCK_GIANT(vfslocked);
1064191673Sjamie	}
1065191673Sjamie done_errmsg:
1066191673Sjamie	if (error) {
1067191673Sjamie		vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len);
1068191673Sjamie		if (errmsg_len > 0) {
1069191673Sjamie			errmsg_pos = 2 * vfs_getopt_pos(opts, "errmsg") + 1;
1070191673Sjamie			if (errmsg_pos > 0) {
1071191673Sjamie				if (optuio->uio_segflg == UIO_SYSSPACE)
1072191673Sjamie					bcopy(errmsg,
1073191673Sjamie					   optuio->uio_iov[errmsg_pos].iov_base,
1074191673Sjamie					   errmsg_len);
1075191673Sjamie				else
1076191673Sjamie					copyout(errmsg,
1077191673Sjamie					   optuio->uio_iov[errmsg_pos].iov_base,
1078191673Sjamie					   errmsg_len);
1079191673Sjamie			}
1080191673Sjamie		}
1081191673Sjamie	}
1082191673Sjamie done_free:
1083191673Sjamie#ifdef INET
1084191673Sjamie	free(ip4, M_PRISON);
1085191673Sjamie#endif
1086191673Sjamie#ifdef INET6
1087191673Sjamie	free(ip6, M_PRISON);
1088191673Sjamie#endif
1089191673Sjamie	vfs_freeopts(opts);
1090191673Sjamie	return (error);
1091191673Sjamie}
1092191673Sjamie
1093191673Sjamie/*
1094191673Sjamie * Sysctl nodes to describe jail parameters.  Maximum length of string
1095191673Sjamie * parameters is returned in the string itself, and the other parameters
1096191673Sjamie * exist merely to make themselves and their types known.
1097191673Sjamie */
1098191673SjamieSYSCTL_NODE(_security_jail, OID_AUTO, param, CTLFLAG_RW, 0,
1099191673Sjamie    "Jail parameters");
1100191673Sjamie
1101191673Sjamieint
1102191673Sjamiesysctl_jail_param(SYSCTL_HANDLER_ARGS)
1103191673Sjamie{
1104191673Sjamie	int i;
1105191673Sjamie	long l;
1106191673Sjamie	size_t s;
1107191673Sjamie	char numbuf[12];
1108191673Sjamie
1109191673Sjamie	switch (oidp->oid_kind & CTLTYPE)
1110191673Sjamie	{
1111191673Sjamie	case CTLTYPE_LONG:
1112191673Sjamie	case CTLTYPE_ULONG:
1113191673Sjamie		l = 0;
1114191673Sjamie#ifdef SCTL_MASK32
1115191673Sjamie		if (!(req->flags & SCTL_MASK32))
1116191673Sjamie#endif
1117191673Sjamie			return (SYSCTL_OUT(req, &l, sizeof(l)));
1118191673Sjamie	case CTLTYPE_INT:
1119191673Sjamie	case CTLTYPE_UINT:
1120191673Sjamie		i = 0;
1121191673Sjamie		return (SYSCTL_OUT(req, &i, sizeof(i)));
1122191673Sjamie	case CTLTYPE_STRING:
1123191673Sjamie		snprintf(numbuf, sizeof(numbuf), "%d", arg2);
1124191673Sjamie		return
1125191673Sjamie		    (sysctl_handle_string(oidp, numbuf, sizeof(numbuf), req));
1126191673Sjamie	case CTLTYPE_STRUCT:
1127191673Sjamie		s = (size_t)arg2;
1128191673Sjamie		return (SYSCTL_OUT(req, &s, sizeof(s)));
1129191673Sjamie	}
1130185435Sbz	return (0);
1131185435Sbz}
1132185435Sbz
1133191673SjamieSYSCTL_JAIL_PARAM(, jid, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail ID");
1134191673SjamieSYSCTL_JAIL_PARAM_STRING(, name, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail name");
1135191673SjamieSYSCTL_JAIL_PARAM(, cpuset, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail cpuset ID");
1136191673SjamieSYSCTL_JAIL_PARAM_STRING(, path, CTLFLAG_RD, MAXPATHLEN, "Jail root path");
1137191673SjamieSYSCTL_JAIL_PARAM(, securelevel, CTLTYPE_INT | CTLFLAG_RW,
1138191673Sjamie    "I", "Jail secure level");
1139191673SjamieSYSCTL_JAIL_PARAM(, persist, CTLTYPE_INT | CTLFLAG_RW,
1140191673Sjamie    "B", "Jail persistence");
1141191673SjamieSYSCTL_JAIL_PARAM(, dying, CTLTYPE_INT | CTLFLAG_RD,
1142191673Sjamie    "B", "Jail is in the process of shutting down");
1143185435Sbz
1144191673SjamieSYSCTL_JAIL_PARAM_NODE(host, "Jail host info");
1145191673SjamieSYSCTL_JAIL_PARAM_STRING(_host, hostname, CTLFLAG_RW, MAXHOSTNAMELEN,
1146191673Sjamie    "Jail hostname");
1147191673Sjamie
1148191673Sjamie#ifdef INET
1149191673SjamieSYSCTL_JAIL_PARAM_NODE(ip4, "Jail IPv4 address virtualization");
1150191673SjamieSYSCTL_JAIL_PARAM_STRUCT(_ip4, addr, CTLFLAG_RW, sizeof(struct in_addr),
1151191673Sjamie    "S,in_addr,a", "Jail IPv4 addresses");
1152191673Sjamie#endif
1153191673Sjamie#ifdef INET6
1154191673SjamieSYSCTL_JAIL_PARAM_NODE(ip6, "Jail IPv6 address virtualization");
1155191673SjamieSYSCTL_JAIL_PARAM_STRUCT(_ip6, addr, CTLFLAG_RW, sizeof(struct in6_addr),
1156191673Sjamie    "S,in6_addr,a", "Jail IPv6 addresses");
1157191673Sjamie#endif
1158191673Sjamie
1159191673Sjamie
116082710Sdillon/*
1161191673Sjamie * struct jail_get_args {
1162191673Sjamie *	struct iovec *iovp;
1163191673Sjamie *	unsigned int iovcnt;
1164191673Sjamie *	int flags;
1165114168Smike * };
116682710Sdillon */
116746155Sphkint
1168191673Sjamiejail_get(struct thread *td, struct jail_get_args *uap)
116946155Sphk{
1170191673Sjamie	struct uio *auio;
1171185435Sbz	int error;
1172185435Sbz
1173191673Sjamie	/* Check that we have an even number of iovecs. */
1174191673Sjamie	if (uap->iovcnt & 1)
1175191673Sjamie		return (EINVAL);
1176191673Sjamie
1177191673Sjamie	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
1178185435Sbz	if (error)
1179185435Sbz		return (error);
1180191673Sjamie	error = kern_jail_get(td, auio, uap->flags);
1181191673Sjamie	if (error == 0)
1182191673Sjamie		error = copyout(auio->uio_iov, uap->iovp,
1183191673Sjamie		    uap->iovcnt * sizeof (struct iovec));
1184191673Sjamie	free(auio, M_IOV);
1185191673Sjamie	return (error);
1186191673Sjamie}
1187185435Sbz
1188191673Sjamieint
1189191673Sjamiekern_jail_get(struct thread *td, struct uio *optuio, int flags)
1190191673Sjamie{
1191191673Sjamie	struct prison *pr;
1192191673Sjamie	struct vfsopt *opt;
1193191673Sjamie	struct vfsoptlist *opts;
1194191673Sjamie	char *errmsg, *name;
1195191673Sjamie	int error, errmsg_len, errmsg_pos, i, jid, len, locked, pos;
1196185435Sbz
1197191673Sjamie	if (flags & ~JAIL_GET_MASK)
1198191673Sjamie		return (EINVAL);
1199191673Sjamie	if (jailed(td->td_ucred)) {
1200185435Sbz		/*
1201191673Sjamie		 * Don't allow a jailed process to see any jails,
1202191673Sjamie		 * not even its own.
1203185435Sbz		 */
1204191673Sjamie		vfs_opterror(opts, "jail not found");
1205191673Sjamie		return (ENOENT);
1206191673Sjamie	}
1207185435Sbz
1208191673Sjamie	/* Get the parameter list. */
1209191673Sjamie	error = vfs_buildopts(optuio, &opts);
1210191673Sjamie	if (error)
1211191673Sjamie		return (error);
1212191673Sjamie	errmsg_pos = vfs_getopt_pos(opts, "errmsg");
1213185435Sbz
1214191673Sjamie	/*
1215191673Sjamie	 * Find the prison specified by one of: lastjid, jid, name.
1216191673Sjamie	 */
1217191673Sjamie	sx_slock(&allprison_lock);
1218191673Sjamie	error = vfs_copyopt(opts, "lastjid", &jid, sizeof(jid));
1219191673Sjamie	if (error == 0) {
1220191673Sjamie		TAILQ_FOREACH(pr, &allprison, pr_list) {
1221191673Sjamie			if (pr->pr_id > jid) {
1222191673Sjamie				mtx_lock(&pr->pr_mtx);
1223191673Sjamie				if (pr->pr_ref > 0 &&
1224191673Sjamie				    (pr->pr_uref > 0 || (flags & JAIL_DYING)))
1225191673Sjamie					break;
1226191673Sjamie				mtx_unlock(&pr->pr_mtx);
1227191673Sjamie			}
1228191673Sjamie		}
1229191673Sjamie		if (pr != NULL)
1230191673Sjamie			goto found_prison;
1231191673Sjamie		error = ENOENT;
1232191673Sjamie		vfs_opterror(opts, "no jail after %d", jid);
1233191673Sjamie		goto done_unlock_list;
1234191673Sjamie	} else if (error != ENOENT)
1235191673Sjamie		goto done_unlock_list;
1236185435Sbz
1237191673Sjamie	error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
1238191673Sjamie	if (error == 0) {
1239191673Sjamie		if (jid != 0) {
1240191673Sjamie			pr = prison_find(jid);
1241191673Sjamie			if (pr != NULL) {
1242191673Sjamie				if (pr->pr_uref == 0 && !(flags & JAIL_DYING)) {
1243191673Sjamie					mtx_unlock(&pr->pr_mtx);
1244191673Sjamie					error = ENOENT;
1245191673Sjamie					vfs_opterror(opts, "jail %d is dying",
1246191673Sjamie					    jid);
1247191673Sjamie					goto done_unlock_list;
1248191673Sjamie				}
1249191673Sjamie				goto found_prison;
1250191673Sjamie			}
1251191673Sjamie			error = ENOENT;
1252191673Sjamie			vfs_opterror(opts, "jail %d not found", jid);
1253191673Sjamie			goto done_unlock_list;
1254191673Sjamie		}
1255191673Sjamie	} else if (error != ENOENT)
1256191673Sjamie		goto done_unlock_list;
125746155Sphk
1258191673Sjamie	error = vfs_getopt(opts, "name", (void **)&name, &len);
1259191673Sjamie	if (error == 0) {
1260191673Sjamie		if (len == 0 || name[len - 1] != '\0') {
1261191673Sjamie			error = EINVAL;
1262191673Sjamie			goto done_unlock_list;
1263191673Sjamie		}
1264191673Sjamie		pr = prison_find_name(name);
1265191673Sjamie		if (pr != NULL) {
1266191673Sjamie			if (pr->pr_uref == 0 && !(flags & JAIL_DYING)) {
1267191673Sjamie				mtx_unlock(&pr->pr_mtx);
1268191673Sjamie				error = ENOENT;
1269191673Sjamie				vfs_opterror(opts, "jail \"%s\" is dying",
1270191673Sjamie				    name);
1271191673Sjamie				goto done_unlock_list;
1272191673Sjamie			}
1273191673Sjamie			goto found_prison;
1274191673Sjamie		}
1275191673Sjamie		error = ENOENT;
1276191673Sjamie		vfs_opterror(opts, "jail \"%s\" not found", name);
1277191673Sjamie		goto done_unlock_list;
1278191673Sjamie	} else if (error != ENOENT)
1279191673Sjamie		goto done_unlock_list;
1280185435Sbz
1281191673Sjamie	vfs_opterror(opts, "no jail specified");
1282191673Sjamie	error = ENOENT;
1283191673Sjamie	goto done_unlock_list;
1284191673Sjamie
1285191673Sjamie found_prison:
1286191673Sjamie	/* Get the parameters of the prison. */
1287191673Sjamie	pr->pr_ref++;
1288191673Sjamie	locked = PD_LOCKED;
1289191673Sjamie	td->td_retval[0] = pr->pr_id;
1290191673Sjamie	error = vfs_setopt(opts, "jid", &pr->pr_id, sizeof(pr->pr_id));
1291191673Sjamie	if (error != 0 && error != ENOENT)
1292191673Sjamie		goto done_deref;
1293191673Sjamie	error = vfs_setopts(opts, "name", pr->pr_name);
1294191673Sjamie	if (error != 0 && error != ENOENT)
1295191673Sjamie		goto done_deref;
1296191673Sjamie	error = vfs_setopt(opts, "cpuset", &pr->pr_cpuset->cs_id,
1297191673Sjamie	    sizeof(pr->pr_cpuset->cs_id));
1298191673Sjamie	if (error != 0 && error != ENOENT)
1299191673Sjamie		goto done_deref;
1300191673Sjamie	error = vfs_setopts(opts, "path", pr->pr_path);
1301191673Sjamie	if (error != 0 && error != ENOENT)
1302191673Sjamie		goto done_deref;
1303191673Sjamie#ifdef INET
1304191673Sjamie	error = vfs_setopt_part(opts, "ip4.addr", pr->pr_ip4,
1305191673Sjamie	    pr->pr_ip4s * sizeof(*pr->pr_ip4));
1306191673Sjamie	if (error != 0 && error != ENOENT)
1307191673Sjamie		goto done_deref;
1308191673Sjamie#endif
1309191673Sjamie#ifdef INET6
1310191673Sjamie	error = vfs_setopt_part(opts, "ip6.addr", pr->pr_ip6,
1311191673Sjamie	    pr->pr_ip6s * sizeof(*pr->pr_ip6));
1312191673Sjamie	if (error != 0 && error != ENOENT)
1313191673Sjamie		goto done_deref;
1314191673Sjamie#endif
1315191673Sjamie	error = vfs_setopt(opts, "securelevel", &pr->pr_securelevel,
1316191673Sjamie	    sizeof(pr->pr_securelevel));
1317191673Sjamie	if (error != 0 && error != ENOENT)
1318191673Sjamie		goto done_deref;
1319191673Sjamie	error = vfs_setopts(opts, "host.hostname", pr->pr_host);
1320191673Sjamie	if (error != 0 && error != ENOENT)
1321191673Sjamie		goto done_deref;
1322191673Sjamie	i = pr->pr_flags & PR_PERSIST ? 1 : 0;
1323191673Sjamie	error = vfs_setopt(opts, "persist", &i, sizeof(i));
1324191673Sjamie	if (error != 0 && error != ENOENT)
1325191673Sjamie		goto done_deref;
1326191673Sjamie	i = !i;
1327191673Sjamie	error = vfs_setopt(opts, "nopersist", &i, sizeof(i));
1328191673Sjamie	if (error != 0 && error != ENOENT)
1329191673Sjamie		goto done_deref;
1330191673Sjamie	i = (pr->pr_uref == 0);
1331191673Sjamie	error = vfs_setopt(opts, "dying", &i, sizeof(i));
1332191673Sjamie	if (error != 0 && error != ENOENT)
1333191673Sjamie		goto done_deref;
1334191673Sjamie	i = !i;
1335191673Sjamie	error = vfs_setopt(opts, "nodying", &i, sizeof(i));
1336191673Sjamie	if (error != 0 && error != ENOENT)
1337191673Sjamie		goto done_deref;
1338191673Sjamie
1339191673Sjamie	/* Get the module parameters. */
1340191673Sjamie	mtx_unlock(&pr->pr_mtx);
1341191673Sjamie	locked = 0;
1342191673Sjamie	error = osd_jail_call(pr, PR_METHOD_GET, opts);
134346155Sphk	if (error)
1344191673Sjamie		goto done_deref;
1345191673Sjamie	prison_deref(pr, PD_DEREF | PD_LIST_SLOCKED);
134684828Sjhb
1347191673Sjamie	/* By now, all parameters should have been noted. */
1348191673Sjamie	TAILQ_FOREACH(opt, opts, link) {
1349191673Sjamie		if (!opt->seen && strcmp(opt->name, "errmsg")) {
1350191673Sjamie			error = EINVAL;
1351191673Sjamie			vfs_opterror(opts, "unknown parameter: %s", opt->name);
1352191673Sjamie			goto done_errmsg;
1353191673Sjamie		}
1354185435Sbz	}
1355191673Sjamie
1356191673Sjamie	/* Write the fetched parameters back to userspace. */
1357191673Sjamie	error = 0;
1358191673Sjamie	TAILQ_FOREACH(opt, opts, link) {
1359191673Sjamie		if (opt->pos >= 0 && opt->pos != errmsg_pos) {
1360191673Sjamie			pos = 2 * opt->pos + 1;
1361191673Sjamie			optuio->uio_iov[pos].iov_len = opt->len;
1362191673Sjamie			if (opt->value != NULL) {
1363191673Sjamie				if (optuio->uio_segflg == UIO_SYSSPACE) {
1364191673Sjamie					bcopy(opt->value,
1365191673Sjamie					    optuio->uio_iov[pos].iov_base,
1366191673Sjamie					    opt->len);
1367191673Sjamie				} else {
1368191673Sjamie					error = copyout(opt->value,
1369191673Sjamie					    optuio->uio_iov[pos].iov_base,
1370191673Sjamie					    opt->len);
1371191673Sjamie					if (error)
1372191673Sjamie						break;
1373191673Sjamie				}
1374191673Sjamie			}
1375191673Sjamie		}
1376185435Sbz	}
1377191673Sjamie	goto done_errmsg;
1378191673Sjamie
1379191673Sjamie done_deref:
1380191673Sjamie	prison_deref(pr, locked | PD_DEREF | PD_LIST_SLOCKED);
1381191673Sjamie	goto done_errmsg;
1382191673Sjamie
1383191673Sjamie done_unlock_list:
1384191673Sjamie	sx_sunlock(&allprison_lock);
1385191673Sjamie done_errmsg:
1386191673Sjamie	if (error && errmsg_pos >= 0) {
1387191673Sjamie		vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len);
1388191673Sjamie		errmsg_pos = 2 * errmsg_pos + 1;
1389191673Sjamie		if (errmsg_len > 0) {
1390191673Sjamie			if (optuio->uio_segflg == UIO_SYSSPACE)
1391191673Sjamie				bcopy(errmsg,
1392191673Sjamie				    optuio->uio_iov[errmsg_pos].iov_base,
1393191673Sjamie				    errmsg_len);
1394191673Sjamie			else
1395191673Sjamie				copyout(errmsg,
1396191673Sjamie				    optuio->uio_iov[errmsg_pos].iov_base,
1397191673Sjamie				    errmsg_len);
1398191673Sjamie		}
1399185435Sbz	}
1400191673Sjamie	vfs_freeopts(opts);
1401191673Sjamie	return (error);
1402191673Sjamie}
1403113275Smike
1404191673Sjamie/*
1405191673Sjamie * struct jail_remove_args {
1406191673Sjamie *	int jid;
1407191673Sjamie * };
1408191673Sjamie */
1409191673Sjamieint
1410191673Sjamiejail_remove(struct thread *td, struct jail_remove_args *uap)
1411191673Sjamie{
1412191673Sjamie	struct prison *pr;
1413191673Sjamie	struct proc *p;
1414191673Sjamie	int deuref, error;
1415185435Sbz
1416191673Sjamie	error = priv_check(td, PRIV_JAIL_REMOVE);
1417185435Sbz	if (error)
1418191673Sjamie		return (error);
1419185435Sbz
1420185435Sbz	sx_xlock(&allprison_lock);
1421191673Sjamie	pr = prison_find(uap->jid);
1422191673Sjamie	if (pr == NULL) {
1423185435Sbz		sx_xunlock(&allprison_lock);
1424191673Sjamie		return (EINVAL);
1425185435Sbz	}
1426185435Sbz
1427191673Sjamie	/* If the prison was persistent, it is not anymore. */
1428191673Sjamie	deuref = 0;
1429191673Sjamie	if (pr->pr_flags & PR_PERSIST) {
1430191673Sjamie		pr->pr_ref--;
1431191673Sjamie		deuref = PD_DEUREF;
1432191673Sjamie		pr->pr_flags &= ~PR_PERSIST;
1433179881Sdelphij	}
1434113275Smike
1435191673Sjamie	/* If there are no references left, remove the prison now. */
1436191673Sjamie	if (pr->pr_ref == 0) {
1437191673Sjamie		prison_deref(pr,
1438191673Sjamie		    deuref | PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED);
1439191673Sjamie		return (0);
1440191673Sjamie	}
1441191673Sjamie
1442191673Sjamie	/*
1443191673Sjamie	 * Keep a temporary reference to make sure this prison sticks around.
1444191673Sjamie	 */
1445191673Sjamie	pr->pr_ref++;
1446113275Smike	mtx_unlock(&pr->pr_mtx);
1447191673Sjamie	sx_xunlock(&allprison_lock);
1448191673Sjamie	/*
1449191673Sjamie	 * Kill all processes unfortunate enough to be attached to this prison.
1450191673Sjamie	 */
1451191673Sjamie	sx_slock(&allproc_lock);
1452191673Sjamie	LIST_FOREACH(p, &allproc, p_list) {
1453191673Sjamie		PROC_LOCK(p);
1454191673Sjamie		if (p->p_state != PRS_NEW && p->p_ucred &&
1455191673Sjamie		    p->p_ucred->cr_prison == pr)
1456191673Sjamie			psignal(p, SIGKILL);
1457191673Sjamie		PROC_UNLOCK(p);
1458191673Sjamie	}
1459191673Sjamie	sx_sunlock(&allproc_lock);
1460191673Sjamie	/* Remove the temporary reference. */
1461191673Sjamie	prison_deref(pr, deuref | PD_DEREF);
1462113275Smike	return (0);
1463113275Smike}
1464113275Smike
1465190466Sjamie
1466113275Smike/*
1467114168Smike * struct jail_attach_args {
1468114168Smike *	int jid;
1469114168Smike * };
1470113275Smike */
1471113275Smikeint
1472114168Smikejail_attach(struct thread *td, struct jail_attach_args *uap)
1473113275Smike{
1474113275Smike	struct prison *pr;
1475191673Sjamie	int error;
1476167309Spjd
1477164032Srwatson	error = priv_check(td, PRIV_JAIL_ATTACH);
1478126023Snectar	if (error)
1479126023Snectar		return (error);
1480126023Snectar
1481168401Spjd	sx_slock(&allprison_lock);
1482113275Smike	pr = prison_find(uap->jid);
1483113275Smike	if (pr == NULL) {
1484168401Spjd		sx_sunlock(&allprison_lock);
1485113275Smike		return (EINVAL);
1486113275Smike	}
1487185435Sbz
1488185435Sbz	/*
1489185435Sbz	 * Do not allow a process to attach to a prison that is not
1490191673Sjamie	 * considered to be "alive".
1491185435Sbz	 */
1492191673Sjamie	if (pr->pr_uref == 0) {
1493185435Sbz		mtx_unlock(&pr->pr_mtx);
1494185435Sbz		sx_sunlock(&allprison_lock);
1495185435Sbz		return (EINVAL);
1496185435Sbz	}
1497191673Sjamie
1498191673Sjamie	return (do_jail_attach(td, pr));
1499191673Sjamie}
1500191673Sjamie
1501191673Sjamiestatic int
1502191673Sjamiedo_jail_attach(struct thread *td, struct prison *pr)
1503191673Sjamie{
1504191673Sjamie	struct proc *p;
1505191673Sjamie	struct ucred *newcred, *oldcred;
1506191673Sjamie	int vfslocked, error;
1507191673Sjamie
1508191673Sjamie	/*
1509191673Sjamie	 * XXX: Note that there is a slight race here if two threads
1510191673Sjamie	 * in the same privileged process attempt to attach to two
1511191673Sjamie	 * different jails at the same time.  It is important for
1512191673Sjamie	 * user processes not to do this, or they might end up with
1513191673Sjamie	 * a process root from one prison, but attached to the jail
1514191673Sjamie	 * of another.
1515191673Sjamie	 */
1516113275Smike	pr->pr_ref++;
1517191673Sjamie	pr->pr_uref++;
1518113275Smike	mtx_unlock(&pr->pr_mtx);
1519191673Sjamie
1520191673Sjamie	/* Let modules do whatever they need to prepare for attaching. */
1521191673Sjamie	error = osd_jail_call(pr, PR_METHOD_ATTACH, td);
1522191673Sjamie	if (error) {
1523191673Sjamie		prison_deref(pr, PD_DEREF | PD_DEUREF | PD_LIST_SLOCKED);
1524191673Sjamie		return (error);
1525191673Sjamie	}
1526168401Spjd	sx_sunlock(&allprison_lock);
1527113275Smike
1528185435Sbz	/*
1529185435Sbz	 * Reparent the newly attached process to this jail.
1530185435Sbz	 */
1531191673Sjamie	p = td->td_proc;
1532185435Sbz	error = cpuset_setproc_update_set(p, pr->pr_cpuset);
1533185435Sbz	if (error)
1534191673Sjamie		goto e_revert_osd;
1535185435Sbz
1536150652Scsjp	vfslocked = VFS_LOCK_GIANT(pr->pr_root->v_mount);
1537175202Sattilio	vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY);
1538113275Smike	if ((error = change_dir(pr->pr_root, td)) != 0)
1539113275Smike		goto e_unlock;
1540113275Smike#ifdef MAC
1541172930Srwatson	if ((error = mac_vnode_check_chroot(td->td_ucred, pr->pr_root)))
1542113275Smike		goto e_unlock;
1543113275Smike#endif
1544175294Sattilio	VOP_UNLOCK(pr->pr_root, 0);
1545191673Sjamie	if ((error = change_root(pr->pr_root, td)))
1546191673Sjamie		goto e_unlock_giant;
1547150652Scsjp	VFS_UNLOCK_GIANT(vfslocked);
1548113275Smike
154984828Sjhb	newcred = crget();
155084828Sjhb	PROC_LOCK(p);
155184828Sjhb	oldcred = p->p_ucred;
1552113275Smike	setsugid(p);
155384828Sjhb	crcopy(newcred, oldcred);
1554113630Sjhb	newcred->cr_prison = pr;
155584828Sjhb	p->p_ucred = newcred;
155684828Sjhb	PROC_UNLOCK(p);
155784828Sjhb	crfree(oldcred);
155846155Sphk	return (0);
1559191673Sjamie e_unlock:
1560175294Sattilio	VOP_UNLOCK(pr->pr_root, 0);
1561191673Sjamie e_unlock_giant:
1562150652Scsjp	VFS_UNLOCK_GIANT(vfslocked);
1563191673Sjamie e_revert_osd:
1564191673Sjamie	/* Tell modules this thread is still in its old jail after all. */
1565191673Sjamie	(void)osd_jail_call(td->td_ucred->cr_prison, PR_METHOD_ATTACH, td);
1566191673Sjamie	prison_deref(pr, PD_DEREF | PD_DEUREF);
156746155Sphk	return (error);
156846155Sphk}
156946155Sphk
1570113275Smike/*
1571113275Smike * Returns a locked prison instance, or NULL on failure.
1572113275Smike */
1573168399Spjdstruct prison *
1574113275Smikeprison_find(int prid)
1575113275Smike{
1576113275Smike	struct prison *pr;
1577113275Smike
1578168401Spjd	sx_assert(&allprison_lock, SX_LOCKED);
1579191673Sjamie	TAILQ_FOREACH(pr, &allprison, pr_list) {
1580113275Smike		if (pr->pr_id == prid) {
1581113275Smike			mtx_lock(&pr->pr_mtx);
1582191673Sjamie			if (pr->pr_ref > 0)
1583191673Sjamie				return (pr);
1584191673Sjamie			mtx_unlock(&pr->pr_mtx);
1585113275Smike		}
1586113275Smike	}
1587113275Smike	return (NULL);
1588113275Smike}
1589113275Smike
1590191673Sjamie/*
1591191673Sjamie * Look for the named prison.  Returns a locked prison or NULL.
1592191673Sjamie */
1593191673Sjamiestruct prison *
1594191673Sjamieprison_find_name(const char *name)
1595191673Sjamie{
1596191673Sjamie	struct prison *pr, *deadpr;
1597191673Sjamie
1598191673Sjamie	sx_assert(&allprison_lock, SX_LOCKED);
1599191673Sjamie again:
1600191673Sjamie	deadpr = NULL;
1601191673Sjamie	TAILQ_FOREACH(pr, &allprison, pr_list) {
1602191673Sjamie		if (!strcmp(pr->pr_name, name)) {
1603191673Sjamie			mtx_lock(&pr->pr_mtx);
1604191673Sjamie			if (pr->pr_ref > 0) {
1605191673Sjamie				if (pr->pr_uref > 0)
1606191673Sjamie					return (pr);
1607191673Sjamie				deadpr = pr;
1608191673Sjamie			}
1609191673Sjamie			mtx_unlock(&pr->pr_mtx);
1610191673Sjamie		}
1611191673Sjamie	}
1612191673Sjamie	/* There was no valid prison - perhaps there was a dying one */
1613191673Sjamie	if (deadpr != NULL) {
1614191673Sjamie		mtx_lock(&deadpr->pr_mtx);
1615191673Sjamie		if (deadpr->pr_ref == 0) {
1616191673Sjamie			mtx_unlock(&deadpr->pr_mtx);
1617191673Sjamie			goto again;
1618191673Sjamie		}
1619191673Sjamie	}
1620191673Sjamie	return (deadpr);
1621191673Sjamie}
1622191673Sjamie
1623191673Sjamie/*
1624191673Sjamie * Remove a prison reference.  If that was the last reference, remove the
1625191673Sjamie * prison itself - but not in this context in case there are locks held.
1626191673Sjamie */
162772786Srwatsonvoid
1628185029Spjdprison_free_locked(struct prison *pr)
162972786Srwatson{
163072786Srwatson
1631185029Spjd	mtx_assert(&pr->pr_mtx, MA_OWNED);
163272786Srwatson	pr->pr_ref--;
163372786Srwatson	if (pr->pr_ref == 0) {
1634168483Spjd		mtx_unlock(&pr->pr_mtx);
1635124882Srwatson		TASK_INIT(&pr->pr_task, 0, prison_complete, pr);
1636144660Sjeff		taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
163787275Srwatson		return;
163872786Srwatson	}
163987275Srwatson	mtx_unlock(&pr->pr_mtx);
164072786Srwatson}
164172786Srwatson
1642185029Spjdvoid
1643185029Spjdprison_free(struct prison *pr)
1644185029Spjd{
1645185029Spjd
1646185029Spjd	mtx_lock(&pr->pr_mtx);
1647185029Spjd	prison_free_locked(pr);
1648185029Spjd}
1649185029Spjd
/*
 * Task handler queued by prison_free_locked().  'context' is the prison;
 * 'pending' is unused.  Calls prison_deref() with no flags, so no further
 * reference is dropped here.
 */
static void
prison_complete(void *context, int pending)
{
	struct prison *pr;

	pr = context;
	prison_deref(pr, 0);
}
1656191673Sjamie
1657191673Sjamie/*
1658191673Sjamie * Remove a prison reference (usually).  This internal version assumes no
1659191673Sjamie * mutexes are held, except perhaps the prison itself.  If there are no more
1660191673Sjamie * references, release and delist the prison.  On completion, the prison lock
1661191673Sjamie * and the allprison lock are both unlocked.
1662191673Sjamie */
1663191673Sjamiestatic void
1664191673Sjamieprison_deref(struct prison *pr, int flags)
1665191673Sjamie{
1666150652Scsjp	int vfslocked;
1667124882Srwatson
1668191673Sjamie	if (!(flags & PD_LOCKED))
1669191673Sjamie		mtx_lock(&pr->pr_mtx);
1670191673Sjamie	if (flags & PD_DEUREF) {
1671191673Sjamie		pr->pr_uref--;
1672191673Sjamie		/* Done if there were only user references to remove. */
1673191673Sjamie		if (!(flags & PD_DEREF)) {
1674191673Sjamie			mtx_unlock(&pr->pr_mtx);
1675191673Sjamie			if (flags & PD_LIST_SLOCKED)
1676191673Sjamie				sx_sunlock(&allprison_lock);
1677191673Sjamie			else if (flags & PD_LIST_XLOCKED)
1678191673Sjamie				sx_xunlock(&allprison_lock);
1679191673Sjamie			return;
1680191673Sjamie		}
1681191673Sjamie	}
1682191673Sjamie	if (flags & PD_DEREF)
1683191673Sjamie		pr->pr_ref--;
1684191673Sjamie	/* If the prison still has references, nothing else to do. */
1685191673Sjamie	if (pr->pr_ref > 0) {
1686191673Sjamie		mtx_unlock(&pr->pr_mtx);
1687191673Sjamie		if (flags & PD_LIST_SLOCKED)
1688191673Sjamie			sx_sunlock(&allprison_lock);
1689191673Sjamie		else if (flags & PD_LIST_XLOCKED)
1690191673Sjamie			sx_xunlock(&allprison_lock);
1691191673Sjamie		return;
1692191673Sjamie	}
1693124882Srwatson
1694191673Sjamie	KASSERT(pr->pr_uref == 0,
1695191673Sjamie	    ("%s: Trying to remove an active prison (jid=%d).", __func__,
1696191673Sjamie	    pr->pr_id));
1697191673Sjamie	mtx_unlock(&pr->pr_mtx);
1698191673Sjamie	if (flags & PD_LIST_SLOCKED) {
1699191673Sjamie		if (!sx_try_upgrade(&allprison_lock)) {
1700191673Sjamie			sx_sunlock(&allprison_lock);
1701191673Sjamie			sx_xlock(&allprison_lock);
1702191673Sjamie		}
1703191673Sjamie	} else if (!(flags & PD_LIST_XLOCKED))
1704191673Sjamie		sx_xlock(&allprison_lock);
1705191673Sjamie
1706191673Sjamie	TAILQ_REMOVE(&allprison, pr, pr_list);
1707168489Spjd	prisoncount--;
1708185029Spjd	sx_xunlock(&allprison_lock);
1709168489Spjd
1710191673Sjamie	if (pr->pr_root != NULL) {
1711191673Sjamie		vfslocked = VFS_LOCK_GIANT(pr->pr_root->v_mount);
1712191673Sjamie		vrele(pr->pr_root);
1713191673Sjamie		VFS_UNLOCK_GIANT(vfslocked);
1714191673Sjamie	}
1715124882Srwatson	mtx_destroy(&pr->pr_mtx);
1716185435Sbz	free(pr->pr_linux, M_PRISON);
1717191673Sjamie#ifdef INET
1718191673Sjamie	free(pr->pr_ip4, M_PRISON);
1719191673Sjamie#endif
1720185435Sbz#ifdef INET6
1721185435Sbz	free(pr->pr_ip6, M_PRISON);
1722185435Sbz#endif
1723191673Sjamie	if (pr->pr_cpuset != NULL)
1724191673Sjamie		cpuset_rel(pr->pr_cpuset);
1725191673Sjamie	osd_jail_exit(pr);
1726184205Sdes	free(pr, M_PRISON);
1727124882Srwatson}
1728124882Srwatson
172972786Srwatsonvoid
1730185029Spjdprison_hold_locked(struct prison *pr)
173172786Srwatson{
173272786Srwatson
1733185029Spjd	mtx_assert(&pr->pr_mtx, MA_OWNED);
1734168489Spjd	KASSERT(pr->pr_ref > 0,
1735191671Sjamie	    ("Trying to hold dead prison (jid=%d).", pr->pr_id));
173672786Srwatson	pr->pr_ref++;
1737185029Spjd}
1738185029Spjd
1739185029Spjdvoid
1740185029Spjdprison_hold(struct prison *pr)
1741185029Spjd{
1742185029Spjd
1743185029Spjd	mtx_lock(&pr->pr_mtx);
1744185029Spjd	prison_hold_locked(pr);
174587275Srwatson	mtx_unlock(&pr->pr_mtx);
174672786Srwatson}
174772786Srwatson
1748185435Sbzvoid
1749185435Sbzprison_proc_hold(struct prison *pr)
175087275Srwatson{
175187275Srwatson
1752185435Sbz	mtx_lock(&pr->pr_mtx);
1753191673Sjamie	KASSERT(pr->pr_uref > 0,
1754191673Sjamie	    ("Cannot add a process to a non-alive prison (jid=%d)", pr->pr_id));
1755191673Sjamie	pr->pr_uref++;
1756185435Sbz	mtx_unlock(&pr->pr_mtx);
175787275Srwatson}
175887275Srwatson
1759185435Sbzvoid
1760185435Sbzprison_proc_free(struct prison *pr)
1761185435Sbz{
1762185435Sbz
1763185435Sbz	mtx_lock(&pr->pr_mtx);
1764191673Sjamie	KASSERT(pr->pr_uref > 0,
1765191673Sjamie	    ("Trying to kill a process in a dead prison (jid=%d)", pr->pr_id));
1766191673Sjamie	prison_deref(pr, PD_DEUREF | PD_LOCKED);
1767185435Sbz}
1768185435Sbz
1769185435Sbz
1770185435Sbz#ifdef INET
1771185435Sbz/*
1772185435Sbz * Pass back primary IPv4 address of this jail.
1773185435Sbz *
1774185435Sbz * If not jailed return success but do not alter the address.  Caller has to
1775190466Sjamie * make sure to initialize it correctly (e.g. INADDR_ANY).
1776185435Sbz *
1777188144Sjamie * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv4.
1778188144Sjamie * Address returned in NBO.
1779185435Sbz */
178046155Sphkint
1781187684Sbzprison_get_ip4(struct ucred *cred, struct in_addr *ia)
178246155Sphk{
1783191673Sjamie	struct prison *pr;
178446155Sphk
1785185435Sbz	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
1786185435Sbz	KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
1787185435Sbz
178872786Srwatson	if (!jailed(cred))
178946155Sphk		return (0);
1790191673Sjamie	pr = cred->cr_prison;
1791191673Sjamie	mtx_lock(&pr->pr_mtx);
1792191673Sjamie	if (pr->pr_ip4 == NULL) {
1793191673Sjamie		mtx_unlock(&pr->pr_mtx);
1794188144Sjamie		return (EAFNOSUPPORT);
1795191673Sjamie	}
1796185435Sbz
1797191673Sjamie	ia->s_addr = pr->pr_ip4[0].s_addr;
1798191673Sjamie	mtx_unlock(&pr->pr_mtx);
1799185435Sbz	return (0);
1800185435Sbz}
1801185435Sbz
1802185435Sbz/*
1803185435Sbz * Make sure our (source) address is set to something meaningful to this
1804185435Sbz * jail.
1805185435Sbz *
1806188144Sjamie * Returns 0 if not jailed or if address belongs to jail, EADDRNOTAVAIL if
1807188144Sjamie * the address doesn't belong, or EAFNOSUPPORT if the jail doesn't allow IPv4.
1808188144Sjamie * Address passed in in NBO and returned in NBO.
1809185435Sbz */
1810185435Sbzint
1811185435Sbzprison_local_ip4(struct ucred *cred, struct in_addr *ia)
1812185435Sbz{
1813191673Sjamie	struct prison *pr;
1814185435Sbz	struct in_addr ia0;
1815191673Sjamie	int error;
1816185435Sbz
1817185435Sbz	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
1818185435Sbz	KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
1819185435Sbz
1820185435Sbz	if (!jailed(cred))
182146155Sphk		return (0);
1822191673Sjamie	pr = cred->cr_prison;
1823191673Sjamie	mtx_lock(&pr->pr_mtx);
1824191673Sjamie	if (pr->pr_ip4 == NULL) {
1825191673Sjamie		mtx_unlock(&pr->pr_mtx);
1826188144Sjamie		return (EAFNOSUPPORT);
1827191673Sjamie	}
1828185435Sbz
1829185435Sbz	ia0.s_addr = ntohl(ia->s_addr);
1830185435Sbz	if (ia0.s_addr == INADDR_LOOPBACK) {
1831191673Sjamie		ia->s_addr = pr->pr_ip4[0].s_addr;
1832191673Sjamie		mtx_unlock(&pr->pr_mtx);
1833185435Sbz		return (0);
183446155Sphk	}
1835185435Sbz
1836188144Sjamie	if (ia0.s_addr == INADDR_ANY) {
1837188144Sjamie		/*
1838188144Sjamie		 * In case there is only 1 IPv4 address, bind directly.
1839188144Sjamie		 */
1840191673Sjamie		if (pr->pr_ip4s == 1)
1841191673Sjamie			ia->s_addr = pr->pr_ip4[0].s_addr;
1842191673Sjamie		mtx_unlock(&pr->pr_mtx);
1843185435Sbz		return (0);
1844185435Sbz	}
1845185435Sbz
1846191673Sjamie	error = _prison_check_ip4(pr, ia);
1847191673Sjamie	mtx_unlock(&pr->pr_mtx);
1848191673Sjamie	return (error);
1849185435Sbz}
1850185435Sbz
1851185435Sbz/*
1852185435Sbz * Rewrite destination address in case we will connect to loopback address.
1853185435Sbz *
1854188144Sjamie * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv4.
1855188144Sjamie * Address passed in in NBO and returned in NBO.
1856185435Sbz */
1857185435Sbzint
1858185435Sbzprison_remote_ip4(struct ucred *cred, struct in_addr *ia)
1859185435Sbz{
1860191673Sjamie	struct prison *pr;
1861185435Sbz
1862185435Sbz	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
1863185435Sbz	KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
1864185435Sbz
1865185435Sbz	if (!jailed(cred))
1866185435Sbz		return (0);
1867191673Sjamie	pr = cred->cr_prison;
1868191673Sjamie	mtx_lock(&pr->pr_mtx);
1869191673Sjamie	if (pr->pr_ip4 == NULL) {
1870191673Sjamie		mtx_unlock(&pr->pr_mtx);
1871188144Sjamie		return (EAFNOSUPPORT);
1872191673Sjamie	}
1873188144Sjamie
1874185435Sbz	if (ntohl(ia->s_addr) == INADDR_LOOPBACK) {
1875191673Sjamie		ia->s_addr = pr->pr_ip4[0].s_addr;
1876191673Sjamie		mtx_unlock(&pr->pr_mtx);
1877185435Sbz		return (0);
1878185435Sbz	}
1879185435Sbz
1880185435Sbz	/*
1881185435Sbz	 * Return success because nothing had to be changed.
1882185435Sbz	 */
1883191673Sjamie	mtx_unlock(&pr->pr_mtx);
1884185435Sbz	return (0);
1885185435Sbz}
1886185435Sbz
1887185435Sbz/*
1888188144Sjamie * Check if given address belongs to the jail referenced by cred/prison.
1889185435Sbz *
1890188144Sjamie * Returns 0 if not jailed or if address belongs to jail, EADDRNOTAVAIL if
1891188144Sjamie * the address doesn't belong, or EAFNOSUPPORT if the jail doesn't allow IPv4.
1892188144Sjamie * Address passed in in NBO.
1893185435Sbz */
1894185435Sbzstatic int
1895185435Sbz_prison_check_ip4(struct prison *pr, struct in_addr *ia)
1896185435Sbz{
1897185435Sbz	int i, a, z, d;
1898185435Sbz
1899185435Sbz	/*
1900185435Sbz	 * Check the primary IP.
1901185435Sbz	 */
1902185435Sbz	if (pr->pr_ip4[0].s_addr == ia->s_addr)
1903188144Sjamie		return (0);
1904185435Sbz
1905185435Sbz	/*
1906185435Sbz	 * All the other IPs are sorted so we can do a binary search.
1907185435Sbz	 */
1908185435Sbz	a = 0;
1909185435Sbz	z = pr->pr_ip4s - 2;
1910185435Sbz	while (a <= z) {
1911185435Sbz		i = (a + z) / 2;
1912185435Sbz		d = qcmp_v4(&pr->pr_ip4[i+1], ia);
1913185435Sbz		if (d > 0)
1914185435Sbz			z = i - 1;
1915185435Sbz		else if (d < 0)
1916185435Sbz			a = i + 1;
191781114Srwatson		else
1918188144Sjamie			return (0);
1919185435Sbz	}
1920188144Sjamie
1921188144Sjamie	return (EADDRNOTAVAIL);
1922185435Sbz}
1923185435Sbz
1924185435Sbzint
1925185435Sbzprison_check_ip4(struct ucred *cred, struct in_addr *ia)
1926185435Sbz{
1927191673Sjamie	struct prison *pr;
1928191673Sjamie	int error;
1929185435Sbz
1930185435Sbz	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
1931185435Sbz	KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
1932185435Sbz
1933185435Sbz	if (!jailed(cred))
1934188144Sjamie		return (0);
1935191673Sjamie	pr = cred->cr_prison;
1936191673Sjamie	mtx_lock(&pr->pr_mtx);
1937191673Sjamie	if (pr->pr_ip4 == NULL) {
1938191673Sjamie		mtx_unlock(&pr->pr_mtx);
1939188144Sjamie		return (EAFNOSUPPORT);
1940191673Sjamie	}
1941185435Sbz
1942191673Sjamie	error = _prison_check_ip4(pr, ia);
1943191673Sjamie	mtx_unlock(&pr->pr_mtx);
1944191673Sjamie	return (error);
1945185435Sbz}
1946185435Sbz#endif
1947185435Sbz
1948185435Sbz#ifdef INET6
1949185435Sbz/*
1950185435Sbz * Pass back primary IPv6 address for this jail.
1951185435Sbz *
1952185435Sbz * If not jailed return success but do not alter the address.  Caller has to
1953190466Sjamie * make sure to initialize it correctly (e.g. IN6ADDR_ANY_INIT).
1954185435Sbz *
1955188144Sjamie * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv6.
1956185435Sbz */
1957185435Sbzint
1958187684Sbzprison_get_ip6(struct ucred *cred, struct in6_addr *ia6)
1959185435Sbz{
1960191673Sjamie	struct prison *pr;
1961185435Sbz
1962185435Sbz	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
1963185435Sbz	KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
1964185435Sbz
1965185435Sbz	if (!jailed(cred))
196681114Srwatson		return (0);
1967191673Sjamie	pr = cred->cr_prison;
1968191673Sjamie	mtx_lock(&pr->pr_mtx);
1969191673Sjamie	if (pr->pr_ip6 == NULL) {
1970191673Sjamie		mtx_unlock(&pr->pr_mtx);
1971188144Sjamie		return (EAFNOSUPPORT);
1972191673Sjamie	}
1973188144Sjamie
1974191673Sjamie	bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
1975191673Sjamie	mtx_unlock(&pr->pr_mtx);
1976185435Sbz	return (0);
1977185435Sbz}
1978185435Sbz
1979185435Sbz/*
1980185435Sbz * Make sure our (source) address is set to something meaningful to this jail.
1981185435Sbz *
1982185435Sbz * v6only should be set based on (inp->inp_flags & IN6P_IPV6_V6ONLY != 0)
1983185435Sbz * when needed while binding.
1984185435Sbz *
1985188144Sjamie * Returns 0 if not jailed or if address belongs to jail, EADDRNOTAVAIL if
1986188144Sjamie * the address doesn't belong, or EAFNOSUPPORT if the jail doesn't allow IPv6.
1987185435Sbz */
1988185435Sbzint
1989185435Sbzprison_local_ip6(struct ucred *cred, struct in6_addr *ia6, int v6only)
1990185435Sbz{
1991191673Sjamie	struct prison *pr;
1992191673Sjamie	int error;
1993185435Sbz
1994185435Sbz	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
1995185435Sbz	KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
1996185435Sbz
1997185435Sbz	if (!jailed(cred))
1998185435Sbz		return (0);
1999191673Sjamie	pr = cred->cr_prison;
2000191673Sjamie	mtx_lock(&pr->pr_mtx);
2001191673Sjamie	if (pr->pr_ip6 == NULL) {
2002191673Sjamie		mtx_unlock(&pr->pr_mtx);
2003188144Sjamie		return (EAFNOSUPPORT);
2004191673Sjamie	}
2005188144Sjamie
2006185435Sbz	if (IN6_IS_ADDR_LOOPBACK(ia6)) {
2007191673Sjamie		bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
2008191673Sjamie		mtx_unlock(&pr->pr_mtx);
2009185435Sbz		return (0);
201081114Srwatson	}
2011185435Sbz
2012188144Sjamie	if (IN6_IS_ADDR_UNSPECIFIED(ia6)) {
2013188144Sjamie		/*
2014188144Sjamie		 * In case there is only 1 IPv6 address, and v6only is true,
2015188144Sjamie		 * then bind directly.
2016188144Sjamie		 */
2017191673Sjamie		if (v6only != 0 && pr->pr_ip6s == 1)
2018191673Sjamie			bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
2019191673Sjamie		mtx_unlock(&pr->pr_mtx);
2020185435Sbz		return (0);
2021185435Sbz	}
2022188144Sjamie
2023191673Sjamie	error = _prison_check_ip6(pr, ia6);
2024191673Sjamie	mtx_unlock(&pr->pr_mtx);
2025191673Sjamie	return (error);
2026185435Sbz}
2027185435Sbz
2028185435Sbz/*
2029185435Sbz * Rewrite destination address in case we will connect to loopback address.
2030185435Sbz *
2031188144Sjamie * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv6.
2032185435Sbz */
2033185435Sbzint
2034185435Sbzprison_remote_ip6(struct ucred *cred, struct in6_addr *ia6)
2035185435Sbz{
2036191673Sjamie	struct prison *pr;
2037185435Sbz
2038185435Sbz	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
2039185435Sbz	KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
2040185435Sbz
2041185435Sbz	if (!jailed(cred))
2042185435Sbz		return (0);
2043191673Sjamie	pr = cred->cr_prison;
2044191673Sjamie	mtx_lock(&pr->pr_mtx);
2045191673Sjamie	if (pr->pr_ip6 == NULL) {
2046191673Sjamie		mtx_unlock(&pr->pr_mtx);
2047188144Sjamie		return (EAFNOSUPPORT);
2048191673Sjamie	}
2049188144Sjamie
2050185435Sbz	if (IN6_IS_ADDR_LOOPBACK(ia6)) {
2051191673Sjamie		bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
2052191673Sjamie		mtx_unlock(&pr->pr_mtx);
2053185435Sbz		return (0);
2054185435Sbz	}
2055185435Sbz
2056185435Sbz	/*
2057185435Sbz	 * Return success because nothing had to be changed.
2058185435Sbz	 */
2059191673Sjamie	mtx_unlock(&pr->pr_mtx);
206046155Sphk	return (0);
206146155Sphk}
206246155Sphk
2063185435Sbz/*
2064188144Sjamie * Check if given address belongs to the jail referenced by cred/prison.
2065185435Sbz *
2066188144Sjamie * Returns 0 if not jailed or if address belongs to jail, EADDRNOTAVAIL if
2067188144Sjamie * the address doesn't belong, or EAFNOSUPPORT if the jail doesn't allow IPv6.
2068185435Sbz */
2069185435Sbzstatic int
2070185435Sbz_prison_check_ip6(struct prison *pr, struct in6_addr *ia6)
207146155Sphk{
2072185435Sbz	int i, a, z, d;
207346155Sphk
2074185435Sbz	/*
2075185435Sbz	 * Check the primary IP.
2076185435Sbz	 */
2077185435Sbz	if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[0], ia6))
2078188144Sjamie		return (0);
2079185435Sbz
2080185435Sbz	/*
2081185435Sbz	 * All the other IPs are sorted so we can do a binary search.
2082185435Sbz	 */
2083185435Sbz	a = 0;
2084185435Sbz	z = pr->pr_ip6s - 2;
2085185435Sbz	while (a <= z) {
2086185435Sbz		i = (a + z) / 2;
2087185435Sbz		d = qcmp_v6(&pr->pr_ip6[i+1], ia6);
2088185435Sbz		if (d > 0)
2089185435Sbz			z = i - 1;
2090185435Sbz		else if (d < 0)
2091185435Sbz			a = i + 1;
209246155Sphk		else
2093188144Sjamie			return (0);
209446155Sphk	}
2095188144Sjamie
2096188144Sjamie	return (EADDRNOTAVAIL);
209746155Sphk}
209846155Sphk
209946155Sphkint
2100185435Sbzprison_check_ip6(struct ucred *cred, struct in6_addr *ia6)
2101185435Sbz{
2102191673Sjamie	struct prison *pr;
2103191673Sjamie	int error;
2104185435Sbz
2105185435Sbz	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
2106185435Sbz	KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
2107185435Sbz
2108185435Sbz	if (!jailed(cred))
2109188144Sjamie		return (0);
2110191673Sjamie	pr = cred->cr_prison;
2111191673Sjamie	mtx_lock(&pr->pr_mtx);
2112191673Sjamie	if (pr->pr_ip6 == NULL) {
2113191673Sjamie		mtx_unlock(&pr->pr_mtx);
2114188144Sjamie		return (EAFNOSUPPORT);
2115191673Sjamie	}
2116185435Sbz
2117191673Sjamie	error = _prison_check_ip6(pr, ia6);
2118191673Sjamie	mtx_unlock(&pr->pr_mtx);
2119191673Sjamie	return (error);
2120185435Sbz}
2121185435Sbz#endif
2122185435Sbz
2123185435Sbz/*
2124188146Sjamie * Check if a jail supports the given address family.
2125188146Sjamie *
2126188146Sjamie * Returns 0 if not jailed or the address family is supported, EAFNOSUPPORT
2127188146Sjamie * if not.
2128188146Sjamie */
2129188146Sjamieint
2130188146Sjamieprison_check_af(struct ucred *cred, int af)
2131188146Sjamie{
2132188146Sjamie	int error;
2133188146Sjamie
2134188146Sjamie	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
2135188146Sjamie
2136188146Sjamie
2137188146Sjamie	if (!jailed(cred))
2138188146Sjamie		return (0);
2139188146Sjamie
2140188146Sjamie	error = 0;
2141188146Sjamie	switch (af)
2142188146Sjamie	{
2143188146Sjamie#ifdef INET
2144188146Sjamie	case AF_INET:
2145188146Sjamie		if (cred->cr_prison->pr_ip4 == NULL)
2146188146Sjamie			error = EAFNOSUPPORT;
2147188146Sjamie		break;
2148188146Sjamie#endif
2149188146Sjamie#ifdef INET6
2150188146Sjamie	case AF_INET6:
2151188146Sjamie		if (cred->cr_prison->pr_ip6 == NULL)
2152188146Sjamie			error = EAFNOSUPPORT;
2153188146Sjamie		break;
2154188146Sjamie#endif
2155188146Sjamie	case AF_LOCAL:
2156188146Sjamie	case AF_ROUTE:
2157188146Sjamie		break;
2158188146Sjamie	default:
2159188146Sjamie		if (jail_socket_unixiproute_only)
2160188146Sjamie			error = EAFNOSUPPORT;
2161188146Sjamie	}
2162188146Sjamie	return (error);
2163188146Sjamie}
2164188146Sjamie
2165188146Sjamie/*
2166185435Sbz * Check if given address belongs to the jail referenced by cred (wrapper to
2167185435Sbz * prison_check_ip[46]).
2168185435Sbz *
2169188144Sjamie * Returns 0 if not jailed or if address belongs to jail, EADDRNOTAVAIL if
2170188144Sjamie * the address doesn't belong, or EAFNOSUPPORT if the jail doesn't allow
2171188144Sjamie * the address family.  IPv4 Address passed in in NBO.
2172185435Sbz */
2173185435Sbzint
217472786Srwatsonprison_if(struct ucred *cred, struct sockaddr *sa)
217546155Sphk{
2176185435Sbz#ifdef INET
2177114168Smike	struct sockaddr_in *sai;
2178185435Sbz#endif
2179185435Sbz#ifdef INET6
2180185435Sbz	struct sockaddr_in6 *sai6;
2181185435Sbz#endif
2182188144Sjamie	int error;
218346155Sphk
2184185435Sbz	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
2185185435Sbz	KASSERT(sa != NULL, ("%s: sa is NULL", __func__));
2186185435Sbz
2187188144Sjamie	error = 0;
2188188144Sjamie	switch (sa->sa_family)
2189185435Sbz	{
2190185435Sbz#ifdef INET
2191185435Sbz	case AF_INET:
2192185435Sbz		sai = (struct sockaddr_in *)sa;
2193188144Sjamie		error = prison_check_ip4(cred, &sai->sin_addr);
2194185435Sbz		break;
2195185435Sbz#endif
2196185435Sbz#ifdef INET6
2197185435Sbz	case AF_INET6:
2198185435Sbz		sai6 = (struct sockaddr_in6 *)sa;
2199188144Sjamie		error = prison_check_ip6(cred, &sai6->sin6_addr);
2200185435Sbz		break;
2201185435Sbz#endif
2202185435Sbz	default:
2203188144Sjamie		if (jailed(cred) && jail_socket_unixiproute_only)
2204188144Sjamie			error = EAFNOSUPPORT;
2205185435Sbz	}
2206188144Sjamie	return (error);
220746155Sphk}
220872786Srwatson
220972786Srwatson/*
221072786Srwatson * Return 0 if jails permit p1 to frob p2, otherwise ESRCH.
221172786Srwatson */
221272786Srwatsonint
2213114168Smikeprison_check(struct ucred *cred1, struct ucred *cred2)
221472786Srwatson{
221572786Srwatson
221672786Srwatson	if (jailed(cred1)) {
221772786Srwatson		if (!jailed(cred2))
221872786Srwatson			return (ESRCH);
221972786Srwatson		if (cred2->cr_prison != cred1->cr_prison)
222072786Srwatson			return (ESRCH);
222172786Srwatson	}
222272786Srwatson
222372786Srwatson	return (0);
222472786Srwatson}
222572786Srwatson
222672786Srwatson/*
222772786Srwatson * Return 1 if the passed credential is in a jail, otherwise 0.
222872786Srwatson */
222972786Srwatsonint
2230114168Smikejailed(struct ucred *cred)
223172786Srwatson{
223272786Srwatson
223372786Srwatson	return (cred->cr_prison != NULL);
223472786Srwatson}
223591384Srobert
223691384Srobert/*
223791384Srobert * Return the correct hostname for the passed credential.
223891384Srobert */
223991391Srobertvoid
2240114168Smikegetcredhostname(struct ucred *cred, char *buf, size_t size)
224191384Srobert{
2242183550Szec	INIT_VPROCG(cred->cr_vimage->v_procg);
224391384Srobert
224491391Srobert	if (jailed(cred)) {
224591391Srobert		mtx_lock(&cred->cr_prison->pr_mtx);
2246105354Srobert		strlcpy(buf, cred->cr_prison->pr_host, size);
224791391Srobert		mtx_unlock(&cred->cr_prison->pr_mtx);
2248180291Srwatson	} else {
2249180291Srwatson		mtx_lock(&hostname_mtx);
2250181803Sbz		strlcpy(buf, V_hostname, size);
2251180291Srwatson		mtx_unlock(&hostname_mtx);
2252180291Srwatson	}
225391384Srobert}
2254113275Smike
2255125804Srwatson/*
2256147185Spjd * Determine whether the subject represented by cred can "see"
2257147185Spjd * status of a mount point.
2258147185Spjd * Returns: 0 for permitted, ENOENT otherwise.
2259147185Spjd * XXX: This function should be called cr_canseemount() and should be
2260147185Spjd *      placed in kern_prot.c.
2261125804Srwatson */
2262125804Srwatsonint
2263147185Spjdprison_canseemount(struct ucred *cred, struct mount *mp)
2264125804Srwatson{
2265147185Spjd	struct prison *pr;
2266147185Spjd	struct statfs *sp;
2267147185Spjd	size_t len;
2268125804Srwatson
2269147185Spjd	if (!jailed(cred) || jail_enforce_statfs == 0)
2270147185Spjd		return (0);
2271147185Spjd	pr = cred->cr_prison;
2272147185Spjd	if (pr->pr_root->v_mount == mp)
2273147185Spjd		return (0);
2274147185Spjd	if (jail_enforce_statfs == 2)
2275147185Spjd		return (ENOENT);
2276147185Spjd	/*
2277147185Spjd	 * If jail's chroot directory is set to "/" we should be able to see
2278147185Spjd	 * all mount-points from inside a jail.
2279147185Spjd	 * This is ugly check, but this is the only situation when jail's
2280147185Spjd	 * directory ends with '/'.
2281147185Spjd	 */
2282147185Spjd	if (strcmp(pr->pr_path, "/") == 0)
2283147185Spjd		return (0);
2284147185Spjd	len = strlen(pr->pr_path);
2285147185Spjd	sp = &mp->mnt_stat;
2286147185Spjd	if (strncmp(pr->pr_path, sp->f_mntonname, len) != 0)
2287147185Spjd		return (ENOENT);
2288147185Spjd	/*
2289147185Spjd	 * Be sure that we don't have situation where jail's root directory
2290147185Spjd	 * is "/some/path" and mount point is "/some/pathpath".
2291147185Spjd	 */
2292147185Spjd	if (sp->f_mntonname[len] != '\0' && sp->f_mntonname[len] != '/')
2293147185Spjd		return (ENOENT);
2294147185Spjd	return (0);
2295147185Spjd}
2296147185Spjd
2297147185Spjdvoid
2298147185Spjdprison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp)
2299147185Spjd{
2300147185Spjd	char jpath[MAXPATHLEN];
2301147185Spjd	struct prison *pr;
2302147185Spjd	size_t len;
2303147185Spjd
2304147185Spjd	if (!jailed(cred) || jail_enforce_statfs == 0)
2305147185Spjd		return;
2306147185Spjd	pr = cred->cr_prison;
2307147185Spjd	if (prison_canseemount(cred, mp) != 0) {
2308147185Spjd		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
2309147185Spjd		strlcpy(sp->f_mntonname, "[restricted]",
2310147185Spjd		    sizeof(sp->f_mntonname));
2311147185Spjd		return;
2312125804Srwatson	}
2313147185Spjd	if (pr->pr_root->v_mount == mp) {
2314147185Spjd		/*
2315147185Spjd		 * Clear current buffer data, so we are sure nothing from
2316147185Spjd		 * the valid path left there.
2317147185Spjd		 */
2318147185Spjd		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
2319147185Spjd		*sp->f_mntonname = '/';
2320147185Spjd		return;
2321147185Spjd	}
2322147185Spjd	/*
2323147185Spjd	 * If jail's chroot directory is set to "/" we should be able to see
2324147185Spjd	 * all mount-points from inside a jail.
2325147185Spjd	 */
2326147185Spjd	if (strcmp(pr->pr_path, "/") == 0)
2327147185Spjd		return;
2328147185Spjd	len = strlen(pr->pr_path);
2329147185Spjd	strlcpy(jpath, sp->f_mntonname + len, sizeof(jpath));
2330147185Spjd	/*
2331147185Spjd	 * Clear current buffer data, so we are sure nothing from
2332147185Spjd	 * the valid path left there.
2333147185Spjd	 */
2334147185Spjd	bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
2335147185Spjd	if (*jpath == '\0') {
2336147185Spjd		/* Should never happen. */
2337147185Spjd		*sp->f_mntonname = '/';
2338147185Spjd	} else {
2339147185Spjd		strlcpy(sp->f_mntonname, jpath, sizeof(sp->f_mntonname));
2340147185Spjd	}
2341125804Srwatson}
2342125804Srwatson
/*
 * Check whether permission for a specific privilege is granted within a
 * jail.  We have a specific list of accepted privileges; the rest are denied.
 */
2347164032Srwatsonint
2348164032Srwatsonprison_priv_check(struct ucred *cred, int priv)
2349164032Srwatson{
2350164032Srwatson
2351164032Srwatson	if (!jailed(cred))
2352164032Srwatson		return (0);
2353164032Srwatson
2354164032Srwatson	switch (priv) {
2355164032Srwatson
2356164032Srwatson		/*
2357164032Srwatson		 * Allow ktrace privileges for root in jail.
2358164032Srwatson		 */
2359164032Srwatson	case PRIV_KTRACE:
2360164032Srwatson
2361166827Srwatson#if 0
2362164032Srwatson		/*
2363164032Srwatson		 * Allow jailed processes to configure audit identity and
2364164032Srwatson		 * submit audit records (login, etc).  In the future we may
2365164032Srwatson		 * want to further refine the relationship between audit and
2366164032Srwatson		 * jail.
2367164032Srwatson		 */
2368164032Srwatson	case PRIV_AUDIT_GETAUDIT:
2369164032Srwatson	case PRIV_AUDIT_SETAUDIT:
2370164032Srwatson	case PRIV_AUDIT_SUBMIT:
2371166827Srwatson#endif
2372164032Srwatson
2373164032Srwatson		/*
2374164032Srwatson		 * Allow jailed processes to manipulate process UNIX
2375164032Srwatson		 * credentials in any way they see fit.
2376164032Srwatson		 */
2377164032Srwatson	case PRIV_CRED_SETUID:
2378164032Srwatson	case PRIV_CRED_SETEUID:
2379164032Srwatson	case PRIV_CRED_SETGID:
2380164032Srwatson	case PRIV_CRED_SETEGID:
2381164032Srwatson	case PRIV_CRED_SETGROUPS:
2382164032Srwatson	case PRIV_CRED_SETREUID:
2383164032Srwatson	case PRIV_CRED_SETREGID:
2384164032Srwatson	case PRIV_CRED_SETRESUID:
2385164032Srwatson	case PRIV_CRED_SETRESGID:
2386164032Srwatson
2387164032Srwatson		/*
2388164032Srwatson		 * Jail implements visibility constraints already, so allow
2389164032Srwatson		 * jailed root to override uid/gid-based constraints.
2390164032Srwatson		 */
2391164032Srwatson	case PRIV_SEEOTHERGIDS:
2392164032Srwatson	case PRIV_SEEOTHERUIDS:
2393164032Srwatson
2394164032Srwatson		/*
2395164032Srwatson		 * Jail implements inter-process debugging limits already, so
2396164032Srwatson		 * allow jailed root various debugging privileges.
2397164032Srwatson		 */
2398164032Srwatson	case PRIV_DEBUG_DIFFCRED:
2399164032Srwatson	case PRIV_DEBUG_SUGID:
2400164032Srwatson	case PRIV_DEBUG_UNPRIV:
2401164032Srwatson
2402164032Srwatson		/*
2403164032Srwatson		 * Allow jail to set various resource limits and login
2404164032Srwatson		 * properties, and for now, exceed process resource limits.
2405164032Srwatson		 */
2406164032Srwatson	case PRIV_PROC_LIMIT:
2407164032Srwatson	case PRIV_PROC_SETLOGIN:
2408164032Srwatson	case PRIV_PROC_SETRLIMIT:
2409164032Srwatson
2410164032Srwatson		/*
2411164032Srwatson		 * System V and POSIX IPC privileges are granted in jail.
2412164032Srwatson		 */
2413164032Srwatson	case PRIV_IPC_READ:
2414164032Srwatson	case PRIV_IPC_WRITE:
2415164032Srwatson	case PRIV_IPC_ADMIN:
2416164032Srwatson	case PRIV_IPC_MSGSIZE:
2417164032Srwatson	case PRIV_MQ_ADMIN:
2418164032Srwatson
2419164032Srwatson		/*
2420164032Srwatson		 * Jail implements its own inter-process limits, so allow
2421164032Srwatson		 * root processes in jail to change scheduling on other
2422164032Srwatson		 * processes in the same jail.  Likewise for signalling.
2423164032Srwatson		 */
2424164032Srwatson	case PRIV_SCHED_DIFFCRED:
2425185435Sbz	case PRIV_SCHED_CPUSET:
2426164032Srwatson	case PRIV_SIGNAL_DIFFCRED:
2427164032Srwatson	case PRIV_SIGNAL_SUGID:
2428164032Srwatson
2429164032Srwatson		/*
2430164032Srwatson		 * Allow jailed processes to write to sysctls marked as jail
2431164032Srwatson		 * writable.
2432164032Srwatson		 */
2433164032Srwatson	case PRIV_SYSCTL_WRITEJAIL:
2434164032Srwatson
2435164032Srwatson		/*
2436164032Srwatson		 * Allow root in jail to manage a variety of quota
2437166831Srwatson		 * properties.  These should likely be conditional on a
2438166831Srwatson		 * configuration option.
2439164032Srwatson		 */
2440166832Srwatson	case PRIV_VFS_GETQUOTA:
2441166832Srwatson	case PRIV_VFS_SETQUOTA:
2442164032Srwatson
2443164032Srwatson		/*
2444164032Srwatson		 * Since Jail relies on chroot() to implement file system
2445164032Srwatson		 * protections, grant many VFS privileges to root in jail.
2446164032Srwatson		 * Be careful to exclude mount-related and NFS-related
2447164032Srwatson		 * privileges.
2448164032Srwatson		 */
2449164032Srwatson	case PRIV_VFS_READ:
2450164032Srwatson	case PRIV_VFS_WRITE:
2451164032Srwatson	case PRIV_VFS_ADMIN:
2452164032Srwatson	case PRIV_VFS_EXEC:
2453164032Srwatson	case PRIV_VFS_LOOKUP:
2454164032Srwatson	case PRIV_VFS_BLOCKRESERVE:	/* XXXRW: Slightly surprising. */
2455164032Srwatson	case PRIV_VFS_CHFLAGS_DEV:
2456164032Srwatson	case PRIV_VFS_CHOWN:
2457164032Srwatson	case PRIV_VFS_CHROOT:
2458167152Spjd	case PRIV_VFS_RETAINSUGID:
2459164032Srwatson	case PRIV_VFS_FCHROOT:
2460164032Srwatson	case PRIV_VFS_LINK:
2461164032Srwatson	case PRIV_VFS_SETGID:
2462172860Srwatson	case PRIV_VFS_STAT:
2463164032Srwatson	case PRIV_VFS_STICKYFILE:
2464164032Srwatson		return (0);
2465164032Srwatson
2466164032Srwatson		/*
2467164032Srwatson		 * Depending on the global setting, allow privilege of
2468164032Srwatson		 * setting system flags.
2469164032Srwatson		 */
2470164032Srwatson	case PRIV_VFS_SYSFLAGS:
2471164032Srwatson		if (jail_chflags_allowed)
2472164032Srwatson			return (0);
2473164032Srwatson		else
2474164032Srwatson			return (EPERM);
2475164032Srwatson
2476164032Srwatson		/*
2477168396Spjd		 * Depending on the global setting, allow privilege of
2478168396Spjd		 * mounting/unmounting file systems.
2479168396Spjd		 */
2480168396Spjd	case PRIV_VFS_MOUNT:
2481168396Spjd	case PRIV_VFS_UNMOUNT:
2482168396Spjd	case PRIV_VFS_MOUNT_NONUSER:
2483168699Spjd	case PRIV_VFS_MOUNT_OWNER:
2484168396Spjd		if (jail_mount_allowed)
2485168396Spjd			return (0);
2486168396Spjd		else
2487168396Spjd			return (EPERM);
2488168396Spjd
2489168396Spjd		/*
2490168591Srwatson		 * Allow jailed root to bind reserved ports and reuse in-use
2491168591Srwatson		 * ports.
2492164032Srwatson		 */
2493164032Srwatson	case PRIV_NETINET_RESERVEDPORT:
2494168591Srwatson	case PRIV_NETINET_REUSEPORT:
2495164032Srwatson		return (0);
2496164032Srwatson
2497164032Srwatson		/*
2498175630Sbz		 * Allow jailed root to set certain IPv4/6 (option) headers.
2499175630Sbz		 */
2500175630Sbz	case PRIV_NETINET_SETHDROPTS:
2501175630Sbz		return (0);
2502175630Sbz
2503175630Sbz		/*
2504164032Srwatson		 * Conditionally allow creating raw sockets in jail.
2505164032Srwatson		 */
2506164032Srwatson	case PRIV_NETINET_RAW:
2507164032Srwatson		if (jail_allow_raw_sockets)
2508164032Srwatson			return (0);
2509164032Srwatson		else
2510164032Srwatson			return (EPERM);
2511164032Srwatson
2512164032Srwatson		/*
2513164032Srwatson		 * Since jail implements its own visibility limits on netstat
2514164032Srwatson		 * sysctls, allow getcred.  This allows identd to work in
2515164032Srwatson		 * jail.
2516164032Srwatson		 */
2517164032Srwatson	case PRIV_NETINET_GETCRED:
2518164032Srwatson		return (0);
2519164032Srwatson
2520164032Srwatson	default:
2521164032Srwatson		/*
2522164032Srwatson		 * In all remaining cases, deny the privilege request.  This
2523164032Srwatson		 * includes almost all network privileges, many system
2524164032Srwatson		 * configuration privileges.
2525164032Srwatson		 */
2526164032Srwatson		return (EPERM);
2527164032Srwatson	}
2528164032Srwatson}
2529164032Srwatson
2530113275Smikestatic int
2531113275Smikesysctl_jail_list(SYSCTL_HANDLER_ARGS)
2532113275Smike{
2533191673Sjamie	struct xprison *xp;
2534113275Smike	struct prison *pr;
2535191673Sjamie#ifdef INET
2536191673Sjamie	struct in_addr *ip4 = NULL;
2537191673Sjamie	int ip4s = 0;
2538191673Sjamie#endif
2539191673Sjamie#ifdef INET6
2540191673Sjamie	struct in_addr *ip6 = NULL;
2541191673Sjamie	int ip6s = 0;
2542191673Sjamie#endif
2543191673Sjamie	int error;
2544113275Smike
2545127020Spjd	if (jailed(req->td->td_ucred))
2546125806Srwatson		return (0);
2547113275Smike
2548191673Sjamie	xp = malloc(sizeof(*xp), M_TEMP, M_WAITOK);
2549191673Sjamie	error = 0;
2550168401Spjd	sx_slock(&allprison_lock);
2551191673Sjamie	TAILQ_FOREACH(pr, &allprison, pr_list) {
2552191673Sjamie again:
2553191673Sjamie		mtx_lock(&pr->pr_mtx);
2554185435Sbz#ifdef INET
2555191673Sjamie		if (pr->pr_ip4s > 0) {
2556191673Sjamie			if (ip4s < pr->pr_ip4s) {
2557191673Sjamie				ip4s = pr->pr_ip4s;
2558191673Sjamie				mtx_unlock(&pr->pr_mtx);
2559191673Sjamie				ip4 = realloc(ip4, ip4s *
2560191673Sjamie				    sizeof(struct in_addr), M_TEMP, M_WAITOK);
2561191673Sjamie				goto again;
2562191673Sjamie			}
2563191673Sjamie			bcopy(pr->pr_ip4, ip4,
2564191673Sjamie			    pr->pr_ip4s * sizeof(struct in_addr));
2565191673Sjamie		}
2566185435Sbz#endif
2567185435Sbz#ifdef INET6
2568191673Sjamie		if (pr->pr_ip6s > 0) {
2569191673Sjamie			if (ip6s < pr->pr_ip6s) {
2570191673Sjamie				ip6s = pr->pr_ip6s;
2571191673Sjamie				mtx_unlock(&pr->pr_mtx);
2572191673Sjamie				ip6 = realloc(ip6, ip6s *
2573191673Sjamie				    sizeof(struct in6_addr), M_TEMP, M_WAITOK);
2574191673Sjamie				goto again;
2575191673Sjamie			}
2576191673Sjamie			bcopy(pr->pr_ip6, ip6,
2577191673Sjamie			    pr->pr_ip6s * sizeof(struct in6_addr));
2578191673Sjamie		}
2579185435Sbz#endif
2580191673Sjamie		if (pr->pr_ref == 0) {
2581191673Sjamie			mtx_unlock(&pr->pr_mtx);
2582191673Sjamie			continue;
2583191673Sjamie		}
2584191673Sjamie		bzero(xp, sizeof(*xp));
2585113275Smike		xp->pr_version = XPRISON_VERSION;
2586113275Smike		xp->pr_id = pr->pr_id;
2587191673Sjamie		xp->pr_state = pr->pr_uref > 0
2588191673Sjamie		    ? PRISON_STATE_ALIVE : PRISON_STATE_DYING;
2589113275Smike		strlcpy(xp->pr_path, pr->pr_path, sizeof(xp->pr_path));
2590113275Smike		strlcpy(xp->pr_host, pr->pr_host, sizeof(xp->pr_host));
2591185435Sbz		strlcpy(xp->pr_name, pr->pr_name, sizeof(xp->pr_name));
2592185435Sbz#ifdef INET
2593185435Sbz		xp->pr_ip4s = pr->pr_ip4s;
2594185435Sbz#endif
2595185435Sbz#ifdef INET6
2596185435Sbz		xp->pr_ip6s = pr->pr_ip6s;
2597185435Sbz#endif
2598191673Sjamie		mtx_unlock(&pr->pr_mtx);
2599191673Sjamie		error = SYSCTL_OUT(req, xp, sizeof(*xp));
2600191673Sjamie		if (error)
2601191673Sjamie			break;
2602185435Sbz#ifdef INET
2603191673Sjamie		if (xp->pr_ip4s > 0) {
2604191673Sjamie			error = SYSCTL_OUT(req, ip4,
2605191673Sjamie			    xp->pr_ip4s * sizeof(struct in_addr));
2606191673Sjamie			if (error)
2607191673Sjamie				break;
2608185435Sbz		}
2609185435Sbz#endif
2610185435Sbz#ifdef INET6
2611191673Sjamie		if (xp->pr_ip6s > 0) {
2612191673Sjamie			error = SYSCTL_OUT(req, ip6,
2613191673Sjamie			    xp->pr_ip6s * sizeof(struct in6_addr));
2614191673Sjamie			if (error)
2615191673Sjamie				break;
2616185435Sbz		}
2617185435Sbz#endif
2618113275Smike	}
2619168401Spjd	sx_sunlock(&allprison_lock);
2620191673Sjamie	free(xp, M_TEMP);
2621191673Sjamie#ifdef INET
2622191673Sjamie	free(ip4, M_TEMP);
2623191673Sjamie#endif
2624191673Sjamie#ifdef INET6
2625191673Sjamie	free(ip6, M_TEMP);
2626191673Sjamie#endif
2627167354Spjd	return (error);
2628113275Smike}
2629113275Smike
2630187864SedSYSCTL_OID(_security_jail, OID_AUTO, list,
2631187864Sed    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
2632187864Sed    sysctl_jail_list, "S", "List of active jails");
2633126004Spjd
2634126004Spjdstatic int
2635126004Spjdsysctl_jail_jailed(SYSCTL_HANDLER_ARGS)
2636126004Spjd{
2637126004Spjd	int error, injail;
2638126004Spjd
2639126004Spjd	injail = jailed(req->td->td_ucred);
2640126004Spjd	error = SYSCTL_OUT(req, &injail, sizeof(injail));
2641126004Spjd
2642126004Spjd	return (error);
2643126004Spjd}
2644187864SedSYSCTL_PROC(_security_jail, OID_AUTO, jailed,
2645187864Sed    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
2646187864Sed    sysctl_jail_jailed, "I", "Process in jail?");
2647185435Sbz
2648185435Sbz#ifdef DDB
2649191673Sjamie
2650191673Sjamiestatic void
2651191673Sjamiedb_show_prison(struct prison *pr)
2652185435Sbz{
2653191673Sjamie#if defined(INET) || defined(INET6)
2654191673Sjamie	int ii;
2655185435Sbz#endif
2656185435Sbz#ifdef INET6
2657185435Sbz	char ip6buf[INET6_ADDRSTRLEN];
2658185435Sbz#endif
2659185435Sbz
2660191673Sjamie	db_printf("prison %p:\n", pr);
2661191673Sjamie	db_printf(" jid             = %d\n", pr->pr_id);
2662191673Sjamie	db_printf(" name            = %s\n", pr->pr_name);
2663191673Sjamie	db_printf(" ref             = %d\n", pr->pr_ref);
2664191673Sjamie	db_printf(" uref            = %d\n", pr->pr_uref);
2665191673Sjamie	db_printf(" path            = %s\n", pr->pr_path);
2666191673Sjamie	db_printf(" cpuset          = %d\n", pr->pr_cpuset
2667191673Sjamie	    ? pr->pr_cpuset->cs_id : -1);
2668191673Sjamie	db_printf(" root            = %p\n", pr->pr_root);
2669191673Sjamie	db_printf(" securelevel     = %d\n", pr->pr_securelevel);
2670191673Sjamie	db_printf(" flags           = %x", pr->pr_flags);
2671191673Sjamie	if (pr->pr_flags & PR_PERSIST)
2672191673Sjamie		db_printf(" persist");
2673191673Sjamie	db_printf("\n");
2674191673Sjamie	db_printf(" host.hostname   = %s\n", pr->pr_host);
2675185435Sbz#ifdef INET
2676191673Sjamie	db_printf(" ip4s            = %d\n", pr->pr_ip4s);
2677191673Sjamie	for (ii = 0; ii < pr->pr_ip4s; ii++)
2678191673Sjamie		db_printf(" %s %s\n",
2679191673Sjamie		    ii == 0 ? "ip4             =" : "                 ",
2680191673Sjamie		    inet_ntoa(pr->pr_ip4[ii]));
2681185435Sbz#endif
2682185435Sbz#ifdef INET6
2683191673Sjamie	db_printf(" ip6s            = %d\n", pr->pr_ip6s);
2684191673Sjamie	for (ii = 0; ii < pr->pr_ip6s; ii++)
2685191673Sjamie		db_printf(" %s %s\n",
2686191673Sjamie		    ii == 0 ? "ip6             =" : "                 ",
2687191673Sjamie		    ip6_sprintf(ip6buf, &pr->pr_ip6[ii]));
2688191673Sjamie#endif
2689191673Sjamie}
2690191673Sjamie
2691191673SjamieDB_SHOW_COMMAND(prison, db_show_prison_command)
2692191673Sjamie{
2693191673Sjamie	struct prison *pr;
2694191673Sjamie
2695191673Sjamie	if (!have_addr) {
2696191673Sjamie		/* Show all prisons in the list. */
2697191673Sjamie		TAILQ_FOREACH(pr, &allprison, pr_list) {
2698191673Sjamie			db_show_prison(pr);
2699191673Sjamie			if (db_pager_quit)
2700191673Sjamie				break;
2701191673Sjamie		}
2702191673Sjamie		return;
2703191673Sjamie	}
2704191673Sjamie
2705191673Sjamie	/* Look for a prison with the ID and with references. */
2706191673Sjamie	TAILQ_FOREACH(pr, &allprison, pr_list)
2707191673Sjamie		if (pr->pr_id == addr && pr->pr_ref > 0)
2708185435Sbz			break;
2709191673Sjamie	if (pr == NULL)
2710191673Sjamie		/* Look again, without requiring a reference. */
2711191673Sjamie		TAILQ_FOREACH(pr, &allprison, pr_list)
2712191673Sjamie			if (pr->pr_id == addr)
2713191673Sjamie				break;
2714191673Sjamie	if (pr == NULL)
2715191673Sjamie		/* Assume address points to a valid prison. */
2716191673Sjamie		pr = (struct prison *)addr;
2717191673Sjamie	db_show_prison(pr);
2718185435Sbz}
2719191673Sjamie
2720185435Sbz#endif /* DDB */
2721