kern_jail.c revision 177785
1139804Simp/*-
246197Sphk * ----------------------------------------------------------------------------
346197Sphk * "THE BEER-WARE LICENSE" (Revision 42):
446197Sphk * <phk@FreeBSD.ORG> wrote this file.  As long as you retain this notice you
546197Sphk * can do whatever you want with this stuff. If we meet some day, and you think
646197Sphk * this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
746197Sphk * ----------------------------------------------------------------------------
846197Sphk */
946155Sphk
10116182Sobrien#include <sys/cdefs.h>
11116182Sobrien__FBSDID("$FreeBSD: head/sys/kern/kern_jail.c 177785 2008-03-31 12:01:21Z kib $");
12116182Sobrien
13131177Spjd#include "opt_mac.h"
14131177Spjd
1546155Sphk#include <sys/param.h>
1646155Sphk#include <sys/types.h>
1746155Sphk#include <sys/kernel.h>
1846155Sphk#include <sys/systm.h>
1946155Sphk#include <sys/errno.h>
2046155Sphk#include <sys/sysproto.h>
2146155Sphk#include <sys/malloc.h>
22164032Srwatson#include <sys/priv.h>
2346155Sphk#include <sys/proc.h>
24124882Srwatson#include <sys/taskqueue.h>
25177785Skib#include <sys/fcntl.h>
2646155Sphk#include <sys/jail.h>
2787275Srwatson#include <sys/lock.h>
2887275Srwatson#include <sys/mutex.h>
29168401Spjd#include <sys/sx.h>
30113275Smike#include <sys/namei.h>
31147185Spjd#include <sys/mount.h>
32113275Smike#include <sys/queue.h>
3346155Sphk#include <sys/socket.h>
34113275Smike#include <sys/syscallsubr.h>
3557163Srwatson#include <sys/sysctl.h>
36113275Smike#include <sys/vnode.h>
3746155Sphk#include <net/if.h>
3846155Sphk#include <netinet/in.h>
3946155Sphk
40163606Srwatson#include <security/mac/mac_framework.h>
41163606Srwatson
/* Malloc type used for every allocation made by this file. */
MALLOC_DEFINE(M_PRISON, "prison", "Prison structures");

/*
 * Global jail policy knobs.  Each integer below is exported read/write
 * through the "jail" sysctl node declared here under _security, and is
 * consulted for every jail (there is no per-jail setting in this file).
 */
SYSCTL_NODE(_security, OID_AUTO, jail, CTLFLAG_RW, 0,
    "Jail rules");

/* Non-zero (default): jailed processes may set their hostname. */
int	jail_set_hostname_allowed = 1;
SYSCTL_INT(_security_jail, OID_AUTO, set_hostname_allowed, CTLFLAG_RW,
    &jail_set_hostname_allowed, 0,
    "Processes in jail can set their hostnames");

/* Non-zero (default): restrict jailed sockets to UNIX/IPv4/route. */
int	jail_socket_unixiproute_only = 1;
SYSCTL_INT(_security_jail, OID_AUTO, socket_unixiproute_only, CTLFLAG_RW,
    &jail_socket_unixiproute_only, 0,
    "Processes in jail are limited to creating UNIX/IPv4/route sockets only");

/* Non-zero: jailed processes may use System V IPC (off by default). */
int	jail_sysvipc_allowed = 0;
SYSCTL_INT(_security_jail, OID_AUTO, sysvipc_allowed, CTLFLAG_RW,
    &jail_sysvipc_allowed, 0,
    "Processes in jail can use System V IPC primitives");

/*
 * Mount-point visibility level, as interpreted by prison_canseemount():
 * 0 - no restriction; 2 (default) - only the file system holding the
 * jail root is visible; any other value - mount points located under
 * the jail's path are visible as well.
 */
static int jail_enforce_statfs = 2;
SYSCTL_INT(_security_jail, OID_AUTO, enforce_statfs, CTLFLAG_RW,
    &jail_enforce_statfs, 0,
    "Processes in jail cannot see all mounted file systems");

/* Non-zero: PRIV_NETINET_RAW is granted inside a jail (off by default). */
int	jail_allow_raw_sockets = 0;
SYSCTL_INT(_security_jail, OID_AUTO, allow_raw_sockets, CTLFLAG_RW,
    &jail_allow_raw_sockets, 0,
    "Prison root can create raw sockets");

/* Non-zero: PRIV_VFS_SYSFLAGS is granted inside a jail (off by default). */
int	jail_chflags_allowed = 0;
SYSCTL_INT(_security_jail, OID_AUTO, chflags_allowed, CTLFLAG_RW,
    &jail_chflags_allowed, 0,
    "Processes in jail can alter system file flags");

/* Non-zero: mount/unmount privileges are granted inside a jail. */
int	jail_mount_allowed = 0;
SYSCTL_INT(_security_jail, OID_AUTO, mount_allowed, CTLFLAG_RW,
    &jail_mount_allowed, 0,
    "Processes in jail can mount/unmount jail-friendly file systems");
81168396Spjd
/*
 * allprison, lastprid, and prisoncount are protected by allprison_lock.
 *
 * allprison	- list of all prisons, linked through pr_list.
 * lastprid	- jail ID handed out by the most recent successful jail().
 * prisoncount	- number of prisons currently linked on allprison.
 */
struct	prisonlist allprison;
struct	sx allprison_lock;
int	lastprid = 0;
int	prisoncount = 0;

/*
 * List of jail services. Protected by allprison_lock.
 * prison_service_slots counts the registered services; it is also the
 * number of entries allocated for a new prison's pr_slots array.
 */
TAILQ_HEAD(prison_services_head, prison_service);
static struct prison_services_head prison_services =
    TAILQ_HEAD_INITIALIZER(prison_services);
static int prison_service_slots = 0;
95168401Spjd
96168401Spjdstruct prison_service {
97168401Spjd	prison_create_t ps_create;
98168401Spjd	prison_destroy_t ps_destroy;
99168401Spjd	int		ps_slotno;
100168401Spjd	TAILQ_ENTRY(prison_service) ps_next;
101168401Spjd	char	ps_name[0];
102168401Spjd};
103168401Spjd
static void		 init_prison(void *);
static void		 prison_complete(void *context, int pending);
static int		 sysctl_jail_list(SYSCTL_HANDLER_ARGS);

/*
 * Boot-time initialization: set up allprison_lock and the (empty)
 * allprison list.  The argument is unused.
 */
static void
init_prison(void *data __unused)
{

	sx_init(&allprison_lock, "allprison");
	LIST_INIT(&allprison);
}

/* Run init_prison() once during system startup. */
SYSINIT(prison, SI_SUB_INTRINSIC, SI_ORDER_ANY, init_prison, NULL);
117113275Smike
11882710Sdillon/*
119114168Smike * struct jail_args {
120114168Smike *	struct jail *jail;
121114168Smike * };
12282710Sdillon */
12346155Sphkint
124114168Smikejail(struct thread *td, struct jail_args *uap)
12546155Sphk{
126113275Smike	struct nameidata nd;
127113275Smike	struct prison *pr, *tpr;
128168401Spjd	struct prison_service *psrv;
12946155Sphk	struct jail j;
130113275Smike	struct jail_attach_args jaa;
131150652Scsjp	int vfslocked, error, tryprid;
13246155Sphk
133114168Smike	error = copyin(uap->jail, &j, sizeof(j));
13446155Sphk	if (error)
13584828Sjhb		return (error);
13684828Sjhb	if (j.version != 0)
13784828Sjhb		return (EINVAL);
13884828Sjhb
139114168Smike	MALLOC(pr, struct prison *, sizeof(*pr), M_PRISON, M_WAITOK | M_ZERO);
14093818Sjhb	mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF);
141113275Smike	pr->pr_ref = 1;
142114168Smike	error = copyinstr(j.path, &pr->pr_path, sizeof(pr->pr_path), 0);
143113275Smike	if (error)
144113275Smike		goto e_killmtx;
145150652Scsjp	NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | LOCKLEAF, UIO_SYSSPACE,
146150652Scsjp	    pr->pr_path, td);
147113275Smike	error = namei(&nd);
148150652Scsjp	if (error)
149113275Smike		goto e_killmtx;
150150652Scsjp	vfslocked = NDHASGIANT(&nd);
151113275Smike	pr->pr_root = nd.ni_vp;
152175294Sattilio	VOP_UNLOCK(nd.ni_vp, 0);
153113275Smike	NDFREE(&nd, NDF_ONLY_PNBUF);
154150652Scsjp	VFS_UNLOCK_GIANT(vfslocked);
155114168Smike	error = copyinstr(j.hostname, &pr->pr_host, sizeof(pr->pr_host), 0);
15684828Sjhb	if (error)
157113275Smike		goto e_dropvnref;
158113275Smike	pr->pr_ip = j.ip_number;
159113275Smike	pr->pr_linux = NULL;
160113275Smike	pr->pr_securelevel = securelevel;
161168401Spjd	if (prison_service_slots == 0)
162168401Spjd		pr->pr_slots = NULL;
163168401Spjd	else {
164168401Spjd		pr->pr_slots = malloc(sizeof(*pr->pr_slots) * prison_service_slots,
165168401Spjd		    M_PRISON, M_ZERO | M_WAITOK);
166168401Spjd	}
167113275Smike
168113275Smike	/* Determine next pr_id and add prison to allprison list. */
169168401Spjd	sx_xlock(&allprison_lock);
170113275Smike	tryprid = lastprid + 1;
171113275Smike	if (tryprid == JAIL_MAX)
172113275Smike		tryprid = 1;
173113275Smikenext:
174113275Smike	LIST_FOREACH(tpr, &allprison, pr_list) {
175113275Smike		if (tpr->pr_id == tryprid) {
176113275Smike			tryprid++;
177113275Smike			if (tryprid == JAIL_MAX) {
178168401Spjd				sx_xunlock(&allprison_lock);
179113275Smike				error = EAGAIN;
180113275Smike				goto e_dropvnref;
181113275Smike			}
182113275Smike			goto next;
183113275Smike		}
184113275Smike	}
185113275Smike	pr->pr_id = jaa.jid = lastprid = tryprid;
186113275Smike	LIST_INSERT_HEAD(&allprison, pr, pr_list);
187113275Smike	prisoncount++;
188168401Spjd	sx_downgrade(&allprison_lock);
189168401Spjd	TAILQ_FOREACH(psrv, &prison_services, ps_next) {
190168401Spjd		psrv->ps_create(psrv, pr);
191168401Spjd	}
192168401Spjd	sx_sunlock(&allprison_lock);
193113275Smike
194113275Smike	error = jail_attach(td, &jaa);
195113275Smike	if (error)
196113275Smike		goto e_dropprref;
197113275Smike	mtx_lock(&pr->pr_mtx);
198113275Smike	pr->pr_ref--;
199113275Smike	mtx_unlock(&pr->pr_mtx);
200113275Smike	td->td_retval[0] = jaa.jid;
201113275Smike	return (0);
202113275Smikee_dropprref:
203168401Spjd	sx_xlock(&allprison_lock);
204113275Smike	LIST_REMOVE(pr, pr_list);
205113275Smike	prisoncount--;
206168401Spjd	sx_downgrade(&allprison_lock);
207168401Spjd	TAILQ_FOREACH(psrv, &prison_services, ps_next) {
208168401Spjd		psrv->ps_destroy(psrv, pr);
209168401Spjd	}
210168401Spjd	sx_sunlock(&allprison_lock);
211113275Smikee_dropvnref:
212150652Scsjp	vfslocked = VFS_LOCK_GIANT(pr->pr_root->v_mount);
213113275Smike	vrele(pr->pr_root);
214150652Scsjp	VFS_UNLOCK_GIANT(vfslocked);
215113275Smikee_killmtx:
216113275Smike	mtx_destroy(&pr->pr_mtx);
217113275Smike	FREE(pr, M_PRISON);
218113275Smike	return (error);
219113275Smike}
220113275Smike
221113275Smike/*
222114168Smike * struct jail_attach_args {
223114168Smike *	int jid;
224114168Smike * };
225113275Smike */
226113275Smikeint
227114168Smikejail_attach(struct thread *td, struct jail_attach_args *uap)
228113275Smike{
229113275Smike	struct proc *p;
230113275Smike	struct ucred *newcred, *oldcred;
231113275Smike	struct prison *pr;
232150652Scsjp	int vfslocked, error;
233167309Spjd
234126023Snectar	/*
235126023Snectar	 * XXX: Note that there is a slight race here if two threads
236126023Snectar	 * in the same privileged process attempt to attach to two
237126023Snectar	 * different jails at the same time.  It is important for
238126023Snectar	 * user processes not to do this, or they might end up with
239126023Snectar	 * a process root from one prison, but attached to the jail
240126023Snectar	 * of another.
241126023Snectar	 */
242164032Srwatson	error = priv_check(td, PRIV_JAIL_ATTACH);
243126023Snectar	if (error)
244126023Snectar		return (error);
245126023Snectar
246113275Smike	p = td->td_proc;
247168401Spjd	sx_slock(&allprison_lock);
248113275Smike	pr = prison_find(uap->jid);
249113275Smike	if (pr == NULL) {
250168401Spjd		sx_sunlock(&allprison_lock);
251113275Smike		return (EINVAL);
252113275Smike	}
253113275Smike	pr->pr_ref++;
254113275Smike	mtx_unlock(&pr->pr_mtx);
255168401Spjd	sx_sunlock(&allprison_lock);
256113275Smike
257150652Scsjp	vfslocked = VFS_LOCK_GIANT(pr->pr_root->v_mount);
258175202Sattilio	vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY);
259113275Smike	if ((error = change_dir(pr->pr_root, td)) != 0)
260113275Smike		goto e_unlock;
261113275Smike#ifdef MAC
262172930Srwatson	if ((error = mac_vnode_check_chroot(td->td_ucred, pr->pr_root)))
263113275Smike		goto e_unlock;
264113275Smike#endif
265175294Sattilio	VOP_UNLOCK(pr->pr_root, 0);
266113275Smike	change_root(pr->pr_root, td);
267150652Scsjp	VFS_UNLOCK_GIANT(vfslocked);
268113275Smike
26984828Sjhb	newcred = crget();
27084828Sjhb	PROC_LOCK(p);
27184828Sjhb	oldcred = p->p_ucred;
272113275Smike	setsugid(p);
27384828Sjhb	crcopy(newcred, oldcred);
274113630Sjhb	newcred->cr_prison = pr;
27584828Sjhb	p->p_ucred = newcred;
27684828Sjhb	PROC_UNLOCK(p);
27784828Sjhb	crfree(oldcred);
27846155Sphk	return (0);
279113275Smikee_unlock:
280175294Sattilio	VOP_UNLOCK(pr->pr_root, 0);
281150652Scsjp	VFS_UNLOCK_GIANT(vfslocked);
282113275Smike	mtx_lock(&pr->pr_mtx);
283113275Smike	pr->pr_ref--;
284113275Smike	mtx_unlock(&pr->pr_mtx);
28546155Sphk	return (error);
28646155Sphk}
28746155Sphk
288113275Smike/*
289113275Smike * Returns a locked prison instance, or NULL on failure.
290113275Smike */
291168399Spjdstruct prison *
292113275Smikeprison_find(int prid)
293113275Smike{
294113275Smike	struct prison *pr;
295113275Smike
296168401Spjd	sx_assert(&allprison_lock, SX_LOCKED);
297113275Smike	LIST_FOREACH(pr, &allprison, pr_list) {
298113275Smike		if (pr->pr_id == prid) {
299113275Smike			mtx_lock(&pr->pr_mtx);
300168489Spjd			if (pr->pr_ref == 0) {
301168489Spjd				mtx_unlock(&pr->pr_mtx);
302168489Spjd				break;
303168489Spjd			}
304113275Smike			return (pr);
305113275Smike		}
306113275Smike	}
307113275Smike	return (NULL);
308113275Smike}
309113275Smike
31072786Srwatsonvoid
31172786Srwatsonprison_free(struct prison *pr)
31272786Srwatson{
31372786Srwatson
31487275Srwatson	mtx_lock(&pr->pr_mtx);
31572786Srwatson	pr->pr_ref--;
31672786Srwatson	if (pr->pr_ref == 0) {
317168483Spjd		mtx_unlock(&pr->pr_mtx);
318124882Srwatson		TASK_INIT(&pr->pr_task, 0, prison_complete, pr);
319144660Sjeff		taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
32087275Srwatson		return;
32172786Srwatson	}
32287275Srwatson	mtx_unlock(&pr->pr_mtx);
32372786Srwatson}
32472786Srwatson
325124882Srwatsonstatic void
326124882Srwatsonprison_complete(void *context, int pending)
327124882Srwatson{
328168489Spjd	struct prison_service *psrv;
329124882Srwatson	struct prison *pr;
330150652Scsjp	int vfslocked;
331124882Srwatson
332124882Srwatson	pr = (struct prison *)context;
333124882Srwatson
334168489Spjd	sx_xlock(&allprison_lock);
335168489Spjd	LIST_REMOVE(pr, pr_list);
336168489Spjd	prisoncount--;
337168489Spjd	sx_downgrade(&allprison_lock);
338168489Spjd	TAILQ_FOREACH(psrv, &prison_services, ps_next) {
339168489Spjd		psrv->ps_destroy(psrv, pr);
340168489Spjd	}
341168489Spjd	sx_sunlock(&allprison_lock);
342168489Spjd
343150652Scsjp	vfslocked = VFS_LOCK_GIANT(pr->pr_root->v_mount);
344124882Srwatson	vrele(pr->pr_root);
345150652Scsjp	VFS_UNLOCK_GIANT(vfslocked);
346124882Srwatson
347124882Srwatson	mtx_destroy(&pr->pr_mtx);
348124882Srwatson	if (pr->pr_linux != NULL)
349124882Srwatson		FREE(pr->pr_linux, M_PRISON);
350124882Srwatson	FREE(pr, M_PRISON);
351124882Srwatson}
352124882Srwatson
35372786Srwatsonvoid
35472786Srwatsonprison_hold(struct prison *pr)
35572786Srwatson{
35672786Srwatson
35787275Srwatson	mtx_lock(&pr->pr_mtx);
358168489Spjd	KASSERT(pr->pr_ref > 0,
359168489Spjd	    ("Trying to hold dead prison (id=%d).", pr->pr_id));
36072786Srwatson	pr->pr_ref++;
36187275Srwatson	mtx_unlock(&pr->pr_mtx);
36272786Srwatson}
36372786Srwatson
36487275Srwatsonu_int32_t
36587275Srwatsonprison_getip(struct ucred *cred)
36687275Srwatson{
36787275Srwatson
36887275Srwatson	return (cred->cr_prison->pr_ip);
36987275Srwatson}
37087275Srwatson
37146155Sphkint
37272786Srwatsonprison_ip(struct ucred *cred, int flag, u_int32_t *ip)
37346155Sphk{
37446155Sphk	u_int32_t tmp;
37546155Sphk
37672786Srwatson	if (!jailed(cred))
37746155Sphk		return (0);
378167309Spjd	if (flag)
37946155Sphk		tmp = *ip;
38046155Sphk	else
38146155Sphk		tmp = ntohl(*ip);
38246155Sphk	if (tmp == INADDR_ANY) {
383167309Spjd		if (flag)
38472786Srwatson			*ip = cred->cr_prison->pr_ip;
38546155Sphk		else
38672786Srwatson			*ip = htonl(cred->cr_prison->pr_ip);
38746155Sphk		return (0);
38846155Sphk	}
38981114Srwatson	if (tmp == INADDR_LOOPBACK) {
39081114Srwatson		if (flag)
39181114Srwatson			*ip = cred->cr_prison->pr_ip;
39281114Srwatson		else
39381114Srwatson			*ip = htonl(cred->cr_prison->pr_ip);
39481114Srwatson		return (0);
39581114Srwatson	}
39672786Srwatson	if (cred->cr_prison->pr_ip != tmp)
39746155Sphk		return (1);
39846155Sphk	return (0);
39946155Sphk}
40046155Sphk
40146155Sphkvoid
40272786Srwatsonprison_remote_ip(struct ucred *cred, int flag, u_int32_t *ip)
40346155Sphk{
40446155Sphk	u_int32_t tmp;
40546155Sphk
40672786Srwatson	if (!jailed(cred))
40746155Sphk		return;
40846155Sphk	if (flag)
40946155Sphk		tmp = *ip;
41046155Sphk	else
41146155Sphk		tmp = ntohl(*ip);
41281114Srwatson	if (tmp == INADDR_LOOPBACK) {
41346155Sphk		if (flag)
41472786Srwatson			*ip = cred->cr_prison->pr_ip;
41546155Sphk		else
41672786Srwatson			*ip = htonl(cred->cr_prison->pr_ip);
41746155Sphk		return;
41846155Sphk	}
41946155Sphk	return;
42046155Sphk}
42146155Sphk
42246155Sphkint
42372786Srwatsonprison_if(struct ucred *cred, struct sockaddr *sa)
42446155Sphk{
425114168Smike	struct sockaddr_in *sai;
42646155Sphk	int ok;
42746155Sphk
428114168Smike	sai = (struct sockaddr_in *)sa;
42961235Srwatson	if ((sai->sin_family != AF_INET) && jail_socket_unixiproute_only)
43061235Srwatson		ok = 1;
43161235Srwatson	else if (sai->sin_family != AF_INET)
43246155Sphk		ok = 0;
43372786Srwatson	else if (cred->cr_prison->pr_ip != ntohl(sai->sin_addr.s_addr))
43446155Sphk		ok = 1;
43546155Sphk	else
43646155Sphk		ok = 0;
43746155Sphk	return (ok);
43846155Sphk}
43972786Srwatson
44072786Srwatson/*
44172786Srwatson * Return 0 if jails permit p1 to frob p2, otherwise ESRCH.
44272786Srwatson */
44372786Srwatsonint
444114168Smikeprison_check(struct ucred *cred1, struct ucred *cred2)
44572786Srwatson{
44672786Srwatson
44772786Srwatson	if (jailed(cred1)) {
44872786Srwatson		if (!jailed(cred2))
44972786Srwatson			return (ESRCH);
45072786Srwatson		if (cred2->cr_prison != cred1->cr_prison)
45172786Srwatson			return (ESRCH);
45272786Srwatson	}
45372786Srwatson
45472786Srwatson	return (0);
45572786Srwatson}
45672786Srwatson
45772786Srwatson/*
45872786Srwatson * Return 1 if the passed credential is in a jail, otherwise 0.
45972786Srwatson */
46072786Srwatsonint
461114168Smikejailed(struct ucred *cred)
46272786Srwatson{
46372786Srwatson
46472786Srwatson	return (cred->cr_prison != NULL);
46572786Srwatson}
46691384Srobert
46791384Srobert/*
46891384Srobert * Return the correct hostname for the passed credential.
46991384Srobert */
47091391Srobertvoid
471114168Smikegetcredhostname(struct ucred *cred, char *buf, size_t size)
47291384Srobert{
47391384Srobert
47491391Srobert	if (jailed(cred)) {
47591391Srobert		mtx_lock(&cred->cr_prison->pr_mtx);
476105354Srobert		strlcpy(buf, cred->cr_prison->pr_host, size);
47791391Srobert		mtx_unlock(&cred->cr_prison->pr_mtx);
478114168Smike	} else
479105354Srobert		strlcpy(buf, hostname, size);
48091384Srobert}
481113275Smike
482125804Srwatson/*
483147185Spjd * Determine whether the subject represented by cred can "see"
484147185Spjd * status of a mount point.
485147185Spjd * Returns: 0 for permitted, ENOENT otherwise.
486147185Spjd * XXX: This function should be called cr_canseemount() and should be
487147185Spjd *      placed in kern_prot.c.
488125804Srwatson */
489125804Srwatsonint
490147185Spjdprison_canseemount(struct ucred *cred, struct mount *mp)
491125804Srwatson{
492147185Spjd	struct prison *pr;
493147185Spjd	struct statfs *sp;
494147185Spjd	size_t len;
495125804Srwatson
496147185Spjd	if (!jailed(cred) || jail_enforce_statfs == 0)
497147185Spjd		return (0);
498147185Spjd	pr = cred->cr_prison;
499147185Spjd	if (pr->pr_root->v_mount == mp)
500147185Spjd		return (0);
501147185Spjd	if (jail_enforce_statfs == 2)
502147185Spjd		return (ENOENT);
503147185Spjd	/*
504147185Spjd	 * If jail's chroot directory is set to "/" we should be able to see
505147185Spjd	 * all mount-points from inside a jail.
506147185Spjd	 * This is ugly check, but this is the only situation when jail's
507147185Spjd	 * directory ends with '/'.
508147185Spjd	 */
509147185Spjd	if (strcmp(pr->pr_path, "/") == 0)
510147185Spjd		return (0);
511147185Spjd	len = strlen(pr->pr_path);
512147185Spjd	sp = &mp->mnt_stat;
513147185Spjd	if (strncmp(pr->pr_path, sp->f_mntonname, len) != 0)
514147185Spjd		return (ENOENT);
515147185Spjd	/*
516147185Spjd	 * Be sure that we don't have situation where jail's root directory
517147185Spjd	 * is "/some/path" and mount point is "/some/pathpath".
518147185Spjd	 */
519147185Spjd	if (sp->f_mntonname[len] != '\0' && sp->f_mntonname[len] != '/')
520147185Spjd		return (ENOENT);
521147185Spjd	return (0);
522147185Spjd}
523147185Spjd
524147185Spjdvoid
525147185Spjdprison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp)
526147185Spjd{
527147185Spjd	char jpath[MAXPATHLEN];
528147185Spjd	struct prison *pr;
529147185Spjd	size_t len;
530147185Spjd
531147185Spjd	if (!jailed(cred) || jail_enforce_statfs == 0)
532147185Spjd		return;
533147185Spjd	pr = cred->cr_prison;
534147185Spjd	if (prison_canseemount(cred, mp) != 0) {
535147185Spjd		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
536147185Spjd		strlcpy(sp->f_mntonname, "[restricted]",
537147185Spjd		    sizeof(sp->f_mntonname));
538147185Spjd		return;
539125804Srwatson	}
540147185Spjd	if (pr->pr_root->v_mount == mp) {
541147185Spjd		/*
542147185Spjd		 * Clear current buffer data, so we are sure nothing from
543147185Spjd		 * the valid path left there.
544147185Spjd		 */
545147185Spjd		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
546147185Spjd		*sp->f_mntonname = '/';
547147185Spjd		return;
548147185Spjd	}
549147185Spjd	/*
550147185Spjd	 * If jail's chroot directory is set to "/" we should be able to see
551147185Spjd	 * all mount-points from inside a jail.
552147185Spjd	 */
553147185Spjd	if (strcmp(pr->pr_path, "/") == 0)
554147185Spjd		return;
555147185Spjd	len = strlen(pr->pr_path);
556147185Spjd	strlcpy(jpath, sp->f_mntonname + len, sizeof(jpath));
557147185Spjd	/*
558147185Spjd	 * Clear current buffer data, so we are sure nothing from
559147185Spjd	 * the valid path left there.
560147185Spjd	 */
561147185Spjd	bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
562147185Spjd	if (*jpath == '\0') {
563147185Spjd		/* Should never happen. */
564147185Spjd		*sp->f_mntonname = '/';
565147185Spjd	} else {
566147185Spjd		strlcpy(sp->f_mntonname, jpath, sizeof(sp->f_mntonname));
567147185Spjd	}
568125804Srwatson}
569125804Srwatson
/*
 * Check whether permission for a specific privilege is granted within
 * jail.  We have a specific list of accepted privileges; the rest are
 * denied.
 *
 * Returns 0 when cred is not jailed or the privilege is permitted inside
 * a jail, EPERM otherwise.  A few privileges depend on the global
 * jail_chflags_allowed, jail_mount_allowed and jail_allow_raw_sockets
 * settings.
 */
int
prison_priv_check(struct ucred *cred, int priv)
{

	if (!jailed(cred))
		return (0);

	/*
	 * Every case label up to the first "return (0)" below falls through
	 * to it; the interleaved comments only group related privileges.
	 */
	switch (priv) {

		/*
		 * Allow ktrace privileges for root in jail.
		 */
	case PRIV_KTRACE:

#if 0
		/*
		 * Allow jailed processes to configure audit identity and
		 * submit audit records (login, etc).  In the future we may
		 * want to further refine the relationship between audit and
		 * jail.
		 */
	case PRIV_AUDIT_GETAUDIT:
	case PRIV_AUDIT_SETAUDIT:
	case PRIV_AUDIT_SUBMIT:
#endif

		/*
		 * Allow jailed processes to manipulate process UNIX
		 * credentials in any way they see fit.
		 */
	case PRIV_CRED_SETUID:
	case PRIV_CRED_SETEUID:
	case PRIV_CRED_SETGID:
	case PRIV_CRED_SETEGID:
	case PRIV_CRED_SETGROUPS:
	case PRIV_CRED_SETREUID:
	case PRIV_CRED_SETREGID:
	case PRIV_CRED_SETRESUID:
	case PRIV_CRED_SETRESGID:

		/*
		 * Jail implements visibility constraints already, so allow
		 * jailed root to override uid/gid-based constraints.
		 */
	case PRIV_SEEOTHERGIDS:
	case PRIV_SEEOTHERUIDS:

		/*
		 * Jail implements inter-process debugging limits already, so
		 * allow jailed root various debugging privileges.
		 */
	case PRIV_DEBUG_DIFFCRED:
	case PRIV_DEBUG_SUGID:
	case PRIV_DEBUG_UNPRIV:

		/*
		 * Allow jail to set various resource limits and login
		 * properties, and for now, exceed process resource limits.
		 */
	case PRIV_PROC_LIMIT:
	case PRIV_PROC_SETLOGIN:
	case PRIV_PROC_SETRLIMIT:

		/*
		 * System V and POSIX IPC privileges are granted in jail.
		 */
	case PRIV_IPC_READ:
	case PRIV_IPC_WRITE:
	case PRIV_IPC_ADMIN:
	case PRIV_IPC_MSGSIZE:
	case PRIV_MQ_ADMIN:

		/*
		 * Jail implements its own inter-process limits, so allow
		 * root processes in jail to change scheduling on other
		 * processes in the same jail.  Likewise for signalling.
		 */
	case PRIV_SCHED_DIFFCRED:
	case PRIV_SIGNAL_DIFFCRED:
	case PRIV_SIGNAL_SUGID:

		/*
		 * Allow jailed processes to write to sysctls marked as jail
		 * writable.
		 */
	case PRIV_SYSCTL_WRITEJAIL:

		/*
		 * Allow root in jail to manage a variety of quota
		 * properties.  These should likely be conditional on a
		 * configuration option.
		 */
	case PRIV_VFS_GETQUOTA:
	case PRIV_VFS_SETQUOTA:

		/*
		 * Since Jail relies on chroot() to implement file system
		 * protections, grant many VFS privileges to root in jail.
		 * Be careful to exclude mount-related and NFS-related
		 * privileges.
		 */
	case PRIV_VFS_READ:
	case PRIV_VFS_WRITE:
	case PRIV_VFS_ADMIN:
	case PRIV_VFS_EXEC:
	case PRIV_VFS_LOOKUP:
	case PRIV_VFS_BLOCKRESERVE:	/* XXXRW: Slightly surprising. */
	case PRIV_VFS_CHFLAGS_DEV:
	case PRIV_VFS_CHOWN:
	case PRIV_VFS_CHROOT:
	case PRIV_VFS_RETAINSUGID:
	case PRIV_VFS_FCHROOT:
	case PRIV_VFS_LINK:
	case PRIV_VFS_SETGID:
	case PRIV_VFS_STAT:
	case PRIV_VFS_STICKYFILE:
		return (0);

		/*
		 * Depending on the global setting, allow privilege of
		 * setting system flags.
		 */
	case PRIV_VFS_SYSFLAGS:
		if (jail_chflags_allowed)
			return (0);
		else
			return (EPERM);

		/*
		 * Depending on the global setting, allow privilege of
		 * mounting/unmounting file systems.
		 */
	case PRIV_VFS_MOUNT:
	case PRIV_VFS_UNMOUNT:
	case PRIV_VFS_MOUNT_NONUSER:
	case PRIV_VFS_MOUNT_OWNER:
		if (jail_mount_allowed)
			return (0);
		else
			return (EPERM);

		/*
		 * Allow jailed root to bind reserved ports and reuse in-use
		 * ports.
		 */
	case PRIV_NETINET_RESERVEDPORT:
	case PRIV_NETINET_REUSEPORT:
		return (0);

		/*
		 * Allow jailed root to set certain IPv4/6 (option) headers.
		 */
	case PRIV_NETINET_SETHDROPTS:
		return (0);

		/*
		 * Conditionally allow creating raw sockets in jail.
		 */
	case PRIV_NETINET_RAW:
		if (jail_allow_raw_sockets)
			return (0);
		else
			return (EPERM);

		/*
		 * Since jail implements its own visibility limits on netstat
		 * sysctls, allow getcred.  This allows identd to work in
		 * jail.
		 */
	case PRIV_NETINET_GETCRED:
		return (0);

	default:
		/*
		 * In all remaining cases, deny the privilege request.  This
		 * includes almost all network privileges, many system
		 * configuration privileges.
		 */
		return (EPERM);
	}
}
755164032Srwatson
/*
 * Register a jail service. Provides 'create' and 'destroy' methods.
 * The 'create' method will be called for every existing jail and for
 * all future jails as they are being created.
 * The 'destroy' method will be called for every jail going away and
 * for all existing jails at the time of service deregistration.
 *
 * 'name' is copied into the returned structure.  Returns the newly
 * allocated service handle (allocation uses M_WAITOK).
 */
struct prison_service *
prison_service_register(const char *name, prison_create_t create,
    prison_destroy_t destroy)
{
	struct prison_service *psrv, *psrv2;
	struct prison *pr;
	int reallocate = 1, slotno = 0;
	void **slots, **oldslots;

	/* The service name is stored inline, right after the structure. */
	psrv = malloc(sizeof(*psrv) + strlen(name) + 1, M_PRISON,
	    M_WAITOK | M_ZERO);
	psrv->ps_create = create;
	psrv->ps_destroy = destroy;
	strcpy(psrv->ps_name, name);
	/*
	 * Grab the allprison_lock here, so we won't miss any jail
	 * creation/destruction.
	 */
	sx_xlock(&allprison_lock);
#ifdef INVARIANTS
	/*
	 * Verify that the service is not already registered.
	 */
	TAILQ_FOREACH(psrv2, &prison_services, ps_next) {
		KASSERT(strcmp(psrv2->ps_name, name) != 0,
		    ("jail service %s already registered", name));
	}
#endif
	/*
	 * Find a free slot. When there is no existing free slot available,
	 * allocate one at the end.  The list is kept sorted by slot number,
	 * so the first entry whose ps_slotno differs from the running
	 * counter marks a gap.
	 */
	TAILQ_FOREACH(psrv2, &prison_services, ps_next) {
		if (psrv2->ps_slotno != slotno) {
			KASSERT(slotno < psrv2->ps_slotno,
			    ("Invalid slotno (slotno=%d >= ps_slotno=%d",
			    slotno, psrv2->ps_slotno));
			/* We found a free slot. */
			reallocate = 0;
			break;
		}
		slotno++;
	}
	psrv->ps_slotno = slotno;
	/*
	 * Keep the list sorted by slot number.  psrv2 is NULL here exactly
	 * when the loop above ran to the end without finding a gap.
	 */
	if (psrv2 != NULL) {
		KASSERT(reallocate == 0, ("psrv2 != NULL && reallocate != 0"));
		TAILQ_INSERT_BEFORE(psrv2, psrv, ps_next);
	} else {
		KASSERT(reallocate == 1, ("psrv2 == NULL && reallocate == 0"));
		TAILQ_INSERT_TAIL(&prison_services, psrv, ps_next);
	}
	prison_service_slots++;
	sx_downgrade(&allprison_lock);
	/*
	 * Allocate memory for the new slot if we didn't find an empty one.
	 * Do not use realloc(9), because pr_slots is protected with a mutex,
	 * so we can't sleep.
	 */
	LIST_FOREACH(pr, &allprison, pr_list) {
		if (reallocate) {
			/* First allocate memory with M_WAITOK. */
			slots = malloc(sizeof(*slots) * prison_service_slots,
			    M_PRISON, M_WAITOK);
			/* Now grab the mutex and replace pr_slots. */
			mtx_lock(&pr->pr_mtx);
			oldslots = pr->pr_slots;
			/* Carry over the entries of the existing services. */
			if (psrv->ps_slotno > 0) {
				bcopy(oldslots, slots,
				    sizeof(*slots) * (prison_service_slots - 1));
			}
			slots[psrv->ps_slotno] = NULL;
			pr->pr_slots = slots;
			mtx_unlock(&pr->pr_mtx);
			if (oldslots != NULL)
				free(oldslots, M_PRISON);
		}
		/*
		 * Call 'create' method for each existing jail.
		 */
		psrv->ps_create(psrv, pr);
	}
	sx_sunlock(&allprison_lock);

	return (psrv);
}
851168401Spjd
852168401Spjdvoid
853168401Spjdprison_service_deregister(struct prison_service *psrv)
854168401Spjd{
855168401Spjd	struct prison *pr;
856168401Spjd	void **slots, **oldslots;
857168401Spjd	int last = 0;
858168401Spjd
859168401Spjd	sx_xlock(&allprison_lock);
860168401Spjd	if (TAILQ_LAST(&prison_services, prison_services_head) == psrv)
861168401Spjd		last = 1;
862168401Spjd	TAILQ_REMOVE(&prison_services, psrv, ps_next);
863168401Spjd	prison_service_slots--;
864168401Spjd	sx_downgrade(&allprison_lock);
865168401Spjd	LIST_FOREACH(pr, &allprison, pr_list) {
866168401Spjd		/*
867168401Spjd		 * Call 'destroy' method for every currently existing jail.
868168401Spjd		 */
869168401Spjd		psrv->ps_destroy(psrv, pr);
870168401Spjd		/*
871168401Spjd		 * If this is the last slot, free the memory allocated for it.
872168401Spjd		 */
873168401Spjd		if (last) {
874168401Spjd			if (prison_service_slots == 0)
875168401Spjd				slots = NULL;
876168401Spjd			else {
877168401Spjd				slots = malloc(sizeof(*slots) * prison_service_slots,
878168401Spjd				    M_PRISON, M_WAITOK);
879168401Spjd			}
880168401Spjd			mtx_lock(&pr->pr_mtx);
881168401Spjd			oldslots = pr->pr_slots;
882168401Spjd			/*
883168401Spjd			 * We require setting slot to NULL after freeing it,
884168401Spjd			 * this way we can check for memory leaks here.
885168401Spjd			 */
886168401Spjd			KASSERT(oldslots[psrv->ps_slotno] == NULL,
887168401Spjd			    ("Slot %d (service %s, jailid=%d) still contains data?",
888168401Spjd			     psrv->ps_slotno, psrv->ps_name, pr->pr_id));
889168401Spjd			if (psrv->ps_slotno > 0) {
890168401Spjd				bcopy(oldslots, slots,
891168401Spjd				    sizeof(*slots) * prison_service_slots);
892168401Spjd			}
893168401Spjd			pr->pr_slots = slots;
894168401Spjd			mtx_unlock(&pr->pr_mtx);
895168401Spjd			KASSERT(oldslots != NULL, ("oldslots == NULL"));
896168401Spjd			free(oldslots, M_PRISON);
897168401Spjd		}
898168401Spjd	}
899168401Spjd	sx_sunlock(&allprison_lock);
900168401Spjd	free(psrv, M_PRISON);
901168401Spjd}
902168401Spjd
903168401Spjd/*
904168401Spjd * Function sets data for the given jail in slot assigned for the given
905168401Spjd * jail service.
906168401Spjd */
907168401Spjdvoid
908168401Spjdprison_service_data_set(struct prison_service *psrv, struct prison *pr,
909168401Spjd    void *data)
910168401Spjd{
911168401Spjd
912168401Spjd	mtx_assert(&pr->pr_mtx, MA_OWNED);
913168401Spjd	pr->pr_slots[psrv->ps_slotno] = data;
914168401Spjd}
915168401Spjd
916168401Spjd/*
917168401Spjd * Function clears slots assigned for the given jail service in the given
918168401Spjd * prison structure and returns current slot data.
919168401Spjd */
920168401Spjdvoid *
921168401Spjdprison_service_data_del(struct prison_service *psrv, struct prison *pr)
922168401Spjd{
923168401Spjd	void *data;
924168401Spjd
925168401Spjd	mtx_assert(&pr->pr_mtx, MA_OWNED);
926168401Spjd	data = pr->pr_slots[psrv->ps_slotno];
927168401Spjd	pr->pr_slots[psrv->ps_slotno] = NULL;
928168401Spjd	return (data);
929168401Spjd}
930168401Spjd
931168401Spjd/*
932168401Spjd * Function returns current data from the slot assigned to the given jail
933168401Spjd * service for the given jail.
934168401Spjd */
935168401Spjdvoid *
936168401Spjdprison_service_data_get(struct prison_service *psrv, struct prison *pr)
937168401Spjd{
938168401Spjd
939168401Spjd	mtx_assert(&pr->pr_mtx, MA_OWNED);
940168401Spjd	return (pr->pr_slots[psrv->ps_slotno]);
941168401Spjd}
942168401Spjd
943113275Smikestatic int
944113275Smikesysctl_jail_list(SYSCTL_HANDLER_ARGS)
945113275Smike{
946113275Smike	struct xprison *xp, *sxp;
947113275Smike	struct prison *pr;
948113275Smike	int count, error;
949113275Smike
950127020Spjd	if (jailed(req->td->td_ucred))
951125806Srwatson		return (0);
952113275Smike
953168401Spjd	sx_slock(&allprison_lock);
954168401Spjd	if ((count = prisoncount) == 0) {
955168401Spjd		sx_sunlock(&allprison_lock);
956113275Smike		return (0);
957168401Spjd	}
958113275Smike
959113275Smike	sxp = xp = malloc(sizeof(*xp) * count, M_TEMP, M_WAITOK | M_ZERO);
960167309Spjd
961113275Smike	LIST_FOREACH(pr, &allprison, pr_list) {
962113275Smike		xp->pr_version = XPRISON_VERSION;
963113275Smike		xp->pr_id = pr->pr_id;
964168487Spjd		xp->pr_ip = pr->pr_ip;
965113275Smike		strlcpy(xp->pr_path, pr->pr_path, sizeof(xp->pr_path));
966168487Spjd		mtx_lock(&pr->pr_mtx);
967113275Smike		strlcpy(xp->pr_host, pr->pr_host, sizeof(xp->pr_host));
968113275Smike		mtx_unlock(&pr->pr_mtx);
969113275Smike		xp++;
970113275Smike	}
971168401Spjd	sx_sunlock(&allprison_lock);
972113275Smike
973113275Smike	error = SYSCTL_OUT(req, sxp, sizeof(*sxp) * count);
974113275Smike	free(sxp, M_TEMP);
975167354Spjd	return (error);
976113275Smike}
977113275Smike
978113275SmikeSYSCTL_OID(_security_jail, OID_AUTO, list, CTLTYPE_STRUCT | CTLFLAG_RD,
979113275Smike    NULL, 0, sysctl_jail_list, "S", "List of active jails");
980126004Spjd
981126004Spjdstatic int
982126004Spjdsysctl_jail_jailed(SYSCTL_HANDLER_ARGS)
983126004Spjd{
984126004Spjd	int error, injail;
985126004Spjd
986126004Spjd	injail = jailed(req->td->td_ucred);
987126004Spjd	error = SYSCTL_OUT(req, &injail, sizeof(injail));
988126004Spjd
989126004Spjd	return (error);
990126004Spjd}
991126004SpjdSYSCTL_PROC(_security_jail, OID_AUTO, jailed, CTLTYPE_INT | CTLFLAG_RD,
992126004Spjd    NULL, 0, sysctl_jail_jailed, "I", "Process in jail?");
993