kern_jail.c revision 179881
1/*-
2 * ----------------------------------------------------------------------------
3 * "THE BEER-WARE LICENSE" (Revision 42):
4 * <phk@FreeBSD.ORG> wrote this file.  As long as you retain this notice you
5 * can do whatever you want with this stuff. If we meet some day, and you think
6 * this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
7 * ----------------------------------------------------------------------------
8 */
9
10#include <sys/cdefs.h>
11__FBSDID("$FreeBSD: head/sys/kern/kern_jail.c 179881 2008-06-19 21:41:57Z delphij $");
12
13#include "opt_mac.h"
14
15#include <sys/param.h>
16#include <sys/types.h>
17#include <sys/kernel.h>
18#include <sys/systm.h>
19#include <sys/errno.h>
20#include <sys/sysproto.h>
21#include <sys/malloc.h>
22#include <sys/priv.h>
23#include <sys/proc.h>
24#include <sys/taskqueue.h>
25#include <sys/fcntl.h>
26#include <sys/jail.h>
27#include <sys/lock.h>
28#include <sys/mutex.h>
29#include <sys/sx.h>
30#include <sys/namei.h>
31#include <sys/mount.h>
32#include <sys/queue.h>
33#include <sys/socket.h>
34#include <sys/syscallsubr.h>
35#include <sys/sysctl.h>
36#include <sys/vnode.h>
37#include <net/if.h>
38#include <netinet/in.h>
39
40#include <security/mac/mac_framework.h>
41
/* Malloc type used for prison structures and related allocations. */
MALLOC_DEFINE(M_PRISON, "prison", "Prison structures");

/* Root of the security.jail sysctl subtree; the knobs below hang off it. */
SYSCTL_NODE(_security, OID_AUTO, jail, CTLFLAG_RW, 0,
    "Jail rules");

/* Global knob; not consulted anywhere in this file. */
int	jail_set_hostname_allowed = 1;
SYSCTL_INT(_security_jail, OID_AUTO, set_hostname_allowed, CTLFLAG_RW,
    &jail_set_hostname_allowed, 0,
    "Processes in jail can set their hostnames");

/* Consulted by prison_if() for non-AF_INET addresses. */
int	jail_socket_unixiproute_only = 1;
SYSCTL_INT(_security_jail, OID_AUTO, socket_unixiproute_only, CTLFLAG_RW,
    &jail_socket_unixiproute_only, 0,
    "Processes in jail are limited to creating UNIX/IPv4/route sockets only");

/* Global knob; not consulted anywhere in this file. */
int	jail_sysvipc_allowed = 0;
SYSCTL_INT(_security_jail, OID_AUTO, sysvipc_allowed, CTLFLAG_RW,
    &jail_sysvipc_allowed, 0,
    "Processes in jail can use System V IPC primitives");

/*
 * Mount visibility policy used by prison_canseemount() and
 * prison_enforce_statfs():
 *	0 - no restriction,
 *	1 - only mounts located at or below the jail's path are visible,
 *	2 - only the file system holding the jail's root is visible.
 */
static int jail_enforce_statfs = 2;
SYSCTL_INT(_security_jail, OID_AUTO, enforce_statfs, CTLFLAG_RW,
    &jail_enforce_statfs, 0,
    "Processes in jail cannot see all mounted file systems");

/* Gates PRIV_NETINET_RAW in prison_priv_check(). */
int	jail_allow_raw_sockets = 0;
SYSCTL_INT(_security_jail, OID_AUTO, allow_raw_sockets, CTLFLAG_RW,
    &jail_allow_raw_sockets, 0,
    "Prison root can create raw sockets");

/* Gates PRIV_VFS_SYSFLAGS in prison_priv_check(). */
int	jail_chflags_allowed = 0;
SYSCTL_INT(_security_jail, OID_AUTO, chflags_allowed, CTLFLAG_RW,
    &jail_chflags_allowed, 0,
    "Processes in jail can alter system file flags");

/* Gates the mount/unmount privileges in prison_priv_check(). */
int	jail_mount_allowed = 0;
SYSCTL_INT(_security_jail, OID_AUTO, mount_allowed, CTLFLAG_RW,
    &jail_mount_allowed, 0,
    "Processes in jail can mount/unmount jail-friendly file systems");

/* allprison, lastprid, and prisoncount are protected by allprison_lock. */
struct	prisonlist allprison;	/* list of all prisons, linked via pr_list */
struct	sx allprison_lock;	/* initialized in init_prison() */
int	lastprid = 0;		/* most recently assigned prison id */
int	prisoncount = 0;	/* number of entries on allprison */
87
88/*
89 * List of jail services. Protected by allprison_lock.
90 */
91TAILQ_HEAD(prison_services_head, prison_service);
92static struct prison_services_head prison_services =
93    TAILQ_HEAD_INITIALIZER(prison_services);
94static int prison_service_slots = 0;
95
96struct prison_service {
97	prison_create_t ps_create;
98	prison_destroy_t ps_destroy;
99	int		ps_slotno;
100	TAILQ_ENTRY(prison_service) ps_next;
101	char	ps_name[0];
102};
103
104static void		 init_prison(void *);
105static void		 prison_complete(void *context, int pending);
106static int		 sysctl_jail_list(SYSCTL_HANDLER_ARGS);
107
108static void
109init_prison(void *data __unused)
110{
111
112	sx_init(&allprison_lock, "allprison");
113	LIST_INIT(&allprison);
114}
115
116SYSINIT(prison, SI_SUB_INTRINSIC, SI_ORDER_ANY, init_prison, NULL);
117
118/*
119 * struct jail_args {
120 *	struct jail *jail;
121 * };
122 */
123int
124jail(struct thread *td, struct jail_args *uap)
125{
126	struct nameidata nd;
127	struct prison *pr, *tpr;
128	struct prison_service *psrv;
129	struct jail j;
130	struct jail_attach_args jaa;
131	int vfslocked, error, tryprid;
132
133	error = copyin(uap->jail, &j, sizeof(j));
134	if (error)
135		return (error);
136	if (j.version != 0)
137		return (EINVAL);
138
139	MALLOC(pr, struct prison *, sizeof(*pr), M_PRISON, M_WAITOK | M_ZERO);
140	mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF);
141	pr->pr_ref = 1;
142	error = copyinstr(j.path, &pr->pr_path, sizeof(pr->pr_path), 0);
143	if (error)
144		goto e_killmtx;
145	NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | LOCKLEAF, UIO_SYSSPACE,
146	    pr->pr_path, td);
147	error = namei(&nd);
148	if (error)
149		goto e_killmtx;
150	vfslocked = NDHASGIANT(&nd);
151	pr->pr_root = nd.ni_vp;
152	VOP_UNLOCK(nd.ni_vp, 0);
153	NDFREE(&nd, NDF_ONLY_PNBUF);
154	VFS_UNLOCK_GIANT(vfslocked);
155	error = copyinstr(j.hostname, &pr->pr_host, sizeof(pr->pr_host), 0);
156	if (error)
157		goto e_dropvnref;
158	pr->pr_ip = j.ip_number;
159	pr->pr_linux = NULL;
160	pr->pr_securelevel = securelevel;
161	if (prison_service_slots == 0)
162		pr->pr_slots = NULL;
163	else {
164		pr->pr_slots = malloc(sizeof(*pr->pr_slots) * prison_service_slots,
165		    M_PRISON, M_ZERO | M_WAITOK);
166	}
167
168	/* Determine next pr_id and add prison to allprison list. */
169	sx_xlock(&allprison_lock);
170	tryprid = lastprid + 1;
171	if (tryprid == JAIL_MAX)
172		tryprid = 1;
173next:
174	LIST_FOREACH(tpr, &allprison, pr_list) {
175		if (tpr->pr_id == tryprid) {
176			tryprid++;
177			if (tryprid == JAIL_MAX) {
178				sx_xunlock(&allprison_lock);
179				error = EAGAIN;
180				goto e_dropvnref;
181			}
182			goto next;
183		}
184	}
185	pr->pr_id = jaa.jid = lastprid = tryprid;
186	LIST_INSERT_HEAD(&allprison, pr, pr_list);
187	prisoncount++;
188	sx_downgrade(&allprison_lock);
189	TAILQ_FOREACH(psrv, &prison_services, ps_next) {
190		psrv->ps_create(psrv, pr);
191	}
192	sx_sunlock(&allprison_lock);
193
194	error = jail_attach(td, &jaa);
195	if (error)
196		goto e_dropprref;
197	mtx_lock(&pr->pr_mtx);
198	pr->pr_ref--;
199	mtx_unlock(&pr->pr_mtx);
200	td->td_retval[0] = jaa.jid;
201	return (0);
202e_dropprref:
203	sx_xlock(&allprison_lock);
204	LIST_REMOVE(pr, pr_list);
205	prisoncount--;
206	sx_downgrade(&allprison_lock);
207	TAILQ_FOREACH(psrv, &prison_services, ps_next) {
208		psrv->ps_destroy(psrv, pr);
209	}
210	sx_sunlock(&allprison_lock);
211e_dropvnref:
212	vfslocked = VFS_LOCK_GIANT(pr->pr_root->v_mount);
213	vrele(pr->pr_root);
214	VFS_UNLOCK_GIANT(vfslocked);
215e_killmtx:
216	mtx_destroy(&pr->pr_mtx);
217	FREE(pr, M_PRISON);
218	return (error);
219}
220
221/*
222 * struct jail_attach_args {
223 *	int jid;
224 * };
225 */
226int
227jail_attach(struct thread *td, struct jail_attach_args *uap)
228{
229	struct proc *p;
230	struct ucred *newcred, *oldcred;
231	struct prison *pr;
232	int vfslocked, error;
233
234	/*
235	 * XXX: Note that there is a slight race here if two threads
236	 * in the same privileged process attempt to attach to two
237	 * different jails at the same time.  It is important for
238	 * user processes not to do this, or they might end up with
239	 * a process root from one prison, but attached to the jail
240	 * of another.
241	 */
242	error = priv_check(td, PRIV_JAIL_ATTACH);
243	if (error)
244		return (error);
245
246	p = td->td_proc;
247	sx_slock(&allprison_lock);
248	pr = prison_find(uap->jid);
249	if (pr == NULL) {
250		sx_sunlock(&allprison_lock);
251		return (EINVAL);
252	}
253	pr->pr_ref++;
254	mtx_unlock(&pr->pr_mtx);
255	sx_sunlock(&allprison_lock);
256
257	vfslocked = VFS_LOCK_GIANT(pr->pr_root->v_mount);
258	vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY);
259	if ((error = change_dir(pr->pr_root, td)) != 0)
260		goto e_unlock;
261#ifdef MAC
262	if ((error = mac_vnode_check_chroot(td->td_ucred, pr->pr_root)))
263		goto e_unlock;
264#endif
265	VOP_UNLOCK(pr->pr_root, 0);
266	change_root(pr->pr_root, td);
267	VFS_UNLOCK_GIANT(vfslocked);
268
269	newcred = crget();
270	PROC_LOCK(p);
271	oldcred = p->p_ucred;
272	setsugid(p);
273	crcopy(newcred, oldcred);
274	newcred->cr_prison = pr;
275	p->p_ucred = newcred;
276	PROC_UNLOCK(p);
277	crfree(oldcred);
278	return (0);
279e_unlock:
280	VOP_UNLOCK(pr->pr_root, 0);
281	VFS_UNLOCK_GIANT(vfslocked);
282	mtx_lock(&pr->pr_mtx);
283	pr->pr_ref--;
284	mtx_unlock(&pr->pr_mtx);
285	return (error);
286}
287
288/*
289 * Returns a locked prison instance, or NULL on failure.
290 */
291struct prison *
292prison_find(int prid)
293{
294	struct prison *pr;
295
296	sx_assert(&allprison_lock, SX_LOCKED);
297	LIST_FOREACH(pr, &allprison, pr_list) {
298		if (pr->pr_id == prid) {
299			mtx_lock(&pr->pr_mtx);
300			if (pr->pr_ref == 0) {
301				mtx_unlock(&pr->pr_mtx);
302				break;
303			}
304			return (pr);
305		}
306	}
307	return (NULL);
308}
309
310void
311prison_free(struct prison *pr)
312{
313
314	mtx_lock(&pr->pr_mtx);
315	pr->pr_ref--;
316	if (pr->pr_ref == 0) {
317		mtx_unlock(&pr->pr_mtx);
318		TASK_INIT(&pr->pr_task, 0, prison_complete, pr);
319		taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
320		return;
321	}
322	mtx_unlock(&pr->pr_mtx);
323}
324
325static void
326prison_complete(void *context, int pending)
327{
328	struct prison_service *psrv;
329	struct prison *pr;
330	int vfslocked;
331
332	pr = (struct prison *)context;
333
334	sx_xlock(&allprison_lock);
335	LIST_REMOVE(pr, pr_list);
336	prisoncount--;
337	sx_downgrade(&allprison_lock);
338	TAILQ_FOREACH(psrv, &prison_services, ps_next) {
339		psrv->ps_destroy(psrv, pr);
340	}
341	sx_sunlock(&allprison_lock);
342
343	vfslocked = VFS_LOCK_GIANT(pr->pr_root->v_mount);
344	vrele(pr->pr_root);
345	VFS_UNLOCK_GIANT(vfslocked);
346
347	mtx_destroy(&pr->pr_mtx);
348	if (pr->pr_linux != NULL)
349		FREE(pr->pr_linux, M_PRISON);
350	FREE(pr, M_PRISON);
351}
352
353void
354prison_hold(struct prison *pr)
355{
356
357	mtx_lock(&pr->pr_mtx);
358	KASSERT(pr->pr_ref > 0,
359	    ("Trying to hold dead prison (id=%d).", pr->pr_id));
360	pr->pr_ref++;
361	mtx_unlock(&pr->pr_mtx);
362}
363
364u_int32_t
365prison_getip(struct ucred *cred)
366{
367
368	return (cred->cr_prison->pr_ip);
369}
370
371int
372prison_ip(struct ucred *cred, int flag, u_int32_t *ip)
373{
374	u_int32_t tmp;
375
376	if (!jailed(cred))
377		return (0);
378	if (flag)
379		tmp = *ip;
380	else
381		tmp = ntohl(*ip);
382	if (tmp == INADDR_ANY) {
383		if (flag)
384			*ip = cred->cr_prison->pr_ip;
385		else
386			*ip = htonl(cred->cr_prison->pr_ip);
387		return (0);
388	}
389	if (tmp == INADDR_LOOPBACK) {
390		if (flag)
391			*ip = cred->cr_prison->pr_ip;
392		else
393			*ip = htonl(cred->cr_prison->pr_ip);
394		return (0);
395	}
396	if (cred->cr_prison->pr_ip != tmp)
397		return (1);
398	return (0);
399}
400
401void
402prison_remote_ip(struct ucred *cred, int flag, u_int32_t *ip)
403{
404	u_int32_t tmp;
405
406	if (!jailed(cred))
407		return;
408	if (flag)
409		tmp = *ip;
410	else
411		tmp = ntohl(*ip);
412	if (tmp == INADDR_LOOPBACK) {
413		if (flag)
414			*ip = cred->cr_prison->pr_ip;
415		else
416			*ip = htonl(cred->cr_prison->pr_ip);
417		return;
418	}
419	return;
420}
421
422int
423prison_if(struct ucred *cred, struct sockaddr *sa)
424{
425	struct sockaddr_in *sai;
426	int ok;
427
428	sai = (struct sockaddr_in *)sa;
429	if ((sai->sin_family != AF_INET) && jail_socket_unixiproute_only)
430		ok = 1;
431	else if (sai->sin_family != AF_INET)
432		ok = 0;
433	else if (cred->cr_prison->pr_ip != ntohl(sai->sin_addr.s_addr))
434		ok = 1;
435	else
436		ok = 0;
437	return (ok);
438}
439
440/*
441 * Return 0 if jails permit p1 to frob p2, otherwise ESRCH.
442 */
443int
444prison_check(struct ucred *cred1, struct ucred *cred2)
445{
446
447	if (jailed(cred1)) {
448		if (!jailed(cred2))
449			return (ESRCH);
450		if (cred2->cr_prison != cred1->cr_prison)
451			return (ESRCH);
452	}
453
454	return (0);
455}
456
457/*
458 * Return 1 if the passed credential is in a jail, otherwise 0.
459 */
460int
461jailed(struct ucred *cred)
462{
463
464	return (cred->cr_prison != NULL);
465}
466
467/*
468 * Return the correct hostname for the passed credential.
469 */
470void
471getcredhostname(struct ucred *cred, char *buf, size_t size)
472{
473
474	if (jailed(cred)) {
475		mtx_lock(&cred->cr_prison->pr_mtx);
476		strlcpy(buf, cred->cr_prison->pr_host, size);
477		mtx_unlock(&cred->cr_prison->pr_mtx);
478	} else
479		strlcpy(buf, hostname, size);
480}
481
482/*
483 * Determine whether the subject represented by cred can "see"
484 * status of a mount point.
485 * Returns: 0 for permitted, ENOENT otherwise.
486 * XXX: This function should be called cr_canseemount() and should be
487 *      placed in kern_prot.c.
488 */
489int
490prison_canseemount(struct ucred *cred, struct mount *mp)
491{
492	struct prison *pr;
493	struct statfs *sp;
494	size_t len;
495
496	if (!jailed(cred) || jail_enforce_statfs == 0)
497		return (0);
498	pr = cred->cr_prison;
499	if (pr->pr_root->v_mount == mp)
500		return (0);
501	if (jail_enforce_statfs == 2)
502		return (ENOENT);
503	/*
504	 * If jail's chroot directory is set to "/" we should be able to see
505	 * all mount-points from inside a jail.
506	 * This is ugly check, but this is the only situation when jail's
507	 * directory ends with '/'.
508	 */
509	if (strcmp(pr->pr_path, "/") == 0)
510		return (0);
511	len = strlen(pr->pr_path);
512	sp = &mp->mnt_stat;
513	if (strncmp(pr->pr_path, sp->f_mntonname, len) != 0)
514		return (ENOENT);
515	/*
516	 * Be sure that we don't have situation where jail's root directory
517	 * is "/some/path" and mount point is "/some/pathpath".
518	 */
519	if (sp->f_mntonname[len] != '\0' && sp->f_mntonname[len] != '/')
520		return (ENOENT);
521	return (0);
522}
523
524void
525prison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp)
526{
527	char jpath[MAXPATHLEN];
528	struct prison *pr;
529	size_t len;
530
531	if (!jailed(cred) || jail_enforce_statfs == 0)
532		return;
533	pr = cred->cr_prison;
534	if (prison_canseemount(cred, mp) != 0) {
535		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
536		strlcpy(sp->f_mntonname, "[restricted]",
537		    sizeof(sp->f_mntonname));
538		return;
539	}
540	if (pr->pr_root->v_mount == mp) {
541		/*
542		 * Clear current buffer data, so we are sure nothing from
543		 * the valid path left there.
544		 */
545		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
546		*sp->f_mntonname = '/';
547		return;
548	}
549	/*
550	 * If jail's chroot directory is set to "/" we should be able to see
551	 * all mount-points from inside a jail.
552	 */
553	if (strcmp(pr->pr_path, "/") == 0)
554		return;
555	len = strlen(pr->pr_path);
556	strlcpy(jpath, sp->f_mntonname + len, sizeof(jpath));
557	/*
558	 * Clear current buffer data, so we are sure nothing from
559	 * the valid path left there.
560	 */
561	bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
562	if (*jpath == '\0') {
563		/* Should never happen. */
564		*sp->f_mntonname = '/';
565	} else {
566		strlcpy(sp->f_mntonname, jpath, sizeof(sp->f_mntonname));
567	}
568}
569
/*
 * Check whether permission for a specific privilege is granted within jail.
 * We have a specific list of accepted privileges; the rest are denied.
 *
 * Returns 0 when cred is not jailed or the privilege is permitted inside
 * a jail, and EPERM otherwise.  Three groups of privileges depend on
 * global knobs: jail_chflags_allowed, jail_mount_allowed and
 * jail_allow_raw_sockets.
 */
int
prison_priv_check(struct ucred *cred, int priv)
{

	if (!jailed(cred))
		return (0);

	/*
	 * All case labels up to the first "return (0)" deliberately fall
	 * through to it.
	 */
	switch (priv) {

		/*
		 * Allow ktrace privileges for root in jail.
		 */
	case PRIV_KTRACE:

#if 0
		/*
		 * Allow jailed processes to configure audit identity and
		 * submit audit records (login, etc).  In the future we may
		 * want to further refine the relationship between audit and
		 * jail.
		 */
	case PRIV_AUDIT_GETAUDIT:
	case PRIV_AUDIT_SETAUDIT:
	case PRIV_AUDIT_SUBMIT:
#endif

		/*
		 * Allow jailed processes to manipulate process UNIX
		 * credentials in any way they see fit.
		 */
	case PRIV_CRED_SETUID:
	case PRIV_CRED_SETEUID:
	case PRIV_CRED_SETGID:
	case PRIV_CRED_SETEGID:
	case PRIV_CRED_SETGROUPS:
	case PRIV_CRED_SETREUID:
	case PRIV_CRED_SETREGID:
	case PRIV_CRED_SETRESUID:
	case PRIV_CRED_SETRESGID:

		/*
		 * Jail implements visibility constraints already, so allow
		 * jailed root to override uid/gid-based constraints.
		 */
	case PRIV_SEEOTHERGIDS:
	case PRIV_SEEOTHERUIDS:

		/*
		 * Jail implements inter-process debugging limits already, so
		 * allow jailed root various debugging privileges.
		 */
	case PRIV_DEBUG_DIFFCRED:
	case PRIV_DEBUG_SUGID:
	case PRIV_DEBUG_UNPRIV:

		/*
		 * Allow jail to set various resource limits and login
		 * properties, and for now, exceed process resource limits.
		 */
	case PRIV_PROC_LIMIT:
	case PRIV_PROC_SETLOGIN:
	case PRIV_PROC_SETRLIMIT:

		/*
		 * System V and POSIX IPC privileges are granted in jail.
		 */
	case PRIV_IPC_READ:
	case PRIV_IPC_WRITE:
	case PRIV_IPC_ADMIN:
	case PRIV_IPC_MSGSIZE:
	case PRIV_MQ_ADMIN:

		/*
		 * Jail implements its own inter-process limits, so allow
		 * root processes in jail to change scheduling on other
		 * processes in the same jail.  Likewise for signalling.
		 */
	case PRIV_SCHED_DIFFCRED:
	case PRIV_SIGNAL_DIFFCRED:
	case PRIV_SIGNAL_SUGID:

		/*
		 * Allow jailed processes to write to sysctls marked as jail
		 * writable.
		 */
	case PRIV_SYSCTL_WRITEJAIL:

		/*
		 * Allow root in jail to manage a variety of quota
		 * properties.  These should likely be conditional on a
		 * configuration option.
		 */
	case PRIV_VFS_GETQUOTA:
	case PRIV_VFS_SETQUOTA:

		/*
		 * Since Jail relies on chroot() to implement file system
		 * protections, grant many VFS privileges to root in jail.
		 * Be careful to exclude mount-related and NFS-related
		 * privileges.
		 */
	case PRIV_VFS_READ:
	case PRIV_VFS_WRITE:
	case PRIV_VFS_ADMIN:
	case PRIV_VFS_EXEC:
	case PRIV_VFS_LOOKUP:
	case PRIV_VFS_BLOCKRESERVE:	/* XXXRW: Slightly surprising. */
	case PRIV_VFS_CHFLAGS_DEV:
	case PRIV_VFS_CHOWN:
	case PRIV_VFS_CHROOT:
	case PRIV_VFS_RETAINSUGID:
	case PRIV_VFS_FCHROOT:
	case PRIV_VFS_LINK:
	case PRIV_VFS_SETGID:
	case PRIV_VFS_STAT:
	case PRIV_VFS_STICKYFILE:
		return (0);

		/*
		 * Depending on the global setting, allow privilege of
		 * setting system flags.
		 */
	case PRIV_VFS_SYSFLAGS:
		if (jail_chflags_allowed)
			return (0);
		else
			return (EPERM);

		/*
		 * Depending on the global setting, allow privilege of
		 * mounting/unmounting file systems.
		 */
	case PRIV_VFS_MOUNT:
	case PRIV_VFS_UNMOUNT:
	case PRIV_VFS_MOUNT_NONUSER:
	case PRIV_VFS_MOUNT_OWNER:
		if (jail_mount_allowed)
			return (0);
		else
			return (EPERM);

		/*
		 * Allow jailed root to bind reserved ports and reuse in-use
		 * ports.
		 */
	case PRIV_NETINET_RESERVEDPORT:
	case PRIV_NETINET_REUSEPORT:
		return (0);

		/*
		 * Allow jailed root to set certain IPv4/6 (option) headers.
		 */
	case PRIV_NETINET_SETHDROPTS:
		return (0);

		/*
		 * Conditionally allow creating raw sockets in jail.
		 */
	case PRIV_NETINET_RAW:
		if (jail_allow_raw_sockets)
			return (0);
		else
			return (EPERM);

		/*
		 * Since jail implements its own visibility limits on netstat
		 * sysctls, allow getcred.  This allows identd to work in
		 * jail.
		 */
	case PRIV_NETINET_GETCRED:
		return (0);

	default:
		/*
		 * In all remaining cases, deny the privilege request.  This
		 * includes almost all network privileges, many system
		 * configuration privileges.
		 */
		return (EPERM);
	}
}
755
/*
 * Register a jail service.  Provides 'create' and 'destroy' methods.
 * The 'create' method will be called for every existing jail and for all
 * jails created in the future.
 * The 'destroy' method will be called for every jail going away and
 * for all existing jails at the time of service deregistration.
 *
 * name is copied into the returned structure.  The service is assigned
 * the lowest slot number not used by an already registered service.
 * Returns the newly allocated service handle, to be passed to
 * prison_service_deregister() and the prison_service_data_*() functions.
 * May sleep (M_WAITOK allocations, sx lock).
 */
struct prison_service *
prison_service_register(const char *name, prison_create_t create,
    prison_destroy_t destroy)
{
	struct prison_service *psrv, *psrv2;
	struct prison *pr;
	int reallocate = 1, slotno = 0;
	void **slots, **oldslots;

	/* The name is stored inline, right after the fixed-size fields. */
	psrv = malloc(sizeof(*psrv) + strlen(name) + 1, M_PRISON,
	    M_WAITOK | M_ZERO);
	psrv->ps_create = create;
	psrv->ps_destroy = destroy;
	strcpy(psrv->ps_name, name);
	/*
	 * Grab the allprison_lock here, so we won't miss any jail
	 * creation/destruction.
	 */
	sx_xlock(&allprison_lock);
#ifdef INVARIANTS
	/*
	 * Verify that the service is not already registered.
	 */
	TAILQ_FOREACH(psrv2, &prison_services, ps_next) {
		KASSERT(strcmp(psrv2->ps_name, name) != 0,
		    ("jail service %s already registered", name));
	}
#endif
	/*
	 * Find a free slot. When there is no existing free slot available,
	 * allocate one at the end.  The list is sorted by slot number, so
	 * the first entry whose number differs from the running counter
	 * marks a gap.
	 */
	TAILQ_FOREACH(psrv2, &prison_services, ps_next) {
		if (psrv2->ps_slotno != slotno) {
			KASSERT(slotno < psrv2->ps_slotno,
			    ("Invalid slotno (slotno=%d >= ps_slotno=%d",
			    slotno, psrv2->ps_slotno));
			/* We found a free slot. */
			reallocate = 0;
			break;
		}
		slotno++;
	}
	psrv->ps_slotno = slotno;
	/*
	 * Keep the list sorted by slot number.  psrv2 is non-NULL only
	 * when the loop above stopped early at a gap.
	 */
	if (psrv2 != NULL) {
		KASSERT(reallocate == 0, ("psrv2 != NULL && reallocate != 0"));
		TAILQ_INSERT_BEFORE(psrv2, psrv, ps_next);
	} else {
		KASSERT(reallocate == 1, ("psrv2 == NULL && reallocate == 0"));
		TAILQ_INSERT_TAIL(&prison_services, psrv, ps_next);
	}
	prison_service_slots++;
	sx_downgrade(&allprison_lock);
	/*
	 * Allocate memory for the new slot if we didn't find an empty one.
	 * Do not use realloc(9), because pr_slots is protected with a mutex,
	 * so we can't sleep.
	 */
	LIST_FOREACH(pr, &allprison, pr_list) {
		if (reallocate) {
			/* First allocate memory with M_WAITOK. */
			slots = malloc(sizeof(*slots) * prison_service_slots,
			    M_PRISON, M_WAITOK);
			/* Now grab the mutex and replace pr_slots. */
			mtx_lock(&pr->pr_mtx);
			oldslots = pr->pr_slots;
			/* Slot 0 means there was no previous array to copy. */
			if (psrv->ps_slotno > 0) {
				bcopy(oldslots, slots,
				    sizeof(*slots) * (prison_service_slots - 1));
			}
			slots[psrv->ps_slotno] = NULL;
			pr->pr_slots = slots;
			mtx_unlock(&pr->pr_mtx);
			if (oldslots != NULL)
				free(oldslots, M_PRISON);
		}
		/*
		 * Call 'create' method for each existing jail.
		 */
		psrv->ps_create(psrv, pr);
	}
	sx_sunlock(&allprison_lock);

	return (psrv);
}
851
/*
 * Deregister a jail service and free its handle.
 *
 * The service's 'destroy' method is called for every existing jail.
 * When the service was the last one on the (slot-sorted) list, each
 * jail's slot array is replaced by one sized for the remaining number of
 * services.  May sleep (M_WAITOK allocation, sx lock).
 *
 * NOTE(review): prison_service_slots is decremented even when the
 * removed service is not the last one, while the slot arrays are only
 * shrunk in the 'last' case.  jail() sizes new arrays from
 * prison_service_slots, so after removing a non-last service the count
 * looks smaller than the highest slot number still in use -- confirm
 * whether this can lead to out-of-bounds slot accesses.
 */
void
prison_service_deregister(struct prison_service *psrv)
{
	struct prison *pr;
	void **slots, **oldslots;
	int last = 0;

	sx_xlock(&allprison_lock);
	if (TAILQ_LAST(&prison_services, prison_services_head) == psrv)
		last = 1;
	TAILQ_REMOVE(&prison_services, psrv, ps_next);
	prison_service_slots--;
	sx_downgrade(&allprison_lock);
	LIST_FOREACH(pr, &allprison, pr_list) {
		/*
		 * Call 'destroy' method for every currently existing jail.
		 */
		psrv->ps_destroy(psrv, pr);
		/*
		 * If this is the last slot, free the memory allocated for it.
		 */
		if (last) {
			/* Allocate before taking the mutex: malloc may sleep. */
			if (prison_service_slots == 0)
				slots = NULL;
			else {
				slots = malloc(sizeof(*slots) * prison_service_slots,
				    M_PRISON, M_WAITOK);
			}
			mtx_lock(&pr->pr_mtx);
			oldslots = pr->pr_slots;
			/*
			 * We require setting slot to NULL after freeing it,
			 * this way we can check for memory leaks here.
			 */
			KASSERT(oldslots[psrv->ps_slotno] == NULL,
			    ("Slot %d (service %s, jailid=%d) still contains data?",
			     psrv->ps_slotno, psrv->ps_name, pr->pr_id));
			/* Carry the surviving slots over to the new array. */
			if (psrv->ps_slotno > 0) {
				bcopy(oldslots, slots,
				    sizeof(*slots) * prison_service_slots);
			}
			pr->pr_slots = slots;
			mtx_unlock(&pr->pr_mtx);
			KASSERT(oldslots != NULL, ("oldslots == NULL"));
			free(oldslots, M_PRISON);
		}
	}
	sx_sunlock(&allprison_lock);
	free(psrv, M_PRISON);
}
902
903/*
904 * Function sets data for the given jail in slot assigned for the given
905 * jail service.
906 */
907void
908prison_service_data_set(struct prison_service *psrv, struct prison *pr,
909    void *data)
910{
911
912	mtx_assert(&pr->pr_mtx, MA_OWNED);
913	pr->pr_slots[psrv->ps_slotno] = data;
914}
915
916/*
917 * Function clears slots assigned for the given jail service in the given
918 * prison structure and returns current slot data.
919 */
920void *
921prison_service_data_del(struct prison_service *psrv, struct prison *pr)
922{
923	void *data;
924
925	mtx_assert(&pr->pr_mtx, MA_OWNED);
926	data = pr->pr_slots[psrv->ps_slotno];
927	pr->pr_slots[psrv->ps_slotno] = NULL;
928	return (data);
929}
930
931/*
932 * Function returns current data from the slot assigned to the given jail
933 * service for the given jail.
934 */
935void *
936prison_service_data_get(struct prison_service *psrv, struct prison *pr)
937{
938
939	mtx_assert(&pr->pr_mtx, MA_OWNED);
940	return (pr->pr_slots[psrv->ps_slotno]);
941}
942
943static int
944sysctl_jail_list(SYSCTL_HANDLER_ARGS)
945{
946	struct xprison *xp, *sxp;
947	struct prison *pr;
948	int count, error;
949
950	if (jailed(req->td->td_ucred))
951		return (0);
952
953	sx_slock(&allprison_lock);
954	if ((count = prisoncount) == 0) {
955		sx_sunlock(&allprison_lock);
956		return (0);
957	}
958
959	sxp = xp = malloc(sizeof(*xp) * count, M_TEMP, M_WAITOK | M_ZERO);
960
961	LIST_FOREACH(pr, &allprison, pr_list) {
962		xp->pr_version = XPRISON_VERSION;
963		xp->pr_id = pr->pr_id;
964		xp->pr_ip = pr->pr_ip;
965		strlcpy(xp->pr_path, pr->pr_path, sizeof(xp->pr_path));
966		mtx_lock(&pr->pr_mtx);
967		strlcpy(xp->pr_host, pr->pr_host, sizeof(xp->pr_host));
968		mtx_unlock(&pr->pr_mtx);
969		xp++;
970	}
971	sx_sunlock(&allprison_lock);
972
973	error = SYSCTL_OUT(req, sxp, sizeof(*sxp) * count);
974	free(sxp, M_TEMP);
975	return (error);
976}
977
978SYSCTL_OID(_security_jail, OID_AUTO, list, CTLTYPE_STRUCT | CTLFLAG_RD,
979    NULL, 0, sysctl_jail_list, "S", "List of active jails");
980
981static int
982sysctl_jail_jailed(SYSCTL_HANDLER_ARGS)
983{
984	int error, injail;
985
986	injail = jailed(req->td->td_ucred);
987	error = SYSCTL_OUT(req, &injail, sizeof(injail));
988
989	return (error);
990}
991SYSCTL_PROC(_security_jail, OID_AUTO, jailed, CTLTYPE_INT | CTLFLAG_RD,
992    NULL, 0, sysctl_jail_jailed, "I", "Process in jail?");
993