zone.c revision 4846:f4f1c86d0e2d
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27#pragma ident	"%Z%%M%	%I%	%E% SMI"
28
29/*
30 * Zones
31 *
32 *   A zone is a named collection of processes, namespace constraints,
33 *   and other system resources which comprise a secure and manageable
34 *   application containment facility.
35 *
36 *   Zones (represented by the reference counted zone_t) are tracked in
37 *   the kernel in the zonehash.  Elsewhere in the kernel, Zone IDs
38 *   (zoneid_t) are used to track zone association.  Zone IDs are
39 *   dynamically generated when the zone is created; if a persistent
40 *   identifier is needed (core files, accounting logs, audit trail,
41 *   etc.), the zone name should be used.
42 *
43 *
44 *   Global Zone:
45 *
46 *   The global zone (zoneid 0) is automatically associated with all
47 *   system resources that have not been bound to a user-created zone.
48 *   This means that even systems where zones are not in active use
49 *   have a global zone, and all processes, mounts, etc. are
50 *   associated with that zone.  The global zone is generally
51 *   unconstrained in terms of privileges and access, though the usual
52 *   credential and privilege based restrictions apply.
53 *
54 *
55 *   Zone States:
56 *
57 *   The states a zone may be in, and the transitions between them, are
58 *   as follows:
59 *
60 *   ZONE_IS_UNINITIALIZED: primordial state for a zone. The partially
61 *   initialized zone is added to the list of active zones on the system but
62 *   isn't accessible.
63 *
64 *   ZONE_IS_READY: zsched (the kernel dummy process for a zone) is
65 *   ready.  The zone is made visible after the ZSD constructor callbacks are
66 *   executed.  A zone remains in this state until it transitions into
67 *   the ZONE_IS_BOOTING state as a result of a call to zone_boot().
68 *
69 *   ZONE_IS_BOOTING: in this short-lived state, zsched attempts to start
70 *   init.  Should that fail, the zone proceeds to the ZONE_IS_SHUTTING_DOWN
71 *   state.
72 *
73 *   ZONE_IS_RUNNING: The zone is open for business: zsched has
74 *   successfully started init.   A zone remains in this state until
75 *   zone_shutdown() is called.
76 *
77 *   ZONE_IS_SHUTTING_DOWN: zone_shutdown() has been called, the system is
78 *   killing all processes running in the zone. The zone remains
79 *   in this state until there are no more user processes running in the zone.
80 *   zone_create(), zone_enter(), and zone_destroy() on this zone will fail.
81 *   Since zone_shutdown() is restartable, it may be called successfully
82 *   multiple times for the same zone_t.  Setting of the zone's state to
83 *   ZONE_IS_SHUTTING_DOWN is synchronized with mounts, so VOP_MOUNT() may check
84 *   the zone's status without worrying about it being a moving target.
85 *
86 *   ZONE_IS_EMPTY: zone_shutdown() has been called, and there
87 *   are no more user processes in the zone.  The zone remains in this
88 *   state until there are no more kernel threads associated with the
89 *   zone.  zone_create(), zone_enter(), and zone_destroy() on this zone will
90 *   fail.
91 *
92 *   ZONE_IS_DOWN: All kernel threads doing work on behalf of the zone
93 *   have exited.  zone_shutdown() returns.  Henceforth it is not possible to
94 *   join the zone or create kernel threads therein.
95 *
96 *   ZONE_IS_DYING: zone_destroy() has been called on the zone; zone
97 *   remains in this state until zsched exits.  Calls to zone_find_by_*()
98 *   return NULL from now on.
99 *
100 *   ZONE_IS_DEAD: zsched has exited (zone_ntasks == 0).  There are no
101 *   processes or threads doing work on behalf of the zone.  The zone is
102 *   removed from the list of active zones.  zone_destroy() returns, and
103 *   the zone can be recreated.
104 *
105 *   ZONE_IS_FREE (internal state): zone_ref goes to 0, ZSD destructor
106 *   callbacks are executed, and all memory associated with the zone is
107 *   freed.
108 *
109 *   Threads can wait for the zone to enter a requested state by using
110 *   zone_status_wait() or zone_status_timedwait() with the desired
111 *   state passed in as an argument.  Zone state transitions are
112 *   uni-directional; it is not possible to move back to an earlier state.
113 *
114 *
115 *   Zone-Specific Data:
116 *
117 *   Subsystems needing to maintain zone-specific data can store that
118 *   data using the ZSD mechanism.  This provides a zone-specific data
119 *   store, similar to thread-specific data (see pthread_getspecific(3C)
120 *   or the TSD code in uts/common/disp/thread.c).  Also, ZSD can be used
121 *   to register callbacks to be invoked when a zone is created, shut
122 *   down, or destroyed.  This can be used to initialize zone-specific
123 *   data for new zones and to clean up when zones go away.
124 *
125 *
126 *   Data Structures:
127 *
128 *   The per-zone structure (zone_t) is reference counted, and freed
129 *   when all references are released.  zone_hold and zone_rele can be
130 *   used to adjust the reference count.  In addition, reference counts
131 *   associated with the cred_t structure are tracked separately using
132 *   zone_cred_hold and zone_cred_rele.
133 *
134 *   Pointers to active zone_t's are stored in two hash tables; one
135 *   for searching by id, the other for searching by name.  Lookups
136 *   can be performed on either basis, using zone_find_by_id and
137 *   zone_find_by_name.  Both return zone_t pointers with the zone
138 *   held, so zone_rele should be called when the pointer is no longer
139 *   needed.  Zones can also be searched by path; zone_find_by_path
140 *   returns the zone with which a path name is associated (global
141 *   zone if the path is not within some other zone's file system
142 *   hierarchy).  This currently requires iterating through each zone,
143 *   so it is slower than an id or name search via a hash table.
144 *
145 *
146 *   Locking:
147 *
148 *   zonehash_lock: This is a top-level global lock used to protect the
149 *       zone hash tables and lists.  Zones cannot be created or destroyed
150 *       while this lock is held.
151 *   zone_status_lock: This is a global lock protecting zone state.
152 *       Zones cannot change state while this lock is held.  It also
153 *       protects the list of kernel threads associated with a zone.
154 *   zone_lock: This is a per-zone lock used to protect several fields of
155 *       the zone_t (see <sys/zone.h> for details).  In addition, holding
156 *       this lock means that the zone cannot go away.
157 *   zone_nlwps_lock: This is a per-zone lock used to protect the fields
158 *	 related to the zone.max-lwps rctl.
159 *   zone_mem_lock: This is a per-zone lock used to protect the fields
160 *	 related to the zone.max-locked-memory and zone.max-swap rctls.
161 *   zsd_key_lock: This is a global lock protecting the key state for ZSD.
162 *   zone_deathrow_lock: This is a global lock protecting the "deathrow"
163 *       list (a list of zones in the ZONE_IS_DEAD state).
164 *
165 *   Ordering requirements:
166 *       pool_lock --> cpu_lock --> zonehash_lock --> zone_status_lock -->
167 *       	zone_lock --> zsd_key_lock --> pidlock --> p_lock
168 *
169 *   When taking zone_mem_lock or zone_nlwps_lock, the lock ordering is:
170 *	zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_mem_lock
171 *	zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_nlwps_lock
172 *
173 *   Blocking memory allocations are permitted while holding any of the
174 *   zone locks.
175 *
176 *
177 *   System Call Interface:
178 *
179 *   The zone subsystem can be managed and queried from user level with
180 *   the following system calls (all subcodes of the primary "zone"
181 *   system call):
182 *   - zone_create: creates a zone with selected attributes (name,
183 *     root path, privileges, resource controls, ZFS datasets)
184 *   - zone_enter: allows the current process to enter a zone
185 *   - zone_getattr: reports attributes of a zone
186 *   - zone_setattr: set attributes of a zone
187 *   - zone_boot: set 'init' running for the zone
188 *   - zone_list: lists all zones active in the system
189 *   - zone_lookup: looks up zone id based on name
190 *   - zone_shutdown: initiates shutdown process (see states above)
191 *   - zone_destroy: completes shutdown process (see states above)
192 *
193 */
194
195#include <sys/priv_impl.h>
196#include <sys/cred.h>
197#include <c2/audit.h>
198#include <sys/debug.h>
199#include <sys/file.h>
200#include <sys/kmem.h>
201#include <sys/kstat.h>
202#include <sys/mutex.h>
203#include <sys/note.h>
204#include <sys/pathname.h>
205#include <sys/proc.h>
206#include <sys/project.h>
207#include <sys/sysevent.h>
208#include <sys/task.h>
209#include <sys/systm.h>
210#include <sys/types.h>
211#include <sys/utsname.h>
212#include <sys/vnode.h>
213#include <sys/vfs.h>
214#include <sys/systeminfo.h>
215#include <sys/policy.h>
216#include <sys/cred_impl.h>
217#include <sys/contract_impl.h>
218#include <sys/contract/process_impl.h>
219#include <sys/class.h>
220#include <sys/pool.h>
221#include <sys/pool_pset.h>
222#include <sys/pset.h>
223#include <sys/sysmacros.h>
224#include <sys/callb.h>
225#include <sys/vmparam.h>
226#include <sys/corectl.h>
227#include <sys/ipc_impl.h>
228
229#include <sys/door.h>
230#include <sys/cpuvar.h>
231
232#include <sys/uadmin.h>
233#include <sys/session.h>
234#include <sys/cmn_err.h>
235#include <sys/modhash.h>
236#include <sys/sunddi.h>
237#include <sys/nvpair.h>
238#include <sys/rctl.h>
239#include <sys/fss.h>
240#include <sys/brand.h>
241#include <sys/zone.h>
242#include <net/if.h>
243#include <sys/cpucaps.h>
244#include <vm/seg.h>
245
246/*
247 * cv used to signal that all references to the zone have been released.  This
248 * needs to be global since there may be multiple waiters, and the first to
249 * wake up will free the zone_t, hence we cannot use zone->zone_cv.
250 */
251static kcondvar_t zone_destroy_cv;
252/*
253 * Lock used to serialize access to zone_cv.  This could have been per-zone,
254 * but then we'd need another lock for zone_destroy_cv, and why bother?
255 */
256static kmutex_t zone_status_lock;
257
258/*
259 * ZSD-related global variables.
260 */
261static kmutex_t zsd_key_lock;	/* protects the following two */
262/*
263 * The next caller of zone_key_create() will be assigned a key of ++zsd_keyval.
264 */
265static zone_key_t zsd_keyval = 0;
266/*
267 * Global list of registered keys.  We use this when a new zone is created.
268 */
269static list_t zsd_registered_keys;
270
271int zone_hash_size = 256;
272static mod_hash_t *zonehashbyname, *zonehashbyid, *zonehashbylabel;
273static kmutex_t zonehash_lock;
274static uint_t zonecount;
275static id_space_t *zoneid_space;
276
277/*
278 * The global zone (aka zone0) is the all-seeing, all-knowing zone in which the
279 * kernel proper runs, and which manages all other zones.
280 *
281 * Although not declared as static, the variable "zone0" should not be used
282 * except for by code that needs to reference the global zone early on in boot,
283 * before it is fully initialized.  All other consumers should use
284 * 'global_zone'.
285 */
286zone_t zone0;
287zone_t *global_zone = NULL;	/* Set when the global zone is initialized */
288
289/*
290 * List of active zones, protected by zonehash_lock.
291 */
292static list_t zone_active;
293
294/*
295 * List of destroyed zones that still have outstanding cred references.
296 * Used for debugging.  Uses a separate lock to avoid lock ordering
297 * problems in zone_free.
298 */
299static list_t zone_deathrow;
300static kmutex_t zone_deathrow_lock;
301
302/* number of zones is limited by virtual interface limit in IP */
303uint_t maxzones = 8192;
304
305/* Event channel to sent zone state change notifications */
306evchan_t *zone_event_chan;
307
308/*
309 * This table holds the mapping from kernel zone states to
310 * states visible in the state notification API.
311 * The idea is that we only expose "obvious" states and
312 * do not expose states which are just implementation details.
313 */
314const char  *zone_status_table[] = {
315	ZONE_EVENT_UNINITIALIZED,	/* uninitialized */
316	ZONE_EVENT_READY,		/* ready */
317	ZONE_EVENT_READY,		/* booting */
318	ZONE_EVENT_RUNNING,		/* running */
319	ZONE_EVENT_SHUTTING_DOWN,	/* shutting_down */
320	ZONE_EVENT_SHUTTING_DOWN,	/* empty */
321	ZONE_EVENT_SHUTTING_DOWN,	/* down */
322	ZONE_EVENT_SHUTTING_DOWN,	/* dying */
323	ZONE_EVENT_UNINITIALIZED,	/* dead */
324};
325
326/*
327 * This isn't static so lint doesn't complain.
328 */
329rctl_hndl_t rc_zone_cpu_shares;
330rctl_hndl_t rc_zone_locked_mem;
331rctl_hndl_t rc_zone_max_swap;
332rctl_hndl_t rc_zone_cpu_cap;
333rctl_hndl_t rc_zone_nlwps;
334rctl_hndl_t rc_zone_shmmax;
335rctl_hndl_t rc_zone_shmmni;
336rctl_hndl_t rc_zone_semmni;
337rctl_hndl_t rc_zone_msgmni;
338/*
339 * Synchronization primitives used to synchronize between mounts and zone
340 * creation/destruction.
341 */
342static int mounts_in_progress;
343static kcondvar_t mount_cv;
344static kmutex_t mount_lock;
345
346const char * const zone_default_initname = "/sbin/init";
347static char * const zone_prefix = "/zone/";
348static int zone_shutdown(zoneid_t zoneid);
349static int zone_add_datalink(zoneid_t, char *);
350static int zone_remove_datalink(zoneid_t, char *);
351static int zone_check_datalink(zoneid_t *, char *);
352static int zone_list_datalink(zoneid_t, int *, char *);
353
/*
 * Version of the zone syscall interface.  Increase it whenever the zone
 * syscall interfaces change: libc has to keep supporting earlier API
 * versions for patching purposes, and it asks the kernel for this number.
 *
 * History:
 *   1 - the API as originally shipped with Solaris 10.
 *   2 - zone_create takes its arguments in a structure so that more of them
 *       can be supported, and reports zone_create() failures better.
 *   3 - zone_create supports importing ZFS datasets into zones.
 *   4 - zone_create supports Trusted Extensions.
 *   5 - zone_boot changes; its old bootargs parameter is now set through
 *       the zone_setattr API instead.
 *   6 - zone_create gains the flag argument.
 */
static const int ZONE_SYSCALL_API_VERSION = 6;
372
373/*
374 * Certain filesystems (such as NFS and autofs) need to know which zone
375 * the mount is being placed in.  Because of this, we need to be able to
376 * ensure that a zone isn't in the process of being created such that
377 * nfs_mount() thinks it is in the global zone, while by the time it
378 * gets added to the list of mounted zones, it ends up on zoneA's mount
379 * list.
380 *
381 * The following functions: block_mounts()/resume_mounts() and
382 * mount_in_progress()/mount_completed() are used by zones and the VFS
383 * layer (respectively) to synchronize zone creation and new mounts.
384 *
385 * The semantics are like a reader-reader lock such that there may
386 * either be multiple mounts (or zone creations, if that weren't
387 * serialized by zonehash_lock) in progress at the same time, but not
388 * both.
389 *
390 * We use cv's so the user can ctrl-C out of the operation if it's
391 * taking too long.
392 *
393 * The semantics are such that there is unfair bias towards the
394 * "current" operation.  This means that zone creations may starve if
395 * there is a rapid succession of new mounts coming in to the system, or
396 * there is a remote possibility that zones will be created at such a
397 * rate that new mounts will not be able to proceed.
398 */
399/*
400 * Prevent new mounts from progressing to the point of calling
401 * VFS_MOUNT().  If there are already mounts in this "region", wait for
402 * them to complete.
403 */
404static int
405block_mounts(void)
406{
407	int retval = 0;
408
409	/*
410	 * Since it may block for a long time, block_mounts() shouldn't be
411	 * called with zonehash_lock held.
412	 */
413	ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
414	mutex_enter(&mount_lock);
415	while (mounts_in_progress > 0) {
416		if (cv_wait_sig(&mount_cv, &mount_lock) == 0)
417			goto signaled;
418	}
419	/*
420	 * A negative value of mounts_in_progress indicates that mounts
421	 * have been blocked by (-mounts_in_progress) different callers.
422	 */
423	mounts_in_progress--;
424	retval = 1;
425signaled:
426	mutex_exit(&mount_lock);
427	return (retval);
428}
429
430/*
431 * The VFS layer may progress with new mounts as far as we're concerned.
432 * Allow them to progress if we were the last obstacle.
433 */
434static void
435resume_mounts(void)
436{
437	mutex_enter(&mount_lock);
438	if (++mounts_in_progress == 0)
439		cv_broadcast(&mount_cv);
440	mutex_exit(&mount_lock);
441}
442
443/*
444 * The VFS layer is busy with a mount; zones should wait until all
445 * mounts are completed to progress.
446 */
447void
448mount_in_progress(void)
449{
450	mutex_enter(&mount_lock);
451	while (mounts_in_progress < 0)
452		cv_wait(&mount_cv, &mount_lock);
453	mounts_in_progress++;
454	mutex_exit(&mount_lock);
455}
456
457/*
458 * VFS is done with one mount; wake up any waiting block_mounts()
459 * callers if this is the last mount.
460 */
461void
462mount_completed(void)
463{
464	mutex_enter(&mount_lock);
465	if (--mounts_in_progress == 0)
466		cv_broadcast(&mount_cv);
467	mutex_exit(&mount_lock);
468}
469
470/*
471 * ZSD routines.
472 *
473 * Zone Specific Data (ZSD) is modeled after Thread Specific Data as
474 * defined by the pthread_key_create() and related interfaces.
475 *
476 * Kernel subsystems may register one or more data items and/or
477 * callbacks to be executed when a zone is created, shutdown, or
478 * destroyed.
479 *
480 * Unlike the thread counterpart, destructor callbacks will be executed
481 * even if the data pointer is NULL and/or there are no constructor
482 * callbacks, so it is the responsibility of such callbacks to check for
483 * NULL data values if necessary.
484 *
485 * The locking strategy and overall picture is as follows:
486 *
487 * When someone calls zone_key_create(), a template ZSD entry is added to the
488 * global list "zsd_registered_keys", protected by zsd_key_lock.  The
489 * constructor callback is called immediately on all existing zones, and a
490 * copy of the ZSD entry added to the per-zone zone_zsd list (protected by
491 * zone_lock).  As this operation requires the list of zones, the list of
492 * registered keys, and the per-zone list of ZSD entries to remain constant
493 * throughout the entire operation, it must grab zonehash_lock, zone_lock for
494 * all existing zones, and zsd_key_lock, in that order.  Similar locking is
495 * needed when zone_key_delete() is called.  It is thus sufficient to hold
496 * zsd_key_lock *or* zone_lock to prevent additions to or removals from the
497 * per-zone zone_zsd list.
498 *
499 * Note that this implementation does not make a copy of the ZSD entry if a
500 * constructor callback is not provided.  A zone_getspecific() on such an
501 * uninitialized ZSD entry will return NULL.
502 *
503 * When new zones are created constructor callbacks for all registered ZSD
504 * entries will be called.
505 *
506 * The framework does not provide any locking around zone_getspecific() and
507 * zone_setspecific() apart from that needed for internal consistency, so
508 * callers interested in atomic "test-and-set" semantics will need to provide
509 * their own locking.
510 */
511void
512zone_key_create(zone_key_t *keyp, void *(*create)(zoneid_t),
513    void (*shutdown)(zoneid_t, void *), void (*destroy)(zoneid_t, void *))
514{
515	struct zsd_entry *zsdp;
516	struct zsd_entry *t;
517	struct zone *zone;
518
519	zsdp = kmem_alloc(sizeof (*zsdp), KM_SLEEP);
520	zsdp->zsd_data = NULL;
521	zsdp->zsd_create = create;
522	zsdp->zsd_shutdown = shutdown;
523	zsdp->zsd_destroy = destroy;
524
525	mutex_enter(&zonehash_lock);	/* stop the world */
526	for (zone = list_head(&zone_active); zone != NULL;
527	    zone = list_next(&zone_active, zone))
528		mutex_enter(&zone->zone_lock);	/* lock all zones */
529
530	mutex_enter(&zsd_key_lock);
531	*keyp = zsdp->zsd_key = ++zsd_keyval;
532	ASSERT(zsd_keyval != 0);
533	list_insert_tail(&zsd_registered_keys, zsdp);
534	mutex_exit(&zsd_key_lock);
535
536	if (create != NULL) {
537		for (zone = list_head(&zone_active); zone != NULL;
538		    zone = list_next(&zone_active, zone)) {
539			t = kmem_alloc(sizeof (*t), KM_SLEEP);
540			t->zsd_key = *keyp;
541			t->zsd_data = (*create)(zone->zone_id);
542			t->zsd_create = create;
543			t->zsd_shutdown = shutdown;
544			t->zsd_destroy = destroy;
545			list_insert_tail(&zone->zone_zsd, t);
546		}
547	}
548	for (zone = list_head(&zone_active); zone != NULL;
549	    zone = list_next(&zone_active, zone))
550		mutex_exit(&zone->zone_lock);
551	mutex_exit(&zonehash_lock);
552}
553
554/*
555 * Helper function to find the zsd_entry associated with the key in the
556 * given list.
557 */
558static struct zsd_entry *
559zsd_find(list_t *l, zone_key_t key)
560{
561	struct zsd_entry *zsd;
562
563	for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
564		if (zsd->zsd_key == key) {
565			/*
566			 * Move to head of list to keep list in MRU order.
567			 */
568			if (zsd != list_head(l)) {
569				list_remove(l, zsd);
570				list_insert_head(l, zsd);
571			}
572			return (zsd);
573		}
574	}
575	return (NULL);
576}
577
578/*
579 * Function called when a module is being unloaded, or otherwise wishes
580 * to unregister its ZSD key and callbacks.
581 */
582int
583zone_key_delete(zone_key_t key)
584{
585	struct zsd_entry *zsdp = NULL;
586	zone_t *zone;
587
588	mutex_enter(&zonehash_lock);	/* Zone create/delete waits for us */
589	for (zone = list_head(&zone_active); zone != NULL;
590	    zone = list_next(&zone_active, zone))
591		mutex_enter(&zone->zone_lock);	/* lock all zones */
592
593	mutex_enter(&zsd_key_lock);
594	zsdp = zsd_find(&zsd_registered_keys, key);
595	if (zsdp == NULL)
596		goto notfound;
597	list_remove(&zsd_registered_keys, zsdp);
598	mutex_exit(&zsd_key_lock);
599
600	for (zone = list_head(&zone_active); zone != NULL;
601	    zone = list_next(&zone_active, zone)) {
602		struct zsd_entry *del;
603		void *data;
604
605		if (!(zone->zone_flags & ZF_DESTROYED)) {
606			del = zsd_find(&zone->zone_zsd, key);
607			if (del != NULL) {
608				data = del->zsd_data;
609				ASSERT(del->zsd_shutdown == zsdp->zsd_shutdown);
610				ASSERT(del->zsd_destroy == zsdp->zsd_destroy);
611				list_remove(&zone->zone_zsd, del);
612				kmem_free(del, sizeof (*del));
613			} else {
614				data = NULL;
615			}
616			if (zsdp->zsd_shutdown)
617				zsdp->zsd_shutdown(zone->zone_id, data);
618			if (zsdp->zsd_destroy)
619				zsdp->zsd_destroy(zone->zone_id, data);
620		}
621		mutex_exit(&zone->zone_lock);
622	}
623	mutex_exit(&zonehash_lock);
624	kmem_free(zsdp, sizeof (*zsdp));
625	return (0);
626
627notfound:
628	mutex_exit(&zsd_key_lock);
629	for (zone = list_head(&zone_active); zone != NULL;
630	    zone = list_next(&zone_active, zone))
631		mutex_exit(&zone->zone_lock);
632	mutex_exit(&zonehash_lock);
633	return (-1);
634}
635
636/*
637 * ZSD counterpart of pthread_setspecific().
638 */
639int
640zone_setspecific(zone_key_t key, zone_t *zone, const void *data)
641{
642	struct zsd_entry *t;
643	struct zsd_entry *zsdp = NULL;
644
645	mutex_enter(&zone->zone_lock);
646	t = zsd_find(&zone->zone_zsd, key);
647	if (t != NULL) {
648		/*
649		 * Replace old value with new
650		 */
651		t->zsd_data = (void *)data;
652		mutex_exit(&zone->zone_lock);
653		return (0);
654	}
655	/*
656	 * If there was no previous value, go through the list of registered
657	 * keys.
658	 *
659	 * We avoid grabbing zsd_key_lock until we are sure we need it; this is
660	 * necessary for shutdown callbacks to be able to execute without fear
661	 * of deadlock.
662	 */
663	mutex_enter(&zsd_key_lock);
664	zsdp = zsd_find(&zsd_registered_keys, key);
665	if (zsdp == NULL) { 	/* Key was not registered */
666		mutex_exit(&zsd_key_lock);
667		mutex_exit(&zone->zone_lock);
668		return (-1);
669	}
670
671	/*
672	 * Add a zsd_entry to this zone, using the template we just retrieved
673	 * to initialize the constructor and destructor(s).
674	 */
675	t = kmem_alloc(sizeof (*t), KM_SLEEP);
676	t->zsd_key = key;
677	t->zsd_data = (void *)data;
678	t->zsd_create = zsdp->zsd_create;
679	t->zsd_shutdown = zsdp->zsd_shutdown;
680	t->zsd_destroy = zsdp->zsd_destroy;
681	list_insert_tail(&zone->zone_zsd, t);
682	mutex_exit(&zsd_key_lock);
683	mutex_exit(&zone->zone_lock);
684	return (0);
685}
686
687/*
688 * ZSD counterpart of pthread_getspecific().
689 */
690void *
691zone_getspecific(zone_key_t key, zone_t *zone)
692{
693	struct zsd_entry *t;
694	void *data;
695
696	mutex_enter(&zone->zone_lock);
697	t = zsd_find(&zone->zone_zsd, key);
698	data = (t == NULL ? NULL : t->zsd_data);
699	mutex_exit(&zone->zone_lock);
700	return (data);
701}
702
703/*
704 * Function used to initialize a zone's list of ZSD callbacks and data
705 * when the zone is being created.  The callbacks are initialized from
706 * the template list (zsd_registered_keys), and the constructor
707 * callback executed (if one exists).
708 *
709 * This is called before the zone is made publicly available, hence no
710 * need to grab zone_lock.
711 *
712 * Although we grab and release zsd_key_lock, new entries cannot be
713 * added to or removed from the zsd_registered_keys list until we
714 * release zonehash_lock, so there isn't a window for a
715 * zone_key_create() to come in after we've dropped zsd_key_lock but
716 * before the zone is added to the zone list, such that the constructor
717 * callbacks aren't executed for the new zone.
718 */
719static void
720zone_zsd_configure(zone_t *zone)
721{
722	struct zsd_entry *zsdp;
723	struct zsd_entry *t;
724	zoneid_t zoneid = zone->zone_id;
725
726	ASSERT(MUTEX_HELD(&zonehash_lock));
727	ASSERT(list_head(&zone->zone_zsd) == NULL);
728	mutex_enter(&zsd_key_lock);
729	for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL;
730	    zsdp = list_next(&zsd_registered_keys, zsdp)) {
731		if (zsdp->zsd_create != NULL) {
732			t = kmem_alloc(sizeof (*t), KM_SLEEP);
733			t->zsd_key = zsdp->zsd_key;
734			t->zsd_create = zsdp->zsd_create;
735			t->zsd_data = (*t->zsd_create)(zoneid);
736			t->zsd_shutdown = zsdp->zsd_shutdown;
737			t->zsd_destroy = zsdp->zsd_destroy;
738			list_insert_tail(&zone->zone_zsd, t);
739		}
740	}
741	mutex_exit(&zsd_key_lock);
742}
743
/*
 * The kinds of ZSD callback.
 */
enum zsd_callback_type {
	ZSD_CREATE,
	ZSD_SHUTDOWN,
	ZSD_DESTROY
};
745
746/*
747 * Helper function to execute shutdown or destructor callbacks.
748 */
749static void
750zone_zsd_callbacks(zone_t *zone, enum zsd_callback_type ct)
751{
752	struct zsd_entry *zsdp;
753	struct zsd_entry *t;
754	zoneid_t zoneid = zone->zone_id;
755
756	ASSERT(ct == ZSD_SHUTDOWN || ct == ZSD_DESTROY);
757	ASSERT(ct != ZSD_SHUTDOWN || zone_status_get(zone) >= ZONE_IS_EMPTY);
758	ASSERT(ct != ZSD_DESTROY || zone_status_get(zone) >= ZONE_IS_DOWN);
759
760	mutex_enter(&zone->zone_lock);
761	if (ct == ZSD_DESTROY) {
762		if (zone->zone_flags & ZF_DESTROYED) {
763			/*
764			 * Make sure destructors are only called once.
765			 */
766			mutex_exit(&zone->zone_lock);
767			return;
768		}
769		zone->zone_flags |= ZF_DESTROYED;
770	}
771	mutex_exit(&zone->zone_lock);
772
773	/*
774	 * Both zsd_key_lock and zone_lock need to be held in order to add or
775	 * remove a ZSD key, (either globally as part of
776	 * zone_key_create()/zone_key_delete(), or on a per-zone basis, as is
777	 * possible through zone_setspecific()), so it's sufficient to hold
778	 * zsd_key_lock here.
779	 *
780	 * This is a good thing, since we don't want to recursively try to grab
781	 * zone_lock if a callback attempts to do something like a crfree() or
782	 * zone_rele().
783	 */
784	mutex_enter(&zsd_key_lock);
785	for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL;
786	    zsdp = list_next(&zsd_registered_keys, zsdp)) {
787		zone_key_t key = zsdp->zsd_key;
788
789		/* Skip if no callbacks registered */
790		if (ct == ZSD_SHUTDOWN && zsdp->zsd_shutdown == NULL)
791			continue;
792		if (ct == ZSD_DESTROY && zsdp->zsd_destroy == NULL)
793			continue;
794		/*
795		 * Call the callback with the zone-specific data if we can find
796		 * any, otherwise with NULL.
797		 */
798		t = zsd_find(&zone->zone_zsd, key);
799		if (t != NULL) {
800			if (ct == ZSD_SHUTDOWN) {
801				t->zsd_shutdown(zoneid, t->zsd_data);
802			} else {
803				ASSERT(ct == ZSD_DESTROY);
804				t->zsd_destroy(zoneid, t->zsd_data);
805			}
806		} else {
807			if (ct == ZSD_SHUTDOWN) {
808				zsdp->zsd_shutdown(zoneid, NULL);
809			} else {
810				ASSERT(ct == ZSD_DESTROY);
811				zsdp->zsd_destroy(zoneid, NULL);
812			}
813		}
814	}
815	mutex_exit(&zsd_key_lock);
816}
817
818/*
819 * Called when the zone is going away; free ZSD-related memory, and
820 * destroy the zone_zsd list.
821 */
822static void
823zone_free_zsd(zone_t *zone)
824{
825	struct zsd_entry *t, *next;
826
827	/*
828	 * Free all the zsd_entry's we had on this zone.
829	 */
830	for (t = list_head(&zone->zone_zsd); t != NULL; t = next) {
831		next = list_next(&zone->zone_zsd, t);
832		list_remove(&zone->zone_zsd, t);
833		kmem_free(t, sizeof (*t));
834	}
835	list_destroy(&zone->zone_zsd);
836}
837
838/*
839 * Frees memory associated with the zone dataset list.
840 */
841static void
842zone_free_datasets(zone_t *zone)
843{
844	zone_dataset_t *t, *next;
845
846	for (t = list_head(&zone->zone_datasets); t != NULL; t = next) {
847		next = list_next(&zone->zone_datasets, t);
848		list_remove(&zone->zone_datasets, t);
849		kmem_free(t->zd_dataset, strlen(t->zd_dataset) + 1);
850		kmem_free(t, sizeof (*t));
851	}
852	list_destroy(&zone->zone_datasets);
853}
854
855/*
856 * zone.cpu-shares resource control support.
857 */
858/*ARGSUSED*/
859static rctl_qty_t
860zone_cpu_shares_usage(rctl_t *rctl, struct proc *p)
861{
862	ASSERT(MUTEX_HELD(&p->p_lock));
863	return (p->p_zone->zone_shares);
864}
865
866/*ARGSUSED*/
867static int
868zone_cpu_shares_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
869    rctl_qty_t nv)
870{
871	ASSERT(MUTEX_HELD(&p->p_lock));
872	ASSERT(e->rcep_t == RCENTITY_ZONE);
873	if (e->rcep_p.zone == NULL)
874		return (0);
875
876	e->rcep_p.zone->zone_shares = nv;
877	return (0);
878}
879
880static rctl_ops_t zone_cpu_shares_ops = {
881	rcop_no_action,
882	zone_cpu_shares_usage,
883	zone_cpu_shares_set,
884	rcop_no_test
885};
886
887/*
888 * zone.cpu-cap resource control support.
889 */
890/*ARGSUSED*/
891static rctl_qty_t
892zone_cpu_cap_get(rctl_t *rctl, struct proc *p)
893{
894	ASSERT(MUTEX_HELD(&p->p_lock));
895	return (cpucaps_zone_get(p->p_zone));
896}
897
898/*ARGSUSED*/
899static int
900zone_cpu_cap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
901    rctl_qty_t nv)
902{
903	zone_t *zone = e->rcep_p.zone;
904
905	ASSERT(MUTEX_HELD(&p->p_lock));
906	ASSERT(e->rcep_t == RCENTITY_ZONE);
907
908	if (zone == NULL)
909		return (0);
910
911	/*
912	 * set cap to the new value.
913	 */
914	return (cpucaps_zone_set(zone, nv));
915}
916
917static rctl_ops_t zone_cpu_cap_ops = {
918	rcop_no_action,
919	zone_cpu_cap_get,
920	zone_cpu_cap_set,
921	rcop_no_test
922};
923
924/*ARGSUSED*/
925static rctl_qty_t
926zone_lwps_usage(rctl_t *r, proc_t *p)
927{
928	rctl_qty_t nlwps;
929	zone_t *zone = p->p_zone;
930
931	ASSERT(MUTEX_HELD(&p->p_lock));
932
933	mutex_enter(&zone->zone_nlwps_lock);
934	nlwps = zone->zone_nlwps;
935	mutex_exit(&zone->zone_nlwps_lock);
936
937	return (nlwps);
938}
939
940/*ARGSUSED*/
941static int
942zone_lwps_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
943    rctl_qty_t incr, uint_t flags)
944{
945	rctl_qty_t nlwps;
946
947	ASSERT(MUTEX_HELD(&p->p_lock));
948	ASSERT(e->rcep_t == RCENTITY_ZONE);
949	if (e->rcep_p.zone == NULL)
950		return (0);
951	ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
952	nlwps = e->rcep_p.zone->zone_nlwps;
953
954	if (nlwps + incr > rcntl->rcv_value)
955		return (1);
956
957	return (0);
958}
959
960/*ARGSUSED*/
961static int
962zone_lwps_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
963{
964	ASSERT(MUTEX_HELD(&p->p_lock));
965	ASSERT(e->rcep_t == RCENTITY_ZONE);
966	if (e->rcep_p.zone == NULL)
967		return (0);
968	e->rcep_p.zone->zone_nlwps_ctl = nv;
969	return (0);
970}
971
972static rctl_ops_t zone_lwps_ops = {
973	rcop_no_action,
974	zone_lwps_usage,
975	zone_lwps_set,
976	zone_lwps_test,
977};
978
979/*ARGSUSED*/
980static int
981zone_shmmax_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
982    rctl_qty_t incr, uint_t flags)
983{
984	rctl_qty_t v;
985	ASSERT(MUTEX_HELD(&p->p_lock));
986	ASSERT(e->rcep_t == RCENTITY_ZONE);
987	v = e->rcep_p.zone->zone_shmmax + incr;
988	if (v > rval->rcv_value)
989		return (1);
990	return (0);
991}
992
993static rctl_ops_t zone_shmmax_ops = {
994	rcop_no_action,
995	rcop_no_usage,
996	rcop_no_set,
997	zone_shmmax_test
998};
999
1000/*ARGSUSED*/
1001static int
1002zone_shmmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1003    rctl_qty_t incr, uint_t flags)
1004{
1005	rctl_qty_t v;
1006	ASSERT(MUTEX_HELD(&p->p_lock));
1007	ASSERT(e->rcep_t == RCENTITY_ZONE);
1008	v = e->rcep_p.zone->zone_ipc.ipcq_shmmni + incr;
1009	if (v > rval->rcv_value)
1010		return (1);
1011	return (0);
1012}
1013
1014static rctl_ops_t zone_shmmni_ops = {
1015	rcop_no_action,
1016	rcop_no_usage,
1017	rcop_no_set,
1018	zone_shmmni_test
1019};
1020
1021/*ARGSUSED*/
1022static int
1023zone_semmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1024    rctl_qty_t incr, uint_t flags)
1025{
1026	rctl_qty_t v;
1027	ASSERT(MUTEX_HELD(&p->p_lock));
1028	ASSERT(e->rcep_t == RCENTITY_ZONE);
1029	v = e->rcep_p.zone->zone_ipc.ipcq_semmni + incr;
1030	if (v > rval->rcv_value)
1031		return (1);
1032	return (0);
1033}
1034
1035static rctl_ops_t zone_semmni_ops = {
1036	rcop_no_action,
1037	rcop_no_usage,
1038	rcop_no_set,
1039	zone_semmni_test
1040};
1041
1042/*ARGSUSED*/
1043static int
1044zone_msgmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1045    rctl_qty_t incr, uint_t flags)
1046{
1047	rctl_qty_t v;
1048	ASSERT(MUTEX_HELD(&p->p_lock));
1049	ASSERT(e->rcep_t == RCENTITY_ZONE);
1050	v = e->rcep_p.zone->zone_ipc.ipcq_msgmni + incr;
1051	if (v > rval->rcv_value)
1052		return (1);
1053	return (0);
1054}
1055
1056static rctl_ops_t zone_msgmni_ops = {
1057	rcop_no_action,
1058	rcop_no_usage,
1059	rcop_no_set,
1060	zone_msgmni_test
1061};
1062
1063/*ARGSUSED*/
1064static rctl_qty_t
1065zone_locked_mem_usage(rctl_t *rctl, struct proc *p)
1066{
1067	rctl_qty_t q;
1068	ASSERT(MUTEX_HELD(&p->p_lock));
1069	mutex_enter(&p->p_zone->zone_mem_lock);
1070	q = p->p_zone->zone_locked_mem;
1071	mutex_exit(&p->p_zone->zone_mem_lock);
1072	return (q);
1073}
1074
1075/*ARGSUSED*/
1076static int
1077zone_locked_mem_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1078    rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1079{
1080	rctl_qty_t q;
1081	zone_t *z;
1082
1083	z = e->rcep_p.zone;
1084	ASSERT(MUTEX_HELD(&p->p_lock));
1085	ASSERT(MUTEX_HELD(&z->zone_mem_lock));
1086	q = z->zone_locked_mem;
1087	if (q + incr > rcntl->rcv_value)
1088		return (1);
1089	return (0);
1090}
1091
1092/*ARGSUSED*/
1093static int
1094zone_locked_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1095    rctl_qty_t nv)
1096{
1097	ASSERT(MUTEX_HELD(&p->p_lock));
1098	ASSERT(e->rcep_t == RCENTITY_ZONE);
1099	if (e->rcep_p.zone == NULL)
1100		return (0);
1101	e->rcep_p.zone->zone_locked_mem_ctl = nv;
1102	return (0);
1103}
1104
1105static rctl_ops_t zone_locked_mem_ops = {
1106	rcop_no_action,
1107	zone_locked_mem_usage,
1108	zone_locked_mem_set,
1109	zone_locked_mem_test
1110};
1111
1112/*ARGSUSED*/
1113static rctl_qty_t
1114zone_max_swap_usage(rctl_t *rctl, struct proc *p)
1115{
1116	rctl_qty_t q;
1117	zone_t *z = p->p_zone;
1118
1119	ASSERT(MUTEX_HELD(&p->p_lock));
1120	mutex_enter(&z->zone_mem_lock);
1121	q = z->zone_max_swap;
1122	mutex_exit(&z->zone_mem_lock);
1123	return (q);
1124}
1125
1126/*ARGSUSED*/
1127static int
1128zone_max_swap_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1129    rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1130{
1131	rctl_qty_t q;
1132	zone_t *z;
1133
1134	z = e->rcep_p.zone;
1135	ASSERT(MUTEX_HELD(&p->p_lock));
1136	ASSERT(MUTEX_HELD(&z->zone_mem_lock));
1137	q = z->zone_max_swap;
1138	if (q + incr > rcntl->rcv_value)
1139		return (1);
1140	return (0);
1141}
1142
1143/*ARGSUSED*/
1144static int
1145zone_max_swap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1146    rctl_qty_t nv)
1147{
1148	ASSERT(MUTEX_HELD(&p->p_lock));
1149	ASSERT(e->rcep_t == RCENTITY_ZONE);
1150	if (e->rcep_p.zone == NULL)
1151		return (0);
1152	e->rcep_p.zone->zone_max_swap_ctl = nv;
1153	return (0);
1154}
1155
1156static rctl_ops_t zone_max_swap_ops = {
1157	rcop_no_action,
1158	zone_max_swap_usage,
1159	zone_max_swap_set,
1160	zone_max_swap_test
1161};
1162
1163/*
1164 * Helper function to brand the zone with a unique ID.
1165 */
1166static void
1167zone_uniqid(zone_t *zone)
1168{
1169	static uint64_t uniqid = 0;
1170
1171	ASSERT(MUTEX_HELD(&zonehash_lock));
1172	zone->zone_uniqid = uniqid++;
1173}
1174
1175/*
1176 * Returns a held pointer to the "kcred" for the specified zone.
1177 */
1178struct cred *
1179zone_get_kcred(zoneid_t zoneid)
1180{
1181	zone_t *zone;
1182	cred_t *cr;
1183
1184	if ((zone = zone_find_by_id(zoneid)) == NULL)
1185		return (NULL);
1186	cr = zone->zone_kcred;
1187	crhold(cr);
1188	zone_rele(zone);
1189	return (cr);
1190}
1191
1192static int
1193zone_lockedmem_kstat_update(kstat_t *ksp, int rw)
1194{
1195	zone_t *zone = ksp->ks_private;
1196	zone_kstat_t *zk = ksp->ks_data;
1197
1198	if (rw == KSTAT_WRITE)
1199		return (EACCES);
1200
1201	zk->zk_usage.value.ui64 = zone->zone_locked_mem;
1202	zk->zk_value.value.ui64 = zone->zone_locked_mem_ctl;
1203	return (0);
1204}
1205
1206static int
1207zone_swapresv_kstat_update(kstat_t *ksp, int rw)
1208{
1209	zone_t *zone = ksp->ks_private;
1210	zone_kstat_t *zk = ksp->ks_data;
1211
1212	if (rw == KSTAT_WRITE)
1213		return (EACCES);
1214
1215	zk->zk_usage.value.ui64 = zone->zone_max_swap;
1216	zk->zk_value.value.ui64 = zone->zone_max_swap_ctl;
1217	return (0);
1218}
1219
1220static void
1221zone_kstat_create(zone_t *zone)
1222{
1223	kstat_t *ksp;
1224	zone_kstat_t *zk;
1225
1226	ksp = rctl_kstat_create_zone(zone, "lockedmem", KSTAT_TYPE_NAMED,
1227	    sizeof (zone_kstat_t) / sizeof (kstat_named_t),
1228	    KSTAT_FLAG_VIRTUAL);
1229
1230	if (ksp == NULL)
1231		return;
1232
1233	zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP);
1234	ksp->ks_data_size += strlen(zone->zone_name) + 1;
1235	kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING);
1236	kstat_named_setstr(&zk->zk_zonename, zone->zone_name);
1237	kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64);
1238	kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64);
1239	ksp->ks_update = zone_lockedmem_kstat_update;
1240	ksp->ks_private = zone;
1241	kstat_install(ksp);
1242
1243	zone->zone_lockedmem_kstat = ksp;
1244
1245	ksp = rctl_kstat_create_zone(zone, "swapresv", KSTAT_TYPE_NAMED,
1246	    sizeof (zone_kstat_t) / sizeof (kstat_named_t),
1247	    KSTAT_FLAG_VIRTUAL);
1248
1249	if (ksp == NULL)
1250		return;
1251
1252	zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP);
1253	ksp->ks_data_size += strlen(zone->zone_name) + 1;
1254	kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING);
1255	kstat_named_setstr(&zk->zk_zonename, zone->zone_name);
1256	kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64);
1257	kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64);
1258	ksp->ks_update = zone_swapresv_kstat_update;
1259	ksp->ks_private = zone;
1260	kstat_install(ksp);
1261
1262	zone->zone_swapresv_kstat = ksp;
1263}
1264
1265static void
1266zone_kstat_delete(zone_t *zone)
1267{
1268	void *data;
1269
1270	if (zone->zone_lockedmem_kstat != NULL) {
1271		data = zone->zone_lockedmem_kstat->ks_data;
1272		kstat_delete(zone->zone_lockedmem_kstat);
1273		kmem_free(data, sizeof (zone_kstat_t));
1274	}
1275	if (zone->zone_swapresv_kstat != NULL) {
1276		data = zone->zone_swapresv_kstat->ks_data;
1277		kstat_delete(zone->zone_swapresv_kstat);
1278		kmem_free(data, sizeof (zone_kstat_t));
1279	}
1280}
1281
1282/*
1283 * Called very early on in boot to initialize the ZSD list so that
1284 * zone_key_create() can be called before zone_init().  It also initializes
1285 * portions of zone0 which may be used before zone_init() is called.  The
1286 * variable "global_zone" will be set when zone0 is fully initialized by
1287 * zone_init().
1288 */
1289void
1290zone_zsd_init(void)
1291{
1292	mutex_init(&zonehash_lock, NULL, MUTEX_DEFAULT, NULL);
1293	mutex_init(&zsd_key_lock, NULL, MUTEX_DEFAULT, NULL);
1294	list_create(&zsd_registered_keys, sizeof (struct zsd_entry),
1295	    offsetof(struct zsd_entry, zsd_linkage));
1296	list_create(&zone_active, sizeof (zone_t),
1297	    offsetof(zone_t, zone_linkage));
1298	list_create(&zone_deathrow, sizeof (zone_t),
1299	    offsetof(zone_t, zone_linkage));
1300
1301	mutex_init(&zone0.zone_lock, NULL, MUTEX_DEFAULT, NULL);
1302	mutex_init(&zone0.zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
1303	mutex_init(&zone0.zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
1304	zone0.zone_shares = 1;
1305	zone0.zone_nlwps = 0;
1306	zone0.zone_nlwps_ctl = INT_MAX;
1307	zone0.zone_locked_mem = 0;
1308	zone0.zone_locked_mem_ctl = UINT64_MAX;
1309	ASSERT(zone0.zone_max_swap == 0);
1310	zone0.zone_max_swap_ctl = UINT64_MAX;
1311	zone0.zone_shmmax = 0;
1312	zone0.zone_ipc.ipcq_shmmni = 0;
1313	zone0.zone_ipc.ipcq_semmni = 0;
1314	zone0.zone_ipc.ipcq_msgmni = 0;
1315	zone0.zone_name = GLOBAL_ZONENAME;
1316	zone0.zone_nodename = utsname.nodename;
1317	zone0.zone_domain = srpc_domain;
1318	zone0.zone_ref = 1;
1319	zone0.zone_id = GLOBAL_ZONEID;
1320	zone0.zone_status = ZONE_IS_RUNNING;
1321	zone0.zone_rootpath = "/";
1322	zone0.zone_rootpathlen = 2;
1323	zone0.zone_psetid = ZONE_PS_INVAL;
1324	zone0.zone_ncpus = 0;
1325	zone0.zone_ncpus_online = 0;
1326	zone0.zone_proc_initpid = 1;
1327	zone0.zone_initname = initname;
1328	zone0.zone_lockedmem_kstat = NULL;
1329	zone0.zone_swapresv_kstat = NULL;
1330	list_create(&zone0.zone_zsd, sizeof (struct zsd_entry),
1331	    offsetof(struct zsd_entry, zsd_linkage));
1332	list_insert_head(&zone_active, &zone0);
1333
1334	/*
1335	 * The root filesystem is not mounted yet, so zone_rootvp cannot be set
1336	 * to anything meaningful.  It is assigned to be 'rootdir' in
1337	 * vfs_mountroot().
1338	 */
1339	zone0.zone_rootvp = NULL;
1340	zone0.zone_vfslist = NULL;
1341	zone0.zone_bootargs = initargs;
1342	zone0.zone_privset = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
1343	/*
1344	 * The global zone has all privileges
1345	 */
1346	priv_fillset(zone0.zone_privset);
1347	/*
1348	 * Add p0 to the global zone
1349	 */
1350	zone0.zone_zsched = &p0;
1351	p0.p_zone = &zone0;
1352}
1353
1354/*
1355 * Compute a hash value based on the contents of the label and the DOI.  The
1356 * hash algorithm is somewhat arbitrary, but is based on the observation that
1357 * humans will likely pick labels that differ by amounts that work out to be
1358 * multiples of the number of hash chains, and thus stirring in some primes
1359 * should help.
1360 */
1361static uint_t
1362hash_bylabel(void *hdata, mod_hash_key_t key)
1363{
1364	const ts_label_t *lab = (ts_label_t *)key;
1365	const uint32_t *up, *ue;
1366	uint_t hash;
1367	int i;
1368
1369	_NOTE(ARGUNUSED(hdata));
1370
1371	hash = lab->tsl_doi + (lab->tsl_doi << 1);
1372	/* we depend on alignment of label, but not representation */
1373	up = (const uint32_t *)&lab->tsl_label;
1374	ue = up + sizeof (lab->tsl_label) / sizeof (*up);
1375	i = 1;
1376	while (up < ue) {
1377		/* using 2^n + 1, 1 <= n <= 16 as source of many primes */
1378		hash += *up + (*up << ((i % 16) + 1));
1379		up++;
1380		i++;
1381	}
1382	return (hash);
1383}
1384
1385/*
1386 * All that mod_hash cares about here is zero (equal) versus non-zero (not
1387 * equal).  This may need to be changed if less than / greater than is ever
1388 * needed.
1389 */
1390static int
1391hash_labelkey_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
1392{
1393	ts_label_t *lab1 = (ts_label_t *)key1;
1394	ts_label_t *lab2 = (ts_label_t *)key2;
1395
1396	return (label_equal(lab1, lab2) ? 0 : 1);
1397}
1398
1399/*
1400 * Called by main() to initialize the zones framework.
1401 */
1402void
1403zone_init(void)
1404{
1405	rctl_dict_entry_t *rde;
1406	rctl_val_t *dval;
1407	rctl_set_t *set;
1408	rctl_alloc_gp_t *gp;
1409	rctl_entity_p_t e;
1410	int res;
1411
1412	ASSERT(curproc == &p0);
1413
1414	/*
1415	 * Create ID space for zone IDs.  ID 0 is reserved for the
1416	 * global zone.
1417	 */
1418	zoneid_space = id_space_create("zoneid_space", 1, MAX_ZONEID);
1419
1420	/*
1421	 * Initialize generic zone resource controls, if any.
1422	 */
1423	rc_zone_cpu_shares = rctl_register("zone.cpu-shares",
1424	    RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
1425	    RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
1426	    FSS_MAXSHARES, FSS_MAXSHARES, &zone_cpu_shares_ops);
1427
1428	rc_zone_cpu_cap = rctl_register("zone.cpu-cap",
1429	    RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_ALWAYS |
1430	    RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |RCTL_GLOBAL_SYSLOG_NEVER |
1431	    RCTL_GLOBAL_INFINITE,
1432	    MAXCAP, MAXCAP, &zone_cpu_cap_ops);
1433
1434	rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
1435	    RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
1436	    INT_MAX, INT_MAX, &zone_lwps_ops);
1437	/*
1438	 * System V IPC resource controls
1439	 */
1440	rc_zone_msgmni = rctl_register("zone.max-msg-ids",
1441	    RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
1442	    RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_msgmni_ops);
1443
1444	rc_zone_semmni = rctl_register("zone.max-sem-ids",
1445	    RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
1446	    RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_semmni_ops);
1447
1448	rc_zone_shmmni = rctl_register("zone.max-shm-ids",
1449	    RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
1450	    RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_shmmni_ops);
1451
1452	rc_zone_shmmax = rctl_register("zone.max-shm-memory",
1453	    RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
1454	    RCTL_GLOBAL_BYTES, UINT64_MAX, UINT64_MAX, &zone_shmmax_ops);
1455
1456	/*
1457	 * Create a rctl_val with PRIVILEGED, NOACTION, value = 1.  Then attach
1458	 * this at the head of the rctl_dict_entry for ``zone.cpu-shares''.
1459	 */
1460	dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
1461	bzero(dval, sizeof (rctl_val_t));
1462	dval->rcv_value = 1;
1463	dval->rcv_privilege = RCPRIV_PRIVILEGED;
1464	dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
1465	dval->rcv_action_recip_pid = -1;
1466
1467	rde = rctl_dict_lookup("zone.cpu-shares");
1468	(void) rctl_val_list_insert(&rde->rcd_default_value, dval);
1469
1470	rc_zone_locked_mem = rctl_register("zone.max-locked-memory",
1471	    RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
1472	    RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
1473	    &zone_locked_mem_ops);
1474
1475	rc_zone_max_swap = rctl_register("zone.max-swap",
1476	    RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
1477	    RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
1478	    &zone_max_swap_ops);
1479
1480	/*
1481	 * Initialize the ``global zone''.
1482	 */
1483	set = rctl_set_create();
1484	gp = rctl_set_init_prealloc(RCENTITY_ZONE);
1485	mutex_enter(&p0.p_lock);
1486	e.rcep_p.zone = &zone0;
1487	e.rcep_t = RCENTITY_ZONE;
1488	zone0.zone_rctls = rctl_set_init(RCENTITY_ZONE, &p0, &e, set,
1489	    gp);
1490
1491	zone0.zone_nlwps = p0.p_lwpcnt;
1492	zone0.zone_ntasks = 1;
1493	mutex_exit(&p0.p_lock);
1494	zone0.zone_restart_init = B_TRUE;
1495	zone0.zone_brand = &native_brand;
1496	rctl_prealloc_destroy(gp);
1497	/*
1498	 * pool_default hasn't been initialized yet, so we let pool_init()
1499	 * take care of making sure the global zone is in the default pool.
1500	 */
1501
1502	/*
1503	 * Initialize global zone kstats
1504	 */
1505	zone_kstat_create(&zone0);
1506
1507	/*
1508	 * Initialize zone label.
1509	 * mlp are initialized when tnzonecfg is loaded.
1510	 */
1511	zone0.zone_slabel = l_admin_low;
1512	rw_init(&zone0.zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
1513	label_hold(l_admin_low);
1514
1515	mutex_enter(&zonehash_lock);
1516	zone_uniqid(&zone0);
1517	ASSERT(zone0.zone_uniqid == GLOBAL_ZONEUNIQID);
1518
1519	zonehashbyid = mod_hash_create_idhash("zone_by_id", zone_hash_size,
1520	    mod_hash_null_valdtor);
1521	zonehashbyname = mod_hash_create_strhash("zone_by_name",
1522	    zone_hash_size, mod_hash_null_valdtor);
1523	/*
1524	 * maintain zonehashbylabel only for labeled systems
1525	 */
1526	if (is_system_labeled())
1527		zonehashbylabel = mod_hash_create_extended("zone_by_label",
1528		    zone_hash_size, mod_hash_null_keydtor,
1529		    mod_hash_null_valdtor, hash_bylabel, NULL,
1530		    hash_labelkey_cmp, KM_SLEEP);
1531	zonecount = 1;
1532
1533	(void) mod_hash_insert(zonehashbyid, (mod_hash_key_t)GLOBAL_ZONEID,
1534	    (mod_hash_val_t)&zone0);
1535	(void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)zone0.zone_name,
1536	    (mod_hash_val_t)&zone0);
1537	if (is_system_labeled()) {
1538		zone0.zone_flags |= ZF_HASHED_LABEL;
1539		(void) mod_hash_insert(zonehashbylabel,
1540		    (mod_hash_key_t)zone0.zone_slabel, (mod_hash_val_t)&zone0);
1541	}
1542	mutex_exit(&zonehash_lock);
1543
1544	/*
1545	 * We avoid setting zone_kcred until now, since kcred is initialized
1546	 * sometime after zone_zsd_init() and before zone_init().
1547	 */
1548	zone0.zone_kcred = kcred;
1549	/*
1550	 * The global zone is fully initialized (except for zone_rootvp which
1551	 * will be set when the root filesystem is mounted).
1552	 */
1553	global_zone = &zone0;
1554
1555	/*
1556	 * Setup an event channel to send zone status change notifications on
1557	 */
1558	res = sysevent_evc_bind(ZONE_EVENT_CHANNEL, &zone_event_chan,
1559	    EVCH_CREAT);
1560
1561	if (res)
1562		panic("Sysevent_evc_bind failed during zone setup.\n");
1563
1564}
1565
1566static void
1567zone_free(zone_t *zone)
1568{
1569	ASSERT(zone != global_zone);
1570	ASSERT(zone->zone_ntasks == 0);
1571	ASSERT(zone->zone_nlwps == 0);
1572	ASSERT(zone->zone_cred_ref == 0);
1573	ASSERT(zone->zone_kcred == NULL);
1574	ASSERT(zone_status_get(zone) == ZONE_IS_DEAD ||
1575	    zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
1576
1577	/*
1578	 * Remove any zone caps.
1579	 */
1580	cpucaps_zone_remove(zone);
1581
1582	ASSERT(zone->zone_cpucap == NULL);
1583
1584	/* remove from deathrow list */
1585	if (zone_status_get(zone) == ZONE_IS_DEAD) {
1586		ASSERT(zone->zone_ref == 0);
1587		mutex_enter(&zone_deathrow_lock);
1588		list_remove(&zone_deathrow, zone);
1589		mutex_exit(&zone_deathrow_lock);
1590	}
1591
1592	zone_free_zsd(zone);
1593	zone_free_datasets(zone);
1594
1595	if (zone->zone_rootvp != NULL)
1596		VN_RELE(zone->zone_rootvp);
1597	if (zone->zone_rootpath)
1598		kmem_free(zone->zone_rootpath, zone->zone_rootpathlen);
1599	if (zone->zone_name != NULL)
1600		kmem_free(zone->zone_name, ZONENAME_MAX);
1601	if (zone->zone_slabel != NULL)
1602		label_rele(zone->zone_slabel);
1603	if (zone->zone_nodename != NULL)
1604		kmem_free(zone->zone_nodename, _SYS_NMLN);
1605	if (zone->zone_domain != NULL)
1606		kmem_free(zone->zone_domain, _SYS_NMLN);
1607	if (zone->zone_privset != NULL)
1608		kmem_free(zone->zone_privset, sizeof (priv_set_t));
1609	if (zone->zone_rctls != NULL)
1610		rctl_set_free(zone->zone_rctls);
1611	if (zone->zone_bootargs != NULL)
1612		kmem_free(zone->zone_bootargs, strlen(zone->zone_bootargs) + 1);
1613	if (zone->zone_initname != NULL)
1614		kmem_free(zone->zone_initname, strlen(zone->zone_initname) + 1);
1615	id_free(zoneid_space, zone->zone_id);
1616	mutex_destroy(&zone->zone_lock);
1617	cv_destroy(&zone->zone_cv);
1618	rw_destroy(&zone->zone_mlps.mlpl_rwlock);
1619	kmem_free(zone, sizeof (zone_t));
1620}
1621
1622/*
1623 * See block comment at the top of this file for information about zone
1624 * status values.
1625 */
1626/*
1627 * Convenience function for setting zone status.
1628 */
1629static void
1630zone_status_set(zone_t *zone, zone_status_t status)
1631{
1632
1633	nvlist_t *nvl = NULL;
1634	ASSERT(MUTEX_HELD(&zone_status_lock));
1635	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE &&
1636	    status >= zone_status_get(zone));
1637
1638	if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) ||
1639	    nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) ||
1640	    nvlist_add_string(nvl, ZONE_CB_NEWSTATE,
1641	    zone_status_table[status]) ||
1642	    nvlist_add_string(nvl, ZONE_CB_OLDSTATE,
1643	    zone_status_table[zone->zone_status]) ||
1644	    nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) ||
1645	    nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, (uint64_t)gethrtime()) ||
1646	    sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS,
1647	    ZONE_EVENT_STATUS_SUBCLASS, "sun.com", "kernel", nvl, EVCH_SLEEP)) {
1648#ifdef DEBUG
1649		(void) printf(
1650		    "Failed to allocate and send zone state change event.\n");
1651#endif
1652	}
1653	nvlist_free(nvl);
1654
1655	zone->zone_status = status;
1656
1657	cv_broadcast(&zone->zone_cv);
1658}
1659
1660/*
1661 * Public function to retrieve the zone status.  The zone status may
1662 * change after it is retrieved.
1663 */
1664zone_status_t
1665zone_status_get(zone_t *zone)
1666{
1667	return (zone->zone_status);
1668}
1669
1670static int
1671zone_set_bootargs(zone_t *zone, const char *zone_bootargs)
1672{
1673	char *bootargs = kmem_zalloc(BOOTARGS_MAX, KM_SLEEP);
1674	int err = 0;
1675
1676	ASSERT(zone != global_zone);
1677	if ((err = copyinstr(zone_bootargs, bootargs, BOOTARGS_MAX, NULL)) != 0)
1678		goto done;	/* EFAULT or ENAMETOOLONG */
1679
1680	if (zone->zone_bootargs != NULL)
1681		kmem_free(zone->zone_bootargs, strlen(zone->zone_bootargs) + 1);
1682
1683	zone->zone_bootargs = kmem_alloc(strlen(bootargs) + 1, KM_SLEEP);
1684	(void) strcpy(zone->zone_bootargs, bootargs);
1685
1686done:
1687	kmem_free(bootargs, BOOTARGS_MAX);
1688	return (err);
1689}
1690
1691static int
1692zone_set_brand(zone_t *zone, const char *brand)
1693{
1694	struct brand_attr *attrp;
1695	brand_t *bp;
1696
1697	attrp = kmem_alloc(sizeof (struct brand_attr), KM_SLEEP);
1698	if (copyin(brand, attrp, sizeof (struct brand_attr)) != 0) {
1699		kmem_free(attrp, sizeof (struct brand_attr));
1700		return (EFAULT);
1701	}
1702
1703	bp = brand_register_zone(attrp);
1704	kmem_free(attrp, sizeof (struct brand_attr));
1705	if (bp == NULL)
1706		return (EINVAL);
1707
1708	/*
1709	 * This is the only place where a zone can change it's brand.
1710	 * We already need to hold zone_status_lock to check the zone
1711	 * status, so we'll just use that lock to serialize zone
1712	 * branding requests as well.
1713	 */
1714	mutex_enter(&zone_status_lock);
1715
1716	/* Re-Branding is not allowed and the zone can't be booted yet */
1717	if ((ZONE_IS_BRANDED(zone)) ||
1718	    (zone_status_get(zone) >= ZONE_IS_BOOTING)) {
1719		mutex_exit(&zone_status_lock);
1720		brand_unregister_zone(bp);
1721		return (EINVAL);
1722	}
1723
1724	if (is_system_labeled() &&
1725	    strncmp(attrp->ba_brandname, NATIVE_BRAND_NAME, MAXNAMELEN) != 0) {
1726		mutex_exit(&zone_status_lock);
1727		brand_unregister_zone(bp);
1728		return (EPERM);
1729	}
1730
1731	zone->zone_brand = bp;
1732	mutex_exit(&zone_status_lock);
1733	return (0);
1734}
1735
1736static int
1737zone_set_initname(zone_t *zone, const char *zone_initname)
1738{
1739	char initname[INITNAME_SZ];
1740	size_t len;
1741	int err = 0;
1742
1743	ASSERT(zone != global_zone);
1744	if ((err = copyinstr(zone_initname, initname, INITNAME_SZ, &len)) != 0)
1745		return (err);	/* EFAULT or ENAMETOOLONG */
1746
1747	if (zone->zone_initname != NULL)
1748		kmem_free(zone->zone_initname, strlen(zone->zone_initname) + 1);
1749
1750	zone->zone_initname = kmem_alloc(strlen(initname) + 1, KM_SLEEP);
1751	(void) strcpy(zone->zone_initname, initname);
1752	return (0);
1753}
1754
1755static int
1756zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap)
1757{
1758	uint64_t mcap;
1759	int err = 0;
1760
1761	if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0)
1762		zone->zone_phys_mcap = mcap;
1763
1764	return (err);
1765}
1766
1767static int
1768zone_set_sched_class(zone_t *zone, const char *new_class)
1769{
1770	char sched_class[PC_CLNMSZ];
1771	id_t classid;
1772	int err;
1773
1774	ASSERT(zone != global_zone);
1775	if ((err = copyinstr(new_class, sched_class, PC_CLNMSZ, NULL)) != 0)
1776		return (err);	/* EFAULT or ENAMETOOLONG */
1777
1778	if (getcid(sched_class, &classid) != 0 || classid == syscid)
1779		return (set_errno(EINVAL));
1780	zone->zone_defaultcid = classid;
1781	ASSERT(zone->zone_defaultcid > 0 &&
1782	    zone->zone_defaultcid < loaded_classes);
1783
1784	return (0);
1785}
1786
1787/*
1788 * Block indefinitely waiting for (zone_status >= status)
1789 */
1790void
1791zone_status_wait(zone_t *zone, zone_status_t status)
1792{
1793	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
1794
1795	mutex_enter(&zone_status_lock);
1796	while (zone->zone_status < status) {
1797		cv_wait(&zone->zone_cv, &zone_status_lock);
1798	}
1799	mutex_exit(&zone_status_lock);
1800}
1801
1802/*
1803 * Private CPR-safe version of zone_status_wait().
1804 */
1805static void
1806zone_status_wait_cpr(zone_t *zone, zone_status_t status, char *str)
1807{
1808	callb_cpr_t cprinfo;
1809
1810	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
1811
1812	CALLB_CPR_INIT(&cprinfo, &zone_status_lock, callb_generic_cpr,
1813	    str);
1814	mutex_enter(&zone_status_lock);
1815	while (zone->zone_status < status) {
1816		CALLB_CPR_SAFE_BEGIN(&cprinfo);
1817		cv_wait(&zone->zone_cv, &zone_status_lock);
1818		CALLB_CPR_SAFE_END(&cprinfo, &zone_status_lock);
1819	}
1820	/*
1821	 * zone_status_lock is implicitly released by the following.
1822	 */
1823	CALLB_CPR_EXIT(&cprinfo);
1824}
1825
1826/*
1827 * Block until zone enters requested state or signal is received.  Return (0)
1828 * if signaled, non-zero otherwise.
1829 */
1830int
1831zone_status_wait_sig(zone_t *zone, zone_status_t status)
1832{
1833	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
1834
1835	mutex_enter(&zone_status_lock);
1836	while (zone->zone_status < status) {
1837		if (!cv_wait_sig(&zone->zone_cv, &zone_status_lock)) {
1838			mutex_exit(&zone_status_lock);
1839			return (0);
1840		}
1841	}
1842	mutex_exit(&zone_status_lock);
1843	return (1);
1844}
1845
1846/*
1847 * Block until the zone enters the requested state or the timeout expires,
1848 * whichever happens first.  Return (-1) if operation timed out, time remaining
1849 * otherwise.
1850 */
1851clock_t
1852zone_status_timedwait(zone_t *zone, clock_t tim, zone_status_t status)
1853{
1854	clock_t timeleft = 0;
1855
1856	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
1857
1858	mutex_enter(&zone_status_lock);
1859	while (zone->zone_status < status && timeleft != -1) {
1860		timeleft = cv_timedwait(&zone->zone_cv, &zone_status_lock, tim);
1861	}
1862	mutex_exit(&zone_status_lock);
1863	return (timeleft);
1864}
1865
1866/*
1867 * Block until the zone enters the requested state, the current process is
1868 * signaled,  or the timeout expires, whichever happens first.  Return (-1) if
1869 * operation timed out, 0 if signaled, time remaining otherwise.
1870 */
1871clock_t
1872zone_status_timedwait_sig(zone_t *zone, clock_t tim, zone_status_t status)
1873{
1874	clock_t timeleft = tim - lbolt;
1875
1876	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
1877
1878	mutex_enter(&zone_status_lock);
1879	while (zone->zone_status < status) {
1880		timeleft = cv_timedwait_sig(&zone->zone_cv, &zone_status_lock,
1881		    tim);
1882		if (timeleft <= 0)
1883			break;
1884	}
1885	mutex_exit(&zone_status_lock);
1886	return (timeleft);
1887}
1888
/*
 * Reference counting.
 *
 * A zone carries two reference counts: zone_cred_ref counts references
 * held by credential structures, and zone_ref counts every other
 * reference.  They are kept apart so that a zone can be rebooted while
 * cred references are still outstanding; certain drivers cache dblks, and
 * with them, implicitly, creds.  Halting therefore waits for zone_ref to
 * drop to 0 (actually 1), but not for zone_cred_ref.  The zone structure
 * itself is freed later, once zone_cred_ref reaches 0 as well; until then
 * nothing but the zone id and the privilege set should be accessed in a
 * "dead" zone.
 *
 * zone_wait_for_cred is a debugging flag.  Setting it to a non-zero value
 * makes halt/reboot also block until zone_cred_ref has dropped to 0, which
 * helps to flush out sources of cached creds that may be less innocuous
 * than the driver case.
 */

int zone_wait_for_cred = 0;
1907
1908static void
1909zone_hold_locked(zone_t *z)
1910{
1911	ASSERT(MUTEX_HELD(&z->zone_lock));
1912	z->zone_ref++;
1913	ASSERT(z->zone_ref != 0);
1914}
1915
1916void
1917zone_hold(zone_t *z)
1918{
1919	mutex_enter(&z->zone_lock);
1920	zone_hold_locked(z);
1921	mutex_exit(&z->zone_lock);
1922}
1923
/*
 * A zone is ready to be destroyed once its non-cred reference count has
 * dropped to 1 and, in addition, either nobody asked to wait for cred
 * references (zone_wait_for_cred is 0) or the cred reference count has
 * reached 0.
 */
#define	ZONE_IS_UNREF(zone)					\
	((zone)->zone_ref == 1 &&				\
	    (!zone_wait_for_cred || (zone)->zone_cred_ref == 0))
1931
1932void
1933zone_rele(zone_t *z)
1934{
1935	boolean_t wakeup;
1936
1937	mutex_enter(&z->zone_lock);
1938	ASSERT(z->zone_ref != 0);
1939	z->zone_ref--;
1940	if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
1941		/* no more refs, free the structure */
1942		mutex_exit(&z->zone_lock);
1943		zone_free(z);
1944		return;
1945	}
1946	/* signal zone_destroy so the zone can finish halting */
1947	wakeup = (ZONE_IS_UNREF(z) && zone_status_get(z) >= ZONE_IS_DEAD);
1948	mutex_exit(&z->zone_lock);
1949
1950	if (wakeup) {
1951		/*
1952		 * Grabbing zonehash_lock here effectively synchronizes with
1953		 * zone_destroy() to avoid missed signals.
1954		 */
1955		mutex_enter(&zonehash_lock);
1956		cv_broadcast(&zone_destroy_cv);
1957		mutex_exit(&zonehash_lock);
1958	}
1959}
1960
1961void
1962zone_cred_hold(zone_t *z)
1963{
1964	mutex_enter(&z->zone_lock);
1965	z->zone_cred_ref++;
1966	ASSERT(z->zone_cred_ref != 0);
1967	mutex_exit(&z->zone_lock);
1968}
1969
1970void
1971zone_cred_rele(zone_t *z)
1972{
1973	boolean_t wakeup;
1974
1975	mutex_enter(&z->zone_lock);
1976	ASSERT(z->zone_cred_ref != 0);
1977	z->zone_cred_ref--;
1978	if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
1979		/* no more refs, free the structure */
1980		mutex_exit(&z->zone_lock);
1981		zone_free(z);
1982		return;
1983	}
1984	/*
1985	 * If zone_destroy is waiting for the cred references to drain
1986	 * out, and they have, signal it.
1987	 */
1988	wakeup = (zone_wait_for_cred && ZONE_IS_UNREF(z) &&
1989	    zone_status_get(z) >= ZONE_IS_DEAD);
1990	mutex_exit(&z->zone_lock);
1991
1992	if (wakeup) {
1993		/*
1994		 * Grabbing zonehash_lock here effectively synchronizes with
1995		 * zone_destroy() to avoid missed signals.
1996		 */
1997		mutex_enter(&zonehash_lock);
1998		cv_broadcast(&zone_destroy_cv);
1999		mutex_exit(&zonehash_lock);
2000	}
2001}
2002
2003void
2004zone_task_hold(zone_t *z)
2005{
2006	mutex_enter(&z->zone_lock);
2007	z->zone_ntasks++;
2008	ASSERT(z->zone_ntasks != 0);
2009	mutex_exit(&z->zone_lock);
2010}
2011
2012void
2013zone_task_rele(zone_t *zone)
2014{
2015	uint_t refcnt;
2016
2017	mutex_enter(&zone->zone_lock);
2018	ASSERT(zone->zone_ntasks != 0);
2019	refcnt = --zone->zone_ntasks;
2020	if (refcnt > 1)	{	/* Common case */
2021		mutex_exit(&zone->zone_lock);
2022		return;
2023	}
2024	zone_hold_locked(zone);	/* so we can use the zone_t later */
2025	mutex_exit(&zone->zone_lock);
2026	if (refcnt == 1) {
2027		/*
2028		 * See if the zone is shutting down.
2029		 */
2030		mutex_enter(&zone_status_lock);
2031		if (zone_status_get(zone) != ZONE_IS_SHUTTING_DOWN) {
2032			goto out;
2033		}
2034
2035		/*
2036		 * Make sure the ntasks didn't change since we
2037		 * dropped zone_lock.
2038		 */
2039		mutex_enter(&zone->zone_lock);
2040		if (refcnt != zone->zone_ntasks) {
2041			mutex_exit(&zone->zone_lock);
2042			goto out;
2043		}
2044		mutex_exit(&zone->zone_lock);
2045
2046		/*
2047		 * No more user processes in the zone.  The zone is empty.
2048		 */
2049		zone_status_set(zone, ZONE_IS_EMPTY);
2050		goto out;
2051	}
2052
2053	ASSERT(refcnt == 0);
2054	/*
2055	 * zsched has exited; the zone is dead.
2056	 */
2057	zone->zone_zsched = NULL;		/* paranoia */
2058	mutex_enter(&zone_status_lock);
2059	zone_status_set(zone, ZONE_IS_DEAD);
2060out:
2061	mutex_exit(&zone_status_lock);
2062	zone_rele(zone);
2063}
2064
2065zoneid_t
2066getzoneid(void)
2067{
2068	return (curproc->p_zone->zone_id);
2069}
2070
2071/*
2072 * Internal versions of zone_find_by_*().  These don't zone_hold() or
2073 * check the validity of a zone's state.
2074 */
2075static zone_t *
2076zone_find_all_by_id(zoneid_t zoneid)
2077{
2078	mod_hash_val_t hv;
2079	zone_t *zone = NULL;
2080
2081	ASSERT(MUTEX_HELD(&zonehash_lock));
2082
2083	if (mod_hash_find(zonehashbyid,
2084	    (mod_hash_key_t)(uintptr_t)zoneid, &hv) == 0)
2085		zone = (zone_t *)hv;
2086	return (zone);
2087}
2088
2089static zone_t *
2090zone_find_all_by_label(const ts_label_t *label)
2091{
2092	mod_hash_val_t hv;
2093	zone_t *zone = NULL;
2094
2095	ASSERT(MUTEX_HELD(&zonehash_lock));
2096
2097	/*
2098	 * zonehashbylabel is not maintained for unlabeled systems
2099	 */
2100	if (!is_system_labeled())
2101		return (NULL);
2102	if (mod_hash_find(zonehashbylabel, (mod_hash_key_t)label, &hv) == 0)
2103		zone = (zone_t *)hv;
2104	return (zone);
2105}
2106
2107static zone_t *
2108zone_find_all_by_name(char *name)
2109{
2110	mod_hash_val_t hv;
2111	zone_t *zone = NULL;
2112
2113	ASSERT(MUTEX_HELD(&zonehash_lock));
2114
2115	if (mod_hash_find(zonehashbyname, (mod_hash_key_t)name, &hv) == 0)
2116		zone = (zone_t *)hv;
2117	return (zone);
2118}
2119
2120/*
2121 * Public interface for looking up a zone by zoneid.  Only returns the zone if
2122 * it is fully initialized, and has not yet begun the zone_destroy() sequence.
2123 * Caller must call zone_rele() once it is done with the zone.
2124 *
2125 * The zone may begin the zone_destroy() sequence immediately after this
2126 * function returns, but may be safely used until zone_rele() is called.
2127 */
2128zone_t *
2129zone_find_by_id(zoneid_t zoneid)
2130{
2131	zone_t *zone;
2132	zone_status_t status;
2133
2134	mutex_enter(&zonehash_lock);
2135	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
2136		mutex_exit(&zonehash_lock);
2137		return (NULL);
2138	}
2139	status = zone_status_get(zone);
2140	if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
2141		/*
2142		 * For all practical purposes the zone doesn't exist.
2143		 */
2144		mutex_exit(&zonehash_lock);
2145		return (NULL);
2146	}
2147	zone_hold(zone);
2148	mutex_exit(&zonehash_lock);
2149	return (zone);
2150}
2151
2152/*
2153 * Similar to zone_find_by_id, but using zone label as the key.
2154 */
2155zone_t *
2156zone_find_by_label(const ts_label_t *label)
2157{
2158	zone_t *zone;
2159	zone_status_t status;
2160
2161	mutex_enter(&zonehash_lock);
2162	if ((zone = zone_find_all_by_label(label)) == NULL) {
2163		mutex_exit(&zonehash_lock);
2164		return (NULL);
2165	}
2166
2167	status = zone_status_get(zone);
2168	if (status > ZONE_IS_DOWN) {
2169		/*
2170		 * For all practical purposes the zone doesn't exist.
2171		 */
2172		mutex_exit(&zonehash_lock);
2173		return (NULL);
2174	}
2175	zone_hold(zone);
2176	mutex_exit(&zonehash_lock);
2177	return (zone);
2178}
2179
2180/*
2181 * Similar to zone_find_by_id, but using zone name as the key.
2182 */
2183zone_t *
2184zone_find_by_name(char *name)
2185{
2186	zone_t *zone;
2187	zone_status_t status;
2188
2189	mutex_enter(&zonehash_lock);
2190	if ((zone = zone_find_all_by_name(name)) == NULL) {
2191		mutex_exit(&zonehash_lock);
2192		return (NULL);
2193	}
2194	status = zone_status_get(zone);
2195	if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
2196		/*
2197		 * For all practical purposes the zone doesn't exist.
2198		 */
2199		mutex_exit(&zonehash_lock);
2200		return (NULL);
2201	}
2202	zone_hold(zone);
2203	mutex_exit(&zonehash_lock);
2204	return (zone);
2205}
2206
2207/*
2208 * Similar to zone_find_by_id(), using the path as a key.  For instance,
2209 * if there is a zone "foo" rooted at /foo/root, and the path argument
2210 * is "/foo/root/proc", it will return the held zone_t corresponding to
2211 * zone "foo".
2212 *
2213 * zone_find_by_path() always returns a non-NULL value, since at the
2214 * very least every path will be contained in the global zone.
2215 *
2216 * As with the other zone_find_by_*() functions, the caller is
2217 * responsible for zone_rele()ing the return value of this function.
2218 */
2219zone_t *
2220zone_find_by_path(const char *path)
2221{
2222	zone_t *zone;
2223	zone_t *zret = NULL;
2224	zone_status_t status;
2225
2226	if (path == NULL) {
2227		/*
2228		 * Call from rootconf().
2229		 */
2230		zone_hold(global_zone);
2231		return (global_zone);
2232	}
2233	ASSERT(*path == '/');
2234	mutex_enter(&zonehash_lock);
2235	for (zone = list_head(&zone_active); zone != NULL;
2236	    zone = list_next(&zone_active, zone)) {
2237		if (ZONE_PATH_VISIBLE(path, zone))
2238			zret = zone;
2239	}
2240	ASSERT(zret != NULL);
2241	status = zone_status_get(zret);
2242	if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
2243		/*
2244		 * Zone practically doesn't exist.
2245		 */
2246		zret = global_zone;
2247	}
2248	zone_hold(zret);
2249	mutex_exit(&zonehash_lock);
2250	return (zret);
2251}
2252
2253/*
2254 * Get the number of cpus visible to this zone.  The system-wide global
2255 * 'ncpus' is returned if pools are disabled, the caller is in the
2256 * global zone, or a NULL zone argument is passed in.
2257 */
2258int
2259zone_ncpus_get(zone_t *zone)
2260{
2261	int myncpus = zone == NULL ? 0 : zone->zone_ncpus;
2262
2263	return (myncpus != 0 ? myncpus : ncpus);
2264}
2265
2266/*
2267 * Get the number of online cpus visible to this zone.  The system-wide
2268 * global 'ncpus_online' is returned if pools are disabled, the caller
2269 * is in the global zone, or a NULL zone argument is passed in.
2270 */
2271int
2272zone_ncpus_online_get(zone_t *zone)
2273{
2274	int myncpus_online = zone == NULL ? 0 : zone->zone_ncpus_online;
2275
2276	return (myncpus_online != 0 ? myncpus_online : ncpus_online);
2277}
2278
2279/*
2280 * Return the pool to which the zone is currently bound.
2281 */
2282pool_t *
2283zone_pool_get(zone_t *zone)
2284{
2285	ASSERT(pool_lock_held());
2286
2287	return (zone->zone_pool);
2288}
2289
2290/*
2291 * Set the zone's pool pointer and update the zone's visibility to match
2292 * the resources in the new pool.
2293 */
2294void
2295zone_pool_set(zone_t *zone, pool_t *pool)
2296{
2297	ASSERT(pool_lock_held());
2298	ASSERT(MUTEX_HELD(&cpu_lock));
2299
2300	zone->zone_pool = pool;
2301	zone_pset_set(zone, pool->pool_pset->pset_id);
2302}
2303
2304/*
2305 * Return the cached value of the id of the processor set to which the
2306 * zone is currently bound.  The value will be ZONE_PS_INVAL if the pools
2307 * facility is disabled.
2308 */
2309psetid_t
2310zone_pset_get(zone_t *zone)
2311{
2312	ASSERT(MUTEX_HELD(&cpu_lock));
2313
2314	return (zone->zone_psetid);
2315}
2316
2317/*
2318 * Set the cached value of the id of the processor set to which the zone
2319 * is currently bound.  Also update the zone's visibility to match the
2320 * resources in the new processor set.
2321 */
2322void
2323zone_pset_set(zone_t *zone, psetid_t newpsetid)
2324{
2325	psetid_t oldpsetid;
2326
2327	ASSERT(MUTEX_HELD(&cpu_lock));
2328	oldpsetid = zone_pset_get(zone);
2329
2330	if (oldpsetid == newpsetid)
2331		return;
2332	/*
2333	 * Global zone sees all.
2334	 */
2335	if (zone != global_zone) {
2336		zone->zone_psetid = newpsetid;
2337		if (newpsetid != ZONE_PS_INVAL)
2338			pool_pset_visibility_add(newpsetid, zone);
2339		if (oldpsetid != ZONE_PS_INVAL)
2340			pool_pset_visibility_remove(oldpsetid, zone);
2341	}
2342	/*
2343	 * Disabling pools, so we should start using the global values
2344	 * for ncpus and ncpus_online.
2345	 */
2346	if (newpsetid == ZONE_PS_INVAL) {
2347		zone->zone_ncpus = 0;
2348		zone->zone_ncpus_online = 0;
2349	}
2350}
2351
2352/*
2353 * Walk the list of active zones and issue the provided callback for
2354 * each of them.
2355 *
2356 * Caller must not be holding any locks that may be acquired under
2357 * zonehash_lock.  See comment at the beginning of the file for a list of
2358 * common locks and their interactions with zones.
2359 */
2360int
2361zone_walk(int (*cb)(zone_t *, void *), void *data)
2362{
2363	zone_t *zone;
2364	int ret = 0;
2365	zone_status_t status;
2366
2367	mutex_enter(&zonehash_lock);
2368	for (zone = list_head(&zone_active); zone != NULL;
2369	    zone = list_next(&zone_active, zone)) {
2370		/*
2371		 * Skip zones that shouldn't be externally visible.
2372		 */
2373		status = zone_status_get(zone);
2374		if (status < ZONE_IS_READY || status > ZONE_IS_DOWN)
2375			continue;
2376		/*
2377		 * Bail immediately if any callback invocation returns a
2378		 * non-zero value.
2379		 */
2380		ret = (*cb)(zone, data);
2381		if (ret != 0)
2382			break;
2383	}
2384	mutex_exit(&zonehash_lock);
2385	return (ret);
2386}
2387
2388static int
2389zone_set_root(zone_t *zone, const char *upath)
2390{
2391	vnode_t *vp;
2392	int trycount;
2393	int error = 0;
2394	char *path;
2395	struct pathname upn, pn;
2396	size_t pathlen;
2397
2398	if ((error = pn_get((char *)upath, UIO_USERSPACE, &upn)) != 0)
2399		return (error);
2400
2401	pn_alloc(&pn);
2402
2403	/* prevent infinite loop */
2404	trycount = 10;
2405	for (;;) {
2406		if (--trycount <= 0) {
2407			error = ESTALE;
2408			goto out;
2409		}
2410
2411		if ((error = lookuppn(&upn, &pn, FOLLOW, NULLVPP, &vp)) == 0) {
2412			/*
2413			 * VOP_ACCESS() may cover 'vp' with a new
2414			 * filesystem, if 'vp' is an autoFS vnode.
2415			 * Get the new 'vp' if so.
2416			 */
2417			if ((error = VOP_ACCESS(vp, VEXEC, 0, CRED())) == 0 &&
2418			    (!vn_ismntpt(vp) ||
2419			    (error = traverse(&vp)) == 0)) {
2420				pathlen = pn.pn_pathlen + 2;
2421				path = kmem_alloc(pathlen, KM_SLEEP);
2422				(void) strncpy(path, pn.pn_path,
2423				    pn.pn_pathlen + 1);
2424				path[pathlen - 2] = '/';
2425				path[pathlen - 1] = '\0';
2426				pn_free(&pn);
2427				pn_free(&upn);
2428
2429				/* Success! */
2430				break;
2431			}
2432			VN_RELE(vp);
2433		}
2434		if (error != ESTALE)
2435			goto out;
2436	}
2437
2438	ASSERT(error == 0);
2439	zone->zone_rootvp = vp;		/* we hold a reference to vp */
2440	zone->zone_rootpath = path;
2441	zone->zone_rootpathlen = pathlen;
2442	if (pathlen > 5 && strcmp(path + pathlen - 5, "/lu/") == 0)
2443		zone->zone_flags |= ZF_IS_SCRATCH;
2444	return (0);
2445
2446out:
2447	pn_free(&pn);
2448	pn_free(&upn);
2449	return (error);
2450}
2451
/* True when 'c' is an ASCII digit or an ASCII letter of either case. */
#define	isalnum(c)							\
	(((c) >= '0' && (c) <= '9') ||					\
	((c) >= 'a' && (c) <= 'z') ||					\
	((c) >= 'A' && (c) <= 'Z'))
2455
2456static int
2457zone_set_name(zone_t *zone, const char *uname)
2458{
2459	char *kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
2460	size_t len;
2461	int i, err;
2462
2463	if ((err = copyinstr(uname, kname, ZONENAME_MAX, &len)) != 0) {
2464		kmem_free(kname, ZONENAME_MAX);
2465		return (err);	/* EFAULT or ENAMETOOLONG */
2466	}
2467
2468	/* must be less than ZONENAME_MAX */
2469	if (len == ZONENAME_MAX && kname[ZONENAME_MAX - 1] != '\0') {
2470		kmem_free(kname, ZONENAME_MAX);
2471		return (EINVAL);
2472	}
2473
2474	/*
2475	 * Name must start with an alphanumeric and must contain only
2476	 * alphanumerics, '-', '_' and '.'.
2477	 */
2478	if (!isalnum(kname[0])) {
2479		kmem_free(kname, ZONENAME_MAX);
2480		return (EINVAL);
2481	}
2482	for (i = 1; i < len - 1; i++) {
2483		if (!isalnum(kname[i]) && kname[i] != '-' && kname[i] != '_' &&
2484		    kname[i] != '.') {
2485			kmem_free(kname, ZONENAME_MAX);
2486			return (EINVAL);
2487		}
2488	}
2489
2490	zone->zone_name = kname;
2491	return (0);
2492}
2493
2494/*
2495 * Similar to thread_create(), but makes sure the thread is in the appropriate
2496 * zone's zsched process (curproc->p_zone->zone_zsched) before returning.
2497 */
2498/*ARGSUSED*/
2499kthread_t *
2500zthread_create(
2501    caddr_t stk,
2502    size_t stksize,
2503    void (*proc)(),
2504    void *arg,
2505    size_t len,
2506    pri_t pri)
2507{
2508	kthread_t *t;
2509	zone_t *zone = curproc->p_zone;
2510	proc_t *pp = zone->zone_zsched;
2511
2512	zone_hold(zone);	/* Reference to be dropped when thread exits */
2513
2514	/*
2515	 * No-one should be trying to create threads if the zone is shutting
2516	 * down and there aren't any kernel threads around.  See comment
2517	 * in zthread_exit().
2518	 */
2519	ASSERT(!(zone->zone_kthreads == NULL &&
2520	    zone_status_get(zone) >= ZONE_IS_EMPTY));
2521	/*
2522	 * Create a thread, but don't let it run until we've finished setting
2523	 * things up.
2524	 */
2525	t = thread_create(stk, stksize, proc, arg, len, pp, TS_STOPPED, pri);
2526	ASSERT(t->t_forw == NULL);
2527	mutex_enter(&zone_status_lock);
2528	if (zone->zone_kthreads == NULL) {
2529		t->t_forw = t->t_back = t;
2530	} else {
2531		kthread_t *tx = zone->zone_kthreads;
2532
2533		t->t_forw = tx;
2534		t->t_back = tx->t_back;
2535		tx->t_back->t_forw = t;
2536		tx->t_back = t;
2537	}
2538	zone->zone_kthreads = t;
2539	mutex_exit(&zone_status_lock);
2540
2541	mutex_enter(&pp->p_lock);
2542	t->t_proc_flag |= TP_ZTHREAD;
2543	project_rele(t->t_proj);
2544	t->t_proj = project_hold(pp->p_task->tk_proj);
2545
2546	/*
2547	 * Setup complete, let it run.
2548	 */
2549	thread_lock(t);
2550	t->t_schedflag |= TS_ALLSTART;
2551	setrun_locked(t);
2552	thread_unlock(t);
2553
2554	mutex_exit(&pp->p_lock);
2555
2556	return (t);
2557}
2558
2559/*
2560 * Similar to thread_exit().  Must be called by threads created via
2561 * zthread_exit().
2562 */
2563void
2564zthread_exit(void)
2565{
2566	kthread_t *t = curthread;
2567	proc_t *pp = curproc;
2568	zone_t *zone = pp->p_zone;
2569
2570	mutex_enter(&zone_status_lock);
2571
2572	/*
2573	 * Reparent to p0
2574	 */
2575	kpreempt_disable();
2576	mutex_enter(&pp->p_lock);
2577	t->t_proc_flag &= ~TP_ZTHREAD;
2578	t->t_procp = &p0;
2579	hat_thread_exit(t);
2580	mutex_exit(&pp->p_lock);
2581	kpreempt_enable();
2582
2583	if (t->t_back == t) {
2584		ASSERT(t->t_forw == t);
2585		/*
2586		 * If the zone is empty, once the thread count
2587		 * goes to zero no further kernel threads can be
2588		 * created.  This is because if the creator is a process
2589		 * in the zone, then it must have exited before the zone
2590		 * state could be set to ZONE_IS_EMPTY.
2591		 * Otherwise, if the creator is a kernel thread in the
2592		 * zone, the thread count is non-zero.
2593		 *
2594		 * This really means that non-zone kernel threads should
2595		 * not create zone kernel threads.
2596		 */
2597		zone->zone_kthreads = NULL;
2598		if (zone_status_get(zone) == ZONE_IS_EMPTY) {
2599			zone_status_set(zone, ZONE_IS_DOWN);
2600			/*
2601			 * Remove any CPU caps on this zone.
2602			 */
2603			cpucaps_zone_remove(zone);
2604		}
2605	} else {
2606		t->t_forw->t_back = t->t_back;
2607		t->t_back->t_forw = t->t_forw;
2608		if (zone->zone_kthreads == t)
2609			zone->zone_kthreads = t->t_forw;
2610	}
2611	mutex_exit(&zone_status_lock);
2612	zone_rele(zone);
2613	thread_exit();
2614	/* NOTREACHED */
2615}
2616
2617static void
2618zone_chdir(vnode_t *vp, vnode_t **vpp, proc_t *pp)
2619{
2620	vnode_t *oldvp;
2621
2622	/* we're going to hold a reference here to the directory */
2623	VN_HOLD(vp);
2624
2625#ifdef C2_AUDIT
2626	if (audit_active)	/* update abs cwd/root path see c2audit.c */
2627		audit_chdirec(vp, vpp);
2628#endif
2629
2630	mutex_enter(&pp->p_lock);
2631	oldvp = *vpp;
2632	*vpp = vp;
2633	mutex_exit(&pp->p_lock);
2634	if (oldvp != NULL)
2635		VN_RELE(oldvp);
2636}
2637
2638/*
2639 * Convert an rctl value represented by an nvlist_t into an rctl_val_t.
2640 */
2641static int
2642nvlist2rctlval(nvlist_t *nvl, rctl_val_t *rv)
2643{
2644	nvpair_t *nvp = NULL;
2645	boolean_t priv_set = B_FALSE;
2646	boolean_t limit_set = B_FALSE;
2647	boolean_t action_set = B_FALSE;
2648
2649	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
2650		const char *name;
2651		uint64_t ui64;
2652
2653		name = nvpair_name(nvp);
2654		if (nvpair_type(nvp) != DATA_TYPE_UINT64)
2655			return (EINVAL);
2656		(void) nvpair_value_uint64(nvp, &ui64);
2657		if (strcmp(name, "privilege") == 0) {
2658			/*
2659			 * Currently only privileged values are allowed, but
2660			 * this may change in the future.
2661			 */
2662			if (ui64 != RCPRIV_PRIVILEGED)
2663				return (EINVAL);
2664			rv->rcv_privilege = ui64;
2665			priv_set = B_TRUE;
2666		} else if (strcmp(name, "limit") == 0) {
2667			rv->rcv_value = ui64;
2668			limit_set = B_TRUE;
2669		} else if (strcmp(name, "action") == 0) {
2670			if (ui64 != RCTL_LOCAL_NOACTION &&
2671			    ui64 != RCTL_LOCAL_DENY)
2672				return (EINVAL);
2673			rv->rcv_flagaction = ui64;
2674			action_set = B_TRUE;
2675		} else {
2676			return (EINVAL);
2677		}
2678	}
2679
2680	if (!(priv_set && limit_set && action_set))
2681		return (EINVAL);
2682	rv->rcv_action_signal = 0;
2683	rv->rcv_action_recipient = NULL;
2684	rv->rcv_action_recip_pid = -1;
2685	rv->rcv_firing_time = 0;
2686
2687	return (0);
2688}
2689
2690/*
2691 * Non-global zone version of start_init.
2692 */
2693void
2694zone_start_init(void)
2695{
2696	proc_t *p = ttoproc(curthread);
2697	zone_t *z = p->p_zone;
2698
2699	ASSERT(!INGLOBALZONE(curproc));
2700
2701	/*
2702	 * For all purposes (ZONE_ATTR_INITPID and restart_init),
2703	 * storing just the pid of init is sufficient.
2704	 */
2705	z->zone_proc_initpid = p->p_pid;
2706
2707	/*
2708	 * We maintain zone_boot_err so that we can return the cause of the
2709	 * failure back to the caller of the zone_boot syscall.
2710	 */
2711	p->p_zone->zone_boot_err = start_init_common();
2712
2713	mutex_enter(&zone_status_lock);
2714	if (z->zone_boot_err != 0) {
2715		/*
2716		 * Make sure we are still in the booting state-- we could have
2717		 * raced and already be shutting down, or even further along.
2718		 */
2719		if (zone_status_get(z) == ZONE_IS_BOOTING) {
2720			zone_status_set(z, ZONE_IS_SHUTTING_DOWN);
2721		}
2722		mutex_exit(&zone_status_lock);
2723		/* It's gone bad, dispose of the process */
2724		if (proc_exit(CLD_EXITED, z->zone_boot_err) != 0) {
2725			mutex_enter(&p->p_lock);
2726			ASSERT(p->p_flag & SEXITLWPS);
2727			lwp_exit();
2728		}
2729	} else {
2730		if (zone_status_get(z) == ZONE_IS_BOOTING)
2731			zone_status_set(z, ZONE_IS_RUNNING);
2732		mutex_exit(&zone_status_lock);
2733		/* cause the process to return to userland. */
2734		lwp_rtt();
2735	}
2736}
2737
2738struct zsched_arg {
2739	zone_t *zone;
2740	nvlist_t *nvlist;
2741};
2742
2743/*
2744 * Per-zone "sched" workalike.  The similarity to "sched" doesn't have
2745 * anything to do with scheduling, but rather with the fact that
2746 * per-zone kernel threads are parented to zsched, just like regular
2747 * kernel threads are parented to sched (p0).
2748 *
2749 * zsched is also responsible for launching init for the zone.
2750 */
2751static void
2752zsched(void *arg)
2753{
2754	struct zsched_arg *za = arg;
2755	proc_t *pp = curproc;
2756	proc_t *initp = proc_init;
2757	zone_t *zone = za->zone;
2758	cred_t *cr, *oldcred;
2759	rctl_set_t *set;
2760	rctl_alloc_gp_t *gp;
2761	contract_t *ct = NULL;
2762	task_t *tk, *oldtk;
2763	rctl_entity_p_t e;
2764	kproject_t *pj;
2765
2766	nvlist_t *nvl = za->nvlist;
2767	nvpair_t *nvp = NULL;
2768
2769	bcopy("zsched", PTOU(pp)->u_psargs, sizeof ("zsched"));
2770	bcopy("zsched", PTOU(pp)->u_comm, sizeof ("zsched"));
2771	PTOU(pp)->u_argc = 0;
2772	PTOU(pp)->u_argv = NULL;
2773	PTOU(pp)->u_envp = NULL;
2774	closeall(P_FINFO(pp));
2775
2776	/*
2777	 * We are this zone's "zsched" process.  As the zone isn't generally
2778	 * visible yet we don't need to grab any locks before initializing its
2779	 * zone_proc pointer.
2780	 */
2781	zone_hold(zone);  /* this hold is released by zone_destroy() */
2782	zone->zone_zsched = pp;
2783	mutex_enter(&pp->p_lock);
2784	pp->p_zone = zone;
2785	mutex_exit(&pp->p_lock);
2786
2787	/*
2788	 * Disassociate process from its 'parent'; parent ourselves to init
2789	 * (pid 1) and change other values as needed.
2790	 */
2791	sess_create();
2792
2793	mutex_enter(&pidlock);
2794	proc_detach(pp);
2795	pp->p_ppid = 1;
2796	pp->p_flag |= SZONETOP;
2797	pp->p_ancpid = 1;
2798	pp->p_parent = initp;
2799	pp->p_psibling = NULL;
2800	if (initp->p_child)
2801		initp->p_child->p_psibling = pp;
2802	pp->p_sibling = initp->p_child;
2803	initp->p_child = pp;
2804
2805	/* Decrement what newproc() incremented. */
2806	upcount_dec(crgetruid(CRED()), GLOBAL_ZONEID);
2807	/*
2808	 * Our credentials are about to become kcred-like, so we don't care
2809	 * about the caller's ruid.
2810	 */
2811	upcount_inc(crgetruid(kcred), zone->zone_id);
2812	mutex_exit(&pidlock);
2813
2814	/*
2815	 * getting out of global zone, so decrement lwp counts
2816	 */
2817	pj = pp->p_task->tk_proj;
2818	mutex_enter(&global_zone->zone_nlwps_lock);
2819	pj->kpj_nlwps -= pp->p_lwpcnt;
2820	global_zone->zone_nlwps -= pp->p_lwpcnt;
2821	mutex_exit(&global_zone->zone_nlwps_lock);
2822
2823	/*
2824	 * Decrement locked memory counts on old zone and project.
2825	 */
2826	mutex_enter(&global_zone->zone_mem_lock);
2827	global_zone->zone_locked_mem -= pp->p_locked_mem;
2828	pj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
2829	mutex_exit(&global_zone->zone_mem_lock);
2830
2831	/*
2832	 * Create and join a new task in project '0' of this zone.
2833	 *
2834	 * We don't need to call holdlwps() since we know we're the only lwp in
2835	 * this process.
2836	 *
2837	 * task_join() returns with p_lock held.
2838	 */
2839	tk = task_create(0, zone);
2840	mutex_enter(&cpu_lock);
2841	oldtk = task_join(tk, 0);
2842
2843	pj = pp->p_task->tk_proj;
2844
2845	mutex_enter(&zone->zone_mem_lock);
2846	zone->zone_locked_mem += pp->p_locked_mem;
2847	pj->kpj_data.kpd_locked_mem += pp->p_locked_mem;
2848	mutex_exit(&zone->zone_mem_lock);
2849
2850	/*
2851	 * add lwp counts to zsched's zone, and increment project's task count
2852	 * due to the task created in the above tasksys_settaskid
2853	 */
2854
2855	mutex_enter(&zone->zone_nlwps_lock);
2856	pj->kpj_nlwps += pp->p_lwpcnt;
2857	pj->kpj_ntasks += 1;
2858	zone->zone_nlwps += pp->p_lwpcnt;
2859	mutex_exit(&zone->zone_nlwps_lock);
2860
2861	mutex_exit(&curproc->p_lock);
2862	mutex_exit(&cpu_lock);
2863	task_rele(oldtk);
2864
2865	/*
2866	 * The process was created by a process in the global zone, hence the
2867	 * credentials are wrong.  We might as well have kcred-ish credentials.
2868	 */
2869	cr = zone->zone_kcred;
2870	crhold(cr);
2871	mutex_enter(&pp->p_crlock);
2872	oldcred = pp->p_cred;
2873	pp->p_cred = cr;
2874	mutex_exit(&pp->p_crlock);
2875	crfree(oldcred);
2876
2877	/*
2878	 * Hold credentials again (for thread)
2879	 */
2880	crhold(cr);
2881
2882	/*
2883	 * p_lwpcnt can't change since this is a kernel process.
2884	 */
2885	crset(pp, cr);
2886
2887	/*
2888	 * Chroot
2889	 */
2890	zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_cdir, pp);
2891	zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_rdir, pp);
2892
2893	/*
2894	 * Initialize zone's rctl set.
2895	 */
2896	set = rctl_set_create();
2897	gp = rctl_set_init_prealloc(RCENTITY_ZONE);
2898	mutex_enter(&pp->p_lock);
2899	e.rcep_p.zone = zone;
2900	e.rcep_t = RCENTITY_ZONE;
2901	zone->zone_rctls = rctl_set_init(RCENTITY_ZONE, pp, &e, set, gp);
2902	mutex_exit(&pp->p_lock);
2903	rctl_prealloc_destroy(gp);
2904
2905	/*
2906	 * Apply the rctls passed in to zone_create().  This is basically a list
2907	 * assignment: all of the old values are removed and the new ones
2908	 * inserted.  That is, if an empty list is passed in, all values are
2909	 * removed.
2910	 */
2911	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
2912		rctl_dict_entry_t *rde;
2913		rctl_hndl_t hndl;
2914		char *name;
2915		nvlist_t **nvlarray;
2916		uint_t i, nelem;
2917		int error;	/* For ASSERT()s */
2918
2919		name = nvpair_name(nvp);
2920		hndl = rctl_hndl_lookup(name);
2921		ASSERT(hndl != -1);
2922		rde = rctl_dict_lookup_hndl(hndl);
2923		ASSERT(rde != NULL);
2924
2925		for (; /* ever */; ) {
2926			rctl_val_t oval;
2927
2928			mutex_enter(&pp->p_lock);
2929			error = rctl_local_get(hndl, NULL, &oval, pp);
2930			mutex_exit(&pp->p_lock);
2931			ASSERT(error == 0);	/* Can't fail for RCTL_FIRST */
2932			ASSERT(oval.rcv_privilege != RCPRIV_BASIC);
2933			if (oval.rcv_privilege == RCPRIV_SYSTEM)
2934				break;
2935			mutex_enter(&pp->p_lock);
2936			error = rctl_local_delete(hndl, &oval, pp);
2937			mutex_exit(&pp->p_lock);
2938			ASSERT(error == 0);
2939		}
2940		error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
2941		ASSERT(error == 0);
2942		for (i = 0; i < nelem; i++) {
2943			rctl_val_t *nvalp;
2944
2945			nvalp = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
2946			error = nvlist2rctlval(nvlarray[i], nvalp);
2947			ASSERT(error == 0);
2948			/*
2949			 * rctl_local_insert can fail if the value being
2950			 * inserted is a duplicate; this is OK.
2951			 */
2952			mutex_enter(&pp->p_lock);
2953			if (rctl_local_insert(hndl, nvalp, pp) != 0)
2954				kmem_cache_free(rctl_val_cache, nvalp);
2955			mutex_exit(&pp->p_lock);
2956		}
2957	}
2958	/*
2959	 * Tell the world that we're done setting up.
2960	 *
2961	 * At this point we want to set the zone status to ZONE_IS_READY
2962	 * and atomically set the zone's processor set visibility.  Once
2963	 * we drop pool_lock() this zone will automatically get updated
2964	 * to reflect any future changes to the pools configuration.
2965	 */
2966	pool_lock();
2967	mutex_enter(&cpu_lock);
2968	mutex_enter(&zonehash_lock);
2969	zone_uniqid(zone);
2970	zone_zsd_configure(zone);
2971	if (pool_state == POOL_ENABLED)
2972		zone_pset_set(zone, pool_default->pool_pset->pset_id);
2973	mutex_enter(&zone_status_lock);
2974	ASSERT(zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
2975	zone_status_set(zone, ZONE_IS_READY);
2976	mutex_exit(&zone_status_lock);
2977	mutex_exit(&zonehash_lock);
2978	mutex_exit(&cpu_lock);
2979	pool_unlock();
2980
2981	/*
2982	 * Once we see the zone transition to the ZONE_IS_BOOTING state,
2983	 * we launch init, and set the state to running.
2984	 */
2985	zone_status_wait_cpr(zone, ZONE_IS_BOOTING, "zsched");
2986
2987	if (zone_status_get(zone) == ZONE_IS_BOOTING) {
2988		id_t cid;
2989
2990		/*
2991		 * Ok, this is a little complicated.  We need to grab the
2992		 * zone's pool's scheduling class ID; note that by now, we
2993		 * are already bound to a pool if we need to be (zoneadmd
2994		 * will have done that to us while we're in the READY
2995		 * state).  *But* the scheduling class for the zone's 'init'
2996		 * must be explicitly passed to newproc, which doesn't
2997		 * respect pool bindings.
2998		 *
2999		 * We hold the pool_lock across the call to newproc() to
3000		 * close the obvious race: the pool's scheduling class
3001		 * could change before we manage to create the LWP with
3002		 * classid 'cid'.
3003		 */
3004		pool_lock();
3005		if (zone->zone_defaultcid > 0)
3006			cid = zone->zone_defaultcid;
3007		else
3008			cid = pool_get_class(zone->zone_pool);
3009		if (cid == -1)
3010			cid = defaultcid;
3011
3012		/*
3013		 * If this fails, zone_boot will ultimately fail.  The
3014		 * state of the zone will be set to SHUTTING_DOWN-- userland
3015		 * will have to tear down the zone, and fail, or try again.
3016		 */
3017		if ((zone->zone_boot_err = newproc(zone_start_init, NULL, cid,
3018		    minclsyspri - 1, &ct)) != 0) {
3019			mutex_enter(&zone_status_lock);
3020			zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
3021			mutex_exit(&zone_status_lock);
3022		}
3023		pool_unlock();
3024	}
3025
3026	/*
3027	 * Wait for zone_destroy() to be called.  This is what we spend
3028	 * most of our life doing.
3029	 */
3030	zone_status_wait_cpr(zone, ZONE_IS_DYING, "zsched");
3031
3032	if (ct)
3033		/*
3034		 * At this point the process contract should be empty.
3035		 * (Though if it isn't, it's not the end of the world.)
3036		 */
3037		VERIFY(contract_abandon(ct, curproc, B_TRUE) == 0);
3038
3039	/*
3040	 * Allow kcred to be freed when all referring processes
3041	 * (including this one) go away.  We can't just do this in
3042	 * zone_free because we need to wait for the zone_cred_ref to
3043	 * drop to 0 before calling zone_free, and the existence of
3044	 * zone_kcred will prevent that.  Thus, we call crfree here to
3045	 * balance the crdup in zone_create.  The crhold calls earlier
3046	 * in zsched will be dropped when the thread and process exit.
3047	 */
3048	crfree(zone->zone_kcred);
3049	zone->zone_kcred = NULL;
3050
3051	exit(CLD_EXITED, 0);
3052}
3053
3054/*
3055 * Helper function to determine if there are any submounts of the
3056 * provided path.  Used to make sure the zone doesn't "inherit" any
3057 * mounts from before it is created.
3058 */
3059static uint_t
3060zone_mount_count(const char *rootpath)
3061{
3062	vfs_t *vfsp;
3063	uint_t count = 0;
3064	size_t rootpathlen = strlen(rootpath);
3065
3066	/*
3067	 * Holding zonehash_lock prevents race conditions with
3068	 * vfs_list_add()/vfs_list_remove() since we serialize with
3069	 * zone_find_by_path().
3070	 */
3071	ASSERT(MUTEX_HELD(&zonehash_lock));
3072	/*
3073	 * The rootpath must end with a '/'
3074	 */
3075	ASSERT(rootpath[rootpathlen - 1] == '/');
3076
3077	/*
3078	 * This intentionally does not count the rootpath itself if that
3079	 * happens to be a mount point.
3080	 */
3081	vfs_list_read_lock();
3082	vfsp = rootvfs;
3083	do {
3084		if (strncmp(rootpath, refstr_value(vfsp->vfs_mntpt),
3085		    rootpathlen) == 0)
3086			count++;
3087		vfsp = vfsp->vfs_next;
3088	} while (vfsp != rootvfs);
3089	vfs_list_unlock();
3090	return (count);
3091}
3092
3093/*
3094 * Helper function to make sure that a zone created on 'rootpath'
3095 * wouldn't end up containing other zones' rootpaths.
3096 */
3097static boolean_t
3098zone_is_nested(const char *rootpath)
3099{
3100	zone_t *zone;
3101	size_t rootpathlen = strlen(rootpath);
3102	size_t len;
3103
3104	ASSERT(MUTEX_HELD(&zonehash_lock));
3105
3106	for (zone = list_head(&zone_active); zone != NULL;
3107	    zone = list_next(&zone_active, zone)) {
3108		if (zone == global_zone)
3109			continue;
3110		len = strlen(zone->zone_rootpath);
3111		if (strncmp(rootpath, zone->zone_rootpath,
3112		    MIN(rootpathlen, len)) == 0)
3113			return (B_TRUE);
3114	}
3115	return (B_FALSE);
3116}
3117
3118static int
3119zone_set_privset(zone_t *zone, const priv_set_t *zone_privs,
3120    size_t zone_privssz)
3121{
3122	priv_set_t *privs = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
3123
3124	if (zone_privssz < sizeof (priv_set_t))
3125		return (set_errno(ENOMEM));
3126
3127	if (copyin(zone_privs, privs, sizeof (priv_set_t))) {
3128		kmem_free(privs, sizeof (priv_set_t));
3129		return (EFAULT);
3130	}
3131
3132	zone->zone_privset = privs;
3133	return (0);
3134}
3135
3136/*
3137 * We make creative use of nvlists to pass in rctls from userland.  The list is
3138 * a list of the following structures:
3139 *
3140 * (name = rctl_name, value = nvpair_list_array)
3141 *
3142 * Where each element of the nvpair_list_array is of the form:
3143 *
3144 * [(name = "privilege", value = RCPRIV_PRIVILEGED),
3145 * 	(name = "limit", value = uint64_t),
3146 * 	(name = "action", value = (RCTL_LOCAL_NOACTION || RCTL_LOCAL_DENY))]
3147 */
3148static int
3149parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp)
3150{
3151	nvpair_t *nvp = NULL;
3152	nvlist_t *nvl = NULL;
3153	char *kbuf;
3154	int error;
3155	rctl_val_t rv;
3156
3157	*nvlp = NULL;
3158
3159	if (buflen == 0)
3160		return (0);
3161
3162	if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
3163		return (ENOMEM);
3164	if (copyin(ubuf, kbuf, buflen)) {
3165		error = EFAULT;
3166		goto out;
3167	}
3168	if (nvlist_unpack(kbuf, buflen, &nvl, KM_SLEEP) != 0) {
3169		/*
3170		 * nvl may have been allocated/free'd, but the value set to
3171		 * non-NULL, so we reset it here.
3172		 */
3173		nvl = NULL;
3174		error = EINVAL;
3175		goto out;
3176	}
3177	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
3178		rctl_dict_entry_t *rde;
3179		rctl_hndl_t hndl;
3180		nvlist_t **nvlarray;
3181		uint_t i, nelem;
3182		char *name;
3183
3184		error = EINVAL;
3185		name = nvpair_name(nvp);
3186		if (strncmp(nvpair_name(nvp), "zone.", sizeof ("zone.") - 1)
3187		    != 0 || nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) {
3188			goto out;
3189		}
3190		if ((hndl = rctl_hndl_lookup(name)) == -1) {
3191			goto out;
3192		}
3193		rde = rctl_dict_lookup_hndl(hndl);
3194		error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
3195		ASSERT(error == 0);
3196		for (i = 0; i < nelem; i++) {
3197			if (error = nvlist2rctlval(nvlarray[i], &rv))
3198				goto out;
3199		}
3200		if (rctl_invalid_value(rde, &rv)) {
3201			error = EINVAL;
3202			goto out;
3203		}
3204	}
3205	error = 0;
3206	*nvlp = nvl;
3207out:
3208	kmem_free(kbuf, buflen);
3209	if (error && nvl != NULL)
3210		nvlist_free(nvl);
3211	return (error);
3212}
3213
3214int
3215zone_create_error(int er_error, int er_ext, int *er_out) {
3216	if (er_out != NULL) {
3217		if (copyout(&er_ext, er_out, sizeof (int))) {
3218			return (set_errno(EFAULT));
3219		}
3220	}
3221	return (set_errno(er_error));
3222}
3223
3224static int
3225zone_set_label(zone_t *zone, const bslabel_t *lab, uint32_t doi)
3226{
3227	ts_label_t *tsl;
3228	bslabel_t blab;
3229
3230	/* Get label from user */
3231	if (copyin(lab, &blab, sizeof (blab)) != 0)
3232		return (EFAULT);
3233	tsl = labelalloc(&blab, doi, KM_NOSLEEP);
3234	if (tsl == NULL)
3235		return (ENOMEM);
3236
3237	zone->zone_slabel = tsl;
3238	return (0);
3239}
3240
3241/*
3242 * Parses a comma-separated list of ZFS datasets into a per-zone dictionary.
3243 */
3244static int
3245parse_zfs(zone_t *zone, caddr_t ubuf, size_t buflen)
3246{
3247	char *kbuf;
3248	char *dataset, *next;
3249	zone_dataset_t *zd;
3250	size_t len;
3251
3252	if (ubuf == NULL || buflen == 0)
3253		return (0);
3254
3255	if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
3256		return (ENOMEM);
3257
3258	if (copyin(ubuf, kbuf, buflen) != 0) {
3259		kmem_free(kbuf, buflen);
3260		return (EFAULT);
3261	}
3262
3263	dataset = next = kbuf;
3264	for (;;) {
3265		zd = kmem_alloc(sizeof (zone_dataset_t), KM_SLEEP);
3266
3267		next = strchr(dataset, ',');
3268
3269		if (next == NULL)
3270			len = strlen(dataset);
3271		else
3272			len = next - dataset;
3273
3274		zd->zd_dataset = kmem_alloc(len + 1, KM_SLEEP);
3275		bcopy(dataset, zd->zd_dataset, len);
3276		zd->zd_dataset[len] = '\0';
3277
3278		list_insert_head(&zone->zone_datasets, zd);
3279
3280		if (next == NULL)
3281			break;
3282
3283		dataset = next + 1;
3284	}
3285
3286	kmem_free(kbuf, buflen);
3287	return (0);
3288}
3289
3290/*
3291 * System call to create/initialize a new zone named 'zone_name', rooted
3292 * at 'zone_root', with a zone-wide privilege limit set of 'zone_privs',
3293 * and initialized with the zone-wide rctls described in 'rctlbuf', and
3294 * with labeling set by 'match', 'doi', and 'label'.
3295 *
3296 * If extended error is non-null, we may use it to return more detailed
3297 * error information.
3298 */
3299static zoneid_t
3300zone_create(const char *zone_name, const char *zone_root,
3301    const priv_set_t *zone_privs, size_t zone_privssz,
3302    caddr_t rctlbuf, size_t rctlbufsz,
3303    caddr_t zfsbuf, size_t zfsbufsz, int *extended_error,
3304    int match, uint32_t doi, const bslabel_t *label,
3305    int flags)
3306{
3307	struct zsched_arg zarg;
3308	nvlist_t *rctls = NULL;
3309	proc_t *pp = curproc;
3310	zone_t *zone, *ztmp;
3311	zoneid_t zoneid;
3312	int error;
3313	int error2 = 0;
3314	char *str;
3315	cred_t *zkcr;
3316	boolean_t insert_label_hash;
3317
3318	if (secpolicy_zone_config(CRED()) != 0)
3319		return (set_errno(EPERM));
3320
3321	/* can't boot zone from within chroot environment */
3322	if (PTOU(pp)->u_rdir != NULL && PTOU(pp)->u_rdir != rootdir)
3323		return (zone_create_error(ENOTSUP, ZE_CHROOTED,
3324		    extended_error));
3325
3326	zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP);
3327	zoneid = zone->zone_id = id_alloc(zoneid_space);
3328	zone->zone_status = ZONE_IS_UNINITIALIZED;
3329	zone->zone_pool = pool_default;
3330	zone->zone_pool_mod = gethrtime();
3331	zone->zone_psetid = ZONE_PS_INVAL;
3332	zone->zone_ncpus = 0;
3333	zone->zone_ncpus_online = 0;
3334	zone->zone_restart_init = B_TRUE;
3335	zone->zone_brand = &native_brand;
3336	zone->zone_initname = NULL;
3337	mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL);
3338	mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
3339	mutex_init(&zone->zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
3340	cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL);
3341	list_create(&zone->zone_zsd, sizeof (struct zsd_entry),
3342	    offsetof(struct zsd_entry, zsd_linkage));
3343	list_create(&zone->zone_datasets, sizeof (zone_dataset_t),
3344	    offsetof(zone_dataset_t, zd_linkage));
3345	rw_init(&zone->zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
3346
3347	if (flags & ZCF_NET_EXCL) {
3348		zone->zone_flags |= ZF_NET_EXCL;
3349	}
3350
3351	if ((error = zone_set_name(zone, zone_name)) != 0) {
3352		zone_free(zone);
3353		return (zone_create_error(error, 0, extended_error));
3354	}
3355
3356	if ((error = zone_set_root(zone, zone_root)) != 0) {
3357		zone_free(zone);
3358		return (zone_create_error(error, 0, extended_error));
3359	}
3360	if ((error = zone_set_privset(zone, zone_privs, zone_privssz)) != 0) {
3361		zone_free(zone);
3362		return (zone_create_error(error, 0, extended_error));
3363	}
3364
3365	/* initialize node name to be the same as zone name */
3366	zone->zone_nodename = kmem_alloc(_SYS_NMLN, KM_SLEEP);
3367	(void) strncpy(zone->zone_nodename, zone->zone_name, _SYS_NMLN);
3368	zone->zone_nodename[_SYS_NMLN - 1] = '\0';
3369
3370	zone->zone_domain = kmem_alloc(_SYS_NMLN, KM_SLEEP);
3371	zone->zone_domain[0] = '\0';
3372	zone->zone_shares = 1;
3373	zone->zone_shmmax = 0;
3374	zone->zone_ipc.ipcq_shmmni = 0;
3375	zone->zone_ipc.ipcq_semmni = 0;
3376	zone->zone_ipc.ipcq_msgmni = 0;
3377	zone->zone_bootargs = NULL;
3378	zone->zone_initname =
3379	    kmem_alloc(strlen(zone_default_initname) + 1, KM_SLEEP);
3380	(void) strcpy(zone->zone_initname, zone_default_initname);
3381	zone->zone_nlwps = 0;
3382	zone->zone_nlwps_ctl = INT_MAX;
3383	zone->zone_locked_mem = 0;
3384	zone->zone_locked_mem_ctl = UINT64_MAX;
3385	zone->zone_max_swap = 0;
3386	zone->zone_max_swap_ctl = UINT64_MAX;
3387	zone0.zone_lockedmem_kstat = NULL;
3388	zone0.zone_swapresv_kstat = NULL;
3389
3390	/*
3391	 * Zsched initializes the rctls.
3392	 */
3393	zone->zone_rctls = NULL;
3394
3395	if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) {
3396		zone_free(zone);
3397		return (zone_create_error(error, 0, extended_error));
3398	}
3399
3400	if ((error = parse_zfs(zone, zfsbuf, zfsbufsz)) != 0) {
3401		zone_free(zone);
3402		return (set_errno(error));
3403	}
3404
3405	/*
3406	 * Read in the trusted system parameters:
3407	 * match flag and sensitivity label.
3408	 */
3409	zone->zone_match = match;
3410	if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
3411		/* Fail if requested to set doi to anything but system's doi */
3412		if (doi != 0 && doi != default_doi) {
3413			zone_free(zone);
3414			return (set_errno(EINVAL));
3415		}
3416		/* Always apply system's doi to the zone */
3417		error = zone_set_label(zone, label, default_doi);
3418		if (error != 0) {
3419			zone_free(zone);
3420			return (set_errno(error));
3421		}
3422		insert_label_hash = B_TRUE;
3423	} else {
3424		/* all zones get an admin_low label if system is not labeled */
3425		zone->zone_slabel = l_admin_low;
3426		label_hold(l_admin_low);
3427		insert_label_hash = B_FALSE;
3428	}
3429
3430	/*
3431	 * Stop all lwps since that's what normally happens as part of fork().
3432	 * This needs to happen before we grab any locks to avoid deadlock
3433	 * (another lwp in the process could be waiting for the held lock).
3434	 */
3435	if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK)) {
3436		zone_free(zone);
3437		if (rctls)
3438			nvlist_free(rctls);
3439		return (zone_create_error(error, 0, extended_error));
3440	}
3441
3442	if (block_mounts() == 0) {
3443		mutex_enter(&pp->p_lock);
3444		if (curthread != pp->p_agenttp)
3445			continuelwps(pp);
3446		mutex_exit(&pp->p_lock);
3447		zone_free(zone);
3448		if (rctls)
3449			nvlist_free(rctls);
3450		return (zone_create_error(error, 0, extended_error));
3451	}
3452
3453	/*
3454	 * Set up credential for kernel access.  After this, any errors
3455	 * should go through the dance in errout rather than calling
3456	 * zone_free directly.
3457	 */
3458	zone->zone_kcred = crdup(kcred);
3459	crsetzone(zone->zone_kcred, zone);
3460	priv_intersect(zone->zone_privset, &CR_PPRIV(zone->zone_kcred));
3461	priv_intersect(zone->zone_privset, &CR_EPRIV(zone->zone_kcred));
3462	priv_intersect(zone->zone_privset, &CR_IPRIV(zone->zone_kcred));
3463	priv_intersect(zone->zone_privset, &CR_LPRIV(zone->zone_kcred));
3464
3465	mutex_enter(&zonehash_lock);
3466	/*
3467	 * Make sure zone doesn't already exist.
3468	 *
3469	 * If the system and zone are labeled,
3470	 * make sure no other zone exists that has the same label.
3471	 */
3472	if ((ztmp = zone_find_all_by_name(zone->zone_name)) != NULL ||
3473	    (insert_label_hash &&
3474	    (ztmp = zone_find_all_by_label(zone->zone_slabel)) != NULL)) {
3475		zone_status_t status;
3476
3477		status = zone_status_get(ztmp);
3478		if (status == ZONE_IS_READY || status == ZONE_IS_RUNNING)
3479			error = EEXIST;
3480		else
3481			error = EBUSY;
3482
3483		if (insert_label_hash)
3484			error2 = ZE_LABELINUSE;
3485
3486		goto errout;
3487	}
3488
3489	/*
3490	 * Don't allow zone creations which would cause one zone's rootpath to
3491	 * be accessible from that of another (non-global) zone.
3492	 */
3493	if (zone_is_nested(zone->zone_rootpath)) {
3494		error = EBUSY;
3495		goto errout;
3496	}
3497
3498	ASSERT(zonecount != 0);		/* check for leaks */
3499	if (zonecount + 1 > maxzones) {
3500		error = ENOMEM;
3501		goto errout;
3502	}
3503
3504	if (zone_mount_count(zone->zone_rootpath) != 0) {
3505		error = EBUSY;
3506		error2 = ZE_AREMOUNTS;
3507		goto errout;
3508	}
3509
3510	/*
3511	 * Zone is still incomplete, but we need to drop all locks while
3512	 * zsched() initializes this zone's kernel process.  We
3513	 * optimistically add the zone to the hashtable and associated
3514	 * lists so a parallel zone_create() doesn't try to create the
3515	 * same zone.
3516	 */
3517	zonecount++;
3518	(void) mod_hash_insert(zonehashbyid,
3519	    (mod_hash_key_t)(uintptr_t)zone->zone_id,
3520	    (mod_hash_val_t)(uintptr_t)zone);
3521	str = kmem_alloc(strlen(zone->zone_name) + 1, KM_SLEEP);
3522	(void) strcpy(str, zone->zone_name);
3523	(void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)str,
3524	    (mod_hash_val_t)(uintptr_t)zone);
3525	if (insert_label_hash) {
3526		(void) mod_hash_insert(zonehashbylabel,
3527		    (mod_hash_key_t)zone->zone_slabel, (mod_hash_val_t)zone);
3528		zone->zone_flags |= ZF_HASHED_LABEL;
3529	}
3530
3531	/*
3532	 * Insert into active list.  At this point there are no 'hold's
3533	 * on the zone, but everyone else knows not to use it, so we can
3534	 * continue to use it.  zsched() will do a zone_hold() if the
3535	 * newproc() is successful.
3536	 */
3537	list_insert_tail(&zone_active, zone);
3538	mutex_exit(&zonehash_lock);
3539
3540	zarg.zone = zone;
3541	zarg.nvlist = rctls;
3542	/*
3543	 * The process, task, and project rctls are probably wrong;
3544	 * we need an interface to get the default values of all rctls,
3545	 * and initialize zsched appropriately.  I'm not sure that that
3546	 * makes much of a difference, though.
3547	 */
3548	if (error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL)) {
3549		/*
3550		 * We need to undo all globally visible state.
3551		 */
3552		mutex_enter(&zonehash_lock);
3553		list_remove(&zone_active, zone);
3554		if (zone->zone_flags & ZF_HASHED_LABEL) {
3555			ASSERT(zone->zone_slabel != NULL);
3556			(void) mod_hash_destroy(zonehashbylabel,
3557			    (mod_hash_key_t)zone->zone_slabel);
3558		}
3559		(void) mod_hash_destroy(zonehashbyname,
3560		    (mod_hash_key_t)(uintptr_t)zone->zone_name);
3561		(void) mod_hash_destroy(zonehashbyid,
3562		    (mod_hash_key_t)(uintptr_t)zone->zone_id);
3563		ASSERT(zonecount > 1);
3564		zonecount--;
3565		goto errout;
3566	}
3567
3568	/*
3569	 * Zone creation can't fail from now on.
3570	 */
3571
3572	/*
3573	 * Create zone kstats
3574	 */
3575	zone_kstat_create(zone);
3576
3577	/*
3578	 * Let the other lwps continue.
3579	 */
3580	mutex_enter(&pp->p_lock);
3581	if (curthread != pp->p_agenttp)
3582		continuelwps(pp);
3583	mutex_exit(&pp->p_lock);
3584
3585	/*
3586	 * Wait for zsched to finish initializing the zone.
3587	 */
3588	zone_status_wait(zone, ZONE_IS_READY);
3589	/*
3590	 * The zone is fully visible, so we can let mounts progress.
3591	 */
3592	resume_mounts();
3593	if (rctls)
3594		nvlist_free(rctls);
3595
3596	return (zoneid);
3597
3598errout:
3599	mutex_exit(&zonehash_lock);
3600	/*
3601	 * Let the other lwps continue.
3602	 */
3603	mutex_enter(&pp->p_lock);
3604	if (curthread != pp->p_agenttp)
3605		continuelwps(pp);
3606	mutex_exit(&pp->p_lock);
3607
3608	resume_mounts();
3609	if (rctls)
3610		nvlist_free(rctls);
3611	/*
3612	 * There is currently one reference to the zone, a cred_ref from
3613	 * zone_kcred.  To free the zone, we call crfree, which will call
3614	 * zone_cred_rele, which will call zone_free.
3615	 */
3616	ASSERT(zone->zone_cred_ref == 1);	/* for zone_kcred */
3617	ASSERT(zone->zone_kcred->cr_ref == 1);
3618	ASSERT(zone->zone_ref == 0);
3619	zkcr = zone->zone_kcred;
3620	zone->zone_kcred = NULL;
3621	crfree(zkcr);				/* triggers call to zone_free */
3622	return (zone_create_error(error, error2, extended_error));
3623}
3624
3625/*
3626 * Cause the zone to boot.  This is pretty simple, since we let zoneadmd do
3627 * the heavy lifting.  initname is the path to the program to launch
3628 * at the "top" of the zone; if this is NULL, we use the system default,
3629 * which is stored at zone_default_initname.
3630 */
3631static int
3632zone_boot(zoneid_t zoneid)
3633{
3634	int err;
3635	zone_t *zone;
3636
3637	if (secpolicy_zone_config(CRED()) != 0)
3638		return (set_errno(EPERM));
3639	if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
3640		return (set_errno(EINVAL));
3641
3642	mutex_enter(&zonehash_lock);
3643	/*
3644	 * Look for zone under hash lock to prevent races with calls to
3645	 * zone_shutdown, zone_destroy, etc.
3646	 */
3647	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
3648		mutex_exit(&zonehash_lock);
3649		return (set_errno(EINVAL));
3650	}
3651
3652	mutex_enter(&zone_status_lock);
3653	if (zone_status_get(zone) != ZONE_IS_READY) {
3654		mutex_exit(&zone_status_lock);
3655		mutex_exit(&zonehash_lock);
3656		return (set_errno(EINVAL));
3657	}
3658	zone_status_set(zone, ZONE_IS_BOOTING);
3659	mutex_exit(&zone_status_lock);
3660
3661	zone_hold(zone);	/* so we can use the zone_t later */
3662	mutex_exit(&zonehash_lock);
3663
3664	if (zone_status_wait_sig(zone, ZONE_IS_RUNNING) == 0) {
3665		zone_rele(zone);
3666		return (set_errno(EINTR));
3667	}
3668
3669	/*
3670	 * Boot (starting init) might have failed, in which case the zone
3671	 * will go to the SHUTTING_DOWN state; an appropriate errno will
3672	 * be placed in zone->zone_boot_err, and so we return that.
3673	 */
3674	err = zone->zone_boot_err;
3675	zone_rele(zone);
3676	return (err ? set_errno(err) : 0);
3677}
3678
3679/*
3680 * Kills all user processes in the zone, waiting for them all to exit
3681 * before returning.
3682 */
3683static int
3684zone_empty(zone_t *zone)
3685{
3686	int waitstatus;
3687
3688	/*
3689	 * We need to drop zonehash_lock before killing all
3690	 * processes, otherwise we'll deadlock with zone_find_*
3691	 * which can be called from the exit path.
3692	 */
3693	ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
3694	while ((waitstatus = zone_status_timedwait_sig(zone, lbolt + hz,
3695	    ZONE_IS_EMPTY)) == -1) {
3696		killall(zone->zone_id);
3697	}
3698	/*
3699	 * return EINTR if we were signaled
3700	 */
3701	if (waitstatus == 0)
3702		return (EINTR);
3703	return (0);
3704}
3705
3706/*
3707 * This function implements the policy for zone visibility.
3708 *
3709 * In standard Solaris, a non-global zone can only see itself.
3710 *
3711 * In Trusted Extensions, a labeled zone can lookup any zone whose label
3712 * it dominates. For this test, the label of the global zone is treated as
3713 * admin_high so it is special-cased instead of being checked for dominance.
3714 *
3715 * Returns true if zone attributes are viewable, false otherwise.
3716 */
3717static boolean_t
3718zone_list_access(zone_t *zone)
3719{
3720
3721	if (curproc->p_zone == global_zone ||
3722	    curproc->p_zone == zone) {
3723		return (B_TRUE);
3724	} else if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
3725		bslabel_t *curproc_label;
3726		bslabel_t *zone_label;
3727
3728		curproc_label = label2bslabel(curproc->p_zone->zone_slabel);
3729		zone_label = label2bslabel(zone->zone_slabel);
3730
3731		if (zone->zone_id != GLOBAL_ZONEID &&
3732		    bldominates(curproc_label, zone_label)) {
3733			return (B_TRUE);
3734		} else {
3735			return (B_FALSE);
3736		}
3737	} else {
3738		return (B_FALSE);
3739	}
3740}
3741
/*
 * Systemcall to start the zone's halt sequence.  By the time this
 * function successfully returns, all user processes and kernel threads
 * executing in it will have exited, ZSD shutdown callbacks executed,
 * and the zone status set to ZONE_IS_DOWN.
 *
 * It is possible that the call will interrupt itself if the caller is the
 * parent of any process running in the zone, and doesn't have SIGCHLD blocked.
 *
 * Returns 0 on success (including when the zone is already at or past
 * ZONE_IS_DOWN).  Otherwise the errno is set through set_errno():
 *	EPERM	the caller fails secpolicy_zone_config()
 *	EINVAL	'zoneid' is out of range or unknown, or the zone has not
 *		yet reached ZONE_IS_READY
 *	EINTR	block_mounts(), zone_empty(), pool_lock_intr() or the final
 *		wait for ZONE_IS_DOWN was interrupted
 */
static int
zone_shutdown(zoneid_t zoneid)
{
	int error;
	zone_t *zone;
	zone_status_t status;

	if (secpolicy_zone_config(CRED()) != 0)
		return (set_errno(EPERM));
	if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
		return (set_errno(EINVAL));

	/*
	 * Block mounts so that VFS_MOUNT() can get an accurate view of
	 * the zone's status with regards to ZONE_IS_SHUTTING_DOWN.
	 *
	 * e.g. NFS can fail the mount if it determines that the zone
	 * has already begun the shutdown sequence.
	 */
	if (block_mounts() == 0)
		return (set_errno(EINTR));
	mutex_enter(&zonehash_lock);
	/*
	 * Look for zone under hash lock to prevent races with other
	 * calls to zone_shutdown and zone_destroy.
	 */
	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
		mutex_exit(&zonehash_lock);
		resume_mounts();
		return (set_errno(EINVAL));
	}
	/* Lock order: zonehash_lock, then zone_status_lock. */
	mutex_enter(&zone_status_lock);
	status = zone_status_get(zone);
	/*
	 * Fail if the zone isn't fully initialized yet.
	 */
	if (status < ZONE_IS_READY) {
		mutex_exit(&zone_status_lock);
		mutex_exit(&zonehash_lock);
		resume_mounts();
		return (set_errno(EINVAL));
	}
	/*
	 * If conditions required for zone_shutdown() to return have been met,
	 * return success.
	 */
	if (status >= ZONE_IS_DOWN) {
		mutex_exit(&zone_status_lock);
		mutex_exit(&zonehash_lock);
		resume_mounts();
		return (0);
	}
	/*
	 * If zone_shutdown() hasn't been called before, go through the motions.
	 * If it has, there's nothing to do but wait for the kernel threads to
	 * drain.
	 */
	if (status < ZONE_IS_EMPTY) {
		uint_t ntasks;

		/* Sample the task count under zone_lock. */
		mutex_enter(&zone->zone_lock);
		if ((ntasks = zone->zone_ntasks) != 1) {
			/*
			 * There's still stuff running.
			 */
			zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
		}
		mutex_exit(&zone->zone_lock);
		if (ntasks == 1) {
			/*
			 * The only way to create another task is through
			 * zone_enter(), which will block until we drop
			 * zonehash_lock.  The zone is empty.
			 */
			if (zone->zone_kthreads == NULL) {
				/*
				 * Skip ahead to ZONE_IS_DOWN
				 */
				zone_status_set(zone, ZONE_IS_DOWN);
			} else {
				zone_status_set(zone, ZONE_IS_EMPTY);
			}
		}
	}
	zone_hold(zone);	/* so we can use the zone_t later */
	mutex_exit(&zone_status_lock);
	mutex_exit(&zonehash_lock);
	resume_mounts();

	/* Kill the zone's processes and wait for ZONE_IS_EMPTY. */
	if (error = zone_empty(zone)) {
		zone_rele(zone);
		return (set_errno(error));
	}
	/*
	 * After the zone status goes to ZONE_IS_DOWN this zone will no
	 * longer be notified of changes to the pools configuration, so
	 * in order to not end up with a stale pool pointer, we point
	 * ourselves at the default pool and remove all resource
	 * visibility.  This is especially important as the zone_t may
	 * languish on the deathrow for a very long time waiting for
	 * cred's to drain out.
	 *
	 * This rebinding of the zone can happen multiple times
	 * (presumably due to interrupted or parallel systemcalls)
	 * without any adverse effects.
	 */
	if (pool_lock_intr() != 0) {
		zone_rele(zone);
		return (set_errno(EINTR));
	}
	if (pool_state == POOL_ENABLED) {
		mutex_enter(&cpu_lock);
		zone_pool_set(zone, pool_default);
		/*
		 * The zone no longer needs to be able to see any cpus.
		 */
		zone_pset_set(zone, ZONE_PS_INVAL);
		mutex_exit(&cpu_lock);
	}
	pool_unlock();

	/*
	 * ZSD shutdown callbacks can be executed multiple times, hence
	 * it is safe to not be holding any locks across this call.
	 */
	zone_zsd_callbacks(zone, ZSD_SHUTDOWN);

	/*
	 * With no kernel threads left there is nothing to wait for, so
	 * advance the zone to ZONE_IS_DOWN ourselves.
	 */
	mutex_enter(&zone_status_lock);
	if (zone->zone_kthreads == NULL && zone_status_get(zone) < ZONE_IS_DOWN)
		zone_status_set(zone, ZONE_IS_DOWN);
	mutex_exit(&zone_status_lock);

	/*
	 * Wait for kernel threads to drain.
	 */
	if (!zone_status_wait_sig(zone, ZONE_IS_DOWN)) {
		zone_rele(zone);
		return (set_errno(EINTR));
	}

	/*
	 * The zone can become down/destroyable even if the above wait
	 * returns EINTR, so any code added here may never execute.
	 * (i.e. don't add code here)
	 */

	zone_rele(zone);
	return (0);
}
3900
/*
 * Systemcall entry point to finalize the zone halt process.  The caller
 * must have already successfully called zone_shutdown().
 *
 * Upon successful completion, the zone will have been fully destroyed:
 * zsched will have exited, destructor callbacks executed, and the zone
 * removed from the list of active zones.
 *
 * Returns 0 on success (including when another caller finished destroying
 * the zone first).  Otherwise the errno is set through set_errno():
 *	EPERM	the caller fails secpolicy_zone_config()
 *	EINVAL	'zoneid' is out of range or unknown
 *	EBUSY	file systems are still mounted beneath the zone's root, or
 *		the zone has not yet reached ZONE_IS_DOWN
 *	EINTR	the wait for outstanding references was interrupted
 */
static int
zone_destroy(zoneid_t zoneid)
{
	uint64_t uniqid;
	zone_t *zone;
	zone_status_t status;

	if (secpolicy_zone_config(CRED()) != 0)
		return (set_errno(EPERM));
	if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
		return (set_errno(EINVAL));

	mutex_enter(&zonehash_lock);
	/*
	 * Look for zone under hash lock to prevent races with other
	 * calls to zone_destroy.
	 */
	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
		mutex_exit(&zonehash_lock);
		return (set_errno(EINVAL));
	}

	/* Refuse to destroy a zone that still has mounts under its root. */
	if (zone_mount_count(zone->zone_rootpath) != 0) {
		mutex_exit(&zonehash_lock);
		return (set_errno(EBUSY));
	}
	mutex_enter(&zone_status_lock);
	status = zone_status_get(zone);
	if (status < ZONE_IS_DOWN) {
		mutex_exit(&zone_status_lock);
		mutex_exit(&zonehash_lock);
		return (set_errno(EBUSY));
	} else if (status == ZONE_IS_DOWN) {
		zone_status_set(zone, ZONE_IS_DYING); /* Tell zsched to exit */
	}
	mutex_exit(&zone_status_lock);
	/* Keep the zone_t valid while we wait without the hash lock. */
	zone_hold(zone);
	mutex_exit(&zonehash_lock);

	/*
	 * wait for zsched to exit
	 */
	zone_status_wait(zone, ZONE_IS_DEAD);
	zone_zsd_callbacks(zone, ZSD_DESTROY);
	zone->zone_netstack = NULL;
	/*
	 * Remember the unique id so that, after dropping our hold, we can
	 * tell this zone apart from a new one that reuses the same zoneid.
	 */
	uniqid = zone->zone_uniqid;
	zone_rele(zone);
	zone = NULL;	/* potentially free'd */

	mutex_enter(&zonehash_lock);
	for (; /* ever */; ) {
		boolean_t unref;

		if ((zone = zone_find_all_by_id(zoneid)) == NULL ||
		    zone->zone_uniqid != uniqid) {
			/*
			 * The zone has gone away.  Necessary conditions
			 * are met, so we return success.
			 */
			mutex_exit(&zonehash_lock);
			return (0);
		}
		mutex_enter(&zone->zone_lock);
		unref = ZONE_IS_UNREF(zone);
		mutex_exit(&zone->zone_lock);
		if (unref) {
			/*
			 * There is only one reference to the zone -- that
			 * added when the zone was added to the hashtables --
			 * and things will remain this way until we drop
			 * zonehash_lock... we can go ahead and cleanup the
			 * zone.
			 */
			break;
		}

		/*
		 * Other references remain; sleep on zone_destroy_cv and
		 * re-check from the top when woken.
		 */
		if (cv_wait_sig(&zone_destroy_cv, &zonehash_lock) == 0) {
			/* Signaled */
			mutex_exit(&zonehash_lock);
			return (set_errno(EINTR));
		}

	}

	/*
	 * Remove CPU cap for this zone now since we're not going to
	 * fail below this point.
	 */
	cpucaps_zone_remove(zone);

	/* Get rid of the zone's kstats */
	zone_kstat_delete(zone);

	/* Say goodbye to brand framework. */
	brand_unregister_zone(zone->zone_brand);

	/*
	 * It is now safe to let the zone be recreated; remove it from the
	 * lists.  The memory will not be freed until the last cred
	 * reference goes away.
	 */
	ASSERT(zonecount > 1);	/* must be > 1; can't destroy global zone */
	zonecount--;
	/* remove from active list and hash tables */
	list_remove(&zone_active, zone);
	(void) mod_hash_destroy(zonehashbyname,
	    (mod_hash_key_t)zone->zone_name);
	(void) mod_hash_destroy(zonehashbyid,
	    (mod_hash_key_t)(uintptr_t)zone->zone_id);
	if (zone->zone_flags & ZF_HASHED_LABEL)
		(void) mod_hash_destroy(zonehashbylabel,
		    (mod_hash_key_t)zone->zone_slabel);
	mutex_exit(&zonehash_lock);

	/*
	 * Release the root vnode; we're not using it anymore.  Nor should any
	 * other thread that might access it exist.
	 */
	if (zone->zone_rootvp != NULL) {
		VN_RELE(zone->zone_rootvp);
		zone->zone_rootvp = NULL;
	}

	/* add to deathrow list */
	mutex_enter(&zone_deathrow_lock);
	list_insert_tail(&zone_deathrow, zone);
	mutex_exit(&zone_deathrow_lock);

	/*
	 * Drop last reference (which was added by zsched()), this will
	 * free the zone unless there are outstanding cred references.
	 */
	zone_rele(zone);
	return (0);
}
4044
4045/*
4046 * Systemcall entry point for zone_getattr(2).
4047 */
4048static ssize_t
4049zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
4050{
4051	size_t size;
4052	int error = 0, err;
4053	zone_t *zone;
4054	char *zonepath;
4055	char *outstr;
4056	zone_status_t zone_status;
4057	pid_t initpid;
4058	boolean_t global = (curzone == global_zone);
4059	boolean_t inzone = (curzone->zone_id == zoneid);
4060	ushort_t flags;
4061
4062	mutex_enter(&zonehash_lock);
4063	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
4064		mutex_exit(&zonehash_lock);
4065		return (set_errno(EINVAL));
4066	}
4067	zone_status = zone_status_get(zone);
4068	if (zone_status < ZONE_IS_READY) {
4069		mutex_exit(&zonehash_lock);
4070		return (set_errno(EINVAL));
4071	}
4072	zone_hold(zone);
4073	mutex_exit(&zonehash_lock);
4074
4075	/*
4076	 * If not in the global zone, don't show information about other zones,
4077	 * unless the system is labeled and the local zone's label dominates
4078	 * the other zone.
4079	 */
4080	if (!zone_list_access(zone)) {
4081		zone_rele(zone);
4082		return (set_errno(EINVAL));
4083	}
4084
4085	switch (attr) {
4086	case ZONE_ATTR_ROOT:
4087		if (global) {
4088			/*
4089			 * Copy the path to trim the trailing "/" (except for
4090			 * the global zone).
4091			 */
4092			if (zone != global_zone)
4093				size = zone->zone_rootpathlen - 1;
4094			else
4095				size = zone->zone_rootpathlen;
4096			zonepath = kmem_alloc(size, KM_SLEEP);
4097			bcopy(zone->zone_rootpath, zonepath, size);
4098			zonepath[size - 1] = '\0';
4099		} else {
4100			if (inzone || !is_system_labeled()) {
4101				/*
4102				 * Caller is not in the global zone.
4103				 * if the query is on the current zone
4104				 * or the system is not labeled,
4105				 * just return faked-up path for current zone.
4106				 */
4107				zonepath = "/";
4108				size = 2;
4109			} else {
4110				/*
4111				 * Return related path for current zone.
4112				 */
4113				int prefix_len = strlen(zone_prefix);
4114				int zname_len = strlen(zone->zone_name);
4115
4116				size = prefix_len + zname_len + 1;
4117				zonepath = kmem_alloc(size, KM_SLEEP);
4118				bcopy(zone_prefix, zonepath, prefix_len);
4119				bcopy(zone->zone_name, zonepath +
4120				    prefix_len, zname_len);
4121				zonepath[size - 1] = '\0';
4122			}
4123		}
4124		if (bufsize > size)
4125			bufsize = size;
4126		if (buf != NULL) {
4127			err = copyoutstr(zonepath, buf, bufsize, NULL);
4128			if (err != 0 && err != ENAMETOOLONG)
4129				error = EFAULT;
4130		}
4131		if (global || (is_system_labeled() && !inzone))
4132			kmem_free(zonepath, size);
4133		break;
4134
4135	case ZONE_ATTR_NAME:
4136		size = strlen(zone->zone_name) + 1;
4137		if (bufsize > size)
4138			bufsize = size;
4139		if (buf != NULL) {
4140			err = copyoutstr(zone->zone_name, buf, bufsize, NULL);
4141			if (err != 0 && err != ENAMETOOLONG)
4142				error = EFAULT;
4143		}
4144		break;
4145
4146	case ZONE_ATTR_STATUS:
4147		/*
4148		 * Since we're not holding zonehash_lock, the zone status
4149		 * may be anything; leave it up to userland to sort it out.
4150		 */
4151		size = sizeof (zone_status);
4152		if (bufsize > size)
4153			bufsize = size;
4154		zone_status = zone_status_get(zone);
4155		if (buf != NULL &&
4156		    copyout(&zone_status, buf, bufsize) != 0)
4157			error = EFAULT;
4158		break;
4159	case ZONE_ATTR_FLAGS:
4160		size = sizeof (zone->zone_flags);
4161		if (bufsize > size)
4162			bufsize = size;
4163		flags = zone->zone_flags;
4164		if (buf != NULL &&
4165		    copyout(&flags, buf, bufsize) != 0)
4166			error = EFAULT;
4167		break;
4168	case ZONE_ATTR_PRIVSET:
4169		size = sizeof (priv_set_t);
4170		if (bufsize > size)
4171			bufsize = size;
4172		if (buf != NULL &&
4173		    copyout(zone->zone_privset, buf, bufsize) != 0)
4174			error = EFAULT;
4175		break;
4176	case ZONE_ATTR_UNIQID:
4177		size = sizeof (zone->zone_uniqid);
4178		if (bufsize > size)
4179			bufsize = size;
4180		if (buf != NULL &&
4181		    copyout(&zone->zone_uniqid, buf, bufsize) != 0)
4182			error = EFAULT;
4183		break;
4184	case ZONE_ATTR_POOLID:
4185		{
4186			pool_t *pool;
4187			poolid_t poolid;
4188
4189			if (pool_lock_intr() != 0) {
4190				error = EINTR;
4191				break;
4192			}
4193			pool = zone_pool_get(zone);
4194			poolid = pool->pool_id;
4195			pool_unlock();
4196			size = sizeof (poolid);
4197			if (bufsize > size)
4198				bufsize = size;
4199			if (buf != NULL && copyout(&poolid, buf, size) != 0)
4200				error = EFAULT;
4201		}
4202		break;
4203	case ZONE_ATTR_SLBL:
4204		size = sizeof (bslabel_t);
4205		if (bufsize > size)
4206			bufsize = size;
4207		if (zone->zone_slabel == NULL)
4208			error = EINVAL;
4209		else if (buf != NULL &&
4210		    copyout(label2bslabel(zone->zone_slabel), buf,
4211		    bufsize) != 0)
4212			error = EFAULT;
4213		break;
4214	case ZONE_ATTR_INITPID:
4215		size = sizeof (initpid);
4216		if (bufsize > size)
4217			bufsize = size;
4218		initpid = zone->zone_proc_initpid;
4219		if (initpid == -1) {
4220			error = ESRCH;
4221			break;
4222		}
4223		if (buf != NULL &&
4224		    copyout(&initpid, buf, bufsize) != 0)
4225			error = EFAULT;
4226		break;
4227	case ZONE_ATTR_BRAND:
4228		size = strlen(zone->zone_brand->b_name) + 1;
4229
4230		if (bufsize > size)
4231			bufsize = size;
4232		if (buf != NULL) {
4233			err = copyoutstr(zone->zone_brand->b_name, buf,
4234			    bufsize, NULL);
4235			if (err != 0 && err != ENAMETOOLONG)
4236				error = EFAULT;
4237		}
4238		break;
4239	case ZONE_ATTR_INITNAME:
4240		size = strlen(zone->zone_initname) + 1;
4241		if (bufsize > size)
4242			bufsize = size;
4243		if (buf != NULL) {
4244			err = copyoutstr(zone->zone_initname, buf, bufsize,
4245			    NULL);
4246			if (err != 0 && err != ENAMETOOLONG)
4247				error = EFAULT;
4248		}
4249		break;
4250	case ZONE_ATTR_BOOTARGS:
4251		if (zone->zone_bootargs == NULL)
4252			outstr = "";
4253		else
4254			outstr = zone->zone_bootargs;
4255		size = strlen(outstr) + 1;
4256		if (bufsize > size)
4257			bufsize = size;
4258		if (buf != NULL) {
4259			err = copyoutstr(outstr, buf, bufsize, NULL);
4260			if (err != 0 && err != ENAMETOOLONG)
4261				error = EFAULT;
4262		}
4263		break;
4264	case ZONE_ATTR_PHYS_MCAP:
4265		size = sizeof (zone->zone_phys_mcap);
4266		if (bufsize > size)
4267			bufsize = size;
4268		if (buf != NULL &&
4269		    copyout(&zone->zone_phys_mcap, buf, bufsize) != 0)
4270			error = EFAULT;
4271		break;
4272	case ZONE_ATTR_SCHED_CLASS:
4273		mutex_enter(&class_lock);
4274
4275		if (zone->zone_defaultcid >= loaded_classes)
4276			outstr = "";
4277		else
4278			outstr = sclass[zone->zone_defaultcid].cl_name;
4279		size = strlen(outstr) + 1;
4280		if (bufsize > size)
4281			bufsize = size;
4282		if (buf != NULL) {
4283			err = copyoutstr(outstr, buf, bufsize, NULL);
4284			if (err != 0 && err != ENAMETOOLONG)
4285				error = EFAULT;
4286		}
4287
4288		mutex_exit(&class_lock);
4289		break;
4290	default:
4291		if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) {
4292			size = bufsize;
4293			error = ZBROP(zone)->b_getattr(zone, attr, buf, &size);
4294		} else {
4295			error = EINVAL;
4296		}
4297	}
4298	zone_rele(zone);
4299
4300	if (error)
4301		return (set_errno(error));
4302	return ((ssize_t)size);
4303}
4304
4305/*
4306 * Systemcall entry point for zone_setattr(2).
4307 */
4308/*ARGSUSED*/
4309static int
4310zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
4311{
4312	zone_t *zone;
4313	zone_status_t zone_status;
4314	int err;
4315
4316	if (secpolicy_zone_config(CRED()) != 0)
4317		return (set_errno(EPERM));
4318
4319	/*
4320	 * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the
4321	 * global zone.
4322	 */
4323	if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) {
4324		return (set_errno(EINVAL));
4325	}
4326
4327	mutex_enter(&zonehash_lock);
4328	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
4329		mutex_exit(&zonehash_lock);
4330		return (set_errno(EINVAL));
4331	}
4332	zone_hold(zone);
4333	mutex_exit(&zonehash_lock);
4334
4335	/*
4336	 * At present most attributes can only be set on non-running,
4337	 * non-global zones.
4338	 */
4339	zone_status = zone_status_get(zone);
4340	if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY)
4341		goto done;
4342
4343	switch (attr) {
4344	case ZONE_ATTR_INITNAME:
4345		err = zone_set_initname(zone, (const char *)buf);
4346		break;
4347	case ZONE_ATTR_BOOTARGS:
4348		err = zone_set_bootargs(zone, (const char *)buf);
4349		break;
4350	case ZONE_ATTR_BRAND:
4351		err = zone_set_brand(zone, (const char *)buf);
4352		break;
4353	case ZONE_ATTR_PHYS_MCAP:
4354		err = zone_set_phys_mcap(zone, (const uint64_t *)buf);
4355		break;
4356	case ZONE_ATTR_SCHED_CLASS:
4357		err = zone_set_sched_class(zone, (const char *)buf);
4358		break;
4359	default:
4360		if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone))
4361			err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize);
4362		else
4363			err = EINVAL;
4364	}
4365
4366done:
4367	zone_rele(zone);
4368	return (err != 0 ? set_errno(err) : 0);
4369}
4370
4371/*
4372 * Return zero if the process has at least one vnode mapped in to its
4373 * address space which shouldn't be allowed to change zones.
4374 *
4375 * Also return zero if the process has any shared mappings which reserve
4376 * swap.  This is because the counting for zone.max-swap does not allow swap
4377 * revervation to be shared between zones.  zone swap reservation is counted
4378 * on zone->zone_max_swap.
4379 */
4380static int
4381as_can_change_zones(void)
4382{
4383	proc_t *pp = curproc;
4384	struct seg *seg;
4385	struct as *as = pp->p_as;
4386	vnode_t *vp;
4387	int allow = 1;
4388
4389	ASSERT(pp->p_as != &kas);
4390	AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
4391	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
4392
4393		/*
4394		 * Cannot enter zone with shared anon memory which
4395		 * reserves swap.  See comment above.
4396		 */
4397		if (seg_can_change_zones(seg) == B_FALSE) {
4398			allow = 0;
4399			break;
4400		}
4401		/*
4402		 * if we can't get a backing vnode for this segment then skip
4403		 * it.
4404		 */
4405		vp = NULL;
4406		if (SEGOP_GETVP(seg, seg->s_base, &vp) != 0 || vp == NULL)
4407			continue;
4408		if (!vn_can_change_zones(vp)) { /* bail on first match */
4409			allow = 0;
4410			break;
4411		}
4412	}
4413	AS_LOCK_EXIT(as, &as->a_lock);
4414	return (allow);
4415}
4416
4417/*
4418 * Count swap reserved by curproc's address space
4419 */
4420static size_t
4421as_swresv(void)
4422{
4423	proc_t *pp = curproc;
4424	struct seg *seg;
4425	struct as *as = pp->p_as;
4426	size_t swap = 0;
4427
4428	ASSERT(pp->p_as != &kas);
4429	ASSERT(AS_WRITE_HELD(as, &as->a_lock));
4430	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg))
4431		swap += seg_swresv(seg);
4432
4433	return (swap);
4434}
4435
4436/*
4437 * Systemcall entry point for zone_enter().
4438 *
4439 * The current process is injected into said zone.  In the process
4440 * it will change its project membership, privileges, rootdir/cwd,
4441 * zone-wide rctls, and pool association to match those of the zone.
4442 *
4443 * The first zone_enter() called while the zone is in the ZONE_IS_READY
4444 * state will transition it to ZONE_IS_RUNNING.  Processes may only
4445 * enter a zone that is "ready" or "running".
4446 */
4447static int
4448zone_enter(zoneid_t zoneid)
4449{
4450	zone_t *zone;
4451	vnode_t *vp;
4452	proc_t *pp = curproc;
4453	contract_t *ct;
4454	cont_process_t *ctp;
4455	task_t *tk, *oldtk;
4456	kproject_t *zone_proj0;
4457	cred_t *cr, *newcr;
4458	pool_t *oldpool, *newpool;
4459	sess_t *sp;
4460	uid_t uid;
4461	zone_status_t status;
4462	int err = 0;
4463	rctl_entity_p_t e;
4464	size_t swap;
4465	kthread_id_t t;
4466
4467	if (secpolicy_zone_config(CRED()) != 0)
4468		return (set_errno(EPERM));
4469	if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
4470		return (set_errno(EINVAL));
4471
4472	/*
4473	 * Stop all lwps so we don't need to hold a lock to look at
4474	 * curproc->p_zone.  This needs to happen before we grab any
4475	 * locks to avoid deadlock (another lwp in the process could
4476	 * be waiting for the held lock).
4477	 */
4478	if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK))
4479		return (set_errno(EINTR));
4480
4481	/*
4482	 * Make sure we're not changing zones with files open or mapped in
4483	 * to our address space which shouldn't be changing zones.
4484	 */
4485	if (!files_can_change_zones()) {
4486		err = EBADF;
4487		goto out;
4488	}
4489	if (!as_can_change_zones()) {
4490		err = EFAULT;
4491		goto out;
4492	}
4493
4494	mutex_enter(&zonehash_lock);
4495	if (pp->p_zone != global_zone) {
4496		mutex_exit(&zonehash_lock);
4497		err = EINVAL;
4498		goto out;
4499	}
4500
4501	zone = zone_find_all_by_id(zoneid);
4502	if (zone == NULL) {
4503		mutex_exit(&zonehash_lock);
4504		err = EINVAL;
4505		goto out;
4506	}
4507
4508	/*
4509	 * To prevent processes in a zone from holding contracts on
4510	 * extrazonal resources, and to avoid process contract
4511	 * memberships which span zones, contract holders and processes
4512	 * which aren't the sole members of their encapsulating process
4513	 * contracts are not allowed to zone_enter.
4514	 */
4515	ctp = pp->p_ct_process;
4516	ct = &ctp->conp_contract;
4517	mutex_enter(&ct->ct_lock);
4518	mutex_enter(&pp->p_lock);
4519	if ((avl_numnodes(&pp->p_ct_held) != 0) || (ctp->conp_nmembers != 1)) {
4520		mutex_exit(&pp->p_lock);
4521		mutex_exit(&ct->ct_lock);
4522		mutex_exit(&zonehash_lock);
4523		err = EINVAL;
4524		goto out;
4525	}
4526
4527	/*
4528	 * Moreover, we don't allow processes whose encapsulating
4529	 * process contracts have inherited extrazonal contracts.
4530	 * While it would be easier to eliminate all process contracts
4531	 * with inherited contracts, we need to be able to give a
4532	 * restarted init (or other zone-penetrating process) its
4533	 * predecessor's contracts.
4534	 */
4535	if (ctp->conp_ninherited != 0) {
4536		contract_t *next;
4537		for (next = list_head(&ctp->conp_inherited); next;
4538		    next = list_next(&ctp->conp_inherited, next)) {
4539			if (contract_getzuniqid(next) != zone->zone_uniqid) {
4540				mutex_exit(&pp->p_lock);
4541				mutex_exit(&ct->ct_lock);
4542				mutex_exit(&zonehash_lock);
4543				err = EINVAL;
4544				goto out;
4545			}
4546		}
4547	}
4548	mutex_exit(&pp->p_lock);
4549	mutex_exit(&ct->ct_lock);
4550
4551	status = zone_status_get(zone);
4552	if (status < ZONE_IS_READY || status >= ZONE_IS_SHUTTING_DOWN) {
4553		/*
4554		 * Can't join
4555		 */
4556		mutex_exit(&zonehash_lock);
4557		err = EINVAL;
4558		goto out;
4559	}
4560
4561	/*
4562	 * Make sure new priv set is within the permitted set for caller
4563	 */
4564	if (!priv_issubset(zone->zone_privset, &CR_OPPRIV(CRED()))) {
4565		mutex_exit(&zonehash_lock);
4566		err = EPERM;
4567		goto out;
4568	}
4569	/*
4570	 * We want to momentarily drop zonehash_lock while we optimistically
4571	 * bind curproc to the pool it should be running in.  This is safe
4572	 * since the zone can't disappear (we have a hold on it).
4573	 */
4574	zone_hold(zone);
4575	mutex_exit(&zonehash_lock);
4576
4577	/*
4578	 * Grab pool_lock to keep the pools configuration from changing
4579	 * and to stop ourselves from getting rebound to another pool
4580	 * until we join the zone.
4581	 */
4582	if (pool_lock_intr() != 0) {
4583		zone_rele(zone);
4584		err = EINTR;
4585		goto out;
4586	}
4587	ASSERT(secpolicy_pool(CRED()) == 0);
4588	/*
4589	 * Bind ourselves to the pool currently associated with the zone.
4590	 */
4591	oldpool = curproc->p_pool;
4592	newpool = zone_pool_get(zone);
4593	if (pool_state == POOL_ENABLED && newpool != oldpool &&
4594	    (err = pool_do_bind(newpool, P_PID, P_MYID,
4595	    POOL_BIND_ALL)) != 0) {
4596		pool_unlock();
4597		zone_rele(zone);
4598		goto out;
4599	}
4600
4601	/*
4602	 * Grab cpu_lock now; we'll need it later when we call
4603	 * task_join().
4604	 */
4605	mutex_enter(&cpu_lock);
4606	mutex_enter(&zonehash_lock);
4607	/*
4608	 * Make sure the zone hasn't moved on since we dropped zonehash_lock.
4609	 */
4610	if (zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN) {
4611		/*
4612		 * Can't join anymore.
4613		 */
4614		mutex_exit(&zonehash_lock);
4615		mutex_exit(&cpu_lock);
4616		if (pool_state == POOL_ENABLED &&
4617		    newpool != oldpool)
4618			(void) pool_do_bind(oldpool, P_PID, P_MYID,
4619			    POOL_BIND_ALL);
4620		pool_unlock();
4621		zone_rele(zone);
4622		err = EINVAL;
4623		goto out;
4624	}
4625
4626	/*
4627	 * a_lock must be held while transfering locked memory and swap
4628	 * reservation from the global zone to the non global zone because
4629	 * asynchronous faults on the processes' address space can lock
4630	 * memory and reserve swap via MCL_FUTURE and MAP_NORESERVE
4631	 * segments respectively.
4632	 */
4633	AS_LOCK_ENTER(pp->as, &pp->p_as->a_lock, RW_WRITER);
4634	swap = as_swresv();
4635	mutex_enter(&pp->p_lock);
4636	zone_proj0 = zone->zone_zsched->p_task->tk_proj;
4637	/* verify that we do not exceed and task or lwp limits */
4638	mutex_enter(&zone->zone_nlwps_lock);
4639	/* add new lwps to zone and zone's proj0 */
4640	zone_proj0->kpj_nlwps += pp->p_lwpcnt;
4641	zone->zone_nlwps += pp->p_lwpcnt;
4642	/* add 1 task to zone's proj0 */
4643	zone_proj0->kpj_ntasks += 1;
4644	mutex_exit(&zone->zone_nlwps_lock);
4645
4646	mutex_enter(&zone->zone_mem_lock);
4647	zone->zone_locked_mem += pp->p_locked_mem;
4648	zone_proj0->kpj_data.kpd_locked_mem += pp->p_locked_mem;
4649	zone->zone_max_swap += swap;
4650	mutex_exit(&zone->zone_mem_lock);
4651
4652	mutex_enter(&(zone_proj0->kpj_data.kpd_crypto_lock));
4653	zone_proj0->kpj_data.kpd_crypto_mem += pp->p_crypto_mem;
4654	mutex_exit(&(zone_proj0->kpj_data.kpd_crypto_lock));
4655
4656	/* remove lwps from proc's old zone and old project */
4657	mutex_enter(&pp->p_zone->zone_nlwps_lock);
4658	pp->p_zone->zone_nlwps -= pp->p_lwpcnt;
4659	pp->p_task->tk_proj->kpj_nlwps -= pp->p_lwpcnt;
4660	mutex_exit(&pp->p_zone->zone_nlwps_lock);
4661
4662	mutex_enter(&pp->p_zone->zone_mem_lock);
4663	pp->p_zone->zone_locked_mem -= pp->p_locked_mem;
4664	pp->p_task->tk_proj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
4665	pp->p_zone->zone_max_swap -= swap;
4666	mutex_exit(&pp->p_zone->zone_mem_lock);
4667
4668	mutex_enter(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
4669	pp->p_task->tk_proj->kpj_data.kpd_crypto_mem -= pp->p_crypto_mem;
4670	mutex_exit(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
4671
4672	mutex_exit(&pp->p_lock);
4673	AS_LOCK_EXIT(pp->p_as, &pp->p_as->a_lock);
4674
4675	/*
4676	 * Joining the zone cannot fail from now on.
4677	 *
4678	 * This means that a lot of the following code can be commonized and
4679	 * shared with zsched().
4680	 */
4681
4682	/*
4683	 * Reset the encapsulating process contract's zone.
4684	 */
4685	ASSERT(ct->ct_mzuniqid == GLOBAL_ZONEUNIQID);
4686	contract_setzuniqid(ct, zone->zone_uniqid);
4687
4688	/*
4689	 * Create a new task and associate the process with the project keyed
4690	 * by (projid,zoneid).
4691	 *
4692	 * We might as well be in project 0; the global zone's projid doesn't
4693	 * make much sense in a zone anyhow.
4694	 *
4695	 * This also increments zone_ntasks, and returns with p_lock held.
4696	 */
4697	tk = task_create(0, zone);
4698	oldtk = task_join(tk, 0);
4699	mutex_exit(&cpu_lock);
4700
4701	pp->p_flag |= SZONETOP;
4702	pp->p_zone = zone;
4703
4704	/*
4705	 * call RCTLOP_SET functions on this proc
4706	 */
4707	e.rcep_p.zone = zone;
4708	e.rcep_t = RCENTITY_ZONE;
4709	(void) rctl_set_dup(NULL, NULL, pp, &e, zone->zone_rctls, NULL,
4710	    RCD_CALLBACK);
4711	mutex_exit(&pp->p_lock);
4712
4713	/*
4714	 * We don't need to hold any of zsched's locks here; not only do we know
4715	 * the process and zone aren't going away, we know its session isn't
4716	 * changing either.
4717	 *
4718	 * By joining zsched's session here, we mimic the behavior in the
4719	 * global zone of init's sid being the pid of sched.  We extend this
4720	 * to all zlogin-like zone_enter()'ing processes as well.
4721	 */
4722	mutex_enter(&pidlock);
4723	sp = zone->zone_zsched->p_sessp;
4724	sess_hold(zone->zone_zsched);
4725	mutex_enter(&pp->p_lock);
4726	pgexit(pp);
4727	sess_rele(pp->p_sessp, B_TRUE);
4728	pp->p_sessp = sp;
4729	pgjoin(pp, zone->zone_zsched->p_pidp);
4730
4731	/*
4732	 * If any threads are scheduled to be placed on zone wait queue they
4733	 * should abandon the idea since the wait queue is changing.
4734	 * We need to be holding pidlock & p_lock to do this.
4735	 */
4736	if ((t = pp->p_tlist) != NULL) {
4737		do {
4738			thread_lock(t);
4739			/*
4740			 * Kick this thread so that he doesn't sit
4741			 * on a wrong wait queue.
4742			 */
4743			if (ISWAITING(t))
4744				setrun_locked(t);
4745
4746			if (t->t_schedflag & TS_ANYWAITQ)
4747				t->t_schedflag &= ~ TS_ANYWAITQ;
4748
4749			thread_unlock(t);
4750		} while ((t = t->t_forw) != pp->p_tlist);
4751	}
4752
4753	/*
4754	 * If there is a default scheduling class for the zone and it is not
4755	 * the class we are currently in, change all of the threads in the
4756	 * process to the new class.  We need to be holding pidlock & p_lock
4757	 * when we call parmsset so this is a good place to do it.
4758	 */
4759	if (zone->zone_defaultcid > 0 &&
4760	    zone->zone_defaultcid != curthread->t_cid) {
4761		pcparms_t pcparms;
4762
4763		pcparms.pc_cid = zone->zone_defaultcid;
4764		pcparms.pc_clparms[0] = 0;
4765
4766		/*
4767		 * If setting the class fails, we still want to enter the zone.
4768		 */
4769		if ((t = pp->p_tlist) != NULL) {
4770			do {
4771				(void) parmsset(&pcparms, t);
4772			} while ((t = t->t_forw) != pp->p_tlist);
4773		}
4774	}
4775
4776	mutex_exit(&pp->p_lock);
4777	mutex_exit(&pidlock);
4778
4779	mutex_exit(&zonehash_lock);
4780	/*
4781	 * We're firmly in the zone; let pools progress.
4782	 */
4783	pool_unlock();
4784	task_rele(oldtk);
4785	/*
4786	 * We don't need to retain a hold on the zone since we already
4787	 * incremented zone_ntasks, so the zone isn't going anywhere.
4788	 */
4789	zone_rele(zone);
4790
4791	/*
4792	 * Chroot
4793	 */
4794	vp = zone->zone_rootvp;
4795	zone_chdir(vp, &PTOU(pp)->u_cdir, pp);
4796	zone_chdir(vp, &PTOU(pp)->u_rdir, pp);
4797
4798	/*
4799	 * Change process credentials
4800	 */
4801	newcr = cralloc();
4802	mutex_enter(&pp->p_crlock);
4803	cr = pp->p_cred;
4804	crcopy_to(cr, newcr);
4805	crsetzone(newcr, zone);
4806	pp->p_cred = newcr;
4807
4808	/*
4809	 * Restrict all process privilege sets to zone limit
4810	 */
4811	priv_intersect(zone->zone_privset, &CR_PPRIV(newcr));
4812	priv_intersect(zone->zone_privset, &CR_EPRIV(newcr));
4813	priv_intersect(zone->zone_privset, &CR_IPRIV(newcr));
4814	priv_intersect(zone->zone_privset, &CR_LPRIV(newcr));
4815	mutex_exit(&pp->p_crlock);
4816	crset(pp, newcr);
4817
4818	/*
4819	 * Adjust upcount to reflect zone entry.
4820	 */
4821	uid = crgetruid(newcr);
4822	mutex_enter(&pidlock);
4823	upcount_dec(uid, GLOBAL_ZONEID);
4824	upcount_inc(uid, zoneid);
4825	mutex_exit(&pidlock);
4826
4827	/*
4828	 * Set up core file path and content.
4829	 */
4830	set_core_defaults();
4831
4832out:
4833	/*
4834	 * Let the other lwps continue.
4835	 */
4836	mutex_enter(&pp->p_lock);
4837	if (curthread != pp->p_agenttp)
4838		continuelwps(pp);
4839	mutex_exit(&pp->p_lock);
4840
4841	return (err != 0 ? set_errno(err) : 0);
4842}
4843
4844/*
4845 * Systemcall entry point for zone_list(2).
4846 *
4847 * Processes running in a (non-global) zone only see themselves.
4848 * On labeled systems, they see all zones whose label they dominate.
4849 */
4850static int
4851zone_list(zoneid_t *zoneidlist, uint_t *numzones)
4852{
4853	zoneid_t *zoneids;
4854	zone_t *zone, *myzone;
4855	uint_t user_nzones, real_nzones;
4856	uint_t domi_nzones;
4857	int error;
4858
4859	if (copyin(numzones, &user_nzones, sizeof (uint_t)) != 0)
4860		return (set_errno(EFAULT));
4861
4862	myzone = curproc->p_zone;
4863	if (myzone != global_zone) {
4864		bslabel_t *mybslab;
4865
4866		if (!is_system_labeled()) {
4867			/* just return current zone */
4868			real_nzones = domi_nzones = 1;
4869			zoneids = kmem_alloc(sizeof (zoneid_t), KM_SLEEP);
4870			zoneids[0] = myzone->zone_id;
4871		} else {
4872			/* return all zones that are dominated */
4873			mutex_enter(&zonehash_lock);
4874			real_nzones = zonecount;
4875			domi_nzones = 0;
4876			if (real_nzones > 0) {
4877				zoneids = kmem_alloc(real_nzones *
4878				    sizeof (zoneid_t), KM_SLEEP);
4879				mybslab = label2bslabel(myzone->zone_slabel);
4880				for (zone = list_head(&zone_active);
4881				    zone != NULL;
4882				    zone = list_next(&zone_active, zone)) {
4883					if (zone->zone_id == GLOBAL_ZONEID)
4884						continue;
4885					if (zone != myzone &&
4886					    (zone->zone_flags & ZF_IS_SCRATCH))
4887						continue;
4888					/*
4889					 * Note that a label always dominates
4890					 * itself, so myzone is always included
4891					 * in the list.
4892					 */
4893					if (bldominates(mybslab,
4894					    label2bslabel(zone->zone_slabel))) {
4895						zoneids[domi_nzones++] =
4896						    zone->zone_id;
4897					}
4898				}
4899			}
4900			mutex_exit(&zonehash_lock);
4901		}
4902	} else {
4903		mutex_enter(&zonehash_lock);
4904		real_nzones = zonecount;
4905		domi_nzones = 0;
4906		if (real_nzones > 0) {
4907			zoneids = kmem_alloc(real_nzones * sizeof (zoneid_t),
4908			    KM_SLEEP);
4909			for (zone = list_head(&zone_active); zone != NULL;
4910			    zone = list_next(&zone_active, zone))
4911				zoneids[domi_nzones++] = zone->zone_id;
4912			ASSERT(domi_nzones == real_nzones);
4913		}
4914		mutex_exit(&zonehash_lock);
4915	}
4916
4917	/*
4918	 * If user has allocated space for fewer entries than we found, then
4919	 * return only up to his limit.  Either way, tell him exactly how many
4920	 * we found.
4921	 */
4922	if (domi_nzones < user_nzones)
4923		user_nzones = domi_nzones;
4924	error = 0;
4925	if (copyout(&domi_nzones, numzones, sizeof (uint_t)) != 0) {
4926		error = EFAULT;
4927	} else if (zoneidlist != NULL && user_nzones != 0) {
4928		if (copyout(zoneids, zoneidlist,
4929		    user_nzones * sizeof (zoneid_t)) != 0)
4930			error = EFAULT;
4931	}
4932
4933	if (real_nzones > 0)
4934		kmem_free(zoneids, real_nzones * sizeof (zoneid_t));
4935
4936	if (error != 0)
4937		return (set_errno(error));
4938	else
4939		return (0);
4940}
4941
4942/*
4943 * Systemcall entry point for zone_lookup(2).
4944 *
4945 * Non-global zones are only able to see themselves and (on labeled systems)
4946 * the zones they dominate.
4947 */
4948static zoneid_t
4949zone_lookup(const char *zone_name)
4950{
4951	char *kname;
4952	zone_t *zone;
4953	zoneid_t zoneid;
4954	int err;
4955
4956	if (zone_name == NULL) {
4957		/* return caller's zone id */
4958		return (getzoneid());
4959	}
4960
4961	kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
4962	if ((err = copyinstr(zone_name, kname, ZONENAME_MAX, NULL)) != 0) {
4963		kmem_free(kname, ZONENAME_MAX);
4964		return (set_errno(err));
4965	}
4966
4967	mutex_enter(&zonehash_lock);
4968	zone = zone_find_all_by_name(kname);
4969	kmem_free(kname, ZONENAME_MAX);
4970	/*
4971	 * In a non-global zone, can only lookup global and own name.
4972	 * In Trusted Extensions zone label dominance rules apply.
4973	 */
4974	if (zone == NULL ||
4975	    zone_status_get(zone) < ZONE_IS_READY ||
4976	    !zone_list_access(zone)) {
4977		mutex_exit(&zonehash_lock);
4978		return (set_errno(EINVAL));
4979	} else {
4980		zoneid = zone->zone_id;
4981		mutex_exit(&zonehash_lock);
4982		return (zoneid);
4983	}
4984}
4985
4986static int
4987zone_version(int *version_arg)
4988{
4989	int version = ZONE_SYSCALL_API_VERSION;
4990
4991	if (copyout(&version, version_arg, sizeof (int)) != 0)
4992		return (set_errno(EFAULT));
4993	return (0);
4994}
4995
4996/* ARGSUSED */
4997long
4998zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
4999{
5000	zone_def zs;
5001
5002	switch (cmd) {
5003	case ZONE_CREATE:
5004		if (get_udatamodel() == DATAMODEL_NATIVE) {
5005			if (copyin(arg1, &zs, sizeof (zone_def))) {
5006				return (set_errno(EFAULT));
5007			}
5008		} else {
5009#ifdef _SYSCALL32_IMPL
5010			zone_def32 zs32;
5011
5012			if (copyin(arg1, &zs32, sizeof (zone_def32))) {
5013				return (set_errno(EFAULT));
5014			}
5015			zs.zone_name =
5016			    (const char *)(unsigned long)zs32.zone_name;
5017			zs.zone_root =
5018			    (const char *)(unsigned long)zs32.zone_root;
5019			zs.zone_privs =
5020			    (const struct priv_set *)
5021			    (unsigned long)zs32.zone_privs;
5022			zs.zone_privssz = zs32.zone_privssz;
5023			zs.rctlbuf = (caddr_t)(unsigned long)zs32.rctlbuf;
5024			zs.rctlbufsz = zs32.rctlbufsz;
5025			zs.zfsbuf = (caddr_t)(unsigned long)zs32.zfsbuf;
5026			zs.zfsbufsz = zs32.zfsbufsz;
5027			zs.extended_error =
5028			    (int *)(unsigned long)zs32.extended_error;
5029			zs.match = zs32.match;
5030			zs.doi = zs32.doi;
5031			zs.label = (const bslabel_t *)(uintptr_t)zs32.label;
5032			zs.flags = zs32.flags;
5033#else
5034			panic("get_udatamodel() returned bogus result\n");
5035#endif
5036		}
5037
5038		return (zone_create(zs.zone_name, zs.zone_root,
5039		    zs.zone_privs, zs.zone_privssz,
5040		    (caddr_t)zs.rctlbuf, zs.rctlbufsz,
5041		    (caddr_t)zs.zfsbuf, zs.zfsbufsz,
5042		    zs.extended_error, zs.match, zs.doi,
5043		    zs.label, zs.flags));
5044	case ZONE_BOOT:
5045		return (zone_boot((zoneid_t)(uintptr_t)arg1));
5046	case ZONE_DESTROY:
5047		return (zone_destroy((zoneid_t)(uintptr_t)arg1));
5048	case ZONE_GETATTR:
5049		return (zone_getattr((zoneid_t)(uintptr_t)arg1,
5050		    (int)(uintptr_t)arg2, arg3, (size_t)arg4));
5051	case ZONE_SETATTR:
5052		return (zone_setattr((zoneid_t)(uintptr_t)arg1,
5053		    (int)(uintptr_t)arg2, arg3, (size_t)arg4));
5054	case ZONE_ENTER:
5055		return (zone_enter((zoneid_t)(uintptr_t)arg1));
5056	case ZONE_LIST:
5057		return (zone_list((zoneid_t *)arg1, (uint_t *)arg2));
5058	case ZONE_SHUTDOWN:
5059		return (zone_shutdown((zoneid_t)(uintptr_t)arg1));
5060	case ZONE_LOOKUP:
5061		return (zone_lookup((const char *)arg1));
5062	case ZONE_VERSION:
5063		return (zone_version((int *)arg1));
5064	case ZONE_ADD_DATALINK:
5065		return (zone_add_datalink((zoneid_t)(uintptr_t)arg1,
5066		    (char *)arg2));
5067	case ZONE_DEL_DATALINK:
5068		return (zone_remove_datalink((zoneid_t)(uintptr_t)arg1,
5069		    (char *)arg2));
5070	case ZONE_CHECK_DATALINK:
5071		return (zone_check_datalink((zoneid_t *)arg1, (char *)arg2));
5072	case ZONE_LIST_DATALINK:
5073		return (zone_list_datalink((zoneid_t)(uintptr_t)arg1,
5074		    (int *)arg2, (char *)arg3));
5075	default:
5076		return (set_errno(EINVAL));
5077	}
5078}
5079
5080struct zarg {
5081	zone_t *zone;
5082	zone_cmd_arg_t arg;
5083};
5084
5085static int
5086zone_lookup_door(const char *zone_name, door_handle_t *doorp)
5087{
5088	char *buf;
5089	size_t buflen;
5090	int error;
5091
5092	buflen = sizeof (ZONE_DOOR_PATH) + strlen(zone_name);
5093	buf = kmem_alloc(buflen, KM_SLEEP);
5094	(void) snprintf(buf, buflen, ZONE_DOOR_PATH, zone_name);
5095	error = door_ki_open(buf, doorp);
5096	kmem_free(buf, buflen);
5097	return (error);
5098}
5099
5100static void
5101zone_release_door(door_handle_t *doorp)
5102{
5103	door_ki_rele(*doorp);
5104	*doorp = NULL;
5105}
5106
5107static void
5108zone_ki_call_zoneadmd(struct zarg *zargp)
5109{
5110	door_handle_t door = NULL;
5111	door_arg_t darg, save_arg;
5112	char *zone_name;
5113	size_t zone_namelen;
5114	zoneid_t zoneid;
5115	zone_t *zone;
5116	zone_cmd_arg_t arg;
5117	uint64_t uniqid;
5118	size_t size;
5119	int error;
5120	int retry;
5121
5122	zone = zargp->zone;
5123	arg = zargp->arg;
5124	kmem_free(zargp, sizeof (*zargp));
5125
5126	zone_namelen = strlen(zone->zone_name) + 1;
5127	zone_name = kmem_alloc(zone_namelen, KM_SLEEP);
5128	bcopy(zone->zone_name, zone_name, zone_namelen);
5129	zoneid = zone->zone_id;
5130	uniqid = zone->zone_uniqid;
5131	/*
5132	 * zoneadmd may be down, but at least we can empty out the zone.
5133	 * We can ignore the return value of zone_empty() since we're called
5134	 * from a kernel thread and know we won't be delivered any signals.
5135	 */
5136	ASSERT(curproc == &p0);
5137	(void) zone_empty(zone);
5138	ASSERT(zone_status_get(zone) >= ZONE_IS_EMPTY);
5139	zone_rele(zone);
5140
5141	size = sizeof (arg);
5142	darg.rbuf = (char *)&arg;
5143	darg.data_ptr = (char *)&arg;
5144	darg.rsize = size;
5145	darg.data_size = size;
5146	darg.desc_ptr = NULL;
5147	darg.desc_num = 0;
5148
5149	save_arg = darg;
5150	/*
5151	 * Since we're not holding a reference to the zone, any number of
5152	 * things can go wrong, including the zone disappearing before we get a
5153	 * chance to talk to zoneadmd.
5154	 */
5155	for (retry = 0; /* forever */; retry++) {
5156		if (door == NULL &&
5157		    (error = zone_lookup_door(zone_name, &door)) != 0) {
5158			goto next;
5159		}
5160		ASSERT(door != NULL);
5161
5162		if ((error = door_ki_upcall(door, &darg)) == 0) {
5163			break;
5164		}
5165		switch (error) {
5166		case EINTR:
5167			/* FALLTHROUGH */
5168		case EAGAIN:	/* process may be forking */
5169			/*
5170			 * Back off for a bit
5171			 */
5172			break;
5173		case EBADF:
5174			zone_release_door(&door);
5175			if (zone_lookup_door(zone_name, &door) != 0) {
5176				/*
5177				 * zoneadmd may be dead, but it may come back to
5178				 * life later.
5179				 */
5180				break;
5181			}
5182			break;
5183		default:
5184			cmn_err(CE_WARN,
5185			    "zone_ki_call_zoneadmd: door_ki_upcall error %d\n",
5186			    error);
5187			goto out;
5188		}
5189next:
5190		/*
5191		 * If this isn't the same zone_t that we originally had in mind,
5192		 * then this is the same as if two kadmin requests come in at
5193		 * the same time: the first one wins.  This means we lose, so we
5194		 * bail.
5195		 */
5196		if ((zone = zone_find_by_id(zoneid)) == NULL) {
5197			/*
5198			 * Problem is solved.
5199			 */
5200			break;
5201		}
5202		if (zone->zone_uniqid != uniqid) {
5203			/*
5204			 * zoneid recycled
5205			 */
5206			zone_rele(zone);
5207			break;
5208		}
5209		/*
5210		 * We could zone_status_timedwait(), but there doesn't seem to
5211		 * be much point in doing that (plus, it would mean that
5212		 * zone_free() isn't called until this thread exits).
5213		 */
5214		zone_rele(zone);
5215		delay(hz);
5216		darg = save_arg;
5217	}
5218out:
5219	if (door != NULL) {
5220		zone_release_door(&door);
5221	}
5222	kmem_free(zone_name, zone_namelen);
5223	thread_exit();
5224}
5225
5226/*
5227 * Entry point for uadmin() to tell the zone to go away or reboot.  Analog to
5228 * kadmin().  The caller is a process in the zone.
5229 *
5230 * In order to shutdown the zone, we will hand off control to zoneadmd
5231 * (running in the global zone) via a door.  We do a half-hearted job at
5232 * killing all processes in the zone, create a kernel thread to contact
5233 * zoneadmd, and make note of the "uniqid" of the zone.  The uniqid is
5234 * a form of generation number used to let zoneadmd (as well as
5235 * zone_destroy()) know exactly which zone they're talking about.
5236 */
5237int
5238zone_kadmin(int cmd, int fcn, const char *mdep, cred_t *credp)
5239{
5240	struct zarg *zargp;
5241	zone_cmd_t zcmd;
5242	zone_t *zone;
5243
5244	zone = curproc->p_zone;
5245	ASSERT(getzoneid() != GLOBAL_ZONEID);
5246
5247	switch (cmd) {
5248	case A_SHUTDOWN:
5249		switch (fcn) {
5250		case AD_HALT:
5251		case AD_POWEROFF:
5252			zcmd = Z_HALT;
5253			break;
5254		case AD_BOOT:
5255			zcmd = Z_REBOOT;
5256			break;
5257		case AD_IBOOT:
5258		case AD_SBOOT:
5259		case AD_SIBOOT:
5260		case AD_NOSYNC:
5261			return (ENOTSUP);
5262		default:
5263			return (EINVAL);
5264		}
5265		break;
5266	case A_REBOOT:
5267		zcmd = Z_REBOOT;
5268		break;
5269	case A_FTRACE:
5270	case A_REMOUNT:
5271	case A_FREEZE:
5272	case A_DUMP:
5273		return (ENOTSUP);
5274	default:
5275		ASSERT(cmd != A_SWAPCTL);	/* handled by uadmin() */
5276		return (EINVAL);
5277	}
5278
5279	if (secpolicy_zone_admin(credp, B_FALSE))
5280		return (EPERM);
5281	mutex_enter(&zone_status_lock);
5282
5283	/*
5284	 * zone_status can't be ZONE_IS_EMPTY or higher since curproc
5285	 * is in the zone.
5286	 */
5287	ASSERT(zone_status_get(zone) < ZONE_IS_EMPTY);
5288	if (zone_status_get(zone) > ZONE_IS_RUNNING) {
5289		/*
5290		 * This zone is already on its way down.
5291		 */
5292		mutex_exit(&zone_status_lock);
5293		return (0);
5294	}
5295	/*
5296	 * Prevent future zone_enter()s
5297	 */
5298	zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
5299	mutex_exit(&zone_status_lock);
5300
5301	/*
5302	 * Kill everyone now and call zoneadmd later.
5303	 * zone_ki_call_zoneadmd() will do a more thorough job of this
5304	 * later.
5305	 */
5306	killall(zone->zone_id);
5307	/*
5308	 * Now, create the thread to contact zoneadmd and do the rest of the
5309	 * work.  This thread can't be created in our zone otherwise
5310	 * zone_destroy() would deadlock.
5311	 */
5312	zargp = kmem_zalloc(sizeof (*zargp), KM_SLEEP);
5313	zargp->arg.cmd = zcmd;
5314	zargp->arg.uniqid = zone->zone_uniqid;
5315	zargp->zone = zone;
5316	(void) strcpy(zargp->arg.locale, "C");
5317	/* mdep was already copied in for us by uadmin */
5318	if (mdep != NULL)
5319		(void) strlcpy(zargp->arg.bootbuf, mdep,
5320		    sizeof (zargp->arg.bootbuf));
5321	zone_hold(zone);
5322
5323	(void) thread_create(NULL, 0, zone_ki_call_zoneadmd, zargp, 0, &p0,
5324	    TS_RUN, minclsyspri);
5325	exit(CLD_EXITED, 0);
5326
5327	return (EINVAL);
5328}
5329
5330/*
5331 * Entry point so kadmin(A_SHUTDOWN, ...) can set the global zone's
5332 * status to ZONE_IS_SHUTTING_DOWN.
5333 */
5334void
5335zone_shutdown_global(void)
5336{
5337	ASSERT(curproc->p_zone == global_zone);
5338
5339	mutex_enter(&zone_status_lock);
5340	ASSERT(zone_status_get(global_zone) == ZONE_IS_RUNNING);
5341	zone_status_set(global_zone, ZONE_IS_SHUTTING_DOWN);
5342	mutex_exit(&zone_status_lock);
5343}
5344
5345/*
5346 * Returns true if the named dataset is visible in the current zone.
5347 * The 'write' parameter is set to 1 if the dataset is also writable.
5348 */
5349int
5350zone_dataset_visible(const char *dataset, int *write)
5351{
5352	zone_dataset_t *zd;
5353	size_t len;
5354	zone_t *zone = curproc->p_zone;
5355
5356	if (dataset[0] == '\0')
5357		return (0);
5358
5359	/*
5360	 * Walk the list once, looking for datasets which match exactly, or
5361	 * specify a dataset underneath an exported dataset.  If found, return
5362	 * true and note that it is writable.
5363	 */
5364	for (zd = list_head(&zone->zone_datasets); zd != NULL;
5365	    zd = list_next(&zone->zone_datasets, zd)) {
5366
5367		len = strlen(zd->zd_dataset);
5368		if (strlen(dataset) >= len &&
5369		    bcmp(dataset, zd->zd_dataset, len) == 0 &&
5370		    (dataset[len] == '\0' || dataset[len] == '/' ||
5371		    dataset[len] == '@')) {
5372			if (write)
5373				*write = 1;
5374			return (1);
5375		}
5376	}
5377
5378	/*
5379	 * Walk the list a second time, searching for datasets which are parents
5380	 * of exported datasets.  These should be visible, but read-only.
5381	 *
5382	 * Note that we also have to support forms such as 'pool/dataset/', with
5383	 * a trailing slash.
5384	 */
5385	for (zd = list_head(&zone->zone_datasets); zd != NULL;
5386	    zd = list_next(&zone->zone_datasets, zd)) {
5387
5388		len = strlen(dataset);
5389		if (dataset[len - 1] == '/')
5390			len--;	/* Ignore trailing slash */
5391		if (len < strlen(zd->zd_dataset) &&
5392		    bcmp(dataset, zd->zd_dataset, len) == 0 &&
5393		    zd->zd_dataset[len] == '/') {
5394			if (write)
5395				*write = 0;
5396			return (1);
5397		}
5398	}
5399
5400	return (0);
5401}
5402
5403/*
5404 * zone_find_by_any_path() -
5405 *
5406 * kernel-private routine similar to zone_find_by_path(), but which
5407 * effectively compares against zone paths rather than zonerootpath
5408 * (i.e., the last component of zonerootpaths, which should be "root/",
5409 * are not compared.)  This is done in order to accurately identify all
5410 * paths, whether zone-visible or not, including those which are parallel
5411 * to /root/, such as /dev/, /home/, etc...
5412 *
5413 * If the specified path does not fall under any zone path then global
5414 * zone is returned.
5415 *
5416 * The treat_abs parameter indicates whether the path should be treated as
5417 * an absolute path although it does not begin with "/".  (This supports
5418 * nfs mount syntax such as host:any/path.)
5419 *
5420 * The caller is responsible for zone_rele of the returned zone.
5421 */
5422zone_t *
5423zone_find_by_any_path(const char *path, boolean_t treat_abs)
5424{
5425	zone_t *zone;
5426	int path_offset = 0;
5427
5428	if (path == NULL) {
5429		zone_hold(global_zone);
5430		return (global_zone);
5431	}
5432
5433	if (*path != '/') {
5434		ASSERT(treat_abs);
5435		path_offset = 1;
5436	}
5437
5438	mutex_enter(&zonehash_lock);
5439	for (zone = list_head(&zone_active); zone != NULL;
5440	    zone = list_next(&zone_active, zone)) {
5441		char	*c;
5442		size_t	pathlen;
5443		char *rootpath_start;
5444
5445		if (zone == global_zone)	/* skip global zone */
5446			continue;
5447
5448		/* scan backwards to find start of last component */
5449		c = zone->zone_rootpath + zone->zone_rootpathlen - 2;
5450		do {
5451			c--;
5452		} while (*c != '/');
5453
5454		pathlen = c - zone->zone_rootpath + 1 - path_offset;
5455		rootpath_start = (zone->zone_rootpath + path_offset);
5456		if (strncmp(path, rootpath_start, pathlen) == 0)
5457			break;
5458	}
5459	if (zone == NULL)
5460		zone = global_zone;
5461	zone_hold(zone);
5462	mutex_exit(&zonehash_lock);
5463	return (zone);
5464}
5465
5466/* List of data link names which are accessible from the zone */
5467struct dlnamelist {
5468	char			dlnl_name[LIFNAMSIZ];
5469	struct dlnamelist	*dlnl_next;
5470};
5471
5472
5473/*
5474 * Check whether the datalink name (dlname) itself is present.
5475 * Return true if found.
5476 */
5477static boolean_t
5478zone_dlname(zone_t *zone, char *dlname)
5479{
5480	struct dlnamelist *dlnl;
5481	boolean_t found = B_FALSE;
5482
5483	mutex_enter(&zone->zone_lock);
5484	for (dlnl = zone->zone_dl_list; dlnl != NULL; dlnl = dlnl->dlnl_next) {
5485		if (strncmp(dlnl->dlnl_name, dlname, LIFNAMSIZ) == 0) {
5486			found = B_TRUE;
5487			break;
5488		}
5489	}
5490	mutex_exit(&zone->zone_lock);
5491	return (found);
5492}
5493
5494/*
5495 * Add an data link name for the zone. Does not check for duplicates.
5496 */
5497static int
5498zone_add_datalink(zoneid_t zoneid, char *dlname)
5499{
5500	struct dlnamelist *dlnl;
5501	zone_t *zone;
5502	zone_t *thiszone;
5503	int err;
5504
5505	dlnl = kmem_zalloc(sizeof (struct dlnamelist), KM_SLEEP);
5506	if ((err = copyinstr(dlname, dlnl->dlnl_name, LIFNAMSIZ, NULL)) != 0) {
5507		kmem_free(dlnl, sizeof (struct dlnamelist));
5508		return (set_errno(err));
5509	}
5510
5511	thiszone = zone_find_by_id(zoneid);
5512	if (thiszone == NULL) {
5513		kmem_free(dlnl, sizeof (struct dlnamelist));
5514		return (set_errno(ENXIO));
5515	}
5516
5517	/*
5518	 * Verify that the datalink name isn't already used by a different
5519	 * zone while allowing duplicate entries for the same zone (e.g. due
5520	 * to both using IPv4 and IPv6 on an interface)
5521	 */
5522	mutex_enter(&zonehash_lock);
5523	for (zone = list_head(&zone_active); zone != NULL;
5524	    zone = list_next(&zone_active, zone)) {
5525		if (zone->zone_id == zoneid)
5526			continue;
5527
5528		if (zone_dlname(zone, dlnl->dlnl_name)) {
5529			mutex_exit(&zonehash_lock);
5530			zone_rele(thiszone);
5531			kmem_free(dlnl, sizeof (struct dlnamelist));
5532			return (set_errno(EPERM));
5533		}
5534	}
5535	mutex_enter(&thiszone->zone_lock);
5536	dlnl->dlnl_next = thiszone->zone_dl_list;
5537	thiszone->zone_dl_list = dlnl;
5538	mutex_exit(&thiszone->zone_lock);
5539	mutex_exit(&zonehash_lock);
5540	zone_rele(thiszone);
5541	return (0);
5542}
5543
5544static int
5545zone_remove_datalink(zoneid_t zoneid, char *dlname)
5546{
5547	struct dlnamelist *dlnl, *odlnl, **dlnlp;
5548	zone_t *zone;
5549	int err;
5550
5551	dlnl = kmem_zalloc(sizeof (struct dlnamelist), KM_SLEEP);
5552	if ((err = copyinstr(dlname, dlnl->dlnl_name, LIFNAMSIZ, NULL)) != 0) {
5553		kmem_free(dlnl, sizeof (struct dlnamelist));
5554		return (set_errno(err));
5555	}
5556	zone = zone_find_by_id(zoneid);
5557	if (zone == NULL) {
5558		kmem_free(dlnl, sizeof (struct dlnamelist));
5559		return (set_errno(EINVAL));
5560	}
5561
5562	mutex_enter(&zone->zone_lock);
5563	/* Look for match */
5564	dlnlp = &zone->zone_dl_list;
5565	while (*dlnlp != NULL) {
5566		if (strncmp(dlnl->dlnl_name, (*dlnlp)->dlnl_name,
5567		    LIFNAMSIZ) == 0)
5568			goto found;
5569		dlnlp = &((*dlnlp)->dlnl_next);
5570	}
5571	mutex_exit(&zone->zone_lock);
5572	zone_rele(zone);
5573	kmem_free(dlnl, sizeof (struct dlnamelist));
5574	return (set_errno(ENXIO));
5575
5576found:
5577	odlnl = *dlnlp;
5578	*dlnlp = (*dlnlp)->dlnl_next;
5579	kmem_free(odlnl, sizeof (struct dlnamelist));
5580
5581	mutex_exit(&zone->zone_lock);
5582	zone_rele(zone);
5583	kmem_free(dlnl, sizeof (struct dlnamelist));
5584	return (0);
5585}
5586
5587/*
5588 * Using the zoneidp as ALL_ZONES, we can lookup which zone is using datalink
5589 * name (dlname); otherwise we just check if the specified zoneidp has access
5590 * to the datalink name.
5591 */
5592static int
5593zone_check_datalink(zoneid_t *zoneidp, char *dlname)
5594{
5595	zoneid_t id;
5596	char *dln;
5597	zone_t *zone;
5598	int err = 0;
5599	boolean_t allzones = B_FALSE;
5600
5601	if (copyin(zoneidp, &id, sizeof (id)) != 0) {
5602		return (set_errno(EFAULT));
5603	}
5604	dln = kmem_zalloc(LIFNAMSIZ, KM_SLEEP);
5605	if ((err = copyinstr(dlname, dln, LIFNAMSIZ, NULL)) != 0) {
5606		kmem_free(dln, LIFNAMSIZ);
5607		return (set_errno(err));
5608	}
5609
5610	if (id == ALL_ZONES)
5611		allzones = B_TRUE;
5612
5613	/*
5614	 * Check whether datalink name is already used.
5615	 */
5616	mutex_enter(&zonehash_lock);
5617	for (zone = list_head(&zone_active); zone != NULL;
5618	    zone = list_next(&zone_active, zone)) {
5619		if (allzones || (id == zone->zone_id)) {
5620			if (!zone_dlname(zone, dln))
5621				continue;
5622			if (allzones)
5623				err = copyout(&zone->zone_id, zoneidp,
5624				    sizeof (*zoneidp));
5625
5626			mutex_exit(&zonehash_lock);
5627			kmem_free(dln, LIFNAMSIZ);
5628			return (err ? set_errno(EFAULT) : 0);
5629		}
5630	}
5631
5632	/* datalink name is not found in any active zone. */
5633	mutex_exit(&zonehash_lock);
5634	kmem_free(dln, LIFNAMSIZ);
5635	return (set_errno(ENXIO));
5636}
5637
5638/*
5639 * Get the names of the datalinks assigned to a zone.
5640 * Here *nump is the number of datalinks, and the assumption
5641 * is that the caller will gurantee that the the supplied buffer is
5642 * big enough to hold at least #*nump datalink names, that is,
5643 * LIFNAMSIZ X *nump
5644 * On return, *nump will be the "new" number of datalinks, if it
5645 * ever changed.
5646 */
5647static int
5648zone_list_datalink(zoneid_t zoneid, int *nump, char *buf)
5649{
5650	int num, dlcount;
5651	zone_t *zone;
5652	struct dlnamelist *dlnl;
5653	char *ptr;
5654
5655	if (copyin(nump, &dlcount, sizeof (dlcount)) != 0)
5656		return (set_errno(EFAULT));
5657
5658	zone = zone_find_by_id(zoneid);
5659	if (zone == NULL) {
5660		return (set_errno(ENXIO));
5661	}
5662
5663	num = 0;
5664	mutex_enter(&zone->zone_lock);
5665	ptr = buf;
5666	for (dlnl = zone->zone_dl_list; dlnl != NULL; dlnl = dlnl->dlnl_next) {
5667		/*
5668		 * If the list changed and the new number is bigger
5669		 * than what the caller supplied, just count, don't
5670		 * do copyout
5671		 */
5672		if (++num > dlcount)
5673			continue;
5674		if (copyout(dlnl->dlnl_name, ptr, LIFNAMSIZ) != 0) {
5675			mutex_exit(&zone->zone_lock);
5676			zone_rele(zone);
5677			return (set_errno(EFAULT));
5678		}
5679		ptr += LIFNAMSIZ;
5680	}
5681	mutex_exit(&zone->zone_lock);
5682	zone_rele(zone);
5683
5684	/* Increased or decreased, caller should be notified. */
5685	if (num != dlcount) {
5686		if (copyout(&num, nump, sizeof (num)) != 0) {
5687			return (set_errno(EFAULT));
5688		}
5689	}
5690	return (0);
5691}
5692
5693/*
5694 * Public interface for looking up a zone by zoneid. It's a customized version
5695 * for netstack_zone_create(), it:
5696 * 1. Doesn't acquire the zonehash_lock, since it is called from
5697 *    zone_key_create() or zone_zsd_configure(), lock already held.
5698 * 2. Doesn't check the status of the zone.
5699 * 3. It will be called even before zone_init is called, in that case the
5700 *    address of zone0 is returned directly, and netstack_zone_create()
5701 *    will only assign a value to zone0.zone_netstack, won't break anything.
5702 */
5703zone_t *
5704zone_find_by_id_nolock(zoneid_t zoneid)
5705{
5706	ASSERT(MUTEX_HELD(&zonehash_lock));
5707
5708	if (zonehashbyid == NULL)
5709		return (&zone0);
5710	else
5711		return (zone_find_all_by_id(zoneid));
5712}
5713