zone.c revision 4842:002ad7ab90df
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * Zones
 *
 *   A zone is a named collection of processes, namespace constraints,
 *   and other system resources which comprise a secure and manageable
 *   application containment facility.
 *
 *   Zones (represented by the reference counted zone_t) are tracked in
 *   the kernel in the zonehash.  Elsewhere in the kernel, Zone IDs
 *   (zoneid_t) are used to track zone association.  Zone IDs are
 *   dynamically generated when the zone is created; if a persistent
 *   identifier is needed (core files, accounting logs, audit trail,
 *   etc.), the zone name should be used.
 *
 *
 *   Global Zone:
 *
 *   The global zone (zoneid 0) is automatically associated with all
 *   system resources that have not been bound to a user-created zone.
 *   This means that even systems where zones are not in active use
 *   have a global zone, and all processes, mounts, etc. are
 *   associated with that zone.  The global zone is generally
 *   unconstrained in terms of privileges and access, though the usual
 *   credential and privilege based restrictions apply.
 *
 *
 *   Zone States:
 *
 *   The states in which a zone may be, and the transitions between them,
 *   are as follows:
 *
 *   ZONE_IS_UNINITIALIZED: primordial state for a zone. The partially
 *   initialized zone is added to the list of active zones on the system but
 *   isn't accessible.
 *
 *   ZONE_IS_READY: zsched (the kernel dummy process for a zone) is
 *   ready.  The zone is made visible after the ZSD constructor callbacks are
 *   executed.  A zone remains in this state until it transitions into
 *   the ZONE_IS_BOOTING state as a result of a call to zone_boot().
 *
 *   ZONE_IS_BOOTING: in this short-lived state, zsched attempts to start
 *   init.  Should that fail, the zone proceeds to the ZONE_IS_SHUTTING_DOWN
 *   state.
 *
 *   ZONE_IS_RUNNING: The zone is open for business: zsched has
 *   successfully started init.   A zone remains in this state until
 *   zone_shutdown() is called.
 *
 *   ZONE_IS_SHUTTING_DOWN: zone_shutdown() has been called, the system is
 *   killing all processes running in the zone. The zone remains
 *   in this state until there are no more user processes running in the zone.
 *   zone_create(), zone_enter(), and zone_destroy() on this zone will fail.
 *   Since zone_shutdown() is restartable, it may be called successfully
 *   multiple times for the same zone_t.  Setting of the zone's state to
 *   ZONE_IS_SHUTTING_DOWN is synchronized with mounts, so VOP_MOUNT() may check
 *   the zone's status without worrying about it being a moving target.
 *
 *   ZONE_IS_EMPTY: zone_shutdown() has been called, and there
 *   are no more user processes in the zone.  The zone remains in this
 *   state until there are no more kernel threads associated with the
 *   zone.  zone_create(), zone_enter(), and zone_destroy() on this zone will
 *   fail.
 *
 *   ZONE_IS_DOWN: All kernel threads doing work on behalf of the zone
 *   have exited.  zone_shutdown() returns.  Henceforth it is not possible to
 *   join the zone or create kernel threads therein.
 *
 *   ZONE_IS_DYING: zone_destroy() has been called on the zone; the zone
 *   remains in this state until zsched exits.  Calls to zone_find_by_*()
 *   return NULL from now on.
 *
 *   ZONE_IS_DEAD: zsched has exited (zone_ntasks == 0).  There are no
 *   processes or threads doing work on behalf of the zone.  The zone is
 *   removed from the list of active zones.  zone_destroy() returns, and
 *   the zone can be recreated.
 *
 *   ZONE_IS_FREE (internal state): zone_ref goes to 0, ZSD destructor
 *   callbacks are executed, and all memory associated with the zone is
 *   freed.
 *
 *   Threads can wait for the zone to enter a requested state by using
 *   zone_status_wait() or zone_status_timedwait() with the desired
 *   state passed in as an argument.  Zone state transitions are
 *   uni-directional; it is not possible to move back to an earlier state.
 *
 *
 *   Zone-Specific Data:
 *
 *   Subsystems needing to maintain zone-specific data can store that
 *   data using the ZSD mechanism.  This provides a zone-specific data
 *   store, similar to thread-specific data (see pthread_getspecific(3C)
 *   or the TSD code in uts/common/disp/thread.c).  Also, ZSD can be used
 *   to register callbacks to be invoked when a zone is created, shut
 *   down, or destroyed.  This can be used to initialize zone-specific
 *   data for new zones and to clean up when zones go away.
 *
 *
 *   Data Structures:
 *
 *   The per-zone structure (zone_t) is reference counted, and freed
 *   when all references are released.  zone_hold and zone_rele can be
 *   used to adjust the reference count.  In addition, reference counts
 *   associated with the cred_t structure are tracked separately using
 *   zone_cred_hold and zone_cred_rele.
 *
 *   Pointers to active zone_t's are stored in two hash tables; one
 *   for searching by id, the other for searching by name.  Lookups
 *   can be performed on either basis, using zone_find_by_id and
 *   zone_find_by_name.  Both return zone_t pointers with the zone
 *   held, so zone_rele should be called when the pointer is no longer
 *   needed.  Zones can also be searched by path; zone_find_by_path
 *   returns the zone with which a path name is associated (global
 *   zone if the path is not within some other zone's file system
 *   hierarchy).  This currently requires iterating through each zone,
 *   so it is slower than an id or name search via a hash table.
 *
 *
 *   Locking:
 *
 *   zonehash_lock: This is a top-level global lock used to protect the
 *       zone hash tables and lists.  Zones cannot be created or destroyed
 *       while this lock is held.
 *   zone_status_lock: This is a global lock protecting zone state.
 *       Zones cannot change state while this lock is held.  It also
 *       protects the list of kernel threads associated with a zone.
 *   zone_lock: This is a per-zone lock used to protect several fields of
 *       the zone_t (see <sys/zone.h> for details).  In addition, holding
 *       this lock means that the zone cannot go away.
 *   zone_nlwps_lock: This is a per-zone lock used to protect the fields
 *	 related to the zone.max-lwps rctl.
 *   zone_mem_lock: This is a per-zone lock used to protect the fields
 *	 related to the zone.max-locked-memory and zone.max-swap rctls.
 *   zsd_key_lock: This is a global lock protecting the key state for ZSD.
 *   zone_deathrow_lock: This is a global lock protecting the "deathrow"
 *       list (a list of zones in the ZONE_IS_DEAD state).
 *
 *   Ordering requirements:
 *       pool_lock --> cpu_lock --> zonehash_lock --> zone_status_lock -->
 *       	zone_lock --> zsd_key_lock --> pidlock --> p_lock
 *
 *   When taking zone_mem_lock or zone_nlwps_lock, the lock ordering is:
 *	zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_mem_lock
 *	zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_nlwps_lock
 *
 *   Blocking memory allocations are permitted while holding any of the
 *   zone locks.
 *
 *
 *   System Call Interface:
 *
 *   The zone subsystem can be managed and queried from user level with
 *   the following system calls (all subcodes of the primary "zone"
 *   system call):
 *   - zone_create: creates a zone with selected attributes (name,
 *     root path, privileges, resource controls, ZFS datasets)
 *   - zone_enter: allows the current process to enter a zone
 *   - zone_getattr: reports attributes of a zone
 *   - zone_setattr: sets attributes of a zone
 *   - zone_boot: sets 'init' running for the zone
 *   - zone_list: lists all zones active in the system
 *   - zone_lookup: looks up zone id based on name
 *   - zone_shutdown: initiates shutdown process (see states above)
 *   - zone_destroy: completes shutdown process (see states above)
 *
 */

#include <sys/priv_impl.h>
#include <sys/cred.h>
#include <c2/audit.h>
#include <sys/debug.h>
#include <sys/file.h>
#include <sys/kmem.h>
#include <sys/kstat.h>
#include <sys/mutex.h>
#include <sys/note.h>
#include <sys/pathname.h>
#include <sys/proc.h>
#include <sys/project.h>
#include <sys/sysevent.h>
#include <sys/task.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/utsname.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/systeminfo.h>
#include <sys/policy.h>
#include <sys/cred_impl.h>
#include <sys/contract_impl.h>
#include <sys/contract/process_impl.h>
#include <sys/class.h>
#include <sys/pool.h>
#include <sys/pool_pset.h>
#include <sys/pset.h>
#include <sys/sysmacros.h>
#include <sys/callb.h>
#include <sys/vmparam.h>
#include <sys/corectl.h>
#include <sys/ipc_impl.h>

#include <sys/door.h>
#include <sys/cpuvar.h>

#include <sys/uadmin.h>
#include <sys/session.h>
#include <sys/cmn_err.h>
#include <sys/modhash.h>
#include <sys/sunddi.h>
#include <sys/nvpair.h>
#include <sys/rctl.h>
#include <sys/fss.h>
#include <sys/brand.h>
#include <sys/zone.h>
#include <net/if.h>
#include <sys/cpucaps.h>
#include <vm/seg.h>

/*
 * cv used to signal that all references to the zone have been released.  This
 * needs to be global since there may be multiple waiters, and the first to
 * wake up will free the zone_t, hence we cannot use zone->zone_cv.
 */
static kcondvar_t zone_destroy_cv;
/*
 * Lock used to serialize access to zone_cv.  This could have been per-zone,
 * but then we'd need another lock for zone_destroy_cv, and why bother?
 */
static kmutex_t zone_status_lock;

/*
 * ZSD-related global variables.
 */
static kmutex_t zsd_key_lock;	/* protects the following two */
/*
 * The next caller of zone_key_create() will be assigned a key of ++zsd_keyval.
 */
static zone_key_t zsd_keyval = 0;
/*
 * Global list of registered keys.  We use this when a new zone is created.
 */
static list_t zsd_registered_keys;

int zone_hash_size = 256;
static mod_hash_t *zonehashbyname, *zonehashbyid, *zonehashbylabel;
static kmutex_t zonehash_lock;
static uint_t zonecount;
static id_space_t *zoneid_space;

/*
 * The global zone (aka zone0) is the all-seeing, all-knowing zone in which the
 * kernel proper runs, and which manages all other zones.
 *
 * Although not declared as static, the variable "zone0" should not be used
 * except by code that needs to reference the global zone early on in boot,
 * before it is fully initialized.  All other consumers should use
 * 'global_zone'.
 */
zone_t zone0;
zone_t *global_zone = NULL;	/* Set when the global zone is initialized */

/*
 * List of active zones, protected by zonehash_lock.
 */
static list_t zone_active;

/*
 * List of destroyed zones that still have outstanding cred references.
 * Used for debugging.  Uses a separate lock to avoid lock ordering
 * problems in zone_free.
 */
static list_t zone_deathrow;
static kmutex_t zone_deathrow_lock;

/* number of zones is limited by virtual interface limit in IP */
uint_t maxzones = 8192;

/* Event channel to send zone state change notifications */
evchan_t *zone_event_chan;

/*
 * This table holds the mapping from kernel zone states to
 * states visible in the state notification API.
 * The idea is that we only expose "obvious" states and
 * do not expose states which are just implementation details.
 */
const char  *zone_status_table[] = {
	ZONE_EVENT_UNINITIALIZED,	/* uninitialized */
	ZONE_EVENT_READY,		/* ready */
	ZONE_EVENT_READY,		/* booting */
	ZONE_EVENT_RUNNING,		/* running */
	ZONE_EVENT_SHUTTING_DOWN,	/* shutting_down */
	ZONE_EVENT_SHUTTING_DOWN,	/* empty */
	ZONE_EVENT_SHUTTING_DOWN,	/* down */
	ZONE_EVENT_SHUTTING_DOWN,	/* dying */
	ZONE_EVENT_UNINITIALIZED,	/* dead */
};

/*
 * This isn't static so lint doesn't complain.
 */
rctl_hndl_t rc_zone_cpu_shares;
rctl_hndl_t rc_zone_locked_mem;
rctl_hndl_t rc_zone_max_swap;
rctl_hndl_t rc_zone_cpu_cap;
rctl_hndl_t rc_zone_nlwps;
rctl_hndl_t rc_zone_shmmax;
rctl_hndl_t rc_zone_shmmni;
rctl_hndl_t rc_zone_semmni;
rctl_hndl_t rc_zone_msgmni;
/*
 * Synchronization primitives used to synchronize between mounts and zone
 * creation/destruction.
 */
static int mounts_in_progress;
static kcondvar_t mount_cv;
static kmutex_t mount_lock;

const char * const zone_default_initname = "/sbin/init";
static char * const zone_prefix = "/zone/";
static int zone_shutdown(zoneid_t zoneid);
static int zone_add_datalink(zoneid_t, char *);
static int zone_remove_datalink(zoneid_t, char *);
static int zone_check_datalink(zoneid_t *, char *);
static int zone_list_datalink(zoneid_t, int *, char *);

/*
 * Bump this number when you alter the zone syscall interfaces; this is
 * because we need to have support for previous API versions in libc
 * to support patching; libc calls into the kernel to determine this number.
 *
 * Version 1 of the API is the version originally shipped with Solaris 10.
 * Version 2 alters the zone_create system call in order to support more
 *     arguments by moving the args into a structure; and to do better
 *     error reporting when zone_create() fails.
 * Version 3 alters the zone_create system call in order to support the
 *     import of ZFS datasets to zones.
 * Version 4 alters the zone_create system call in order to support
 *     Trusted Extensions.
 * Version 5 alters the zone_boot system call, and converts its old
 *     bootargs parameter to be set by the zone_setattr API instead.
 * Version 6 adds the flag argument to zone_create.
 */
static const int ZONE_SYSCALL_API_VERSION = 6;

/*
 * Certain filesystems (such as NFS and autofs) need to know which zone
 * the mount is being placed in.  Because of this, we need to be able to
 * ensure that a zone isn't in the process of being created such that
 * nfs_mount() thinks it is in the global zone, while by the time it
 * gets added to the list of mounted zones, it ends up on zoneA's mount
 * list.
 *
 * The following functions: block_mounts()/resume_mounts() and
 * mount_in_progress()/mount_completed() are used by zones and the VFS
 * layer (respectively) to synchronize zone creation and new mounts.
 *
 * The semantics are like a reader-reader lock such that there may
 * either be multiple mounts (or zone creations, if that weren't
 * serialized by zonehash_lock) in progress at the same time, but not
 * both.
 *
 * We use cv's so the user can ctrl-C out of the operation if it's
 * taking too long.
 *
 * The semantics are such that there is an unfair bias towards the
 * "current" operation.  This means that zone creations may starve if
 * there is a rapid succession of new mounts coming in to the system, or
 * there is a remote possibility that zones will be created at such a
 * rate that new mounts will not be able to proceed.
 */
/*
 * Prevent new mounts from progressing to the point of calling
 * VFS_MOUNT().  If there are already mounts in this "region", wait for
 * them to complete.
 */
static int
block_mounts(void)
{
	int retval = 0;

	/*
	 * Since it may block for a long time, block_mounts() shouldn't be
	 * called with zonehash_lock held.
	 */
	ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
	mutex_enter(&mount_lock);
	while (mounts_in_progress > 0) {
		if (cv_wait_sig(&mount_cv, &mount_lock) == 0)
			goto signaled;
	}
	/*
	 * A negative value of mounts_in_progress indicates that mounts
	 * have been blocked by (-mounts_in_progress) different callers.
	 */
	mounts_in_progress--;
	retval = 1;
signaled:
	mutex_exit(&mount_lock);
	return (retval);
}

/*
 * The VFS layer may progress with new mounts as far as we're concerned.
 * Allow them to progress if we were the last obstacle.
 */
static void
resume_mounts(void)
{
	mutex_enter(&mount_lock);
	if (++mounts_in_progress == 0)
		cv_broadcast(&mount_cv);
	mutex_exit(&mount_lock);
}

/*
 * The VFS layer is busy with a mount; zones should wait until all
 * mounts are completed to progress.
 */
void
mount_in_progress(void)
{
	mutex_enter(&mount_lock);
	while (mounts_in_progress < 0)
		cv_wait(&mount_cv, &mount_lock);
	mounts_in_progress++;
	mutex_exit(&mount_lock);
}

/*
 * VFS is done with one mount; wake up any waiting block_mounts()
 * callers if this is the last mount.
 */
void
mount_completed(void)
{
	mutex_enter(&mount_lock);
	if (--mounts_in_progress == 0)
		cv_broadcast(&mount_cv);
	mutex_exit(&mount_lock);
}
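
/*
 * Illustrative sketch only (not part of the original file): how a caller
 * in the filesystem-independent mount path would bracket the zone-sensitive
 * portion of a mount with the primitives above.  The function name and the
 * exact call site are assumptions for illustration; in the real kernel this
 * bracketing is done by the VFS layer itself.
 */
#ifdef ZONE_MOUNT_EXAMPLE
static int
example_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
{
	int error;

	mount_in_progress();	/* zone creations now wait for us */
	error = VFS_MOUNT(vfsp, mvp, uap, cr);
	mount_completed();	/* may wake a blocked block_mounts() caller */
	return (error);
}
#endif	/* ZONE_MOUNT_EXAMPLE */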

/*
 * ZSD routines.
 *
 * Zone Specific Data (ZSD) is modeled after Thread Specific Data as
 * defined by the pthread_key_create() and related interfaces.
 *
 * Kernel subsystems may register one or more data items and/or
 * callbacks to be executed when a zone is created, shut down, or
 * destroyed.
 *
 * Unlike the thread counterpart, destructor callbacks will be executed
 * even if the data pointer is NULL and/or there are no constructor
 * callbacks, so it is the responsibility of such callbacks to check for
 * NULL data values if necessary.
 *
 * The locking strategy and overall picture is as follows:
 *
 * When someone calls zone_key_create(), a template ZSD entry is added to the
 * global list "zsd_registered_keys", protected by zsd_key_lock.  The
 * constructor callback is called immediately on all existing zones, and a
 * copy of the ZSD entry added to the per-zone zone_zsd list (protected by
 * zone_lock).  As this operation requires the list of zones, the list of
 * registered keys, and the per-zone list of ZSD entries to remain constant
 * throughout the entire operation, it must grab zonehash_lock, zone_lock for
 * all existing zones, and zsd_key_lock, in that order.  Similar locking is
 * needed when zone_key_delete() is called.  It is thus sufficient to hold
 * zsd_key_lock *or* zone_lock to prevent additions to or removals from the
 * per-zone zone_zsd list.
 *
 * Note that this implementation does not make a copy of the ZSD entry if a
 * constructor callback is not provided.  A zone_getspecific() on such an
 * uninitialized ZSD entry will return NULL.
 *
 * When new zones are created constructor callbacks for all registered ZSD
 * entries will be called.
 *
 * The framework does not provide any locking around zone_getspecific() and
 * zone_setspecific() apart from that needed for internal consistency, so
 * callers interested in atomic "test-and-set" semantics will need to provide
 * their own locking.
 */
void
zone_key_create(zone_key_t *keyp, void *(*create)(zoneid_t),
    void (*shutdown)(zoneid_t, void *), void (*destroy)(zoneid_t, void *))
{
	struct zsd_entry *zsdp;
	struct zsd_entry *t;
	struct zone *zone;

	zsdp = kmem_alloc(sizeof (*zsdp), KM_SLEEP);
	zsdp->zsd_data = NULL;
	zsdp->zsd_create = create;
	zsdp->zsd_shutdown = shutdown;
	zsdp->zsd_destroy = destroy;

	mutex_enter(&zonehash_lock);	/* stop the world */
	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone))
		mutex_enter(&zone->zone_lock);	/* lock all zones */

	mutex_enter(&zsd_key_lock);
	*keyp = zsdp->zsd_key = ++zsd_keyval;
	ASSERT(zsd_keyval != 0);
	list_insert_tail(&zsd_registered_keys, zsdp);
	mutex_exit(&zsd_key_lock);

	if (create != NULL) {
		for (zone = list_head(&zone_active); zone != NULL;
		    zone = list_next(&zone_active, zone)) {
			t = kmem_alloc(sizeof (*t), KM_SLEEP);
			t->zsd_key = *keyp;
			t->zsd_data = (*create)(zone->zone_id);
			t->zsd_create = create;
			t->zsd_shutdown = shutdown;
			t->zsd_destroy = destroy;
			list_insert_tail(&zone->zone_zsd, t);
		}
	}
	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone))
		mutex_exit(&zone->zone_lock);
	mutex_exit(&zonehash_lock);
}
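
/*
 * Illustrative sketch only (not part of the original file): a minimal ZSD
 * consumer.  The subsystem name "foo", its state structure, and the
 * functions below are assumptions for illustration; they show the intended
 * pattern of zone_key_create() with constructor and destructor callbacks.
 */
#ifdef ZONE_ZSD_EXAMPLE
struct foo_zone_state {
	uint64_t fzs_count;	/* per-zone counter for subsystem "foo" */
};

static zone_key_t foo_zone_key;

static void *
foo_zone_init(zoneid_t zoneid)
{
	/* Runs for each existing zone and for every zone created later. */
	return (kmem_zalloc(sizeof (struct foo_zone_state), KM_SLEEP));
}

static void
foo_zone_fini(zoneid_t zoneid, void *data)
{
	/* Destructors may be invoked with NULL data; see comment above. */
	if (data != NULL)
		kmem_free(data, sizeof (struct foo_zone_state));
}

static void
foo_init(void)
{
	zone_key_create(&foo_zone_key, foo_zone_init, NULL, foo_zone_fini);
}
#endif	/* ZONE_ZSD_EXAMPLE */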

/*
 * Helper function to find the zsd_entry associated with the key in the
 * given list.
 */
static struct zsd_entry *
zsd_find(list_t *l, zone_key_t key)
{
	struct zsd_entry *zsd;

	for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
		if (zsd->zsd_key == key) {
			/*
			 * Move to head of list to keep list in MRU order.
			 */
			if (zsd != list_head(l)) {
				list_remove(l, zsd);
				list_insert_head(l, zsd);
			}
			return (zsd);
		}
	}
	return (NULL);
}

/*
 * Function called when a module is being unloaded, or otherwise wishes
 * to unregister its ZSD key and callbacks.
 */
int
zone_key_delete(zone_key_t key)
{
	struct zsd_entry *zsdp = NULL;
	zone_t *zone;

	mutex_enter(&zonehash_lock);	/* Zone create/delete waits for us */
	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone))
		mutex_enter(&zone->zone_lock);	/* lock all zones */

	mutex_enter(&zsd_key_lock);
	zsdp = zsd_find(&zsd_registered_keys, key);
	if (zsdp == NULL)
		goto notfound;
	list_remove(&zsd_registered_keys, zsdp);
	mutex_exit(&zsd_key_lock);

	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone)) {
		struct zsd_entry *del;
		void *data;

		if (!(zone->zone_flags & ZF_DESTROYED)) {
			del = zsd_find(&zone->zone_zsd, key);
			if (del != NULL) {
				data = del->zsd_data;
				ASSERT(del->zsd_shutdown == zsdp->zsd_shutdown);
				ASSERT(del->zsd_destroy == zsdp->zsd_destroy);
				list_remove(&zone->zone_zsd, del);
				kmem_free(del, sizeof (*del));
			} else {
				data = NULL;
			}
			if (zsdp->zsd_shutdown)
				zsdp->zsd_shutdown(zone->zone_id, data);
			if (zsdp->zsd_destroy)
				zsdp->zsd_destroy(zone->zone_id, data);
		}
		mutex_exit(&zone->zone_lock);
	}
	mutex_exit(&zonehash_lock);
	kmem_free(zsdp, sizeof (*zsdp));
	return (0);

notfound:
	mutex_exit(&zsd_key_lock);
	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone))
		mutex_exit(&zone->zone_lock);
	mutex_exit(&zonehash_lock);
	return (-1);
}

/*
 * ZSD counterpart of pthread_setspecific().
 */
int
zone_setspecific(zone_key_t key, zone_t *zone, const void *data)
{
	struct zsd_entry *t;
	struct zsd_entry *zsdp = NULL;

	mutex_enter(&zone->zone_lock);
	t = zsd_find(&zone->zone_zsd, key);
	if (t != NULL) {
		/*
		 * Replace old value with new
		 */
		t->zsd_data = (void *)data;
		mutex_exit(&zone->zone_lock);
		return (0);
	}
	/*
	 * If there was no previous value, go through the list of registered
	 * keys.
	 *
	 * We avoid grabbing zsd_key_lock until we are sure we need it; this is
	 * necessary for shutdown callbacks to be able to execute without fear
	 * of deadlock.
	 */
	mutex_enter(&zsd_key_lock);
	zsdp = zsd_find(&zsd_registered_keys, key);
	if (zsdp == NULL) {	/* Key was not registered */
		mutex_exit(&zsd_key_lock);
		mutex_exit(&zone->zone_lock);
		return (-1);
	}

	/*
	 * Add a zsd_entry to this zone, using the template we just retrieved
	 * to initialize the constructor and destructor(s).
	 */
	t = kmem_alloc(sizeof (*t), KM_SLEEP);
	t->zsd_key = key;
	t->zsd_data = (void *)data;
	t->zsd_create = zsdp->zsd_create;
	t->zsd_shutdown = zsdp->zsd_shutdown;
	t->zsd_destroy = zsdp->zsd_destroy;
	list_insert_tail(&zone->zone_zsd, t);
	mutex_exit(&zsd_key_lock);
	mutex_exit(&zone->zone_lock);
	return (0);
}

/*
 * ZSD counterpart of pthread_getspecific().
 */
void *
zone_getspecific(zone_key_t key, zone_t *zone)
{
	struct zsd_entry *t;
	void *data;

	mutex_enter(&zone->zone_lock);
	t = zsd_find(&zone->zone_zsd, key);
	data = (t == NULL ? NULL : t->zsd_data);
	mutex_exit(&zone->zone_lock);
	return (data);
}
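
/*
 * Illustrative sketch only (not part of the original file): how the
 * hypothetical "foo" subsystem from the earlier sketch would look up its
 * per-zone state; foo_zone_key and struct foo_zone_state are assumptions
 * carried over from that example.
 */
#ifdef ZONE_ZSD_EXAMPLE
static struct foo_zone_state *
foo_zone_state(zone_t *zone)
{
	/* Returns NULL if no constructor ran and no value was ever set. */
	return (zone_getspecific(foo_zone_key, zone));
}
#endif	/* ZONE_ZSD_EXAMPLE */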

/*
 * Function used to initialize a zone's list of ZSD callbacks and data
 * when the zone is being created.  The callbacks are initialized from
 * the template list (zsd_registered_keys), and the constructor
 * callback executed (if one exists).
 *
 * This is called before the zone is made publicly available, hence no
 * need to grab zone_lock.
 *
 * Although we grab and release zsd_key_lock, new entries cannot be
 * added to or removed from the zsd_registered_keys list until we
 * release zonehash_lock, so there isn't a window for a
 * zone_key_create() to come in after we've dropped zsd_key_lock but
 * before the zone is added to the zone list, such that the constructor
 * callbacks aren't executed for the new zone.
 */
static void
zone_zsd_configure(zone_t *zone)
{
	struct zsd_entry *zsdp;
	struct zsd_entry *t;
	zoneid_t zoneid = zone->zone_id;

	ASSERT(MUTEX_HELD(&zonehash_lock));
	ASSERT(list_head(&zone->zone_zsd) == NULL);
	mutex_enter(&zsd_key_lock);
	for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL;
	    zsdp = list_next(&zsd_registered_keys, zsdp)) {
		if (zsdp->zsd_create != NULL) {
			t = kmem_alloc(sizeof (*t), KM_SLEEP);
			t->zsd_key = zsdp->zsd_key;
			t->zsd_create = zsdp->zsd_create;
			t->zsd_data = (*t->zsd_create)(zoneid);
			t->zsd_shutdown = zsdp->zsd_shutdown;
			t->zsd_destroy = zsdp->zsd_destroy;
			list_insert_tail(&zone->zone_zsd, t);
		}
	}
	mutex_exit(&zsd_key_lock);
}

enum zsd_callback_type { ZSD_CREATE, ZSD_SHUTDOWN, ZSD_DESTROY };

/*
 * Helper function to execute shutdown or destructor callbacks.
 */
static void
zone_zsd_callbacks(zone_t *zone, enum zsd_callback_type ct)
{
	struct zsd_entry *zsdp;
	struct zsd_entry *t;
	zoneid_t zoneid = zone->zone_id;

	ASSERT(ct == ZSD_SHUTDOWN || ct == ZSD_DESTROY);
	ASSERT(ct != ZSD_SHUTDOWN || zone_status_get(zone) >= ZONE_IS_EMPTY);
	ASSERT(ct != ZSD_DESTROY || zone_status_get(zone) >= ZONE_IS_DOWN);

	mutex_enter(&zone->zone_lock);
	if (ct == ZSD_DESTROY) {
		if (zone->zone_flags & ZF_DESTROYED) {
			/*
			 * Make sure destructors are only called once.
			 */
			mutex_exit(&zone->zone_lock);
			return;
		}
		zone->zone_flags |= ZF_DESTROYED;
	}
	mutex_exit(&zone->zone_lock);

	/*
	 * Both zsd_key_lock and zone_lock need to be held in order to add or
	 * remove a ZSD key (either globally as part of
	 * zone_key_create()/zone_key_delete(), or on a per-zone basis, as is
	 * possible through zone_setspecific()), so it's sufficient to hold
	 * zsd_key_lock here.
	 *
	 * This is a good thing, since we don't want to recursively try to grab
	 * zone_lock if a callback attempts to do something like a crfree() or
	 * zone_rele().
	 */
	mutex_enter(&zsd_key_lock);
	for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL;
	    zsdp = list_next(&zsd_registered_keys, zsdp)) {
		zone_key_t key = zsdp->zsd_key;

		/* Skip if no callbacks registered */
		if (ct == ZSD_SHUTDOWN && zsdp->zsd_shutdown == NULL)
			continue;
		if (ct == ZSD_DESTROY && zsdp->zsd_destroy == NULL)
			continue;
		/*
		 * Call the callback with the zone-specific data if we can find
		 * any, otherwise with NULL.
		 */
		t = zsd_find(&zone->zone_zsd, key);
		if (t != NULL) {
			if (ct == ZSD_SHUTDOWN) {
				t->zsd_shutdown(zoneid, t->zsd_data);
			} else {
				ASSERT(ct == ZSD_DESTROY);
				t->zsd_destroy(zoneid, t->zsd_data);
			}
		} else {
			if (ct == ZSD_SHUTDOWN) {
				zsdp->zsd_shutdown(zoneid, NULL);
			} else {
				ASSERT(ct == ZSD_DESTROY);
				zsdp->zsd_destroy(zoneid, NULL);
			}
		}
	}
	mutex_exit(&zsd_key_lock);
}

/*
 * Called when the zone is going away; free ZSD-related memory, and
 * destroy the zone_zsd list.
 */
static void
zone_free_zsd(zone_t *zone)
{
	struct zsd_entry *t, *next;

	/*
	 * Free all the zsd_entry's we had on this zone.
	 */
	for (t = list_head(&zone->zone_zsd); t != NULL; t = next) {
		next = list_next(&zone->zone_zsd, t);
		list_remove(&zone->zone_zsd, t);
		kmem_free(t, sizeof (*t));
	}
	list_destroy(&zone->zone_zsd);
}

/*
 * Frees memory associated with the zone dataset list.
 */
static void
zone_free_datasets(zone_t *zone)
{
	zone_dataset_t *t, *next;

	for (t = list_head(&zone->zone_datasets); t != NULL; t = next) {
		next = list_next(&zone->zone_datasets, t);
		list_remove(&zone->zone_datasets, t);
		kmem_free(t->zd_dataset, strlen(t->zd_dataset) + 1);
		kmem_free(t, sizeof (*t));
	}
	list_destroy(&zone->zone_datasets);
}

/*
 * zone.cpu-shares resource control support.
 */
/*ARGSUSED*/
static rctl_qty_t
zone_cpu_shares_usage(rctl_t *rctl, struct proc *p)
{
	ASSERT(MUTEX_HELD(&p->p_lock));
	return (p->p_zone->zone_shares);
}

/*ARGSUSED*/
static int
zone_cpu_shares_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
    rctl_qty_t nv)
{
	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_ZONE);
	if (e->rcep_p.zone == NULL)
		return (0);

	e->rcep_p.zone->zone_shares = nv;
	return (0);
}

static rctl_ops_t zone_cpu_shares_ops = {
	rcop_no_action,
	zone_cpu_shares_usage,
	zone_cpu_shares_set,
	rcop_no_test
};

/*
 * zone.cpu-cap resource control support.
 */
/*ARGSUSED*/
static rctl_qty_t
zone_cpu_cap_get(rctl_t *rctl, struct proc *p)
{
	ASSERT(MUTEX_HELD(&p->p_lock));
	return (cpucaps_zone_get(p->p_zone));
}

/*ARGSUSED*/
static int
zone_cpu_cap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
    rctl_qty_t nv)
{
	zone_t *zone = e->rcep_p.zone;

	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_ZONE);

	if (zone == NULL)
		return (0);

	/*
	 * set cap to the new value.
	 */
	return (cpucaps_zone_set(zone, nv));
}

static rctl_ops_t zone_cpu_cap_ops = {
	rcop_no_action,
	zone_cpu_cap_get,
	zone_cpu_cap_set,
	rcop_no_test
};

/*ARGSUSED*/
static rctl_qty_t
zone_lwps_usage(rctl_t *r, proc_t *p)
{
	rctl_qty_t nlwps;
	zone_t *zone = p->p_zone;

	ASSERT(MUTEX_HELD(&p->p_lock));

	mutex_enter(&zone->zone_nlwps_lock);
	nlwps = zone->zone_nlwps;
	mutex_exit(&zone->zone_nlwps_lock);

	return (nlwps);
}

/*ARGSUSED*/
static int
zone_lwps_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
    rctl_qty_t incr, uint_t flags)
{
	rctl_qty_t nlwps;

	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_ZONE);
	if (e->rcep_p.zone == NULL)
		return (0);
	ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
	nlwps = e->rcep_p.zone->zone_nlwps;

	if (nlwps + incr > rcntl->rcv_value)
		return (1);

	return (0);
}

/*ARGSUSED*/
static int
zone_lwps_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
{
	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_ZONE);
	if (e->rcep_p.zone == NULL)
		return (0);
	e->rcep_p.zone->zone_nlwps_ctl = nv;
	return (0);
}

static rctl_ops_t zone_lwps_ops = {
	rcop_no_action,
	zone_lwps_usage,
	zone_lwps_set,
	zone_lwps_test,
};

/*ARGSUSED*/
static int
zone_shmmax_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
    rctl_qty_t incr, uint_t flags)
{
	rctl_qty_t v;
	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_ZONE);
	v = e->rcep_p.zone->zone_shmmax + incr;
	if (v > rval->rcv_value)
		return (1);
	return (0);
}

static rctl_ops_t zone_shmmax_ops = {
	rcop_no_action,
	rcop_no_usage,
	rcop_no_set,
	zone_shmmax_test
};

/*ARGSUSED*/
static int
zone_shmmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
    rctl_qty_t incr, uint_t flags)
{
	rctl_qty_t v;
	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_ZONE);
	v = e->rcep_p.zone->zone_ipc.ipcq_shmmni + incr;
	if (v > rval->rcv_value)
		return (1);
	return (0);
}

static rctl_ops_t zone_shmmni_ops = {
	rcop_no_action,
	rcop_no_usage,
	rcop_no_set,
	zone_shmmni_test
};

/*ARGSUSED*/
static int
zone_semmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
    rctl_qty_t incr, uint_t flags)
{
	rctl_qty_t v;
	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_ZONE);
	v = e->rcep_p.zone->zone_ipc.ipcq_semmni + incr;
	if (v > rval->rcv_value)
		return (1);
	return (0);
}

static rctl_ops_t zone_semmni_ops = {
	rcop_no_action,
	rcop_no_usage,
	rcop_no_set,
	zone_semmni_test
};

/*ARGSUSED*/
static int
zone_msgmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
    rctl_qty_t incr, uint_t flags)
{
	rctl_qty_t v;
	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_ZONE);
	v = e->rcep_p.zone->zone_ipc.ipcq_msgmni + incr;
	if (v > rval->rcv_value)
		return (1);
	return (0);
}

static rctl_ops_t zone_msgmni_ops = {
	rcop_no_action,
	rcop_no_usage,
	rcop_no_set,
	zone_msgmni_test
};

/*ARGSUSED*/
static rctl_qty_t
zone_locked_mem_usage(rctl_t *rctl, struct proc *p)
{
	rctl_qty_t q;
	ASSERT(MUTEX_HELD(&p->p_lock));
	mutex_enter(&p->p_zone->zone_mem_lock);
	q = p->p_zone->zone_locked_mem;
	mutex_exit(&p->p_zone->zone_mem_lock);
	return (q);
}

/*ARGSUSED*/
static int
zone_locked_mem_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
    rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
{
	rctl_qty_t q;
	zone_t *z;

	z = e->rcep_p.zone;
	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(MUTEX_HELD(&z->zone_mem_lock));
	q = z->zone_locked_mem;
	if (q + incr > rcntl->rcv_value)
		return (1);
	return (0);
}

/*ARGSUSED*/
static int
zone_locked_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
    rctl_qty_t nv)
{
	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_ZONE);
	if (e->rcep_p.zone == NULL)
		return (0);
	e->rcep_p.zone->zone_locked_mem_ctl = nv;
	return (0);
}

static rctl_ops_t zone_locked_mem_ops = {
	rcop_no_action,
	zone_locked_mem_usage,
	zone_locked_mem_set,
	zone_locked_mem_test
};

/*ARGSUSED*/
static rctl_qty_t
zone_max_swap_usage(rctl_t *rctl, struct proc *p)
{
	rctl_qty_t q;
	zone_t *z = p->p_zone;

	ASSERT(MUTEX_HELD(&p->p_lock));
	mutex_enter(&z->zone_mem_lock);
	q = z->zone_max_swap;
	mutex_exit(&z->zone_mem_lock);
	return (q);
}

/*ARGSUSED*/
static int
zone_max_swap_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
    rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
{
	rctl_qty_t q;
	zone_t *z;

	z = e->rcep_p.zone;
	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(MUTEX_HELD(&z->zone_mem_lock));
	q = z->zone_max_swap;
	if (q + incr > rcntl->rcv_value)
		return (1);
	return (0);
}

/*ARGSUSED*/
static int
zone_max_swap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
    rctl_qty_t nv)
{
	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_ZONE);
	if (e->rcep_p.zone == NULL)
		return (0);
	e->rcep_p.zone->zone_max_swap_ctl = nv;
	return (0);
}

static rctl_ops_t zone_max_swap_ops = {
	rcop_no_action,
	zone_max_swap_usage,
	zone_max_swap_set,
	zone_max_swap_test
};

/*
 * Helper function to brand the zone with a unique ID.
 */
static void
zone_uniqid(zone_t *zone)
{
	static uint64_t uniqid = 0;

	ASSERT(MUTEX_HELD(&zonehash_lock));
	zone->zone_uniqid = uniqid++;
}

/*
 * Returns a held pointer to the "kcred" for the specified zone.
 */
struct cred *
zone_get_kcred(zoneid_t zoneid)
{
	zone_t *zone;
	cred_t *cr;

	if ((zone = zone_find_by_id(zoneid)) == NULL)
		return (NULL);
	cr = zone->zone_kcred;
	crhold(cr);
	zone_rele(zone);
	return (cr);
}

static int
zone_lockedmem_kstat_update(kstat_t *ksp, int rw)
{
	zone_t *zone = ksp->ks_private;
	zone_kstat_t *zk = ksp->ks_data;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	zk->zk_usage.value.ui64 = zone->zone_locked_mem;
	zk->zk_value.value.ui64 = zone->zone_locked_mem_ctl;
	return (0);
}

static int
zone_swapresv_kstat_update(kstat_t *ksp, int rw)
{
	zone_t *zone = ksp->ks_private;
	zone_kstat_t *zk = ksp->ks_data;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	zk->zk_usage.value.ui64 = zone->zone_max_swap;
	zk->zk_value.value.ui64 = zone->zone_max_swap_ctl;
	return (0);
}

static void
zone_kstat_create(zone_t *zone)
{
	kstat_t *ksp;
	zone_kstat_t *zk;

	ksp = rctl_kstat_create_zone(zone, "lockedmem", KSTAT_TYPE_NAMED,
	    sizeof (zone_kstat_t) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);

	if (ksp == NULL)
		return;

	zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP);
	ksp->ks_data_size += strlen(zone->zone_name) + 1;
	kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING);
	kstat_named_setstr(&zk->zk_zonename, zone->zone_name);
	kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64);
	kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64);
	ksp->ks_update = zone_lockedmem_kstat_update;
	ksp->ks_private = zone;
	kstat_install(ksp);

	zone->zone_lockedmem_kstat = ksp;

	ksp = rctl_kstat_create_zone(zone, "swapresv", KSTAT_TYPE_NAMED,
	    sizeof (zone_kstat_t) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);

	if (ksp == NULL)
		return;

	zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP);
	ksp->ks_data_size += strlen(zone->zone_name) + 1;
	kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING);
	kstat_named_setstr(&zk->zk_zonename, zone->zone_name);
	kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64);
	kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64);
	ksp->ks_update = zone_swapresv_kstat_update;
	ksp->ks_private = zone;
	kstat_install(ksp);

	zone->zone_swapresv_kstat = ksp;
}

static void
zone_kstat_delete(zone_t *zone)
{
	void *data;

	if (zone->zone_lockedmem_kstat != NULL) {
		data = zone->zone_lockedmem_kstat->ks_data;
		kstat_delete(zone->zone_lockedmem_kstat);
		kmem_free(data, sizeof (zone_kstat_t));
	}
	if (zone->zone_swapresv_kstat != NULL) {
		data = zone->zone_swapresv_kstat->ks_data;
		kstat_delete(zone->zone_swapresv_kstat);
		kmem_free(data, sizeof (zone_kstat_t));
	}
}

/*
 * Called very early on in boot to initialize the ZSD list so that
 * zone_key_create() can be called before zone_init().  It also initializes
 * portions of zone0 which may be used before zone_init() is called.  The
 * variable "global_zone" will be set when zone0 is fully initialized by
 * zone_init().
 */
void
zone_zsd_init(void)
{
	mutex_init(&zonehash_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&zsd_key_lock, NULL, MUTEX_DEFAULT, NULL);
	list_create(&zsd_registered_keys, sizeof (struct zsd_entry),
	    offsetof(struct zsd_entry, zsd_linkage));
	list_create(&zone_active, sizeof (zone_t),
	    offsetof(zone_t, zone_linkage));
	list_create(&zone_deathrow, sizeof (zone_t),
	    offsetof(zone_t, zone_linkage));

	mutex_init(&zone0.zone_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&zone0.zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&zone0.zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
	zone0.zone_shares = 1;
	zone0.zone_nlwps = 0;
	zone0.zone_nlwps_ctl = INT_MAX;
	zone0.zone_locked_mem = 0;
	zone0.zone_locked_mem_ctl = UINT64_MAX;
	ASSERT(zone0.zone_max_swap == 0);
	zone0.zone_max_swap_ctl = UINT64_MAX;
	zone0.zone_shmmax = 0;
	zone0.zone_ipc.ipcq_shmmni = 0;
	zone0.zone_ipc.ipcq_semmni = 0;
	zone0.zone_ipc.ipcq_msgmni = 0;
	zone0.zone_name = GLOBAL_ZONENAME;
	zone0.zone_nodename = utsname.nodename;
	zone0.zone_domain = srpc_domain;
	zone0.zone_ref = 1;
	zone0.zone_id = GLOBAL_ZONEID;
	zone0.zone_status = ZONE_IS_RUNNING;
	zone0.zone_rootpath = "/";
	zone0.zone_rootpathlen = 2;
	zone0.zone_psetid = ZONE_PS_INVAL;
	zone0.zone_ncpus = 0;
	zone0.zone_ncpus_online = 0;
	zone0.zone_proc_initpid = 1;
	zone0.zone_initname = initname;
	zone0.zone_lockedmem_kstat = NULL;
	zone0.zone_swapresv_kstat = NULL;
	list_create(&zone0.zone_zsd, sizeof (struct zsd_entry),
	    offsetof(struct zsd_entry, zsd_linkage));
	list_insert_head(&zone_active, &zone0);

	/*
	 * The root filesystem is not mounted yet, so zone_rootvp cannot be set
	 * to anything meaningful.  It is assigned to be 'rootdir' in
	 * vfs_mountroot().
	 */
	zone0.zone_rootvp = NULL;
	zone0.zone_vfslist = NULL;
	zone0.zone_bootargs = initargs;
	zone0.zone_privset = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
	/*
	 * The global zone has all privileges
	 */
	priv_fillset(zone0.zone_privset);
	/*
	 * Add p0 to the global zone
	 */
	zone0.zone_zsched = &p0;
	p0.p_zone = &zone0;
}

/*
 * Compute a hash value based on the contents of the label and the DOI.  The
 * hash algorithm is somewhat arbitrary, but is based on the observation that
 * humans will likely pick labels that differ by amounts that work out to be
 * multiples of the number of hash chains, and thus stirring in some primes
 * should help.
 */
static uint_t
hash_bylabel(void *hdata, mod_hash_key_t key)
{
	const ts_label_t *lab = (ts_label_t *)key;
	const uint32_t *up, *ue;
	uint_t hash;
	int i;

	_NOTE(ARGUNUSED(hdata));

	hash = lab->tsl_doi + (lab->tsl_doi << 1);
	/* we depend on alignment of label, but not representation */
	up = (const uint32_t *)&lab->tsl_label;
	ue = up + sizeof (lab->tsl_label) / sizeof (*up);
	i = 1;
	while (up < ue) {
		/* using 2^n + 1, 1 <= n <= 16 as source of many primes */
		hash += *up + (*up << ((i % 16) + 1));
		up++;
		i++;
	}
	return (hash);
}

/*
 * All that mod_hash cares about here is zero (equal) versus non-zero (not
 * equal).  This may need to be changed if less than / greater than is ever
 * needed.
 */
static int
hash_labelkey_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
{
	ts_label_t *lab1 = (ts_label_t *)key1;
	ts_label_t *lab2 = (ts_label_t *)key2;

	return (label_equal(lab1, lab2) ? 0 : 1);
}

/*
 * Called by main() to initialize the zones framework.
 */
void
zone_init(void)
{
	rctl_dict_entry_t *rde;
	rctl_val_t *dval;
	rctl_set_t *set;
	rctl_alloc_gp_t *gp;
	rctl_entity_p_t e;
	int res;

	ASSERT(curproc == &p0);

	/*
	 * Create ID space for zone IDs.  ID 0 is reserved for the
	 * global zone.
	 */
	zoneid_space = id_space_create("zoneid_space", 1, MAX_ZONEID);

	/*
	 * Initialize generic zone resource controls, if any.
	 */
	rc_zone_cpu_shares = rctl_register("zone.cpu-shares",
	    RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
	    RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
	    FSS_MAXSHARES, FSS_MAXSHARES, &zone_cpu_shares_ops);

	rc_zone_cpu_cap = rctl_register("zone.cpu-cap",
	    RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_ALWAYS |
	    RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER |
	    RCTL_GLOBAL_INFINITE,
	    MAXCAP, MAXCAP, &zone_cpu_cap_ops);

	rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
	    RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
	    INT_MAX, INT_MAX, &zone_lwps_ops);
	/*
	 * System V IPC resource controls
	 */
	rc_zone_msgmni = rctl_register("zone.max-msg-ids",
	    RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
	    RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_msgmni_ops);

	rc_zone_semmni = rctl_register("zone.max-sem-ids",
	    RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
	    RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_semmni_ops);

	rc_zone_shmmni = rctl_register("zone.max-shm-ids",
	    RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
	    RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_shmmni_ops);

	rc_zone_shmmax = rctl_register("zone.max-shm-memory",
	    RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
	    RCTL_GLOBAL_BYTES, UINT64_MAX, UINT64_MAX, &zone_shmmax_ops);

	/*
	 * Create a rctl_val with PRIVILEGED, NOACTION, value = 1.  Then attach
	 * this at the head of the rctl_dict_entry for ``zone.cpu-shares''.
	 */
	dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
	bzero(dval, sizeof (rctl_val_t));
	dval->rcv_value = 1;
	dval->rcv_privilege = RCPRIV_PRIVILEGED;
	dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
	dval->rcv_action_recip_pid = -1;

	rde = rctl_dict_lookup("zone.cpu-shares");
	(void) rctl_val_list_insert(&rde->rcd_default_value, dval);

	rc_zone_locked_mem = rctl_register("zone.max-locked-memory",
	    RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
	    RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
	    &zone_locked_mem_ops);

	rc_zone_max_swap = rctl_register("zone.max-swap",
	    RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
	    RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
	    &zone_max_swap_ops);

	/*
	 * Initialize the ``global zone''.
	 */
	set = rctl_set_create();
	gp = rctl_set_init_prealloc(RCENTITY_ZONE);
	mutex_enter(&p0.p_lock);
	e.rcep_p.zone = &zone0;
	e.rcep_t = RCENTITY_ZONE;
	zone0.zone_rctls = rctl_set_init(RCENTITY_ZONE, &p0, &e, set,
	    gp);

	zone0.zone_nlwps = p0.p_lwpcnt;
	zone0.zone_ntasks = 1;
	mutex_exit(&p0.p_lock);
	zone0.zone_restart_init = B_TRUE;
	zone0.zone_brand = &native_brand;
	rctl_prealloc_destroy(gp);
	/*
	 * pool_default hasn't been initialized yet, so we let pool_init()
	 * take care of making sure the global zone is in the default pool.
	 */

	/*
	 * Initialize global zone kstats
	 */
	zone_kstat_create(&zone0);

	/*
	 * Initialize zone label.
	 * mlp are initialized when tnzonecfg is loaded.
	 */
	zone0.zone_slabel = l_admin_low;
	rw_init(&zone0.zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
	label_hold(l_admin_low);

	mutex_enter(&zonehash_lock);
	zone_uniqid(&zone0);
	ASSERT(zone0.zone_uniqid == GLOBAL_ZONEUNIQID);

	zonehashbyid = mod_hash_create_idhash("zone_by_id", zone_hash_size,
	    mod_hash_null_valdtor);
	zonehashbyname = mod_hash_create_strhash("zone_by_name",
	    zone_hash_size, mod_hash_null_valdtor);
	/*
	 * maintain zonehashbylabel only for labeled systems
	 */
	if (is_system_labeled())
		zonehashbylabel = mod_hash_create_extended("zone_by_label",
		    zone_hash_size, mod_hash_null_keydtor,
		    mod_hash_null_valdtor, hash_bylabel, NULL,
		    hash_labelkey_cmp, KM_SLEEP);
	zonecount = 1;

	(void) mod_hash_insert(zonehashbyid, (mod_hash_key_t)GLOBAL_ZONEID,
	    (mod_hash_val_t)&zone0);
	(void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)zone0.zone_name,
	    (mod_hash_val_t)&zone0);
	if (is_system_labeled()) {
		zone0.zone_flags |= ZF_HASHED_LABEL;
		(void) mod_hash_insert(zonehashbylabel,
		    (mod_hash_key_t)zone0.zone_slabel, (mod_hash_val_t)&zone0);
	}
	mutex_exit(&zonehash_lock);

	/*
	 * We avoid setting zone_kcred until now, since kcred is initialized
	 * sometime after zone_zsd_init() and before zone_init().
	 */
	zone0.zone_kcred = kcred;
	/*
	 * The global zone is fully initialized (except for zone_rootvp which
	 * will be set when the root filesystem is mounted).
	 */
	global_zone = &zone0;

	/*
	 * Setup an event channel to send zone status change notifications on
	 */
	res = sysevent_evc_bind(ZONE_EVENT_CHANNEL, &zone_event_chan,
	    EVCH_CREAT);

	if (res)
		panic("Sysevent_evc_bind failed during zone setup.\n");
}
static void
zone_free(zone_t *zone)
{
	ASSERT(zone != global_zone);
	ASSERT(zone->zone_ntasks == 0);
	ASSERT(zone->zone_nlwps == 0);
	ASSERT(zone->zone_cred_ref == 0);
	ASSERT(zone->zone_kcred == NULL);
	ASSERT(zone_status_get(zone) == ZONE_IS_DEAD ||
	    zone_status_get(zone) == ZONE_IS_UNINITIALIZED);

	/*
	 * Remove any zone caps.
	 */
	cpucaps_zone_remove(zone);

	ASSERT(zone->zone_cpucap == NULL);

	/* remove from deathrow list */
	if (zone_status_get(zone) == ZONE_IS_DEAD) {
		ASSERT(zone->zone_ref == 0);
		mutex_enter(&zone_deathrow_lock);
		list_remove(&zone_deathrow, zone);
		mutex_exit(&zone_deathrow_lock);
	}

	zone_free_zsd(zone);
	zone_free_datasets(zone);

	if (zone->zone_rootvp != NULL)
		VN_RELE(zone->zone_rootvp);
	if (zone->zone_rootpath)
		kmem_free(zone->zone_rootpath, zone->zone_rootpathlen);
	if (zone->zone_name != NULL)
		kmem_free(zone->zone_name, ZONENAME_MAX);
	if (zone->zone_slabel != NULL)
		label_rele(zone->zone_slabel);
	if (zone->zone_nodename != NULL)
		kmem_free(zone->zone_nodename, _SYS_NMLN);
	if (zone->zone_domain != NULL)
		kmem_free(zone->zone_domain, _SYS_NMLN);
	if (zone->zone_privset != NULL)
		kmem_free(zone->zone_privset, sizeof (priv_set_t));
	if (zone->zone_rctls != NULL)
		rctl_set_free(zone->zone_rctls);
	if (zone->zone_bootargs != NULL)
		kmem_free(zone->zone_bootargs, strlen(zone->zone_bootargs) + 1);
	if (zone->zone_initname != NULL)
		kmem_free(zone->zone_initname, strlen(zone->zone_initname) + 1);
	id_free(zoneid_space, zone->zone_id);
	mutex_destroy(&zone->zone_lock);
	cv_destroy(&zone->zone_cv);
	rw_destroy(&zone->zone_mlps.mlpl_rwlock);
	kmem_free(zone, sizeof (zone_t));
}

/*
 * See block comment at the top of this file for information about zone
 * status values.
 */
/*
 * Convenience function for setting zone status.
 */
static void
zone_status_set(zone_t *zone, zone_status_t status)
{
	nvlist_t *nvl = NULL;

	ASSERT(MUTEX_HELD(&zone_status_lock));
	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE &&
	    status >= zone_status_get(zone));

	if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) ||
	    nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) ||
	    nvlist_add_string(nvl, ZONE_CB_NEWSTATE,
	    zone_status_table[status]) ||
	    nvlist_add_string(nvl, ZONE_CB_OLDSTATE,
	    zone_status_table[zone->zone_status]) ||
	    nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) ||
	    nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, (uint64_t)gethrtime()) ||
	    sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS,
	    ZONE_EVENT_STATUS_SUBCLASS, "sun.com", "kernel", nvl, EVCH_SLEEP)) {
#ifdef DEBUG
		(void) printf(
		    "Failed to allocate and send zone state change event.\n");
#endif
	}
	nvlist_free(nvl);

	zone->zone_status = status;

	cv_broadcast(&zone->zone_cv);
}

/*
 * Public function to retrieve the zone status.  The zone status may
 * change after it is retrieved.
 */
zone_status_t
zone_status_get(zone_t *zone)
{
	return (zone->zone_status);
}

static int
zone_set_bootargs(zone_t *zone, const char *zone_bootargs)
{
	char *bootargs = kmem_zalloc(BOOTARGS_MAX, KM_SLEEP);
	int err = 0;

	ASSERT(zone != global_zone);
	if ((err = copyinstr(zone_bootargs, bootargs, BOOTARGS_MAX, NULL)) != 0)
		goto done;	/* EFAULT or ENAMETOOLONG */

	if (zone->zone_bootargs != NULL)
		kmem_free(zone->zone_bootargs, strlen(zone->zone_bootargs) + 1);

	zone->zone_bootargs = kmem_alloc(strlen(bootargs) + 1, KM_SLEEP);
	(void) strcpy(zone->zone_bootargs, bootargs);

done:
	kmem_free(bootargs, BOOTARGS_MAX);
	return (err);
}

static int
zone_set_brand(zone_t *zone, const char *brand)
{
	struct brand_attr *attrp;
	brand_t *bp;

	attrp = kmem_alloc(sizeof (struct brand_attr), KM_SLEEP);
	if (copyin(brand, attrp, sizeof (struct brand_attr)) != 0) {
		kmem_free(attrp, sizeof (struct brand_attr));
		return (EFAULT);
	}

	bp = brand_register_zone(attrp);
	kmem_free(attrp, sizeof (struct brand_attr));
	if (bp == NULL)
		return (EINVAL);

	/*
	 * This is the only place where a zone can change its brand.
	 * We already need to hold zone_status_lock to check the zone
	 * status, so we'll just use that lock to serialize zone
	 * branding requests as well.
	 */
	mutex_enter(&zone_status_lock);

	/* Re-branding is not allowed and the zone can't be booted yet */
	if ((ZONE_IS_BRANDED(zone)) ||
	    (zone_status_get(zone) >= ZONE_IS_BOOTING)) {
		mutex_exit(&zone_status_lock);
		brand_unregister_zone(bp);
		return (EINVAL);
	}

	if (is_system_labeled() &&
	    strncmp(attrp->ba_brandname, NATIVE_BRAND_NAME, MAXNAMELEN) != 0) {
		mutex_exit(&zone_status_lock);
		brand_unregister_zone(bp);
		return (EPERM);
	}

	/* set up the brand specific data */
	zone->zone_brand = bp;
	ZBROP(zone)->b_init_brand_data(zone);

	mutex_exit(&zone_status_lock);
	return (0);
}

static int
zone_set_initname(zone_t *zone, const char *zone_initname)
{
	char initname[INITNAME_SZ];
	size_t len;
	int err = 0;

	ASSERT(zone != global_zone);
	if ((err = copyinstr(zone_initname, initname, INITNAME_SZ, &len)) != 0)
		return (err);	/* EFAULT or ENAMETOOLONG */

	if (zone->zone_initname != NULL)
		kmem_free(zone->zone_initname, strlen(zone->zone_initname) + 1);

	zone->zone_initname = kmem_alloc(strlen(initname) + 1, KM_SLEEP);
	(void) strcpy(zone->zone_initname, initname);
	return (0);
}

static int
zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap)
{
	uint64_t mcap;
	int err = 0;

	if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0)
		zone->zone_phys_mcap = mcap;

	return (err);
}

static int
zone_set_sched_class(zone_t *zone, const char *new_class)
{
	char sched_class[PC_CLNMSZ];
	id_t classid;
	int err;

	ASSERT(zone != global_zone);
	if ((err = copyinstr(new_class, sched_class, PC_CLNMSZ, NULL)) != 0)
		return (err);	/* EFAULT or ENAMETOOLONG */

	if (getcid(sched_class, &classid) != 0 || classid == syscid)
		return (set_errno(EINVAL));
	zone->zone_defaultcid = classid;
	ASSERT(zone->zone_defaultcid > 0 &&
	    zone->zone_defaultcid < loaded_classes);

	return (0);
}

/*
 * Block indefinitely waiting for (zone_status >= status)
 */
void
zone_status_wait(zone_t *zone, zone_status_t status)
{
	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);

	mutex_enter(&zone_status_lock);
	while (zone->zone_status < status) {
		cv_wait(&zone->zone_cv, &zone_status_lock);
	}
	mutex_exit(&zone_status_lock);
}

/*
 * Private CPR-safe version of zone_status_wait().
 */
static void
zone_status_wait_cpr(zone_t *zone, zone_status_t status, char *str)
{
	callb_cpr_t cprinfo;

	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);

	CALLB_CPR_INIT(&cprinfo, &zone_status_lock, callb_generic_cpr,
	    str);
	mutex_enter(&zone_status_lock);
	while (zone->zone_status < status) {
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		cv_wait(&zone->zone_cv, &zone_status_lock);
		CALLB_CPR_SAFE_END(&cprinfo, &zone_status_lock);
	}
	/*
	 * zone_status_lock is implicitly released by the following.
	 */
	CALLB_CPR_EXIT(&cprinfo);
}

/*
 * Block until zone enters requested state or signal is received.  Return (0)
 * if signaled, non-zero otherwise.
 */
int
zone_status_wait_sig(zone_t *zone, zone_status_t status)
{
	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);

	mutex_enter(&zone_status_lock);
	while (zone->zone_status < status) {
		if (!cv_wait_sig(&zone->zone_cv, &zone_status_lock)) {
			mutex_exit(&zone_status_lock);
			return (0);
		}
	}
	mutex_exit(&zone_status_lock);
	return (1);
}
1848
1849/*
1850 * Block until the zone enters the requested state or the timeout expires,
1851 * whichever happens first.  Return (-1) if operation timed out, time remaining
1852 * otherwise.
1853 */
1854clock_t
1855zone_status_timedwait(zone_t *zone, clock_t tim, zone_status_t status)
1856{
1857	clock_t timeleft = 0;
1858
1859	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
1860
1861	mutex_enter(&zone_status_lock);
1862	while (zone->zone_status < status && timeleft != -1) {
1863		timeleft = cv_timedwait(&zone->zone_cv, &zone_status_lock, tim);
1864	}
1865	mutex_exit(&zone_status_lock);
1866	return (timeleft);
1867}
1868
1869/*
1870 * Block until the zone enters the requested state, the current process is
1871 * signaled,  or the timeout expires, whichever happens first.  Return (-1) if
1872 * operation timed out, 0 if signaled, time remaining otherwise.
1873 */
1874clock_t
1875zone_status_timedwait_sig(zone_t *zone, clock_t tim, zone_status_t status)
1876{
1877	clock_t timeleft = tim - lbolt;
1878
1879	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
1880
1881	mutex_enter(&zone_status_lock);
1882	while (zone->zone_status < status) {
1883		timeleft = cv_timedwait_sig(&zone->zone_cv, &zone_status_lock,
1884		    tim);
1885		if (timeleft <= 0)
1886			break;
1887	}
1888	mutex_exit(&zone_status_lock);
1889	return (timeleft);
1890}
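
/*
 * Illustrative sketch (not from the original source): a caller can
 * distinguish the three outcomes of zone_status_timedwait_sig(); cf.
 * zone_empty(), which retries on timeout:
 *
 *	clock_t rv;
 *
 *	rv = zone_status_timedwait_sig(zone, lbolt + hz, ZONE_IS_EMPTY);
 *	if (rv == -1) {
 *		... timed out; the zone has not emptied yet ...
 *	} else if (rv == 0) {
 *		... interrupted by a signal ...
 *	} else {
 *		... the zone reached ZONE_IS_EMPTY; rv is time left ...
 *	}
 */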
1891
1892/*
1893 * Zones have two reference counts: one for references from credential
1894 * structures (zone_cred_ref), and one (zone_ref) for everything else.
1895 * This is so we can allow a zone to be rebooted while there are still
1896 * outstanding cred references, since certain drivers cache dblks (which
1897 * implicitly results in cached creds).  We wait for zone_ref to drop to
1898 * 0 (actually 1), but not zone_cred_ref.  The zone structure itself is
1899 * later freed when the zone_cred_ref drops to 0, though nothing other
1900 * than the zone id and privilege set should be accessed once the zone
1901 * is "dead".
1902 *
1903 * A debugging flag, zone_wait_for_cred, can be set to a non-zero value
1904 * to force halt/reboot to block waiting for the zone_cred_ref to drop
1905 * to 0.  This can be useful to flush out other sources of cached creds
1906 * that may be less innocuous than the driver case.
1907 */
1908
1909int zone_wait_for_cred = 0;
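
/*
 * Illustrative sketch: both counts are manipulated with matched
 * hold/release pairs.  A typical non-cred consumer does
 *
 *	zone_hold(zone);
 *	... use zone ...
 *	zone_rele(zone);
 *
 * while credential code uses zone_cred_hold()/zone_cred_rele().
 */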
1910
1911static void
1912zone_hold_locked(zone_t *z)
1913{
1914	ASSERT(MUTEX_HELD(&z->zone_lock));
1915	z->zone_ref++;
1916	ASSERT(z->zone_ref != 0);
1917}
1918
1919void
1920zone_hold(zone_t *z)
1921{
1922	mutex_enter(&z->zone_lock);
1923	zone_hold_locked(z);
1924	mutex_exit(&z->zone_lock);
1925}
1926
1927/*
1928 * If the non-cred ref count drops to 1 and either the cred ref count
1929 * is 0 or we aren't waiting for cred references, the zone is ready to
1930 * be destroyed.
1931 */
1932#define	ZONE_IS_UNREF(zone)	((zone)->zone_ref == 1 && \
1933	    (!zone_wait_for_cred || (zone)->zone_cred_ref == 0))
1934
1935void
1936zone_rele(zone_t *z)
1937{
1938	boolean_t wakeup;
1939
1940	mutex_enter(&z->zone_lock);
1941	ASSERT(z->zone_ref != 0);
1942	z->zone_ref--;
1943	if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
1944		/* no more refs, free the structure */
1945		mutex_exit(&z->zone_lock);
1946		zone_free(z);
1947		return;
1948	}
1949	/* signal zone_destroy so the zone can finish halting */
1950	wakeup = (ZONE_IS_UNREF(z) && zone_status_get(z) >= ZONE_IS_DEAD);
1951	mutex_exit(&z->zone_lock);
1952
1953	if (wakeup) {
1954		/*
1955		 * Grabbing zonehash_lock here effectively synchronizes with
1956		 * zone_destroy() to avoid missed signals.
1957		 */
1958		mutex_enter(&zonehash_lock);
1959		cv_broadcast(&zone_destroy_cv);
1960		mutex_exit(&zonehash_lock);
1961	}
1962}
1963
1964void
1965zone_cred_hold(zone_t *z)
1966{
1967	mutex_enter(&z->zone_lock);
1968	z->zone_cred_ref++;
1969	ASSERT(z->zone_cred_ref != 0);
1970	mutex_exit(&z->zone_lock);
1971}
1972
1973void
1974zone_cred_rele(zone_t *z)
1975{
1976	boolean_t wakeup;
1977
1978	mutex_enter(&z->zone_lock);
1979	ASSERT(z->zone_cred_ref != 0);
1980	z->zone_cred_ref--;
1981	if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
1982		/* no more refs, free the structure */
1983		mutex_exit(&z->zone_lock);
1984		zone_free(z);
1985		return;
1986	}
1987	/*
1988	 * If zone_destroy is waiting for the cred references to drain
1989	 * out, and they have, signal it.
1990	 */
1991	wakeup = (zone_wait_for_cred && ZONE_IS_UNREF(z) &&
1992	    zone_status_get(z) >= ZONE_IS_DEAD);
1993	mutex_exit(&z->zone_lock);
1994
1995	if (wakeup) {
1996		/*
1997		 * Grabbing zonehash_lock here effectively synchronizes with
1998		 * zone_destroy() to avoid missed signals.
1999		 */
2000		mutex_enter(&zonehash_lock);
2001		cv_broadcast(&zone_destroy_cv);
2002		mutex_exit(&zonehash_lock);
2003	}
2004}
2005
2006void
2007zone_task_hold(zone_t *z)
2008{
2009	mutex_enter(&z->zone_lock);
2010	z->zone_ntasks++;
2011	ASSERT(z->zone_ntasks != 0);
2012	mutex_exit(&z->zone_lock);
2013}
2014
2015void
2016zone_task_rele(zone_t *zone)
2017{
2018	uint_t refcnt;
2019
2020	mutex_enter(&zone->zone_lock);
2021	ASSERT(zone->zone_ntasks != 0);
2022	refcnt = --zone->zone_ntasks;
	if (refcnt > 1) {	/* Common case */
2024		mutex_exit(&zone->zone_lock);
2025		return;
2026	}
2027	zone_hold_locked(zone);	/* so we can use the zone_t later */
2028	mutex_exit(&zone->zone_lock);
2029	if (refcnt == 1) {
2030		/*
2031		 * See if the zone is shutting down.
2032		 */
2033		mutex_enter(&zone_status_lock);
2034		if (zone_status_get(zone) != ZONE_IS_SHUTTING_DOWN) {
2035			goto out;
2036		}
2037
2038		/*
2039		 * Make sure the ntasks didn't change since we
2040		 * dropped zone_lock.
2041		 */
2042		mutex_enter(&zone->zone_lock);
2043		if (refcnt != zone->zone_ntasks) {
2044			mutex_exit(&zone->zone_lock);
2045			goto out;
2046		}
2047		mutex_exit(&zone->zone_lock);
2048
2049		/*
2050		 * No more user processes in the zone.  The zone is empty.
2051		 */
2052		zone_status_set(zone, ZONE_IS_EMPTY);
2053		goto out;
2054	}
2055
2056	ASSERT(refcnt == 0);
2057	/*
2058	 * zsched has exited; the zone is dead.
2059	 */
2060	zone->zone_zsched = NULL;		/* paranoia */
2061	mutex_enter(&zone_status_lock);
2062	zone_status_set(zone, ZONE_IS_DEAD);
2063out:
2064	mutex_exit(&zone_status_lock);
2065	zone_rele(zone);
2066}
2067
2068zoneid_t
2069getzoneid(void)
2070{
2071	return (curproc->p_zone->zone_id);
2072}
2073
2074/*
2075 * Internal versions of zone_find_by_*().  These don't zone_hold() or
2076 * check the validity of a zone's state.
2077 */
2078static zone_t *
2079zone_find_all_by_id(zoneid_t zoneid)
2080{
2081	mod_hash_val_t hv;
2082	zone_t *zone = NULL;
2083
2084	ASSERT(MUTEX_HELD(&zonehash_lock));
2085
2086	if (mod_hash_find(zonehashbyid,
2087	    (mod_hash_key_t)(uintptr_t)zoneid, &hv) == 0)
2088		zone = (zone_t *)hv;
2089	return (zone);
2090}
2091
2092static zone_t *
2093zone_find_all_by_label(const ts_label_t *label)
2094{
2095	mod_hash_val_t hv;
2096	zone_t *zone = NULL;
2097
2098	ASSERT(MUTEX_HELD(&zonehash_lock));
2099
2100	/*
2101	 * zonehashbylabel is not maintained for unlabeled systems
2102	 */
2103	if (!is_system_labeled())
2104		return (NULL);
2105	if (mod_hash_find(zonehashbylabel, (mod_hash_key_t)label, &hv) == 0)
2106		zone = (zone_t *)hv;
2107	return (zone);
2108}
2109
2110static zone_t *
2111zone_find_all_by_name(char *name)
2112{
2113	mod_hash_val_t hv;
2114	zone_t *zone = NULL;
2115
2116	ASSERT(MUTEX_HELD(&zonehash_lock));
2117
2118	if (mod_hash_find(zonehashbyname, (mod_hash_key_t)name, &hv) == 0)
2119		zone = (zone_t *)hv;
2120	return (zone);
2121}
2122
2123/*
2124 * Public interface for looking up a zone by zoneid.  Only returns the zone if
2125 * it is fully initialized, and has not yet begun the zone_destroy() sequence.
2126 * Caller must call zone_rele() once it is done with the zone.
2127 *
2128 * The zone may begin the zone_destroy() sequence immediately after this
2129 * function returns, but may be safely used until zone_rele() is called.
2130 */
2131zone_t *
2132zone_find_by_id(zoneid_t zoneid)
2133{
2134	zone_t *zone;
2135	zone_status_t status;
2136
2137	mutex_enter(&zonehash_lock);
2138	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
2139		mutex_exit(&zonehash_lock);
2140		return (NULL);
2141	}
2142	status = zone_status_get(zone);
2143	if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
2144		/*
2145		 * For all practical purposes the zone doesn't exist.
2146		 */
2147		mutex_exit(&zonehash_lock);
2148		return (NULL);
2149	}
2150	zone_hold(zone);
2151	mutex_exit(&zonehash_lock);
2152	return (zone);
2153}
2154
2155/*
2156 * Similar to zone_find_by_id, but using zone label as the key.
2157 */
2158zone_t *
2159zone_find_by_label(const ts_label_t *label)
2160{
2161	zone_t *zone;
2162	zone_status_t status;
2163
2164	mutex_enter(&zonehash_lock);
2165	if ((zone = zone_find_all_by_label(label)) == NULL) {
2166		mutex_exit(&zonehash_lock);
2167		return (NULL);
2168	}
2169
2170	status = zone_status_get(zone);
2171	if (status > ZONE_IS_DOWN) {
2172		/*
2173		 * For all practical purposes the zone doesn't exist.
2174		 */
2175		mutex_exit(&zonehash_lock);
2176		return (NULL);
2177	}
2178	zone_hold(zone);
2179	mutex_exit(&zonehash_lock);
2180	return (zone);
2181}
2182
2183/*
2184 * Similar to zone_find_by_id, but using zone name as the key.
2185 */
2186zone_t *
2187zone_find_by_name(char *name)
2188{
2189	zone_t *zone;
2190	zone_status_t status;
2191
2192	mutex_enter(&zonehash_lock);
2193	if ((zone = zone_find_all_by_name(name)) == NULL) {
2194		mutex_exit(&zonehash_lock);
2195		return (NULL);
2196	}
2197	status = zone_status_get(zone);
2198	if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
2199		/*
2200		 * For all practical purposes the zone doesn't exist.
2201		 */
2202		mutex_exit(&zonehash_lock);
2203		return (NULL);
2204	}
2205	zone_hold(zone);
2206	mutex_exit(&zonehash_lock);
2207	return (zone);
2208}
2209
2210/*
2211 * Similar to zone_find_by_id(), using the path as a key.  For instance,
2212 * if there is a zone "foo" rooted at /foo/root, and the path argument
2213 * is "/foo/root/proc", it will return the held zone_t corresponding to
2214 * zone "foo".
2215 *
2216 * zone_find_by_path() always returns a non-NULL value, since at the
2217 * very least every path will be contained in the global zone.
2218 *
2219 * As with the other zone_find_by_*() functions, the caller is
2220 * responsible for zone_rele()ing the return value of this function.
2221 */
2222zone_t *
2223zone_find_by_path(const char *path)
2224{
2225	zone_t *zone;
2226	zone_t *zret = NULL;
2227	zone_status_t status;
2228
2229	if (path == NULL) {
2230		/*
2231		 * Call from rootconf().
2232		 */
2233		zone_hold(global_zone);
2234		return (global_zone);
2235	}
2236	ASSERT(*path == '/');
2237	mutex_enter(&zonehash_lock);
2238	for (zone = list_head(&zone_active); zone != NULL;
2239	    zone = list_next(&zone_active, zone)) {
2240		if (ZONE_PATH_VISIBLE(path, zone))
2241			zret = zone;
2242	}
2243	ASSERT(zret != NULL);
2244	status = zone_status_get(zret);
2245	if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
2246		/*
2247		 * Zone practically doesn't exist.
2248		 */
2249		zret = global_zone;
2250	}
2251	zone_hold(zret);
2252	mutex_exit(&zonehash_lock);
2253	return (zret);
2254}
2255
2256/*
2257 * Get the number of cpus visible to this zone.  The system-wide global
2258 * 'ncpus' is returned if pools are disabled, the caller is in the
2259 * global zone, or a NULL zone argument is passed in.
2260 */
2261int
2262zone_ncpus_get(zone_t *zone)
2263{
2264	int myncpus = zone == NULL ? 0 : zone->zone_ncpus;
2265
2266	return (myncpus != 0 ? myncpus : ncpus);
2267}
2268
2269/*
2270 * Get the number of online cpus visible to this zone.  The system-wide
2271 * global 'ncpus_online' is returned if pools are disabled, the caller
2272 * is in the global zone, or a NULL zone argument is passed in.
2273 */
2274int
2275zone_ncpus_online_get(zone_t *zone)
2276{
2277	int myncpus_online = zone == NULL ? 0 : zone->zone_ncpus_online;
2278
2279	return (myncpus_online != 0 ? myncpus_online : ncpus_online);
2280}
2281
2282/*
2283 * Return the pool to which the zone is currently bound.
2284 */
2285pool_t *
2286zone_pool_get(zone_t *zone)
2287{
2288	ASSERT(pool_lock_held());
2289
2290	return (zone->zone_pool);
2291}
2292
2293/*
2294 * Set the zone's pool pointer and update the zone's visibility to match
2295 * the resources in the new pool.
2296 */
2297void
2298zone_pool_set(zone_t *zone, pool_t *pool)
2299{
2300	ASSERT(pool_lock_held());
2301	ASSERT(MUTEX_HELD(&cpu_lock));
2302
2303	zone->zone_pool = pool;
2304	zone_pset_set(zone, pool->pool_pset->pset_id);
2305}
2306
2307/*
2308 * Return the cached value of the id of the processor set to which the
2309 * zone is currently bound.  The value will be ZONE_PS_INVAL if the pools
2310 * facility is disabled.
2311 */
2312psetid_t
2313zone_pset_get(zone_t *zone)
2314{
2315	ASSERT(MUTEX_HELD(&cpu_lock));
2316
2317	return (zone->zone_psetid);
2318}
2319
2320/*
2321 * Set the cached value of the id of the processor set to which the zone
2322 * is currently bound.  Also update the zone's visibility to match the
2323 * resources in the new processor set.
2324 */
2325void
2326zone_pset_set(zone_t *zone, psetid_t newpsetid)
2327{
2328	psetid_t oldpsetid;
2329
2330	ASSERT(MUTEX_HELD(&cpu_lock));
2331	oldpsetid = zone_pset_get(zone);
2332
2333	if (oldpsetid == newpsetid)
2334		return;
2335	/*
2336	 * Global zone sees all.
2337	 */
2338	if (zone != global_zone) {
2339		zone->zone_psetid = newpsetid;
2340		if (newpsetid != ZONE_PS_INVAL)
2341			pool_pset_visibility_add(newpsetid, zone);
2342		if (oldpsetid != ZONE_PS_INVAL)
2343			pool_pset_visibility_remove(oldpsetid, zone);
2344	}
2345	/*
	 * If pools are being disabled, start using the global values
	 * for ncpus and ncpus_online.
2348	 */
2349	if (newpsetid == ZONE_PS_INVAL) {
2350		zone->zone_ncpus = 0;
2351		zone->zone_ncpus_online = 0;
2352	}
2353}
2354
2355/*
2356 * Walk the list of active zones and issue the provided callback for
2357 * each of them.
2358 *
2359 * Caller must not be holding any locks that may be acquired under
2360 * zonehash_lock.  See comment at the beginning of the file for a list of
2361 * common locks and their interactions with zones.
2362 */
2363int
2364zone_walk(int (*cb)(zone_t *, void *), void *data)
2365{
2366	zone_t *zone;
2367	int ret = 0;
2368	zone_status_t status;
2369
2370	mutex_enter(&zonehash_lock);
2371	for (zone = list_head(&zone_active); zone != NULL;
2372	    zone = list_next(&zone_active, zone)) {
2373		/*
2374		 * Skip zones that shouldn't be externally visible.
2375		 */
2376		status = zone_status_get(zone);
2377		if (status < ZONE_IS_READY || status > ZONE_IS_DOWN)
2378			continue;
2379		/*
2380		 * Bail immediately if any callback invocation returns a
2381		 * non-zero value.
2382		 */
2383		ret = (*cb)(zone, data);
2384		if (ret != 0)
2385			break;
2386	}
2387	mutex_exit(&zonehash_lock);
2388	return (ret);
2389}
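
/*
 * Illustrative sketch (hypothetical callback, not from the original
 * source): counting the externally visible zones with zone_walk():
 *
 *	static int
 *	zone_count_cb(zone_t *zone, void *arg)
 *	{
 *		(*(uint_t *)arg)++;
 *		return (0);
 *	}
 *
 *	uint_t nzones = 0;
 *	(void) zone_walk(zone_count_cb, &nzones);
 *
 * Returning non-zero from the callback stops the walk and propagates
 * that value back to the caller.
 */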
2390
2391static int
2392zone_set_root(zone_t *zone, const char *upath)
2393{
2394	vnode_t *vp;
2395	int trycount;
2396	int error = 0;
2397	char *path;
2398	struct pathname upn, pn;
2399	size_t pathlen;
2400
2401	if ((error = pn_get((char *)upath, UIO_USERSPACE, &upn)) != 0)
2402		return (error);
2403
2404	pn_alloc(&pn);
2405
2406	/* prevent infinite loop */
2407	trycount = 10;
2408	for (;;) {
2409		if (--trycount <= 0) {
2410			error = ESTALE;
2411			goto out;
2412		}
2413
2414		if ((error = lookuppn(&upn, &pn, FOLLOW, NULLVPP, &vp)) == 0) {
2415			/*
2416			 * VOP_ACCESS() may cover 'vp' with a new
2417			 * filesystem, if 'vp' is an autoFS vnode.
2418			 * Get the new 'vp' if so.
2419			 */
2420			if ((error = VOP_ACCESS(vp, VEXEC, 0, CRED())) == 0 &&
2421			    (!vn_ismntpt(vp) ||
2422			    (error = traverse(&vp)) == 0)) {
2423				pathlen = pn.pn_pathlen + 2;
2424				path = kmem_alloc(pathlen, KM_SLEEP);
2425				(void) strncpy(path, pn.pn_path,
2426				    pn.pn_pathlen + 1);
2427				path[pathlen - 2] = '/';
2428				path[pathlen - 1] = '\0';
2429				pn_free(&pn);
2430				pn_free(&upn);
2431
2432				/* Success! */
2433				break;
2434			}
2435			VN_RELE(vp);
2436		}
2437		if (error != ESTALE)
2438			goto out;
2439	}
2440
2441	ASSERT(error == 0);
2442	zone->zone_rootvp = vp;		/* we hold a reference to vp */
2443	zone->zone_rootpath = path;
2444	zone->zone_rootpathlen = pathlen;
2445	if (pathlen > 5 && strcmp(path + pathlen - 5, "/lu/") == 0)
2446		zone->zone_flags |= ZF_IS_SCRATCH;
2447	return (0);
2448
2449out:
2450	pn_free(&pn);
2451	pn_free(&upn);
2452	return (error);
2453}
2454
2455#define	isalnum(c)	(((c) >= '0' && (c) <= '9') || \
2456			((c) >= 'a' && (c) <= 'z') || \
2457			((c) >= 'A' && (c) <= 'Z'))
2458
2459static int
2460zone_set_name(zone_t *zone, const char *uname)
2461{
2462	char *kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
2463	size_t len;
2464	int i, err;
2465
2466	if ((err = copyinstr(uname, kname, ZONENAME_MAX, &len)) != 0) {
2467		kmem_free(kname, ZONENAME_MAX);
2468		return (err);	/* EFAULT or ENAMETOOLONG */
2469	}
2470
2471	/* must be less than ZONENAME_MAX */
2472	if (len == ZONENAME_MAX && kname[ZONENAME_MAX - 1] != '\0') {
2473		kmem_free(kname, ZONENAME_MAX);
2474		return (EINVAL);
2475	}
2476
2477	/*
2478	 * Name must start with an alphanumeric and must contain only
2479	 * alphanumerics, '-', '_' and '.'.
2480	 */
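	/*
	 * For example (illustrative names only), "web01" and "a-b_c.1"
	 * are accepted, while "-zone" and "my zone" are rejected.
	 */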
2481	if (!isalnum(kname[0])) {
2482		kmem_free(kname, ZONENAME_MAX);
2483		return (EINVAL);
2484	}
2485	for (i = 1; i < len - 1; i++) {
2486		if (!isalnum(kname[i]) && kname[i] != '-' && kname[i] != '_' &&
2487		    kname[i] != '.') {
2488			kmem_free(kname, ZONENAME_MAX);
2489			return (EINVAL);
2490		}
2491	}
2492
2493	zone->zone_name = kname;
2494	return (0);
2495}
2496
2497/*
2498 * Similar to thread_create(), but makes sure the thread is in the appropriate
2499 * zone's zsched process (curproc->p_zone->zone_zsched) before returning.
2500 */
2501/*ARGSUSED*/
2502kthread_t *
2503zthread_create(
2504    caddr_t stk,
2505    size_t stksize,
2506    void (*proc)(),
2507    void *arg,
2508    size_t len,
2509    pri_t pri)
2510{
2511	kthread_t *t;
2512	zone_t *zone = curproc->p_zone;
2513	proc_t *pp = zone->zone_zsched;
2514
2515	zone_hold(zone);	/* Reference to be dropped when thread exits */
2516
2517	/*
	 * No one should be trying to create threads if the zone is shutting
2519	 * down and there aren't any kernel threads around.  See comment
2520	 * in zthread_exit().
2521	 */
2522	ASSERT(!(zone->zone_kthreads == NULL &&
2523	    zone_status_get(zone) >= ZONE_IS_EMPTY));
2524	/*
2525	 * Create a thread, but don't let it run until we've finished setting
2526	 * things up.
2527	 */
2528	t = thread_create(stk, stksize, proc, arg, len, pp, TS_STOPPED, pri);
2529	ASSERT(t->t_forw == NULL);
2530	mutex_enter(&zone_status_lock);
2531	if (zone->zone_kthreads == NULL) {
2532		t->t_forw = t->t_back = t;
2533	} else {
2534		kthread_t *tx = zone->zone_kthreads;
2535
2536		t->t_forw = tx;
2537		t->t_back = tx->t_back;
2538		tx->t_back->t_forw = t;
2539		tx->t_back = t;
2540	}
2541	zone->zone_kthreads = t;
2542	mutex_exit(&zone_status_lock);
2543
2544	mutex_enter(&pp->p_lock);
2545	t->t_proc_flag |= TP_ZTHREAD;
2546	project_rele(t->t_proj);
2547	t->t_proj = project_hold(pp->p_task->tk_proj);
2548
2549	/*
2550	 * Setup complete, let it run.
2551	 */
2552	thread_lock(t);
2553	t->t_schedflag |= TS_ALLSTART;
2554	setrun_locked(t);
2555	thread_unlock(t);
2556
2557	mutex_exit(&pp->p_lock);
2558
2559	return (t);
2560}
2561
2562/*
2563 * Similar to thread_exit().  Must be called by threads created via
 * zthread_create().
2565 */
2566void
2567zthread_exit(void)
2568{
2569	kthread_t *t = curthread;
2570	proc_t *pp = curproc;
2571	zone_t *zone = pp->p_zone;
2572
2573	mutex_enter(&zone_status_lock);
2574
2575	/*
2576	 * Reparent to p0
2577	 */
2578	kpreempt_disable();
2579	mutex_enter(&pp->p_lock);
2580	t->t_proc_flag &= ~TP_ZTHREAD;
2581	t->t_procp = &p0;
2582	hat_thread_exit(t);
2583	mutex_exit(&pp->p_lock);
2584	kpreempt_enable();
2585
2586	if (t->t_back == t) {
2587		ASSERT(t->t_forw == t);
2588		/*
2589		 * If the zone is empty, once the thread count
2590		 * goes to zero no further kernel threads can be
2591		 * created.  This is because if the creator is a process
2592		 * in the zone, then it must have exited before the zone
2593		 * state could be set to ZONE_IS_EMPTY.
2594		 * Otherwise, if the creator is a kernel thread in the
2595		 * zone, the thread count is non-zero.
2596		 *
2597		 * This really means that non-zone kernel threads should
2598		 * not create zone kernel threads.
2599		 */
2600		zone->zone_kthreads = NULL;
2601		if (zone_status_get(zone) == ZONE_IS_EMPTY) {
2602			zone_status_set(zone, ZONE_IS_DOWN);
2603			/*
2604			 * Remove any CPU caps on this zone.
2605			 */
2606			cpucaps_zone_remove(zone);
2607		}
2608	} else {
2609		t->t_forw->t_back = t->t_back;
2610		t->t_back->t_forw = t->t_forw;
2611		if (zone->zone_kthreads == t)
2612			zone->zone_kthreads = t->t_forw;
2613	}
2614	mutex_exit(&zone_status_lock);
2615	zone_rele(zone);
2616	thread_exit();
2617	/* NOTREACHED */
2618}
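
/*
 * Illustrative sketch (hypothetical worker, not from the original
 * source): a per-zone kernel thread created with zthread_create()
 * terminates itself with zthread_exit():
 *
 *	static void
 *	my_zone_worker(void *arg)
 *	{
 *		... per-zone work ...
 *		zthread_exit();
 *	}
 *
 *	(void) zthread_create(NULL, 0, my_zone_worker, arg, 0,
 *	    minclsyspri);
 */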
2619
2620static void
2621zone_chdir(vnode_t *vp, vnode_t **vpp, proc_t *pp)
2622{
2623	vnode_t *oldvp;
2624
2625	/* we're going to hold a reference here to the directory */
2626	VN_HOLD(vp);
2627
2628#ifdef C2_AUDIT
2629	if (audit_active)	/* update abs cwd/root path see c2audit.c */
2630		audit_chdirec(vp, vpp);
2631#endif
2632
2633	mutex_enter(&pp->p_lock);
2634	oldvp = *vpp;
2635	*vpp = vp;
2636	mutex_exit(&pp->p_lock);
2637	if (oldvp != NULL)
2638		VN_RELE(oldvp);
2639}
2640
2641/*
2642 * Convert an rctl value represented by an nvlist_t into an rctl_val_t.
2643 */
2644static int
2645nvlist2rctlval(nvlist_t *nvl, rctl_val_t *rv)
2646{
2647	nvpair_t *nvp = NULL;
2648	boolean_t priv_set = B_FALSE;
2649	boolean_t limit_set = B_FALSE;
2650	boolean_t action_set = B_FALSE;
2651
2652	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
2653		const char *name;
2654		uint64_t ui64;
2655
2656		name = nvpair_name(nvp);
2657		if (nvpair_type(nvp) != DATA_TYPE_UINT64)
2658			return (EINVAL);
2659		(void) nvpair_value_uint64(nvp, &ui64);
2660		if (strcmp(name, "privilege") == 0) {
2661			/*
2662			 * Currently only privileged values are allowed, but
2663			 * this may change in the future.
2664			 */
2665			if (ui64 != RCPRIV_PRIVILEGED)
2666				return (EINVAL);
2667			rv->rcv_privilege = ui64;
2668			priv_set = B_TRUE;
2669		} else if (strcmp(name, "limit") == 0) {
2670			rv->rcv_value = ui64;
2671			limit_set = B_TRUE;
2672		} else if (strcmp(name, "action") == 0) {
2673			if (ui64 != RCTL_LOCAL_NOACTION &&
2674			    ui64 != RCTL_LOCAL_DENY)
2675				return (EINVAL);
2676			rv->rcv_flagaction = ui64;
2677			action_set = B_TRUE;
2678		} else {
2679			return (EINVAL);
2680		}
2681	}
2682
2683	if (!(priv_set && limit_set && action_set))
2684		return (EINVAL);
2685	rv->rcv_action_signal = 0;
2686	rv->rcv_action_recipient = NULL;
2687	rv->rcv_action_recip_pid = -1;
2688	rv->rcv_firing_time = 0;
2689
2690	return (0);
2691}
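
/*
 * Illustrative sketch (userland view, not from the original source):
 * an nvlist accepted by nvlist2rctlval() carries exactly the three
 * uint64 pairs named above, e.g. built with libnvpair as
 *
 *	nvlist_t *nvl;
 *
 *	(void) nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
 *	(void) nvlist_add_uint64(nvl, "privilege", RCPRIV_PRIVILEGED);
 *	(void) nvlist_add_uint64(nvl, "limit", 1000);
 *	(void) nvlist_add_uint64(nvl, "action", RCTL_LOCAL_DENY);
 *
 * Omitting any of the three pairs, or adding any other pair, yields
 * EINVAL.
 */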
2692
2693/*
2694 * Non-global zone version of start_init.
2695 */
2696void
2697zone_start_init(void)
2698{
2699	proc_t *p = ttoproc(curthread);
2700	zone_t *z = p->p_zone;
2701
2702	ASSERT(!INGLOBALZONE(curproc));
2703
2704	/*
2705	 * For all purposes (ZONE_ATTR_INITPID and restart_init),
2706	 * storing just the pid of init is sufficient.
2707	 */
2708	z->zone_proc_initpid = p->p_pid;
2709
2710	/*
2711	 * We maintain zone_boot_err so that we can return the cause of the
2712	 * failure back to the caller of the zone_boot syscall.
2713	 */
2714	p->p_zone->zone_boot_err = start_init_common();
2715
2716	mutex_enter(&zone_status_lock);
2717	if (z->zone_boot_err != 0) {
2718		/*
2719		 * Make sure we are still in the booting state-- we could have
2720		 * raced and already be shutting down, or even further along.
2721		 */
2722		if (zone_status_get(z) == ZONE_IS_BOOTING) {
2723			zone_status_set(z, ZONE_IS_SHUTTING_DOWN);
2724		}
2725		mutex_exit(&zone_status_lock);
2726		/* It's gone bad, dispose of the process */
2727		if (proc_exit(CLD_EXITED, z->zone_boot_err) != 0) {
2728			mutex_enter(&p->p_lock);
2729			ASSERT(p->p_flag & SEXITLWPS);
2730			lwp_exit();
2731		}
2732	} else {
2733		if (zone_status_get(z) == ZONE_IS_BOOTING)
2734			zone_status_set(z, ZONE_IS_RUNNING);
2735		mutex_exit(&zone_status_lock);
2736		/* cause the process to return to userland. */
2737		lwp_rtt();
2738	}
2739}
2740
2741struct zsched_arg {
2742	zone_t *zone;
2743	nvlist_t *nvlist;
2744};
2745
2746/*
2747 * Per-zone "sched" workalike.  The similarity to "sched" doesn't have
2748 * anything to do with scheduling, but rather with the fact that
2749 * per-zone kernel threads are parented to zsched, just like regular
2750 * kernel threads are parented to sched (p0).
2751 *
2752 * zsched is also responsible for launching init for the zone.
2753 */
2754static void
2755zsched(void *arg)
2756{
2757	struct zsched_arg *za = arg;
2758	proc_t *pp = curproc;
2759	proc_t *initp = proc_init;
2760	zone_t *zone = za->zone;
2761	cred_t *cr, *oldcred;
2762	rctl_set_t *set;
2763	rctl_alloc_gp_t *gp;
2764	contract_t *ct = NULL;
2765	task_t *tk, *oldtk;
2766	rctl_entity_p_t e;
2767	kproject_t *pj;
2768
2769	nvlist_t *nvl = za->nvlist;
2770	nvpair_t *nvp = NULL;
2771
2772	bcopy("zsched", PTOU(pp)->u_psargs, sizeof ("zsched"));
2773	bcopy("zsched", PTOU(pp)->u_comm, sizeof ("zsched"));
2774	PTOU(pp)->u_argc = 0;
2775	PTOU(pp)->u_argv = NULL;
2776	PTOU(pp)->u_envp = NULL;
2777	closeall(P_FINFO(pp));
2778
2779	/*
2780	 * We are this zone's "zsched" process.  As the zone isn't generally
2781	 * visible yet we don't need to grab any locks before initializing its
2782	 * zone_proc pointer.
2783	 */
2784	zone_hold(zone);  /* this hold is released by zone_destroy() */
2785	zone->zone_zsched = pp;
2786	mutex_enter(&pp->p_lock);
2787	pp->p_zone = zone;
2788	mutex_exit(&pp->p_lock);
2789
2790	/*
2791	 * Disassociate process from its 'parent'; parent ourselves to init
2792	 * (pid 1) and change other values as needed.
2793	 */
2794	sess_create();
2795
2796	mutex_enter(&pidlock);
2797	proc_detach(pp);
2798	pp->p_ppid = 1;
2799	pp->p_flag |= SZONETOP;
2800	pp->p_ancpid = 1;
2801	pp->p_parent = initp;
2802	pp->p_psibling = NULL;
2803	if (initp->p_child)
2804		initp->p_child->p_psibling = pp;
2805	pp->p_sibling = initp->p_child;
2806	initp->p_child = pp;
2807
2808	/* Decrement what newproc() incremented. */
2809	upcount_dec(crgetruid(CRED()), GLOBAL_ZONEID);
2810	/*
2811	 * Our credentials are about to become kcred-like, so we don't care
2812	 * about the caller's ruid.
2813	 */
2814	upcount_inc(crgetruid(kcred), zone->zone_id);
2815	mutex_exit(&pidlock);
2816
2817	/*
2818	 * getting out of global zone, so decrement lwp counts
2819	 */
2820	pj = pp->p_task->tk_proj;
2821	mutex_enter(&global_zone->zone_nlwps_lock);
2822	pj->kpj_nlwps -= pp->p_lwpcnt;
2823	global_zone->zone_nlwps -= pp->p_lwpcnt;
2824	mutex_exit(&global_zone->zone_nlwps_lock);
2825
2826	/*
2827	 * Decrement locked memory counts on old zone and project.
2828	 */
2829	mutex_enter(&global_zone->zone_mem_lock);
2830	global_zone->zone_locked_mem -= pp->p_locked_mem;
2831	pj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
2832	mutex_exit(&global_zone->zone_mem_lock);
2833
2834	/*
2835	 * Create and join a new task in project '0' of this zone.
2836	 *
2837	 * We don't need to call holdlwps() since we know we're the only lwp in
2838	 * this process.
2839	 *
2840	 * task_join() returns with p_lock held.
2841	 */
2842	tk = task_create(0, zone);
2843	mutex_enter(&cpu_lock);
2844	oldtk = task_join(tk, 0);
2845
2846	pj = pp->p_task->tk_proj;
2847
2848	mutex_enter(&zone->zone_mem_lock);
2849	zone->zone_locked_mem += pp->p_locked_mem;
2850	pj->kpj_data.kpd_locked_mem += pp->p_locked_mem;
2851	mutex_exit(&zone->zone_mem_lock);
2852
2853	/*
	 * Add lwp counts to zsched's zone, and increment the project's
	 * task count due to the task created by task_create() above.
2856	 */
2857
2858	mutex_enter(&zone->zone_nlwps_lock);
2859	pj->kpj_nlwps += pp->p_lwpcnt;
2860	pj->kpj_ntasks += 1;
2861	zone->zone_nlwps += pp->p_lwpcnt;
2862	mutex_exit(&zone->zone_nlwps_lock);
2863
2864	mutex_exit(&curproc->p_lock);
2865	mutex_exit(&cpu_lock);
2866	task_rele(oldtk);
2867
2868	/*
2869	 * The process was created by a process in the global zone, hence the
2870	 * credentials are wrong.  We might as well have kcred-ish credentials.
2871	 */
2872	cr = zone->zone_kcred;
2873	crhold(cr);
2874	mutex_enter(&pp->p_crlock);
2875	oldcred = pp->p_cred;
2876	pp->p_cred = cr;
2877	mutex_exit(&pp->p_crlock);
2878	crfree(oldcred);
2879
2880	/*
2881	 * Hold credentials again (for thread)
2882	 */
2883	crhold(cr);
2884
2885	/*
2886	 * p_lwpcnt can't change since this is a kernel process.
2887	 */
2888	crset(pp, cr);
2889
2890	/*
2891	 * Chroot
2892	 */
2893	zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_cdir, pp);
2894	zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_rdir, pp);
2895
2896	/*
2897	 * Initialize zone's rctl set.
2898	 */
2899	set = rctl_set_create();
2900	gp = rctl_set_init_prealloc(RCENTITY_ZONE);
2901	mutex_enter(&pp->p_lock);
2902	e.rcep_p.zone = zone;
2903	e.rcep_t = RCENTITY_ZONE;
2904	zone->zone_rctls = rctl_set_init(RCENTITY_ZONE, pp, &e, set, gp);
2905	mutex_exit(&pp->p_lock);
2906	rctl_prealloc_destroy(gp);
2907
2908	/*
2909	 * Apply the rctls passed in to zone_create().  This is basically a list
2910	 * assignment: all of the old values are removed and the new ones
2911	 * inserted.  That is, if an empty list is passed in, all values are
2912	 * removed.
2913	 */
2914	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
2915		rctl_dict_entry_t *rde;
2916		rctl_hndl_t hndl;
2917		char *name;
2918		nvlist_t **nvlarray;
2919		uint_t i, nelem;
2920		int error;	/* For ASSERT()s */
2921
2922		name = nvpair_name(nvp);
2923		hndl = rctl_hndl_lookup(name);
2924		ASSERT(hndl != -1);
2925		rde = rctl_dict_lookup_hndl(hndl);
2926		ASSERT(rde != NULL);
2927
2928		for (; /* ever */; ) {
2929			rctl_val_t oval;
2930
2931			mutex_enter(&pp->p_lock);
2932			error = rctl_local_get(hndl, NULL, &oval, pp);
2933			mutex_exit(&pp->p_lock);
2934			ASSERT(error == 0);	/* Can't fail for RCTL_FIRST */
2935			ASSERT(oval.rcv_privilege != RCPRIV_BASIC);
2936			if (oval.rcv_privilege == RCPRIV_SYSTEM)
2937				break;
2938			mutex_enter(&pp->p_lock);
2939			error = rctl_local_delete(hndl, &oval, pp);
2940			mutex_exit(&pp->p_lock);
2941			ASSERT(error == 0);
2942		}
2943		error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
2944		ASSERT(error == 0);
2945		for (i = 0; i < nelem; i++) {
2946			rctl_val_t *nvalp;
2947
2948			nvalp = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
2949			error = nvlist2rctlval(nvlarray[i], nvalp);
2950			ASSERT(error == 0);
2951			/*
2952			 * rctl_local_insert can fail if the value being
2953			 * inserted is a duplicate; this is OK.
2954			 */
2955			mutex_enter(&pp->p_lock);
2956			if (rctl_local_insert(hndl, nvalp, pp) != 0)
2957				kmem_cache_free(rctl_val_cache, nvalp);
2958			mutex_exit(&pp->p_lock);
2959		}
2960	}
2961	/*
2962	 * Tell the world that we're done setting up.
2963	 *
2964	 * At this point we want to set the zone status to ZONE_IS_READY
2965	 * and atomically set the zone's processor set visibility.  Once
2966	 * we drop pool_lock() this zone will automatically get updated
2967	 * to reflect any future changes to the pools configuration.
2968	 */
2969	pool_lock();
2970	mutex_enter(&cpu_lock);
2971	mutex_enter(&zonehash_lock);
2972	zone_uniqid(zone);
2973	zone_zsd_configure(zone);
2974	if (pool_state == POOL_ENABLED)
2975		zone_pset_set(zone, pool_default->pool_pset->pset_id);
2976	mutex_enter(&zone_status_lock);
2977	ASSERT(zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
2978	zone_status_set(zone, ZONE_IS_READY);
2979	mutex_exit(&zone_status_lock);
2980	mutex_exit(&zonehash_lock);
2981	mutex_exit(&cpu_lock);
2982	pool_unlock();
2983
2984	/*
2985	 * Once we see the zone transition to the ZONE_IS_BOOTING state,
2986	 * we launch init, and set the state to running.
2987	 */
2988	zone_status_wait_cpr(zone, ZONE_IS_BOOTING, "zsched");
2989
2990	if (zone_status_get(zone) == ZONE_IS_BOOTING) {
2991		id_t cid;
2992
2993		/*
2994		 * Ok, this is a little complicated.  We need to grab the
2995		 * zone's pool's scheduling class ID; note that by now, we
2996		 * are already bound to a pool if we need to be (zoneadmd
2997		 * will have done that to us while we're in the READY
2998		 * state).  *But* the scheduling class for the zone's 'init'
2999		 * must be explicitly passed to newproc, which doesn't
3000		 * respect pool bindings.
3001		 *
3002		 * We hold the pool_lock across the call to newproc() to
3003		 * close the obvious race: the pool's scheduling class
3004		 * could change before we manage to create the LWP with
3005		 * classid 'cid'.
3006		 */
3007		pool_lock();
3008		if (zone->zone_defaultcid > 0)
3009			cid = zone->zone_defaultcid;
3010		else
3011			cid = pool_get_class(zone->zone_pool);
3012		if (cid == -1)
3013			cid = defaultcid;
3014
3015		/*
3016		 * If this fails, zone_boot will ultimately fail.  The
		 * state of the zone will be set to SHUTTING_DOWN; userland
		 * will have to tear down the zone and either fail or try
		 * again.
3019		 */
3020		if ((zone->zone_boot_err = newproc(zone_start_init, NULL, cid,
3021		    minclsyspri - 1, &ct)) != 0) {
3022			mutex_enter(&zone_status_lock);
3023			zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
3024			mutex_exit(&zone_status_lock);
3025		}
3026		pool_unlock();
3027	}
3028
3029	/*
3030	 * Wait for zone_destroy() to be called.  This is what we spend
3031	 * most of our life doing.
3032	 */
3033	zone_status_wait_cpr(zone, ZONE_IS_DYING, "zsched");
3034
3035	if (ct)
3036		/*
3037		 * At this point the process contract should be empty.
3038		 * (Though if it isn't, it's not the end of the world.)
3039		 */
3040		VERIFY(contract_abandon(ct, curproc, B_TRUE) == 0);
3041
3042	/*
3043	 * Allow kcred to be freed when all referring processes
3044	 * (including this one) go away.  We can't just do this in
3045	 * zone_free because we need to wait for the zone_cred_ref to
3046	 * drop to 0 before calling zone_free, and the existence of
3047	 * zone_kcred will prevent that.  Thus, we call crfree here to
3048	 * balance the crdup in zone_create.  The crhold calls earlier
3049	 * in zsched will be dropped when the thread and process exit.
3050	 */
3051	crfree(zone->zone_kcred);
3052	zone->zone_kcred = NULL;
3053
3054	exit(CLD_EXITED, 0);
3055}
3056
3057/*
3058 * Helper function to determine if there are any submounts of the
3059 * provided path.  Used to make sure the zone doesn't "inherit" any
3060 * mounts from before it is created.
3061 */
3062static uint_t
3063zone_mount_count(const char *rootpath)
3064{
3065	vfs_t *vfsp;
3066	uint_t count = 0;
3067	size_t rootpathlen = strlen(rootpath);
3068
3069	/*
3070	 * Holding zonehash_lock prevents race conditions with
3071	 * vfs_list_add()/vfs_list_remove() since we serialize with
3072	 * zone_find_by_path().
3073	 */
3074	ASSERT(MUTEX_HELD(&zonehash_lock));
3075	/*
3076	 * The rootpath must end with a '/'
3077	 */
3078	ASSERT(rootpath[rootpathlen - 1] == '/');
3079
3080	/*
3081	 * This intentionally does not count the rootpath itself if that
3082	 * happens to be a mount point.
3083	 */
3084	vfs_list_read_lock();
3085	vfsp = rootvfs;
3086	do {
3087		if (strncmp(rootpath, refstr_value(vfsp->vfs_mntpt),
3088		    rootpathlen) == 0)
3089			count++;
3090		vfsp = vfsp->vfs_next;
3091	} while (vfsp != rootvfs);
3092	vfs_list_unlock();
3093	return (count);
3094}
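
/*
 * For example (illustrative paths), with rootpath "/export/z1/root/",
 * mounts at /export/z1/root/proc and /export/z1/root/tmp count as two
 * submounts, while a mount at /export/z1/root itself lacks the
 * trailing '/' in its mountpoint and so is not counted.
 */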
3095
3096/*
3097 * Helper function to make sure that a zone created on 'rootpath'
3098 * wouldn't end up containing other zones' rootpaths.
3099 */
3100static boolean_t
3101zone_is_nested(const char *rootpath)
3102{
3103	zone_t *zone;
3104	size_t rootpathlen = strlen(rootpath);
3105	size_t len;
3106
3107	ASSERT(MUTEX_HELD(&zonehash_lock));
3108
3109	for (zone = list_head(&zone_active); zone != NULL;
3110	    zone = list_next(&zone_active, zone)) {
3111		if (zone == global_zone)
3112			continue;
3113		len = strlen(zone->zone_rootpath);
3114		if (strncmp(rootpath, zone->zone_rootpath,
3115		    MIN(rootpathlen, len)) == 0)
3116			return (B_TRUE);
3117	}
3118	return (B_FALSE);
3119}
3120
3121static int
3122zone_set_privset(zone_t *zone, const priv_set_t *zone_privs,
3123    size_t zone_privssz)
3124{
	priv_set_t *privs;

	if (zone_privssz < sizeof (priv_set_t))
		return (set_errno(ENOMEM));

	privs = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
3129
3130	if (copyin(zone_privs, privs, sizeof (priv_set_t))) {
3131		kmem_free(privs, sizeof (priv_set_t));
3132		return (EFAULT);
3133	}
3134
3135	zone->zone_privset = privs;
3136	return (0);
3137}
3138
3139/*
3140 * We make creative use of nvlists to pass in rctls from userland.  The list is
3141 * a list of the following structures:
3142 *
3143 * (name = rctl_name, value = nvpair_list_array)
3144 *
3145 * Where each element of the nvpair_list_array is of the form:
3146 *
3147 * [(name = "privilege", value = RCPRIV_PRIVILEGED),
3148 * 	(name = "limit", value = uint64_t),
3149 * 	(name = "action", value = (RCTL_LOCAL_NOACTION || RCTL_LOCAL_DENY))]
3150 */
3151static int
3152parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp)
3153{
3154	nvpair_t *nvp = NULL;
3155	nvlist_t *nvl = NULL;
3156	char *kbuf;
3157	int error;
3158	rctl_val_t rv;
3159
3160	*nvlp = NULL;
3161
3162	if (buflen == 0)
3163		return (0);
3164
3165	if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
3166		return (ENOMEM);
3167	if (copyin(ubuf, kbuf, buflen)) {
3168		error = EFAULT;
3169		goto out;
3170	}
3171	if (nvlist_unpack(kbuf, buflen, &nvl, KM_SLEEP) != 0) {
3172		/*
		 * nvlist_unpack() may have allocated and freed nvl, leaving
		 * the pointer set to a stale non-NULL value, so we reset it
		 * here.
3175		 */
3176		nvl = NULL;
3177		error = EINVAL;
3178		goto out;
3179	}
3180	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
3181		rctl_dict_entry_t *rde;
3182		rctl_hndl_t hndl;
3183		nvlist_t **nvlarray;
3184		uint_t i, nelem;
3185		char *name;
3186
3187		error = EINVAL;
3188		name = nvpair_name(nvp);
3189		if (strncmp(nvpair_name(nvp), "zone.", sizeof ("zone.") - 1)
3190		    != 0 || nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) {
3191			goto out;
3192		}
3193		if ((hndl = rctl_hndl_lookup(name)) == -1) {
3194			goto out;
3195		}
3196		rde = rctl_dict_lookup_hndl(hndl);
3197		error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
3198		ASSERT(error == 0);
3199		for (i = 0; i < nelem; i++) {
3200			if (error = nvlist2rctlval(nvlarray[i], &rv))
3201				goto out;
3202		}
3203		if (rctl_invalid_value(rde, &rv)) {
3204			error = EINVAL;
3205			goto out;
3206		}
3207	}
3208	error = 0;
3209	*nvlp = nvl;
3210out:
3211	kmem_free(kbuf, buflen);
3212	if (error && nvl != NULL)
3213		nvlist_free(nvl);
3214	return (error);
3215}
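
/*
 * Illustrative sketch (hypothetical userland caller): the buffer parsed
 * above would be assembled and packed roughly as
 *
 *	nvlist_t *outer, *val;
 *	char *buf = NULL;
 *	size_t buflen = 0;
 *
 *	... build 'val' with the three uint64 pairs sketched near
 *	... nvlist2rctlval() above ...
 *	(void) nvlist_alloc(&outer, NV_UNIQUE_NAME, 0);
 *	(void) nvlist_add_nvlist_array(outer, "zone.max-lwps", &val, 1);
 *	(void) nvlist_pack(outer, &buf, &buflen, NV_ENCODE_NATIVE, 0);
 *
 * and then passed to zone_create() as the (rctlbuf, rctlbufsz) pair.
 * "zone.max-lwps" is just an example of a zone rctl name.
 */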
3216
3217int
zone_create_error(int er_error, int er_ext, int *er_out)
{
3219	if (er_out != NULL) {
3220		if (copyout(&er_ext, er_out, sizeof (int))) {
3221			return (set_errno(EFAULT));
3222		}
3223	}
3224	return (set_errno(er_error));
3225}
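
/*
 * For example (illustrative), zone_create_error(EBUSY, ZE_AREMOUNTS,
 * extended_error) copies ZE_AREMOUNTS out to the user (when
 * extended_error is non-NULL) and fails the call with errno EBUSY.
 */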
3226
3227static int
3228zone_set_label(zone_t *zone, const bslabel_t *lab, uint32_t doi)
3229{
3230	ts_label_t *tsl;
3231	bslabel_t blab;
3232
3233	/* Get label from user */
3234	if (copyin(lab, &blab, sizeof (blab)) != 0)
3235		return (EFAULT);
3236	tsl = labelalloc(&blab, doi, KM_NOSLEEP);
3237	if (tsl == NULL)
3238		return (ENOMEM);
3239
3240	zone->zone_slabel = tsl;
3241	return (0);
3242}
3243
3244/*
3245 * Parses a comma-separated list of ZFS datasets into a per-zone dictionary.
3246 */
3247static int
3248parse_zfs(zone_t *zone, caddr_t ubuf, size_t buflen)
3249{
3250	char *kbuf;
3251	char *dataset, *next;
3252	zone_dataset_t *zd;
3253	size_t len;
3254
3255	if (ubuf == NULL || buflen == 0)
3256		return (0);
3257
3258	if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
3259		return (ENOMEM);
3260
3261	if (copyin(ubuf, kbuf, buflen) != 0) {
3262		kmem_free(kbuf, buflen);
3263		return (EFAULT);
3264	}
3265
3266	dataset = next = kbuf;
3267	for (;;) {
3268		zd = kmem_alloc(sizeof (zone_dataset_t), KM_SLEEP);
3269
3270		next = strchr(dataset, ',');
3271
3272		if (next == NULL)
3273			len = strlen(dataset);
3274		else
3275			len = next - dataset;
3276
3277		zd->zd_dataset = kmem_alloc(len + 1, KM_SLEEP);
3278		bcopy(dataset, zd->zd_dataset, len);
3279		zd->zd_dataset[len] = '\0';
3280
3281		list_insert_head(&zone->zone_datasets, zd);
3282
3283		if (next == NULL)
3284			break;
3285
3286		dataset = next + 1;
3287	}
3288
3289	kmem_free(kbuf, buflen);
3290	return (0);
3291}
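
/*
 * For example (illustrative dataset names), the buffer "tank/a,tank/b"
 * yields two zone_dataset_t entries, "tank/a" and "tank/b", on the
 * zone_datasets list.
 */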
3292
3293/*
3294 * System call to create/initialize a new zone named 'zone_name', rooted
 * at 'zone_root', with a zone-wide privilege limit set of 'zone_privs',
 * initialized with the zone-wide rctls described in 'rctlbuf', granted
 * access to the ZFS datasets listed in 'zfsbuf', and labeled according
 * to 'match', 'doi', and 'label'.
3298 *
3299 * If extended error is non-null, we may use it to return more detailed
3300 * error information.
3301 */
3302static zoneid_t
3303zone_create(const char *zone_name, const char *zone_root,
3304    const priv_set_t *zone_privs, size_t zone_privssz,
3305    caddr_t rctlbuf, size_t rctlbufsz,
3306    caddr_t zfsbuf, size_t zfsbufsz, int *extended_error,
3307    int match, uint32_t doi, const bslabel_t *label,
3308    int flags)
3309{
3310	struct zsched_arg zarg;
3311	nvlist_t *rctls = NULL;
3312	proc_t *pp = curproc;
3313	zone_t *zone, *ztmp;
3314	zoneid_t zoneid;
3315	int error;
3316	int error2 = 0;
3317	char *str;
3318	cred_t *zkcr;
3319	boolean_t insert_label_hash;
3320
3321	if (secpolicy_zone_config(CRED()) != 0)
3322		return (set_errno(EPERM));
3323
	/* can't create a zone from within a chroot environment */
3325	if (PTOU(pp)->u_rdir != NULL && PTOU(pp)->u_rdir != rootdir)
3326		return (zone_create_error(ENOTSUP, ZE_CHROOTED,
3327		    extended_error));
3328
3329	zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP);
3330	zoneid = zone->zone_id = id_alloc(zoneid_space);
3331	zone->zone_status = ZONE_IS_UNINITIALIZED;
3332	zone->zone_pool = pool_default;
3333	zone->zone_pool_mod = gethrtime();
3334	zone->zone_psetid = ZONE_PS_INVAL;
3335	zone->zone_ncpus = 0;
3336	zone->zone_ncpus_online = 0;
3337	zone->zone_restart_init = B_TRUE;
3338	zone->zone_brand = &native_brand;
3339	zone->zone_initname = NULL;
3340	mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL);
3341	mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
3342	mutex_init(&zone->zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
3343	cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL);
3344	list_create(&zone->zone_zsd, sizeof (struct zsd_entry),
3345	    offsetof(struct zsd_entry, zsd_linkage));
3346	list_create(&zone->zone_datasets, sizeof (zone_dataset_t),
3347	    offsetof(zone_dataset_t, zd_linkage));
3348	rw_init(&zone->zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
3349
3350	if (flags & ZCF_NET_EXCL) {
3351		zone->zone_flags |= ZF_NET_EXCL;
3352	}
3353
3354	if ((error = zone_set_name(zone, zone_name)) != 0) {
3355		zone_free(zone);
3356		return (zone_create_error(error, 0, extended_error));
3357	}
3358
3359	if ((error = zone_set_root(zone, zone_root)) != 0) {
3360		zone_free(zone);
3361		return (zone_create_error(error, 0, extended_error));
3362	}
3363	if ((error = zone_set_privset(zone, zone_privs, zone_privssz)) != 0) {
3364		zone_free(zone);
3365		return (zone_create_error(error, 0, extended_error));
3366	}
3367
3368	/* initialize node name to be the same as zone name */
3369	zone->zone_nodename = kmem_alloc(_SYS_NMLN, KM_SLEEP);
3370	(void) strncpy(zone->zone_nodename, zone->zone_name, _SYS_NMLN);
3371	zone->zone_nodename[_SYS_NMLN - 1] = '\0';
3372
3373	zone->zone_domain = kmem_alloc(_SYS_NMLN, KM_SLEEP);
3374	zone->zone_domain[0] = '\0';
3375	zone->zone_shares = 1;
3376	zone->zone_shmmax = 0;
3377	zone->zone_ipc.ipcq_shmmni = 0;
3378	zone->zone_ipc.ipcq_semmni = 0;
3379	zone->zone_ipc.ipcq_msgmni = 0;
3380	zone->zone_bootargs = NULL;
3381	zone->zone_initname =
3382	    kmem_alloc(strlen(zone_default_initname) + 1, KM_SLEEP);
3383	(void) strcpy(zone->zone_initname, zone_default_initname);
3384	zone->zone_nlwps = 0;
3385	zone->zone_nlwps_ctl = INT_MAX;
3386	zone->zone_locked_mem = 0;
3387	zone->zone_locked_mem_ctl = UINT64_MAX;
3388	zone->zone_max_swap = 0;
3389	zone->zone_max_swap_ctl = UINT64_MAX;
	zone->zone_lockedmem_kstat = NULL;
	zone->zone_swapresv_kstat = NULL;
3392
3393	/*
3394	 * Zsched initializes the rctls.
3395	 */
3396	zone->zone_rctls = NULL;
3397
3398	if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) {
3399		zone_free(zone);
3400		return (zone_create_error(error, 0, extended_error));
3401	}
3402
3403	if ((error = parse_zfs(zone, zfsbuf, zfsbufsz)) != 0) {
3404		zone_free(zone);
3405		return (set_errno(error));
3406	}
3407
3408	/*
3409	 * Read in the trusted system parameters:
3410	 * match flag and sensitivity label.
3411	 */
3412	zone->zone_match = match;
3413	if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
3414		/* Fail if requested to set doi to anything but system's doi */
3415		if (doi != 0 && doi != default_doi) {
3416			zone_free(zone);
3417			return (set_errno(EINVAL));
3418		}
3419		/* Always apply system's doi to the zone */
3420		error = zone_set_label(zone, label, default_doi);
3421		if (error != 0) {
3422			zone_free(zone);
3423			return (set_errno(error));
3424		}
3425		insert_label_hash = B_TRUE;
3426	} else {
3427		/* all zones get an admin_low label if system is not labeled */
3428		zone->zone_slabel = l_admin_low;
3429		label_hold(l_admin_low);
3430		insert_label_hash = B_FALSE;
3431	}
3432
3433	/*
3434	 * Stop all lwps since that's what normally happens as part of fork().
3435	 * This needs to happen before we grab any locks to avoid deadlock
3436	 * (another lwp in the process could be waiting for the held lock).
3437	 */
3438	if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK)) {
3439		zone_free(zone);
3440		if (rctls)
3441			nvlist_free(rctls);
		return (zone_create_error(EINTR, 0, extended_error));
3443	}
3444
3445	if (block_mounts() == 0) {
3446		mutex_enter(&pp->p_lock);
3447		if (curthread != pp->p_agenttp)
3448			continuelwps(pp);
3449		mutex_exit(&pp->p_lock);
3450		zone_free(zone);
3451		if (rctls)
3452			nvlist_free(rctls);
		return (zone_create_error(EINTR, 0, extended_error));
3454	}
3455
3456	/*
3457	 * Set up credential for kernel access.  After this, any errors
3458	 * should go through the dance in errout rather than calling
3459	 * zone_free directly.
3460	 */
3461	zone->zone_kcred = crdup(kcred);
3462	crsetzone(zone->zone_kcred, zone);
3463	priv_intersect(zone->zone_privset, &CR_PPRIV(zone->zone_kcred));
3464	priv_intersect(zone->zone_privset, &CR_EPRIV(zone->zone_kcred));
3465	priv_intersect(zone->zone_privset, &CR_IPRIV(zone->zone_kcred));
3466	priv_intersect(zone->zone_privset, &CR_LPRIV(zone->zone_kcred));
3467
3468	mutex_enter(&zonehash_lock);
3469	/*
3470	 * Make sure zone doesn't already exist.
3471	 *
3472	 * If the system and zone are labeled,
3473	 * make sure no other zone exists that has the same label.
3474	 */
3475	if ((ztmp = zone_find_all_by_name(zone->zone_name)) != NULL ||
3476	    (insert_label_hash &&
3477	    (ztmp = zone_find_all_by_label(zone->zone_slabel)) != NULL)) {
3478		zone_status_t status;
3479
3480		status = zone_status_get(ztmp);
3481		if (status == ZONE_IS_READY || status == ZONE_IS_RUNNING)
3482			error = EEXIST;
3483		else
3484			error = EBUSY;
3485
3486		if (insert_label_hash)
3487			error2 = ZE_LABELINUSE;
3488
3489		goto errout;
3490	}
3491
3492	/*
3493	 * Don't allow zone creations which would cause one zone's rootpath to
3494	 * be accessible from that of another (non-global) zone.
3495	 */
3496	if (zone_is_nested(zone->zone_rootpath)) {
3497		error = EBUSY;
3498		goto errout;
3499	}
3500
3501	ASSERT(zonecount != 0);		/* check for leaks */
3502	if (zonecount + 1 > maxzones) {
3503		error = ENOMEM;
3504		goto errout;
3505	}
3506
3507	if (zone_mount_count(zone->zone_rootpath) != 0) {
3508		error = EBUSY;
3509		error2 = ZE_AREMOUNTS;
3510		goto errout;
3511	}
3512
3513	/*
3514	 * Zone is still incomplete, but we need to drop all locks while
3515	 * zsched() initializes this zone's kernel process.  We
3516	 * optimistically add the zone to the hashtable and associated
3517	 * lists so a parallel zone_create() doesn't try to create the
3518	 * same zone.
3519	 */
3520	zonecount++;
3521	(void) mod_hash_insert(zonehashbyid,
3522	    (mod_hash_key_t)(uintptr_t)zone->zone_id,
3523	    (mod_hash_val_t)(uintptr_t)zone);
3524	str = kmem_alloc(strlen(zone->zone_name) + 1, KM_SLEEP);
3525	(void) strcpy(str, zone->zone_name);
3526	(void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)str,
3527	    (mod_hash_val_t)(uintptr_t)zone);
3528	if (insert_label_hash) {
3529		(void) mod_hash_insert(zonehashbylabel,
3530		    (mod_hash_key_t)zone->zone_slabel, (mod_hash_val_t)zone);
3531		zone->zone_flags |= ZF_HASHED_LABEL;
3532	}
3533
3534	/*
3535	 * Insert into active list.  At this point there are no 'hold's
3536	 * on the zone, but everyone else knows not to use it, so we can
3537	 * continue to use it.  zsched() will do a zone_hold() if the
3538	 * newproc() is successful.
3539	 */
3540	list_insert_tail(&zone_active, zone);
3541	mutex_exit(&zonehash_lock);
3542
3543	zarg.zone = zone;
3544	zarg.nvlist = rctls;
3545	/*
3546	 * The process, task, and project rctls are probably wrong;
3547	 * we need an interface to get the default values of all rctls,
3548	 * and initialize zsched appropriately.  I'm not sure that that
3549	 * makes much of a difference, though.
3550	 */
3551	if (error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL)) {
3552		/*
3553		 * We need to undo all globally visible state.
3554		 */
3555		mutex_enter(&zonehash_lock);
3556		list_remove(&zone_active, zone);
3557		if (zone->zone_flags & ZF_HASHED_LABEL) {
3558			ASSERT(zone->zone_slabel != NULL);
3559			(void) mod_hash_destroy(zonehashbylabel,
3560			    (mod_hash_key_t)zone->zone_slabel);
3561		}
3562		(void) mod_hash_destroy(zonehashbyname,
3563		    (mod_hash_key_t)(uintptr_t)zone->zone_name);
3564		(void) mod_hash_destroy(zonehashbyid,
3565		    (mod_hash_key_t)(uintptr_t)zone->zone_id);
3566		ASSERT(zonecount > 1);
3567		zonecount--;
3568		goto errout;
3569	}
3570
3571	/*
3572	 * Zone creation can't fail from now on.
3573	 */
3574
3575	/*
3576	 * Create zone kstats
3577	 */
3578	zone_kstat_create(zone);
3579
3580	/*
3581	 * Let the other lwps continue.
3582	 */
3583	mutex_enter(&pp->p_lock);
3584	if (curthread != pp->p_agenttp)
3585		continuelwps(pp);
3586	mutex_exit(&pp->p_lock);
3587
3588	/*
3589	 * Wait for zsched to finish initializing the zone.
3590	 */
3591	zone_status_wait(zone, ZONE_IS_READY);
3592	/*
3593	 * The zone is fully visible, so we can let mounts progress.
3594	 */
3595	resume_mounts();
3596	if (rctls)
3597		nvlist_free(rctls);
3598
3599	return (zoneid);
3600
3601errout:
3602	mutex_exit(&zonehash_lock);
3603	/*
3604	 * Let the other lwps continue.
3605	 */
3606	mutex_enter(&pp->p_lock);
3607	if (curthread != pp->p_agenttp)
3608		continuelwps(pp);
3609	mutex_exit(&pp->p_lock);
3610
3611	resume_mounts();
3612	if (rctls)
3613		nvlist_free(rctls);
3614	/*
3615	 * There is currently one reference to the zone, a cred_ref from
3616	 * zone_kcred.  To free the zone, we call crfree, which will call
3617	 * zone_cred_rele, which will call zone_free.
3618	 */
3619	ASSERT(zone->zone_cred_ref == 1);	/* for zone_kcred */
3620	ASSERT(zone->zone_kcred->cr_ref == 1);
3621	ASSERT(zone->zone_ref == 0);
3622	zkcr = zone->zone_kcred;
3623	zone->zone_kcred = NULL;
3624	crfree(zkcr);				/* triggers call to zone_free */
3625	return (zone_create_error(error, error2, extended_error));
3626}
3627
3628/*
 * Cause the zone to boot.  This is pretty simple, since we let zoneadmd do
 * the heavy lifting.  The program launched at the "top" of the zone is
 * zone_initname (settable via zone_set_initname()), which defaults to
 * zone_default_initname.
3633 */
3634static int
3635zone_boot(zoneid_t zoneid)
3636{
3637	int err;
3638	zone_t *zone;
3639
3640	if (secpolicy_zone_config(CRED()) != 0)
3641		return (set_errno(EPERM));
3642	if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
3643		return (set_errno(EINVAL));
3644
3645	mutex_enter(&zonehash_lock);
3646	/*
3647	 * Look for zone under hash lock to prevent races with calls to
3648	 * zone_shutdown, zone_destroy, etc.
3649	 */
3650	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
3651		mutex_exit(&zonehash_lock);
3652		return (set_errno(EINVAL));
3653	}
3654
3655	mutex_enter(&zone_status_lock);
3656	if (zone_status_get(zone) != ZONE_IS_READY) {
3657		mutex_exit(&zone_status_lock);
3658		mutex_exit(&zonehash_lock);
3659		return (set_errno(EINVAL));
3660	}
3661	zone_status_set(zone, ZONE_IS_BOOTING);
3662	mutex_exit(&zone_status_lock);
3663
3664	zone_hold(zone);	/* so we can use the zone_t later */
3665	mutex_exit(&zonehash_lock);
3666
3667	if (zone_status_wait_sig(zone, ZONE_IS_RUNNING) == 0) {
3668		zone_rele(zone);
3669		return (set_errno(EINTR));
3670	}
3671
3672	/*
3673	 * Boot (starting init) might have failed, in which case the zone
3674	 * will go to the SHUTTING_DOWN state; an appropriate errno will
3675	 * be placed in zone->zone_boot_err, and so we return that.
3676	 */
3677	err = zone->zone_boot_err;
3678	zone_rele(zone);
3679	return (err ? set_errno(err) : 0);
3680}
3681
3682/*
3683 * Kills all user processes in the zone, waiting for them all to exit
3684 * before returning.
3685 */
3686static int
3687zone_empty(zone_t *zone)
3688{
3689	int waitstatus;
3690
3691	/*
3692	 * We need to drop zonehash_lock before killing all
3693	 * processes, otherwise we'll deadlock with zone_find_*
3694	 * which can be called from the exit path.
3695	 */
3696	ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
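	/*
	 * zone_status_timedwait_sig() follows the cv_timedwait_sig()
	 * convention: -1 on timeout, 0 if a signal was taken, > 0 once
	 * the status has been reached.  So we re-kill once a second
	 * (lbolt + hz) until the zone is empty or we're interrupted.
	 */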
3697	while ((waitstatus = zone_status_timedwait_sig(zone, lbolt + hz,
3698	    ZONE_IS_EMPTY)) == -1) {
3699		killall(zone->zone_id);
3700	}
3701	/*
3702	 * return EINTR if we were signaled
3703	 */
3704	if (waitstatus == 0)
3705		return (EINTR);
3706	return (0);
3707}
3708
3709/*
3710 * This function implements the policy for zone visibility.
3711 *
3712 * In standard Solaris, a non-global zone can only see itself.
3713 *
3714 * In Trusted Extensions, a labeled zone can lookup any zone whose label
3715 * it dominates. For this test, the label of the global zone is treated as
3716 * admin_high so it is special-cased instead of being checked for dominance.
3717 *
3718 * Returns true if zone attributes are viewable, false otherwise.
3719 */
3720static boolean_t
3721zone_list_access(zone_t *zone)
3722{
3723
3724	if (curproc->p_zone == global_zone ||
3725	    curproc->p_zone == zone) {
3726		return (B_TRUE);
3727	} else if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
3728		bslabel_t *curproc_label;
3729		bslabel_t *zone_label;
3730
3731		curproc_label = label2bslabel(curproc->p_zone->zone_slabel);
3732		zone_label = label2bslabel(zone->zone_slabel);
3733
3734		if (zone->zone_id != GLOBAL_ZONEID &&
3735		    bldominates(curproc_label, zone_label)) {
3736			return (B_TRUE);
3737		} else {
3738			return (B_FALSE);
3739		}
3740	} else {
3741		return (B_FALSE);
3742	}
3743}
3744
3745/*
3746 * Systemcall to start the zone's halt sequence.  By the time this
3747 * function successfully returns, all user processes and kernel threads
3748 * executing in it will have exited, ZSD shutdown callbacks executed,
3749 * and the zone status set to ZONE_IS_DOWN.
3750 *
3751 * It is possible that the call will interrupt itself if the caller is the
3752 * parent of any process running in the zone, and doesn't have SIGCHLD blocked.
3753 */
3754static int
3755zone_shutdown(zoneid_t zoneid)
3756{
3757	int error;
3758	zone_t *zone;
3759	zone_status_t status;
3760
3761	if (secpolicy_zone_config(CRED()) != 0)
3762		return (set_errno(EPERM));
3763	if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
3764		return (set_errno(EINVAL));
3765
3766	/*
3767	 * Block mounts so that VFS_MOUNT() can get an accurate view of
3768	 * the zone's status with regard to ZONE_IS_SHUTTING_DOWN.
3769	 *
3770	 * e.g. NFS can fail the mount if it determines that the zone
3771	 * has already begun the shutdown sequence.
3772	 */
3773	if (block_mounts() == 0)
3774		return (set_errno(EINTR));
3775	mutex_enter(&zonehash_lock);
3776	/*
3777	 * Look for zone under hash lock to prevent races with other
3778	 * calls to zone_shutdown and zone_destroy.
3779	 */
3780	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
3781		mutex_exit(&zonehash_lock);
3782		resume_mounts();
3783		return (set_errno(EINVAL));
3784	}
3785	mutex_enter(&zone_status_lock);
3786	status = zone_status_get(zone);
3787	/*
3788	 * Fail if the zone isn't fully initialized yet.
3789	 */
3790	if (status < ZONE_IS_READY) {
3791		mutex_exit(&zone_status_lock);
3792		mutex_exit(&zonehash_lock);
3793		resume_mounts();
3794		return (set_errno(EINVAL));
3795	}
3796	/*
3797	 * If conditions required for zone_shutdown() to return have been met,
3798	 * return success.
3799	 */
3800	if (status >= ZONE_IS_DOWN) {
3801		mutex_exit(&zone_status_lock);
3802		mutex_exit(&zonehash_lock);
3803		resume_mounts();
3804		return (0);
3805	}
3806	/*
3807	 * If zone_shutdown() hasn't been called before, go through the motions.
3808	 * If it has, there's nothing to do but wait for the kernel threads to
3809	 * drain.
3810	 */
3811	if (status < ZONE_IS_EMPTY) {
3812		uint_t ntasks;
3813
3814		mutex_enter(&zone->zone_lock);
3815		if ((ntasks = zone->zone_ntasks) != 1) {
3816			/*
3817			 * There's still stuff running.
3818			 */
3819			zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
3820		}
3821		mutex_exit(&zone->zone_lock);
3822		if (ntasks == 1) {
3823			/*
3824			 * The only way to create another task is through
3825			 * zone_enter(), which will block until we drop
3826			 * zonehash_lock.  The zone is empty.
3827			 */
3828			if (zone->zone_kthreads == NULL) {
3829				/*
3830				 * Skip ahead to ZONE_IS_DOWN
3831				 */
3832				zone_status_set(zone, ZONE_IS_DOWN);
3833			} else {
3834				zone_status_set(zone, ZONE_IS_EMPTY);
3835			}
3836		}
3837	}
3838	zone_hold(zone);	/* so we can use the zone_t later */
3839	mutex_exit(&zone_status_lock);
3840	mutex_exit(&zonehash_lock);
3841	resume_mounts();
3842
3843	if (error = zone_empty(zone)) {
3844		zone_rele(zone);
3845		return (set_errno(error));
3846	}
3847	/*
3848	 * After the zone status goes to ZONE_IS_DOWN this zone will no
3849	 * longer be notified of changes to the pools configuration, so
3850	 * in order to not end up with a stale pool pointer, we point
3851	 * ourselves at the default pool and remove all resource
3852	 * visibility.  This is especially important as the zone_t may
3853	 * languish on the deathrow for a very long time waiting for
3854	 * cred's to drain out.
3855	 *
3856	 * This rebinding of the zone can happen multiple times
3857	 * (presumably due to interrupted or parallel systemcalls)
3858	 * without any adverse effects.
3859	 */
3860	if (pool_lock_intr() != 0) {
3861		zone_rele(zone);
3862		return (set_errno(EINTR));
3863	}
3864	if (pool_state == POOL_ENABLED) {
3865		mutex_enter(&cpu_lock);
3866		zone_pool_set(zone, pool_default);
3867		/*
3868		 * The zone no longer needs to be able to see any cpus.
3869		 */
3870		zone_pset_set(zone, ZONE_PS_INVAL);
3871		mutex_exit(&cpu_lock);
3872	}
3873	pool_unlock();
3874
3875	/*
3876	 * ZSD shutdown callbacks can be executed multiple times, hence
3877	 * it is safe to not be holding any locks across this call.
3878	 */
3879	zone_zsd_callbacks(zone, ZSD_SHUTDOWN);
3880
3881	mutex_enter(&zone_status_lock);
3882	if (zone->zone_kthreads == NULL && zone_status_get(zone) < ZONE_IS_DOWN)
3883		zone_status_set(zone, ZONE_IS_DOWN);
3884	mutex_exit(&zone_status_lock);
3885
3886	/*
3887	 * Wait for kernel threads to drain.
3888	 */
3889	if (!zone_status_wait_sig(zone, ZONE_IS_DOWN)) {
3890		zone_rele(zone);
3891		return (set_errno(EINTR));
3892	}
3893
3894	/*
3895	 * The zone can become down/destroyable even if the above wait
3896	 * returns EINTR, so any code added here may never execute.
3897	 * (i.e. don't add code here)
3898	 */
3899
3900	zone_rele(zone);
3901	return (0);
3902}
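/*
 * Example: because zone_shutdown() is restartable, a userland caller can
 * simply retry on EINTR until the zone reaches ZONE_IS_DOWN.  A sketch,
 * assuming the private libc wrapper declared in <zone.h>:
 */
#if 0	/* illustrative sketch, not compiled */
#include <zone.h>
#include <errno.h>

static int
shut_down_zone(zoneid_t zid)
{
	while (zone_shutdown(zid) != 0) {
		if (errno != EINTR)
			return (-1);	/* EPERM, EINVAL, ... */
		/* restartable: the next call resumes where we left off */
	}
	return (0);
}
#endif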
3903
3904/*
3905 * Systemcall entry point to finalize the zone halt process.  The caller
3906 * must have already successfully called zone_shutdown().
3907 *
3908 * Upon successful completion, the zone will have been fully destroyed:
3909 * zsched will have exited, destructor callbacks executed, and the zone
3910 * removed from the list of active zones.
3911 */
3912static int
3913zone_destroy(zoneid_t zoneid)
3914{
3915	uint64_t uniqid;
3916	zone_t *zone;
3917	zone_status_t status;
3918
3919	if (secpolicy_zone_config(CRED()) != 0)
3920		return (set_errno(EPERM));
3921	if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
3922		return (set_errno(EINVAL));
3923
3924	mutex_enter(&zonehash_lock);
3925	/*
3926	 * Look for zone under hash lock to prevent races with other
3927	 * calls to zone_destroy.
3928	 */
3929	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
3930		mutex_exit(&zonehash_lock);
3931		return (set_errno(EINVAL));
3932	}
3933
3934	if (zone_mount_count(zone->zone_rootpath) != 0) {
3935		mutex_exit(&zonehash_lock);
3936		return (set_errno(EBUSY));
3937	}
3938	mutex_enter(&zone_status_lock);
3939	status = zone_status_get(zone);
3940	if (status < ZONE_IS_DOWN) {
3941		mutex_exit(&zone_status_lock);
3942		mutex_exit(&zonehash_lock);
3943		return (set_errno(EBUSY));
3944	} else if (status == ZONE_IS_DOWN) {
3945		zone_status_set(zone, ZONE_IS_DYING); /* Tell zsched to exit */
3946	}
3947	mutex_exit(&zone_status_lock);
3948	zone_hold(zone);
3949	mutex_exit(&zonehash_lock);
3950
3951	/*
3952	 * wait for zsched to exit
3953	 */
3954	zone_status_wait(zone, ZONE_IS_DEAD);
3955	zone_zsd_callbacks(zone, ZSD_DESTROY);
3956	zone->zone_netstack = NULL;
3957	uniqid = zone->zone_uniqid;
3958	zone_rele(zone);
3959	zone = NULL;	/* potentially free'd */
3960
3961	mutex_enter(&zonehash_lock);
3962	for (; /* ever */; ) {
3963		boolean_t unref;
3964
3965		if ((zone = zone_find_all_by_id(zoneid)) == NULL ||
3966		    zone->zone_uniqid != uniqid) {
3967			/*
3968			 * The zone has gone away.  Necessary conditions
3969			 * are met, so we return success.
3970			 */
3971			mutex_exit(&zonehash_lock);
3972			return (0);
3973		}
3974		mutex_enter(&zone->zone_lock);
3975		unref = ZONE_IS_UNREF(zone);
3976		mutex_exit(&zone->zone_lock);
3977		if (unref) {
3978			/*
3979			 * There is only one reference to the zone -- that
3980			 * added when the zone was added to the hashtables --
3981			 * and things will remain this way until we drop
3982			 * zonehash_lock... we can go ahead and cleanup the
3983			 * zone.
3984			 */
3985			break;
3986		}
3987
3988		if (cv_wait_sig(&zone_destroy_cv, &zonehash_lock) == 0) {
3989			/* Signaled */
3990			mutex_exit(&zonehash_lock);
3991			return (set_errno(EINTR));
3992		}
3993
3994	}
3995
3996	/*
3997	 * Remove CPU cap for this zone now since we're not going to
3998	 * fail below this point.
3999	 */
4000	cpucaps_zone_remove(zone);
4001
4002	/* Get rid of the zone's kstats */
4003	zone_kstat_delete(zone);
4004
4005	/* free brand specific data */
4006	if (ZONE_IS_BRANDED(zone))
4007		ZBROP(zone)->b_free_brand_data(zone);
4008
4009	/* Say goodbye to brand framework. */
4010	brand_unregister_zone(zone->zone_brand);
4011
4012	/*
4013	 * It is now safe to let the zone be recreated; remove it from the
4014	 * lists.  The memory will not be freed until the last cred
4015	 * reference goes away.
4016	 */
4017	ASSERT(zonecount > 1);	/* must be > 1; can't destroy global zone */
4018	zonecount--;
4019	/* remove from active list and hash tables */
4020	list_remove(&zone_active, zone);
4021	(void) mod_hash_destroy(zonehashbyname,
4022	    (mod_hash_key_t)zone->zone_name);
4023	(void) mod_hash_destroy(zonehashbyid,
4024	    (mod_hash_key_t)(uintptr_t)zone->zone_id);
4025	if (zone->zone_flags & ZF_HASHED_LABEL)
4026		(void) mod_hash_destroy(zonehashbylabel,
4027		    (mod_hash_key_t)zone->zone_slabel);
4028	mutex_exit(&zonehash_lock);
4029
4030	/*
4031	 * Release the root vnode; we're not using it anymore, and no other
4032	 * thread that might access it should exist by now.
4033	 */
4034	if (zone->zone_rootvp != NULL) {
4035		VN_RELE(zone->zone_rootvp);
4036		zone->zone_rootvp = NULL;
4037	}
4038
4039	/* add to deathrow list */
4040	mutex_enter(&zone_deathrow_lock);
4041	list_insert_tail(&zone_deathrow, zone);
4042	mutex_exit(&zone_deathrow_lock);
4043
4044	/*
4045	 * Drop last reference (which was added by zsched()), this will
4046	 * free the zone unless there are outstanding cred references.
4047	 */
4048	zone_rele(zone);
4049	return (0);
4050}
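/*
 * Example: the full teardown sequence a zoneadmd-like consumer performs,
 * assuming the private libc wrappers declared in <zone.h>.  Note the
 * mount-count check above: zone_destroy() fails with EBUSY while
 * anything remains mounted under the zone root.
 */
#if 0	/* illustrative sketch, not compiled */
#include <zone.h>
#include <errno.h>

static int
halt_zone(zoneid_t zid)
{
	while (zone_shutdown(zid) != 0) {
		if (errno != EINTR)
			return (-1);
	}
	/* ... unmount everything under the zone root here ... */
	while (zone_destroy(zid) != 0) {
		if (errno == EINTR)
			continue;	/* interrupted wait; just retry */
		return (-1);		/* e.g. EBUSY: still mounted */
	}
	return (0);
}
#endif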
4051
4052/*
4053 * Systemcall entry point for zone_getattr(2).
4054 */
4055static ssize_t
4056zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
4057{
4058	size_t size;
4059	int error = 0, err;
4060	zone_t *zone;
4061	char *zonepath;
4062	char *outstr;
4063	zone_status_t zone_status;
4064	pid_t initpid;
4065	boolean_t global = (curzone == global_zone);
4066	boolean_t inzone = (curzone->zone_id == zoneid);
4067	uint_t flags;
4068
4069	mutex_enter(&zonehash_lock);
4070	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
4071		mutex_exit(&zonehash_lock);
4072		return (set_errno(EINVAL));
4073	}
4074	zone_status = zone_status_get(zone);
4075	if (zone_status < ZONE_IS_READY) {
4076		mutex_exit(&zonehash_lock);
4077		return (set_errno(EINVAL));
4078	}
4079	zone_hold(zone);
4080	mutex_exit(&zonehash_lock);
4081
4082	/*
4083	 * If not in the global zone, don't show information about other zones,
4084	 * unless the system is labeled and the local zone's label dominates
4085	 * the other zone.
4086	 */
4087	if (!zone_list_access(zone)) {
4088		zone_rele(zone);
4089		return (set_errno(EINVAL));
4090	}
4091
4092	switch (attr) {
4093	case ZONE_ATTR_ROOT:
4094		if (global) {
4095			/*
4096			 * Copy the path to trim the trailing "/" (except for
4097			 * the global zone).
4098			 */
4099			if (zone != global_zone)
4100				size = zone->zone_rootpathlen - 1;
4101			else
4102				size = zone->zone_rootpathlen;
4103			zonepath = kmem_alloc(size, KM_SLEEP);
4104			bcopy(zone->zone_rootpath, zonepath, size);
4105			zonepath[size - 1] = '\0';
4106		} else {
4107			if (inzone || !is_system_labeled()) {
4108				/*
4109				 * Caller is not in the global zone.
4110				 * if the query is on the current zone
4111				 * or the system is not labeled,
4112				 * just return faked-up path for current zone.
4113				 */
4114				zonepath = "/";
4115				size = 2;
4116			} else {
4117				/*
4118				 * Return related path for current zone.
4119				 */
4120				int prefix_len = strlen(zone_prefix);
4121				int zname_len = strlen(zone->zone_name);
4122
4123				size = prefix_len + zname_len + 1;
4124				zonepath = kmem_alloc(size, KM_SLEEP);
4125				bcopy(zone_prefix, zonepath, prefix_len);
4126				bcopy(zone->zone_name, zonepath +
4127				    prefix_len, zname_len);
4128				zonepath[size - 1] = '\0';
4129			}
4130		}
4131		if (bufsize > size)
4132			bufsize = size;
4133		if (buf != NULL) {
4134			err = copyoutstr(zonepath, buf, bufsize, NULL);
4135			if (err != 0 && err != ENAMETOOLONG)
4136				error = EFAULT;
4137		}
4138		if (global || (is_system_labeled() && !inzone))
4139			kmem_free(zonepath, size);
4140		break;
4141
4142	case ZONE_ATTR_NAME:
4143		size = strlen(zone->zone_name) + 1;
4144		if (bufsize > size)
4145			bufsize = size;
4146		if (buf != NULL) {
4147			err = copyoutstr(zone->zone_name, buf, bufsize, NULL);
4148			if (err != 0 && err != ENAMETOOLONG)
4149				error = EFAULT;
4150		}
4151		break;
4152
4153	case ZONE_ATTR_STATUS:
4154		/*
4155		 * Since we're not holding zonehash_lock, the zone status
4156		 * may be anything; leave it up to userland to sort it out.
4157		 */
4158		size = sizeof (zone_status);
4159		if (bufsize > size)
4160			bufsize = size;
4161		zone_status = zone_status_get(zone);
4162		if (buf != NULL &&
4163		    copyout(&zone_status, buf, bufsize) != 0)
4164			error = EFAULT;
4165		break;
4166	case ZONE_ATTR_FLAGS:
4167		size = sizeof (zone->zone_flags);
4168		if (bufsize > size)
4169			bufsize = size;
4170		flags = zone->zone_flags;
4171		if (buf != NULL &&
4172		    copyout(&flags, buf, bufsize) != 0)
4173			error = EFAULT;
4174		break;
4175	case ZONE_ATTR_PRIVSET:
4176		size = sizeof (priv_set_t);
4177		if (bufsize > size)
4178			bufsize = size;
4179		if (buf != NULL &&
4180		    copyout(zone->zone_privset, buf, bufsize) != 0)
4181			error = EFAULT;
4182		break;
4183	case ZONE_ATTR_UNIQID:
4184		size = sizeof (zone->zone_uniqid);
4185		if (bufsize > size)
4186			bufsize = size;
4187		if (buf != NULL &&
4188		    copyout(&zone->zone_uniqid, buf, bufsize) != 0)
4189			error = EFAULT;
4190		break;
4191	case ZONE_ATTR_POOLID:
4192		{
4193			pool_t *pool;
4194			poolid_t poolid;
4195
4196			if (pool_lock_intr() != 0) {
4197				error = EINTR;
4198				break;
4199			}
4200			pool = zone_pool_get(zone);
4201			poolid = pool->pool_id;
4202			pool_unlock();
4203			size = sizeof (poolid);
4204			if (bufsize > size)
4205				bufsize = size;
4206			if (buf != NULL && copyout(&poolid, buf, bufsize) != 0)
4207				error = EFAULT;
4208		}
4209		break;
4210	case ZONE_ATTR_SLBL:
4211		size = sizeof (bslabel_t);
4212		if (bufsize > size)
4213			bufsize = size;
4214		if (zone->zone_slabel == NULL)
4215			error = EINVAL;
4216		else if (buf != NULL &&
4217		    copyout(label2bslabel(zone->zone_slabel), buf,
4218		    bufsize) != 0)
4219			error = EFAULT;
4220		break;
4221	case ZONE_ATTR_INITPID:
4222		size = sizeof (initpid);
4223		if (bufsize > size)
4224			bufsize = size;
4225		initpid = zone->zone_proc_initpid;
4226		if (initpid == -1) {
4227			error = ESRCH;
4228			break;
4229		}
4230		if (buf != NULL &&
4231		    copyout(&initpid, buf, bufsize) != 0)
4232			error = EFAULT;
4233		break;
4234	case ZONE_ATTR_BRAND:
4235		size = strlen(zone->zone_brand->b_name) + 1;
4236
4237		if (bufsize > size)
4238			bufsize = size;
4239		if (buf != NULL) {
4240			err = copyoutstr(zone->zone_brand->b_name, buf,
4241			    bufsize, NULL);
4242			if (err != 0 && err != ENAMETOOLONG)
4243				error = EFAULT;
4244		}
4245		break;
4246	case ZONE_ATTR_INITNAME:
4247		size = strlen(zone->zone_initname) + 1;
4248		if (bufsize > size)
4249			bufsize = size;
4250		if (buf != NULL) {
4251			err = copyoutstr(zone->zone_initname, buf, bufsize,
4252			    NULL);
4253			if (err != 0 && err != ENAMETOOLONG)
4254				error = EFAULT;
4255		}
4256		break;
4257	case ZONE_ATTR_BOOTARGS:
4258		if (zone->zone_bootargs == NULL)
4259			outstr = "";
4260		else
4261			outstr = zone->zone_bootargs;
4262		size = strlen(outstr) + 1;
4263		if (bufsize > size)
4264			bufsize = size;
4265		if (buf != NULL) {
4266			err = copyoutstr(outstr, buf, bufsize, NULL);
4267			if (err != 0 && err != ENAMETOOLONG)
4268				error = EFAULT;
4269		}
4270		break;
4271	case ZONE_ATTR_PHYS_MCAP:
4272		size = sizeof (zone->zone_phys_mcap);
4273		if (bufsize > size)
4274			bufsize = size;
4275		if (buf != NULL &&
4276		    copyout(&zone->zone_phys_mcap, buf, bufsize) != 0)
4277			error = EFAULT;
4278		break;
4279	case ZONE_ATTR_SCHED_CLASS:
4280		mutex_enter(&class_lock);
4281
4282		if (zone->zone_defaultcid >= loaded_classes)
4283			outstr = "";
4284		else
4285			outstr = sclass[zone->zone_defaultcid].cl_name;
4286		size = strlen(outstr) + 1;
4287		if (bufsize > size)
4288			bufsize = size;
4289		if (buf != NULL) {
4290			err = copyoutstr(outstr, buf, bufsize, NULL);
4291			if (err != 0 && err != ENAMETOOLONG)
4292				error = EFAULT;
4293		}
4294
4295		mutex_exit(&class_lock);
4296		break;
4297	default:
4298		if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) {
4299			size = bufsize;
4300			error = ZBROP(zone)->b_getattr(zone, attr, buf, &size);
4301		} else {
4302			error = EINVAL;
4303		}
4304	}
4305	zone_rele(zone);
4306
4307	if (error)
4308		return (set_errno(error));
4309	return ((ssize_t)size);
4310}
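/*
 * Example: zone_getattr() returns the full attribute size even when the
 * supplied buffer is too small (or NULL), so userland can size a buffer
 * with a probe call.  A sketch, assuming the private libc wrapper
 * declared in <zone.h>:
 */
#if 0	/* illustrative sketch, not compiled */
#include <zone.h>
#include <stdlib.h>

static char *
get_zone_name(zoneid_t zid)
{
	ssize_t size;
	char *name;

	if ((size = zone_getattr(zid, ZONE_ATTR_NAME, NULL, 0)) < 0)
		return (NULL);			/* probe for the size */
	if ((name = malloc(size)) == NULL)
		return (NULL);
	if (zone_getattr(zid, ZONE_ATTR_NAME, name, size) < 0) {
		free(name);
		return (NULL);
	}
	return (name);
}
#endif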
4311
4312/*
4313 * Systemcall entry point for zone_setattr(2).
4314 */
4315/*ARGSUSED*/
4316static int
4317zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
4318{
4319	zone_t *zone;
4320	zone_status_t zone_status;
4321	int err = EINVAL;	/* for the early "goto done" below */
4322
4323	if (secpolicy_zone_config(CRED()) != 0)
4324		return (set_errno(EPERM));
4325
4326	/*
4327	 * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the
4328	 * global zone.
4329	 */
4330	if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) {
4331		return (set_errno(EINVAL));
4332	}
4333
4334	mutex_enter(&zonehash_lock);
4335	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
4336		mutex_exit(&zonehash_lock);
4337		return (set_errno(EINVAL));
4338	}
4339	zone_hold(zone);
4340	mutex_exit(&zonehash_lock);
4341
4342	/*
4343	 * At present most attributes can only be set on non-running,
4344	 * non-global zones.
4345	 */
4346	zone_status = zone_status_get(zone);
4347	if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY)
4348		goto done;
4349
4350	switch (attr) {
4351	case ZONE_ATTR_INITNAME:
4352		err = zone_set_initname(zone, (const char *)buf);
4353		break;
4354	case ZONE_ATTR_BOOTARGS:
4355		err = zone_set_bootargs(zone, (const char *)buf);
4356		break;
4357	case ZONE_ATTR_BRAND:
4358		err = zone_set_brand(zone, (const char *)buf);
4359		break;
4360	case ZONE_ATTR_PHYS_MCAP:
4361		err = zone_set_phys_mcap(zone, (const uint64_t *)buf);
4362		break;
4363	case ZONE_ATTR_SCHED_CLASS:
4364		err = zone_set_sched_class(zone, (const char *)buf);
4365		break;
4366	default:
4367		if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone))
4368			err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize);
4369		else
4370			err = EINVAL;
4371	}
4372
4373done:
4374	zone_rele(zone);
4375	return (err != 0 ? set_errno(err) : 0);
4376}
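/*
 * Example: per the status check above, attributes like the init path can
 * only be set while the zone is still in the ZONE_IS_READY state (i.e.
 * configured but not yet booted).  A sketch, assuming the private libc
 * wrapper declared in <zone.h>:
 */
#if 0	/* illustrative sketch, not compiled */
#include <zone.h>
#include <string.h>

static int
set_init_path(zoneid_t zid, const char *path)
{
	/* the kernel copies in a NUL-terminated string from buf */
	return (zone_setattr(zid, ZONE_ATTR_INITNAME, (void *)path,
	    strlen(path) + 1));
}
#endif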
4377
4378/*
4379 * Return zero if the process has at least one vnode mapped in to its
4380 * address space which shouldn't be allowed to change zones.
4381 *
4382 * Also return zero if the process has any shared mappings which reserve
4383 * swap.  This is because the counting for zone.max-swap does not allow swap
4384 * reservation to be shared between zones.  Zone swap reservation is counted
4385 * in zone->zone_max_swap.
4386 */
4387static int
4388as_can_change_zones(void)
4389{
4390	proc_t *pp = curproc;
4391	struct seg *seg;
4392	struct as *as = pp->p_as;
4393	vnode_t *vp;
4394	int allow = 1;
4395
4396	ASSERT(pp->p_as != &kas);
4397	AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
4398	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
4399
4400		/*
4401		 * Cannot enter zone with shared anon memory which
4402		 * reserves swap.  See comment above.
4403		 */
4404		if (seg_can_change_zones(seg) == B_FALSE) {
4405			allow = 0;
4406			break;
4407		}
4408		/*
4409		 * if we can't get a backing vnode for this segment then skip
4410		 * it.
4411		 */
4412		vp = NULL;
4413		if (SEGOP_GETVP(seg, seg->s_base, &vp) != 0 || vp == NULL)
4414			continue;
4415		if (!vn_can_change_zones(vp)) { /* bail on first match */
4416			allow = 0;
4417			break;
4418		}
4419	}
4420	AS_LOCK_EXIT(as, &as->a_lock);
4421	return (allow);
4422}
4423
4424/*
4425 * Count swap reserved by curproc's address space
4426 */
4427static size_t
4428as_swresv(void)
4429{
4430	proc_t *pp = curproc;
4431	struct seg *seg;
4432	struct as *as = pp->p_as;
4433	size_t swap = 0;
4434
4435	ASSERT(pp->p_as != &kas);
4436	ASSERT(AS_WRITE_HELD(as, &as->a_lock));
4437	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg))
4438		swap += seg_swresv(seg);
4439
4440	return (swap);
4441}
4442
4443/*
4444 * Systemcall entry point for zone_enter().
4445 *
4446 * The current process is injected into said zone.  In the process
4447 * it will change its project membership, privileges, rootdir/cwd,
4448 * zone-wide rctls, and pool association to match those of the zone.
4449 *
4450 * The first zone_enter() called while the zone is in the ZONE_IS_READY
4451 * state will transition it to ZONE_IS_RUNNING.  Processes may only
4452 * enter a zone that is "ready" or "running".
4453 */
4454static int
4455zone_enter(zoneid_t zoneid)
4456{
4457	zone_t *zone;
4458	vnode_t *vp;
4459	proc_t *pp = curproc;
4460	contract_t *ct;
4461	cont_process_t *ctp;
4462	task_t *tk, *oldtk;
4463	kproject_t *zone_proj0;
4464	cred_t *cr, *newcr;
4465	pool_t *oldpool, *newpool;
4466	sess_t *sp;
4467	uid_t uid;
4468	zone_status_t status;
4469	int err = 0;
4470	rctl_entity_p_t e;
4471	size_t swap;
4472	kthread_id_t t;
4473
4474	if (secpolicy_zone_config(CRED()) != 0)
4475		return (set_errno(EPERM));
4476	if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
4477		return (set_errno(EINVAL));
4478
4479	/*
4480	 * Stop all lwps so we don't need to hold a lock to look at
4481	 * curproc->p_zone.  This needs to happen before we grab any
4482	 * locks to avoid deadlock (another lwp in the process could
4483	 * be waiting for the held lock).
4484	 */
4485	if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK))
4486		return (set_errno(EINTR));
4487
4488	/*
4489	 * Make sure we're not changing zones with files open or mapped in
4490	 * to our address space which shouldn't be changing zones.
4491	 */
4492	if (!files_can_change_zones()) {
4493		err = EBADF;
4494		goto out;
4495	}
4496	if (!as_can_change_zones()) {
4497		err = EFAULT;
4498		goto out;
4499	}
4500
4501	mutex_enter(&zonehash_lock);
4502	if (pp->p_zone != global_zone) {
4503		mutex_exit(&zonehash_lock);
4504		err = EINVAL;
4505		goto out;
4506	}
4507
4508	zone = zone_find_all_by_id(zoneid);
4509	if (zone == NULL) {
4510		mutex_exit(&zonehash_lock);
4511		err = EINVAL;
4512		goto out;
4513	}
4514
4515	/*
4516	 * To prevent processes in a zone from holding contracts on
4517	 * extrazonal resources, and to avoid process contract
4518	 * memberships which span zones, contract holders and processes
4519	 * which aren't the sole members of their encapsulating process
4520	 * contracts are not allowed to zone_enter.
4521	 */
4522	ctp = pp->p_ct_process;
4523	ct = &ctp->conp_contract;
4524	mutex_enter(&ct->ct_lock);
4525	mutex_enter(&pp->p_lock);
4526	if ((avl_numnodes(&pp->p_ct_held) != 0) || (ctp->conp_nmembers != 1)) {
4527		mutex_exit(&pp->p_lock);
4528		mutex_exit(&ct->ct_lock);
4529		mutex_exit(&zonehash_lock);
4530		err = EINVAL;
4531		goto out;
4532	}
4533
4534	/*
4535	 * Moreover, we don't allow processes whose encapsulating
4536	 * process contracts have inherited extrazonal contracts.
4537	 * While it would be easier to eliminate all process contracts
4538	 * with inherited contracts, we need to be able to give a
4539	 * restarted init (or other zone-penetrating process) its
4540	 * predecessor's contracts.
4541	 */
4542	if (ctp->conp_ninherited != 0) {
4543		contract_t *next;
4544		for (next = list_head(&ctp->conp_inherited); next;
4545		    next = list_next(&ctp->conp_inherited, next)) {
4546			if (contract_getzuniqid(next) != zone->zone_uniqid) {
4547				mutex_exit(&pp->p_lock);
4548				mutex_exit(&ct->ct_lock);
4549				mutex_exit(&zonehash_lock);
4550				err = EINVAL;
4551				goto out;
4552			}
4553		}
4554	}
4555	mutex_exit(&pp->p_lock);
4556	mutex_exit(&ct->ct_lock);
4557
4558	status = zone_status_get(zone);
4559	if (status < ZONE_IS_READY || status >= ZONE_IS_SHUTTING_DOWN) {
4560		/*
4561		 * Can't join
4562		 */
4563		mutex_exit(&zonehash_lock);
4564		err = EINVAL;
4565		goto out;
4566	}
4567
4568	/*
4569	 * Make sure new priv set is within the permitted set for caller
4570	 */
4571	if (!priv_issubset(zone->zone_privset, &CR_OPPRIV(CRED()))) {
4572		mutex_exit(&zonehash_lock);
4573		err = EPERM;
4574		goto out;
4575	}
4576	/*
4577	 * We want to momentarily drop zonehash_lock while we optimistically
4578	 * bind curproc to the pool it should be running in.  This is safe
4579	 * since the zone can't disappear (we have a hold on it).
4580	 */
4581	zone_hold(zone);
4582	mutex_exit(&zonehash_lock);
4583
4584	/*
4585	 * Grab pool_lock to keep the pools configuration from changing
4586	 * and to stop ourselves from getting rebound to another pool
4587	 * until we join the zone.
4588	 */
4589	if (pool_lock_intr() != 0) {
4590		zone_rele(zone);
4591		err = EINTR;
4592		goto out;
4593	}
4594	ASSERT(secpolicy_pool(CRED()) == 0);
4595	/*
4596	 * Bind ourselves to the pool currently associated with the zone.
4597	 */
4598	oldpool = curproc->p_pool;
4599	newpool = zone_pool_get(zone);
4600	if (pool_state == POOL_ENABLED && newpool != oldpool &&
4601	    (err = pool_do_bind(newpool, P_PID, P_MYID,
4602	    POOL_BIND_ALL)) != 0) {
4603		pool_unlock();
4604		zone_rele(zone);
4605		goto out;
4606	}
4607
4608	/*
4609	 * Grab cpu_lock now; we'll need it later when we call
4610	 * task_join().
4611	 */
4612	mutex_enter(&cpu_lock);
4613	mutex_enter(&zonehash_lock);
4614	/*
4615	 * Make sure the zone hasn't moved on since we dropped zonehash_lock.
4616	 */
4617	if (zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN) {
4618		/*
4619		 * Can't join anymore.
4620		 */
4621		mutex_exit(&zonehash_lock);
4622		mutex_exit(&cpu_lock);
4623		if (pool_state == POOL_ENABLED &&
4624		    newpool != oldpool)
4625			(void) pool_do_bind(oldpool, P_PID, P_MYID,
4626			    POOL_BIND_ALL);
4627		pool_unlock();
4628		zone_rele(zone);
4629		err = EINVAL;
4630		goto out;
4631	}
4632
4633	/*
4634	 * a_lock must be held while transferring locked memory and swap
4635	 * reservation from the global zone to the non global zone because
4636	 * asynchronous faults on the processes' address space can lock
4637	 * memory and reserve swap via MCL_FUTURE and MAP_NORESERVE
4638	 * segments respectively.
4639	 */
4640	AS_LOCK_ENTER(pp->p_as, &pp->p_as->a_lock, RW_WRITER);
4641	swap = as_swresv();
4642	mutex_enter(&pp->p_lock);
4643	zone_proj0 = zone->zone_zsched->p_task->tk_proj;
4644	/* verify that we do not exceed any task or lwp limits */
4645	mutex_enter(&zone->zone_nlwps_lock);
4646	/* add new lwps to zone and zone's proj0 */
4647	zone_proj0->kpj_nlwps += pp->p_lwpcnt;
4648	zone->zone_nlwps += pp->p_lwpcnt;
4649	/* add 1 task to zone's proj0 */
4650	zone_proj0->kpj_ntasks += 1;
4651	mutex_exit(&zone->zone_nlwps_lock);
4652
4653	mutex_enter(&zone->zone_mem_lock);
4654	zone->zone_locked_mem += pp->p_locked_mem;
4655	zone_proj0->kpj_data.kpd_locked_mem += pp->p_locked_mem;
4656	zone->zone_max_swap += swap;
4657	mutex_exit(&zone->zone_mem_lock);
4658
4659	mutex_enter(&(zone_proj0->kpj_data.kpd_crypto_lock));
4660	zone_proj0->kpj_data.kpd_crypto_mem += pp->p_crypto_mem;
4661	mutex_exit(&(zone_proj0->kpj_data.kpd_crypto_lock));
4662
4663	/* remove lwps from proc's old zone and old project */
4664	mutex_enter(&pp->p_zone->zone_nlwps_lock);
4665	pp->p_zone->zone_nlwps -= pp->p_lwpcnt;
4666	pp->p_task->tk_proj->kpj_nlwps -= pp->p_lwpcnt;
4667	mutex_exit(&pp->p_zone->zone_nlwps_lock);
4668
4669	mutex_enter(&pp->p_zone->zone_mem_lock);
4670	pp->p_zone->zone_locked_mem -= pp->p_locked_mem;
4671	pp->p_task->tk_proj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
4672	pp->p_zone->zone_max_swap -= swap;
4673	mutex_exit(&pp->p_zone->zone_mem_lock);
4674
4675	mutex_enter(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
4676	pp->p_task->tk_proj->kpj_data.kpd_crypto_mem -= pp->p_crypto_mem;
4677	mutex_exit(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
4678
4679	mutex_exit(&pp->p_lock);
4680	AS_LOCK_EXIT(pp->p_as, &pp->p_as->a_lock);
4681
4682	/*
4683	 * Joining the zone cannot fail from now on.
4684	 *
4685	 * This means that a lot of the following code can be commonized and
4686	 * shared with zsched().
4687	 */
4688
4689	/*
4690	 * Reset the encapsulating process contract's zone.
4691	 */
4692	ASSERT(ct->ct_mzuniqid == GLOBAL_ZONEUNIQID);
4693	contract_setzuniqid(ct, zone->zone_uniqid);
4694
4695	/*
4696	 * Create a new task and associate the process with the project keyed
4697	 * by (projid,zoneid).
4698	 *
4699	 * We might as well be in project 0; the global zone's projid doesn't
4700	 * make much sense in a zone anyhow.
4701	 *
4702	 * This also increments zone_ntasks, and returns with p_lock held.
4703	 */
4704	tk = task_create(0, zone);
4705	oldtk = task_join(tk, 0);
4706	mutex_exit(&cpu_lock);
4707
4708	pp->p_flag |= SZONETOP;
4709	pp->p_zone = zone;
4710
4711	/*
4712	 * call RCTLOP_SET functions on this proc
4713	 */
4714	e.rcep_p.zone = zone;
4715	e.rcep_t = RCENTITY_ZONE;
4716	(void) rctl_set_dup(NULL, NULL, pp, &e, zone->zone_rctls, NULL,
4717	    RCD_CALLBACK);
4718	mutex_exit(&pp->p_lock);
4719
4720	/*
4721	 * We don't need to hold any of zsched's locks here; not only do we know
4722	 * the process and zone aren't going away, we know its session isn't
4723	 * changing either.
4724	 *
4725	 * By joining zsched's session here, we mimic the behavior in the
4726	 * global zone of init's sid being the pid of sched.  We extend this
4727	 * to all zlogin-like zone_enter()'ing processes as well.
4728	 */
4729	mutex_enter(&pidlock);
4730	sp = zone->zone_zsched->p_sessp;
4731	sess_hold(zone->zone_zsched);
4732	mutex_enter(&pp->p_lock);
4733	pgexit(pp);
4734	sess_rele(pp->p_sessp, B_TRUE);
4735	pp->p_sessp = sp;
4736	pgjoin(pp, zone->zone_zsched->p_pidp);
4737
4738	/*
4739	 * If any threads are scheduled to be placed on zone wait queue they
4740	 * should abandon the idea since the wait queue is changing.
4741	 * We need to be holding pidlock & p_lock to do this.
4742	 */
4743	if ((t = pp->p_tlist) != NULL) {
4744		do {
4745			thread_lock(t);
4746			/*
4747			 * Kick this thread so that it doesn't sit
4748			 * on the wrong wait queue.
4749			 */
4750			if (ISWAITING(t))
4751				setrun_locked(t);
4752
4753			if (t->t_schedflag & TS_ANYWAITQ)
4754				t->t_schedflag &= ~ TS_ANYWAITQ;
4755
4756			thread_unlock(t);
4757		} while ((t = t->t_forw) != pp->p_tlist);
4758	}
4759
4760	/*
4761	 * If there is a default scheduling class for the zone and it is not
4762	 * the class we are currently in, change all of the threads in the
4763	 * process to the new class.  We need to be holding pidlock & p_lock
4764	 * when we call parmsset so this is a good place to do it.
4765	 */
4766	if (zone->zone_defaultcid > 0 &&
4767	    zone->zone_defaultcid != curthread->t_cid) {
4768		pcparms_t pcparms;
4769
4770		pcparms.pc_cid = zone->zone_defaultcid;
4771		pcparms.pc_clparms[0] = 0;
4772
4773		/*
4774		 * If setting the class fails, we still want to enter the zone.
4775		 */
4776		if ((t = pp->p_tlist) != NULL) {
4777			do {
4778				(void) parmsset(&pcparms, t);
4779			} while ((t = t->t_forw) != pp->p_tlist);
4780		}
4781	}
4782
4783	mutex_exit(&pp->p_lock);
4784	mutex_exit(&pidlock);
4785
4786	mutex_exit(&zonehash_lock);
4787	/*
4788	 * We're firmly in the zone; let pools progress.
4789	 */
4790	pool_unlock();
4791	task_rele(oldtk);
4792	/*
4793	 * We don't need to retain a hold on the zone since we already
4794	 * incremented zone_ntasks, so the zone isn't going anywhere.
4795	 */
4796	zone_rele(zone);
4797
4798	/*
4799	 * Chroot
4800	 */
4801	vp = zone->zone_rootvp;
4802	zone_chdir(vp, &PTOU(pp)->u_cdir, pp);
4803	zone_chdir(vp, &PTOU(pp)->u_rdir, pp);
4804
4805	/*
4806	 * Change process credentials
4807	 */
4808	newcr = cralloc();
4809	mutex_enter(&pp->p_crlock);
4810	cr = pp->p_cred;
4811	crcopy_to(cr, newcr);
4812	crsetzone(newcr, zone);
4813	pp->p_cred = newcr;
4814
4815	/*
4816	 * Restrict all process privilege sets to zone limit
4817	 */
4818	priv_intersect(zone->zone_privset, &CR_PPRIV(newcr));
4819	priv_intersect(zone->zone_privset, &CR_EPRIV(newcr));
4820	priv_intersect(zone->zone_privset, &CR_IPRIV(newcr));
4821	priv_intersect(zone->zone_privset, &CR_LPRIV(newcr));
4822	mutex_exit(&pp->p_crlock);
4823	crset(pp, newcr);
4824
4825	/*
4826	 * Adjust upcount to reflect zone entry.
4827	 */
4828	uid = crgetruid(newcr);
4829	mutex_enter(&pidlock);
4830	upcount_dec(uid, GLOBAL_ZONEID);
4831	upcount_inc(uid, zoneid);
4832	mutex_exit(&pidlock);
4833
4834	/*
4835	 * Set up core file path and content.
4836	 */
4837	set_core_defaults();
4838
4839out:
4840	/*
4841	 * Let the other lwps continue.
4842	 */
4843	mutex_enter(&pp->p_lock);
4844	if (curthread != pp->p_agenttp)
4845		continuelwps(pp);
4846	mutex_exit(&pp->p_lock);
4847
4848	return (err != 0 ? set_errno(err) : 0);
4849}
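/*
 * Example: a zlogin-like consumer enters the zone from a freshly forked
 * child and then execs.  A sketch, assuming the private libc wrapper
 * declared in <zone.h>; the process-contract setup is elided.
 */
#if 0	/* illustrative sketch, not compiled */
#include <zone.h>
#include <unistd.h>

static void
run_in_zone(zoneid_t zid, char *const argv[])
{
	/*
	 * Elided: activate a fresh process contract template
	 * (ct_tmpl_*) before fork() so the child is the sole member
	 * of its contract, as the checks above require.
	 */
	if (fork() == 0) {
		if (zone_enter(zid) != 0)
			_exit(1);	/* EINVAL, EPERM, EBADF, ... */
		(void) execv(argv[0], argv);
		_exit(1);
	}
	/* parent: wait for the child, etc. */
}
#endif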
4850
4851/*
4852 * Systemcall entry point for zone_list(2).
4853 *
4854 * Processes running in a (non-global) zone only see themselves.
4855 * On labeled systems, they see all zones whose label they dominate.
4856 */
4857static int
4858zone_list(zoneid_t *zoneidlist, uint_t *numzones)
4859{
4860	zoneid_t *zoneids;
4861	zone_t *zone, *myzone;
4862	uint_t user_nzones, real_nzones;
4863	uint_t domi_nzones;
4864	int error;
4865
4866	if (copyin(numzones, &user_nzones, sizeof (uint_t)) != 0)
4867		return (set_errno(EFAULT));
4868
4869	myzone = curproc->p_zone;
4870	if (myzone != global_zone) {
4871		bslabel_t *mybslab;
4872
4873		if (!is_system_labeled()) {
4874			/* just return current zone */
4875			real_nzones = domi_nzones = 1;
4876			zoneids = kmem_alloc(sizeof (zoneid_t), KM_SLEEP);
4877			zoneids[0] = myzone->zone_id;
4878		} else {
4879			/* return all zones that are dominated */
4880			mutex_enter(&zonehash_lock);
4881			real_nzones = zonecount;
4882			domi_nzones = 0;
4883			if (real_nzones > 0) {
4884				zoneids = kmem_alloc(real_nzones *
4885				    sizeof (zoneid_t), KM_SLEEP);
4886				mybslab = label2bslabel(myzone->zone_slabel);
4887				for (zone = list_head(&zone_active);
4888				    zone != NULL;
4889				    zone = list_next(&zone_active, zone)) {
4890					if (zone->zone_id == GLOBAL_ZONEID)
4891						continue;
4892					if (zone != myzone &&
4893					    (zone->zone_flags & ZF_IS_SCRATCH))
4894						continue;
4895					/*
4896					 * Note that a label always dominates
4897					 * itself, so myzone is always included
4898					 * in the list.
4899					 */
4900					if (bldominates(mybslab,
4901					    label2bslabel(zone->zone_slabel))) {
4902						zoneids[domi_nzones++] =
4903						    zone->zone_id;
4904					}
4905				}
4906			}
4907			mutex_exit(&zonehash_lock);
4908		}
4909	} else {
4910		mutex_enter(&zonehash_lock);
4911		real_nzones = zonecount;
4912		domi_nzones = 0;
4913		if (real_nzones > 0) {
4914			zoneids = kmem_alloc(real_nzones * sizeof (zoneid_t),
4915			    KM_SLEEP);
4916			for (zone = list_head(&zone_active); zone != NULL;
4917			    zone = list_next(&zone_active, zone))
4918				zoneids[domi_nzones++] = zone->zone_id;
4919			ASSERT(domi_nzones == real_nzones);
4920		}
4921		mutex_exit(&zonehash_lock);
4922	}
4923
4924	/*
4925	 * If user has allocated space for fewer entries than we found, then
4926	 * return only up to his limit.  Either way, tell him exactly how many
4927	 * we found.
4928	 */
4929	if (domi_nzones < user_nzones)
4930		user_nzones = domi_nzones;
4931	error = 0;
4932	if (copyout(&domi_nzones, numzones, sizeof (uint_t)) != 0) {
4933		error = EFAULT;
4934	} else if (zoneidlist != NULL && user_nzones != 0) {
4935		if (copyout(zoneids, zoneidlist,
4936		    user_nzones * sizeof (zoneid_t)) != 0)
4937			error = EFAULT;
4938	}
4939
4940	if (real_nzones > 0)
4941		kmem_free(zoneids, real_nzones * sizeof (zoneid_t));
4942
4943	if (error != 0)
4944		return (set_errno(error));
4945	else
4946		return (0);
4947}
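/*
 * Example: *numzones is both an input (capacity) and an output (zones
 * found), so callers typically probe, allocate, and retry.  A sketch,
 * assuming the private libc wrapper declared in <zone.h>:
 */
#if 0	/* illustrative sketch, not compiled */
#include <zone.h>
#include <stdlib.h>

static zoneid_t *
list_zones(uint_t *np)
{
	zoneid_t *ids = NULL;
	uint_t n = 0, got;

	for (;;) {
		got = n;
		if (zone_list(ids, &got) != 0) {
			free(ids);
			return (NULL);
		}
		if (got <= n)
			break;			/* everything fit */
		free(ids);			/* grow and retry */
		n = got;
		if ((ids = malloc(n * sizeof (zoneid_t))) == NULL)
			return (NULL);
	}
	*np = got;
	return (ids);
}
#endif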
4948
4949/*
4950 * Systemcall entry point for zone_lookup(2).
4951 *
4952 * Non-global zones are only able to see themselves and (on labeled systems)
4953 * the zones they dominate.
4954 */
4955static zoneid_t
4956zone_lookup(const char *zone_name)
4957{
4958	char *kname;
4959	zone_t *zone;
4960	zoneid_t zoneid;
4961	int err;
4962
4963	if (zone_name == NULL) {
4964		/* return caller's zone id */
4965		return (getzoneid());
4966	}
4967
4968	kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
4969	if ((err = copyinstr(zone_name, kname, ZONENAME_MAX, NULL)) != 0) {
4970		kmem_free(kname, ZONENAME_MAX);
4971		return (set_errno(err));
4972	}
4973
4974	mutex_enter(&zonehash_lock);
4975	zone = zone_find_all_by_name(kname);
4976	kmem_free(kname, ZONENAME_MAX);
4977	/*
4978	 * In a non-global zone, a process can only look up its own zone's name.
4979	 * In Trusted Extensions zone label dominance rules apply.
4980	 */
4981	if (zone == NULL ||
4982	    zone_status_get(zone) < ZONE_IS_READY ||
4983	    !zone_list_access(zone)) {
4984		mutex_exit(&zonehash_lock);
4985		return (set_errno(EINVAL));
4986	} else {
4987		zoneid = zone->zone_id;
4988		mutex_exit(&zonehash_lock);
4989		return (zoneid);
4990	}
4991}
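/*
 * Example: a NULL name is the cheap "which zone am I in" path, per the
 * getzoneid() short-circuit above.  A sketch, assuming the private libc
 * wrapper declared in <zone.h>:
 */
#if 0	/* illustrative sketch, not compiled */
#include <zone.h>

static zoneid_t
self_or_named_zone(const char *name)
{
	/* name == NULL: caller's own zoneid; otherwise EINVAL if the */
	/* named zone is absent, not yet ready, or not visible to us. */
	return (zone_lookup(name));
}
#endif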
4992
4993static int
4994zone_version(int *version_arg)
4995{
4996	int version = ZONE_SYSCALL_API_VERSION;
4997
4998	if (copyout(&version, version_arg, sizeof (int)) != 0)
4999		return (set_errno(EFAULT));
5000	return (0);
5001}
5002
5003/* ARGSUSED */
5004long
5005zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
5006{
5007	zone_def zs;
5008
5009	switch (cmd) {
5010	case ZONE_CREATE:
5011		if (get_udatamodel() == DATAMODEL_NATIVE) {
5012			if (copyin(arg1, &zs, sizeof (zone_def))) {
5013				return (set_errno(EFAULT));
5014			}
5015		} else {
5016#ifdef _SYSCALL32_IMPL
5017			zone_def32 zs32;
5018
5019			if (copyin(arg1, &zs32, sizeof (zone_def32))) {
5020				return (set_errno(EFAULT));
5021			}
5022			zs.zone_name =
5023			    (const char *)(unsigned long)zs32.zone_name;
5024			zs.zone_root =
5025			    (const char *)(unsigned long)zs32.zone_root;
5026			zs.zone_privs =
5027			    (const struct priv_set *)
5028			    (unsigned long)zs32.zone_privs;
5029			zs.zone_privssz = zs32.zone_privssz;
5030			zs.rctlbuf = (caddr_t)(unsigned long)zs32.rctlbuf;
5031			zs.rctlbufsz = zs32.rctlbufsz;
5032			zs.zfsbuf = (caddr_t)(unsigned long)zs32.zfsbuf;
5033			zs.zfsbufsz = zs32.zfsbufsz;
5034			zs.extended_error =
5035			    (int *)(unsigned long)zs32.extended_error;
5036			zs.match = zs32.match;
5037			zs.doi = zs32.doi;
5038			zs.label = (const bslabel_t *)(uintptr_t)zs32.label;
5039			zs.flags = zs32.flags;
5040#else
5041			panic("get_udatamodel() returned bogus result\n");
5042#endif
5043		}
5044
5045		return (zone_create(zs.zone_name, zs.zone_root,
5046		    zs.zone_privs, zs.zone_privssz,
5047		    (caddr_t)zs.rctlbuf, zs.rctlbufsz,
5048		    (caddr_t)zs.zfsbuf, zs.zfsbufsz,
5049		    zs.extended_error, zs.match, zs.doi,
5050		    zs.label, zs.flags));
5051	case ZONE_BOOT:
5052		return (zone_boot((zoneid_t)(uintptr_t)arg1));
5053	case ZONE_DESTROY:
5054		return (zone_destroy((zoneid_t)(uintptr_t)arg1));
5055	case ZONE_GETATTR:
5056		return (zone_getattr((zoneid_t)(uintptr_t)arg1,
5057		    (int)(uintptr_t)arg2, arg3, (size_t)arg4));
5058	case ZONE_SETATTR:
5059		return (zone_setattr((zoneid_t)(uintptr_t)arg1,
5060		    (int)(uintptr_t)arg2, arg3, (size_t)arg4));
5061	case ZONE_ENTER:
5062		return (zone_enter((zoneid_t)(uintptr_t)arg1));
5063	case ZONE_LIST:
5064		return (zone_list((zoneid_t *)arg1, (uint_t *)arg2));
5065	case ZONE_SHUTDOWN:
5066		return (zone_shutdown((zoneid_t)(uintptr_t)arg1));
5067	case ZONE_LOOKUP:
5068		return (zone_lookup((const char *)arg1));
5069	case ZONE_VERSION:
5070		return (zone_version((int *)arg1));
5071	case ZONE_ADD_DATALINK:
5072		return (zone_add_datalink((zoneid_t)(uintptr_t)arg1,
5073		    (char *)arg2));
5074	case ZONE_DEL_DATALINK:
5075		return (zone_remove_datalink((zoneid_t)(uintptr_t)arg1,
5076		    (char *)arg2));
5077	case ZONE_CHECK_DATALINK:
5078		return (zone_check_datalink((zoneid_t *)arg1, (char *)arg2));
5079	case ZONE_LIST_DATALINK:
5080		return (zone_list_datalink((zoneid_t)(uintptr_t)arg1,
5081		    (int *)arg2, (char *)arg3));
5082	default:
5083		return (set_errno(EINVAL));
5084	}
5085}
5086
5087struct zarg {
5088	zone_t *zone;
5089	zone_cmd_arg_t arg;
5090};
5091
5092static int
5093zone_lookup_door(const char *zone_name, door_handle_t *doorp)
5094{
5095	char *buf;
5096	size_t buflen;
5097	int error;
5098
5099	buflen = sizeof (ZONE_DOOR_PATH) + strlen(zone_name);
5100	buf = kmem_alloc(buflen, KM_SLEEP);
5101	(void) snprintf(buf, buflen, ZONE_DOOR_PATH, zone_name);
5102	error = door_ki_open(buf, doorp);
5103	kmem_free(buf, buflen);
5104	return (error);
5105}
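/*
 * E.g., for a zone named "myzone" this opens the rendezvous door that
 * zoneadmd creates, assuming ZONE_DOOR_PATH's usual printf-style
 * definition (something like /var/run/zoneadmd.myzone_door).
 */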
5106
5107static void
5108zone_release_door(door_handle_t *doorp)
5109{
5110	door_ki_rele(*doorp);
5111	*doorp = NULL;
5112}
5113
5114static void
5115zone_ki_call_zoneadmd(struct zarg *zargp)
5116{
5117	door_handle_t door = NULL;
5118	door_arg_t darg, save_arg;
5119	char *zone_name;
5120	size_t zone_namelen;
5121	zoneid_t zoneid;
5122	zone_t *zone;
5123	zone_cmd_arg_t arg;
5124	uint64_t uniqid;
5125	size_t size;
5126	int error;
5127	int retry;
5128
5129	zone = zargp->zone;
5130	arg = zargp->arg;
5131	kmem_free(zargp, sizeof (*zargp));
5132
5133	zone_namelen = strlen(zone->zone_name) + 1;
5134	zone_name = kmem_alloc(zone_namelen, KM_SLEEP);
5135	bcopy(zone->zone_name, zone_name, zone_namelen);
5136	zoneid = zone->zone_id;
5137	uniqid = zone->zone_uniqid;
5138	/*
5139	 * zoneadmd may be down, but at least we can empty out the zone.
5140	 * We can ignore the return value of zone_empty() since we're called
5141	 * from a kernel thread and know we won't be delivered any signals.
5142	 */
5143	ASSERT(curproc == &p0);
5144	(void) zone_empty(zone);
5145	ASSERT(zone_status_get(zone) >= ZONE_IS_EMPTY);
5146	zone_rele(zone);
5147
5148	size = sizeof (arg);
5149	darg.rbuf = (char *)&arg;
5150	darg.data_ptr = (char *)&arg;
5151	darg.rsize = size;
5152	darg.data_size = size;
5153	darg.desc_ptr = NULL;
5154	darg.desc_num = 0;
5155
5156	save_arg = darg;
5157	/*
5158	 * Since we're not holding a reference to the zone, any number of
5159	 * things can go wrong, including the zone disappearing before we get a
5160	 * chance to talk to zoneadmd.
5161	 */
5162	for (retry = 0; /* forever */; retry++) {
5163		if (door == NULL &&
5164		    (error = zone_lookup_door(zone_name, &door)) != 0) {
5165			goto next;
5166		}
5167		ASSERT(door != NULL);
5168
5169		if ((error = door_ki_upcall(door, &darg)) == 0) {
5170			break;
5171		}
5172		switch (error) {
5173		case EINTR:
5174			/* FALLTHROUGH */
5175		case EAGAIN:	/* process may be forking */
5176			/*
5177			 * Back off for a bit
5178			 */
5179			break;
5180		case EBADF:
5181			zone_release_door(&door);
5182			if (zone_lookup_door(zone_name, &door) != 0) {
5183				/*
5184				 * zoneadmd may be dead, but it may come back to
5185				 * life later.
5186				 */
5187				break;
5188			}
5189			break;
5190		default:
5191			cmn_err(CE_WARN,
5192			    "zone_ki_call_zoneadmd: door_ki_upcall error %d\n",
5193			    error);
5194			goto out;
5195		}
5196next:
5197		/*
5198		 * If this isn't the same zone_t that we originally had in mind,
5199		 * then this is the same as if two kadmin requests come in at
5200		 * the same time: the first one wins.  This means we lose, so we
5201		 * bail.
5202		 */
5203		if ((zone = zone_find_by_id(zoneid)) == NULL) {
5204			/*
5205			 * Problem is solved.
5206			 */
5207			break;
5208		}
5209		if (zone->zone_uniqid != uniqid) {
5210			/*
5211			 * zoneid recycled
5212			 */
5213			zone_rele(zone);
5214			break;
5215		}
5216		/*
5217		 * We could zone_status_timedwait(), but there doesn't seem to
5218		 * be much point in doing that (plus, it would mean that
5219		 * zone_free() isn't called until this thread exits).
5220		 */
5221		zone_rele(zone);
5222		delay(hz);
5223		darg = save_arg;
5224	}
5225out:
5226	if (door != NULL) {
5227		zone_release_door(&door);
5228	}
5229	kmem_free(zone_name, zone_namelen);
5230	thread_exit();
5231}
5232
5233/*
5234 * Entry point for uadmin() to tell the zone to go away or reboot.  Analog to
5235 * kadmin().  The caller is a process in the zone.
5236 *
5237 * In order to shutdown the zone, we will hand off control to zoneadmd
5238 * (running in the global zone) via a door.  We do a half-hearted job at
5239 * killing all processes in the zone, create a kernel thread to contact
5240 * zoneadmd, and make note of the "uniqid" of the zone.  The uniqid is
5241 * a form of generation number used to let zoneadmd (as well as
5242 * zone_destroy()) know exactly which zone they're talking about.
5243 */
5244int
5245zone_kadmin(int cmd, int fcn, const char *mdep, cred_t *credp)
5246{
5247	struct zarg *zargp;
5248	zone_cmd_t zcmd;
5249	zone_t *zone;
5250
5251	zone = curproc->p_zone;
5252	ASSERT(getzoneid() != GLOBAL_ZONEID);
5253
5254	switch (cmd) {
5255	case A_SHUTDOWN:
5256		switch (fcn) {
5257		case AD_HALT:
5258		case AD_POWEROFF:
5259			zcmd = Z_HALT;
5260			break;
5261		case AD_BOOT:
5262			zcmd = Z_REBOOT;
5263			break;
5264		case AD_IBOOT:
5265		case AD_SBOOT:
5266		case AD_SIBOOT:
5267		case AD_NOSYNC:
5268			return (ENOTSUP);
5269		default:
5270			return (EINVAL);
5271		}
5272		break;
5273	case A_REBOOT:
5274		zcmd = Z_REBOOT;
5275		break;
5276	case A_FTRACE:
5277	case A_REMOUNT:
5278	case A_FREEZE:
5279	case A_DUMP:
5280		return (ENOTSUP);
5281	default:
5282		ASSERT(cmd != A_SWAPCTL);	/* handled by uadmin() */
5283		return (EINVAL);
5284	}
5285
5286	if (secpolicy_zone_admin(credp, B_FALSE))
5287		return (EPERM);
5288	mutex_enter(&zone_status_lock);
5289
5290	/*
5291	 * zone_status can't be ZONE_IS_EMPTY or higher since curproc
5292	 * is in the zone.
5293	 */
5294	ASSERT(zone_status_get(zone) < ZONE_IS_EMPTY);
5295	if (zone_status_get(zone) > ZONE_IS_RUNNING) {
5296		/*
5297		 * This zone is already on its way down.
5298		 */
5299		mutex_exit(&zone_status_lock);
5300		return (0);
5301	}
5302	/*
5303	 * Prevent future zone_enter()s
5304	 */
5305	zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
5306	mutex_exit(&zone_status_lock);
5307
5308	/*
5309	 * Kill everyone now and call zoneadmd later.
5310	 * zone_ki_call_zoneadmd() will do a more thorough job of this
5311	 * later.
5312	 */
5313	killall(zone->zone_id);
5314	/*
5315	 * Now, create the thread to contact zoneadmd and do the rest of the
5316	 * work.  This thread can't be created in our zone otherwise
5317	 * zone_destroy() would deadlock.
5318	 */
5319	zargp = kmem_zalloc(sizeof (*zargp), KM_SLEEP);
5320	zargp->arg.cmd = zcmd;
5321	zargp->arg.uniqid = zone->zone_uniqid;
5322	zargp->zone = zone;
5323	(void) strcpy(zargp->arg.locale, "C");
5324	/* mdep was already copied in for us by uadmin */
5325	if (mdep != NULL)
5326		(void) strlcpy(zargp->arg.bootbuf, mdep,
5327		    sizeof (zargp->arg.bootbuf));
5328	zone_hold(zone);
5329
5330	(void) thread_create(NULL, 0, zone_ki_call_zoneadmd, zargp, 0, &p0,
5331	    TS_RUN, minclsyspri);
5332	exit(CLD_EXITED, 0);
5333
5334	return (EINVAL);
5335}
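/*
 * E.g., "reboot" run inside a zone ends up here as
 * uadmin(A_SHUTDOWN, AD_BOOT, ...), which becomes a Z_REBOOT request
 * handed to zoneadmd; the calling process then exits via exit() above.
 */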
5336
5337/*
5338 * Entry point so kadmin(A_SHUTDOWN, ...) can set the global zone's
5339 * status to ZONE_IS_SHUTTING_DOWN.
5340 */
5341void
5342zone_shutdown_global(void)
5343{
5344	ASSERT(curproc->p_zone == global_zone);
5345
5346	mutex_enter(&zone_status_lock);
5347	ASSERT(zone_status_get(global_zone) == ZONE_IS_RUNNING);
5348	zone_status_set(global_zone, ZONE_IS_SHUTTING_DOWN);
5349	mutex_exit(&zone_status_lock);
5350}
5351
5352/*
5353 * Returns true if the named dataset is visible in the current zone.
5354 * The 'write' parameter is set to 1 if the dataset is also writable.
5355 */
5356int
5357zone_dataset_visible(const char *dataset, int *write)
5358{
5359	zone_dataset_t *zd;
5360	size_t len;
5361	zone_t *zone = curproc->p_zone;
5362
5363	if (dataset[0] == '\0')
5364		return (0);
5365
5366	/*
5367	 * Walk the list once, looking for datasets which match exactly, or
5368	 * specify a dataset underneath an exported dataset.  If found, return
5369	 * true and note that it is writable.
5370	 */
5371	for (zd = list_head(&zone->zone_datasets); zd != NULL;
5372	    zd = list_next(&zone->zone_datasets, zd)) {
5373
5374		len = strlen(zd->zd_dataset);
5375		if (strlen(dataset) >= len &&
5376		    bcmp(dataset, zd->zd_dataset, len) == 0 &&
5377		    (dataset[len] == '\0' || dataset[len] == '/' ||
5378		    dataset[len] == '@')) {
5379			if (write)
5380				*write = 1;
5381			return (1);
5382		}
5383	}
5384
5385	/*
5386	 * Walk the list a second time, searching for datasets which are parents
5387	 * of exported datasets.  These should be visible, but read-only.
5388	 *
5389	 * Note that we also have to support forms such as 'pool/dataset/', with
5390	 * a trailing slash.
5391	 */
5392	for (zd = list_head(&zone->zone_datasets); zd != NULL;
5393	    zd = list_next(&zone->zone_datasets, zd)) {
5394
5395		len = strlen(dataset);
5396		if (dataset[len - 1] == '/')
5397			len--;	/* Ignore trailing slash */
5398		if (len < strlen(zd->zd_dataset) &&
5399		    bcmp(dataset, zd->zd_dataset, len) == 0 &&
5400		    zd->zd_dataset[len] == '/') {
5401			if (write)
5402				*write = 0;
5403			return (1);
5404		}
5405	}
5406
5407	return (0);
5408}
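/*
 * E.g., with only "tank/zones/z1" delegated to the zone, the two passes
 * above yield:
 *
 *	dataset			visible		writable
 *	tank/zones/z1		yes		yes
 *	tank/zones/z1/fs@snap	yes		yes
 *	tank			yes		no	(parent)
 *	tank/zones/		yes		no	(trailing slash)
 *	tank/other		no		-
 */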
5409
5410/*
5411 * zone_find_by_any_path() -
5412 *
5413 * kernel-private routine similar to zone_find_by_path(), but which
5414 * effectively compares against zone paths rather than zonerootpath
5415 * (i.e., the last component of zonerootpaths, which should be "root/",
5416 * is not compared.)  This is done in order to accurately identify all
5417 * paths, whether zone-visible or not, including those which are parallel
5418 * to /root/, such as /dev/, /home/, etc...
5419 *
5420 * If the specified path does not fall under any zone path then global
5421 * zone is returned.
5422 *
5423 * The treat_abs parameter indicates whether the path should be treated as
5424 * an absolute path although it does not begin with "/".  (This supports
5425 * nfs mount syntax such as host:any/path.)
5426 *
5427 * The caller is responsible for zone_rele of the returned zone.
5428 */
5429zone_t *
5430zone_find_by_any_path(const char *path, boolean_t treat_abs)
5431{
5432	zone_t *zone;
5433	int path_offset = 0;
5434
5435	if (path == NULL) {
5436		zone_hold(global_zone);
5437		return (global_zone);
5438	}
5439
5440	if (*path != '/') {
5441		ASSERT(treat_abs);
5442		path_offset = 1;
5443	}
5444
5445	mutex_enter(&zonehash_lock);
5446	for (zone = list_head(&zone_active); zone != NULL;
5447	    zone = list_next(&zone_active, zone)) {
5448		char	*c;
5449		size_t	pathlen;
5450		char *rootpath_start;
5451
5452		if (zone == global_zone)	/* skip global zone */
5453			continue;
5454
5455		/* scan backwards to find start of last component */
5456		c = zone->zone_rootpath + zone->zone_rootpathlen - 2;
5457		do {
5458			c--;
5459		} while (*c != '/');
5460
5461		pathlen = c - zone->zone_rootpath + 1 - path_offset;
5462		rootpath_start = (zone->zone_rootpath + path_offset);
5463		if (strncmp(path, rootpath_start, pathlen) == 0)
5464			break;
5465	}
5466	if (zone == NULL)
5467		zone = global_zone;
5468	zone_hold(zone);
5469	mutex_exit(&zonehash_lock);
5470	return (zone);
5471}
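/*
 * E.g., with zone_rootpath "/export/z1/root/" the comparison above is
 * against "/export/z1/", so a path such as "/export/z1/dev/pts" (which
 * is parallel to root/) still resolves to that zone; with treat_abs,
 * "export/z1/dev" from "host:export/z1/dev" matches as well.
 */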
5472
5473/* List of data link names which are accessible from the zone */
5474struct dlnamelist {
5475	char			dlnl_name[LIFNAMSIZ];
5476	struct dlnamelist	*dlnl_next;
5477};
5478
5479
5480/*
5481 * Check whether the datalink name (dlname) itself is present.
5482 * Return true if found.
5483 */
5484static boolean_t
5485zone_dlname(zone_t *zone, char *dlname)
5486{
5487	struct dlnamelist *dlnl;
5488	boolean_t found = B_FALSE;
5489
5490	mutex_enter(&zone->zone_lock);
5491	for (dlnl = zone->zone_dl_list; dlnl != NULL; dlnl = dlnl->dlnl_next) {
5492		if (strncmp(dlnl->dlnl_name, dlname, LIFNAMSIZ) == 0) {
5493			found = B_TRUE;
5494			break;
5495		}
5496	}
5497	mutex_exit(&zone->zone_lock);
5498	return (found);
5499}
5500
5501/*
5502 * Add a datalink name for the zone. Does not check for duplicates.
5503 */
5504static int
5505zone_add_datalink(zoneid_t zoneid, char *dlname)
5506{
5507	struct dlnamelist *dlnl;
5508	zone_t *zone;
5509	zone_t *thiszone;
5510	int err;
5511
5512	dlnl = kmem_zalloc(sizeof (struct dlnamelist), KM_SLEEP);
5513	if ((err = copyinstr(dlname, dlnl->dlnl_name, LIFNAMSIZ, NULL)) != 0) {
5514		kmem_free(dlnl, sizeof (struct dlnamelist));
5515		return (set_errno(err));
5516	}
5517
5518	thiszone = zone_find_by_id(zoneid);
5519	if (thiszone == NULL) {
5520		kmem_free(dlnl, sizeof (struct dlnamelist));
5521		return (set_errno(ENXIO));
5522	}
5523
5524	/*
5525	 * Verify that the datalink name isn't already used by a different
5526	 * zone while allowing duplicate entries for the same zone (e.g. due
5527	 * to both using IPv4 and IPv6 on an interface)
5528	 */
5529	mutex_enter(&zonehash_lock);
5530	for (zone = list_head(&zone_active); zone != NULL;
5531	    zone = list_next(&zone_active, zone)) {
5532		if (zone->zone_id == zoneid)
5533			continue;
5534
5535		if (zone_dlname(zone, dlnl->dlnl_name)) {
5536			mutex_exit(&zonehash_lock);
5537			zone_rele(thiszone);
5538			kmem_free(dlnl, sizeof (struct dlnamelist));
5539			return (set_errno(EPERM));
5540		}
5541	}
5542	mutex_enter(&thiszone->zone_lock);
5543	dlnl->dlnl_next = thiszone->zone_dl_list;
5544	thiszone->zone_dl_list = dlnl;
5545	mutex_exit(&thiszone->zone_lock);
5546	mutex_exit(&zonehash_lock);
5547	zone_rele(thiszone);
5548	return (0);
5549}
5550
5551static int
5552zone_remove_datalink(zoneid_t zoneid, char *dlname)
5553{
5554	struct dlnamelist *dlnl, *odlnl, **dlnlp;
5555	zone_t *zone;
5556	int err;
5557
5558	dlnl = kmem_zalloc(sizeof (struct dlnamelist), KM_SLEEP);
5559	if ((err = copyinstr(dlname, dlnl->dlnl_name, LIFNAMSIZ, NULL)) != 0) {
5560		kmem_free(dlnl, sizeof (struct dlnamelist));
5561		return (set_errno(err));
5562	}
5563	zone = zone_find_by_id(zoneid);
5564	if (zone == NULL) {
5565		kmem_free(dlnl, sizeof (struct dlnamelist));
5566		return (set_errno(EINVAL));
5567	}
5568
5569	mutex_enter(&zone->zone_lock);
5570	/* Look for match */
5571	dlnlp = &zone->zone_dl_list;
5572	while (*dlnlp != NULL) {
5573		if (strncmp(dlnl->dlnl_name, (*dlnlp)->dlnl_name,
5574		    LIFNAMSIZ) == 0)
5575			goto found;
5576		dlnlp = &((*dlnlp)->dlnl_next);
5577	}
5578	mutex_exit(&zone->zone_lock);
5579	zone_rele(zone);
5580	kmem_free(dlnl, sizeof (struct dlnamelist));
5581	return (set_errno(ENXIO));
5582
5583found:
5584	odlnl = *dlnlp;
5585	*dlnlp = (*dlnlp)->dlnl_next;
5586	kmem_free(odlnl, sizeof (struct dlnamelist));
5587
5588	mutex_exit(&zone->zone_lock);
5589	zone_rele(zone);
5590	kmem_free(dlnl, sizeof (struct dlnamelist));
5591	return (0);
5592}
5593
5594/*
5595 * Using the zoneidp as ALL_ZONES, we can lookup which zone is using datalink
5596 * name (dlname); otherwise we just check if the specified zoneidp has access
5597 * to the datalink name.
5598 */
5599static int
5600zone_check_datalink(zoneid_t *zoneidp, char *dlname)
5601{
5602	zoneid_t id;
5603	char *dln;
5604	zone_t *zone;
5605	int err = 0;
5606	boolean_t allzones = B_FALSE;
5607
5608	if (copyin(zoneidp, &id, sizeof (id)) != 0) {
5609		return (set_errno(EFAULT));
5610	}
5611	dln = kmem_zalloc(LIFNAMSIZ, KM_SLEEP);
5612	if ((err = copyinstr(dlname, dln, LIFNAMSIZ, NULL)) != 0) {
5613		kmem_free(dln, LIFNAMSIZ);
5614		return (set_errno(err));
5615	}
5616
5617	if (id == ALL_ZONES)
5618		allzones = B_TRUE;
5619
5620	/*
5621	 * Check whether datalink name is already used.
5622	 */
5623	mutex_enter(&zonehash_lock);
5624	for (zone = list_head(&zone_active); zone != NULL;
5625	    zone = list_next(&zone_active, zone)) {
5626		if (allzones || (id == zone->zone_id)) {
5627			if (!zone_dlname(zone, dln))
5628				continue;
5629			if (allzones)
5630				err = copyout(&zone->zone_id, zoneidp,
5631				    sizeof (*zoneidp));
5632
5633			mutex_exit(&zonehash_lock);
5634			kmem_free(dln, LIFNAMSIZ);
5635			return (err ? set_errno(EFAULT) : 0);
5636		}
5637	}
5638
5639	/* datalink name is not found in any active zone. */
5640	mutex_exit(&zonehash_lock);
5641	kmem_free(dln, LIFNAMSIZ);
5642	return (set_errno(ENXIO));
5643}
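/*
 * E.g., dladm-style consumers pass *zoneidp == ALL_ZONES to discover
 * which zone (if any) a link is assigned to; the owning zoneid is
 * copied back out on success.
 */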
5644
5645/*
5646 * Get the names of the datalinks assigned to a zone.
5647 * Here *nump is the number of datalinks, and the assumption
5648 * is that the caller guarantees that the supplied buffer is
5649 * big enough to hold at least *nump datalink names, that is,
5650 * LIFNAMSIZ * *nump bytes.
5651 * On return, *nump will be the "new" number of datalinks, if it
5652 * ever changed.
5653 */
5654static int
5655zone_list_datalink(zoneid_t zoneid, int *nump, char *buf)
5656{
5657	int num, dlcount;
5658	zone_t *zone;
5659	struct dlnamelist *dlnl;
5660	char *ptr;
5661
5662	if (copyin(nump, &dlcount, sizeof (dlcount)) != 0)
5663		return (set_errno(EFAULT));
5664
5665	zone = zone_find_by_id(zoneid);
5666	if (zone == NULL) {
5667		return (set_errno(ENXIO));
5668	}
5669
5670	num = 0;
5671	mutex_enter(&zone->zone_lock);
5672	ptr = buf;
5673	for (dlnl = zone->zone_dl_list; dlnl != NULL; dlnl = dlnl->dlnl_next) {
5674		/*
5675		 * If the list changed and the new number is bigger
5676		 * than what the caller supplied, just count, don't
5677		 * do copyout
5678		 */
5679		if (++num > dlcount)
5680			continue;
5681		if (copyout(dlnl->dlnl_name, ptr, LIFNAMSIZ) != 0) {
5682			mutex_exit(&zone->zone_lock);
5683			zone_rele(zone);
5684			return (set_errno(EFAULT));
5685		}
5686		ptr += LIFNAMSIZ;
5687	}
5688	mutex_exit(&zone->zone_lock);
5689	zone_rele(zone);
5690
5691	/* Increased or decreased, caller should be notified. */
5692	if (num != dlcount) {
5693		if (copyout(&num, nump, sizeof (num)) != 0) {
5694			return (set_errno(EFAULT));
5695		}
5696	}
5697	return (0);
5698}
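/*
 * Example: as with zone_list(), *nump is capacity in and count out, so
 * callers probe and retry.  A sketch, assuming a hypothetical userland
 * wrapper zone_list_datalink() with the signature above (the in-kernel
 * entry is reached through zone(ZONE_LIST_DATALINK, ...)):
 */
#if 0	/* illustrative sketch, not compiled */
#include <zone.h>
#include <stdlib.h>
#include <net/if.h>	/* LIFNAMSIZ */

static char *
list_zone_datalinks(zoneid_t zid, int *np)
{
	int n = 0, got;
	char *buf = NULL;

	for (;;) {
		got = n;
		if (zone_list_datalink(zid, &got, buf) != 0) {
			free(buf);
			return (NULL);
		}
		if (got <= n)
			break;			/* everything fit */
		free(buf);			/* grow and retry */
		n = got;
		if ((buf = malloc(n * LIFNAMSIZ)) == NULL)
			return (NULL);
	}
	*np = got;
	return (buf);	/* got names, LIFNAMSIZ bytes apiece */
}
#endif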
5699
5700/*
5701 * Public interface for looking up a zone by zoneid.  It's a customized
5702 * version for netstack_zone_create():
5703 * 1. Doesn't acquire the zonehash_lock, since it is called from
5704 *    zone_key_create() or zone_zsd_configure(), lock already held.
5705 * 2. Doesn't check the status of the zone.
5706 * 3. It may be called even before zone_init() runs; in that case the
5707 *    address of zone0 is returned directly, and netstack_zone_create()
5708 *    will only assign a value to zone0.zone_netstack, which breaks nothing.
5709 */
5710zone_t *
5711zone_find_by_id_nolock(zoneid_t zoneid)
5712{
5713	ASSERT(MUTEX_HELD(&zonehash_lock));
5714
5715	if (zonehashbyid == NULL)
5716		return (&zone0);
5717	else
5718		return (zone_find_all_by_id(zoneid));
5719}
5720