vplat.c revision 3813:c7c433a53b1a
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27#pragma ident	"%Z%%M%	%I%	%E% SMI"
28
29/*
30 * This module contains functions used to bring up and tear down the
31 * Virtual Platform: [un]mounting file-systems, [un]plumbing network
32 * interfaces, [un]configuring devices, establishing resource controls,
33 * and creating/destroying the zone in the kernel.  These actions, on
34 * the way up, ready the zone; on the way down, they halt the zone.
35 * See the much longer block comment at the beginning of zoneadmd.c
36 * for a bigger picture of how the whole program functions.
37 *
38 * This module also has primary responsibility for the layout of "scratch
39 * zones."  These are mounted, but inactive, zones that are used during
40 * operating system upgrade and potentially other administrative action.  The
41 * scratch zone environment is similar to the miniroot environment.  The zone's
42 * actual root is mounted read-write on /a, and the standard paths (/usr,
43 * /sbin, /lib) all lead to read-only copies of the running system's binaries.
44 * This allows the administrative tools to manipulate the zone using "-R /a"
45 * without relying on any binaries in the zone itself.
46 *
47 * If the scratch zone is on an alternate root (Live Upgrade [LU] boot
48 * environment), then we must resolve the lofs mounts used there to uncover
49 * writable (unshared) resources.  Shared resources, though, are always
50 * read-only.  In addition, if the "same" zone with a different root path is
51 * currently running, then "/b" inside the zone points to the running zone's
52 * root.  This allows LU to synchronize configuration files during the upgrade
53 * process.
54 *
55 * To construct this environment, this module creates a tmpfs mount on
56 * $ZONEPATH/lu.  Inside this scratch area, the miniroot-like environment as
57 * described above is constructed on the fly.  The zone is then created using
58 * $ZONEPATH/lu as the root.
59 *
60 * Note that scratch zones are inactive.  The zone's bits are not running and
61 * likely cannot be run correctly until upgrade is done.  Init is not running
62 * there, nor is SMF.  Because of this, the "mounted" state of a scratch zone
63 * is not a part of the usual halt/ready/boot state machine.
64 */
65
66#include <sys/param.h>
67#include <sys/mount.h>
68#include <sys/mntent.h>
69#include <sys/socket.h>
70#include <sys/utsname.h>
71#include <sys/types.h>
72#include <sys/stat.h>
73#include <sys/sockio.h>
74#include <sys/stropts.h>
75#include <sys/conf.h>
76
77#include <sys/dlpi.h>
78#include <libdlpi.h>
79#include <libdladm.h>
80
81#include <inet/tcp.h>
82#include <arpa/inet.h>
83#include <netinet/in.h>
84#include <net/route.h>
85
86#include <stdio.h>
87#include <errno.h>
88#include <fcntl.h>
89#include <unistd.h>
90#include <rctl.h>
91#include <stdlib.h>
92#include <string.h>
93#include <strings.h>
94#include <wait.h>
95#include <limits.h>
96#include <libgen.h>
97#include <libzfs.h>
98#include <libdevinfo.h>
99#include <zone.h>
100#include <assert.h>
101#include <libcontract.h>
102#include <libcontract_priv.h>
103#include <uuid/uuid.h>
104
105#include <sys/mntio.h>
106#include <sys/mnttab.h>
107#include <sys/fs/autofs.h>	/* for _autofssys() */
108#include <sys/fs/lofs_info.h>
109#include <sys/fs/zfs.h>
110
111#include <pool.h>
112#include <sys/pool.h>
113#include <sys/priocntl.h>
114
115#include <libbrand.h>
116#include <sys/brand.h>
117#include <libzonecfg.h>
118#include <synch.h>
119
120#include "zoneadmd.h"
121#include <tsol/label.h>
122#include <libtsnet.h>
123#include <sys/priv.h>
124
125#define	V4_ADDR_LEN	32
126#define	V6_ADDR_LEN	128
127
128/* 0755 is the default directory mode. */
129#define	DEFAULT_DIR_MODE \
130	(S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH)
131#define	DEFAULT_DIR_USER -1	/* user ID for chown: -1 means don't change */
132#define	DEFAULT_DIR_GROUP -1	/* grp ID for chown: -1 means don't change */
133
134#define	IPD_DEFAULT_OPTS \
135	MNTOPT_RO "," MNTOPT_LOFS_NOSUB "," MNTOPT_NODEVICES
136
137#define	DFSTYPES	"/etc/dfs/fstypes"
138#define	MAXTNZLEN	2048
139
140/* for routing socket */
141static int rts_seqno = 0;
142
143/* mangled zone name when mounting in an alternate root environment */
144static char kernzone[ZONENAME_MAX];
145
146/* array of cached mount entries for resolve_lofs */
147static struct mnttab *resolve_lofs_mnts, *resolve_lofs_mnt_max;
148
149/* for Trusted Extensions */
150static tsol_zcent_t *get_zone_label(zlog_t *, priv_set_t *);
151static int tsol_mounts(zlog_t *, char *, char *);
152static void tsol_unmounts(zlog_t *, char *);
153static m_label_t *zlabel = NULL;
154static m_label_t *zid_label = NULL;
155static priv_set_t *zprivs = NULL;
156
157/* from libsocket, not in any header file */
158extern int getnetmaskbyaddr(struct in_addr, struct in_addr *);
159
160/*
161 * An optimization for build_mnttable: reallocate (and potentially copy the
162 * data) only once every N times through the loop.
163 */
164#define	MNTTAB_HUNK	32
165
166/*
167 * Private autofs system call
168 */
169extern int _autofssys(int, void *);
170
171static int
172autofs_cleanup(zoneid_t zoneid)
173{
174	/*
175	 * Ask autofs to unmount all trigger nodes in the given zone.
176	 */
177	return (_autofssys(AUTOFS_UNMOUNTALL, (void *)zoneid));
178}
179
180static void
181free_mnttable(struct mnttab *mnt_array, uint_t nelem)
182{
183	uint_t i;
184
185	if (mnt_array == NULL)
186		return;
187	for (i = 0; i < nelem; i++) {
188		free(mnt_array[i].mnt_mountp);
189		free(mnt_array[i].mnt_fstype);
190		free(mnt_array[i].mnt_special);
191		free(mnt_array[i].mnt_mntopts);
192		assert(mnt_array[i].mnt_time == NULL);
193	}
194	free(mnt_array);
195}
196
197/*
198 * Build the mount table for the zone rooted at "zroot", storing the resulting
199 * array of struct mnttabs in "mnt_arrayp" and the number of elements in the
200 * array in "nelemp".
201 */
202static int
203build_mnttable(zlog_t *zlogp, const char *zroot, size_t zrootlen, FILE *mnttab,
204    struct mnttab **mnt_arrayp, uint_t *nelemp)
205{
206	struct mnttab mnt;
207	struct mnttab *mnts;
208	struct mnttab *mnp;
209	uint_t nmnt;
210
211	rewind(mnttab);
212	resetmnttab(mnttab);
213	nmnt = 0;
214	mnts = NULL;
215	while (getmntent(mnttab, &mnt) == 0) {
216		struct mnttab *tmp_array;
217
218		if (strncmp(mnt.mnt_mountp, zroot, zrootlen) != 0)
219			continue;
220		if (nmnt % MNTTAB_HUNK == 0) {
221			tmp_array = realloc(mnts,
222			    (nmnt + MNTTAB_HUNK) * sizeof (*mnts));
223			if (tmp_array == NULL) {
224				free_mnttable(mnts, nmnt);
225				return (-1);
226			}
227			mnts = tmp_array;
228		}
229		mnp = &mnts[nmnt++];
230
231		/*
232		 * Zero out any fields we're not using.
233		 */
234		(void) memset(mnp, 0, sizeof (*mnp));
235
236		if (mnt.mnt_special != NULL)
237			mnp->mnt_special = strdup(mnt.mnt_special);
238		if (mnt.mnt_mntopts != NULL)
239			mnp->mnt_mntopts = strdup(mnt.mnt_mntopts);
240		mnp->mnt_mountp = strdup(mnt.mnt_mountp);
241		mnp->mnt_fstype = strdup(mnt.mnt_fstype);
242		if ((mnt.mnt_special != NULL && mnp->mnt_special == NULL) ||
243		    (mnt.mnt_mntopts != NULL && mnp->mnt_mntopts == NULL) ||
244		    mnp->mnt_mountp == NULL || mnp->mnt_fstype == NULL) {
245			zerror(zlogp, B_TRUE, "memory allocation failed");
246			free_mnttable(mnts, nmnt);
247			return (-1);
248		}
249	}
250	*mnt_arrayp = mnts;
251	*nelemp = nmnt;
252	return (0);
253}
254
255/*
256 * This is an optimization.  The resolve_lofs function is used quite frequently
257 * to manipulate file paths, and on a machine with a large number of zones,
258 * there will be a huge number of mounted file systems.  Thus, we trigger a
259 * reread of the list of mount points
260 */
261static void
262lofs_discard_mnttab(void)
263{
264	free_mnttable(resolve_lofs_mnts,
265	    resolve_lofs_mnt_max - resolve_lofs_mnts);
266	resolve_lofs_mnts = resolve_lofs_mnt_max = NULL;
267}
268
269static int
270lofs_read_mnttab(zlog_t *zlogp)
271{
272	FILE *mnttab;
273	uint_t nmnts;
274
275	if ((mnttab = fopen(MNTTAB, "r")) == NULL)
276		return (-1);
277	if (build_mnttable(zlogp, "", 0, mnttab, &resolve_lofs_mnts,
278	    &nmnts) == -1) {
279		(void) fclose(mnttab);
280		return (-1);
281	}
282	(void) fclose(mnttab);
283	resolve_lofs_mnt_max = resolve_lofs_mnts + nmnts;
284	return (0);
285}
286
287/*
288 * This function loops over potential loopback mounts and symlinks in a given
289 * path and resolves them all down to an absolute path.
290 */
291static void
292resolve_lofs(zlog_t *zlogp, char *path, size_t pathlen)
293{
294	int len, arlen;
295	const char *altroot;
296	char tmppath[MAXPATHLEN];
297	boolean_t outside_altroot;
298
299	if ((len = resolvepath(path, tmppath, sizeof (tmppath))) == -1)
300		return;
301	tmppath[len] = '\0';
302	(void) strlcpy(path, tmppath, sizeof (tmppath));
303
304	/* This happens once per zoneadmd operation. */
305	if (resolve_lofs_mnts == NULL && lofs_read_mnttab(zlogp) == -1)
306		return;
307
308	altroot = zonecfg_get_root();
309	arlen = strlen(altroot);
310	outside_altroot = B_FALSE;
311	for (;;) {
312		struct mnttab *mnp;
313
314		/* Search in reverse order to find longest match */
315		for (mnp = resolve_lofs_mnt_max - 1; mnp >= resolve_lofs_mnts;
316		    mnp--) {
317			if (mnp->mnt_fstype == NULL ||
318			    mnp->mnt_mountp == NULL ||
319			    mnp->mnt_special == NULL)
320				continue;
321			len = strlen(mnp->mnt_mountp);
322			if (strncmp(mnp->mnt_mountp, path, len) == 0 &&
323			    (path[len] == '/' || path[len] == '\0'))
324				break;
325		}
326		if (mnp < resolve_lofs_mnts)
327			break;
328		/* If it's not a lofs then we're done */
329		if (strcmp(mnp->mnt_fstype, MNTTYPE_LOFS) != 0)
330			break;
331		if (outside_altroot) {
332			char *cp;
333			int olen = sizeof (MNTOPT_RO) - 1;
334
335			/*
336			 * If we run into a read-only mount outside of the
337			 * alternate root environment, then the user doesn't
338			 * want this path to be made read-write.
339			 */
340			if (mnp->mnt_mntopts != NULL &&
341			    (cp = strstr(mnp->mnt_mntopts, MNTOPT_RO)) !=
342			    NULL &&
343			    (cp == mnp->mnt_mntopts || cp[-1] == ',') &&
344			    (cp[olen] == '\0' || cp[olen] == ',')) {
345				break;
346			}
347		} else if (arlen > 0 &&
348		    (strncmp(mnp->mnt_special, altroot, arlen) != 0 ||
349		    (mnp->mnt_special[arlen] != '\0' &&
350		    mnp->mnt_special[arlen] != '/'))) {
351			outside_altroot = B_TRUE;
352		}
353		/* use temporary buffer because new path might be longer */
354		(void) snprintf(tmppath, sizeof (tmppath), "%s%s",
355		    mnp->mnt_special, path + len);
356		if ((len = resolvepath(tmppath, path, pathlen)) == -1)
357			break;
358		path[len] = '\0';
359	}
360}
361
362/*
363 * For a regular mount, check if a replacement lofs mount is needed because the
364 * referenced device is already mounted somewhere.
365 */
366static int
367check_lofs_needed(zlog_t *zlogp, struct zone_fstab *fsptr)
368{
369	struct mnttab *mnp;
370	zone_fsopt_t *optptr, *onext;
371
372	/* This happens once per zoneadmd operation. */
373	if (resolve_lofs_mnts == NULL && lofs_read_mnttab(zlogp) == -1)
374		return (-1);
375
376	/*
377	 * If this special node isn't already in use, then it's ours alone;
378	 * no need to worry about conflicting mounts.
379	 */
380	for (mnp = resolve_lofs_mnts; mnp < resolve_lofs_mnt_max;
381	    mnp++) {
382		if (strcmp(mnp->mnt_special, fsptr->zone_fs_special) == 0)
383			break;
384	}
385	if (mnp >= resolve_lofs_mnt_max)
386		return (0);
387
388	/*
389	 * Convert this duplicate mount into a lofs mount.
390	 */
391	(void) strlcpy(fsptr->zone_fs_special, mnp->mnt_mountp,
392	    sizeof (fsptr->zone_fs_special));
393	(void) strlcpy(fsptr->zone_fs_type, MNTTYPE_LOFS,
394	    sizeof (fsptr->zone_fs_type));
395	fsptr->zone_fs_raw[0] = '\0';
396
397	/*
398	 * Discard all but one of the original options and set that to be the
399	 * same set of options used for inherit package directory resources.
400	 */
401	optptr = fsptr->zone_fs_options;
402	if (optptr == NULL) {
403		optptr = malloc(sizeof (*optptr));
404		if (optptr == NULL) {
405			zerror(zlogp, B_TRUE, "cannot mount %s",
406			    fsptr->zone_fs_dir);
407			return (-1);
408		}
409	} else {
410		while ((onext = optptr->zone_fsopt_next) != NULL) {
411			optptr->zone_fsopt_next = onext->zone_fsopt_next;
412			free(onext);
413		}
414	}
415	(void) strcpy(optptr->zone_fsopt_opt, IPD_DEFAULT_OPTS);
416	optptr->zone_fsopt_next = NULL;
417	fsptr->zone_fs_options = optptr;
418	return (0);
419}
420
421static int
422make_one_dir(zlog_t *zlogp, const char *prefix, const char *subdir, mode_t mode,
423    uid_t userid, gid_t groupid)
424{
425	char path[MAXPATHLEN];
426	struct stat st;
427
428	if (snprintf(path, sizeof (path), "%s%s", prefix, subdir) >
429	    sizeof (path)) {
430		zerror(zlogp, B_FALSE, "pathname %s%s is too long", prefix,
431		    subdir);
432		return (-1);
433	}
434
435	if (lstat(path, &st) == 0) {
436		/*
437		 * We don't check the file mode since presumably the zone
438		 * administrator may have had good reason to change the mode,
439		 * and we don't need to second guess him.
440		 */
441		if (!S_ISDIR(st.st_mode)) {
442			if (is_system_labeled() &&
443			    S_ISREG(st.st_mode)) {
444				/*
445				 * The need to mount readonly copies of
446				 * global zone /etc/ files is unique to
447				 * Trusted Extensions.
448				 */
449				if (strncmp(subdir, "/etc/",
450				    strlen("/etc/")) != 0) {
451					zerror(zlogp, B_FALSE,
452					    "%s is not in /etc", path);
453					return (-1);
454				}
455			} else {
456				zerror(zlogp, B_FALSE,
457				    "%s is not a directory", path);
458				return (-1);
459			}
460		}
461		return (0);
462	}
463
464	if (mkdirp(path, mode) != 0) {
465		if (errno == EROFS)
466			zerror(zlogp, B_FALSE, "Could not mkdir %s.\nIt is on "
467			    "a read-only file system in this local zone.\nMake "
468			    "sure %s exists in the global zone.", path, subdir);
469		else
470			zerror(zlogp, B_TRUE, "mkdirp of %s failed", path);
471		return (-1);
472	}
473
474	(void) chown(path, userid, groupid);
475	return (0);
476}
477
/*
 * Release a NULL-terminated array of file-system type names: every string
 * first, then the array itself.  A NULL array is accepted and ignored.
 */
static void
free_remote_fstypes(char **types)
{
	char **cur;

	if (types != NULL) {
		for (cur = types; *cur != NULL; cur++)
			free(*cur);
		free(types);
	}
}
489
490static char **
491get_remote_fstypes(zlog_t *zlogp)
492{
493	char **types = NULL;
494	FILE *fp;
495	char buf[MAXPATHLEN];
496	char fstype[MAXPATHLEN];
497	uint_t lines = 0;
498	uint_t i;
499
500	if ((fp = fopen(DFSTYPES, "r")) == NULL) {
501		zerror(zlogp, B_TRUE, "failed to open %s", DFSTYPES);
502		return (NULL);
503	}
504	/*
505	 * Count the number of lines
506	 */
507	while (fgets(buf, sizeof (buf), fp) != NULL)
508		lines++;
509	if (lines == 0)	/* didn't read anything; empty file */
510		goto out;
511	rewind(fp);
512	/*
513	 * Allocate enough space for a NULL-terminated array.
514	 */
515	types = calloc(lines + 1, sizeof (char *));
516	if (types == NULL) {
517		zerror(zlogp, B_TRUE, "memory allocation failed");
518		goto out;
519	}
520	i = 0;
521	while (fgets(buf, sizeof (buf), fp) != NULL) {
522		/* LINTED - fstype is big enough to hold buf */
523		if (sscanf(buf, "%s", fstype) == 0) {
524			zerror(zlogp, B_FALSE, "unable to parse %s", DFSTYPES);
525			free_remote_fstypes(types);
526			types = NULL;
527			goto out;
528		}
529		types[i] = strdup(fstype);
530		if (types[i] == NULL) {
531			zerror(zlogp, B_TRUE, "memory allocation failed");
532			free_remote_fstypes(types);
533			types = NULL;
534			goto out;
535		}
536		i++;
537	}
538out:
539	(void) fclose(fp);
540	return (types);
541}
542
543static boolean_t
544is_remote_fstype(const char *fstype, char *const *remote_fstypes)
545{
546	uint_t i;
547
548	if (remote_fstypes == NULL)
549		return (B_FALSE);
550	for (i = 0; remote_fstypes[i] != NULL; i++) {
551		if (strcmp(remote_fstypes[i], fstype) == 0)
552			return (B_TRUE);
553	}
554	return (B_FALSE);
555}
556
557/*
558 * This converts a zone root path (normally of the form .../root) to a Live
559 * Upgrade scratch zone root (of the form .../lu).
560 */
561static void
562root_to_lu(zlog_t *zlogp, char *zroot, size_t zrootlen, boolean_t isresolved)
563{
564	assert(zone_isnative);
565
566	if (!isresolved && zonecfg_in_alt_root())
567		resolve_lofs(zlogp, zroot, zrootlen);
568	(void) strcpy(strrchr(zroot, '/') + 1, "lu");
569}
570
571/*
572 * The general strategy for unmounting filesystems is as follows:
573 *
574 * - Remote filesystems may be dead, and attempting to contact them as
575 * part of a regular unmount may hang forever; we want to always try to
576 * forcibly unmount such filesystems and only fall back to regular
577 * unmounts if the filesystem doesn't support forced unmounts.
578 *
579 * - We don't want to unnecessarily corrupt metadata on local
580 * filesystems (ie UFS), so we want to start off with graceful unmounts,
581 * and only escalate to doing forced unmounts if we get stuck.
582 *
583 * We start off walking backwards through the mount table.  This doesn't
584 * give us strict ordering but ensures that we try to unmount submounts
585 * first.  We thus limit the number of failed umount2(2) calls.
586 *
587 * The mechanism for determining if we're stuck is to count the number
588 * of failed unmounts each iteration through the mount table.  This
589 * gives us an upper bound on the number of filesystems which remain
590 * mounted (autofs trigger nodes are dealt with separately).  If at the
591 * end of one unmount+autofs_cleanup cycle we still have the same number
592 * of mounts that we started out with, we're stuck and try a forced
593 * unmount.  If that fails (filesystem doesn't support forced unmounts)
594 * then we bail and are unable to teardown the zone.  If it succeeds,
595 * we're no longer stuck so we continue with our policy of trying
596 * graceful mounts first.
597 *
598 * Zone must be down (ie, no processes or threads active).
599 */
600static int
601unmount_filesystems(zlog_t *zlogp, zoneid_t zoneid, boolean_t unmount_cmd)
602{
603	int error = 0;
604	FILE *mnttab;
605	struct mnttab *mnts;
606	uint_t nmnt;
607	char zroot[MAXPATHLEN + 1];
608	size_t zrootlen;
609	uint_t oldcount = UINT_MAX;
610	boolean_t stuck = B_FALSE;
611	char **remote_fstypes = NULL;
612
613	if (zone_get_rootpath(zone_name, zroot, sizeof (zroot)) != Z_OK) {
614		zerror(zlogp, B_FALSE, "unable to determine zone root");
615		return (-1);
616	}
617	if (unmount_cmd)
618		root_to_lu(zlogp, zroot, sizeof (zroot), B_FALSE);
619
620	(void) strcat(zroot, "/");
621	zrootlen = strlen(zroot);
622
623	/*
624	 * For Trusted Extensions unmount each higher level zone's mount
625	 * of our zone's /export/home
626	 */
627	if (!unmount_cmd)
628		tsol_unmounts(zlogp, zone_name);
629
630	if ((mnttab = fopen(MNTTAB, "r")) == NULL) {
631		zerror(zlogp, B_TRUE, "failed to open %s", MNTTAB);
632		return (-1);
633	}
634	/*
635	 * Use our hacky mntfs ioctl so we see everything, even mounts with
636	 * MS_NOMNTTAB.
637	 */
638	if (ioctl(fileno(mnttab), MNTIOC_SHOWHIDDEN, NULL) < 0) {
639		zerror(zlogp, B_TRUE, "unable to configure %s", MNTTAB);
640		error++;
641		goto out;
642	}
643
644	/*
645	 * Build the list of remote fstypes so we know which ones we
646	 * should forcibly unmount.
647	 */
648	remote_fstypes = get_remote_fstypes(zlogp);
649	for (; /* ever */; ) {
650		uint_t newcount = 0;
651		boolean_t unmounted;
652		struct mnttab *mnp;
653		char *path;
654		uint_t i;
655
656		mnts = NULL;
657		nmnt = 0;
658		/*
659		 * MNTTAB gives us a way to walk through mounted
660		 * filesystems; we need to be able to walk them in
661		 * reverse order, so we build a list of all mounted
662		 * filesystems.
663		 */
664		if (build_mnttable(zlogp, zroot, zrootlen, mnttab, &mnts,
665		    &nmnt) != 0) {
666			error++;
667			goto out;
668		}
669		for (i = 0; i < nmnt; i++) {
670			mnp = &mnts[nmnt - i - 1]; /* access in reverse order */
671			path = mnp->mnt_mountp;
672			unmounted = B_FALSE;
673			/*
674			 * Try forced unmount first for remote filesystems.
675			 *
676			 * Not all remote filesystems support forced unmounts,
677			 * so if this fails (ENOTSUP) we'll continue on
678			 * and try a regular unmount.
679			 */
680			if (is_remote_fstype(mnp->mnt_fstype, remote_fstypes)) {
681				if (umount2(path, MS_FORCE) == 0)
682					unmounted = B_TRUE;
683			}
684			/*
685			 * Try forced unmount if we're stuck.
686			 */
687			if (stuck) {
688				if (umount2(path, MS_FORCE) == 0) {
689					unmounted = B_TRUE;
690					stuck = B_FALSE;
691				} else {
692					/*
693					 * The first failure indicates a
694					 * mount we won't be able to get
695					 * rid of automatically, so we
696					 * bail.
697					 */
698					error++;
699					zerror(zlogp, B_FALSE,
700					    "unable to unmount '%s'", path);
701					free_mnttable(mnts, nmnt);
702					goto out;
703				}
704			}
705			/*
706			 * Try regular unmounts for everything else.
707			 */
708			if (!unmounted && umount2(path, 0) != 0)
709				newcount++;
710		}
711		free_mnttable(mnts, nmnt);
712
713		if (newcount == 0)
714			break;
715		if (newcount >= oldcount) {
716			/*
717			 * Last round didn't unmount anything; we're stuck and
718			 * should start trying forced unmounts.
719			 */
720			stuck = B_TRUE;
721		}
722		oldcount = newcount;
723
724		/*
725		 * Autofs doesn't let you unmount its trigger nodes from
726		 * userland so we have to tell the kernel to cleanup for us.
727		 */
728		if (autofs_cleanup(zoneid) != 0) {
729			zerror(zlogp, B_TRUE, "unable to remove autofs nodes");
730			error++;
731			goto out;
732		}
733	}
734
735out:
736	free_remote_fstypes(remote_fstypes);
737	(void) fclose(mnttab);
738	return (error ? -1 : 0);
739}
740
741static int
742fs_compare(const void *m1, const void *m2)
743{
744	struct zone_fstab *i = (struct zone_fstab *)m1;
745	struct zone_fstab *j = (struct zone_fstab *)m2;
746
747	return (strcmp(i->zone_fs_dir, j->zone_fs_dir));
748}
749
750/*
751 * Fork and exec (and wait for) the mentioned binary with the provided
752 * arguments.  Returns (-1) if something went wrong with fork(2) or exec(2),
753 * returns the exit status otherwise.
754 *
755 * If we were unable to exec the provided pathname (for whatever
756 * reason), we return the special token ZEXIT_EXEC.  The current value
757 * of ZEXIT_EXEC doesn't conflict with legitimate exit codes of the
758 * consumers of this function; any future consumers must make sure this
759 * remains the case.
760 */
761static int
762forkexec(zlog_t *zlogp, const char *path, char *const argv[])
763{
764	pid_t child_pid;
765	int child_status = 0;
766
767	/*
768	 * Do not let another thread localize a message while we are forking.
769	 */
770	(void) mutex_lock(&msglock);
771	child_pid = fork();
772	(void) mutex_unlock(&msglock);
773	if (child_pid == -1) {
774		zerror(zlogp, B_TRUE, "could not fork for %s", argv[0]);
775		return (-1);
776	} else if (child_pid == 0) {
777		closefrom(0);
778		/* redirect stdin, stdout & stderr to /dev/null */
779		(void) open("/dev/null", O_RDONLY);	/* stdin */
780		(void) open("/dev/null", O_WRONLY);	/* stdout */
781		(void) open("/dev/null", O_WRONLY);	/* stderr */
782		(void) execv(path, argv);
783		/*
784		 * Since we are in the child, there is no point calling zerror()
785		 * since there is nobody waiting to consume it.  So exit with a
786		 * special code that the parent will recognize and call zerror()
787		 * accordingly.
788		 */
789
790		_exit(ZEXIT_EXEC);
791	} else {
792		(void) waitpid(child_pid, &child_status, 0);
793	}
794
795	if (WIFSIGNALED(child_status)) {
796		zerror(zlogp, B_FALSE, "%s unexpectedly terminated due to "
797		    "signal %d", path, WTERMSIG(child_status));
798		return (-1);
799	}
800	assert(WIFEXITED(child_status));
801	if (WEXITSTATUS(child_status) == ZEXIT_EXEC) {
802		zerror(zlogp, B_FALSE, "failed to exec %s", path);
803		return (-1);
804	}
805	return (WEXITSTATUS(child_status));
806}
807
808static int
809dofsck(zlog_t *zlogp, const char *fstype, const char *rawdev)
810{
811	char cmdbuf[MAXPATHLEN];
812	char *argv[4];
813	int status;
814
815	/*
816	 * We could alternatively have called /usr/sbin/fsck -F <fstype>, but
817	 * that would cost us an extra fork/exec without buying us anything.
818	 */
819	if (snprintf(cmdbuf, sizeof (cmdbuf), "/usr/lib/fs/%s/fsck", fstype)
820	    >= sizeof (cmdbuf)) {
821		zerror(zlogp, B_FALSE, "file-system type %s too long", fstype);
822		return (-1);
823	}
824
825	argv[0] = "fsck";
826	argv[1] = "-m";
827	argv[2] = (char *)rawdev;
828	argv[3] = NULL;
829
830	status = forkexec(zlogp, cmdbuf, argv);
831	if (status == 0 || status == -1)
832		return (status);
833	zerror(zlogp, B_FALSE, "fsck of '%s' failed with exit status %d; "
834	    "run fsck manually", rawdev, status);
835	return (-1);
836}
837
838static int
839domount(zlog_t *zlogp, const char *fstype, const char *opts,
840    const char *special, const char *directory)
841{
842	char cmdbuf[MAXPATHLEN];
843	char *argv[6];
844	int status;
845
846	/*
847	 * We could alternatively have called /usr/sbin/mount -F <fstype>, but
848	 * that would cost us an extra fork/exec without buying us anything.
849	 */
850	if (snprintf(cmdbuf, sizeof (cmdbuf), "/usr/lib/fs/%s/mount", fstype)
851	    >= sizeof (cmdbuf)) {
852		zerror(zlogp, B_FALSE, "file-system type %s too long", fstype);
853		return (-1);
854	}
855	argv[0] = "mount";
856	if (opts[0] == '\0') {
857		argv[1] = (char *)special;
858		argv[2] = (char *)directory;
859		argv[3] = NULL;
860	} else {
861		argv[1] = "-o";
862		argv[2] = (char *)opts;
863		argv[3] = (char *)special;
864		argv[4] = (char *)directory;
865		argv[5] = NULL;
866	}
867
868	status = forkexec(zlogp, cmdbuf, argv);
869	if (status == 0 || status == -1)
870		return (status);
871	if (opts[0] == '\0')
872		zerror(zlogp, B_FALSE, "\"%s %s %s\" "
873		    "failed with exit code %d",
874		    cmdbuf, special, directory, status);
875	else
876		zerror(zlogp, B_FALSE, "\"%s -o %s %s %s\" "
877		    "failed with exit code %d",
878		    cmdbuf, opts, special, directory, status);
879	return (-1);
880}
881
882/*
883 * Make sure if a given path exists, it is not a sym-link, and is a directory.
884 */
885static int
886check_path(zlog_t *zlogp, const char *path)
887{
888	struct stat statbuf;
889	char respath[MAXPATHLEN];
890	int res;
891
892	if (lstat(path, &statbuf) != 0) {
893		if (errno == ENOENT)
894			return (0);
895		zerror(zlogp, B_TRUE, "can't stat %s", path);
896		return (-1);
897	}
898	if (S_ISLNK(statbuf.st_mode)) {
899		zerror(zlogp, B_FALSE, "%s is a symlink", path);
900		return (-1);
901	}
902	if (!S_ISDIR(statbuf.st_mode)) {
903		if (is_system_labeled() && S_ISREG(statbuf.st_mode)) {
904			/*
905			 * The need to mount readonly copies of
906			 * global zone /etc/ files is unique to
907			 * Trusted Extensions.
908			 * The check for /etc/ via strstr() is to
909			 * allow paths like $ZONEROOT/etc/passwd
910			 */
911			if (strstr(path, "/etc/") == NULL) {
912				zerror(zlogp, B_FALSE,
913				    "%s is not in /etc", path);
914				return (-1);
915			}
916		} else {
917			zerror(zlogp, B_FALSE, "%s is not a directory", path);
918			return (-1);
919		}
920	}
921	if ((res = resolvepath(path, respath, sizeof (respath))) == -1) {
922		zerror(zlogp, B_TRUE, "unable to resolve path %s", path);
923		return (-1);
924	}
925	respath[res] = '\0';
926	if (strcmp(path, respath) != 0) {
927		/*
928		 * We don't like ".."s and "."s throwing us off
929		 */
930		zerror(zlogp, B_FALSE, "%s is not a canonical path", path);
931		return (-1);
932	}
933	return (0);
934}
935
936/*
937 * Check every component of rootpath/relpath.  If any component fails (ie,
938 * exists but isn't the canonical path to a directory), it is returned in
939 * badpath, which is assumed to be at least of size MAXPATHLEN.
940 *
941 * Relpath must begin with '/'.
942 */
943static boolean_t
944valid_mount_path(zlog_t *zlogp, const char *rootpath, const char *relpath)
945{
946	char abspath[MAXPATHLEN], *slashp;
947
948	/*
949	 * Make sure abspath has at least one '/' after its rootpath
950	 * component, and ends with '/'.
951	 */
952	if (snprintf(abspath, sizeof (abspath), "%s%s/", rootpath, relpath) >=
953	    sizeof (abspath)) {
954		zerror(zlogp, B_FALSE, "pathname %s%s is too long", rootpath,
955		    relpath);
956		return (B_FALSE);
957	}
958
959	slashp = &abspath[strlen(rootpath)];
960	assert(*slashp == '/');
961	do {
962		*slashp = '\0';
963		if (check_path(zlogp, abspath) != 0)
964			return (B_FALSE);
965		*slashp = '/';
966		slashp++;
967	} while ((slashp = strchr(slashp, '/')) != NULL);
968	return (B_TRUE);
969}
970
971static int
972mount_one_dev_device_cb(void *arg, const char *match, const char *name)
973{
974	di_prof_t prof = arg;
975
976	if (name == NULL)
977		return (di_prof_add_dev(prof, match));
978	return (di_prof_add_map(prof, match, name));
979}
980
981static int
982mount_one_dev_symlink_cb(void *arg, const char *source, const char *target)
983{
984	di_prof_t prof = arg;
985
986	return (di_prof_add_symlink(prof, source, target));
987}
988
989static int
990get_iptype(zlog_t *zlogp, zone_iptype_t *iptypep)
991{
992	zone_dochandle_t handle;
993
994	if ((handle = zonecfg_init_handle()) == NULL) {
995		zerror(zlogp, B_TRUE, "getting zone configuration handle");
996		return (-1);
997	}
998	if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK) {
999		zerror(zlogp, B_FALSE, "invalid configuration");
1000		zonecfg_fini_handle(handle);
1001		return (-1);
1002	}
1003	if (zonecfg_get_iptype(handle, iptypep) != Z_OK) {
1004		zerror(zlogp, B_FALSE, "invalid ip-type configuration");
1005		zonecfg_fini_handle(handle);
1006		return (-1);
1007	}
1008	zonecfg_fini_handle(handle);
1009	return (0);
1010}
1011
1012/*
1013 * Apply the standard lists of devices/symlinks/mappings and the user-specified
1014 * list of devices (via zonecfg) to the /dev filesystem.  The filesystem will
1015 * use these as a profile/filter to determine what exists in /dev.
1016 */
1017static int
1018mount_one_dev(zlog_t *zlogp, char *devpath)
1019{
1020	char			brand[MAXNAMELEN];
1021	zone_dochandle_t	handle = NULL;
1022	brand_handle_t		bh = NULL;
1023	struct zone_devtab	ztab;
1024	di_prof_t		prof = NULL;
1025	int			err;
1026	int			retval = -1;
1027	zone_iptype_t		iptype;
1028	const char 		*curr_iptype;
1029
1030	if (di_prof_init(devpath, &prof)) {
1031		zerror(zlogp, B_TRUE, "failed to initialize profile");
1032		goto cleanup;
1033	}
1034
1035	/* Get a handle to the brand info for this zone */
1036	if ((zone_get_brand(zone_name, brand, sizeof (brand)) != Z_OK) ||
1037	    (bh = brand_open(brand)) == NULL) {
1038		zerror(zlogp, B_FALSE, "unable to determine zone brand");
1039		goto cleanup;
1040	}
1041
1042	if (get_iptype(zlogp, &iptype) < 0) {
1043		zerror(zlogp, B_TRUE, "unable to determine ip-type");
1044		goto cleanup;
1045	}
1046	switch (iptype) {
1047	case ZS_SHARED:
1048		curr_iptype = "shared";
1049		break;
1050	case ZS_EXCLUSIVE:
1051		curr_iptype = "exclusive";
1052		break;
1053	}
1054
1055	if (brand_platform_iter_devices(bh, zone_name,
1056	    mount_one_dev_device_cb, prof, curr_iptype) != 0) {
1057		zerror(zlogp, B_TRUE, "failed to add standard device");
1058		goto cleanup;
1059	}
1060
1061	if (brand_platform_iter_link(bh,
1062	    mount_one_dev_symlink_cb, prof) != 0) {
1063		zerror(zlogp, B_TRUE, "failed to add standard symlink");
1064		goto cleanup;
1065	}
1066
1067	/* Add user-specified devices and directories */
1068	if ((handle = zonecfg_init_handle()) == NULL) {
1069		zerror(zlogp, B_FALSE, "can't initialize zone handle");
1070		goto cleanup;
1071	}
1072	if (err = zonecfg_get_handle(zone_name, handle)) {
1073		zerror(zlogp, B_FALSE, "can't get handle for zone "
1074		    "%s: %s", zone_name, zonecfg_strerror(err));
1075		goto cleanup;
1076	}
1077	if (err = zonecfg_setdevent(handle)) {
1078		zerror(zlogp, B_FALSE, "%s: %s", zone_name,
1079		    zonecfg_strerror(err));
1080		goto cleanup;
1081	}
1082	while (zonecfg_getdevent(handle, &ztab) == Z_OK) {
1083		if (di_prof_add_dev(prof, ztab.zone_dev_match)) {
1084			zerror(zlogp, B_TRUE, "failed to add "
1085			    "user-specified device");
1086			goto cleanup;
1087		}
1088	}
1089	(void) zonecfg_enddevent(handle);
1090
1091	/* Send profile to kernel */
1092	if (di_prof_commit(prof)) {
1093		zerror(zlogp, B_TRUE, "failed to commit profile");
1094		goto cleanup;
1095	}
1096
1097	retval = 0;
1098
1099cleanup:
1100	if (bh != NULL)
1101		brand_close(bh);
1102	if (handle != NULL)
1103		zonecfg_fini_handle(handle);
1104	if (prof)
1105		di_prof_fini(prof);
1106	return (retval);
1107}
1108
1109static int
1110mount_one(zlog_t *zlogp, struct zone_fstab *fsptr, const char *rootpath)
1111{
1112	char path[MAXPATHLEN];
1113	char specpath[MAXPATHLEN];
1114	char optstr[MAX_MNTOPT_STR];
1115	zone_fsopt_t *optptr;
1116	int rv;
1117
1118	if (!valid_mount_path(zlogp, rootpath, fsptr->zone_fs_dir)) {
1119		zerror(zlogp, B_FALSE, "%s%s is not a valid mount point",
1120		    rootpath, fsptr->zone_fs_dir);
1121		return (-1);
1122	}
1123
1124	if (make_one_dir(zlogp, rootpath, fsptr->zone_fs_dir,
1125	    DEFAULT_DIR_MODE, DEFAULT_DIR_USER, DEFAULT_DIR_GROUP) != 0)
1126		return (-1);
1127
1128	(void) snprintf(path, sizeof (path), "%s%s", rootpath,
1129	    fsptr->zone_fs_dir);
1130
1131	if (strlen(fsptr->zone_fs_special) == 0) {
1132		/*
1133		 * A zero-length special is how we distinguish IPDs from
1134		 * general-purpose FSs.  Make sure it mounts from a place that
1135		 * can be seen via the alternate zone's root.
1136		 */
1137		if (snprintf(specpath, sizeof (specpath), "%s%s",
1138		    zonecfg_get_root(), fsptr->zone_fs_dir) >=
1139		    sizeof (specpath)) {
1140			zerror(zlogp, B_FALSE, "cannot mount %s: path too "
1141			    "long in alternate root", fsptr->zone_fs_dir);
1142			return (-1);
1143		}
1144		if (zonecfg_in_alt_root())
1145			resolve_lofs(zlogp, specpath, sizeof (specpath));
1146		if (domount(zlogp, MNTTYPE_LOFS, IPD_DEFAULT_OPTS,
1147		    specpath, path) != 0) {
1148			zerror(zlogp, B_TRUE, "failed to loopback mount %s",
1149			    specpath);
1150			return (-1);
1151		}
1152		return (0);
1153	}
1154
1155	/*
1156	 * In general the strategy here is to do just as much verification as
1157	 * necessary to avoid crashing or otherwise doing something bad; if the
1158	 * administrator initiated the operation via zoneadm(1m), he'll get
1159	 * auto-verification which will let him know what's wrong.  If he
1160	 * modifies the zone configuration of a running zone and doesn't attempt
1161	 * to verify that it's OK we won't crash but won't bother trying to be
1162	 * too helpful either.  zoneadm verify is only a couple keystrokes away.
1163	 */
1164	if (!zonecfg_valid_fs_type(fsptr->zone_fs_type)) {
1165		zerror(zlogp, B_FALSE, "cannot mount %s on %s: "
1166		    "invalid file-system type %s", fsptr->zone_fs_special,
1167		    fsptr->zone_fs_dir, fsptr->zone_fs_type);
1168		return (-1);
1169	}
1170
1171	/*
1172	 * If we're looking at an alternate root environment, then construct
1173	 * read-only loopback mounts as necessary.  Note that any special
1174	 * paths for lofs zone mounts in an alternate root must have
1175	 * already been pre-pended with any alternate root path by the
1176	 * time we get here.
1177	 */
1178	if (zonecfg_in_alt_root()) {
1179		struct stat64 st;
1180
1181		if (stat64(fsptr->zone_fs_special, &st) != -1 &&
1182		    S_ISBLK(st.st_mode)) {
1183			/*
1184			 * If we're going to mount a block device we need
1185			 * to check if that device is already mounted
1186			 * somewhere else, and if so, do a lofs mount
1187			 * of the device instead of a direct mount
1188			 */
1189			if (check_lofs_needed(zlogp, fsptr) == -1)
1190				return (-1);
1191		} else if (strcmp(fsptr->zone_fs_type, MNTTYPE_LOFS) == 0) {
1192			/*
1193			 * For lofs mounts, the special node is inside the
1194			 * alternate root.  We need lofs resolution for
1195			 * this case in order to get at the underlying
1196			 * read-write path.
1197			 */
1198			resolve_lofs(zlogp, fsptr->zone_fs_special,
1199			    sizeof (fsptr->zone_fs_special));
1200		}
1201	}
1202
1203	/*
1204	 * Run 'fsck -m' if there's a device to fsck.
1205	 */
1206	if (fsptr->zone_fs_raw[0] != '\0' &&
1207	    dofsck(zlogp, fsptr->zone_fs_type, fsptr->zone_fs_raw) != 0)
1208		return (-1);
1209
1210	/*
1211	 * Build up mount option string.
1212	 */
1213	optstr[0] = '\0';
1214	if (fsptr->zone_fs_options != NULL) {
1215		(void) strlcpy(optstr, fsptr->zone_fs_options->zone_fsopt_opt,
1216		    sizeof (optstr));
1217		for (optptr = fsptr->zone_fs_options->zone_fsopt_next;
1218		    optptr != NULL; optptr = optptr->zone_fsopt_next) {
1219			(void) strlcat(optstr, ",", sizeof (optstr));
1220			(void) strlcat(optstr, optptr->zone_fsopt_opt,
1221			    sizeof (optstr));
1222		}
1223	}
1224
1225	if ((rv = domount(zlogp, fsptr->zone_fs_type, optstr,
1226	    fsptr->zone_fs_special, path)) != 0)
1227		return (rv);
1228
1229	/*
1230	 * The mount succeeded.  If this was not a mount of /dev then
1231	 * we're done.
1232	 */
1233	if (strcmp(fsptr->zone_fs_type, MNTTYPE_DEV) != 0)
1234		return (0);
1235
1236	/*
1237	 * We just mounted an instance of a /dev filesystem, so now we
1238	 * need to configure it.
1239	 */
1240	return (mount_one_dev(zlogp, path));
1241}
1242
1243static void
1244free_fs_data(struct zone_fstab *fsarray, uint_t nelem)
1245{
1246	uint_t i;
1247
1248	if (fsarray == NULL)
1249		return;
1250	for (i = 0; i < nelem; i++)
1251		zonecfg_free_fs_option_list(fsarray[i].zone_fs_options);
1252	free(fsarray);
1253}
1254
1255/*
1256 * This function initiates the creation of a small Solaris Environment for
1257 * scratch zone. The Environment creation process is split up into two
1258 * functions(build_mounted_pre_var() and build_mounted_post_var()). It
1259 * is done this way because:
1260 * 	We need to have both /etc and /var in the root of the scratchzone.
1261 * 	We loopback mount zone's own /etc and /var into the root of the
1262 * 	scratch zone. Unlike /etc, /var can be a seperate filesystem. So we
1263 * 	need to delay the mount of /var till the zone's root gets populated.
1264 *	So mounting of localdirs[](/etc and /var) have been moved to the
1265 * 	build_mounted_post_var() which gets called only after the zone
1266 * 	specific filesystems are mounted.
1267 */
1268static boolean_t
1269build_mounted_pre_var(zlog_t *zlogp, char *rootpath,
1270    size_t rootlen, const char *zonepath, char *luroot, size_t lurootlen)
1271{
1272	char tmp[MAXPATHLEN], fromdir[MAXPATHLEN];
1273	const char **cpp;
1274	static const char *mkdirs[] = {
1275		"/system", "/system/contract", "/system/object", "/proc",
1276		"/dev", "/tmp", "/a", NULL
1277	};
1278	char *altstr;
1279	FILE *fp;
1280	uuid_t uuid;
1281
1282	assert(zone_isnative);
1283
1284	resolve_lofs(zlogp, rootpath, rootlen);
1285	(void) snprintf(luroot, lurootlen, "%s/lu", zonepath);
1286	resolve_lofs(zlogp, luroot, lurootlen);
1287	(void) snprintf(tmp, sizeof (tmp), "%s/bin", luroot);
1288	(void) symlink("./usr/bin", tmp);
1289
1290	/*
1291	 * These are mostly special mount points; not handled here.  (See
1292	 * zone_mount_early.)
1293	 */
1294	for (cpp = mkdirs; *cpp != NULL; cpp++) {
1295		(void) snprintf(tmp, sizeof (tmp), "%s%s", luroot, *cpp);
1296		if (mkdir(tmp, 0755) != 0) {
1297			zerror(zlogp, B_TRUE, "cannot create %s", tmp);
1298			return (B_FALSE);
1299		}
1300	}
1301	/*
1302	 * This is here to support lucopy.  If there's an instance of this same
1303	 * zone on the current running system, then we mount its root up as
1304	 * read-only inside the scratch zone.
1305	 */
1306	(void) zonecfg_get_uuid(zone_name, uuid);
1307	altstr = strdup(zonecfg_get_root());
1308	if (altstr == NULL) {
1309		zerror(zlogp, B_TRUE, "memory allocation failed");
1310		return (B_FALSE);
1311	}
1312	zonecfg_set_root("");
1313	(void) strlcpy(tmp, zone_name, sizeof (tmp));
1314	(void) zonecfg_get_name_by_uuid(uuid, tmp, sizeof (tmp));
1315	if (zone_get_rootpath(tmp, fromdir, sizeof (fromdir)) == Z_OK &&
1316	    strcmp(fromdir, rootpath) != 0) {
1317		(void) snprintf(tmp, sizeof (tmp), "%s/b", luroot);
1318		if (mkdir(tmp, 0755) != 0) {
1319			zerror(zlogp, B_TRUE, "cannot create %s", tmp);
1320			return (B_FALSE);
1321		}
1322		if (domount(zlogp, MNTTYPE_LOFS, IPD_DEFAULT_OPTS, fromdir,
1323		    tmp) != 0) {
1324			zerror(zlogp, B_TRUE, "cannot mount %s on %s", tmp,
1325			    fromdir);
1326			return (B_FALSE);
1327		}
1328	}
1329	zonecfg_set_root(altstr);
1330	free(altstr);
1331
1332	if ((fp = zonecfg_open_scratch(luroot, B_TRUE)) == NULL) {
1333		zerror(zlogp, B_TRUE, "cannot open zone mapfile");
1334		return (B_FALSE);
1335	}
1336	(void) ftruncate(fileno(fp), 0);
1337	if (zonecfg_add_scratch(fp, zone_name, kernzone, "/") == -1) {
1338		zerror(zlogp, B_TRUE, "cannot add zone mapfile entry");
1339	}
1340	zonecfg_close_scratch(fp);
1341	(void) snprintf(tmp, sizeof (tmp), "%s/a", luroot);
1342	if (domount(zlogp, MNTTYPE_LOFS, "", rootpath, tmp) != 0)
1343		return (B_FALSE);
1344	(void) strlcpy(rootpath, tmp, rootlen);
1345	return (B_TRUE);
1346}
1347
1348
1349static boolean_t
1350build_mounted_post_var(zlog_t *zlogp, char *rootpath, const char *luroot)
1351{
1352	char tmp[MAXPATHLEN], fromdir[MAXPATHLEN];
1353	const char **cpp;
1354	static const char *localdirs[] = {
1355		"/etc", "/var", NULL
1356	};
1357	static const char *loopdirs[] = {
1358		"/etc/lib", "/etc/fs", "/lib", "/sbin", "/platform",
1359		"/usr", NULL
1360	};
1361	static const char *tmpdirs[] = {
1362		"/tmp", "/var/run", NULL
1363	};
1364	struct stat st;
1365
1366	/*
1367	 * These are mounted read-write from the zone undergoing upgrade.  We
1368	 * must be careful not to 'leak' things from the main system into the
1369	 * zone, and this accomplishes that goal.
1370	 */
1371	for (cpp = localdirs; *cpp != NULL; cpp++) {
1372		(void) snprintf(tmp, sizeof (tmp), "%s%s", luroot, *cpp);
1373		(void) snprintf(fromdir, sizeof (fromdir), "%s%s", rootpath,
1374		    *cpp);
1375		if (mkdir(tmp, 0755) != 0) {
1376			zerror(zlogp, B_TRUE, "cannot create %s", tmp);
1377			return (B_FALSE);
1378		}
1379		if (domount(zlogp, MNTTYPE_LOFS, "", fromdir, tmp) != 0) {
1380			zerror(zlogp, B_TRUE, "cannot mount %s on %s", tmp,
1381			    *cpp);
1382			return (B_FALSE);
1383		}
1384	}
1385
1386	/*
1387	 * These are things mounted read-only from the running system because
1388	 * they contain binaries that must match system.
1389	 */
1390	for (cpp = loopdirs; *cpp != NULL; cpp++) {
1391		(void) snprintf(tmp, sizeof (tmp), "%s%s", luroot, *cpp);
1392		if (mkdir(tmp, 0755) != 0) {
1393			if (errno != EEXIST) {
1394				zerror(zlogp, B_TRUE, "cannot create %s", tmp);
1395				return (B_FALSE);
1396			}
1397			if (lstat(tmp, &st) != 0) {
1398				zerror(zlogp, B_TRUE, "cannot stat %s", tmp);
1399				return (B_FALSE);
1400			}
1401			/*
1402			 * Ignore any non-directories encountered.  These are
1403			 * things that have been converted into symlinks
1404			 * (/etc/fs and /etc/lib) and no longer need a lofs
1405			 * fixup.
1406			 */
1407			if (!S_ISDIR(st.st_mode))
1408				continue;
1409		}
1410		if (domount(zlogp, MNTTYPE_LOFS, IPD_DEFAULT_OPTS, *cpp,
1411		    tmp) != 0) {
1412			zerror(zlogp, B_TRUE, "cannot mount %s on %s", tmp,
1413			    *cpp);
1414			return (B_FALSE);
1415		}
1416	}
1417
1418	/*
1419	 * These are things with tmpfs mounted inside.
1420	 */
1421	for (cpp = tmpdirs; *cpp != NULL; cpp++) {
1422		(void) snprintf(tmp, sizeof (tmp), "%s%s", luroot, *cpp);
1423		if (mkdir(tmp, 0755) != 0 && errno != EEXIST) {
1424			zerror(zlogp, B_TRUE, "cannot create %s", tmp);
1425			return (B_FALSE);
1426		}
1427
1428		/*
1429		 * We could set the mode for /tmp when we do the mkdir but
1430		 * since that can be modified by the umask we will just set
1431		 * the correct mode for /tmp now.
1432		 */
1433		if (strcmp(*cpp, "/tmp") == 0 && chmod(tmp, 01777) != 0) {
1434			zerror(zlogp, B_TRUE, "cannot chmod %s", tmp);
1435			return (B_FALSE);
1436		}
1437
1438		if (domount(zlogp, MNTTYPE_TMPFS, "", "swap", tmp) != 0) {
1439			zerror(zlogp, B_TRUE, "cannot mount swap on %s", *cpp);
1440			return (B_FALSE);
1441		}
1442	}
1443	return (B_TRUE);
1444}
1445
/*
 * Argument bundle handed to plat_gmount_cb(): the log handle to report
 * errors on, plus pointers to the caller's filesystem table and its element
 * count.  The callback updates both through these pointers as it appends
 * entries.
 */
typedef struct plat_gmount_cb_data {
	zlog_t			*pgcd_zlogp;	/* log handle for zerror() */
	struct zone_fstab	**pgcd_fs_tab;	/* in/out: table base */
	int			*pgcd_num_fs;	/* in/out: entries in table */
} plat_gmount_cb_data_t;
1451
1452/*
1453 * plat_gmount_cb() is a callback function invoked by libbrand to iterate
1454 * through all global brand platform mounts.
1455 */
1456int
1457plat_gmount_cb(void *data, const char *spec, const char *dir,
1458    const char *fstype, const char *opt)
1459{
1460	plat_gmount_cb_data_t	*cp = data;
1461	zlog_t			*zlogp = cp->pgcd_zlogp;
1462	struct zone_fstab	*fs_ptr = *cp->pgcd_fs_tab;
1463	int			num_fs = *cp->pgcd_num_fs;
1464	struct zone_fstab	*fsp, *tmp_ptr;
1465
1466	num_fs++;
1467	if ((tmp_ptr = realloc(fs_ptr, num_fs * sizeof (*tmp_ptr))) == NULL) {
1468		zerror(zlogp, B_TRUE, "memory allocation failed");
1469		return (-1);
1470	}
1471
1472	fs_ptr = tmp_ptr;
1473	fsp = &fs_ptr[num_fs - 1];
1474
1475	/* update the callback struct passed in */
1476	*cp->pgcd_fs_tab = fs_ptr;
1477	*cp->pgcd_num_fs = num_fs;
1478
1479	fsp->zone_fs_raw[0] = '\0';
1480	(void) strlcpy(fsp->zone_fs_special, spec,
1481	    sizeof (fsp->zone_fs_special));
1482	(void) strlcpy(fsp->zone_fs_dir, dir, sizeof (fsp->zone_fs_dir));
1483	(void) strlcpy(fsp->zone_fs_type, fstype, sizeof (fsp->zone_fs_type));
1484	fsp->zone_fs_options = NULL;
1485	if ((opt != NULL) &&
1486	    (zonecfg_add_fs_option(fsp, (char *)opt) != Z_OK)) {
1487		zerror(zlogp, B_FALSE, "error adding property");
1488		return (-1);
1489	}
1490
1491	return (0);
1492}
1493
1494static int
1495mount_filesystems_ipdent(zone_dochandle_t handle, zlog_t *zlogp,
1496    struct zone_fstab **fs_tabp, int *num_fsp)
1497{
1498	struct zone_fstab *tmp_ptr, *fs_ptr, *fsp, fstab;
1499	int num_fs;
1500
1501	num_fs = *num_fsp;
1502	fs_ptr = *fs_tabp;
1503
1504	if (zonecfg_setipdent(handle) != Z_OK) {
1505		zerror(zlogp, B_FALSE, "invalid configuration");
1506		return (-1);
1507	}
1508	while (zonecfg_getipdent(handle, &fstab) == Z_OK) {
1509		num_fs++;
1510		if ((tmp_ptr = realloc(fs_ptr,
1511		    num_fs * sizeof (*tmp_ptr))) == NULL) {
1512			zerror(zlogp, B_TRUE, "memory allocation failed");
1513			(void) zonecfg_endipdent(handle);
1514			return (-1);
1515		}
1516
1517		/* update the pointers passed in */
1518		*fs_tabp = tmp_ptr;
1519		*num_fsp = num_fs;
1520
1521		/*
1522		 * IPDs logically only have a mount point; all other properties
1523		 * are implied.
1524		 */
1525		fs_ptr = tmp_ptr;
1526		fsp = &fs_ptr[num_fs - 1];
1527		(void) strlcpy(fsp->zone_fs_dir,
1528		    fstab.zone_fs_dir, sizeof (fsp->zone_fs_dir));
1529		fsp->zone_fs_special[0] = '\0';
1530		fsp->zone_fs_raw[0] = '\0';
1531		fsp->zone_fs_type[0] = '\0';
1532		fsp->zone_fs_options = NULL;
1533	}
1534	(void) zonecfg_endipdent(handle);
1535	return (0);
1536}
1537
1538static int
1539mount_filesystems_fsent(zone_dochandle_t handle, zlog_t *zlogp,
1540    struct zone_fstab **fs_tabp, int *num_fsp, int mount_cmd)
1541{
1542	struct zone_fstab *tmp_ptr, *fs_ptr, *fsp, fstab;
1543	int num_fs;
1544
1545	num_fs = *num_fsp;
1546	fs_ptr = *fs_tabp;
1547
1548	if (zonecfg_setfsent(handle) != Z_OK) {
1549		zerror(zlogp, B_FALSE, "invalid configuration");
1550		return (-1);
1551	}
1552	while (zonecfg_getfsent(handle, &fstab) == Z_OK) {
1553		/*
1554		 * ZFS filesystems will not be accessible under an alternate
1555		 * root, since the pool will not be known.  Ignore them in this
1556		 * case.
1557		 */
1558		if (mount_cmd && strcmp(fstab.zone_fs_type, MNTTYPE_ZFS) == 0)
1559			continue;
1560
1561		num_fs++;
1562		if ((tmp_ptr = realloc(fs_ptr,
1563		    num_fs * sizeof (*tmp_ptr))) == NULL) {
1564			zerror(zlogp, B_TRUE, "memory allocation failed");
1565			(void) zonecfg_endfsent(handle);
1566			return (-1);
1567		}
1568		/* update the pointers passed in */
1569		*fs_tabp = tmp_ptr;
1570		*num_fsp = num_fs;
1571
1572		fs_ptr = tmp_ptr;
1573		fsp = &fs_ptr[num_fs - 1];
1574		(void) strlcpy(fsp->zone_fs_dir,
1575		    fstab.zone_fs_dir, sizeof (fsp->zone_fs_dir));
1576		(void) strlcpy(fsp->zone_fs_raw, fstab.zone_fs_raw,
1577		    sizeof (fsp->zone_fs_raw));
1578		(void) strlcpy(fsp->zone_fs_type, fstab.zone_fs_type,
1579		    sizeof (fsp->zone_fs_type));
1580		fsp->zone_fs_options = fstab.zone_fs_options;
1581
1582		/*
1583		 * For all lofs mounts, make sure that the 'special'
1584		 * entry points inside the alternate root.  The
1585		 * source path for a lofs mount in a given zone needs
1586		 * to be relative to the root of the boot environment
1587		 * that contains the zone.  Note that we don't do this
1588		 * for non-lofs mounts since they will have a device
1589		 * as a backing store and device paths must always be
1590		 * specified relative to the current boot environment.
1591		 */
1592		fsp->zone_fs_special[0] = '\0';
1593		if (strcmp(fsp->zone_fs_type, MNTTYPE_LOFS) == 0) {
1594			(void) strlcat(fsp->zone_fs_special, zonecfg_get_root(),
1595			    sizeof (fsp->zone_fs_special));
1596		}
1597		(void) strlcat(fsp->zone_fs_special, fstab.zone_fs_special,
1598		    sizeof (fsp->zone_fs_special));
1599	}
1600	(void) zonecfg_endfsent(handle);
1601	return (0);
1602}
1603
1604static int
1605mount_filesystems(zlog_t *zlogp, boolean_t mount_cmd)
1606{
1607	char rootpath[MAXPATHLEN];
1608	char zonepath[MAXPATHLEN];
1609	char brand[MAXNAMELEN];
1610	char luroot[MAXPATHLEN];
1611	int i, num_fs = 0;
1612	struct zone_fstab *fs_ptr = NULL;
1613	zone_dochandle_t handle = NULL;
1614	zone_state_t zstate;
1615	brand_handle_t bh;
1616	plat_gmount_cb_data_t cb;
1617
1618	if (zone_get_state(zone_name, &zstate) != Z_OK ||
1619	    (zstate != ZONE_STATE_READY && zstate != ZONE_STATE_MOUNTED)) {
1620		zerror(zlogp, B_FALSE,
1621		    "zone must be in '%s' or '%s' state to mount file-systems",
1622		    zone_state_str(ZONE_STATE_READY),
1623		    zone_state_str(ZONE_STATE_MOUNTED));
1624		goto bad;
1625	}
1626
1627	if (zone_get_zonepath(zone_name, zonepath, sizeof (zonepath)) != Z_OK) {
1628		zerror(zlogp, B_TRUE, "unable to determine zone path");
1629		goto bad;
1630	}
1631
1632	if (zone_get_rootpath(zone_name, rootpath, sizeof (rootpath)) != Z_OK) {
1633		zerror(zlogp, B_TRUE, "unable to determine zone root");
1634		goto bad;
1635	}
1636
1637	if ((handle = zonecfg_init_handle()) == NULL) {
1638		zerror(zlogp, B_TRUE, "getting zone configuration handle");
1639		goto bad;
1640	}
1641	if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK ||
1642	    zonecfg_setfsent(handle) != Z_OK) {
1643		zerror(zlogp, B_FALSE, "invalid configuration");
1644		goto bad;
1645	}
1646
1647	/* Get a handle to the brand info for this zone */
1648	if ((zone_get_brand(zone_name, brand, sizeof (brand)) != Z_OK) ||
1649	    (bh = brand_open(brand)) == NULL) {
1650		zerror(zlogp, B_FALSE, "unable to determine zone brand");
1651		zonecfg_fini_handle(handle);
1652		return (-1);
1653	}
1654
1655	/*
1656	 * Get the list of global filesystems to mount from the brand
1657	 * configuration.
1658	 */
1659	cb.pgcd_zlogp = zlogp;
1660	cb.pgcd_fs_tab = &fs_ptr;
1661	cb.pgcd_num_fs = &num_fs;
1662	if (brand_platform_iter_gmounts(bh, zonepath,
1663	    plat_gmount_cb, &cb) != 0) {
1664		zerror(zlogp, B_FALSE, "unable to mount filesystems");
1665		brand_close(bh);
1666		zonecfg_fini_handle(handle);
1667		return (-1);
1668	}
1669	brand_close(bh);
1670
1671	/*
1672	 * Iterate through the rest of the filesystems, first the IPDs, then
1673	 * the general FSs.  Sort them all, then mount them in sorted order.
1674	 * This is to make sure the higher level directories (e.g., /usr)
1675	 * get mounted before any beneath them (e.g., /usr/local).
1676	 */
1677	if (mount_filesystems_ipdent(handle, zlogp, &fs_ptr, &num_fs) != 0)
1678		goto bad;
1679
1680	if (mount_filesystems_fsent(handle, zlogp, &fs_ptr, &num_fs,
1681	    mount_cmd) != 0)
1682		goto bad;
1683
1684	zonecfg_fini_handle(handle);
1685	handle = NULL;
1686
1687	/*
1688	 * Normally when we mount a zone all the zone filesystems
1689	 * get mounted relative to rootpath, which is usually
1690	 * <zonepath>/root.  But when mounting a zone for administration
1691	 * purposes via the zone "mount" state, build_mounted_pre_var()
1692	 * updates rootpath to be <zonepath>/lu/a so we'll mount all
1693	 * the zones filesystems there instead.
1694	 *
1695	 * build_mounted_pre_var() and build_mounted_post_var() will
1696	 * also do some extra work to create directories and lofs mount
1697	 * a bunch of global zone file system paths into <zonepath>/lu.
1698	 *
1699	 * This allows us to be able to enter the zone (now rooted at
1700	 * <zonepath>/lu) and run the upgrade/patch tools that are in the
1701	 * global zone and have them upgrade the to-be-modified zone's
1702	 * files mounted on /a.  (Which mirrors the existing standard
1703	 * upgrade environment.)
1704	 *
1705	 * There is of course one catch.  When doing the upgrade
1706	 * we need <zoneroot>/lu/dev to be the /dev filesystem
1707	 * for the zone and we don't want to have any /dev filesystem
1708	 * mounted at <zoneroot>/lu/a/dev.  Since /dev is specified
1709	 * as a normal zone filesystem by default we'll try to mount
1710	 * it at <zoneroot>/lu/a/dev, so we have to detect this
1711	 * case and instead mount it at <zoneroot>/lu/dev.
1712	 *
1713	 * All this work is done in three phases:
1714	 *   1) Create and populate lu directory (build_mounted_pre_var()).
1715	 *   2) Mount the required filesystems as per the zone configuration.
1716	 *   3) Set up the rest of the scratch zone environment
1717	 *	(build_mounted_post_var()).
1718	 */
1719	if (mount_cmd &&
1720	    !build_mounted_pre_var(zlogp,
1721	    rootpath, sizeof (rootpath), zonepath, luroot, sizeof (luroot)))
1722		goto bad;
1723
1724	qsort(fs_ptr, num_fs, sizeof (*fs_ptr), fs_compare);
1725
1726	for (i = 0; i < num_fs; i++) {
1727		if (mount_cmd &&
1728		    strcmp(fs_ptr[i].zone_fs_dir, "/dev") == 0) {
1729			size_t slen = strlen(rootpath) - 2;
1730
1731			/*
1732			 * By default we'll try to mount /dev as /a/dev
1733			 * but /dev is special and always goes at the top
1734			 * so strip the trailing '/a' from the rootpath.
1735			 */
1736			assert(zone_isnative);
1737			assert(strcmp(&rootpath[slen], "/a") == 0);
1738			rootpath[slen] = '\0';
1739			if (mount_one(zlogp, &fs_ptr[i], rootpath) != 0)
1740				goto bad;
1741			rootpath[slen] = '/';
1742			continue;
1743		}
1744		if (mount_one(zlogp, &fs_ptr[i], rootpath) != 0)
1745			goto bad;
1746	}
1747	if (mount_cmd &&
1748	    !build_mounted_post_var(zlogp, rootpath, luroot))
1749		goto bad;
1750
1751	/*
1752	 * For Trusted Extensions cross-mount each lower level /export/home
1753	 */
1754	if (!mount_cmd && tsol_mounts(zlogp, zone_name, rootpath) != 0)
1755		goto bad;
1756
1757	free_fs_data(fs_ptr, num_fs);
1758
1759	/*
1760	 * Everything looks fine.
1761	 */
1762	return (0);
1763
1764bad:
1765	if (handle != NULL)
1766		zonecfg_fini_handle(handle);
1767	free_fs_data(fs_ptr, num_fs);
1768	return (-1);
1769}
1770
/*
 * Convert the decimal prefix length in prefixstr into a netmask written at
 * maskstr.  Full octets are stored as 0xFF; the bits of a trailing partial
 * octet are OR-ed into the byte that is already there.  Returns 0 on
 * success, or 1 when the prefix length is negative or exceeds maxprefixlen.
 *
 * The caller makes sure neither pointer parameter is NULL.
 */
static int
addr2netmask(char *prefixstr, int maxprefixlen, uchar_t *maskstr)
{
	int nbits = atoi(prefixstr);
	int bit;

	if (nbits < 0 || nbits > maxprefixlen)
		return (1);

	/* whole octets first */
	for (; nbits >= 8; nbits -= 8)
		*maskstr++ = 0xFF;

	/* then the remaining high-order bits of the last, partial octet */
	for (bit = 7; nbits > 0; nbits--, bit--)
		*maskstr |= 1 << bit;

	return (0);
}
1791
1792/*
1793 * Tear down all interfaces belonging to the given zone.  This should
1794 * be called with the zone in a state other than "running", so that
1795 * interfaces can't be assigned to the zone after this returns.
1796 *
1797 * If anything goes wrong, log an error message and return an error.
1798 */
1799static int
1800unconfigure_shared_network_interfaces(zlog_t *zlogp, zoneid_t zone_id)
1801{
1802	struct lifnum lifn;
1803	struct lifconf lifc;
1804	struct lifreq *lifrp, lifrl;
1805	int64_t lifc_flags = LIFC_NOXMIT | LIFC_ALLZONES;
1806	int num_ifs, s, i, ret_code = 0;
1807	uint_t bufsize;
1808	char *buf = NULL;
1809
1810	if ((s = socket(AF_INET, SOCK_DGRAM, 0)) < 0) {
1811		zerror(zlogp, B_TRUE, "could not get socket");
1812		ret_code = -1;
1813		goto bad;
1814	}
1815	lifn.lifn_family = AF_UNSPEC;
1816	lifn.lifn_flags = (int)lifc_flags;
1817	if (ioctl(s, SIOCGLIFNUM, (char *)&lifn) < 0) {
1818		zerror(zlogp, B_TRUE,
1819		    "could not determine number of network interfaces");
1820		ret_code = -1;
1821		goto bad;
1822	}
1823	num_ifs = lifn.lifn_count;
1824	bufsize = num_ifs * sizeof (struct lifreq);
1825	if ((buf = malloc(bufsize)) == NULL) {
1826		zerror(zlogp, B_TRUE, "memory allocation failed");
1827		ret_code = -1;
1828		goto bad;
1829	}
1830	lifc.lifc_family = AF_UNSPEC;
1831	lifc.lifc_flags = (int)lifc_flags;
1832	lifc.lifc_len = bufsize;
1833	lifc.lifc_buf = buf;
1834	if (ioctl(s, SIOCGLIFCONF, (char *)&lifc) < 0) {
1835		zerror(zlogp, B_TRUE, "could not get configured network "
1836		    "interfaces");
1837		ret_code = -1;
1838		goto bad;
1839	}
1840	lifrp = lifc.lifc_req;
1841	for (i = lifc.lifc_len / sizeof (struct lifreq); i > 0; i--, lifrp++) {
1842		(void) close(s);
1843		if ((s = socket(lifrp->lifr_addr.ss_family, SOCK_DGRAM, 0)) <
1844		    0) {
1845			zerror(zlogp, B_TRUE, "%s: could not get socket",
1846			    lifrl.lifr_name);
1847			ret_code = -1;
1848			continue;
1849		}
1850		(void) memset(&lifrl, 0, sizeof (lifrl));
1851		(void) strncpy(lifrl.lifr_name, lifrp->lifr_name,
1852		    sizeof (lifrl.lifr_name));
1853		if (ioctl(s, SIOCGLIFZONE, (caddr_t)&lifrl) < 0) {
1854			if (errno == ENXIO)
1855				/*
1856				 * Interface may have been removed by admin or
1857				 * another zone halting.
1858				 */
1859				continue;
1860			zerror(zlogp, B_TRUE,
1861			    "%s: could not determine the zone to which this "
1862			    "network interface is bound", lifrl.lifr_name);
1863			ret_code = -1;
1864			continue;
1865		}
1866		if (lifrl.lifr_zoneid == zone_id) {
1867			if (ioctl(s, SIOCLIFREMOVEIF, (caddr_t)&lifrl) < 0) {
1868				zerror(zlogp, B_TRUE,
1869				    "%s: could not remove network interface",
1870				    lifrl.lifr_name);
1871				ret_code = -1;
1872				continue;
1873			}
1874		}
1875	}
1876bad:
1877	if (s > 0)
1878		(void) close(s);
1879	if (buf)
1880		free(buf);
1881	return (ret_code);
1882}
1883
/*
 * A socket address viewed as any of the address families handled by salen().
 * so_dst and so_ifp are filled in by who_is_using() and copied into
 * rtmsg.space to form the address part of its RTM_GET request.
 */
static union	sockunion {
	struct	sockaddr sa;
	struct	sockaddr_in sin;
	struct	sockaddr_dl sdl;
	struct	sockaddr_in6 sin6;
} so_dst, so_ifp;
1890
/*
 * Buffer for routing socket messages: the message header followed by 512
 * bytes of space for the socket addresses that accompany it.
 * who_is_using() writes its request from this buffer and reads the reply
 * back into it.
 */
static struct {
	struct	rt_msghdr hdr;
	char	space[512];
} rtmsg;
1895
1896static int
1897salen(struct sockaddr *sa)
1898{
1899	switch (sa->sa_family) {
1900	case AF_INET:
1901		return (sizeof (struct sockaddr_in));
1902	case AF_LINK:
1903		return (sizeof (struct sockaddr_dl));
1904	case AF_INET6:
1905		return (sizeof (struct sockaddr_in6));
1906	default:
1907		return (sizeof (struct sockaddr));
1908	}
1909}
1910
/*
 * Round a positive length up to the next multiple of sizeof (long); any
 * value that is zero or negative yields sizeof (long).  The argument is
 * evaluated more than once.
 */
#define	ROUNDUP_LONG(a) \
	(((a) <= 0) ? sizeof (long) : ((((a) - 1) | (sizeof (long) - 1)) + 1))
1913
1914/*
1915 * Look up which zone is using a given IP address.  The address in question
1916 * is expected to have been stuffed into the structure to which lifr points
1917 * via a previous SIOCGLIFADDR ioctl().
1918 *
1919 * This is done using black router socket magic.
1920 *
1921 * Return the name of the zone on success or NULL on failure.
1922 *
1923 * This is a lot of code for a simple task; a new ioctl request to take care
1924 * of this might be a useful RFE.
1925 */
1926
1927static char *
1928who_is_using(zlog_t *zlogp, struct lifreq *lifr)
1929{
1930	static char answer[ZONENAME_MAX];
1931	pid_t pid;
1932	int s, rlen, l, i;
1933	char *cp = rtmsg.space;
1934	struct sockaddr_dl *ifp = NULL;
1935	struct sockaddr *sa;
1936	char save_if_name[LIFNAMSIZ];
1937
1938	answer[0] = '\0';
1939
1940	pid = getpid();
1941	if ((s = socket(PF_ROUTE, SOCK_RAW, 0)) < 0) {
1942		zerror(zlogp, B_TRUE, "could not get routing socket");
1943		return (NULL);
1944	}
1945
1946	if (lifr->lifr_addr.ss_family == AF_INET) {
1947		struct sockaddr_in *sin4;
1948
1949		so_dst.sa.sa_family = AF_INET;
1950		sin4 = (struct sockaddr_in *)&lifr->lifr_addr;
1951		so_dst.sin.sin_addr = sin4->sin_addr;
1952	} else {
1953		struct sockaddr_in6 *sin6;
1954
1955		so_dst.sa.sa_family = AF_INET6;
1956		sin6 = (struct sockaddr_in6 *)&lifr->lifr_addr;
1957		so_dst.sin6.sin6_addr = sin6->sin6_addr;
1958	}
1959
1960	so_ifp.sa.sa_family = AF_LINK;
1961
1962	(void) memset(&rtmsg, 0, sizeof (rtmsg));
1963	rtmsg.hdr.rtm_type = RTM_GET;
1964	rtmsg.hdr.rtm_flags = RTF_UP | RTF_HOST;
1965	rtmsg.hdr.rtm_version = RTM_VERSION;
1966	rtmsg.hdr.rtm_seq = ++rts_seqno;
1967	rtmsg.hdr.rtm_addrs = RTA_IFP | RTA_DST;
1968
1969	l = ROUNDUP_LONG(salen(&so_dst.sa));
1970	(void) memmove(cp, &(so_dst), l);
1971	cp += l;
1972	l = ROUNDUP_LONG(salen(&so_ifp.sa));
1973	(void) memmove(cp, &(so_ifp), l);
1974	cp += l;
1975
1976	rtmsg.hdr.rtm_msglen = l = cp - (char *)&rtmsg;
1977
1978	if ((rlen = write(s, &rtmsg, l)) < 0) {
1979		zerror(zlogp, B_TRUE, "writing to routing socket");
1980		return (NULL);
1981	} else if (rlen < (int)rtmsg.hdr.rtm_msglen) {
1982		zerror(zlogp, B_TRUE,
1983		    "write to routing socket got only %d for len\n", rlen);
1984		return (NULL);
1985	}
1986	do {
1987		l = read(s, &rtmsg, sizeof (rtmsg));
1988	} while (l > 0 && (rtmsg.hdr.rtm_seq != rts_seqno ||
1989	    rtmsg.hdr.rtm_pid != pid));
1990	if (l < 0) {
1991		zerror(zlogp, B_TRUE, "reading from routing socket");
1992		return (NULL);
1993	}
1994
1995	if (rtmsg.hdr.rtm_version != RTM_VERSION) {
1996		zerror(zlogp, B_FALSE,
1997		    "routing message version %d not understood",
1998		    rtmsg.hdr.rtm_version);
1999		return (NULL);
2000	}
2001	if (rtmsg.hdr.rtm_msglen != (ushort_t)l) {
2002		zerror(zlogp, B_FALSE, "message length mismatch, "
2003		    "expected %d bytes, returned %d bytes",
2004		    rtmsg.hdr.rtm_msglen, l);
2005		return (NULL);
2006	}
2007	if (rtmsg.hdr.rtm_errno != 0)  {
2008		errno = rtmsg.hdr.rtm_errno;
2009		zerror(zlogp, B_TRUE, "RTM_GET routing socket message");
2010		return (NULL);
2011	}
2012	if ((rtmsg.hdr.rtm_addrs & RTA_IFP) == 0) {
2013		zerror(zlogp, B_FALSE, "network interface not found");
2014		return (NULL);
2015	}
2016	cp = ((char *)(&rtmsg.hdr + 1));
2017	for (i = 1; i != 0; i <<= 1) {
2018		/* LINTED E_BAD_PTR_CAST_ALIGN */
2019		sa = (struct sockaddr *)cp;
2020		if (i != RTA_IFP) {
2021			if ((i & rtmsg.hdr.rtm_addrs) != 0)
2022				cp += ROUNDUP_LONG(salen(sa));
2023			continue;
2024		}
2025		if (sa->sa_family == AF_LINK &&
2026		    ((struct sockaddr_dl *)sa)->sdl_nlen != 0)
2027			ifp = (struct sockaddr_dl *)sa;
2028		break;
2029	}
2030	if (ifp == NULL) {
2031		zerror(zlogp, B_FALSE, "network interface could not be "
2032		    "determined");
2033		return (NULL);
2034	}
2035
2036	/*
2037	 * We need to set the I/F name to what we got above, then do the
2038	 * appropriate ioctl to get its zone name.  But lifr->lifr_name is
2039	 * used by the calling function to do a REMOVEIF, so if we leave the
2040	 * "good" zone's I/F name in place, *that* I/F will be removed instead
2041	 * of the bad one.  So we save the old (bad) I/F name before over-
2042	 * writing it and doing the ioctl, then restore it after the ioctl.
2043	 */
2044	(void) strlcpy(save_if_name, lifr->lifr_name, sizeof (save_if_name));
2045	(void) strncpy(lifr->lifr_name, ifp->sdl_data, ifp->sdl_nlen);
2046	lifr->lifr_name[ifp->sdl_nlen] = '\0';
2047	i = ioctl(s, SIOCGLIFZONE, lifr);
2048	(void) strlcpy(lifr->lifr_name, save_if_name, sizeof (save_if_name));
2049	if (i < 0) {
2050		zerror(zlogp, B_TRUE,
2051		    "%s: could not determine the zone network interface "
2052		    "belongs to", lifr->lifr_name);
2053		return (NULL);
2054	}
2055	if (getzonenamebyid(lifr->lifr_zoneid, answer, sizeof (answer)) < 0)
2056		(void) snprintf(answer, sizeof (answer), "%d",
2057		    lifr->lifr_zoneid);
2058
2059	if (strlen(answer) > 0)
2060		return (answer);
2061	return (NULL);
2062}
2063
2064typedef struct mcast_rtmsg_s {
2065	struct rt_msghdr	m_rtm;
2066	union {
2067		struct {
2068			struct sockaddr_in	m_dst;
2069			struct sockaddr_in	m_gw;
2070			struct sockaddr_in	m_netmask;
2071		} m_v4;
2072		struct {
2073			struct sockaddr_in6	m_dst;
2074			struct sockaddr_in6	m_gw;
2075			struct sockaddr_in6	m_netmask;
2076		} m_v6;
2077	} m_u;
2078} mcast_rtmsg_t;
2079#define	m_dst4		m_u.m_v4.m_dst
2080#define	m_dst6		m_u.m_v6.m_dst
2081#define	m_gw4		m_u.m_v4.m_gw
2082#define	m_gw6		m_u.m_v6.m_gw
2083#define	m_netmask4	m_u.m_v4.m_netmask
2084#define	m_netmask6	m_u.m_v6.m_netmask
2085
/*
 * Configures a single interface: a new virtual interface is added, based on
 * the physical interface nwiftabptr->zone_nwif_physical, with the address
 * specified in nwiftabptr->zone_nwif_address, for zone zone_id.  Note that
 * the "address" can be an IPv6 address (with a /prefixlength required), an
 * IPv4 address (with a /prefixlength optional), or a name; for the latter,
 * an IPv4 name-to-address resolution will be attempted.
 *
 * A default interface route for multicast is created on the first IPv4 and
 * IPv6 interfaces (that have the IFF_MULTICAST flag set), respectively.
 * This should really be done in the init scripts if we ever allow zones to
 * modify the routing tables.
 *
 * mcast_rt_v4_setp and mcast_rt_v6_setp may be NULL, in which case no
 * multicast route is attempted for that address family.  When non-NULL and
 * B_FALSE on entry, the flag for the interface's address family is set to
 * B_TRUE once the route message has been written in full.
 *
 * Returns Z_OK on success, and also when the logical interface could not be
 * added at all (that case is only logged to syslog as a warning).  If
 * anything else goes wrong, we log a detailed error message, attempt to
 * tear down whatever we set up and return -1.
 */
static int
configure_one_interface(zlog_t *zlogp, zoneid_t zone_id,
    struct zone_nwiftab *nwiftabptr, boolean_t *mcast_rt_v4_setp,
    boolean_t *mcast_rt_v6_setp)
{
	struct lifreq lifr;
	struct sockaddr_in netmask4;
	struct sockaddr_in6 netmask6;
	struct in_addr in4;
	struct in6_addr in6;
	sa_family_t af;
	char *slashp = strchr(nwiftabptr->zone_nwif_address, '/');
	mcast_rtmsg_t mcast_rtmsg;
	int s;
	int rs;
	int rlen;
	boolean_t got_netmask = B_FALSE;
	char addrstr4[INET_ADDRSTRLEN];
	int res;

	res = zonecfg_valid_net_address(nwiftabptr->zone_nwif_address, &lifr);
	if (res != Z_OK) {
		zerror(zlogp, B_FALSE, "%s: %s", zonecfg_strerror(res),
		    nwiftabptr->zone_nwif_address);
		return (-1);
	}
	/*
	 * lifr.lifr_addr is reused below (it is overwritten with the netmask),
	 * so keep a private copy of the address; it is needed later as the
	 * gateway of the multicast route and for the netmask warning.
	 */
	af = lifr.lifr_addr.ss_family;
	if (af == AF_INET)
		in4 = ((struct sockaddr_in *)(&lifr.lifr_addr))->sin_addr;
	else
		in6 = ((struct sockaddr_in6 *)(&lifr.lifr_addr))->sin6_addr;

	if ((s = socket(af, SOCK_DGRAM, 0)) < 0) {
		zerror(zlogp, B_TRUE, "could not get socket");
		return (-1);
	}

	(void) strlcpy(lifr.lifr_name, nwiftabptr->zone_nwif_physical,
	    sizeof (lifr.lifr_name));
	if (ioctl(s, SIOCLIFADDIF, (caddr_t)&lifr) < 0) {
		/*
		 * Here, we know that the interface can't be brought up.
		 * A similar warning message was already printed out to
		 * the console by zoneadm(1M) so instead we log the
		 * message to syslog and continue.
		 */
		zerror(&logsys, B_TRUE, "WARNING: skipping network interface "
		    "'%s' which may not be present/plumbed in the "
		    "global zone.", lifr.lifr_name);
		(void) close(s);
		return (Z_OK);
	}

	/*
	 * From here on a logical interface exists; every failure path goes
	 * through "bad" so that it is removed again.
	 */
	if (ioctl(s, SIOCSLIFADDR, (caddr_t)&lifr) < 0) {
		zerror(zlogp, B_TRUE,
		    "%s: could not set IP address to %s",
		    lifr.lifr_name, nwiftabptr->zone_nwif_address);
		goto bad;
	}

	/* Preserve literal IPv4 address for later potential printing. */
	if (af == AF_INET)
		(void) inet_ntop(AF_INET, &in4, addrstr4, INET_ADDRSTRLEN);

	lifr.lifr_zoneid = zone_id;
	if (ioctl(s, SIOCSLIFZONE, (caddr_t)&lifr) < 0) {
		zerror(zlogp, B_TRUE, "%s: could not place network interface "
		    "into zone", lifr.lifr_name);
		goto bad;
	}

	if (strcmp(nwiftabptr->zone_nwif_physical, "lo0") == 0) {
		got_netmask = B_TRUE;	/* default setting will be correct */
	} else {
		if (af == AF_INET) {
			/*
			 * The IPv4 netmask can be determined either
			 * directly if a prefix length was supplied with
			 * the address or via the netmasks database.  Not
			 * being able to determine it is a common failure,
			 * but it often is not fatal to operation of the
			 * interface.  In that case, a warning will be
			 * printed after the rest of the interface's
			 * parameters have been configured.
			 */
			(void) memset(&netmask4, 0, sizeof (netmask4));
			if (slashp != NULL) {
				if (addr2netmask(slashp + 1, V4_ADDR_LEN,
				    (uchar_t *)&netmask4.sin_addr) != 0) {
					/*
					 * NOTE(review): slashp was found by
					 * searching for '/', so this store
					 * looks like a no-op -- confirm.
					 */
					*slashp = '/';
					zerror(zlogp, B_FALSE,
					    "%s: invalid prefix length in %s",
					    lifr.lifr_name,
					    nwiftabptr->zone_nwif_address);
					goto bad;
				}
				got_netmask = B_TRUE;
			} else if (getnetmaskbyaddr(in4,
			    &netmask4.sin_addr) == 0) {
				got_netmask = B_TRUE;
			}
			if (got_netmask) {
				netmask4.sin_family = af;
				(void) memcpy(&lifr.lifr_addr, &netmask4,
				    sizeof (netmask4));
			}
		} else {
			/*
			 * NOTE(review): slashp is dereferenced without a NULL
			 * check here; presumably zonecfg_valid_net_address()
			 * has already rejected IPv6 addresses that lack a
			 * prefix length -- verify.
			 */
			(void) memset(&netmask6, 0, sizeof (netmask6));
			if (addr2netmask(slashp + 1, V6_ADDR_LEN,
			    (uchar_t *)&netmask6.sin6_addr) != 0) {
				*slashp = '/';
				zerror(zlogp, B_FALSE,
				    "%s: invalid prefix length in %s",
				    lifr.lifr_name,
				    nwiftabptr->zone_nwif_address);
				goto bad;
			}
			got_netmask = B_TRUE;
			netmask6.sin6_family = af;
			(void) memcpy(&lifr.lifr_addr, &netmask6,
			    sizeof (netmask6));
		}
		if (got_netmask &&
		    ioctl(s, SIOCSLIFNETMASK, (caddr_t)&lifr) < 0) {
			zerror(zlogp, B_TRUE, "%s: could not set netmask",
			    lifr.lifr_name);
			goto bad;
		}

		/*
		 * This doesn't set the broadcast address at all. Rather, it
		 * gets, then sets the interface's address, relying on the fact
		 * that resetting the address will reset the broadcast address.
		 */
		if (ioctl(s, SIOCGLIFADDR, (caddr_t)&lifr) < 0) {
			zerror(zlogp, B_TRUE, "%s: could not get address",
			    lifr.lifr_name);
			goto bad;
		}
		if (ioctl(s, SIOCSLIFADDR, (caddr_t)&lifr) < 0) {
			zerror(zlogp, B_TRUE,
			    "%s: could not reset broadcast address",
			    lifr.lifr_name);
			goto bad;
		}
	}

	/* Read-modify-write the interface flags to mark it up. */
	if (ioctl(s, SIOCGLIFFLAGS, (caddr_t)&lifr) < 0) {
		zerror(zlogp, B_TRUE, "%s: could not get flags",
		    lifr.lifr_name);
		goto bad;
	}
	lifr.lifr_flags |= IFF_UP;
	if (ioctl(s, SIOCSLIFFLAGS, (caddr_t)&lifr) < 0) {
		int save_errno = errno;
		char *zone_using;

		/*
		 * If we failed with something other than EADDRNOTAVAIL,
		 * then skip to the end.  Otherwise, look up our address,
		 * then call a function to determine which zone is already
		 * using that address.
		 */
		if (errno != EADDRNOTAVAIL) {
			zerror(zlogp, B_TRUE,
			    "%s: could not bring network interface up",
			    lifr.lifr_name);
			goto bad;
		}
		if (ioctl(s, SIOCGLIFADDR, (caddr_t)&lifr) < 0) {
			zerror(zlogp, B_TRUE, "%s: could not get address",
			    lifr.lifr_name);
			goto bad;
		}
		zone_using = who_is_using(zlogp, &lifr);
		/*
		 * Restore the errno of the failed SIOCSLIFFLAGS so that the
		 * messages below report it rather than whatever the lookup
		 * left behind.
		 */
		errno = save_errno;
		if (zone_using == NULL)
			zerror(zlogp, B_TRUE,
			    "%s: could not bring network interface up",
			    lifr.lifr_name);
		else
			zerror(zlogp, B_TRUE, "%s: could not bring network "
			    "interface up: address in use by zone '%s'",
			    lifr.lifr_name, zone_using);
		goto bad;
	}
	/*
	 * Add the default multicast route, but only on a multicast-capable
	 * interface and only if the caller is tracking, and has not yet set,
	 * the route for this address family.
	 */
	if ((lifr.lifr_flags & IFF_MULTICAST) && ((af == AF_INET &&
	    mcast_rt_v4_setp != NULL && *mcast_rt_v4_setp == B_FALSE) ||
	    (af == AF_INET6 &&
	    mcast_rt_v6_setp != NULL && *mcast_rt_v6_setp == B_FALSE))) {
		rs = socket(PF_ROUTE, SOCK_RAW, 0);
		if (rs < 0) {
			zerror(zlogp, B_TRUE, "%s: could not create "
			    "routing socket", lifr.lifr_name);
			goto bad;
		}
		/* Nothing is read back from rs below; only a write is done. */
		(void) shutdown(rs, 0);
		(void) memset((void *)&mcast_rtmsg, 0, sizeof (mcast_rtmsg_t));
		/* Header plus three sockaddrs of this address family. */
		mcast_rtmsg.m_rtm.rtm_msglen =  sizeof (struct rt_msghdr) +
		    3 * (af == AF_INET ? sizeof (struct sockaddr_in) :
		    sizeof (struct sockaddr_in6));
		mcast_rtmsg.m_rtm.rtm_version = RTM_VERSION;
		mcast_rtmsg.m_rtm.rtm_type = RTM_ADD;
		mcast_rtmsg.m_rtm.rtm_flags = RTF_UP;
		mcast_rtmsg.m_rtm.rtm_addrs =
		    RTA_DST | RTA_GATEWAY | RTA_NETMASK;
		mcast_rtmsg.m_rtm.rtm_seq = ++rts_seqno;
		/* The interface's own address is used as the gateway. */
		if (af == AF_INET) {
			mcast_rtmsg.m_dst4.sin_family = AF_INET;
			mcast_rtmsg.m_dst4.sin_addr.s_addr =
			    htonl(INADDR_UNSPEC_GROUP);
			mcast_rtmsg.m_gw4.sin_family = AF_INET;
			mcast_rtmsg.m_gw4.sin_addr = in4;
			mcast_rtmsg.m_netmask4.sin_family = AF_INET;
			mcast_rtmsg.m_netmask4.sin_addr.s_addr =
			    htonl(IN_CLASSD_NET);
		} else {
			mcast_rtmsg.m_dst6.sin6_family = AF_INET6;
			mcast_rtmsg.m_dst6.sin6_addr.s6_addr[0] = 0xffU;
			mcast_rtmsg.m_gw6.sin6_family = AF_INET6;
			mcast_rtmsg.m_gw6.sin6_addr = in6;
			mcast_rtmsg.m_netmask6.sin6_family = AF_INET6;
			mcast_rtmsg.m_netmask6.sin6_addr.s6_addr[0] = 0xffU;
		}
		rlen = write(rs, (char *)&mcast_rtmsg,
		    mcast_rtmsg.m_rtm.rtm_msglen);
		/*
		 * The write to the multicast socket will fail if the
		 * interface belongs to a failed IPMP group. This is a
		 * non-fatal error and the zone will continue booting.
		 * While the zone is running, if any interface in the
		 * failed IPMP group recovers, the zone will fallback to
		 * using that interface.
		 */
		if (rlen < mcast_rtmsg.m_rtm.rtm_msglen) {
			if (rlen < 0) {
				zerror(zlogp, B_TRUE, "WARNING: network "
				    "interface '%s' not available as default "
				    "for multicast.", lifr.lifr_name);
			} else {
				zerror(zlogp, B_FALSE, "WARNING: network "
				    "interface '%s' not available as default "
				    "for multicast; routing socket returned "
				    "unexpected %d bytes.",
				    lifr.lifr_name, rlen);
			}
		} else {

			/* Record success so later interfaces skip this. */
			if (af == AF_INET) {
				*mcast_rt_v4_setp = B_TRUE;
			} else {
				*mcast_rt_v6_setp = B_TRUE;
			}
		}
		(void) close(rs);
	}

	if (!got_netmask) {
		/*
		 * A common, but often non-fatal problem, is that the system
		 * cannot find the netmask for an interface address. This is
		 * often caused by it being only in /etc/inet/netmasks, but
		 * /etc/nsswitch.conf says to use NIS or NIS+ and it's not
		 * in that. This doesn't show up at boot because the netmask
		 * is obtained from /etc/inet/netmasks when no network
		 * interfaces are up, but isn't consulted when NIS/NIS+ is
		 * available. We warn the user here that something like this
		 * has happened and we're just running with a default and
		 * possibly incorrect netmask.
		 *
		 * Only an IPv4, non-loopback interface can get here: the lo0
		 * and IPv6 paths above always set got_netmask (or bail out),
		 * which is why addrstr4 is valid in the message below.
		 */
		char buffer[INET6_ADDRSTRLEN];
		void  *addr;

		/*
		 * addr points into lifr.lifr_addr, so it is formatted only
		 * after the SIOCGLIFNETMASK call below has had the chance to
		 * store the netmask there.
		 */
		if (af == AF_INET)
			addr = &((struct sockaddr_in *)
			    (&lifr.lifr_addr))->sin_addr;
		else
			addr = &((struct sockaddr_in6 *)
			    (&lifr.lifr_addr))->sin6_addr;

		/* Find out what netmask interface is going to be using */
		if (ioctl(s, SIOCGLIFNETMASK, (caddr_t)&lifr) < 0 ||
		    inet_ntop(af, addr, buffer, sizeof (buffer)) == NULL)
			goto bad;
		zerror(zlogp, B_FALSE,
		    "WARNING: %s: no matching subnet found in netmasks(4) for "
		    "%s; using default of %s.",
		    lifr.lifr_name, addrstr4, buffer);
	}

	(void) close(s);
	return (Z_OK);
bad:
	/* Best effort: undo the SIOCLIFADDIF done above. */
	(void) ioctl(s, SIOCLIFREMOVEIF, (caddr_t)&lifr);
	(void) close(s);
	return (-1);
}
2399
2400/*
2401 * Sets up network interfaces based on information from the zone configuration.
2402 * An IPv4 loopback interface is set up "for free", modeling the global system.
2403 * If any of the configuration interfaces were IPv6, then an IPv6 loopback
2404 * address is set up as well.
2405 *
2406 * If anything goes wrong, we log a general error message, attempt to tear down
2407 * whatever we set up, and return an error.
2408 */
2409static int
2410configure_shared_network_interfaces(zlog_t *zlogp)
2411{
2412	zone_dochandle_t handle;
2413	struct zone_nwiftab nwiftab, loopback_iftab;
2414	boolean_t saw_v6 = B_FALSE;
2415	boolean_t mcast_rt_v4_set = B_FALSE;
2416	boolean_t mcast_rt_v6_set = B_FALSE;
2417	zoneid_t zoneid;
2418
2419	if ((zoneid = getzoneidbyname(zone_name)) == ZONE_ID_UNDEFINED) {
2420		zerror(zlogp, B_TRUE, "unable to get zoneid");
2421		return (-1);
2422	}
2423
2424	if ((handle = zonecfg_init_handle()) == NULL) {
2425		zerror(zlogp, B_TRUE, "getting zone configuration handle");
2426		return (-1);
2427	}
2428	if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK) {
2429		zerror(zlogp, B_FALSE, "invalid configuration");
2430		zonecfg_fini_handle(handle);
2431		return (-1);
2432	}
2433	if (zonecfg_setnwifent(handle) == Z_OK) {
2434		for (;;) {
2435			struct in6_addr in6;
2436
2437			if (zonecfg_getnwifent(handle, &nwiftab) != Z_OK)
2438				break;
2439			if (configure_one_interface(zlogp, zoneid,
2440			    &nwiftab, &mcast_rt_v4_set, &mcast_rt_v6_set) !=
2441			    Z_OK) {
2442				(void) zonecfg_endnwifent(handle);
2443				zonecfg_fini_handle(handle);
2444				return (-1);
2445			}
2446			if (inet_pton(AF_INET6, nwiftab.zone_nwif_address,
2447			    &in6) == 1)
2448				saw_v6 = B_TRUE;
2449		}
2450		(void) zonecfg_endnwifent(handle);
2451	}
2452	zonecfg_fini_handle(handle);
2453	(void) strlcpy(loopback_iftab.zone_nwif_physical, "lo0",
2454	    sizeof (loopback_iftab.zone_nwif_physical));
2455	(void) strlcpy(loopback_iftab.zone_nwif_address, "127.0.0.1",
2456	    sizeof (loopback_iftab.zone_nwif_address));
2457	if (configure_one_interface(zlogp, zoneid, &loopback_iftab, NULL, NULL)
2458	    != Z_OK) {
2459		return (-1);
2460	}
2461	if (saw_v6) {
2462		(void) strlcpy(loopback_iftab.zone_nwif_address, "::1/128",
2463		    sizeof (loopback_iftab.zone_nwif_address));
2464		if (configure_one_interface(zlogp, zoneid,
2465		    &loopback_iftab, NULL, NULL) != Z_OK) {
2466			return (-1);
2467		}
2468	}
2469	return (0);
2470}
2471
2472static void
2473show_owner(zlog_t *zlogp, char *dlname)
2474{
2475	zoneid_t dl_owner_zid;
2476	char dl_owner_zname[ZONENAME_MAX];
2477
2478	dl_owner_zid = ALL_ZONES;
2479	if (zone_check_datalink(&dl_owner_zid, dlname) != 0)
2480		(void) snprintf(dl_owner_zname, ZONENAME_MAX, "<unknown>");
2481	else if (getzonenamebyid(dl_owner_zid, dl_owner_zname, ZONENAME_MAX)
2482	    < 0)
2483		(void) snprintf(dl_owner_zname, ZONENAME_MAX, "<%d>",
2484		    dl_owner_zid);
2485
2486	errno = EPERM;
2487	zerror(zlogp, B_TRUE, "WARNING: skipping network interface '%s' "
2488	    "which is used by the non-global zone '%s'.\n",
2489	    dlname, dl_owner_zname);
2490}
2491
2492static int
2493add_datalink(zlog_t *zlogp, zoneid_t zoneid, char *dlname)
2494{
2495	/* First check if it's in use by global zone. */
2496	if (zonecfg_ifname_exists(AF_INET, dlname) ||
2497	    zonecfg_ifname_exists(AF_INET6, dlname)) {
2498		errno = EPERM;
2499		zerror(zlogp, B_TRUE, "WARNING: skipping network interface "
2500		    "'%s' which is used in the global zone.", dlname);
2501		return (-1);
2502	}
2503
2504	/* Add access control information */
2505	if (zone_add_datalink(zoneid, dlname) != 0) {
2506		/* If someone got this link before us, show its name */
2507		if (errno == EPERM)
2508			show_owner(zlogp, dlname);
2509		else
2510			zerror(zlogp, B_TRUE, "WARNING: unable to add network "
2511			    "interface '%s'.", dlname);
2512		return (-1);
2513	}
2514
2515	/* Hold the link for this zone */
2516	if (dladm_hold_link(dlname, zoneid, B_FALSE) < 0) {
2517		zerror(zlogp, B_TRUE, "WARNING: unable to hold network "
2518		    "interface '%s'.", dlname);
2519		(void) zone_remove_datalink(zoneid, dlname);
2520		return (-1);
2521	}
2522
2523	return (0);
2524}
2525
2526static int
2527remove_datalink(zlog_t *zlogp, zoneid_t zoneid, char *dlname)
2528{
2529	/*
2530	 * Remove access control information.
2531	 * If the errno is ENXIO, the interface is not added yet,
2532	 * nothing to report then.
2533	 */
2534	if (zone_remove_datalink(zoneid, dlname) != 0) {
2535		if (errno == ENXIO)
2536			return (0);
2537		zerror(zlogp, B_TRUE, "unable to remove network interface '%s'",
2538		    dlname);
2539		return (-1);
2540	}
2541
2542	if (dladm_rele_link(dlname, 0, B_FALSE) < 0) {
2543		zerror(zlogp, B_TRUE, "unable to release network "
2544		    "interface '%s'", dlname);
2545		return (-1);
2546	}
2547	return (0);
2548}
2549
2550/*
2551 * Add the kernel access control information for the interface names.
2552 * If anything goes wrong, we log a general error message, attempt to tear down
2553 * whatever we set up, and return an error.
2554 */
2555static int
2556configure_exclusive_network_interfaces(zlog_t *zlogp)
2557{
2558	zone_dochandle_t handle;
2559	struct zone_nwiftab nwiftab;
2560	zoneid_t zoneid;
2561	char rootpath[MAXPATHLEN];
2562	char path[MAXPATHLEN];
2563	di_prof_t prof = NULL;
2564	boolean_t added = B_FALSE;
2565
2566	if ((zoneid = getzoneidbyname(zone_name)) == -1) {
2567		zerror(zlogp, B_TRUE, "unable to get zoneid");
2568		return (-1);
2569	}
2570
2571	if ((handle = zonecfg_init_handle()) == NULL) {
2572		zerror(zlogp, B_TRUE, "getting zone configuration handle");
2573		return (-1);
2574	}
2575	if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK) {
2576		zerror(zlogp, B_FALSE, "invalid configuration");
2577		zonecfg_fini_handle(handle);
2578		return (-1);
2579	}
2580
2581	if (zonecfg_setnwifent(handle) != Z_OK) {
2582		zonecfg_fini_handle(handle);
2583		return (0);
2584	}
2585
2586	for (;;) {
2587		if (zonecfg_getnwifent(handle, &nwiftab) != Z_OK)
2588			break;
2589
2590		if (prof == NULL) {
2591			if (zone_get_devroot(zone_name, rootpath,
2592			    sizeof (rootpath)) != Z_OK) {
2593				(void) zonecfg_endnwifent(handle);
2594				zonecfg_fini_handle(handle);
2595				zerror(zlogp, B_TRUE,
2596				    "unable to determine dev root");
2597				return (-1);
2598			}
2599			(void) snprintf(path, sizeof (path), "%s%s", rootpath,
2600			    "/dev");
2601			if (di_prof_init(path, &prof) != 0) {
2602				(void) zonecfg_endnwifent(handle);
2603				zonecfg_fini_handle(handle);
2604				zerror(zlogp, B_TRUE,
2605				    "failed to initialize profile");
2606				return (-1);
2607			}
2608		}
2609
2610		/*
2611		 * Only create the /dev entry if it's not in use.
2612		 * Note here the zone still boots when the interfaces
2613		 * assigned is inaccessible, used by others, etc.
2614		 */
2615		if (add_datalink(zlogp, zoneid, nwiftab.zone_nwif_physical)
2616		    == 0) {
2617			if (di_prof_add_dev(prof, nwiftab.zone_nwif_physical)
2618			    != 0) {
2619				(void) zonecfg_endnwifent(handle);
2620				zonecfg_fini_handle(handle);
2621				zerror(zlogp, B_TRUE,
2622				    "failed to add network device");
2623				return (-1);
2624			}
2625			added = B_TRUE;
2626		}
2627	}
2628	(void) zonecfg_endnwifent(handle);
2629	zonecfg_fini_handle(handle);
2630
2631	if (prof != NULL && added) {
2632		if (di_prof_commit(prof) != 0) {
2633			zerror(zlogp, B_TRUE, "failed to commit profile");
2634			return (-1);
2635		}
2636	}
2637	if (prof != NULL)
2638		di_prof_fini(prof);
2639
2640	return (0);
2641}
2642
2643/*
2644 * Get the list of the data-links from kernel, and try to remove it
2645 */
2646static int
2647unconfigure_exclusive_network_interfaces_run(zlog_t *zlogp, zoneid_t zoneid)
2648{
2649	char *dlnames, *ptr;
2650	int dlnum, dlnum_saved, i;
2651
2652	dlnum = 0;
2653	if (zone_list_datalink(zoneid, &dlnum, NULL) != 0) {
2654		zerror(zlogp, B_TRUE, "unable to list network interfaces");
2655		return (-1);
2656	}
2657again:
2658	/* this zone doesn't have any data-links */
2659	if (dlnum == 0)
2660		return (0);
2661
2662	dlnames = malloc(dlnum * LIFNAMSIZ);
2663	if (dlnames == NULL) {
2664		zerror(zlogp, B_TRUE, "memory allocation failed");
2665		return (-1);
2666	}
2667	dlnum_saved = dlnum;
2668
2669	if (zone_list_datalink(zoneid, &dlnum, dlnames) != 0) {
2670		zerror(zlogp, B_TRUE, "unable to list network interfaces");
2671		free(dlnames);
2672		return (-1);
2673	}
2674	if (dlnum_saved < dlnum) {
2675		/* list increased, try again */
2676		free(dlnames);
2677		goto again;
2678	}
2679	ptr = dlnames;
2680	for (i = 0; i < dlnum; i++) {
2681		/* Remove access control information */
2682		if (remove_datalink(zlogp, zoneid, ptr) != 0) {
2683			free(dlnames);
2684			return (-1);
2685		}
2686		ptr += LIFNAMSIZ;
2687	}
2688	free(dlnames);
2689	return (0);
2690}
2691
2692/*
2693 * Get the list of the data-links from configuration, and try to remove it
2694 */
2695static int
2696unconfigure_exclusive_network_interfaces_static(zlog_t *zlogp, zoneid_t zoneid)
2697{
2698	zone_dochandle_t handle;
2699	struct zone_nwiftab nwiftab;
2700
2701	if ((handle = zonecfg_init_handle()) == NULL) {
2702		zerror(zlogp, B_TRUE, "getting zone configuration handle");
2703		return (-1);
2704	}
2705	if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK) {
2706		zerror(zlogp, B_FALSE, "invalid configuration");
2707		zonecfg_fini_handle(handle);
2708		return (-1);
2709	}
2710	if (zonecfg_setnwifent(handle) != Z_OK) {
2711		zonecfg_fini_handle(handle);
2712		return (0);
2713	}
2714	for (;;) {
2715		if (zonecfg_getnwifent(handle, &nwiftab) != Z_OK)
2716			break;
2717		/* Remove access control information */
2718		if (remove_datalink(zlogp, zoneid, nwiftab.zone_nwif_physical)
2719		    != 0) {
2720			(void) zonecfg_endnwifent(handle);
2721			zonecfg_fini_handle(handle);
2722			return (-1);
2723		}
2724	}
2725	(void) zonecfg_endnwifent(handle);
2726	zonecfg_fini_handle(handle);
2727	return (0);
2728}
2729
2730/*
2731 * Remove the access control information from the kernel for the exclusive
2732 * network interfaces.
2733 */
2734static int
2735unconfigure_exclusive_network_interfaces(zlog_t *zlogp, zoneid_t zoneid)
2736{
2737	if (unconfigure_exclusive_network_interfaces_run(zlogp, zoneid) != 0) {
2738		return (unconfigure_exclusive_network_interfaces_static(zlogp,
2739		    zoneid));
2740	}
2741
2742	return (0);
2743}
2744
2745static int
2746tcp_abort_conn(zlog_t *zlogp, zoneid_t zoneid,
2747    const struct sockaddr_storage *local, const struct sockaddr_storage *remote)
2748{
2749	int fd;
2750	struct strioctl ioc;
2751	tcp_ioc_abort_conn_t conn;
2752	int error;
2753
2754	conn.ac_local = *local;
2755	conn.ac_remote = *remote;
2756	conn.ac_start = TCPS_SYN_SENT;
2757	conn.ac_end = TCPS_TIME_WAIT;
2758	conn.ac_zoneid = zoneid;
2759
2760	ioc.ic_cmd = TCP_IOC_ABORT_CONN;
2761	ioc.ic_timout = -1; /* infinite timeout */
2762	ioc.ic_len = sizeof (conn);
2763	ioc.ic_dp = (char *)&conn;
2764
2765	if ((fd = open("/dev/tcp", O_RDONLY)) < 0) {
2766		zerror(zlogp, B_TRUE, "unable to open %s", "/dev/tcp");
2767		return (-1);
2768	}
2769
2770	error = ioctl(fd, I_STR, &ioc);
2771	(void) close(fd);
2772	if (error == 0 || errno == ENOENT)	/* ENOENT is not an error */
2773		return (0);
2774	return (-1);
2775}
2776
2777static int
2778tcp_abort_connections(zlog_t *zlogp, zoneid_t zoneid)
2779{
2780	struct sockaddr_storage l, r;
2781	struct sockaddr_in *local, *remote;
2782	struct sockaddr_in6 *local6, *remote6;
2783	int error;
2784
2785	/*
2786	 * Abort IPv4 connections.
2787	 */
2788	bzero(&l, sizeof (*local));
2789	local = (struct sockaddr_in *)&l;
2790	local->sin_family = AF_INET;
2791	local->sin_addr.s_addr = INADDR_ANY;
2792	local->sin_port = 0;
2793
2794	bzero(&r, sizeof (*remote));
2795	remote = (struct sockaddr_in *)&r;
2796	remote->sin_family = AF_INET;
2797	remote->sin_addr.s_addr = INADDR_ANY;
2798	remote->sin_port = 0;
2799
2800	if ((error = tcp_abort_conn(zlogp, zoneid, &l, &r)) != 0)
2801		return (error);
2802
2803	/*
2804	 * Abort IPv6 connections.
2805	 */
2806	bzero(&l, sizeof (*local6));
2807	local6 = (struct sockaddr_in6 *)&l;
2808	local6->sin6_family = AF_INET6;
2809	local6->sin6_port = 0;
2810	local6->sin6_addr = in6addr_any;
2811
2812	bzero(&r, sizeof (*remote6));
2813	remote6 = (struct sockaddr_in6 *)&r;
2814	remote6->sin6_family = AF_INET6;
2815	remote6->sin6_port = 0;
2816	remote6->sin6_addr = in6addr_any;
2817
2818	if ((error = tcp_abort_conn(zlogp, zoneid, &l, &r)) != 0)
2819		return (error);
2820	return (0);
2821}
2822
2823static int
2824get_privset(zlog_t *zlogp, priv_set_t *privs, boolean_t mount_cmd)
2825{
2826	int error = -1;
2827	zone_dochandle_t handle;
2828	char *privname = NULL;
2829
2830	if ((handle = zonecfg_init_handle()) == NULL) {
2831		zerror(zlogp, B_TRUE, "getting zone configuration handle");
2832		return (-1);
2833	}
2834	if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK) {
2835		zerror(zlogp, B_FALSE, "invalid configuration");
2836		zonecfg_fini_handle(handle);
2837		return (-1);
2838	}
2839
2840	if (mount_cmd) {
2841		zone_iptype_t	iptype;
2842		const char	*curr_iptype;
2843
2844		if (zonecfg_get_iptype(handle, &iptype) != Z_OK) {
2845			zerror(zlogp, B_TRUE, "unable to determine ip-type");
2846			zonecfg_fini_handle(handle);
2847			return (-1);
2848		}
2849
2850		switch (iptype) {
2851		case ZS_SHARED:
2852			curr_iptype = "shared";
2853			break;
2854		case ZS_EXCLUSIVE:
2855			curr_iptype = "exclusive";
2856			break;
2857		}
2858
2859		if (zonecfg_default_privset(privs, curr_iptype) == Z_OK) {
2860			zonecfg_fini_handle(handle);
2861			return (0);
2862		}
2863		zerror(zlogp, B_FALSE,
2864		    "failed to determine the zone's default privilege set");
2865		zonecfg_fini_handle(handle);
2866		return (-1);
2867	}
2868
2869	switch (zonecfg_get_privset(handle, privs, &privname)) {
2870	case Z_OK:
2871		error = 0;
2872		break;
2873	case Z_PRIV_PROHIBITED:
2874		zerror(zlogp, B_FALSE, "privilege \"%s\" is not permitted "
2875		    "within the zone's privilege set", privname);
2876		break;
2877	case Z_PRIV_REQUIRED:
2878		zerror(zlogp, B_FALSE, "required privilege \"%s\" is missing "
2879		    "from the zone's privilege set", privname);
2880		break;
2881	case Z_PRIV_UNKNOWN:
2882		zerror(zlogp, B_FALSE, "unknown privilege \"%s\" specified "
2883		    "in the zone's privilege set", privname);
2884		break;
2885	default:
2886		zerror(zlogp, B_FALSE, "failed to determine the zone's "
2887		    "privilege set");
2888		break;
2889	}
2890
2891	free(privname);
2892	zonecfg_fini_handle(handle);
2893	return (error);
2894}
2895
2896static int
2897get_rctls(zlog_t *zlogp, char **bufp, size_t *bufsizep)
2898{
2899	nvlist_t *nvl = NULL;
2900	char *nvl_packed = NULL;
2901	size_t nvl_size = 0;
2902	nvlist_t **nvlv = NULL;
2903	int rctlcount = 0;
2904	int error = -1;
2905	zone_dochandle_t handle;
2906	struct zone_rctltab rctltab;
2907	rctlblk_t *rctlblk = NULL;
2908
2909	*bufp = NULL;
2910	*bufsizep = 0;
2911
2912	if ((handle = zonecfg_init_handle()) == NULL) {
2913		zerror(zlogp, B_TRUE, "getting zone configuration handle");
2914		return (-1);
2915	}
2916	if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK) {
2917		zerror(zlogp, B_FALSE, "invalid configuration");
2918		zonecfg_fini_handle(handle);
2919		return (-1);
2920	}
2921
2922	rctltab.zone_rctl_valptr = NULL;
2923	if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) {
2924		zerror(zlogp, B_TRUE, "%s failed", "nvlist_alloc");
2925		goto out;
2926	}
2927
2928	if (zonecfg_setrctlent(handle) != Z_OK) {
2929		zerror(zlogp, B_FALSE, "%s failed", "zonecfg_setrctlent");
2930		goto out;
2931	}
2932
2933	if ((rctlblk = malloc(rctlblk_size())) == NULL) {
2934		zerror(zlogp, B_TRUE, "memory allocation failed");
2935		goto out;
2936	}
2937	while (zonecfg_getrctlent(handle, &rctltab) == Z_OK) {
2938		struct zone_rctlvaltab *rctlval;
2939		uint_t i, count;
2940		const char *name = rctltab.zone_rctl_name;
2941
2942		/* zoneadm should have already warned about unknown rctls. */
2943		if (!zonecfg_is_rctl(name)) {
2944			zonecfg_free_rctl_value_list(rctltab.zone_rctl_valptr);
2945			rctltab.zone_rctl_valptr = NULL;
2946			continue;
2947		}
2948		count = 0;
2949		for (rctlval = rctltab.zone_rctl_valptr; rctlval != NULL;
2950		    rctlval = rctlval->zone_rctlval_next) {
2951			count++;
2952		}
2953		if (count == 0) {	/* ignore */
2954			continue;	/* Nothing to free */
2955		}
2956		if ((nvlv = malloc(sizeof (*nvlv) * count)) == NULL)
2957			goto out;
2958		i = 0;
2959		for (rctlval = rctltab.zone_rctl_valptr; rctlval != NULL;
2960		    rctlval = rctlval->zone_rctlval_next, i++) {
2961			if (nvlist_alloc(&nvlv[i], NV_UNIQUE_NAME, 0) != 0) {
2962				zerror(zlogp, B_TRUE, "%s failed",
2963				    "nvlist_alloc");
2964				goto out;
2965			}
2966			if (zonecfg_construct_rctlblk(rctlval, rctlblk)
2967			    != Z_OK) {
2968				zerror(zlogp, B_FALSE, "invalid rctl value: "
2969				    "(priv=%s,limit=%s,action=%s)",
2970				    rctlval->zone_rctlval_priv,
2971				    rctlval->zone_rctlval_limit,
2972				    rctlval->zone_rctlval_action);
2973				goto out;
2974			}
2975			if (!zonecfg_valid_rctl(name, rctlblk)) {
2976				zerror(zlogp, B_FALSE,
2977				    "(priv=%s,limit=%s,action=%s) is not a "
2978				    "valid value for rctl '%s'",
2979				    rctlval->zone_rctlval_priv,
2980				    rctlval->zone_rctlval_limit,
2981				    rctlval->zone_rctlval_action,
2982				    name);
2983				goto out;
2984			}
2985			if (nvlist_add_uint64(nvlv[i], "privilege",
2986			    rctlblk_get_privilege(rctlblk)) != 0) {
2987				zerror(zlogp, B_FALSE, "%s failed",
2988				    "nvlist_add_uint64");
2989				goto out;
2990			}
2991			if (nvlist_add_uint64(nvlv[i], "limit",
2992			    rctlblk_get_value(rctlblk)) != 0) {
2993				zerror(zlogp, B_FALSE, "%s failed",
2994				    "nvlist_add_uint64");
2995				goto out;
2996			}
2997			if (nvlist_add_uint64(nvlv[i], "action",
2998			    (uint_t)rctlblk_get_local_action(rctlblk, NULL))
2999			    != 0) {
3000				zerror(zlogp, B_FALSE, "%s failed",
3001				    "nvlist_add_uint64");
3002				goto out;
3003			}
3004		}
3005		zonecfg_free_rctl_value_list(rctltab.zone_rctl_valptr);
3006		rctltab.zone_rctl_valptr = NULL;
3007		if (nvlist_add_nvlist_array(nvl, (char *)name, nvlv, count)
3008		    != 0) {
3009			zerror(zlogp, B_FALSE, "%s failed",
3010			    "nvlist_add_nvlist_array");
3011			goto out;
3012		}
3013		for (i = 0; i < count; i++)
3014			nvlist_free(nvlv[i]);
3015		free(nvlv);
3016		nvlv = NULL;
3017		rctlcount++;
3018	}
3019	(void) zonecfg_endrctlent(handle);
3020
3021	if (rctlcount == 0) {
3022		error = 0;
3023		goto out;
3024	}
3025	if (nvlist_pack(nvl, &nvl_packed, &nvl_size, NV_ENCODE_NATIVE, 0)
3026	    != 0) {
3027		zerror(zlogp, B_FALSE, "%s failed", "nvlist_pack");
3028		goto out;
3029	}
3030
3031	error = 0;
3032	*bufp = nvl_packed;
3033	*bufsizep = nvl_size;
3034
3035out:
3036	free(rctlblk);
3037	zonecfg_free_rctl_value_list(rctltab.zone_rctl_valptr);
3038	if (error && nvl_packed != NULL)
3039		free(nvl_packed);
3040	if (nvl != NULL)
3041		nvlist_free(nvl);
3042	if (nvlv != NULL)
3043		free(nvlv);
3044	if (handle != NULL)
3045		zonecfg_fini_handle(handle);
3046	return (error);
3047}
3048
3049static int
3050get_datasets(zlog_t *zlogp, char **bufp, size_t *bufsizep)
3051{
3052	zone_dochandle_t handle;
3053	struct zone_dstab dstab;
3054	size_t total, offset, len;
3055	int error = -1;
3056	char *str;
3057
3058	*bufp = NULL;
3059	*bufsizep = 0;
3060
3061	if ((handle = zonecfg_init_handle()) == NULL) {
3062		zerror(zlogp, B_TRUE, "getting zone configuration handle");
3063		return (-1);
3064	}
3065	if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK) {
3066		zerror(zlogp, B_FALSE, "invalid configuration");
3067		zonecfg_fini_handle(handle);
3068		return (-1);
3069	}
3070
3071	if (zonecfg_setdsent(handle) != Z_OK) {
3072		zerror(zlogp, B_FALSE, "%s failed", "zonecfg_setdsent");
3073		goto out;
3074	}
3075
3076	total = 0;
3077	while (zonecfg_getdsent(handle, &dstab) == Z_OK)
3078		total += strlen(dstab.zone_dataset_name) + 1;
3079	(void) zonecfg_enddsent(handle);
3080
3081	if (total == 0) {
3082		error = 0;
3083		goto out;
3084	}
3085
3086	if ((str = malloc(total)) == NULL) {
3087		zerror(zlogp, B_TRUE, "memory allocation failed");
3088		goto out;
3089	}
3090
3091	if (zonecfg_setdsent(handle) != Z_OK) {
3092		zerror(zlogp, B_FALSE, "%s failed", "zonecfg_setdsent");
3093		goto out;
3094	}
3095	offset = 0;
3096	while (zonecfg_getdsent(handle, &dstab) == Z_OK) {
3097		len = strlen(dstab.zone_dataset_name);
3098		(void) strlcpy(str + offset, dstab.zone_dataset_name,
3099		    sizeof (dstab.zone_dataset_name) - offset);
3100		offset += len;
3101		if (offset != total - 1)
3102			str[offset++] = ',';
3103	}
3104	(void) zonecfg_enddsent(handle);
3105
3106	error = 0;
3107	*bufp = str;
3108	*bufsizep = total;
3109
3110out:
3111	if (error != 0 && str != NULL)
3112		free(str);
3113	if (handle != NULL)
3114		zonecfg_fini_handle(handle);
3115
3116	return (error);
3117}
3118
3119static int
3120validate_datasets(zlog_t *zlogp)
3121{
3122	zone_dochandle_t handle;
3123	struct zone_dstab dstab;
3124	zfs_handle_t *zhp;
3125	libzfs_handle_t *hdl;
3126
3127	if ((handle = zonecfg_init_handle()) == NULL) {
3128		zerror(zlogp, B_TRUE, "getting zone configuration handle");
3129		return (-1);
3130	}
3131	if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK) {
3132		zerror(zlogp, B_FALSE, "invalid configuration");
3133		zonecfg_fini_handle(handle);
3134		return (-1);
3135	}
3136
3137	if (zonecfg_setdsent(handle) != Z_OK) {
3138		zerror(zlogp, B_FALSE, "invalid configuration");
3139		zonecfg_fini_handle(handle);
3140		return (-1);
3141	}
3142
3143	if ((hdl = libzfs_init()) == NULL) {
3144		zerror(zlogp, B_FALSE, "opening ZFS library");
3145		zonecfg_fini_handle(handle);
3146		return (-1);
3147	}
3148
3149	while (zonecfg_getdsent(handle, &dstab) == Z_OK) {
3150
3151		if ((zhp = zfs_open(hdl, dstab.zone_dataset_name,
3152		    ZFS_TYPE_FILESYSTEM)) == NULL) {
3153			zerror(zlogp, B_FALSE, "cannot open ZFS dataset '%s'",
3154			    dstab.zone_dataset_name);
3155			zonecfg_fini_handle(handle);
3156			libzfs_fini(hdl);
3157			return (-1);
3158		}
3159
3160		/*
3161		 * Automatically set the 'zoned' property.  We check the value
3162		 * first because we'll get EPERM if it is already set.
3163		 */
3164		if (!zfs_prop_get_int(zhp, ZFS_PROP_ZONED) &&
3165		    zfs_prop_set(zhp, zfs_prop_to_name(ZFS_PROP_ZONED),
3166		    "on") != 0) {
3167			zerror(zlogp, B_FALSE, "cannot set 'zoned' "
3168			    "property for ZFS dataset '%s'\n",
3169			    dstab.zone_dataset_name);
3170			zonecfg_fini_handle(handle);
3171			zfs_close(zhp);
3172			libzfs_fini(hdl);
3173			return (-1);
3174		}
3175
3176		zfs_close(zhp);
3177	}
3178	(void) zonecfg_enddsent(handle);
3179
3180	zonecfg_fini_handle(handle);
3181	libzfs_fini(hdl);
3182
3183	return (0);
3184}
3185
3186/*
3187 * Mount lower level home directories into/from current zone
3188 * Share exported directories specified in dfstab for zone
3189 */
3190static int
3191tsol_mounts(zlog_t *zlogp, char *zone_name, char *rootpath)
3192{
3193	zoneid_t *zids = NULL;
3194	priv_set_t *zid_privs;
3195	const priv_impl_info_t *ip = NULL;
3196	uint_t nzents_saved;
3197	uint_t nzents;
3198	int i;
3199	char readonly[] = "ro";
3200	struct zone_fstab lower_fstab;
3201	char *argv[4];
3202
3203	if (!is_system_labeled())
3204		return (0);
3205
3206	if (zid_label == NULL) {
3207		zid_label = m_label_alloc(MAC_LABEL);
3208		if (zid_label == NULL)
3209			return (-1);
3210	}
3211
3212	/* Make sure our zone has an /export/home dir */
3213	(void) make_one_dir(zlogp, rootpath, "/export/home",
3214	    DEFAULT_DIR_MODE, DEFAULT_DIR_USER, DEFAULT_DIR_GROUP);
3215
3216	lower_fstab.zone_fs_raw[0] = '\0';
3217	(void) strlcpy(lower_fstab.zone_fs_type, MNTTYPE_LOFS,
3218	    sizeof (lower_fstab.zone_fs_type));
3219	lower_fstab.zone_fs_options = NULL;
3220	(void) zonecfg_add_fs_option(&lower_fstab, readonly);
3221
3222	/*
3223	 * Get the list of zones from the kernel
3224	 */
3225	if (zone_list(NULL, &nzents) != 0) {
3226		zerror(zlogp, B_TRUE, "unable to list zones");
3227		zonecfg_free_fs_option_list(lower_fstab.zone_fs_options);
3228		return (-1);
3229	}
3230again:
3231	if (nzents == 0) {
3232		zonecfg_free_fs_option_list(lower_fstab.zone_fs_options);
3233		return (-1);
3234	}
3235
3236	zids = malloc(nzents * sizeof (zoneid_t));
3237	if (zids == NULL) {
3238		zerror(zlogp, B_TRUE, "memory allocation failed");
3239		return (-1);
3240	}
3241	nzents_saved = nzents;
3242
3243	if (zone_list(zids, &nzents) != 0) {
3244		zerror(zlogp, B_TRUE, "unable to list zones");
3245		zonecfg_free_fs_option_list(lower_fstab.zone_fs_options);
3246		free(zids);
3247		return (-1);
3248	}
3249	if (nzents != nzents_saved) {
3250		/* list changed, try again */
3251		free(zids);
3252		goto again;
3253	}
3254
3255	ip = getprivimplinfo();
3256	if ((zid_privs = priv_allocset()) == NULL) {
3257		zerror(zlogp, B_TRUE, "%s failed", "priv_allocset");
3258		zonecfg_free_fs_option_list(
3259		    lower_fstab.zone_fs_options);
3260		free(zids);
3261		return (-1);
3262	}
3263
3264	for (i = 0; i < nzents; i++) {
3265		char zid_name[ZONENAME_MAX];
3266		zone_state_t zid_state;
3267		char zid_rpath[MAXPATHLEN];
3268		struct stat stat_buf;
3269
3270		if (zids[i] == GLOBAL_ZONEID)
3271			continue;
3272
3273		if (getzonenamebyid(zids[i], zid_name, ZONENAME_MAX) == -1)
3274			continue;
3275
3276		/*
3277		 * Do special setup for the zone we are booting
3278		 */
3279		if (strcmp(zid_name, zone_name) == 0) {
3280			struct zone_fstab autofs_fstab;
3281			char map_path[MAXPATHLEN];
3282			int fd;
3283
3284			/*
3285			 * Create auto_home_<zone> map for this zone
3286			 * in the global zone. The non-global zone entry
3287			 * will be created by automount when the zone
3288			 * is booted.
3289			 */
3290
3291			(void) snprintf(autofs_fstab.zone_fs_special,
3292			    MAXPATHLEN, "auto_home_%s", zid_name);
3293
3294			(void) snprintf(autofs_fstab.zone_fs_dir, MAXPATHLEN,
3295			    "/zone/%s/home", zid_name);
3296
3297			(void) snprintf(map_path, sizeof (map_path),
3298			    "/etc/%s", autofs_fstab.zone_fs_special);
3299			/*
3300			 * If the map file doesn't exist create a template
3301			 */
3302			if ((fd = open(map_path, O_RDWR | O_CREAT | O_EXCL,
3303			    S_IRUSR | S_IWUSR | S_IRGRP| S_IROTH)) != -1) {
3304				int len;
3305				char map_rec[MAXPATHLEN];
3306
3307				len = snprintf(map_rec, sizeof (map_rec),
3308				    "+%s\n*\t-fstype=lofs\t:%s/export/home/&\n",
3309				    autofs_fstab.zone_fs_special, rootpath);
3310				(void) write(fd, map_rec, len);
3311				(void) close(fd);
3312			}
3313
3314			/*
3315			 * Mount auto_home_<zone> in the global zone if absent.
3316			 * If it's already of type autofs, then
3317			 * don't mount it again.
3318			 */
3319			if ((stat(autofs_fstab.zone_fs_dir, &stat_buf) == -1) ||
3320			    strcmp(stat_buf.st_fstype, MNTTYPE_AUTOFS) != 0) {
3321				char optstr[] = "indirect,ignore,nobrowse";
3322
3323				(void) make_one_dir(zlogp, "",
3324				    autofs_fstab.zone_fs_dir, DEFAULT_DIR_MODE,
3325				    DEFAULT_DIR_USER, DEFAULT_DIR_GROUP);
3326
3327				/*
3328				 * Mount will fail if automounter has already
3329				 * processed the auto_home_<zonename> map
3330				 */
3331				(void) domount(zlogp, MNTTYPE_AUTOFS, optstr,
3332				    autofs_fstab.zone_fs_special,
3333				    autofs_fstab.zone_fs_dir);
3334			}
3335			continue;
3336		}
3337
3338
3339		if (zone_get_state(zid_name, &zid_state) != Z_OK ||
3340		    (zid_state != ZONE_STATE_READY &&
3341		    zid_state != ZONE_STATE_RUNNING))
3342			/* Skip over zones without mounted filesystems */
3343			continue;
3344
3345		if (zone_getattr(zids[i], ZONE_ATTR_SLBL, zid_label,
3346		    sizeof (m_label_t)) < 0)
3347			/* Skip over zones with unspecified label */
3348			continue;
3349
3350		if (zone_getattr(zids[i], ZONE_ATTR_ROOT, zid_rpath,
3351		    sizeof (zid_rpath)) == -1)
3352			/* Skip over zones with bad path */
3353			continue;
3354
3355		if (zone_getattr(zids[i], ZONE_ATTR_PRIVSET, zid_privs,
3356		    sizeof (priv_chunk_t) * ip->priv_setsize) == -1)
3357			/* Skip over zones with bad privs */
3358			continue;
3359
3360		/*
3361		 * Reading down is valid according to our label model
3362		 * but some customers want to disable it because it
3363		 * allows execute down and other possible attacks.
3364		 * Therefore, we restrict this feature to zones that
3365		 * have the NET_MAC_AWARE privilege which is required
3366		 * for NFS read-down semantics.
3367		 */
3368		if ((bldominates(zlabel, zid_label)) &&
3369		    (priv_ismember(zprivs, PRIV_NET_MAC_AWARE))) {
3370			/*
3371			 * Our zone dominates this one.
3372			 * Create a lofs mount from lower zone's /export/home
3373			 */
3374			(void) snprintf(lower_fstab.zone_fs_dir, MAXPATHLEN,
3375			    "%s/zone/%s/export/home", rootpath, zid_name);
3376
3377			/*
3378			 * If the target is already an LOFS mount
3379			 * then don't do it again.
3380			 */
3381			if ((stat(lower_fstab.zone_fs_dir, &stat_buf) == -1) ||
3382			    strcmp(stat_buf.st_fstype, MNTTYPE_LOFS) != 0) {
3383
3384				if (snprintf(lower_fstab.zone_fs_special,
3385				    MAXPATHLEN, "%s/export",
3386				    zid_rpath) > MAXPATHLEN)
3387					continue;
3388
3389				/*
3390				 * Make sure the lower-level home exists
3391				 */
3392				if (make_one_dir(zlogp,
3393				    lower_fstab.zone_fs_special, "/home",
3394				    DEFAULT_DIR_MODE, DEFAULT_DIR_USER,
3395				    DEFAULT_DIR_GROUP) != 0)
3396					continue;
3397
3398				(void) strlcat(lower_fstab.zone_fs_special,
3399				    "/home", MAXPATHLEN);
3400
3401				/*
3402				 * Mount can fail because the lower-level
3403				 * zone may have already done a mount up.
3404				 */
3405				(void) mount_one(zlogp, &lower_fstab, "");
3406			}
3407		} else if ((bldominates(zid_label, zlabel)) &&
3408		    (priv_ismember(zid_privs, PRIV_NET_MAC_AWARE))) {
3409			/*
3410			 * This zone dominates our zone.
3411			 * Create a lofs mount from our zone's /export/home
3412			 */
3413			if (snprintf(lower_fstab.zone_fs_dir, MAXPATHLEN,
3414			    "%s/zone/%s/export/home", zid_rpath,
3415			    zone_name) > MAXPATHLEN)
3416				continue;
3417
3418			/*
3419			 * If the target is already an LOFS mount
3420			 * then don't do it again.
3421			 */
3422			if ((stat(lower_fstab.zone_fs_dir, &stat_buf) == -1) ||
3423			    strcmp(stat_buf.st_fstype, MNTTYPE_LOFS) != 0) {
3424
3425				(void) snprintf(lower_fstab.zone_fs_special,
3426				    MAXPATHLEN, "%s/export/home", rootpath);
3427
3428				/*
3429				 * Mount can fail because the higher-level
3430				 * zone may have already done a mount down.
3431				 */
3432				(void) mount_one(zlogp, &lower_fstab, "");
3433			}
3434		}
3435	}
3436	zonecfg_free_fs_option_list(lower_fstab.zone_fs_options);
3437	priv_freeset(zid_privs);
3438	free(zids);
3439
3440	/*
3441	 * Now share any exported directories from this zone.
3442	 * Each zone can have its own dfstab.
3443	 */
3444
3445	argv[0] = "zoneshare";
3446	argv[1] = "-z";
3447	argv[2] = zone_name;
3448	argv[3] = NULL;
3449
3450	(void) forkexec(zlogp, "/usr/lib/zones/zoneshare", argv);
3451	/* Don't check for errors since they don't affect the zone */
3452
3453	return (0);
3454}
3455
3456/*
3457 * Unmount lofs mounts from higher level zones
3458 * Unshare nfs exported directories
3459 */
3460static void
3461tsol_unmounts(zlog_t *zlogp, char *zone_name)
3462{
3463	zoneid_t *zids = NULL;
3464	uint_t nzents_saved;
3465	uint_t nzents;
3466	int i;
3467	char *argv[4];
3468	char path[MAXPATHLEN];
3469
3470	if (!is_system_labeled())
3471		return;
3472
3473	/*
3474	 * Get the list of zones from the kernel
3475	 */
3476	if (zone_list(NULL, &nzents) != 0) {
3477		return;
3478	}
3479
3480	if (zid_label == NULL) {
3481		zid_label = m_label_alloc(MAC_LABEL);
3482		if (zid_label == NULL)
3483			return;
3484	}
3485
3486again:
3487	if (nzents == 0)
3488		return;
3489
3490	zids = malloc(nzents * sizeof (zoneid_t));
3491	if (zids == NULL) {
3492		zerror(zlogp, B_TRUE, "memory allocation failed");
3493		return;
3494	}
3495	nzents_saved = nzents;
3496
3497	if (zone_list(zids, &nzents) != 0) {
3498		free(zids);
3499		return;
3500	}
3501	if (nzents != nzents_saved) {
3502		/* list changed, try again */
3503		free(zids);
3504		goto again;
3505	}
3506
3507	for (i = 0; i < nzents; i++) {
3508		char zid_name[ZONENAME_MAX];
3509		zone_state_t zid_state;
3510		char zid_rpath[MAXPATHLEN];
3511
3512		if (zids[i] == GLOBAL_ZONEID)
3513			continue;
3514
3515		if (getzonenamebyid(zids[i], zid_name, ZONENAME_MAX) == -1)
3516			continue;
3517
3518		/*
3519		 * Skip the zone we are halting
3520		 */
3521		if (strcmp(zid_name, zone_name) == 0)
3522			continue;
3523
3524		if ((zone_getattr(zids[i], ZONE_ATTR_STATUS, &zid_state,
3525		    sizeof (zid_state)) < 0) ||
3526		    (zid_state < ZONE_IS_READY))
3527			/* Skip over zones without mounted filesystems */
3528			continue;
3529
3530		if (zone_getattr(zids[i], ZONE_ATTR_SLBL, zid_label,
3531		    sizeof (m_label_t)) < 0)
3532			/* Skip over zones with unspecified label */
3533			continue;
3534
3535		if (zone_getattr(zids[i], ZONE_ATTR_ROOT, zid_rpath,
3536		    sizeof (zid_rpath)) == -1)
3537			/* Skip over zones with bad path */
3538			continue;
3539
3540		if (zlabel != NULL && bldominates(zid_label, zlabel)) {
3541			/*
3542			 * This zone dominates our zone.
3543			 * Unmount the lofs mount of our zone's /export/home
3544			 */
3545
3546			if (snprintf(path, MAXPATHLEN,
3547			    "%s/zone/%s/export/home", zid_rpath,
3548			    zone_name) > MAXPATHLEN)
3549				continue;
3550
3551			/* Skip over mount failures */
3552			(void) umount(path);
3553		}
3554	}
3555	free(zids);
3556
3557	/*
3558	 * Unmount global zone autofs trigger for this zone
3559	 */
3560	(void) snprintf(path, MAXPATHLEN, "/zone/%s/home", zone_name);
3561	/* Skip over mount failures */
3562	(void) umount(path);
3563
3564	/*
3565	 * Next unshare any exported directories from this zone.
3566	 */
3567
3568	argv[0] = "zoneunshare";
3569	argv[1] = "-z";
3570	argv[2] = zone_name;
3571	argv[3] = NULL;
3572
3573	(void) forkexec(zlogp, "/usr/lib/zones/zoneunshare", argv);
3574	/* Don't check for errors since they don't affect the zone */
3575
3576	/*
3577	 * Finally, deallocate any devices in the zone.
3578	 */
3579
3580	argv[0] = "deallocate";
3581	argv[1] = "-Isz";
3582	argv[2] = zone_name;
3583	argv[3] = NULL;
3584
3585	(void) forkexec(zlogp, "/usr/sbin/deallocate", argv);
3586	/* Don't check for errors since they don't affect the zone */
3587}
3588
3589/*
3590 * Fetch the Trusted Extensions label and multi-level ports (MLPs) for
3591 * this zone.
3592 */
3593static tsol_zcent_t *
3594get_zone_label(zlog_t *zlogp, priv_set_t *privs)
3595{
3596	FILE *fp;
3597	tsol_zcent_t *zcent = NULL;
3598	char line[MAXTNZLEN];
3599
3600	if ((fp = fopen(TNZONECFG_PATH, "r")) == NULL) {
3601		zerror(zlogp, B_TRUE, "%s", TNZONECFG_PATH);
3602		return (NULL);
3603	}
3604
3605	while (fgets(line, sizeof (line), fp) != NULL) {
3606		/*
3607		 * Check for malformed database
3608		 */
3609		if (strlen(line) == MAXTNZLEN - 1)
3610			break;
3611		if ((zcent = tsol_sgetzcent(line, NULL, NULL)) == NULL)
3612			continue;
3613		if (strcmp(zcent->zc_name, zone_name) == 0)
3614			break;
3615		tsol_freezcent(zcent);
3616		zcent = NULL;
3617	}
3618	(void) fclose(fp);
3619
3620	if (zcent == NULL) {
3621		zerror(zlogp, B_FALSE, "zone requires a label assignment. "
3622		    "See tnzonecfg(4)");
3623	} else {
3624		if (zlabel == NULL)
3625			zlabel = m_label_alloc(MAC_LABEL);
3626		/*
3627		 * Save this zone's privileges for later read-down processing
3628		 */
3629		if ((zprivs = priv_allocset()) == NULL) {
3630			zerror(zlogp, B_TRUE, "%s failed", "priv_allocset");
3631			return (NULL);
3632		} else {
3633			priv_copyset(privs, zprivs);
3634		}
3635	}
3636	return (zcent);
3637}
3638
3639/*
3640 * Add the Trusted Extensions multi-level ports for this zone.
3641 */
3642static void
3643set_mlps(zlog_t *zlogp, zoneid_t zoneid, tsol_zcent_t *zcent)
3644{
3645	tsol_mlp_t *mlp;
3646	tsol_mlpent_t tsme;
3647
3648	if (!is_system_labeled())
3649		return;
3650
3651	tsme.tsme_zoneid = zoneid;
3652	tsme.tsme_flags = 0;
3653	for (mlp = zcent->zc_private_mlp; !TSOL_MLP_END(mlp); mlp++) {
3654		tsme.tsme_mlp = *mlp;
3655		if (tnmlp(TNDB_LOAD, &tsme) != 0) {
3656			zerror(zlogp, B_TRUE, "cannot set zone-specific MLP "
3657			    "on %d-%d/%d", mlp->mlp_port,
3658			    mlp->mlp_port_upper, mlp->mlp_ipp);
3659		}
3660	}
3661
3662	tsme.tsme_flags = TSOL_MEF_SHARED;
3663	for (mlp = zcent->zc_shared_mlp; !TSOL_MLP_END(mlp); mlp++) {
3664		tsme.tsme_mlp = *mlp;
3665		if (tnmlp(TNDB_LOAD, &tsme) != 0) {
3666			zerror(zlogp, B_TRUE, "cannot set shared MLP "
3667			    "on %d-%d/%d", mlp->mlp_port,
3668			    mlp->mlp_port_upper, mlp->mlp_ipp);
3669		}
3670	}
3671}
3672
3673static void
3674remove_mlps(zlog_t *zlogp, zoneid_t zoneid)
3675{
3676	tsol_mlpent_t tsme;
3677
3678	if (!is_system_labeled())
3679		return;
3680
3681	(void) memset(&tsme, 0, sizeof (tsme));
3682	tsme.tsme_zoneid = zoneid;
3683	if (tnmlp(TNDB_FLUSH, &tsme) != 0)
3684		zerror(zlogp, B_TRUE, "cannot flush MLPs");
3685}
3686
3687int
3688prtmount(const char *fs, void *x) {
3689	zerror((zlog_t *)x, B_FALSE, "  %s", fs);
3690	return (0);
3691}
3692
3693/*
3694 * Look for zones running on the main system that are using this root (or any
3695 * subdirectory of it).  Return B_TRUE and print an error if a conflicting zone
3696 * is found or if we can't tell.
3697 */
3698static boolean_t
3699duplicate_zone_root(zlog_t *zlogp, const char *rootpath)
3700{
3701	zoneid_t *zids = NULL;
3702	uint_t nzids = 0;
3703	boolean_t retv;
3704	int rlen, zlen;
3705	char zroot[MAXPATHLEN];
3706	char zonename[ZONENAME_MAX];
3707
3708	for (;;) {
3709		nzids += 10;
3710		zids = malloc(nzids * sizeof (*zids));
3711		if (zids == NULL) {
3712			zerror(zlogp, B_TRUE, "memory allocation failed");
3713			return (B_TRUE);
3714		}
3715		if (zone_list(zids, &nzids) == 0)
3716			break;
3717		free(zids);
3718	}
3719	retv = B_FALSE;
3720	rlen = strlen(rootpath);
3721	while (nzids > 0) {
3722		/*
3723		 * Ignore errors; they just mean that the zone has disappeared
3724		 * while we were busy.
3725		 */
3726		if (zone_getattr(zids[--nzids], ZONE_ATTR_ROOT, zroot,
3727		    sizeof (zroot)) == -1)
3728			continue;
3729		zlen = strlen(zroot);
3730		if (zlen > rlen)
3731			zlen = rlen;
3732		if (strncmp(rootpath, zroot, zlen) == 0 &&
3733		    (zroot[zlen] == '\0' || zroot[zlen] == '/') &&
3734		    (rootpath[zlen] == '\0' || rootpath[zlen] == '/')) {
3735			if (getzonenamebyid(zids[nzids], zonename,
3736			    sizeof (zonename)) == -1)
3737				(void) snprintf(zonename, sizeof (zonename),
3738				    "id %d", (int)zids[nzids]);
3739			zerror(zlogp, B_FALSE,
3740			    "zone root %s already in use by zone %s",
3741			    rootpath, zonename);
3742			retv = B_TRUE;
3743			break;
3744		}
3745	}
3746	free(zids);
3747	return (retv);
3748}
3749
3750/*
3751 * Search for loopback mounts that use this same source node (same device and
3752 * inode).  Return B_TRUE if there is one or if we can't tell.
3753 */
3754static boolean_t
3755duplicate_reachable_path(zlog_t *zlogp, const char *rootpath)
3756{
3757	struct stat64 rst, zst;
3758	struct mnttab *mnp;
3759
3760	if (stat64(rootpath, &rst) == -1) {
3761		zerror(zlogp, B_TRUE, "can't stat %s", rootpath);
3762		return (B_TRUE);
3763	}
3764	if (resolve_lofs_mnts == NULL && lofs_read_mnttab(zlogp) == -1)
3765		return (B_TRUE);
3766	for (mnp = resolve_lofs_mnts; mnp < resolve_lofs_mnt_max; mnp++) {
3767		if (mnp->mnt_fstype == NULL ||
3768		    strcmp(MNTTYPE_LOFS, mnp->mnt_fstype) != 0)
3769			continue;
3770		/* We're looking at a loopback mount.  Stat it. */
3771		if (mnp->mnt_special != NULL &&
3772		    stat64(mnp->mnt_special, &zst) != -1 &&
3773		    rst.st_dev == zst.st_dev && rst.st_ino == zst.st_ino) {
3774			zerror(zlogp, B_FALSE,
3775			    "zone root %s is reachable through %s",
3776			    rootpath, mnp->mnt_mountp);
3777			return (B_TRUE);
3778		}
3779	}
3780	return (B_FALSE);
3781}
3782
3783/*
3784 * Set memory cap and pool info for the zone's resource management
3785 * configuration.
3786 */
3787static int
3788setup_zone_rm(zlog_t *zlogp, char *zone_name, zoneid_t zoneid)
3789{
3790	int res;
3791	uint64_t tmp;
3792	struct zone_mcaptab mcap;
3793	char sched[MAXNAMELEN];
3794	zone_dochandle_t handle = NULL;
3795	char pool_err[128];
3796
3797	if ((handle = zonecfg_init_handle()) == NULL) {
3798		zerror(zlogp, B_TRUE, "getting zone configuration handle");
3799		return (Z_BAD_HANDLE);
3800	}
3801
3802	if ((res = zonecfg_get_snapshot_handle(zone_name, handle)) != Z_OK) {
3803		zerror(zlogp, B_FALSE, "invalid configuration");
3804		zonecfg_fini_handle(handle);
3805		return (res);
3806	}
3807
3808	/*
3809	 * If a memory cap is configured, set the cap in the kernel using
3810	 * zone_setattr() and make sure the rcapd SMF service is enabled.
3811	 */
3812	if (zonecfg_getmcapent(handle, &mcap) == Z_OK) {
3813		uint64_t num;
3814		char smf_err[128];
3815
3816		num = (uint64_t)strtoull(mcap.zone_physmem_cap, NULL, 10);
3817		if (zone_setattr(zoneid, ZONE_ATTR_PHYS_MCAP, &num, 0) == -1) {
3818			zerror(zlogp, B_TRUE, "could not set zone memory cap");
3819			zonecfg_fini_handle(handle);
3820			return (Z_INVAL);
3821		}
3822
3823		if (zonecfg_enable_rcapd(smf_err, sizeof (smf_err)) != Z_OK) {
3824			zerror(zlogp, B_FALSE, "enabling system/rcap service "
3825			    "failed: %s", smf_err);
3826			zonecfg_fini_handle(handle);
3827			return (Z_INVAL);
3828		}
3829	}
3830
3831	/* Get the scheduling class set in the zone configuration. */
3832	if (zonecfg_get_sched_class(handle, sched, sizeof (sched)) == Z_OK &&
3833	    strlen(sched) > 0) {
3834		if (zone_setattr(zoneid, ZONE_ATTR_SCHED_CLASS, sched,
3835		    strlen(sched)) == -1)
3836			zerror(zlogp, B_TRUE, "WARNING: unable to set the "
3837			    "default scheduling class");
3838
3839	} else if (zonecfg_get_aliased_rctl(handle, ALIAS_SHARES, &tmp)
3840	    == Z_OK) {
3841		/*
3842		 * If the zone has the zone.cpu-shares rctl set then we want to
3843		 * use the Fair Share Scheduler (FSS) for processes in the
3844		 * zone.  Check what scheduling class the zone would be running
3845		 * in by default so we can print a warning and modify the class
3846		 * if we wouldn't be using FSS.
3847		 */
3848		char class_name[PC_CLNMSZ];
3849
3850		if (zonecfg_get_dflt_sched_class(handle, class_name,
3851		    sizeof (class_name)) != Z_OK) {
3852			zerror(zlogp, B_FALSE, "WARNING: unable to determine "
3853			    "the zone's scheduling class");
3854
3855		} else if (strcmp("FSS", class_name) != 0) {
3856			zerror(zlogp, B_FALSE, "WARNING: The zone.cpu-shares "
3857			    "rctl is set but\nFSS is not the default "
3858			    "scheduling class for\nthis zone.  FSS will be "
3859			    "used for processes\nin the zone but to get the "
3860			    "full benefit of FSS,\nit should be the default "
3861			    "scheduling class.\nSee dispadmin(1M) for more "
3862			    "details.");
3863
3864			if (zone_setattr(zoneid, ZONE_ATTR_SCHED_CLASS, "FSS",
3865			    strlen("FSS")) == -1)
3866				zerror(zlogp, B_TRUE, "WARNING: unable to set "
3867				    "zone scheduling class to FSS");
3868		}
3869	}
3870
3871	/*
3872	 * The next few blocks of code attempt to set up temporary pools as
3873	 * well as persistent pools.  In all cases we call the functions
3874	 * unconditionally.  Within each funtion the code will check if the
3875	 * zone is actually configured for a temporary pool or persistent pool
3876	 * and just return if there is nothing to do.
3877	 *
3878	 * If we are rebooting we want to attempt to reuse any temporary pool
3879	 * that was previously set up.  zonecfg_bind_tmp_pool() will do the
3880	 * right thing in all cases (reuse or create) based on the current
3881	 * zonecfg.
3882	 */
3883	if ((res = zonecfg_bind_tmp_pool(handle, zoneid, pool_err,
3884	    sizeof (pool_err))) != Z_OK) {
3885		if (res == Z_POOL || res == Z_POOL_CREATE || res == Z_POOL_BIND)
3886			zerror(zlogp, B_FALSE, "%s: %s\ndedicated-cpu setting "
3887			    "cannot be instantiated", zonecfg_strerror(res),
3888			    pool_err);
3889		else
3890			zerror(zlogp, B_FALSE, "could not bind zone to "
3891			    "temporary pool: %s", zonecfg_strerror(res));
3892		zonecfg_fini_handle(handle);
3893		return (Z_POOL_BIND);
3894	}
3895
3896	/*
3897	 * Check if we need to warn about poold not being enabled.
3898	 */
3899	if (zonecfg_warn_poold(handle)) {
3900		zerror(zlogp, B_FALSE, "WARNING: A range of dedicated-cpus has "
3901		    "been specified\nbut the dynamic pool service is not "
3902		    "enabled.\nThe system will not dynamically adjust the\n"
3903		    "processor allocation within the specified range\n"
3904		    "until svc:/system/pools/dynamic is enabled.\n"
3905		    "See poold(1M).");
3906	}
3907
3908	/* The following is a warning, not an error. */
3909	if ((res = zonecfg_bind_pool(handle, zoneid, pool_err,
3910	    sizeof (pool_err))) != Z_OK) {
3911		if (res == Z_POOL_BIND)
3912			zerror(zlogp, B_FALSE, "WARNING: unable to bind to "
3913			    "pool '%s'; using default pool.", pool_err);
3914		else if (res == Z_POOL)
3915			zerror(zlogp, B_FALSE, "WARNING: %s: %s",
3916			    zonecfg_strerror(res), pool_err);
3917		else
3918			zerror(zlogp, B_FALSE, "WARNING: %s",
3919			    zonecfg_strerror(res));
3920	}
3921
3922	zonecfg_fini_handle(handle);
3923	return (Z_OK);
3924}
3925
3926zoneid_t
3927vplat_create(zlog_t *zlogp, boolean_t mount_cmd)
3928{
3929	zoneid_t rval = -1;
3930	priv_set_t *privs;
3931	char rootpath[MAXPATHLEN];
3932	char modname[MAXPATHLEN];
3933	struct brand_attr attr;
3934	brand_handle_t bh;
3935	char *rctlbuf = NULL;
3936	size_t rctlbufsz = 0;
3937	char *zfsbuf = NULL;
3938	size_t zfsbufsz = 0;
3939	zoneid_t zoneid = -1;
3940	int xerr;
3941	char *kzone;
3942	FILE *fp = NULL;
3943	tsol_zcent_t *zcent = NULL;
3944	int match = 0;
3945	int doi = 0;
3946	int flags;
3947	zone_iptype_t iptype;
3948
3949	if (zone_get_rootpath(zone_name, rootpath, sizeof (rootpath)) != Z_OK) {
3950		zerror(zlogp, B_TRUE, "unable to determine zone root");
3951		return (-1);
3952	}
3953	if (zonecfg_in_alt_root())
3954		resolve_lofs(zlogp, rootpath, sizeof (rootpath));
3955
3956	if (get_iptype(zlogp, &iptype) < 0) {
3957		zerror(zlogp, B_TRUE, "unable to determine ip-type");
3958		return (-1);
3959	}
3960	switch (iptype) {
3961	case ZS_SHARED:
3962		flags = 0;
3963		break;
3964	case ZS_EXCLUSIVE:
3965		flags = ZCF_NET_EXCL;
3966		break;
3967	}
3968
3969	if ((privs = priv_allocset()) == NULL) {
3970		zerror(zlogp, B_TRUE, "%s failed", "priv_allocset");
3971		return (-1);
3972	}
3973	priv_emptyset(privs);
3974	if (get_privset(zlogp, privs, mount_cmd) != 0)
3975		goto error;
3976
3977	if (!mount_cmd && get_rctls(zlogp, &rctlbuf, &rctlbufsz) != 0) {
3978		zerror(zlogp, B_FALSE, "Unable to get list of rctls");
3979		goto error;
3980	}
3981
3982	if (get_datasets(zlogp, &zfsbuf, &zfsbufsz) != 0) {
3983		zerror(zlogp, B_FALSE, "Unable to get list of ZFS datasets");
3984		goto error;
3985	}
3986
3987	if (!mount_cmd && is_system_labeled()) {
3988		zcent = get_zone_label(zlogp, privs);
3989		if (zcent != NULL) {
3990			match = zcent->zc_match;
3991			doi = zcent->zc_doi;
3992			*zlabel = zcent->zc_label;
3993		} else {
3994			goto error;
3995		}
3996	}
3997
3998	kzone = zone_name;
3999
4000	/*
4001	 * We must do this scan twice.  First, we look for zones running on the
4002	 * main system that are using this root (or any subdirectory of it).
4003	 * Next, we reduce to the shortest path and search for loopback mounts
4004	 * that use this same source node (same device and inode).
4005	 */
4006	if (duplicate_zone_root(zlogp, rootpath))
4007		goto error;
4008	if (duplicate_reachable_path(zlogp, rootpath))
4009		goto error;
4010
4011	if (mount_cmd) {
4012		assert(zone_isnative);
4013		root_to_lu(zlogp, rootpath, sizeof (rootpath), B_TRUE);
4014
4015		/*
4016		 * Forge up a special root for this zone.  When a zone is
4017		 * mounted, we can't let the zone have its own root because the
4018		 * tools that will be used in this "scratch zone" need access
4019		 * to both the zone's resources and the running machine's
4020		 * executables.
4021		 *
4022		 * Note that the mkdir here also catches read-only filesystems.
4023		 */
4024		if (mkdir(rootpath, 0755) != 0 && errno != EEXIST) {
4025			zerror(zlogp, B_TRUE, "cannot create %s", rootpath);
4026			goto error;
4027		}
4028		if (domount(zlogp, "tmpfs", "", "swap", rootpath) != 0)
4029			goto error;
4030	}
4031
4032	if (zonecfg_in_alt_root()) {
4033		/*
4034		 * If we are mounting up a zone in an alternate root partition,
4035		 * then we have some additional work to do before starting the
4036		 * zone.  First, resolve the root path down so that we're not
4037		 * fooled by duplicates.  Then forge up an internal name for
4038		 * the zone.
4039		 */
4040		if ((fp = zonecfg_open_scratch("", B_TRUE)) == NULL) {
4041			zerror(zlogp, B_TRUE, "cannot open mapfile");
4042			goto error;
4043		}
4044		if (zonecfg_lock_scratch(fp) != 0) {
4045			zerror(zlogp, B_TRUE, "cannot lock mapfile");
4046			goto error;
4047		}
4048		if (zonecfg_find_scratch(fp, zone_name, zonecfg_get_root(),
4049		    NULL, 0) == 0) {
4050			zerror(zlogp, B_FALSE, "scratch zone already running");
4051			goto error;
4052		}
4053		/* This is the preferred name */
4054		(void) snprintf(kernzone, sizeof (kernzone), "SUNWlu-%s",
4055		    zone_name);
4056		srandom(getpid());
4057		while (zonecfg_reverse_scratch(fp, kernzone, NULL, 0, NULL,
4058		    0) == 0) {
4059			/* This is just an arbitrary name; note "." usage */
4060			(void) snprintf(kernzone, sizeof (kernzone),
4061			    "SUNWlu.%08lX%08lX", random(), random());
4062		}
4063		kzone = kernzone;
4064	}
4065
4066	xerr = 0;
4067	if ((zoneid = zone_create(kzone, rootpath, privs, rctlbuf,
4068	    rctlbufsz, zfsbuf, zfsbufsz, &xerr, match, doi, zlabel,
4069	    flags)) == -1) {
4070		if (xerr == ZE_AREMOUNTS) {
4071			if (zonecfg_find_mounts(rootpath, NULL, NULL) < 1) {
4072				zerror(zlogp, B_FALSE,
4073				    "An unknown file-system is mounted on "
4074				    "a subdirectory of %s", rootpath);
4075			} else {
4076
4077				zerror(zlogp, B_FALSE,
4078				    "These file-systems are mounted on "
4079				    "subdirectories of %s:", rootpath);
4080				(void) zonecfg_find_mounts(rootpath,
4081				    prtmount, zlogp);
4082			}
4083		} else if (xerr == ZE_CHROOTED) {
4084			zerror(zlogp, B_FALSE, "%s: "
4085			    "cannot create a zone from a chrooted "
4086			    "environment", "zone_create");
4087		} else {
4088			zerror(zlogp, B_TRUE, "%s failed", "zone_create");
4089		}
4090		goto error;
4091	}
4092
4093	if (zonecfg_in_alt_root() &&
4094	    zonecfg_add_scratch(fp, zone_name, kernzone,
4095	    zonecfg_get_root()) == -1) {
4096		zerror(zlogp, B_TRUE, "cannot add mapfile entry");
4097		goto error;
4098	}
4099
4100	if ((zone_get_brand(zone_name, attr.ba_brandname,
4101	    MAXNAMELEN) != Z_OK) ||
4102	    (bh = brand_open(attr.ba_brandname)) == NULL) {
4103		zerror(zlogp, B_FALSE, "unable to determine brand name");
4104		return (-1);
4105	}
4106
4107	/*
4108	 * If this brand requires any kernel support, now is the time to
4109	 * get it loaded and initialized.
4110	 */
4111	if (brand_get_modname(bh, modname, MAXPATHLEN) < 0) {
4112		brand_close(bh);
4113		zerror(zlogp, B_FALSE, "unable to determine brand kernel "
4114		    "module");
4115		return (-1);
4116	}
4117	brand_close(bh);
4118
4119	if (strlen(modname) > 0) {
4120		(void) strlcpy(attr.ba_modname, modname, MAXPATHLEN);
4121		if (zone_setattr(zoneid, ZONE_ATTR_BRAND, &attr,
4122		    sizeof (attr) != 0)) {
4123			zerror(zlogp, B_TRUE, "could not set zone brand "
4124			    "attribute.");
4125			goto error;
4126		}
4127	}
4128
4129	/*
4130	 * The following actions are not performed when merely mounting a zone
4131	 * for administrative use.
4132	 */
4133	if (!mount_cmd) {
4134		if (setup_zone_rm(zlogp, zone_name, zoneid) != Z_OK) {
4135			(void) zone_shutdown(zoneid);
4136			goto error;
4137		}
4138
4139		set_mlps(zlogp, zoneid, zcent);
4140	}
4141
4142	rval = zoneid;
4143	zoneid = -1;
4144
4145error:
4146	if (zoneid != -1)
4147		(void) zone_destroy(zoneid);
4148	if (rctlbuf != NULL)
4149		free(rctlbuf);
4150	priv_freeset(privs);
4151	if (fp != NULL)
4152		zonecfg_close_scratch(fp);
4153	lofs_discard_mnttab();
4154	if (zcent != NULL)
4155		tsol_freezcent(zcent);
4156	return (rval);
4157}
4158
4159/*
4160 * Enter the zone and write a /etc/zones/index file there.  This allows
4161 * libzonecfg (and thus zoneadm) to report the UUID and potentially other zone
4162 * details from inside the zone.
4163 */
4164static void
4165write_index_file(zoneid_t zoneid)
4166{
4167	FILE *zef;
4168	FILE *zet;
4169	struct zoneent *zep;
4170	pid_t child;
4171	int tmpl_fd;
4172	ctid_t ct;
4173	int fd;
4174	char uuidstr[UUID_PRINTABLE_STRING_LENGTH];
4175
4176	/* Locate the zone entry in the global zone's index file */
4177	if ((zef = setzoneent()) == NULL)
4178		return;
4179	while ((zep = getzoneent_private(zef)) != NULL) {
4180		if (strcmp(zep->zone_name, zone_name) == 0)
4181			break;
4182		free(zep);
4183	}
4184	endzoneent(zef);
4185	if (zep == NULL)
4186		return;
4187
4188	if ((tmpl_fd = init_template()) == -1) {
4189		free(zep);
4190		return;
4191	}
4192
4193	if ((child = fork()) == -1) {
4194		(void) ct_tmpl_clear(tmpl_fd);
4195		(void) close(tmpl_fd);
4196		free(zep);
4197		return;
4198	}
4199
4200	/* parent waits for child to finish */
4201	if (child != 0) {
4202		free(zep);
4203		if (contract_latest(&ct) == -1)
4204			ct = -1;
4205		(void) ct_tmpl_clear(tmpl_fd);
4206		(void) close(tmpl_fd);
4207		(void) waitpid(child, NULL, 0);
4208		(void) contract_abandon_id(ct);
4209		return;
4210	}
4211
4212	/* child enters zone and sets up index file */
4213	(void) ct_tmpl_clear(tmpl_fd);
4214	if (zone_enter(zoneid) != -1) {
4215		(void) mkdir(ZONE_CONFIG_ROOT, ZONE_CONFIG_MODE);
4216		(void) chown(ZONE_CONFIG_ROOT, ZONE_CONFIG_UID,
4217		    ZONE_CONFIG_GID);
4218		fd = open(ZONE_INDEX_FILE, O_WRONLY|O_CREAT|O_TRUNC,
4219		    ZONE_INDEX_MODE);
4220		if (fd != -1 && (zet = fdopen(fd, "w")) != NULL) {
4221			(void) fchown(fd, ZONE_INDEX_UID, ZONE_INDEX_GID);
4222			if (uuid_is_null(zep->zone_uuid))
4223				uuidstr[0] = '\0';
4224			else
4225				uuid_unparse(zep->zone_uuid, uuidstr);
4226			(void) fprintf(zet, "%s:%s:/:%s\n", zep->zone_name,
4227			    zone_state_str(zep->zone_state),
4228			    uuidstr);
4229			(void) fclose(zet);
4230		}
4231	}
4232	_exit(0);
4233}
4234
4235int
4236vplat_bringup(zlog_t *zlogp, boolean_t mount_cmd, zoneid_t zoneid)
4237{
4238	char zonepath[MAXPATHLEN];
4239
4240	if (!mount_cmd && validate_datasets(zlogp) != 0) {
4241		lofs_discard_mnttab();
4242		return (-1);
4243	}
4244
4245	/*
4246	 * Before we try to mount filesystems we need to create the
4247	 * attribute backing store for /dev
4248	 */
4249	if (zone_get_zonepath(zone_name, zonepath, sizeof (zonepath)) != Z_OK) {
4250		lofs_discard_mnttab();
4251		return (-1);
4252	}
4253	resolve_lofs(zlogp, zonepath, sizeof (zonepath));
4254
4255	/* Make /dev directory owned by root, grouped sys */
4256	if (make_one_dir(zlogp, zonepath, "/dev", DEFAULT_DIR_MODE,
4257	    0, 3) != 0) {
4258		lofs_discard_mnttab();
4259		return (-1);
4260	}
4261
4262	if (mount_filesystems(zlogp, mount_cmd) != 0) {
4263		lofs_discard_mnttab();
4264		return (-1);
4265	}
4266
4267	if (!mount_cmd) {
4268		zone_iptype_t iptype;
4269
4270		if (get_iptype(zlogp, &iptype) < 0) {
4271			zerror(zlogp, B_TRUE, "unable to determine ip-type");
4272			lofs_discard_mnttab();
4273			return (-1);
4274		}
4275
4276		switch (iptype) {
4277		case ZS_SHARED:
4278			/* Always do this to make lo0 get configured */
4279			if (configure_shared_network_interfaces(zlogp) != 0) {
4280				lofs_discard_mnttab();
4281				return (-1);
4282			}
4283			break;
4284		case ZS_EXCLUSIVE:
4285			if (configure_exclusive_network_interfaces(zlogp) !=
4286			    0) {
4287				lofs_discard_mnttab();
4288				return (-1);
4289			}
4290			break;
4291		}
4292	}
4293
4294	write_index_file(zoneid);
4295
4296	lofs_discard_mnttab();
4297	return (0);
4298}
4299
4300static int
4301lu_root_teardown(zlog_t *zlogp)
4302{
4303	char zroot[MAXPATHLEN];
4304
4305	assert(zone_isnative);
4306
4307	if (zone_get_rootpath(zone_name, zroot, sizeof (zroot)) != Z_OK) {
4308		zerror(zlogp, B_FALSE, "unable to determine zone root");
4309		return (-1);
4310	}
4311	root_to_lu(zlogp, zroot, sizeof (zroot), B_FALSE);
4312
4313	/*
4314	 * At this point, the processes are gone, the filesystems (save the
4315	 * root) are unmounted, and the zone is on death row.  But there may
4316	 * still be creds floating about in the system that reference the
4317	 * zone_t, and which pin down zone_rootvp causing this call to fail
4318	 * with EBUSY.  Thus, we try for a little while before just giving up.
4319	 * (How I wish this were not true, and umount2 just did the right
4320	 * thing, or tmpfs supported MS_FORCE This is a gross hack.)
4321	 */
4322	if (umount2(zroot, MS_FORCE) != 0) {
4323		if (errno == ENOTSUP && umount2(zroot, 0) == 0)
4324			goto unmounted;
4325		if (errno == EBUSY) {
4326			int tries = 10;
4327
4328			while (--tries >= 0) {
4329				(void) sleep(1);
4330				if (umount2(zroot, 0) == 0)
4331					goto unmounted;
4332				if (errno != EBUSY)
4333					break;
4334			}
4335		}
4336		zerror(zlogp, B_TRUE, "unable to unmount '%s'", zroot);
4337		return (-1);
4338	}
4339unmounted:
4340
4341	/*
4342	 * Only zones in an alternate root environment have scratch zone
4343	 * entries.
4344	 */
4345	if (zonecfg_in_alt_root()) {
4346		FILE *fp;
4347		int retv;
4348
4349		if ((fp = zonecfg_open_scratch("", B_FALSE)) == NULL) {
4350			zerror(zlogp, B_TRUE, "cannot open mapfile");
4351			return (-1);
4352		}
4353		retv = -1;
4354		if (zonecfg_lock_scratch(fp) != 0)
4355			zerror(zlogp, B_TRUE, "cannot lock mapfile");
4356		else if (zonecfg_delete_scratch(fp, kernzone) != 0)
4357			zerror(zlogp, B_TRUE, "cannot delete map entry");
4358		else
4359			retv = 0;
4360		zonecfg_close_scratch(fp);
4361		return (retv);
4362	} else {
4363		return (0);
4364	}
4365}
4366
4367int
4368vplat_teardown(zlog_t *zlogp, boolean_t unmount_cmd, boolean_t rebooting)
4369{
4370	char *kzone;
4371	zoneid_t zoneid;
4372	int res;
4373	char pool_err[128];
4374	char zroot[MAXPATHLEN];
4375	char cmdbuf[MAXPATHLEN];
4376	char brand[MAXNAMELEN];
4377	brand_handle_t bh = NULL;
4378	ushort_t flags;
4379
4380	kzone = zone_name;
4381	if (zonecfg_in_alt_root()) {
4382		FILE *fp;
4383
4384		if ((fp = zonecfg_open_scratch("", B_FALSE)) == NULL) {
4385			zerror(zlogp, B_TRUE, "unable to open map file");
4386			goto error;
4387		}
4388		if (zonecfg_find_scratch(fp, zone_name, zonecfg_get_root(),
4389		    kernzone, sizeof (kernzone)) != 0) {
4390			zerror(zlogp, B_FALSE, "unable to find scratch zone");
4391			zonecfg_close_scratch(fp);
4392			goto error;
4393		}
4394		zonecfg_close_scratch(fp);
4395		kzone = kernzone;
4396	}
4397
4398	if ((zoneid = getzoneidbyname(kzone)) == ZONE_ID_UNDEFINED) {
4399		if (!bringup_failure_recovery)
4400			zerror(zlogp, B_TRUE, "unable to get zoneid");
4401		if (unmount_cmd)
4402			(void) lu_root_teardown(zlogp);
4403		goto error;
4404	}
4405
4406	if (zone_shutdown(zoneid) != 0) {
4407		zerror(zlogp, B_TRUE, "unable to shutdown zone");
4408		goto error;
4409	}
4410
4411	/* Get the path to the root of this zone */
4412	if (zone_get_zonepath(zone_name, zroot, sizeof (zroot)) != Z_OK) {
4413		zerror(zlogp, B_FALSE, "unable to determine zone root");
4414		goto error;
4415	}
4416
4417	/* Get a handle to the brand info for this zone */
4418	if ((zone_get_brand(zone_name, brand, sizeof (brand)) != Z_OK) ||
4419	    (bh = brand_open(brand)) == NULL) {
4420		zerror(zlogp, B_FALSE, "unable to determine zone brand");
4421		return (-1);
4422	}
4423	/*
4424	 * If there is a brand 'halt' callback, execute it now to give the
4425	 * brand a chance to cleanup any custom configuration.
4426	 */
4427	(void) strcpy(cmdbuf, EXEC_PREFIX);
4428	if (brand_get_halt(bh, zone_name, zroot, cmdbuf + EXEC_LEN,
4429	    sizeof (cmdbuf) - EXEC_LEN, 0, NULL) < 0) {
4430		brand_close(bh);
4431		zerror(zlogp, B_FALSE, "unable to determine branded zone's "
4432		    "halt callback.");
4433		goto error;
4434	}
4435	brand_close(bh);
4436
4437	if ((strlen(cmdbuf) > EXEC_LEN) &&
4438	    (do_subproc(zlogp, cmdbuf) != Z_OK)) {
4439		zerror(zlogp, B_FALSE, "%s failed", cmdbuf);
4440		goto error;
4441	}
4442
4443	if (!unmount_cmd) {
4444		zone_iptype_t iptype;
4445
4446		if (zone_getattr(zoneid, ZONE_ATTR_FLAGS, &flags,
4447		    sizeof (flags)) < 0) {
4448			if (get_iptype(zlogp, &iptype) < 0) {
4449				zerror(zlogp, B_TRUE, "unable to determine "
4450				    "ip-type");
4451				goto error;
4452			}
4453		} else {
4454			if (flags & ZF_NET_EXCL)
4455				iptype = ZS_EXCLUSIVE;
4456			else
4457				iptype = ZS_SHARED;
4458		}
4459
4460		switch (iptype) {
4461		case ZS_SHARED:
4462			if (unconfigure_shared_network_interfaces(zlogp,
4463			    zoneid) != 0) {
4464				zerror(zlogp, B_FALSE, "unable to unconfigure "
4465				    "network interfaces in zone");
4466				goto error;
4467			}
4468			break;
4469		case ZS_EXCLUSIVE:
4470			if (unconfigure_exclusive_network_interfaces(zlogp,
4471			    zoneid) != 0) {
4472				zerror(zlogp, B_FALSE, "unable to unconfigure "
4473				    "network interfaces in zone");
4474				goto error;
4475			}
4476			break;
4477		}
4478	}
4479
4480	if (!unmount_cmd && tcp_abort_connections(zlogp, zoneid) != 0) {
4481		zerror(zlogp, B_TRUE, "unable to abort TCP connections");
4482		goto error;
4483	}
4484
4485	/* destroy zconsole before umount /dev */
4486	if (!unmount_cmd)
4487		destroy_console_slave();
4488
4489	if (unmount_filesystems(zlogp, zoneid, unmount_cmd) != 0) {
4490		zerror(zlogp, B_FALSE,
4491		    "unable to unmount file systems in zone");
4492		goto error;
4493	}
4494
4495	/*
4496	 * If we are rebooting then we normally don't want to destroy an
4497	 * existing temporary pool at this point so that we can just reuse it
4498	 * when the zone boots back up.  However, it is also possible we were
4499	 * running with a temporary pool and the zone configuration has been
4500	 * modified to no longer use a temporary pool.  In that case we need
4501	 * to destroy the temporary pool now.  This case looks like the case
4502	 * where we never had a temporary pool configured but
4503	 * zonecfg_destroy_tmp_pool will do the right thing either way.
4504	 */
4505	if (!unmount_cmd) {
4506		boolean_t destroy_tmp_pool = B_TRUE;
4507
4508		if (rebooting) {
4509			struct zone_psettab pset_tab;
4510			zone_dochandle_t handle;
4511
4512			if ((handle = zonecfg_init_handle()) != NULL &&
4513			    zonecfg_get_handle(zone_name, handle) == Z_OK &&
4514			    zonecfg_lookup_pset(handle, &pset_tab) == Z_OK)
4515				destroy_tmp_pool = B_FALSE;
4516
4517			zonecfg_fini_handle(handle);
4518		}
4519
4520		if (destroy_tmp_pool) {
4521			if ((res = zonecfg_destroy_tmp_pool(zone_name, pool_err,
4522			    sizeof (pool_err))) != Z_OK) {
4523				if (res == Z_POOL)
4524					zerror(zlogp, B_FALSE, pool_err);
4525			}
4526		}
4527	}
4528
4529	remove_mlps(zlogp, zoneid);
4530
4531	if (zone_destroy(zoneid) != 0) {
4532		zerror(zlogp, B_TRUE, "unable to destroy zone");
4533		goto error;
4534	}
4535
4536	/*
4537	 * Special teardown for alternate boot environments: remove the tmpfs
4538	 * root for the zone and then remove it from the map file.
4539	 */
4540	if (unmount_cmd && lu_root_teardown(zlogp) != 0)
4541		goto error;
4542
4543	lofs_discard_mnttab();
4544	return (0);
4545
4546error:
4547	lofs_discard_mnttab();
4548	return (-1);
4549}
4550