1168404Spjd/*
2168404Spjd * CDDL HEADER START
3168404Spjd *
4168404Spjd * The contents of this file are subject to the terms of the
5168404Spjd * Common Development and Distribution License (the "License").
6168404Spjd * You may not use this file except in compliance with the License.
7168404Spjd *
8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9168404Spjd * or http://www.opensolaris.org/os/licensing.
10168404Spjd * See the License for the specific language governing permissions
11168404Spjd * and limitations under the License.
12168404Spjd *
13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each
14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15168404Spjd * If applicable, add the following below this CDDL HEADER, with the
16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying
17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner]
18168404Spjd *
19168404Spjd * CDDL HEADER END
20168404Spjd */
21168404Spjd
22168404Spjd/*
23219089Spjd * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24307121Smav * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
25297119Smav * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>.
26168404Spjd */
27168404Spjd
28168404Spjd/*
29168404Spjd * Functions to convert between a list of vdevs and an nvlist representing the
30168404Spjd * configuration.  Each entry in the list can be one of:
31168404Spjd *
32168404Spjd * 	Device vdevs
33168404Spjd * 		disk=(path=..., devid=...)
34168404Spjd * 		file=(path=...)
35168404Spjd *
36168404Spjd * 	Group vdevs
37168404Spjd * 		raidz[1|2]=(...)
38168404Spjd * 		mirror=(...)
39168404Spjd *
40168404Spjd * 	Hot spares
41168404Spjd *
42168404Spjd * While the underlying implementation supports it, group vdevs cannot contain
43168404Spjd * other group vdevs.  All userland verification of devices is contained within
44168404Spjd * this file.  If successful, the nvlist returned can be passed directly to the
45168404Spjd * kernel; we've done as much verification as possible in userland.
46168404Spjd *
47168404Spjd * Hot spares are a special case, and passed down as an array of disk vdevs, at
48168404Spjd * the same level as the root of the vdev tree.
49168404Spjd *
50185029Spjd * The only function exported by this file is 'make_root_vdev'.  The
51185029Spjd * function performs several passes:
52168404Spjd *
53168404Spjd * 	1. Construct the vdev specification.  Performs syntax validation and
54168404Spjd *         makes sure each device is valid.
55168404Spjd * 	2. Check for devices in use.  Using libdiskmgt, makes sure that no
56168404Spjd *         devices are also in use.  Some can be overridden using the 'force'
57168404Spjd *         flag, others cannot.
 * 	3. Check for replication errors if the 'force' flag is not specified.
 *         Validates that the replication level is consistent across the
 *         entire pool.
61185029Spjd * 	4. Call libzfs to label any whole disks with an EFI label.
62168404Spjd */
63168404Spjd
64168404Spjd#include <assert.h>
65168404Spjd#include <devid.h>
66168404Spjd#include <errno.h>
67168404Spjd#include <fcntl.h>
68168404Spjd#include <libintl.h>
69168404Spjd#include <libnvpair.h>
70219089Spjd#include <limits.h>
71168404Spjd#include <stdio.h>
72168404Spjd#include <string.h>
73168404Spjd#include <unistd.h>
74168404Spjd#include <paths.h>
75168404Spjd#include <sys/stat.h>
76168404Spjd#include <sys/disk.h>
77168404Spjd#include <sys/mntent.h>
78168404Spjd#include <libgeom.h>
79168404Spjd
80168404Spjd#include "zpool_util.h"
81168404Spjd
82219089Spjd#define	BACKUP_SLICE	"s2"
83219089Spjd
84168404Spjd/*
85168404Spjd * For any given vdev specification, we can have multiple errors.  The
86168404Spjd * vdev_error() function keeps track of whether we have seen an error yet, and
87168404Spjd * prints out a header if its the first error we've seen.
88168404Spjd */
89168404Spjdboolean_t error_seen;
90168404Spjdboolean_t is_force;
91168404Spjd
92168404Spjd/*PRINTFLIKE1*/
93168404Spjdstatic void
94168404Spjdvdev_error(const char *fmt, ...)
95168404Spjd{
96168404Spjd	va_list ap;
97168404Spjd
98168404Spjd	if (!error_seen) {
99168404Spjd		(void) fprintf(stderr, gettext("invalid vdev specification\n"));
100168404Spjd		if (!is_force)
101168404Spjd			(void) fprintf(stderr, gettext("use '-f' to override "
102168404Spjd			    "the following errors:\n"));
103168404Spjd		else
104168404Spjd			(void) fprintf(stderr, gettext("the following errors "
105168404Spjd			    "must be manually repaired:\n"));
106168404Spjd		error_seen = B_TRUE;
107168404Spjd	}
108168404Spjd
109168404Spjd	va_start(ap, fmt);
110168404Spjd	(void) vfprintf(stderr, fmt, ap);
111168404Spjd	va_end(ap);
112168404Spjd}
113168404Spjd
114297077Smav#ifdef illumos
/*
 * Print a warning to stderr for a failed libdiskmgt in-use check.
 *
 * error: errno-style code handed back by libdiskmgt.
 */
static void
libdiskmgt_error(int error)
{
	/*
	 * ENXIO/ENODEV is what we get for a device that doesn't live in
	 * /dev/dsk; that is expected and not worth a warning.
	 */
	if (error != ENXIO && error != ENODEV) {
		(void) fprintf(stderr, gettext("warning: device in use "
		    "checking failed: %s\n"), strerror(error));
	}
}
128219089Spjd
129168404Spjd/*
130219089Spjd * Validate a device, passing the bulk of the work off to libdiskmgt.
131219089Spjd */
132219089Spjdstatic int
133219089Spjdcheck_slice(const char *path, int force, boolean_t wholedisk, boolean_t isspare)
134219089Spjd{
135219089Spjd	char *msg;
136219089Spjd	int error = 0;
137219089Spjd	dm_who_type_t who;
138219089Spjd
139219089Spjd	if (force)
140219089Spjd		who = DM_WHO_ZPOOL_FORCE;
141219089Spjd	else if (isspare)
142219089Spjd		who = DM_WHO_ZPOOL_SPARE;
143219089Spjd	else
144219089Spjd		who = DM_WHO_ZPOOL;
145219089Spjd
146219089Spjd	if (dm_inuse((char *)path, &msg, who, &error) || error) {
147219089Spjd		if (error != 0) {
148219089Spjd			libdiskmgt_error(error);
149219089Spjd			return (0);
150219089Spjd		} else {
151219089Spjd			vdev_error("%s", msg);
152219089Spjd			free(msg);
153219089Spjd			return (-1);
154219089Spjd		}
155219089Spjd	}
156219089Spjd
157219089Spjd	/*
158219089Spjd	 * If we're given a whole disk, ignore overlapping slices since we're
159219089Spjd	 * about to label it anyway.
160219089Spjd	 */
161219089Spjd	error = 0;
162219089Spjd	if (!wholedisk && !force &&
163219089Spjd	    (dm_isoverlapping((char *)path, &msg, &error) || error)) {
164219089Spjd		if (error == 0) {
165219089Spjd			/* dm_isoverlapping returned -1 */
166219089Spjd			vdev_error(gettext("%s overlaps with %s\n"), path, msg);
167219089Spjd			free(msg);
168219089Spjd			return (-1);
169219089Spjd		} else if (error != ENODEV) {
170219089Spjd			/* libdiskmgt's devcache only handles physical drives */
171219089Spjd			libdiskmgt_error(error);
172219089Spjd			return (0);
173219089Spjd		}
174219089Spjd	}
175219089Spjd
176219089Spjd	return (0);
177219089Spjd}
178219089Spjd
179219089Spjd
180219089Spjd/*
181219089Spjd * Validate a whole disk.  Iterate over all slices on the disk and make sure
182219089Spjd * that none is in use by calling check_slice().
183219089Spjd */
184219089Spjdstatic int
185219089Spjdcheck_disk(const char *name, dm_descriptor_t disk, int force, int isspare)
186219089Spjd{
187219089Spjd	dm_descriptor_t *drive, *media, *slice;
188219089Spjd	int err = 0;
189219089Spjd	int i;
190219089Spjd	int ret;
191219089Spjd
192219089Spjd	/*
193219089Spjd	 * Get the drive associated with this disk.  This should never fail,
194219089Spjd	 * because we already have an alias handle open for the device.
195219089Spjd	 */
196219089Spjd	if ((drive = dm_get_associated_descriptors(disk, DM_DRIVE,
197219089Spjd	    &err)) == NULL || *drive == NULL) {
198219089Spjd		if (err)
199219089Spjd			libdiskmgt_error(err);
200219089Spjd		return (0);
201219089Spjd	}
202219089Spjd
203219089Spjd	if ((media = dm_get_associated_descriptors(*drive, DM_MEDIA,
204219089Spjd	    &err)) == NULL) {
205219089Spjd		dm_free_descriptors(drive);
206219089Spjd		if (err)
207219089Spjd			libdiskmgt_error(err);
208219089Spjd		return (0);
209219089Spjd	}
210219089Spjd
211219089Spjd	dm_free_descriptors(drive);
212219089Spjd
213219089Spjd	/*
214219089Spjd	 * It is possible that the user has specified a removable media drive,
215219089Spjd	 * and the media is not present.
216219089Spjd	 */
217219089Spjd	if (*media == NULL) {
218219089Spjd		dm_free_descriptors(media);
219219089Spjd		vdev_error(gettext("'%s' has no media in drive\n"), name);
220219089Spjd		return (-1);
221219089Spjd	}
222219089Spjd
223219089Spjd	if ((slice = dm_get_associated_descriptors(*media, DM_SLICE,
224219089Spjd	    &err)) == NULL) {
225219089Spjd		dm_free_descriptors(media);
226219089Spjd		if (err)
227219089Spjd			libdiskmgt_error(err);
228219089Spjd		return (0);
229219089Spjd	}
230219089Spjd
231219089Spjd	dm_free_descriptors(media);
232219089Spjd
233219089Spjd	ret = 0;
234219089Spjd
235219089Spjd	/*
236219089Spjd	 * Iterate over all slices and report any errors.  We don't care about
237219089Spjd	 * overlapping slices because we are using the whole disk.
238219089Spjd	 */
239219089Spjd	for (i = 0; slice[i] != NULL; i++) {
240219089Spjd		char *name = dm_get_name(slice[i], &err);
241219089Spjd
242219089Spjd		if (check_slice(name, force, B_TRUE, isspare) != 0)
243219089Spjd			ret = -1;
244219089Spjd
245219089Spjd		dm_free_name(name);
246219089Spjd	}
247219089Spjd
248219089Spjd	dm_free_descriptors(slice);
249219089Spjd	return (ret);
250219089Spjd}
251219089Spjd
252219089Spjd/*
253219089Spjd * Validate a device.
254219089Spjd */
255219089Spjdstatic int
256219089Spjdcheck_device(const char *path, boolean_t force, boolean_t isspare)
257219089Spjd{
258219089Spjd	dm_descriptor_t desc;
259219089Spjd	int err;
260219089Spjd	char *dev;
261219089Spjd
262219089Spjd	/*
263219089Spjd	 * For whole disks, libdiskmgt does not include the leading dev path.
264219089Spjd	 */
265219089Spjd	dev = strrchr(path, '/');
266219089Spjd	assert(dev != NULL);
267219089Spjd	dev++;
268219089Spjd	if ((desc = dm_get_descriptor_by_name(DM_ALIAS, dev, &err)) != NULL) {
269219089Spjd		err = check_disk(path, desc, force, isspare);
270219089Spjd		dm_free_descriptor(desc);
271219089Spjd		return (err);
272219089Spjd	}
273219089Spjd
274219089Spjd	return (check_slice(path, force, B_FALSE, isspare));
275219089Spjd}
276297077Smav#endif	/* illumos */
277219089Spjd
278219089Spjd/*
279185029Spjd * Check that a file is valid.  All we can do in this case is check that it's
280185029Spjd * not in use by another pool, and not in use by swap.
281168404Spjd */
282168404Spjdstatic int
283185029Spjdcheck_file(const char *file, boolean_t force, boolean_t isspare)
284168404Spjd{
285185029Spjd	char  *name;
286185029Spjd	int fd;
287185029Spjd	int ret = 0;
288185029Spjd	int err;
289185029Spjd	pool_state_t state;
290185029Spjd	boolean_t inuse;
291168404Spjd
292297077Smav#ifdef illumos
293185029Spjd	if (dm_inuse_swap(file, &err)) {
294185029Spjd		if (err)
295185029Spjd			libdiskmgt_error(err);
296185029Spjd		else
297185029Spjd			vdev_error(gettext("%s is currently used by swap. "
298185029Spjd			    "Please see swap(1M).\n"), file);
299185029Spjd		return (-1);
300185029Spjd	}
301185029Spjd#endif
302168404Spjd
303185029Spjd	if ((fd = open(file, O_RDONLY)) < 0)
304185029Spjd		return (0);
305168404Spjd
306185029Spjd	if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) == 0 && inuse) {
307185029Spjd		const char *desc;
308168404Spjd
309185029Spjd		switch (state) {
310185029Spjd		case POOL_STATE_ACTIVE:
311185029Spjd			desc = gettext("active");
312185029Spjd			break;
313185029Spjd
314185029Spjd		case POOL_STATE_EXPORTED:
315185029Spjd			desc = gettext("exported");
316185029Spjd			break;
317185029Spjd
318185029Spjd		case POOL_STATE_POTENTIALLY_ACTIVE:
319185029Spjd			desc = gettext("potentially active");
320185029Spjd			break;
321185029Spjd
322185029Spjd		default:
323185029Spjd			desc = gettext("unknown");
324185029Spjd			break;
325185029Spjd		}
326185029Spjd
327185029Spjd		/*
328185029Spjd		 * Allow hot spares to be shared between pools.
329185029Spjd		 */
330185029Spjd		if (state == POOL_STATE_SPARE && isspare)
331185029Spjd			return (0);
332185029Spjd
333185029Spjd		if (state == POOL_STATE_ACTIVE ||
334185029Spjd		    state == POOL_STATE_SPARE || !force) {
335185029Spjd			switch (state) {
336185029Spjd			case POOL_STATE_SPARE:
337185029Spjd				vdev_error(gettext("%s is reserved as a hot "
338185029Spjd				    "spare for pool %s\n"), file, name);
339185029Spjd				break;
340185029Spjd			default:
341185029Spjd				vdev_error(gettext("%s is part of %s pool "
342185029Spjd				    "'%s'\n"), file, desc, name);
343185029Spjd				break;
344168404Spjd			}
345185029Spjd			ret = -1;
346168404Spjd		}
347185029Spjd
348185029Spjd		free(name);
349168404Spjd	}
350168404Spjd
351185029Spjd	(void) close(fd);
352185029Spjd	return (ret);
353168404Spjd}
354168404Spjd
355185029Spjdstatic int
356219089Spjdcheck_device(const char *name, boolean_t force, boolean_t isspare)
357185029Spjd{
358185029Spjd	char path[MAXPATHLEN];
359185029Spjd
360185029Spjd	if (strncmp(name, _PATH_DEV, sizeof(_PATH_DEV) - 1) != 0)
361185029Spjd		snprintf(path, sizeof(path), "%s%s", _PATH_DEV, name);
362185029Spjd	else
363185029Spjd		strlcpy(path, name, sizeof(path));
364185029Spjd
365185029Spjd	return (check_file(path, force, isspare));
366185029Spjd}
367185029Spjd
368185029Spjd/*
369185029Spjd * By "whole disk" we mean an entire physical disk (something we can
370185029Spjd * label, toggle the write cache on, etc.) as opposed to the full
371185029Spjd * capacity of a pseudo-device such as lofi or did.  We act as if we
372185029Spjd * are labeling the disk, which should be a pretty good test of whether
373185029Spjd * it's a viable device or not.  Returns B_TRUE if it is and B_FALSE if
374185029Spjd * it isn't.
375185029Spjd */
376168404Spjdstatic boolean_t
377219089Spjdis_whole_disk(const char *arg)
378168404Spjd{
379297077Smav#ifdef illumos
380219089Spjd	struct dk_gpt *label;
381219089Spjd	int	fd;
382219089Spjd	char	path[MAXPATHLEN];
383219089Spjd
384219089Spjd	(void) snprintf(path, sizeof (path), "%s%s%s",
385299430Smav	    ZFS_RDISK_ROOT, strrchr(arg, '/'), BACKUP_SLICE);
386219089Spjd	if ((fd = open(path, O_RDWR | O_NDELAY)) < 0)
387219089Spjd		return (B_FALSE);
388219089Spjd	if (efi_alloc_and_init(fd, EFI_NUMPAR, &label) != 0) {
389219089Spjd		(void) close(fd);
390219089Spjd		return (B_FALSE);
391219089Spjd	}
392219089Spjd	efi_free(label);
393219089Spjd	(void) close(fd);
394219089Spjd	return (B_TRUE);
395219089Spjd#else
396168404Spjd	int fd;
397168404Spjd
398219089Spjd	fd = g_open(arg, 0);
399169303Spjd	if (fd >= 0) {
400169303Spjd		g_close(fd);
401169303Spjd		return (B_TRUE);
402168404Spjd	}
403169303Spjd	return (B_FALSE);
404219089Spjd#endif
405185029Spjd}
406168404Spjd
407168404Spjd/*
408219089Spjd * Create a leaf vdev.  Determine if this is a file or a device.  If it's a
409219089Spjd * device, fill in the device id to make a complete nvlist.  Valid forms for a
410219089Spjd * leaf vdev are:
411168404Spjd *
412219089Spjd * 	/dev/dsk/xxx	Complete disk path
413219089Spjd * 	/xxx		Full path to file
414219089Spjd * 	xxx		Shorthand for /dev/dsk/xxx
415168404Spjd */
416185029Spjdstatic nvlist_t *
417185029Spjdmake_leaf_vdev(const char *arg, uint64_t is_log)
418168404Spjd{
419185029Spjd	char path[MAXPATHLEN];
420169303Spjd	struct stat64 statbuf;
421168404Spjd	nvlist_t *vdev = NULL;
422168404Spjd	char *type = NULL;
423169303Spjd	boolean_t wholedisk = B_FALSE;
424168404Spjd
425185029Spjd	/*
426185029Spjd	 * Determine what type of vdev this is, and put the full path into
427185029Spjd	 * 'path'.  We detect whether this is a device of file afterwards by
428185029Spjd	 * checking the st_mode of the file.
429185029Spjd	 */
430185029Spjd	if (arg[0] == '/') {
431185029Spjd		/*
432185029Spjd		 * Complete device or file path.  Exact type is determined by
433185029Spjd		 * examining the file descriptor afterwards.
434185029Spjd		 */
435185029Spjd		wholedisk = is_whole_disk(arg);
436185029Spjd		if (!wholedisk && (stat64(arg, &statbuf) != 0)) {
437185029Spjd			(void) fprintf(stderr,
438185029Spjd			    gettext("cannot open '%s': %s\n"),
439185029Spjd			    arg, strerror(errno));
440185029Spjd			return (NULL);
441185029Spjd		}
442168404Spjd
443185029Spjd		(void) strlcpy(path, arg, sizeof (path));
444185029Spjd	} else {
445185029Spjd		/*
446185029Spjd		 * This may be a short path for a device, or it could be total
447185029Spjd		 * gibberish.  Check to see if it's a known device in
448185029Spjd		 * /dev/dsk/.  As part of this check, see if we've been given a
449185029Spjd		 * an entire disk (minus the slice number).
450185029Spjd		 */
451185029Spjd		if (strncmp(arg, _PATH_DEV, sizeof(_PATH_DEV) - 1) == 0)
452185029Spjd			strlcpy(path, arg, sizeof (path));
453185029Spjd		else
454185029Spjd			snprintf(path, sizeof (path), "%s%s", _PATH_DEV, arg);
455185029Spjd		wholedisk = is_whole_disk(path);
456185029Spjd		if (!wholedisk && (stat64(path, &statbuf) != 0)) {
457185029Spjd			/*
458185029Spjd			 * If we got ENOENT, then the user gave us
459185029Spjd			 * gibberish, so try to direct them with a
460185029Spjd			 * reasonable error message.  Otherwise,
461185029Spjd			 * regurgitate strerror() since it's the best we
462185029Spjd			 * can do.
463185029Spjd			 */
464185029Spjd			if (errno == ENOENT) {
465185029Spjd				(void) fprintf(stderr,
466185029Spjd				    gettext("cannot open '%s': no such "
467185029Spjd				    "GEOM provider\n"), arg);
468185029Spjd				(void) fprintf(stderr,
469185029Spjd				    gettext("must be a full path or "
470185029Spjd				    "shorthand device name\n"));
471185029Spjd				return (NULL);
472185029Spjd			} else {
473185029Spjd				(void) fprintf(stderr,
474185029Spjd				    gettext("cannot open '%s': %s\n"),
475185029Spjd				    path, strerror(errno));
476185029Spjd				return (NULL);
477185029Spjd			}
478185029Spjd		}
479185029Spjd	}
480185029Spjd
481219089Spjd#ifdef __FreeBSD__
482219089Spjd	if (S_ISCHR(statbuf.st_mode)) {
483219089Spjd		statbuf.st_mode &= ~S_IFCHR;
484219089Spjd		statbuf.st_mode |= S_IFBLK;
485219089Spjd		wholedisk = B_FALSE;
486219089Spjd	}
487219089Spjd#endif
488219089Spjd
489185029Spjd	/*
490185029Spjd	 * Determine whether this is a device or a file.
491185029Spjd	 */
492219089Spjd	if (wholedisk || S_ISBLK(statbuf.st_mode)) {
493168404Spjd		type = VDEV_TYPE_DISK;
494185029Spjd	} else if (S_ISREG(statbuf.st_mode)) {
495185029Spjd		type = VDEV_TYPE_FILE;
496185029Spjd	} else {
497168404Spjd		(void) fprintf(stderr, gettext("cannot use '%s': must be a "
498185029Spjd		    "GEOM provider or regular file\n"), path);
499168404Spjd		return (NULL);
500168404Spjd	}
501168404Spjd
502168404Spjd	/*
503168404Spjd	 * Finally, we have the complete device or file, and we know that it is
504168404Spjd	 * acceptable to use.  Construct the nvlist to describe this vdev.  All
505168404Spjd	 * vdevs have a 'path' element, and devices also have a 'devid' element.
506168404Spjd	 */
507168404Spjd	verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0);
508168404Spjd	verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0);
509168404Spjd	verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0);
510185029Spjd	verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_LOG, is_log) == 0);
511168404Spjd	if (strcmp(type, VDEV_TYPE_DISK) == 0)
512168404Spjd		verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK,
513219089Spjd		    (uint64_t)wholedisk) == 0);
514168404Spjd
515266611Smav#ifdef have_devid
516169303Spjd	/*
517169303Spjd	 * For a whole disk, defer getting its devid until after labeling it.
518169303Spjd	 */
519219089Spjd	if (S_ISBLK(statbuf.st_mode) && !wholedisk) {
520169303Spjd		/*
521169303Spjd		 * Get the devid for the device.
522169303Spjd		 */
523169303Spjd		int fd;
524169303Spjd		ddi_devid_t devid;
525169303Spjd		char *minor = NULL, *devid_str = NULL;
526169303Spjd
527169303Spjd		if ((fd = open(path, O_RDONLY)) < 0) {
528169303Spjd			(void) fprintf(stderr, gettext("cannot open '%s': "
529169303Spjd			    "%s\n"), path, strerror(errno));
530169303Spjd			nvlist_free(vdev);
531169303Spjd			return (NULL);
532169303Spjd		}
533169303Spjd
534169303Spjd		if (devid_get(fd, &devid) == 0) {
535169303Spjd			if (devid_get_minor_name(fd, &minor) == 0 &&
536169303Spjd			    (devid_str = devid_str_encode(devid, minor)) !=
537169303Spjd			    NULL) {
538169303Spjd				verify(nvlist_add_string(vdev,
539169303Spjd				    ZPOOL_CONFIG_DEVID, devid_str) == 0);
540169303Spjd			}
541169303Spjd			if (devid_str != NULL)
542169303Spjd				devid_str_free(devid_str);
543169303Spjd			if (minor != NULL)
544169303Spjd				devid_str_free(minor);
545169303Spjd			devid_free(devid);
546169303Spjd		}
547169303Spjd
548169303Spjd		(void) close(fd);
549169303Spjd	}
550266611Smav#endif
551169303Spjd
552168404Spjd	return (vdev);
553168404Spjd}
554168404Spjd
555168404Spjd/*
556168404Spjd * Go through and verify the replication level of the pool is consistent.
557168404Spjd * Performs the following checks:
558168404Spjd *
559168404Spjd * 	For the new spec, verifies that devices in mirrors and raidz are the
560168404Spjd * 	same size.
561168404Spjd *
562168404Spjd * 	If the current configuration already has inconsistent replication
563168404Spjd * 	levels, ignore any other potential problems in the new spec.
564168404Spjd *
565168404Spjd * 	Otherwise, make sure that the current spec (if there is one) and the new
566168404Spjd * 	spec have consistent replication levels.
567168404Spjd */
/*
 * Replication summary of one top-level vdev, as computed by
 * get_replication().
 */
typedef struct replication_level {
	char *zprl_type;	/* ZPOOL_CONFIG_TYPE string of the vdev */
	uint64_t zprl_children;	/* 1 for a leaf, else number of children */
	uint64_t zprl_parity;	/* ZPOOL_CONFIG_NPARITY for raidz, else 0 */
} replication_level_t;

/*
 * Largest size difference (16MB) tolerated between children of the same
 * mirror or raidz vdev before get_replication() reports a mismatch.
 */
#define	ZPOOL_FUZZ	(16 * 1024 * 1024)
575185029Spjd
576168404Spjd/*
577168404Spjd * Given a list of toplevel vdevs, return the current replication level.  If
578168404Spjd * the config is inconsistent, then NULL is returned.  If 'fatal' is set, then
579168404Spjd * an error message will be displayed for each self-inconsistent vdev.
580168404Spjd */
581185029Spjdstatic replication_level_t *
582168404Spjdget_replication(nvlist_t *nvroot, boolean_t fatal)
583168404Spjd{
584168404Spjd	nvlist_t **top;
585168404Spjd	uint_t t, toplevels;
586168404Spjd	nvlist_t **child;
587168404Spjd	uint_t c, children;
588168404Spjd	nvlist_t *nv;
589168404Spjd	char *type;
590297119Smav	replication_level_t lastrep = {0};
591297119Smav	replication_level_t rep;
592297119Smav	replication_level_t *ret;
593168404Spjd	boolean_t dontreport;
594168404Spjd
595168404Spjd	ret = safe_malloc(sizeof (replication_level_t));
596168404Spjd
597168404Spjd	verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
598168404Spjd	    &top, &toplevels) == 0);
599168404Spjd
600168404Spjd	for (t = 0; t < toplevels; t++) {
601185029Spjd		uint64_t is_log = B_FALSE;
602185029Spjd
603168404Spjd		nv = top[t];
604168404Spjd
605185029Spjd		/*
606185029Spjd		 * For separate logs we ignore the top level vdev replication
607185029Spjd		 * constraints.
608185029Spjd		 */
609185029Spjd		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &is_log);
610185029Spjd		if (is_log)
611185029Spjd			continue;
612168404Spjd
613185029Spjd		verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE,
614185029Spjd		    &type) == 0);
615168404Spjd		if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
616168404Spjd		    &child, &children) != 0) {
617168404Spjd			/*
618168404Spjd			 * This is a 'file' or 'disk' vdev.
619168404Spjd			 */
620168404Spjd			rep.zprl_type = type;
621168404Spjd			rep.zprl_children = 1;
622168404Spjd			rep.zprl_parity = 0;
623168404Spjd		} else {
624168404Spjd			uint64_t vdev_size;
625168404Spjd
626168404Spjd			/*
627168404Spjd			 * This is a mirror or RAID-Z vdev.  Go through and make
628168404Spjd			 * sure the contents are all the same (files vs. disks),
629168404Spjd			 * keeping track of the number of elements in the
630168404Spjd			 * process.
631168404Spjd			 *
632168404Spjd			 * We also check that the size of each vdev (if it can
633168404Spjd			 * be determined) is the same.
634168404Spjd			 */
635168404Spjd			rep.zprl_type = type;
636168404Spjd			rep.zprl_children = 0;
637168404Spjd
638168404Spjd			if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) {
639168404Spjd				verify(nvlist_lookup_uint64(nv,
640168404Spjd				    ZPOOL_CONFIG_NPARITY,
641168404Spjd				    &rep.zprl_parity) == 0);
642168404Spjd				assert(rep.zprl_parity != 0);
643168404Spjd			} else {
644168404Spjd				rep.zprl_parity = 0;
645168404Spjd			}
646168404Spjd
647168404Spjd			/*
648185029Spjd			 * The 'dontreport' variable indicates that we've
649168404Spjd			 * already reported an error for this spec, so don't
650168404Spjd			 * bother doing it again.
651168404Spjd			 */
652168404Spjd			type = NULL;
653168404Spjd			dontreport = 0;
654168404Spjd			vdev_size = -1ULL;
655168404Spjd			for (c = 0; c < children; c++) {
656168404Spjd				nvlist_t *cnv = child[c];
657168404Spjd				char *path;
658168404Spjd				struct stat64 statbuf;
659168404Spjd				uint64_t size = -1ULL;
660168404Spjd				char *childtype;
661168404Spjd				int fd, err;
662168404Spjd
663168404Spjd				rep.zprl_children++;
664168404Spjd
665168404Spjd				verify(nvlist_lookup_string(cnv,
666168404Spjd				    ZPOOL_CONFIG_TYPE, &childtype) == 0);
667168404Spjd
668168404Spjd				/*
669185029Spjd				 * If this is a replacing or spare vdev, then
670168404Spjd				 * get the real first child of the vdev.
671168404Spjd				 */
672168404Spjd				if (strcmp(childtype,
673168404Spjd				    VDEV_TYPE_REPLACING) == 0 ||
674168404Spjd				    strcmp(childtype, VDEV_TYPE_SPARE) == 0) {
675168404Spjd					nvlist_t **rchild;
676168404Spjd					uint_t rchildren;
677168404Spjd
678168404Spjd					verify(nvlist_lookup_nvlist_array(cnv,
679168404Spjd					    ZPOOL_CONFIG_CHILDREN, &rchild,
680168404Spjd					    &rchildren) == 0);
681168404Spjd					assert(rchildren == 2);
682168404Spjd					cnv = rchild[0];
683168404Spjd
684168404Spjd					verify(nvlist_lookup_string(cnv,
685168404Spjd					    ZPOOL_CONFIG_TYPE,
686168404Spjd					    &childtype) == 0);
687330735Sasomers					if (strcmp(childtype,
688330735Sasomers					    VDEV_TYPE_SPARE) == 0) {
689330735Sasomers						/* We have a replacing vdev with
690330735Sasomers						 * a spare child.  Get the first
691330735Sasomers						 * real child of the spare
692330735Sasomers						 */
693330735Sasomers						verify(
694330735Sasomers						    nvlist_lookup_nvlist_array(
695330735Sasomers							cnv,
696330735Sasomers							ZPOOL_CONFIG_CHILDREN,
697330735Sasomers							&rchild,
698330735Sasomers						    &rchildren) == 0);
699330735Sasomers						assert(rchildren >= 2);
700330735Sasomers						cnv = rchild[0];
701330735Sasomers					}
702168404Spjd				}
703168404Spjd
704168404Spjd				verify(nvlist_lookup_string(cnv,
705168404Spjd				    ZPOOL_CONFIG_PATH, &path) == 0);
706168404Spjd
707168404Spjd				/*
708168404Spjd				 * If we have a raidz/mirror that combines disks
709168404Spjd				 * with files, report it as an error.
710168404Spjd				 */
711168404Spjd				if (!dontreport && type != NULL &&
712168404Spjd				    strcmp(type, childtype) != 0) {
713168404Spjd					if (ret != NULL)
714168404Spjd						free(ret);
715168404Spjd					ret = NULL;
716168404Spjd					if (fatal)
717168404Spjd						vdev_error(gettext(
718168404Spjd						    "mismatched replication "
719168404Spjd						    "level: %s contains both "
720168404Spjd						    "files and devices\n"),
721168404Spjd						    rep.zprl_type);
722168404Spjd					else
723168404Spjd						return (NULL);
724168404Spjd					dontreport = B_TRUE;
725168404Spjd				}
726168404Spjd
727168404Spjd				/*
728168404Spjd				 * According to stat(2), the value of 'st_size'
729168404Spjd				 * is undefined for block devices and character
730168404Spjd				 * devices.  But there is no effective way to
731168404Spjd				 * determine the real size in userland.
732168404Spjd				 *
733168404Spjd				 * Instead, we'll take advantage of an
734168404Spjd				 * implementation detail of spec_size().  If the
735168404Spjd				 * device is currently open, then we (should)
736168404Spjd				 * return a valid size.
737168404Spjd				 *
738168404Spjd				 * If we still don't get a valid size (indicated
739168404Spjd				 * by a size of 0 or MAXOFFSET_T), then ignore
740168404Spjd				 * this device altogether.
741168404Spjd				 */
742168404Spjd				if ((fd = open(path, O_RDONLY)) >= 0) {
743168404Spjd					err = fstat64(fd, &statbuf);
744168404Spjd					(void) close(fd);
745168404Spjd				} else {
746168404Spjd					err = stat64(path, &statbuf);
747168404Spjd				}
748219089Spjd
749219089Spjd				if (err != 0 ||
750219089Spjd				    statbuf.st_size == 0 ||
751219089Spjd				    statbuf.st_size == MAXOFFSET_T)
752168404Spjd					continue;
753168404Spjd
754168404Spjd				size = statbuf.st_size;
755168404Spjd
756168404Spjd				/*
757185029Spjd				 * Also make sure that devices and
758185029Spjd				 * slices have a consistent size.  If
759185029Spjd				 * they differ by a significant amount
760185029Spjd				 * (~16MB) then report an error.
761168404Spjd				 */
762185029Spjd				if (!dontreport &&
763185029Spjd				    (vdev_size != -1ULL &&
764185029Spjd				    (labs(size - vdev_size) >
765185029Spjd				    ZPOOL_FUZZ))) {
766168404Spjd					if (ret != NULL)
767168404Spjd						free(ret);
768168404Spjd					ret = NULL;
769168404Spjd					if (fatal)
770168404Spjd						vdev_error(gettext(
771168404Spjd						    "%s contains devices of "
772168404Spjd						    "different sizes\n"),
773168404Spjd						    rep.zprl_type);
774168404Spjd					else
775168404Spjd						return (NULL);
776168404Spjd					dontreport = B_TRUE;
777168404Spjd				}
778168404Spjd
779168404Spjd				type = childtype;
780168404Spjd				vdev_size = size;
781168404Spjd			}
782168404Spjd		}
783168404Spjd
784168404Spjd		/*
785168404Spjd		 * At this point, we have the replication of the last toplevel
786168404Spjd		 * vdev in 'rep'.  Compare it to 'lastrep' to see if its
787168404Spjd		 * different.
788168404Spjd		 */
789168404Spjd		if (lastrep.zprl_type != NULL) {
790168404Spjd			if (strcmp(lastrep.zprl_type, rep.zprl_type) != 0) {
791168404Spjd				if (ret != NULL)
792168404Spjd					free(ret);
793168404Spjd				ret = NULL;
794168404Spjd				if (fatal)
795168404Spjd					vdev_error(gettext(
796168404Spjd					    "mismatched replication level: "
797168404Spjd					    "both %s and %s vdevs are "
798168404Spjd					    "present\n"),
799168404Spjd					    lastrep.zprl_type, rep.zprl_type);
800168404Spjd				else
801168404Spjd					return (NULL);
802168404Spjd			} else if (lastrep.zprl_parity != rep.zprl_parity) {
803168404Spjd				if (ret)
804168404Spjd					free(ret);
805168404Spjd				ret = NULL;
806168404Spjd				if (fatal)
807168404Spjd					vdev_error(gettext(
808168404Spjd					    "mismatched replication level: "
809168404Spjd					    "both %llu and %llu device parity "
810168404Spjd					    "%s vdevs are present\n"),
811168404Spjd					    lastrep.zprl_parity,
812168404Spjd					    rep.zprl_parity,
813168404Spjd					    rep.zprl_type);
814168404Spjd				else
815168404Spjd					return (NULL);
816168404Spjd			} else if (lastrep.zprl_children != rep.zprl_children) {
817168404Spjd				if (ret)
818168404Spjd					free(ret);
819168404Spjd				ret = NULL;
820168404Spjd				if (fatal)
821168404Spjd					vdev_error(gettext(
822168404Spjd					    "mismatched replication level: "
823168404Spjd					    "both %llu-way and %llu-way %s "
824168404Spjd					    "vdevs are present\n"),
825168404Spjd					    lastrep.zprl_children,
826168404Spjd					    rep.zprl_children,
827168404Spjd					    rep.zprl_type);
828168404Spjd				else
829168404Spjd					return (NULL);
830168404Spjd			}
831168404Spjd		}
832168404Spjd		lastrep = rep;
833168404Spjd	}
834168404Spjd
835168404Spjd	if (ret != NULL)
836168404Spjd		*ret = rep;
837168404Spjd
838168404Spjd	return (ret);
839168404Spjd}
840168404Spjd
841168404Spjd/*
842168404Spjd * Check the replication level of the vdev spec against the current pool.  Calls
843168404Spjd * get_replication() to make sure the new spec is self-consistent.  If the pool
844168404Spjd * has a consistent replication level, then we ignore any errors.  Otherwise,
845168404Spjd * report any difference between the two.
846168404Spjd */
847185029Spjdstatic int
848168404Spjdcheck_replication(nvlist_t *config, nvlist_t *newroot)
849168404Spjd{
850185029Spjd	nvlist_t **child;
851185029Spjd	uint_t	children;
852168404Spjd	replication_level_t *current = NULL, *new;
853168404Spjd	int ret;
854168404Spjd
855168404Spjd	/*
856168404Spjd	 * If we have a current pool configuration, check to see if it's
857168404Spjd	 * self-consistent.  If not, simply return success.
858168404Spjd	 */
859168404Spjd	if (config != NULL) {
860168404Spjd		nvlist_t *nvroot;
861168404Spjd
862168404Spjd		verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
863168404Spjd		    &nvroot) == 0);
864168404Spjd		if ((current = get_replication(nvroot, B_FALSE)) == NULL)
865168404Spjd			return (0);
866168404Spjd	}
867185029Spjd	/*
868185029Spjd	 * for spares there may be no children, and therefore no
869185029Spjd	 * replication level to check
870185029Spjd	 */
871185029Spjd	if ((nvlist_lookup_nvlist_array(newroot, ZPOOL_CONFIG_CHILDREN,
872185029Spjd	    &child, &children) != 0) || (children == 0)) {
873185029Spjd		free(current);
874185029Spjd		return (0);
875185029Spjd	}
876168404Spjd
877168404Spjd	/*
878185029Spjd	 * If all we have is logs then there's no replication level to check.
879185029Spjd	 */
880185029Spjd	if (num_logs(newroot) == children) {
881185029Spjd		free(current);
882185029Spjd		return (0);
883185029Spjd	}
884185029Spjd
885185029Spjd	/*
886168404Spjd	 * Get the replication level of the new vdev spec, reporting any
887168404Spjd	 * inconsistencies found.
888168404Spjd	 */
889168404Spjd	if ((new = get_replication(newroot, B_TRUE)) == NULL) {
890168404Spjd		free(current);
891168404Spjd		return (-1);
892168404Spjd	}
893168404Spjd
894168404Spjd	/*
895168404Spjd	 * Check to see if the new vdev spec matches the replication level of
896168404Spjd	 * the current pool.
897168404Spjd	 */
898168404Spjd	ret = 0;
899168404Spjd	if (current != NULL) {
900168404Spjd		if (strcmp(current->zprl_type, new->zprl_type) != 0) {
901168404Spjd			vdev_error(gettext(
902168404Spjd			    "mismatched replication level: pool uses %s "
903168404Spjd			    "and new vdev is %s\n"),
904168404Spjd			    current->zprl_type, new->zprl_type);
905168404Spjd			ret = -1;
906168404Spjd		} else if (current->zprl_parity != new->zprl_parity) {
907168404Spjd			vdev_error(gettext(
908168404Spjd			    "mismatched replication level: pool uses %llu "
909168404Spjd			    "device parity and new vdev uses %llu\n"),
910168404Spjd			    current->zprl_parity, new->zprl_parity);
911168404Spjd			ret = -1;
912168404Spjd		} else if (current->zprl_children != new->zprl_children) {
913168404Spjd			vdev_error(gettext(
914168404Spjd			    "mismatched replication level: pool uses %llu-way "
915168404Spjd			    "%s and new vdev uses %llu-way %s\n"),
916168404Spjd			    current->zprl_children, current->zprl_type,
917168404Spjd			    new->zprl_children, new->zprl_type);
918168404Spjd			ret = -1;
919168404Spjd		}
920168404Spjd	}
921168404Spjd
922168404Spjd	free(new);
923168404Spjd	if (current != NULL)
924168404Spjd		free(current);
925168404Spjd
926168404Spjd	return (ret);
927168404Spjd}
928168404Spjd
929297077Smav#ifdef illumos
930168404Spjd/*
931219089Spjd * Go through and find any whole disks in the vdev specification, labelling them
932219089Spjd * as appropriate.  When constructing the vdev spec, we were unable to open this
933219089Spjd * device in order to provide a devid.  Now that we have labelled the disk and
934219089Spjd * know that slice 0 is valid, we can construct the devid now.
935219089Spjd *
936219089Spjd * If the disk was already labeled with an EFI label, we will have gotten the
937219089Spjd * devid already (because we were able to open the whole disk).  Otherwise, we
938219089Spjd * need to get the devid after we label the disk.
939219089Spjd */
940219089Spjdstatic int
941219089Spjdmake_disks(zpool_handle_t *zhp, nvlist_t *nv)
942219089Spjd{
943219089Spjd	nvlist_t **child;
944219089Spjd	uint_t c, children;
945219089Spjd	char *type, *path, *diskname;
946219089Spjd	char buf[MAXPATHLEN];
947219089Spjd	uint64_t wholedisk;
948219089Spjd	int fd;
949219089Spjd	int ret;
950219089Spjd	ddi_devid_t devid;
951219089Spjd	char *minor = NULL, *devid_str = NULL;
952219089Spjd
953219089Spjd	verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
954219089Spjd
955219089Spjd	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
956219089Spjd	    &child, &children) != 0) {
957219089Spjd
958219089Spjd		if (strcmp(type, VDEV_TYPE_DISK) != 0)
959219089Spjd			return (0);
960219089Spjd
961219089Spjd		/*
962219089Spjd		 * We have a disk device.  Get the path to the device
963219089Spjd		 * and see if it's a whole disk by appending the backup
964219089Spjd		 * slice and stat()ing the device.
965219089Spjd		 */
966219089Spjd		verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0);
967219089Spjd		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
968219089Spjd		    &wholedisk) != 0 || !wholedisk)
969219089Spjd			return (0);
970219089Spjd
971219089Spjd		diskname = strrchr(path, '/');
972219089Spjd		assert(diskname != NULL);
973219089Spjd		diskname++;
974219089Spjd		if (zpool_label_disk(g_zfs, zhp, diskname) == -1)
975219089Spjd			return (-1);
976219089Spjd
977219089Spjd		/*
978219089Spjd		 * Fill in the devid, now that we've labeled the disk.
979219089Spjd		 */
980219089Spjd		(void) snprintf(buf, sizeof (buf), "%ss0", path);
981219089Spjd		if ((fd = open(buf, O_RDONLY)) < 0) {
982219089Spjd			(void) fprintf(stderr,
983219089Spjd			    gettext("cannot open '%s': %s\n"),
984219089Spjd			    buf, strerror(errno));
985219089Spjd			return (-1);
986219089Spjd		}
987219089Spjd
988219089Spjd		if (devid_get(fd, &devid) == 0) {
989219089Spjd			if (devid_get_minor_name(fd, &minor) == 0 &&
990219089Spjd			    (devid_str = devid_str_encode(devid, minor)) !=
991219089Spjd			    NULL) {
992219089Spjd				verify(nvlist_add_string(nv,
993219089Spjd				    ZPOOL_CONFIG_DEVID, devid_str) == 0);
994219089Spjd			}
995219089Spjd			if (devid_str != NULL)
996219089Spjd				devid_str_free(devid_str);
997219089Spjd			if (minor != NULL)
998219089Spjd				devid_str_free(minor);
999219089Spjd			devid_free(devid);
1000219089Spjd		}
1001219089Spjd
1002219089Spjd		/*
1003219089Spjd		 * Update the path to refer to the 's0' slice.  The presence of
1004219089Spjd		 * the 'whole_disk' field indicates to the CLI that we should
1005219089Spjd		 * chop off the slice number when displaying the device in
1006219089Spjd		 * future output.
1007219089Spjd		 */
1008219089Spjd		verify(nvlist_add_string(nv, ZPOOL_CONFIG_PATH, buf) == 0);
1009219089Spjd
1010219089Spjd		(void) close(fd);
1011219089Spjd
1012219089Spjd		return (0);
1013219089Spjd	}
1014219089Spjd
1015219089Spjd	for (c = 0; c < children; c++)
1016219089Spjd		if ((ret = make_disks(zhp, child[c])) != 0)
1017219089Spjd			return (ret);
1018219089Spjd
1019219089Spjd	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
1020219089Spjd	    &child, &children) == 0)
1021219089Spjd		for (c = 0; c < children; c++)
1022219089Spjd			if ((ret = make_disks(zhp, child[c])) != 0)
1023219089Spjd				return (ret);
1024219089Spjd
1025219089Spjd	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
1026219089Spjd	    &child, &children) == 0)
1027219089Spjd		for (c = 0; c < children; c++)
1028219089Spjd			if ((ret = make_disks(zhp, child[c])) != 0)
1029219089Spjd				return (ret);
1030219089Spjd
1031219089Spjd	return (0);
1032219089Spjd}
1033297077Smav#endif	/* illumos */
1034219089Spjd
1035219089Spjd/*
1036168404Spjd * Determine if the given path is a hot spare within the given configuration.
1037168404Spjd */
1038168404Spjdstatic boolean_t
1039168404Spjdis_spare(nvlist_t *config, const char *path)
1040168404Spjd{
1041168404Spjd	int fd;
1042168404Spjd	pool_state_t state;
1043168404Spjd	char *name = NULL;
1044168404Spjd	nvlist_t *label;
1045168404Spjd	uint64_t guid, spareguid;
1046168404Spjd	nvlist_t *nvroot;
1047168404Spjd	nvlist_t **spares;
1048168404Spjd	uint_t i, nspares;
1049168404Spjd	boolean_t inuse;
1050168404Spjd
1051168404Spjd	if ((fd = open(path, O_RDONLY)) < 0)
1052168404Spjd		return (B_FALSE);
1053168404Spjd
1054168404Spjd	if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) != 0 ||
1055168404Spjd	    !inuse ||
1056168404Spjd	    state != POOL_STATE_SPARE ||
1057168404Spjd	    zpool_read_label(fd, &label) != 0) {
1058168404Spjd		free(name);
1059168404Spjd		(void) close(fd);
1060168404Spjd		return (B_FALSE);
1061168404Spjd	}
1062168404Spjd	free(name);
1063219089Spjd	(void) close(fd);
1064168404Spjd
1065168404Spjd	verify(nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) == 0);
1066168404Spjd	nvlist_free(label);
1067168404Spjd
1068168404Spjd	verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
1069168404Spjd	    &nvroot) == 0);
1070168404Spjd	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
1071168404Spjd	    &spares, &nspares) == 0) {
1072168404Spjd		for (i = 0; i < nspares; i++) {
1073168404Spjd			verify(nvlist_lookup_uint64(spares[i],
1074168404Spjd			    ZPOOL_CONFIG_GUID, &spareguid) == 0);
1075168404Spjd			if (spareguid == guid)
1076168404Spjd				return (B_TRUE);
1077168404Spjd		}
1078168404Spjd	}
1079168404Spjd
1080168404Spjd	return (B_FALSE);
1081168404Spjd}
1082168404Spjd
1083168404Spjd/*
1084168404Spjd * Go through and find any devices that are in use.  We rely on libdiskmgt for
1085168404Spjd * the majority of this task.
1086168404Spjd */
1087272136Sdelphijstatic boolean_t
1088272136Sdelphijis_device_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force,
1089219089Spjd    boolean_t replacing, boolean_t isspare)
1090168404Spjd{
1091168404Spjd	nvlist_t **child;
1092168404Spjd	uint_t c, children;
1093168404Spjd	char *type, *path;
1094297119Smav	int ret = 0;
1095168404Spjd	char buf[MAXPATHLEN];
1096168404Spjd	uint64_t wholedisk;
1097272136Sdelphij	boolean_t anyinuse = B_FALSE;
1098168404Spjd
1099168404Spjd	verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
1100168404Spjd
1101168404Spjd	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
1102168404Spjd	    &child, &children) != 0) {
1103168404Spjd
1104168404Spjd		verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0);
1105168404Spjd
1106168404Spjd		/*
1107168404Spjd		 * As a generic check, we look to see if this is a replace of a
1108168404Spjd		 * hot spare within the same pool.  If so, we allow it
1109168404Spjd		 * regardless of what libdiskmgt or zpool_in_use() says.
1110168404Spjd		 */
1111219089Spjd		if (replacing) {
1112297077Smav#ifdef illumos
1113219089Spjd			if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
1114219089Spjd			    &wholedisk) == 0 && wholedisk)
1115219089Spjd				(void) snprintf(buf, sizeof (buf), "%ss0",
1116219089Spjd				    path);
1117219089Spjd			else
1118219089Spjd#endif
1119219089Spjd				(void) strlcpy(buf, path, sizeof (buf));
1120219089Spjd
1121168404Spjd			if (is_spare(config, buf))
1122272136Sdelphij				return (B_FALSE);
1123168404Spjd		}
1124168404Spjd
1125168404Spjd		if (strcmp(type, VDEV_TYPE_DISK) == 0)
1126219089Spjd			ret = check_device(path, force, isspare);
1127272136Sdelphij		else if (strcmp(type, VDEV_TYPE_FILE) == 0)
1128185029Spjd			ret = check_file(path, force, isspare);
1129185029Spjd
1130272136Sdelphij		return (ret != 0);
1131168404Spjd	}
1132168404Spjd
1133168404Spjd	for (c = 0; c < children; c++)
1134272136Sdelphij		if (is_device_in_use(config, child[c], force, replacing,
1135272136Sdelphij		    B_FALSE))
1136272136Sdelphij			anyinuse = B_TRUE;
1137168404Spjd
1138168404Spjd	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
1139168404Spjd	    &child, &children) == 0)
1140168404Spjd		for (c = 0; c < children; c++)
1141272136Sdelphij			if (is_device_in_use(config, child[c], force, replacing,
1142272136Sdelphij			    B_TRUE))
1143272136Sdelphij				anyinuse = B_TRUE;
1144168404Spjd
1145185029Spjd	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
1146185029Spjd	    &child, &children) == 0)
1147185029Spjd		for (c = 0; c < children; c++)
1148272136Sdelphij			if (is_device_in_use(config, child[c], force, replacing,
1149272136Sdelphij			    B_FALSE))
1150272136Sdelphij				anyinuse = B_TRUE;
1151185029Spjd
1152272136Sdelphij	return (anyinuse);
1153168404Spjd}
1154168404Spjd
1155185029Spjdstatic const char *
1156219089Spjdis_grouping(const char *type, int *mindev, int *maxdev)
1157168404Spjd{
1158219089Spjd	if (strncmp(type, "raidz", 5) == 0) {
1159219089Spjd		const char *p = type + 5;
1160219089Spjd		char *end;
1161219089Spjd		long nparity;
1162168404Spjd
1163219089Spjd		if (*p == '\0') {
1164219089Spjd			nparity = 1;
1165219089Spjd		} else if (*p == '0') {
1166219089Spjd			return (NULL); /* no zero prefixes allowed */
1167219089Spjd		} else {
1168219089Spjd			errno = 0;
1169219089Spjd			nparity = strtol(p, &end, 10);
1170219089Spjd			if (errno != 0 || nparity < 1 || nparity >= 255 ||
1171219089Spjd			    *end != '\0')
1172219089Spjd				return (NULL);
1173219089Spjd		}
1174219089Spjd
1175168404Spjd		if (mindev != NULL)
1176219089Spjd			*mindev = nparity + 1;
1177219089Spjd		if (maxdev != NULL)
1178219089Spjd			*maxdev = 255;
1179168404Spjd		return (VDEV_TYPE_RAIDZ);
1180168404Spjd	}
1181168404Spjd
1182219089Spjd	if (maxdev != NULL)
1183219089Spjd		*maxdev = INT_MAX;
1184219089Spjd
1185168404Spjd	if (strcmp(type, "mirror") == 0) {
1186168404Spjd		if (mindev != NULL)
1187168404Spjd			*mindev = 2;
1188168404Spjd		return (VDEV_TYPE_MIRROR);
1189168404Spjd	}
1190168404Spjd
1191168404Spjd	if (strcmp(type, "spare") == 0) {
1192168404Spjd		if (mindev != NULL)
1193168404Spjd			*mindev = 1;
1194168404Spjd		return (VDEV_TYPE_SPARE);
1195168404Spjd	}
1196168404Spjd
1197185029Spjd	if (strcmp(type, "log") == 0) {
1198185029Spjd		if (mindev != NULL)
1199185029Spjd			*mindev = 1;
1200185029Spjd		return (VDEV_TYPE_LOG);
1201185029Spjd	}
1202185029Spjd
1203185029Spjd	if (strcmp(type, "cache") == 0) {
1204185029Spjd		if (mindev != NULL)
1205185029Spjd			*mindev = 1;
1206185029Spjd		return (VDEV_TYPE_L2CACHE);
1207185029Spjd	}
1208185029Spjd
1209168404Spjd	return (NULL);
1210168404Spjd}
1211168404Spjd
1212168404Spjd/*
1213168404Spjd * Construct a syntactically valid vdev specification,
1214168404Spjd * and ensure that all devices and files exist and can be opened.
1215168404Spjd * Note: we don't bother freeing anything in the error paths
1216168404Spjd * because the program is just going to exit anyway.
1217168404Spjd */
1218168404Spjdnvlist_t *
1219168404Spjdconstruct_spec(int argc, char **argv)
1220168404Spjd{
1221185029Spjd	nvlist_t *nvroot, *nv, **top, **spares, **l2cache;
1222219089Spjd	int t, toplevels, mindev, maxdev, nspares, nlogs, nl2cache;
1223168404Spjd	const char *type;
1224185029Spjd	uint64_t is_log;
1225185029Spjd	boolean_t seen_logs;
1226168404Spjd
1227168404Spjd	top = NULL;
1228168404Spjd	toplevels = 0;
1229168404Spjd	spares = NULL;
1230185029Spjd	l2cache = NULL;
1231168404Spjd	nspares = 0;
1232185029Spjd	nlogs = 0;
1233185029Spjd	nl2cache = 0;
1234185029Spjd	is_log = B_FALSE;
1235185029Spjd	seen_logs = B_FALSE;
1236168404Spjd
1237168404Spjd	while (argc > 0) {
1238168404Spjd		nv = NULL;
1239168404Spjd
1240168404Spjd		/*
1241168404Spjd		 * If it's a mirror or raidz, the subsequent arguments are
1242168404Spjd		 * its leaves -- until we encounter the next mirror or raidz.
1243168404Spjd		 */
1244219089Spjd		if ((type = is_grouping(argv[0], &mindev, &maxdev)) != NULL) {
1245168404Spjd			nvlist_t **child = NULL;
1246168404Spjd			int c, children = 0;
1247168404Spjd
1248185029Spjd			if (strcmp(type, VDEV_TYPE_SPARE) == 0) {
1249185029Spjd				if (spares != NULL) {
1250185029Spjd					(void) fprintf(stderr,
1251185029Spjd					    gettext("invalid vdev "
1252185029Spjd					    "specification: 'spare' can be "
1253185029Spjd					    "specified only once\n"));
1254185029Spjd					return (NULL);
1255185029Spjd				}
1256185029Spjd				is_log = B_FALSE;
1257168404Spjd			}
1258168404Spjd
1259185029Spjd			if (strcmp(type, VDEV_TYPE_LOG) == 0) {
1260185029Spjd				if (seen_logs) {
1261185029Spjd					(void) fprintf(stderr,
1262185029Spjd					    gettext("invalid vdev "
1263185029Spjd					    "specification: 'log' can be "
1264185029Spjd					    "specified only once\n"));
1265185029Spjd					return (NULL);
1266185029Spjd				}
1267185029Spjd				seen_logs = B_TRUE;
1268185029Spjd				is_log = B_TRUE;
1269185029Spjd				argc--;
1270185029Spjd				argv++;
1271185029Spjd				/*
1272185029Spjd				 * A log is not a real grouping device.
1273185029Spjd				 * We just set is_log and continue.
1274185029Spjd				 */
1275185029Spjd				continue;
1276185029Spjd			}
1277185029Spjd
1278185029Spjd			if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) {
1279185029Spjd				if (l2cache != NULL) {
1280185029Spjd					(void) fprintf(stderr,
1281185029Spjd					    gettext("invalid vdev "
1282185029Spjd					    "specification: 'cache' can be "
1283185029Spjd					    "specified only once\n"));
1284185029Spjd					return (NULL);
1285185029Spjd				}
1286185029Spjd				is_log = B_FALSE;
1287185029Spjd			}
1288185029Spjd
1289185029Spjd			if (is_log) {
1290185029Spjd				if (strcmp(type, VDEV_TYPE_MIRROR) != 0) {
1291185029Spjd					(void) fprintf(stderr,
1292185029Spjd					    gettext("invalid vdev "
1293185029Spjd					    "specification: unsupported 'log' "
1294185029Spjd					    "device: %s\n"), type);
1295185029Spjd					return (NULL);
1296185029Spjd				}
1297185029Spjd				nlogs++;
1298185029Spjd			}
1299185029Spjd
1300168404Spjd			for (c = 1; c < argc; c++) {
1301219089Spjd				if (is_grouping(argv[c], NULL, NULL) != NULL)
1302168404Spjd					break;
1303168404Spjd				children++;
1304168404Spjd				child = realloc(child,
1305168404Spjd				    children * sizeof (nvlist_t *));
1306168404Spjd				if (child == NULL)
1307168404Spjd					zpool_no_memory();
1308185029Spjd				if ((nv = make_leaf_vdev(argv[c], B_FALSE))
1309185029Spjd				    == NULL)
1310168404Spjd					return (NULL);
1311168404Spjd				child[children - 1] = nv;
1312168404Spjd			}
1313168404Spjd
1314168404Spjd			if (children < mindev) {
1315168404Spjd				(void) fprintf(stderr, gettext("invalid vdev "
1316168404Spjd				    "specification: %s requires at least %d "
1317168404Spjd				    "devices\n"), argv[0], mindev);
1318168404Spjd				return (NULL);
1319168404Spjd			}
1320168404Spjd
1321219089Spjd			if (children > maxdev) {
1322219089Spjd				(void) fprintf(stderr, gettext("invalid vdev "
1323219089Spjd				    "specification: %s supports no more than "
1324219089Spjd				    "%d devices\n"), argv[0], maxdev);
1325219089Spjd				return (NULL);
1326219089Spjd			}
1327219089Spjd
1328168404Spjd			argc -= c;
1329168404Spjd			argv += c;
1330168404Spjd
1331168404Spjd			if (strcmp(type, VDEV_TYPE_SPARE) == 0) {
1332168404Spjd				spares = child;
1333168404Spjd				nspares = children;
1334168404Spjd				continue;
1335185029Spjd			} else if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) {
1336185029Spjd				l2cache = child;
1337185029Spjd				nl2cache = children;
1338185029Spjd				continue;
1339168404Spjd			} else {
1340168404Spjd				verify(nvlist_alloc(&nv, NV_UNIQUE_NAME,
1341168404Spjd				    0) == 0);
1342168404Spjd				verify(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE,
1343168404Spjd				    type) == 0);
1344185029Spjd				verify(nvlist_add_uint64(nv,
1345185029Spjd				    ZPOOL_CONFIG_IS_LOG, is_log) == 0);
1346168404Spjd				if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) {
1347168404Spjd					verify(nvlist_add_uint64(nv,
1348168404Spjd					    ZPOOL_CONFIG_NPARITY,
1349168404Spjd					    mindev - 1) == 0);
1350168404Spjd				}
1351168404Spjd				verify(nvlist_add_nvlist_array(nv,
1352168404Spjd				    ZPOOL_CONFIG_CHILDREN, child,
1353168404Spjd				    children) == 0);
1354168404Spjd
1355168404Spjd				for (c = 0; c < children; c++)
1356168404Spjd					nvlist_free(child[c]);
1357168404Spjd				free(child);
1358168404Spjd			}
1359168404Spjd		} else {
1360168404Spjd			/*
1361168404Spjd			 * We have a device.  Pass off to make_leaf_vdev() to
1362168404Spjd			 * construct the appropriate nvlist describing the vdev.
1363168404Spjd			 */
1364185029Spjd			if ((nv = make_leaf_vdev(argv[0], is_log)) == NULL)
1365168404Spjd				return (NULL);
1366185029Spjd			if (is_log)
1367185029Spjd				nlogs++;
1368168404Spjd			argc--;
1369168404Spjd			argv++;
1370168404Spjd		}
1371168404Spjd
1372168404Spjd		toplevels++;
1373168404Spjd		top = realloc(top, toplevels * sizeof (nvlist_t *));
1374168404Spjd		if (top == NULL)
1375168404Spjd			zpool_no_memory();
1376168404Spjd		top[toplevels - 1] = nv;
1377168404Spjd	}
1378168404Spjd
1379185029Spjd	if (toplevels == 0 && nspares == 0 && nl2cache == 0) {
1380168404Spjd		(void) fprintf(stderr, gettext("invalid vdev "
1381168404Spjd		    "specification: at least one toplevel vdev must be "
1382168404Spjd		    "specified\n"));
1383168404Spjd		return (NULL);
1384168404Spjd	}
1385168404Spjd
1386185029Spjd	if (seen_logs && nlogs == 0) {
1387185029Spjd		(void) fprintf(stderr, gettext("invalid vdev specification: "
1388185029Spjd		    "log requires at least 1 device\n"));
1389185029Spjd		return (NULL);
1390185029Spjd	}
1391185029Spjd
1392168404Spjd	/*
1393168404Spjd	 * Finally, create nvroot and add all top-level vdevs to it.
1394168404Spjd	 */
1395168404Spjd	verify(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) == 0);
1396168404Spjd	verify(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
1397168404Spjd	    VDEV_TYPE_ROOT) == 0);
1398168404Spjd	verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
1399168404Spjd	    top, toplevels) == 0);
1400168404Spjd	if (nspares != 0)
1401168404Spjd		verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
1402168404Spjd		    spares, nspares) == 0);
1403185029Spjd	if (nl2cache != 0)
1404185029Spjd		verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
1405185029Spjd		    l2cache, nl2cache) == 0);
1406168404Spjd
1407168404Spjd	for (t = 0; t < toplevels; t++)
1408168404Spjd		nvlist_free(top[t]);
1409168404Spjd	for (t = 0; t < nspares; t++)
1410168404Spjd		nvlist_free(spares[t]);
1411185029Spjd	for (t = 0; t < nl2cache; t++)
1412185029Spjd		nvlist_free(l2cache[t]);
1413168404Spjd	if (spares)
1414168404Spjd		free(spares);
1415185029Spjd	if (l2cache)
1416185029Spjd		free(l2cache);
1417168404Spjd	free(top);
1418168404Spjd
1419168404Spjd	return (nvroot);
1420168404Spjd}
1421168404Spjd
1422219089Spjdnvlist_t *
1423219089Spjdsplit_mirror_vdev(zpool_handle_t *zhp, char *newname, nvlist_t *props,
1424219089Spjd    splitflags_t flags, int argc, char **argv)
1425219089Spjd{
1426219089Spjd	nvlist_t *newroot = NULL, **child;
1427219089Spjd	uint_t c, children;
1428185029Spjd
1429219089Spjd	if (argc > 0) {
1430219089Spjd		if ((newroot = construct_spec(argc, argv)) == NULL) {
1431219089Spjd			(void) fprintf(stderr, gettext("Unable to build a "
1432219089Spjd			    "pool from the specified devices\n"));
1433219089Spjd			return (NULL);
1434219089Spjd		}
1435219089Spjd
1436297077Smav#ifdef illumos
1437219089Spjd		if (!flags.dryrun && make_disks(zhp, newroot) != 0) {
1438219089Spjd			nvlist_free(newroot);
1439219089Spjd			return (NULL);
1440219089Spjd		}
1441219089Spjd#endif
1442219089Spjd
1443219089Spjd		/* avoid any tricks in the spec */
1444219089Spjd		verify(nvlist_lookup_nvlist_array(newroot,
1445219089Spjd		    ZPOOL_CONFIG_CHILDREN, &child, &children) == 0);
1446219089Spjd		for (c = 0; c < children; c++) {
1447219089Spjd			char *path;
1448219089Spjd			const char *type;
1449219089Spjd			int min, max;
1450219089Spjd
1451219089Spjd			verify(nvlist_lookup_string(child[c],
1452219089Spjd			    ZPOOL_CONFIG_PATH, &path) == 0);
1453219089Spjd			if ((type = is_grouping(path, &min, &max)) != NULL) {
1454219089Spjd				(void) fprintf(stderr, gettext("Cannot use "
1455219089Spjd				    "'%s' as a device for splitting\n"), type);
1456219089Spjd				nvlist_free(newroot);
1457219089Spjd				return (NULL);
1458219089Spjd			}
1459219089Spjd		}
1460219089Spjd	}
1461219089Spjd
1462219089Spjd	if (zpool_vdev_split(zhp, newname, &newroot, props, flags) != 0) {
1463297115Smav		nvlist_free(newroot);
1464219089Spjd		return (NULL);
1465219089Spjd	}
1466219089Spjd
1467219089Spjd	return (newroot);
1468219089Spjd}
1469219089Spjd
1470168404Spjd/*
1471168404Spjd * Get and validate the contents of the given vdev specification.  This ensures
1472168404Spjd * that the nvlist returned is well-formed, that all the devices exist, and that
1473168404Spjd * they are not currently in use by any other known consumer.  The 'poolconfig'
1474168404Spjd * parameter is the current configuration of the pool when adding devices
1475168404Spjd * existing pool, and is used to perform additional checks, such as changing the
1476168404Spjd * replication level of the pool.  It can be 'NULL' to indicate that this is a
1477168404Spjd * new pool.  The 'force' flag controls whether devices should be forcefully
1478168404Spjd * added, even if they appear in use.
1479168404Spjd */
1480168404Spjdnvlist_t *
1481185029Spjdmake_root_vdev(zpool_handle_t *zhp, int force, int check_rep,
1482219089Spjd    boolean_t replacing, boolean_t dryrun, int argc, char **argv)
1483168404Spjd{
1484168404Spjd	nvlist_t *newroot;
1485185029Spjd	nvlist_t *poolconfig = NULL;
1486168404Spjd	is_force = force;
1487168404Spjd
1488168404Spjd	/*
1489168404Spjd	 * Construct the vdev specification.  If this is successful, we know
1490168404Spjd	 * that we have a valid specification, and that all devices can be
1491168404Spjd	 * opened.
1492168404Spjd	 */
1493168404Spjd	if ((newroot = construct_spec(argc, argv)) == NULL)
1494168404Spjd		return (NULL);
1495168404Spjd
1496185029Spjd	if (zhp && ((poolconfig = zpool_get_config(zhp, NULL)) == NULL))
1497185029Spjd		return (NULL);
1498185029Spjd
1499168404Spjd	/*
1500168404Spjd	 * Validate each device to make sure that its not shared with another
1501168404Spjd	 * subsystem.  We do this even if 'force' is set, because there are some
1502168404Spjd	 * uses (such as a dedicated dump device) that even '-f' cannot
1503168404Spjd	 * override.
1504168404Spjd	 */
1505272136Sdelphij	if (is_device_in_use(poolconfig, newroot, force, replacing, B_FALSE)) {
1506168404Spjd		nvlist_free(newroot);
1507168404Spjd		return (NULL);
1508168404Spjd	}
1509168404Spjd
1510168404Spjd	/*
1511168404Spjd	 * Check the replication level of the given vdevs and report any errors
1512168404Spjd	 * found.  We include the existing pool spec, if any, as we need to
1513168404Spjd	 * catch changes against the existing replication level.
1514168404Spjd	 */
1515168404Spjd	if (check_rep && check_replication(poolconfig, newroot) != 0) {
1516168404Spjd		nvlist_free(newroot);
1517168404Spjd		return (NULL);
1518168404Spjd	}
1519168404Spjd
1520297077Smav#ifdef illumos
1521219089Spjd	/*
1522219089Spjd	 * Run through the vdev specification and label any whole disks found.
1523219089Spjd	 */
1524219089Spjd	if (!dryrun && make_disks(zhp, newroot) != 0) {
1525219089Spjd		nvlist_free(newroot);
1526219089Spjd		return (NULL);
1527219089Spjd	}
1528219089Spjd#endif
1529219089Spjd
1530168404Spjd	return (newroot);
1531168404Spjd}
1532