1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2013, 2018 by Delphix. All rights reserved.
25 * Copyright (c) 2016, 2017 Intel Corporation.
26 * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>.
27 */
28
29/*
30 * Functions to convert between a list of vdevs and an nvlist representing the
31 * configuration.  Each entry in the list can be one of:
32 *
33 * 	Device vdevs
34 * 		disk=(path=..., devid=...)
35 * 		file=(path=...)
36 *
37 * 	Group vdevs
 * 		raidz[1|2|3]=(...)
 * 		draid[1|2|3]=(...)
39 * 		mirror=(...)
40 *
41 * 	Hot spares
42 *
43 * While the underlying implementation supports it, group vdevs cannot contain
44 * other group vdevs.  All userland verification of devices is contained within
45 * this file.  If successful, the nvlist returned can be passed directly to the
46 * kernel; we've done as much verification as possible in userland.
47 *
48 * Hot spares are a special case, and passed down as an array of disk vdevs, at
49 * the same level as the root of the vdev tree.
50 *
 * The primary function exported by this file is 'make_root_vdev'.  The
52 * function performs several passes:
53 *
54 * 	1. Construct the vdev specification.  Performs syntax validation and
55 *         makes sure each device is valid.
 * 	2. Check for devices in use.  Uses libblkid to make sure that no
 *         devices are already in use.  Some errors can be overridden using
 *         the 'force' flag, others cannot.
 * 	3. Check for replication errors if the 'force' flag is not specified.
 *         Validates that the replication level is consistent across the
 *         entire pool.
62 * 	4. Call libzfs to label any whole disks with an EFI label.
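 *
 * For example (device names are purely illustrative), a specification such as
 * 'mirror sda sdb log sdc spare sdd' is converted into a root vdev with one
 * mirror top-level vdev, one log device, and one hot spare, and the resulting
 * nvlist is handed to the kernel as a single unit.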
63 */
64
65#include <assert.h>
66#include <ctype.h>
67#include <errno.h>
68#include <fcntl.h>
69#include <libintl.h>
70#include <libnvpair.h>
71#include <libzutil.h>
72#include <limits.h>
73#include <sys/spa.h>
74#include <stdio.h>
75#include <string.h>
76#include <unistd.h>
77#include "zpool_util.h"
78#include <sys/zfs_context.h>
79#include <sys/stat.h>
80
81/*
82 * For any given vdev specification, we can have multiple errors.  The
83 * vdev_error() function keeps track of whether we have seen an error yet, and
 * prints out a header if it's the first error we've seen.
85 */
86boolean_t error_seen;
87boolean_t is_force;
88
89/*PRINTFLIKE1*/
90void
91vdev_error(const char *fmt, ...)
92{
93	va_list ap;
94
95	if (!error_seen) {
96		(void) fprintf(stderr, gettext("invalid vdev specification\n"));
97		if (!is_force)
98			(void) fprintf(stderr, gettext("use '-f' to override "
99			    "the following errors:\n"));
100		else
101			(void) fprintf(stderr, gettext("the following errors "
102			    "must be manually repaired:\n"));
103		error_seen = B_TRUE;
104	}
105
106	va_start(ap, fmt);
107	(void) vfprintf(stderr, fmt, ap);
108	va_end(ap);
109}
110
111/*
112 * Check that a file is valid.  All we can do in this case is check that it's
 * not in use by another pool.
114 */
115int
116check_file(const char *file, boolean_t force, boolean_t isspare)
117{
118	char  *name;
119	int fd;
120	int ret = 0;
121	pool_state_t state;
122	boolean_t inuse;
123
124	if ((fd = open(file, O_RDONLY)) < 0)
125		return (0);
126
127	if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) == 0 && inuse) {
128		const char *desc;
129
130		switch (state) {
131		case POOL_STATE_ACTIVE:
132			desc = gettext("active");
133			break;
134
135		case POOL_STATE_EXPORTED:
136			desc = gettext("exported");
137			break;
138
139		case POOL_STATE_POTENTIALLY_ACTIVE:
140			desc = gettext("potentially active");
141			break;
142
143		default:
144			desc = gettext("unknown");
145			break;
146		}
147
148		/*
149		 * Allow hot spares to be shared between pools.
150		 */
151		if (state == POOL_STATE_SPARE && isspare) {
152			free(name);
153			(void) close(fd);
154			return (0);
155		}
156
157		if (state == POOL_STATE_ACTIVE ||
158		    state == POOL_STATE_SPARE || !force) {
159			switch (state) {
160			case POOL_STATE_SPARE:
161				vdev_error(gettext("%s is reserved as a hot "
162				    "spare for pool %s\n"), file, name);
163				break;
164			default:
165				vdev_error(gettext("%s is part of %s pool "
166				    "'%s'\n"), file, desc, name);
167				break;
168			}
169			ret = -1;
170		}
171
172		free(name);
173	}
174
175	(void) close(fd);
176	return (ret);
177}
178
179/*
180 * This may be a shorthand device path or it could be total gibberish.
181 * Check to see if it is a known device available in zfs_vdev_paths.
182 * As part of this check, see if we've been given an entire disk
183 * (minus the slice number).
184 */
185static int
186is_shorthand_path(const char *arg, char *path, size_t path_size,
187    struct stat64 *statbuf, boolean_t *wholedisk)
188{
189	int error;
190
191	error = zfs_resolve_shortname(arg, path, path_size);
192	if (error == 0) {
193		*wholedisk = zfs_dev_is_whole_disk(path);
194		if (*wholedisk || (stat64(path, statbuf) == 0))
195			return (0);
196	}
197
198	strlcpy(path, arg, path_size);
199	memset(statbuf, 0, sizeof (*statbuf));
200	*wholedisk = B_FALSE;
201
202	return (error);
203}
204
205/*
206 * Determine if the given path is a hot spare within the given configuration.
207 * If no configuration is given we rely solely on the label.
208 */
209static boolean_t
210is_spare(nvlist_t *config, const char *path)
211{
212	int fd;
213	pool_state_t state;
214	char *name = NULL;
215	nvlist_t *label;
216	uint64_t guid, spareguid;
217	nvlist_t *nvroot;
218	nvlist_t **spares;
219	uint_t i, nspares;
220	boolean_t inuse;
221
222	if (zpool_is_draid_spare(path))
223		return (B_TRUE);
224
225	if ((fd = open(path, O_RDONLY|O_DIRECT)) < 0)
226		return (B_FALSE);
227
228	if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) != 0 ||
229	    !inuse ||
230	    state != POOL_STATE_SPARE ||
231	    zpool_read_label(fd, &label, NULL) != 0) {
232		free(name);
233		(void) close(fd);
234		return (B_FALSE);
235	}
236	free(name);
237	(void) close(fd);
238
239	if (config == NULL) {
240		nvlist_free(label);
241		return (B_TRUE);
242	}
243
244	verify(nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) == 0);
245	nvlist_free(label);
246
247	verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
248	    &nvroot) == 0);
249	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
250	    &spares, &nspares) == 0) {
251		for (i = 0; i < nspares; i++) {
252			verify(nvlist_lookup_uint64(spares[i],
253			    ZPOOL_CONFIG_GUID, &spareguid) == 0);
254			if (spareguid == guid)
255				return (B_TRUE);
256		}
257	}
258
259	return (B_FALSE);
260}
261
262/*
263 * Create a leaf vdev.  Determine if this is a file or a device.  If it's a
264 * device, fill in the device id to make a complete nvlist.  Valid forms for a
265 * leaf vdev are:
266 *
267 *	/dev/xxx	Complete disk path
268 *	/xxx		Full path to file
269 *	xxx		Shorthand for <zfs_vdev_paths>/xxx
270 *	draid*		Virtual dRAID spare
271 */
272static nvlist_t *
273make_leaf_vdev(nvlist_t *props, const char *arg, boolean_t is_primary)
274{
275	char path[MAXPATHLEN];
276	struct stat64 statbuf;
277	nvlist_t *vdev = NULL;
278	char *type = NULL;
279	boolean_t wholedisk = B_FALSE;
280	uint64_t ashift = 0;
281	int err;
282
283	/*
284	 * Determine what type of vdev this is, and put the full path into
	 * 'path'.  We detect whether this is a device or file afterwards by
286	 * checking the st_mode of the file.
287	 */
288	if (arg[0] == '/') {
289		/*
290		 * Complete device or file path.  Exact type is determined by
291		 * examining the file descriptor afterwards.  Symbolic links
292		 * are resolved to their real paths to determine whole disk
293		 * and S_ISBLK/S_ISREG type checks.  However, we are careful
294		 * to store the given path as ZPOOL_CONFIG_PATH to ensure we
295		 * can leverage udev's persistent device labels.
296		 */
297		if (realpath(arg, path) == NULL) {
298			(void) fprintf(stderr,
299			    gettext("cannot resolve path '%s'\n"), arg);
300			return (NULL);
301		}
302
303		wholedisk = zfs_dev_is_whole_disk(path);
304		if (!wholedisk && (stat64(path, &statbuf) != 0)) {
305			(void) fprintf(stderr,
306			    gettext("cannot open '%s': %s\n"),
307			    path, strerror(errno));
308			return (NULL);
309		}
310
311		/* After whole disk check restore original passed path */
312		strlcpy(path, arg, sizeof (path));
313	} else if (zpool_is_draid_spare(arg)) {
314		if (!is_primary) {
315			(void) fprintf(stderr,
316			    gettext("cannot open '%s': dRAID spares can only "
317			    "be used to replace primary vdevs\n"), arg);
318			return (NULL);
319		}
320
321		wholedisk = B_TRUE;
322		strlcpy(path, arg, sizeof (path));
323		type = VDEV_TYPE_DRAID_SPARE;
324	} else {
325		err = is_shorthand_path(arg, path, sizeof (path),
326		    &statbuf, &wholedisk);
327		if (err != 0) {
328			/*
329			 * If we got ENOENT, then the user gave us
330			 * gibberish, so try to direct them with a
331			 * reasonable error message.  Otherwise,
332			 * regurgitate strerror() since it's the best we
333			 * can do.
334			 */
335			if (err == ENOENT) {
336				(void) fprintf(stderr,
337				    gettext("cannot open '%s': no such "
338				    "device in %s\n"), arg, DISK_ROOT);
339				(void) fprintf(stderr,
340				    gettext("must be a full path or "
341				    "shorthand device name\n"));
342				return (NULL);
343			} else {
344				(void) fprintf(stderr,
345				    gettext("cannot open '%s': %s\n"),
346				    path, strerror(errno));
347				return (NULL);
348			}
349		}
350	}
351
352	if (type == NULL) {
353		/*
354		 * Determine whether this is a device or a file.
355		 */
356		if (wholedisk || S_ISBLK(statbuf.st_mode)) {
357			type = VDEV_TYPE_DISK;
358		} else if (S_ISREG(statbuf.st_mode)) {
359			type = VDEV_TYPE_FILE;
360		} else {
361			fprintf(stderr, gettext("cannot use '%s': must "
362			    "be a block device or regular file\n"), path);
363			return (NULL);
364		}
365	}
366
367	/*
368	 * Finally, we have the complete device or file, and we know that it is
369	 * acceptable to use.  Construct the nvlist to describe this vdev.  All
370	 * vdevs have a 'path' element, and devices also have a 'devid' element.
371	 */
372	verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0);
373	verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0);
374	verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0);
375
376	if (strcmp(type, VDEV_TYPE_DISK) == 0)
377		verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK,
378		    (uint64_t)wholedisk) == 0);
379
380	/*
381	 * Override defaults if custom properties are provided.
382	 */
383	if (props != NULL) {
384		char *value = NULL;
385
386		if (nvlist_lookup_string(props,
387		    zpool_prop_to_name(ZPOOL_PROP_ASHIFT), &value) == 0) {
388			if (zfs_nicestrtonum(NULL, value, &ashift) != 0) {
389				(void) fprintf(stderr,
390				    gettext("ashift must be a number.\n"));
391				return (NULL);
392			}
393			if (ashift != 0 &&
394			    (ashift < ASHIFT_MIN || ashift > ASHIFT_MAX)) {
395				(void) fprintf(stderr,
396				    gettext("invalid 'ashift=%" PRIu64 "' "
397				    "property: only values between %" PRId32 " "
398				    "and %" PRId32 " are allowed.\n"),
399				    ashift, ASHIFT_MIN, ASHIFT_MAX);
400				return (NULL);
401			}
402		}
403	}
404
405	/*
406	 * If the device is known to incorrectly report its physical sector
	 * size, explicitly provide the known correct value.
408	 */
409	if (ashift == 0) {
410		int sector_size;
411
412		if (check_sector_size_database(path, &sector_size) == B_TRUE)
413			ashift = highbit64(sector_size) - 1;
414	}
415
416	if (ashift > 0)
417		(void) nvlist_add_uint64(vdev, ZPOOL_CONFIG_ASHIFT, ashift);
418
419	return (vdev);
420}
421
422/*
423 * Go through and verify the replication level of the pool is consistent.
424 * Performs the following checks:
425 *
426 * 	For the new spec, verifies that devices in mirrors and raidz are the
427 * 	same size.
428 *
429 * 	If the current configuration already has inconsistent replication
430 * 	levels, ignore any other potential problems in the new spec.
431 *
432 * 	Otherwise, make sure that the current spec (if there is one) and the new
433 * 	spec have consistent replication levels.
434 *
435 *	If there is no current spec (create), make sure new spec has at least
436 *	one general purpose vdev.
437 */
438typedef struct replication_level {
439	char *zprl_type;
440	uint64_t zprl_children;
441	uint64_t zprl_parity;
442} replication_level_t;
443
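/*
 * Maximum amount by which the sizes of devices within a single mirror or
 * raidz/draid vdev may differ before get_replication() reports them as
 * mismatched (see the "different sizes" check below).
 */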
444#define	ZPOOL_FUZZ	(16 * 1024 * 1024)
445
446/*
447 * N.B. For the purposes of comparing replication levels dRAID can be
448 * considered functionally equivalent to raidz.
449 */
450static boolean_t
451is_raidz_mirror(replication_level_t *a, replication_level_t *b,
452    replication_level_t **raidz, replication_level_t **mirror)
453{
454	if ((strcmp(a->zprl_type, "raidz") == 0 ||
455	    strcmp(a->zprl_type, "draid") == 0) &&
456	    strcmp(b->zprl_type, "mirror") == 0) {
457		*raidz = a;
458		*mirror = b;
459		return (B_TRUE);
460	}
461	return (B_FALSE);
462}
463
464/*
 * Comparison for determining if dRAID and raidz were passed in either order.
466 */
467static boolean_t
468is_raidz_draid(replication_level_t *a, replication_level_t *b)
469{
470	if ((strcmp(a->zprl_type, "raidz") == 0 ||
471	    strcmp(a->zprl_type, "draid") == 0) &&
472	    (strcmp(b->zprl_type, "raidz") == 0 ||
473	    strcmp(b->zprl_type, "draid") == 0)) {
474		return (B_TRUE);
475	}
476
477	return (B_FALSE);
478}
479
480/*
481 * Given a list of toplevel vdevs, return the current replication level.  If
482 * the config is inconsistent, then NULL is returned.  If 'fatal' is set, then
483 * an error message will be displayed for each self-inconsistent vdev.
484 */
485static replication_level_t *
486get_replication(nvlist_t *nvroot, boolean_t fatal)
487{
488	nvlist_t **top;
489	uint_t t, toplevels;
490	nvlist_t **child;
491	uint_t c, children;
492	nvlist_t *nv;
493	char *type;
494	replication_level_t lastrep = {0};
495	replication_level_t rep;
496	replication_level_t *ret;
497	replication_level_t *raidz, *mirror;
498	boolean_t dontreport;
499
500	ret = safe_malloc(sizeof (replication_level_t));
501
502	verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
503	    &top, &toplevels) == 0);
504
505	for (t = 0; t < toplevels; t++) {
506		uint64_t is_log = B_FALSE;
507
508		nv = top[t];
509
510		/*
511		 * For separate logs we ignore the top level vdev replication
512		 * constraints.
513		 */
514		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &is_log);
515		if (is_log)
516			continue;
517
518		/* Ignore holes introduced by removing aux devices */
519		verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
520		if (strcmp(type, VDEV_TYPE_HOLE) == 0)
521			continue;
522
523		if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
524		    &child, &children) != 0) {
525			/*
526			 * This is a 'file' or 'disk' vdev.
527			 */
528			rep.zprl_type = type;
529			rep.zprl_children = 1;
530			rep.zprl_parity = 0;
531		} else {
532			int64_t vdev_size;
533
534			/*
535			 * This is a mirror or RAID-Z vdev.  Go through and make
536			 * sure the contents are all the same (files vs. disks),
537			 * keeping track of the number of elements in the
538			 * process.
539			 *
540			 * We also check that the size of each vdev (if it can
541			 * be determined) is the same.
542			 */
543			rep.zprl_type = type;
544			rep.zprl_children = 0;
545
546			if (strcmp(type, VDEV_TYPE_RAIDZ) == 0 ||
547			    strcmp(type, VDEV_TYPE_DRAID) == 0) {
548				verify(nvlist_lookup_uint64(nv,
549				    ZPOOL_CONFIG_NPARITY,
550				    &rep.zprl_parity) == 0);
551				assert(rep.zprl_parity != 0);
552			} else {
553				rep.zprl_parity = 0;
554			}
555
556			/*
557			 * The 'dontreport' variable indicates that we've
558			 * already reported an error for this spec, so don't
559			 * bother doing it again.
560			 */
561			type = NULL;
562			dontreport = 0;
563			vdev_size = -1LL;
564			for (c = 0; c < children; c++) {
565				nvlist_t *cnv = child[c];
566				char *path;
567				struct stat64 statbuf;
568				int64_t size = -1LL;
569				char *childtype;
570				int fd, err;
571
572				rep.zprl_children++;
573
574				verify(nvlist_lookup_string(cnv,
575				    ZPOOL_CONFIG_TYPE, &childtype) == 0);
576
577				/*
578				 * If this is a replacing or spare vdev, then
579				 * get the real first child of the vdev: do this
580				 * in a loop because replacing and spare vdevs
581				 * can be nested.
582				 */
583				while (strcmp(childtype,
584				    VDEV_TYPE_REPLACING) == 0 ||
585				    strcmp(childtype, VDEV_TYPE_SPARE) == 0) {
586					nvlist_t **rchild;
587					uint_t rchildren;
588
589					verify(nvlist_lookup_nvlist_array(cnv,
590					    ZPOOL_CONFIG_CHILDREN, &rchild,
591					    &rchildren) == 0);
592					assert(rchildren == 2);
593					cnv = rchild[0];
594
595					verify(nvlist_lookup_string(cnv,
596					    ZPOOL_CONFIG_TYPE,
597					    &childtype) == 0);
598				}
599
600				verify(nvlist_lookup_string(cnv,
601				    ZPOOL_CONFIG_PATH, &path) == 0);
602
603				/*
604				 * If we have a raidz/mirror that combines disks
605				 * with files, report it as an error.
606				 */
607				if (!dontreport && type != NULL &&
608				    strcmp(type, childtype) != 0) {
609					if (ret != NULL)
610						free(ret);
611					ret = NULL;
612					if (fatal)
613						vdev_error(gettext(
614						    "mismatched replication "
615						    "level: %s contains both "
616						    "files and devices\n"),
617						    rep.zprl_type);
618					else
619						return (NULL);
620					dontreport = B_TRUE;
621				}
622
623				/*
624				 * According to stat(2), the value of 'st_size'
625				 * is undefined for block devices and character
626				 * devices.  But there is no effective way to
627				 * determine the real size in userland.
628				 *
629				 * Instead, we'll take advantage of an
630				 * implementation detail of spec_size().  If the
631				 * device is currently open, then we (should)
632				 * return a valid size.
633				 *
634				 * If we still don't get a valid size (indicated
635				 * by a size of 0 or MAXOFFSET_T), then ignore
636				 * this device altogether.
637				 */
638				if ((fd = open(path, O_RDONLY)) >= 0) {
639					err = fstat64_blk(fd, &statbuf);
640					(void) close(fd);
641				} else {
642					err = stat64(path, &statbuf);
643				}
644
645				if (err != 0 ||
646				    statbuf.st_size == 0 ||
647				    statbuf.st_size == MAXOFFSET_T)
648					continue;
649
650				size = statbuf.st_size;
651
652				/*
653				 * Also make sure that devices and
654				 * slices have a consistent size.  If
655				 * they differ by a significant amount
656				 * (~16MB) then report an error.
657				 */
658				if (!dontreport &&
659				    (vdev_size != -1LL &&
660				    (llabs(size - vdev_size) >
661				    ZPOOL_FUZZ))) {
662					if (ret != NULL)
663						free(ret);
664					ret = NULL;
665					if (fatal)
666						vdev_error(gettext(
667						    "%s contains devices of "
668						    "different sizes\n"),
669						    rep.zprl_type);
670					else
671						return (NULL);
672					dontreport = B_TRUE;
673				}
674
675				type = childtype;
676				vdev_size = size;
677			}
678		}
679
680		/*
681		 * At this point, we have the replication of the last toplevel
682		 * vdev in 'rep'.  Compare it to 'lastrep' to see if it is
683		 * different.
684		 */
685		if (lastrep.zprl_type != NULL) {
686			if (is_raidz_mirror(&lastrep, &rep, &raidz, &mirror) ||
687			    is_raidz_mirror(&rep, &lastrep, &raidz, &mirror)) {
688				/*
				 * Accept raidz and mirror when they can
690				 * handle the same number of disk failures.
691				 */
692				if (raidz->zprl_parity !=
693				    mirror->zprl_children - 1) {
694					if (ret != NULL)
695						free(ret);
696					ret = NULL;
697					if (fatal)
698						vdev_error(gettext(
699						    "mismatched replication "
700						    "level: "
701						    "%s and %s vdevs with "
702						    "different redundancy, "
703						    "%llu vs. %llu (%llu-way) "
704						    "are present\n"),
705						    raidz->zprl_type,
706						    mirror->zprl_type,
707						    raidz->zprl_parity,
708						    mirror->zprl_children - 1,
709						    mirror->zprl_children);
710					else
711						return (NULL);
712				}
713			} else if (is_raidz_draid(&lastrep, &rep)) {
714				/*
				 * Accept raidz and draid when they can
716				 * handle the same number of disk failures.
717				 */
718				if (lastrep.zprl_parity != rep.zprl_parity) {
719					if (ret != NULL)
720						free(ret);
721					ret = NULL;
722					if (fatal)
723						vdev_error(gettext(
724						    "mismatched replication "
725						    "level: %s and %s vdevs "
726						    "with different "
727						    "redundancy, %llu vs. "
728						    "%llu are present\n"),
729						    lastrep.zprl_type,
730						    rep.zprl_type,
731						    lastrep.zprl_parity,
732						    rep.zprl_parity);
733					else
734						return (NULL);
735				}
736			} else if (strcmp(lastrep.zprl_type, rep.zprl_type) !=
737			    0) {
738				if (ret != NULL)
739					free(ret);
740				ret = NULL;
741				if (fatal)
742					vdev_error(gettext(
743					    "mismatched replication level: "
744					    "both %s and %s vdevs are "
745					    "present\n"),
746					    lastrep.zprl_type, rep.zprl_type);
747				else
748					return (NULL);
749			} else if (lastrep.zprl_parity != rep.zprl_parity) {
750				if (ret)
751					free(ret);
752				ret = NULL;
753				if (fatal)
754					vdev_error(gettext(
755					    "mismatched replication level: "
756					    "both %llu and %llu device parity "
757					    "%s vdevs are present\n"),
758					    lastrep.zprl_parity,
759					    rep.zprl_parity,
760					    rep.zprl_type);
761				else
762					return (NULL);
763			} else if (lastrep.zprl_children != rep.zprl_children) {
764				if (ret)
765					free(ret);
766				ret = NULL;
767				if (fatal)
768					vdev_error(gettext(
769					    "mismatched replication level: "
770					    "both %llu-way and %llu-way %s "
771					    "vdevs are present\n"),
772					    lastrep.zprl_children,
773					    rep.zprl_children,
774					    rep.zprl_type);
775				else
776					return (NULL);
777			}
778		}
779		lastrep = rep;
780	}
781
782	if (ret != NULL)
783		*ret = rep;
784
785	return (ret);
786}
787
788/*
789 * Check the replication level of the vdev spec against the current pool.  Calls
 * get_replication() to make sure the new spec is self-consistent.  If the
 * current pool configuration is already inconsistent, we ignore any
 * differences and return success.  Otherwise, report any mismatch between
 * the two.
793 */
794static int
795check_replication(nvlist_t *config, nvlist_t *newroot)
796{
797	nvlist_t **child;
798	uint_t	children;
799	replication_level_t *current = NULL, *new;
800	replication_level_t *raidz, *mirror;
801	int ret;
802
803	/*
804	 * If we have a current pool configuration, check to see if it's
805	 * self-consistent.  If not, simply return success.
806	 */
807	if (config != NULL) {
808		nvlist_t *nvroot;
809
810		verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
811		    &nvroot) == 0);
812		if ((current = get_replication(nvroot, B_FALSE)) == NULL)
813			return (0);
814	}
815	/*
816	 * for spares there may be no children, and therefore no
817	 * replication level to check
818	 */
819	if ((nvlist_lookup_nvlist_array(newroot, ZPOOL_CONFIG_CHILDREN,
820	    &child, &children) != 0) || (children == 0)) {
821		free(current);
822		return (0);
823	}
824
825	/*
826	 * If all we have is logs then there's no replication level to check.
827	 */
828	if (num_logs(newroot) == children) {
829		free(current);
830		return (0);
831	}
832
833	/*
834	 * Get the replication level of the new vdev spec, reporting any
835	 * inconsistencies found.
836	 */
837	if ((new = get_replication(newroot, B_TRUE)) == NULL) {
838		free(current);
839		return (-1);
840	}
841
842	/*
843	 * Check to see if the new vdev spec matches the replication level of
844	 * the current pool.
845	 */
846	ret = 0;
847	if (current != NULL) {
848		if (is_raidz_mirror(current, new, &raidz, &mirror) ||
849		    is_raidz_mirror(new, current, &raidz, &mirror)) {
850			if (raidz->zprl_parity != mirror->zprl_children - 1) {
851				vdev_error(gettext(
852				    "mismatched replication level: pool and "
853				    "new vdev with different redundancy, %s "
854				    "and %s vdevs, %llu vs. %llu (%llu-way)\n"),
855				    raidz->zprl_type,
856				    mirror->zprl_type,
857				    raidz->zprl_parity,
858				    mirror->zprl_children - 1,
859				    mirror->zprl_children);
860				ret = -1;
861			}
862		} else if (strcmp(current->zprl_type, new->zprl_type) != 0) {
863			vdev_error(gettext(
864			    "mismatched replication level: pool uses %s "
865			    "and new vdev is %s\n"),
866			    current->zprl_type, new->zprl_type);
867			ret = -1;
868		} else if (current->zprl_parity != new->zprl_parity) {
869			vdev_error(gettext(
870			    "mismatched replication level: pool uses %llu "
871			    "device parity and new vdev uses %llu\n"),
872			    current->zprl_parity, new->zprl_parity);
873			ret = -1;
874		} else if (current->zprl_children != new->zprl_children) {
875			vdev_error(gettext(
876			    "mismatched replication level: pool uses %llu-way "
877			    "%s and new vdev uses %llu-way %s\n"),
878			    current->zprl_children, current->zprl_type,
879			    new->zprl_children, new->zprl_type);
880			ret = -1;
881		}
882	}
883
884	free(new);
885	if (current != NULL)
886		free(current);
887
888	return (ret);
889}
890
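/*
 * Zero out the first 4k of a device.  This clears any stale filesystem
 * magic values so that libblkid does not later misidentify the partition.
 */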
891static int
892zero_label(char *path)
893{
894	const int size = 4096;
895	char buf[size];
896	int err, fd;
897
898	if ((fd = open(path, O_WRONLY|O_EXCL)) < 0) {
899		(void) fprintf(stderr, gettext("cannot open '%s': %s\n"),
900		    path, strerror(errno));
901		return (-1);
902	}
903
904	memset(buf, 0, size);
905	err = write(fd, buf, size);
906	(void) fdatasync(fd);
907	(void) close(fd);
908
909	if (err == -1) {
910		(void) fprintf(stderr, gettext("cannot zero first %d bytes "
911		    "of '%s': %s\n"), size, path, strerror(errno));
912		return (-1);
913	}
914
915	if (err != size) {
916		(void) fprintf(stderr, gettext("could only zero %d/%d bytes "
917		    "of '%s'\n"), err, size, path);
918		return (-1);
919	}
920
921	return (0);
922}
923
924/*
 * Go through and find any whole disks in the vdev specification, labeling them
 * as appropriate.  When constructing the vdev spec, we were unable to open this
 * device in order to provide a devid.  Now that we have labeled the disk and
 * know that slice 0 is valid, we can construct the devid.
 *
 * If the disk was already labeled with an EFI label, we will have gotten the
 * devid already (because we were able to open the whole disk).  Otherwise, we
 * need to get the devid after we label the disk.
933 */
934static int
935make_disks(zpool_handle_t *zhp, nvlist_t *nv)
936{
937	nvlist_t **child;
938	uint_t c, children;
939	char *type, *path;
940	char devpath[MAXPATHLEN];
941	char udevpath[MAXPATHLEN];
942	uint64_t wholedisk;
943	struct stat64 statbuf;
944	int is_exclusive = 0;
945	int fd;
946	int ret;
947
948	verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
949
950	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
951	    &child, &children) != 0) {
952
953		if (strcmp(type, VDEV_TYPE_DISK) != 0)
954			return (0);
955
956		/*
		 * We have a disk device.  If this is a whole disk, write
		 * out the EFI partition table; otherwise write zeros to
		 * the first 4k of the partition.  This is to ensure that
960		 * libblkid will not misidentify the partition due to a
961		 * magic value left by the previous filesystem.
962		 */
963		verify(!nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path));
964		verify(!nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
965		    &wholedisk));
966
967		if (!wholedisk) {
968			/*
969			 * Update device id string for mpath nodes (Linux only)
970			 */
971			if (is_mpath_whole_disk(path))
972				update_vdev_config_dev_strs(nv);
973
974			if (!is_spare(NULL, path))
975				(void) zero_label(path);
976			return (0);
977		}
978
979		if (realpath(path, devpath) == NULL) {
980			ret = errno;
981			(void) fprintf(stderr,
982			    gettext("cannot resolve path '%s'\n"), path);
983			return (ret);
984		}
985
986		/*
987		 * Remove any previously existing symlink from a udev path to
988		 * the device before labeling the disk.  This ensures that
989		 * only newly created links are used.  Otherwise there is a
990		 * window between when udev deletes and recreates the link
991		 * during which access attempts will fail with ENOENT.
992		 */
993		strlcpy(udevpath, path, MAXPATHLEN);
994		(void) zfs_append_partition(udevpath, MAXPATHLEN);
995
996		fd = open(devpath, O_RDWR|O_EXCL);
997		if (fd == -1) {
998			if (errno == EBUSY)
999				is_exclusive = 1;
1000#ifdef __FreeBSD__
1001			if (errno == EPERM)
1002				is_exclusive = 1;
1003#endif
1004		} else {
1005			(void) close(fd);
1006		}
1007
1008		/*
1009		 * If the partition exists, contains a valid spare label,
1010		 * and is opened exclusively there is no need to partition
1011		 * it.  Hot spares have already been partitioned and are
1012		 * held open exclusively by the kernel as a safety measure.
1013		 *
		 * If the provided path is for a /dev/disk/ device, its
		 * symbolic link will be removed and the partition table
		 * created; we then block until udev creates the new link.
1017		 */
1018		if (!is_exclusive && !is_spare(NULL, udevpath)) {
1019			char *devnode = strrchr(devpath, '/') + 1;
1020
1021			ret = strncmp(udevpath, UDISK_ROOT, strlen(UDISK_ROOT));
1022			if (ret == 0) {
1023				ret = lstat64(udevpath, &statbuf);
1024				if (ret == 0 && S_ISLNK(statbuf.st_mode))
1025					(void) unlink(udevpath);
1026			}
1027
1028			/*
1029			 * When labeling a pool the raw device node name
1030			 * is provided as it appears under /dev/.
1031			 */
1032			if (zpool_label_disk(g_zfs, zhp, devnode) == -1)
1033				return (-1);
1034
1035			/*
1036			 * Wait for udev to signal the device is available
1037			 * by the provided path.
1038			 */
1039			ret = zpool_label_disk_wait(udevpath, DISK_LABEL_WAIT);
1040			if (ret) {
1041				(void) fprintf(stderr,
1042				    gettext("missing link: %s was "
1043				    "partitioned but %s is missing\n"),
1044				    devnode, udevpath);
1045				return (ret);
1046			}
1047
1048			ret = zero_label(udevpath);
1049			if (ret)
1050				return (ret);
1051		}
1052
1053		/*
1054		 * Update the path to refer to the partition.  The presence of
1055		 * the 'whole_disk' field indicates to the CLI that we should
1056		 * chop off the partition number when displaying the device in
1057		 * future output.
1058		 */
1059		verify(nvlist_add_string(nv, ZPOOL_CONFIG_PATH, udevpath) == 0);
1060
1061		/*
1062		 * Update device id strings for whole disks (Linux only)
1063		 */
1064		update_vdev_config_dev_strs(nv);
1065
1066		return (0);
1067	}
1068
1069	for (c = 0; c < children; c++)
1070		if ((ret = make_disks(zhp, child[c])) != 0)
1071			return (ret);
1072
1073	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
1074	    &child, &children) == 0)
1075		for (c = 0; c < children; c++)
1076			if ((ret = make_disks(zhp, child[c])) != 0)
1077				return (ret);
1078
1079	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
1080	    &child, &children) == 0)
1081		for (c = 0; c < children; c++)
1082			if ((ret = make_disks(zhp, child[c])) != 0)
1083				return (ret);
1084
1085	return (0);
1086}
1087
1088/*
 * Go through and find any devices that are in use.  We rely on libblkid for
 * the majority of this task.
1091 */
1092static boolean_t
1093is_device_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force,
1094    boolean_t replacing, boolean_t isspare)
1095{
1096	nvlist_t **child;
1097	uint_t c, children;
1098	char *type, *path;
1099	int ret = 0;
1100	char buf[MAXPATHLEN];
1101	uint64_t wholedisk = B_FALSE;
1102	boolean_t anyinuse = B_FALSE;
1103
1104	verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
1105
1106	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
1107	    &child, &children) != 0) {
1108
1109		verify(!nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path));
1110		if (strcmp(type, VDEV_TYPE_DISK) == 0)
1111			verify(!nvlist_lookup_uint64(nv,
1112			    ZPOOL_CONFIG_WHOLE_DISK, &wholedisk));
1113
1114		/*
1115		 * As a generic check, we look to see if this is a replace of a
1116		 * hot spare within the same pool.  If so, we allow it
1117		 * regardless of what libblkid or zpool_in_use() says.
1118		 */
1119		if (replacing) {
1120			(void) strlcpy(buf, path, sizeof (buf));
1121			if (wholedisk) {
1122				ret = zfs_append_partition(buf,  sizeof (buf));
1123				if (ret == -1)
1124					return (-1);
1125			}
1126
1127			if (is_spare(config, buf))
1128				return (B_FALSE);
1129		}
1130
1131		if (strcmp(type, VDEV_TYPE_DISK) == 0)
1132			ret = check_device(path, force, isspare, wholedisk);
1133
1134		else if (strcmp(type, VDEV_TYPE_FILE) == 0)
1135			ret = check_file(path, force, isspare);
1136
1137		return (ret != 0);
1138	}
1139
1140	for (c = 0; c < children; c++)
1141		if (is_device_in_use(config, child[c], force, replacing,
1142		    B_FALSE))
1143			anyinuse = B_TRUE;
1144
1145	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
1146	    &child, &children) == 0)
1147		for (c = 0; c < children; c++)
1148			if (is_device_in_use(config, child[c], force, replacing,
1149			    B_TRUE))
1150				anyinuse = B_TRUE;
1151
1152	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
1153	    &child, &children) == 0)
1154		for (c = 0; c < children; c++)
1155			if (is_device_in_use(config, child[c], force, replacing,
1156			    B_FALSE))
1157				anyinuse = B_TRUE;
1158
1159	return (anyinuse);
1160}
1161
1162/*
1163 * Returns the parity level extracted from a raidz or draid type.
 * If the parity cannot be determined, zero is returned.
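 *
 * For example, "raidz" and "draid" both default to single parity (1),
 * "raidz2" yields 2, and "draid3:8d:2s" yields 3.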
1165 */
1166static int
1167get_parity(const char *type)
1168{
1169	long parity = 0;
1170	const char *p;
1171
1172	if (strncmp(type, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0) {
1173		p = type + strlen(VDEV_TYPE_RAIDZ);
1174
1175		if (*p == '\0') {
1176			/* when unspecified default to single parity */
1177			return (1);
1178		} else if (*p == '0') {
1179			/* no zero prefixes allowed */
1180			return (0);
1181		} else {
			/* 1-3, no suffixes allowed */
1183			char *end;
1184			errno = 0;
1185			parity = strtol(p, &end, 10);
1186			if (errno != 0 || *end != '\0' ||
1187			    parity < 1 || parity > VDEV_RAIDZ_MAXPARITY) {
1188				return (0);
1189			}
1190		}
1191	} else if (strncmp(type, VDEV_TYPE_DRAID,
1192	    strlen(VDEV_TYPE_DRAID)) == 0) {
1193		p = type + strlen(VDEV_TYPE_DRAID);
1194
1195		if (*p == '\0' || *p == ':') {
1196			/* when unspecified default to single parity */
1197			return (1);
1198		} else if (*p == '0') {
1199			/* no zero prefixes allowed */
1200			return (0);
1201		} else {
			/* 1-3, allowed suffixes: '\0' or ':' */
1203			char *end;
1204			errno = 0;
1205			parity = strtol(p, &end, 10);
1206			if (errno != 0 ||
1207			    parity < 1 || parity > VDEV_DRAID_MAXPARITY ||
1208			    (*end != '\0' && *end != ':')) {
1209				return (0);
1210			}
1211		}
1212	}
1213
1214	return ((int)parity);
1215}
1216
1217/*
1218 * Assign the minimum and maximum number of devices allowed for
1219 * the specified type.  On error NULL is returned, otherwise the
1220 * type prefix is returned (raidz, mirror, etc).
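 *
 * For example, "mirror" returns VDEV_TYPE_MIRROR with *mindev = 2, and
 * "raidz2" returns VDEV_TYPE_RAIDZ with *mindev = 3 (parity + 1).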
1221 */
1222static const char *
1223is_grouping(const char *type, int *mindev, int *maxdev)
1224{
1225	int nparity;
1226
1227	if (strncmp(type, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0 ||
1228	    strncmp(type, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) == 0) {
1229		nparity = get_parity(type);
1230		if (nparity == 0)
1231			return (NULL);
1232		if (mindev != NULL)
1233			*mindev = nparity + 1;
1234		if (maxdev != NULL)
1235			*maxdev = 255;
1236
1237		if (strncmp(type, VDEV_TYPE_RAIDZ,
1238		    strlen(VDEV_TYPE_RAIDZ)) == 0) {
1239			return (VDEV_TYPE_RAIDZ);
1240		} else {
1241			return (VDEV_TYPE_DRAID);
1242		}
1243	}
1244
1245	if (maxdev != NULL)
1246		*maxdev = INT_MAX;
1247
1248	if (strcmp(type, "mirror") == 0) {
1249		if (mindev != NULL)
1250			*mindev = 2;
1251		return (VDEV_TYPE_MIRROR);
1252	}
1253
1254	if (strcmp(type, "spare") == 0) {
1255		if (mindev != NULL)
1256			*mindev = 1;
1257		return (VDEV_TYPE_SPARE);
1258	}
1259
1260	if (strcmp(type, "log") == 0) {
1261		if (mindev != NULL)
1262			*mindev = 1;
1263		return (VDEV_TYPE_LOG);
1264	}
1265
1266	if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0 ||
1267	    strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) {
1268		if (mindev != NULL)
1269			*mindev = 1;
1270		return (type);
1271	}
1272
1273	if (strcmp(type, "cache") == 0) {
1274		if (mindev != NULL)
1275			*mindev = 1;
1276		return (VDEV_TYPE_L2CACHE);
1277	}
1278
1279	return (NULL);
1280}
1281
1282/*
1283 * Extract the configuration parameters encoded in the dRAID type and
1284 * use them to generate a dRAID configuration.  The expected format is:
1285 *
1286 * draid[<parity>][:<data><d|D>][:<children><c|C>][:<spares><s|S>]
1287 *
1288 * The intent is to be able to generate a good configuration when no
1289 * additional information is provided.  The only mandatory component
1290 * of the 'type' is the 'draid' prefix.  If a value is not provided
1291 * then reasonable defaults are used.  The optional components may
1292 * appear in any order but the d/s/c suffix is required.
1293 *
1294 * Valid inputs:
1295 * - data:     number of data devices per group (1-255)
1296 * - parity:   number of parity blocks per group (1-3)
 * - spares:   number of distributed spares (0-100)
1298 * - children: total number of devices (1-255)
1299 *
1300 * Examples:
1301 * - zpool create tank draid <devices...>
1302 * - zpool create tank draid2:8d:51c:2s <devices...>
1303 */
1304static int
1305draid_config_by_type(nvlist_t *nv, const char *type, uint64_t children)
1306{
1307	uint64_t nparity = 1;
1308	uint64_t nspares = 0;
1309	uint64_t ndata = UINT64_MAX;
1310	uint64_t ngroups = 1;
1311	long value;
1312
1313	if (strncmp(type, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) != 0)
1314		return (EINVAL);
1315
1316	nparity = (uint64_t)get_parity(type);
1317	if (nparity == 0)
1318		return (EINVAL);
1319
1320	char *p = (char *)type;
1321	while ((p = strchr(p, ':')) != NULL) {
1322		char *end;
1323
1324		p = p + 1;
1325		errno = 0;
1326
1327		if (!isdigit(p[0])) {
1328			(void) fprintf(stderr, gettext("invalid dRAID "
1329			    "syntax; expected [:<number><c|d|s>] not '%s'\n"),
1330			    type);
1331			return (EINVAL);
1332		}
1333
1334		/* Expected non-zero value with c/d/s suffix */
1335		value = strtol(p, &end, 10);
1336		char suffix = tolower(*end);
1337		if (errno != 0 ||
1338		    (suffix != 'c' && suffix != 'd' && suffix != 's')) {
1339			(void) fprintf(stderr, gettext("invalid dRAID "
1340			    "syntax; expected [:<number><c|d|s>] not '%s'\n"),
1341			    type);
1342			return (EINVAL);
1343		}
1344
1345		if (suffix == 'c') {
1346			if ((uint64_t)value != children) {
1347				fprintf(stderr,
1348				    gettext("invalid number of dRAID children; "
1349				    "%llu required but %llu provided\n"),
1350				    (u_longlong_t)value,
1351				    (u_longlong_t)children);
1352				return (EINVAL);
1353			}
1354		} else if (suffix == 'd') {
1355			ndata = (uint64_t)value;
1356		} else if (suffix == 's') {
1357			nspares = (uint64_t)value;
1358		} else {
1359			verify(0); /* Unreachable */
1360		}
1361	}
1362
1363	/*
	 * When a specific number of data disks is not provided, limit a
1365	 * redundancy group to 8 data disks.  This value was selected to
1366	 * provide a reasonable tradeoff between capacity and performance.
1367	 */
1368	if (ndata == UINT64_MAX) {
1369		if (children > nspares + nparity) {
1370			ndata = MIN(children - nspares - nparity, 8);
1371		} else {
1372			fprintf(stderr, gettext("request number of "
1373			    "distributed spares %llu and parity level %llu\n"
1374			    "leaves no disks available for data\n"),
1375			    (u_longlong_t)nspares, (u_longlong_t)nparity);
1376			return (EINVAL);
1377		}
1378	}
1379
1380	/* Verify the maximum allowed group size is never exceeded. */
1381	if (ndata == 0 || (ndata + nparity > children - nspares)) {
1382		fprintf(stderr, gettext("requested number of dRAID data "
1383		    "disks per group %llu is too high,\nat most %llu disks "
1384		    "are available for data\n"), (u_longlong_t)ndata,
1385		    (u_longlong_t)(children - nspares - nparity));
1386		return (EINVAL);
1387	}
1388
1389	if (nparity == 0 || nparity > VDEV_DRAID_MAXPARITY) {
1390		fprintf(stderr,
1391		    gettext("invalid dRAID parity level %llu; must be "
1392		    "between 1 and %d\n"), (u_longlong_t)nparity,
1393		    VDEV_DRAID_MAXPARITY);
1394		return (EINVAL);
1395	}
1396
1397	/*
1398	 * Verify the requested number of spares can be satisfied.
1399	 * An arbitrary limit of 100 distributed spares is applied.
1400	 */
1401	if (nspares > 100 || nspares > (children - (ndata + nparity))) {
1402		fprintf(stderr,
1403		    gettext("invalid number of dRAID spares %llu; additional "
1404		    "disks would be required\n"), (u_longlong_t)nspares);
1405		return (EINVAL);
1406	}
1407
	/* Verify the requested number of children is sufficient. */
	if (children < (ndata + nparity + nspares)) {
		fprintf(stderr, gettext("%llu disks were provided, but at "
		    "least %llu disks are required for this config\n"),
		    (u_longlong_t)children,
		    (u_longlong_t)(ndata + nparity + nspares));
		return (EINVAL);
	}

	if (children > VDEV_DRAID_MAX_CHILDREN) {
		fprintf(stderr, gettext("%llu disks were provided, but "
		    "dRAID only supports up to %u disks\n"),
		    (u_longlong_t)children, VDEV_DRAID_MAX_CHILDREN);
		return (EINVAL);
	}
1421
1422	/*
1423	 * Calculate the minimum number of groups required to fill a slice.
1424	 * This is the LCM of the stripe width (ndata + nparity) and the
1425	 * number of data drives (children - nspares).
1426	 */
1427	while (ngroups * (ndata + nparity) % (children - nspares) != 0)
1428		ngroups++;
1429
1430	/* Store the basic dRAID configuration. */
1431	fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, nparity);
1432	fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA, ndata);
1433	fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES, nspares);
1434	fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NGROUPS, ngroups);
1435
1436	return (0);
1437}
1438
1439/*
1440 * Construct a syntactically valid vdev specification,
1441 * and ensure that all devices and files exist and can be opened.
1442 * Note: we don't bother freeing anything in the error paths
1443 * because the program is just going to exit anyway.
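 *
 * For example (device names are illustrative), the arguments
 * "raidz2 sda sdb sdc sdd cache sde" produce a root nvlist with a single
 * raidz2 top-level vdev and one l2cache device.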
1444 */
1445static nvlist_t *
1446construct_spec(nvlist_t *props, int argc, char **argv)
1447{
1448	nvlist_t *nvroot, *nv, **top, **spares, **l2cache;
1449	int t, toplevels, mindev, maxdev, nspares, nlogs, nl2cache;
1450	const char *type, *fulltype;
1451	boolean_t is_log, is_special, is_dedup, is_spare;
1452	boolean_t seen_logs;
1453
1454	top = NULL;
1455	toplevels = 0;
1456	spares = NULL;
1457	l2cache = NULL;
1458	nspares = 0;
1459	nlogs = 0;
1460	nl2cache = 0;
1461	is_log = is_special = is_dedup = is_spare = B_FALSE;
1462	seen_logs = B_FALSE;
1463	nvroot = NULL;
1464
1465	while (argc > 0) {
1466		fulltype = argv[0];
1467		nv = NULL;
1468
1469		/*
		 * If it's a mirror, raidz, or draid, the subsequent arguments
1471		 * are its leaves -- until we encounter the next mirror,
1472		 * raidz or draid.
1473		 */
1474		if ((type = is_grouping(fulltype, &mindev, &maxdev)) != NULL) {
1475			nvlist_t **child = NULL;
1476			int c, children = 0;
1477
1478			if (strcmp(type, VDEV_TYPE_SPARE) == 0) {
1479				if (spares != NULL) {
1480					(void) fprintf(stderr,
1481					    gettext("invalid vdev "
1482					    "specification: 'spare' can be "
1483					    "specified only once\n"));
1484					goto spec_out;
1485				}
1486				is_spare = B_TRUE;
1487				is_log = is_special = is_dedup = B_FALSE;
1488			}
1489
1490			if (strcmp(type, VDEV_TYPE_LOG) == 0) {
1491				if (seen_logs) {
1492					(void) fprintf(stderr,
1493					    gettext("invalid vdev "
1494					    "specification: 'log' can be "
1495					    "specified only once\n"));
1496					goto spec_out;
1497				}
1498				seen_logs = B_TRUE;
1499				is_log = B_TRUE;
1500				is_special = is_dedup = is_spare = B_FALSE;
1501				argc--;
1502				argv++;
1503				/*
1504				 * A log is not a real grouping device.
1505				 * We just set is_log and continue.
1506				 */
1507				continue;
1508			}
1509
1510			if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0) {
1511				is_special = B_TRUE;
1512				is_log = is_dedup = is_spare = B_FALSE;
1513				argc--;
1514				argv++;
1515				continue;
1516			}
1517
1518			if (strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) {
1519				is_dedup = B_TRUE;
1520				is_log = is_special = is_spare = B_FALSE;
1521				argc--;
1522				argv++;
1523				continue;
1524			}
1525
1526			if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) {
1527				if (l2cache != NULL) {
1528					(void) fprintf(stderr,
1529					    gettext("invalid vdev "
1530					    "specification: 'cache' can be "
1531					    "specified only once\n"));
1532					goto spec_out;
1533				}
1534				is_log = is_special = B_FALSE;
1535				is_dedup = is_spare = B_FALSE;
1536			}
1537
1538			if (is_log || is_special || is_dedup) {
1539				if (strcmp(type, VDEV_TYPE_MIRROR) != 0) {
1540					(void) fprintf(stderr,
1541					    gettext("invalid vdev "
1542					    "specification: unsupported '%s' "
1543					    "device: %s\n"), is_log ? "log" :
1544					    "special", type);
1545					goto spec_out;
1546				}
1547				nlogs++;
1548			}
1549
1550			for (c = 1; c < argc; c++) {
1551				if (is_grouping(argv[c], NULL, NULL) != NULL)
1552					break;
1553
1554				children++;
1555				child = realloc(child,
1556				    children * sizeof (nvlist_t *));
1557				if (child == NULL)
1558					zpool_no_memory();
1559				if ((nv = make_leaf_vdev(props, argv[c],
1560				    !(is_log || is_special || is_dedup ||
1561				    is_spare))) == NULL) {
1562					for (c = 0; c < children - 1; c++)
1563						nvlist_free(child[c]);
1564					free(child);
1565					goto spec_out;
1566				}
1567
1568				child[children - 1] = nv;
1569			}
1570
1571			if (children < mindev) {
1572				(void) fprintf(stderr, gettext("invalid vdev "
1573				    "specification: %s requires at least %d "
1574				    "devices\n"), argv[0], mindev);
1575				for (c = 0; c < children; c++)
1576					nvlist_free(child[c]);
1577				free(child);
1578				goto spec_out;
1579			}
1580
1581			if (children > maxdev) {
1582				(void) fprintf(stderr, gettext("invalid vdev "
1583				    "specification: %s supports no more than "
1584				    "%d devices\n"), argv[0], maxdev);
1585				for (c = 0; c < children; c++)
1586					nvlist_free(child[c]);
1587				free(child);
1588				goto spec_out;
1589			}
1590
1591			argc -= c;
1592			argv += c;
1593
1594			if (strcmp(type, VDEV_TYPE_SPARE) == 0) {
1595				spares = child;
1596				nspares = children;
1597				continue;
1598			} else if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) {
1599				l2cache = child;
1600				nl2cache = children;
1601				continue;
1602			} else {
1603				/* create a top-level vdev with children */
1604				verify(nvlist_alloc(&nv, NV_UNIQUE_NAME,
1605				    0) == 0);
1606				verify(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE,
1607				    type) == 0);
1608				verify(nvlist_add_uint64(nv,
1609				    ZPOOL_CONFIG_IS_LOG, is_log) == 0);
1610				if (is_log) {
1611					verify(nvlist_add_string(nv,
1612					    ZPOOL_CONFIG_ALLOCATION_BIAS,
1613					    VDEV_ALLOC_BIAS_LOG) == 0);
1614				}
1615				if (is_special) {
1616					verify(nvlist_add_string(nv,
1617					    ZPOOL_CONFIG_ALLOCATION_BIAS,
1618					    VDEV_ALLOC_BIAS_SPECIAL) == 0);
1619				}
1620				if (is_dedup) {
1621					verify(nvlist_add_string(nv,
1622					    ZPOOL_CONFIG_ALLOCATION_BIAS,
1623					    VDEV_ALLOC_BIAS_DEDUP) == 0);
1624				}
1625				if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) {
1626					verify(nvlist_add_uint64(nv,
1627					    ZPOOL_CONFIG_NPARITY,
1628					    mindev - 1) == 0);
1629				}
1630				if (strcmp(type, VDEV_TYPE_DRAID) == 0) {
1631					if (draid_config_by_type(nv,
1632					    fulltype, children) != 0) {
1633						for (c = 0; c < children; c++)
1634							nvlist_free(child[c]);
1635						free(child);
1636						goto spec_out;
1637					}
1638				}
1639				verify(nvlist_add_nvlist_array(nv,
1640				    ZPOOL_CONFIG_CHILDREN, child,
1641				    children) == 0);
1642
1643				for (c = 0; c < children; c++)
1644					nvlist_free(child[c]);
1645				free(child);
1646			}
1647		} else {
1648			/*
1649			 * We have a device.  Pass off to make_leaf_vdev() to
1650			 * construct the appropriate nvlist describing the vdev.
1651			 */
1652			if ((nv = make_leaf_vdev(props, argv[0], !(is_log ||
1653			    is_special || is_dedup || is_spare))) == NULL)
1654				goto spec_out;
1655
1656			verify(nvlist_add_uint64(nv,
1657			    ZPOOL_CONFIG_IS_LOG, is_log) == 0);
1658			if (is_log) {
1659				verify(nvlist_add_string(nv,
1660				    ZPOOL_CONFIG_ALLOCATION_BIAS,
1661				    VDEV_ALLOC_BIAS_LOG) == 0);
1662				nlogs++;
1663			}
1664
1665			if (is_special) {
1666				verify(nvlist_add_string(nv,
1667				    ZPOOL_CONFIG_ALLOCATION_BIAS,
1668				    VDEV_ALLOC_BIAS_SPECIAL) == 0);
1669			}
1670			if (is_dedup) {
1671				verify(nvlist_add_string(nv,
1672				    ZPOOL_CONFIG_ALLOCATION_BIAS,
1673				    VDEV_ALLOC_BIAS_DEDUP) == 0);
1674			}
1675			argc--;
1676			argv++;
1677		}
1678
1679		toplevels++;
1680		top = realloc(top, toplevels * sizeof (nvlist_t *));
1681		if (top == NULL)
1682			zpool_no_memory();
1683		top[toplevels - 1] = nv;
1684	}
1685
1686	if (toplevels == 0 && nspares == 0 && nl2cache == 0) {
1687		(void) fprintf(stderr, gettext("invalid vdev "
1688		    "specification: at least one toplevel vdev must be "
1689		    "specified\n"));
1690		goto spec_out;
1691	}
1692
1693	if (seen_logs && nlogs == 0) {
1694		(void) fprintf(stderr, gettext("invalid vdev specification: "
1695		    "log requires at least 1 device\n"));
1696		goto spec_out;
1697	}
1698
1699	/*
1700	 * Finally, create nvroot and add all top-level vdevs to it.
1701	 */
1702	verify(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) == 0);
1703	verify(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
1704	    VDEV_TYPE_ROOT) == 0);
1705	verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
1706	    top, toplevels) == 0);
1707	if (nspares != 0)
1708		verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
1709		    spares, nspares) == 0);
1710	if (nl2cache != 0)
1711		verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
1712		    l2cache, nl2cache) == 0);
1713
1714spec_out:
1715	for (t = 0; t < toplevels; t++)
1716		nvlist_free(top[t]);
1717	for (t = 0; t < nspares; t++)
1718		nvlist_free(spares[t]);
1719	for (t = 0; t < nl2cache; t++)
1720		nvlist_free(l2cache[t]);
1721
1722	free(spares);
1723	free(l2cache);
1724	free(top);
1725
1726	return (nvroot);
1727}
1728
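/*
 * Build the vdev tree for the pool created by a split.  Any devices supplied
 * on the command line are validated with construct_spec() and, unless this is
 * a dry run, labeled via make_disks() before the request is handed to
 * zpool_vdev_split().
 */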
1729nvlist_t *
1730split_mirror_vdev(zpool_handle_t *zhp, char *newname, nvlist_t *props,
1731    splitflags_t flags, int argc, char **argv)
1732{
1733	nvlist_t *newroot = NULL, **child;
1734	uint_t c, children;
1735
1736	if (argc > 0) {
1737		if ((newroot = construct_spec(props, argc, argv)) == NULL) {
1738			(void) fprintf(stderr, gettext("Unable to build a "
1739			    "pool from the specified devices\n"));
1740			return (NULL);
1741		}
1742
1743		if (!flags.dryrun && make_disks(zhp, newroot) != 0) {
1744			nvlist_free(newroot);
1745			return (NULL);
1746		}
1747
1748		/* avoid any tricks in the spec */
1749		verify(nvlist_lookup_nvlist_array(newroot,
1750		    ZPOOL_CONFIG_CHILDREN, &child, &children) == 0);
1751		for (c = 0; c < children; c++) {
1752			char *path;
1753			const char *type;
1754			int min, max;
1755
1756			verify(nvlist_lookup_string(child[c],
1757			    ZPOOL_CONFIG_PATH, &path) == 0);
1758			if ((type = is_grouping(path, &min, &max)) != NULL) {
1759				(void) fprintf(stderr, gettext("Cannot use "
1760				    "'%s' as a device for splitting\n"), type);
1761				nvlist_free(newroot);
1762				return (NULL);
1763			}
1764		}
1765	}
1766
1767	if (zpool_vdev_split(zhp, newname, &newroot, props, flags) != 0) {
1768		nvlist_free(newroot);
1769		return (NULL);
1770	}
1771
1772	return (newroot);
1773}
1774
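/*
 * Count the top-level vdevs in the spec that provide normal (general purpose)
 * storage, i.e. excluding log devices and any vdev with an allocation bias
 * such as special or dedup.
 */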
1775static int
1776num_normal_vdevs(nvlist_t *nvroot)
1777{
1778	nvlist_t **top;
1779	uint_t t, toplevels, normal = 0;
1780
1781	verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
1782	    &top, &toplevels) == 0);
1783
1784	for (t = 0; t < toplevels; t++) {
1785		uint64_t log = B_FALSE;
1786
1787		(void) nvlist_lookup_uint64(top[t], ZPOOL_CONFIG_IS_LOG, &log);
1788		if (log)
1789			continue;
1790		if (nvlist_exists(top[t], ZPOOL_CONFIG_ALLOCATION_BIAS))
1791			continue;
1792
1793		normal++;
1794	}
1795
1796	return (normal);
1797}
1798
1799/*
1800 * Get and validate the contents of the given vdev specification.  This ensures
1801 * that the nvlist returned is well-formed, that all the devices exist, and that
1802 * they are not currently in use by any other known consumer.  The 'poolconfig'
 * parameter is the current configuration of the pool when adding devices to an
 * existing pool, and is used to perform additional checks, such as verifying
 * that the replication level of the pool is not changed.  It can be 'NULL' to
 * indicate that this is a new pool.  The 'force' flag controls whether devices
 * should be forcefully added, even if they appear in use.
1808 */
1809nvlist_t *
1810make_root_vdev(zpool_handle_t *zhp, nvlist_t *props, int force, int check_rep,
1811    boolean_t replacing, boolean_t dryrun, int argc, char **argv)
1812{
1813	nvlist_t *newroot;
1814	nvlist_t *poolconfig = NULL;
1815	is_force = force;
1816
1817	/*
1818	 * Construct the vdev specification.  If this is successful, we know
1819	 * that we have a valid specification, and that all devices can be
1820	 * opened.
1821	 */
1822	if ((newroot = construct_spec(props, argc, argv)) == NULL)
1823		return (NULL);
1824
1825	if (zhp && ((poolconfig = zpool_get_config(zhp, NULL)) == NULL)) {
1826		nvlist_free(newroot);
1827		return (NULL);
1828	}
1829
1830	/*
1831	 * Validate each device to make sure that it's not shared with another
1832	 * subsystem.  We do this even if 'force' is set, because there are some
1833	 * uses (such as a dedicated dump device) that even '-f' cannot
1834	 * override.
1835	 */
1836	if (is_device_in_use(poolconfig, newroot, force, replacing, B_FALSE)) {
1837		nvlist_free(newroot);
1838		return (NULL);
1839	}
1840
1841	/*
1842	 * Check the replication level of the given vdevs and report any errors
1843	 * found.  We include the existing pool spec, if any, as we need to
1844	 * catch changes against the existing replication level.
1845	 */
1846	if (check_rep && check_replication(poolconfig, newroot) != 0) {
1847		nvlist_free(newroot);
1848		return (NULL);
1849	}
1850
1851	/*
	 * On pool create the new vdev spec must have at least one normal vdev.
1853	 */
1854	if (poolconfig == NULL && num_normal_vdevs(newroot) == 0) {
1855		vdev_error(gettext("at least one general top-level vdev must "
1856		    "be specified\n"));
1857		nvlist_free(newroot);
1858		return (NULL);
1859	}
1860
1861	/*
1862	 * Run through the vdev specification and label any whole disks found.
1863	 */
1864	if (!dryrun && make_disks(zhp, newroot) != 0) {
1865		nvlist_free(newroot);
1866		return (NULL);
1867	}
1868
1869	return (newroot);
1870}
1871