1168404Spjd/* 2168404Spjd * CDDL HEADER START 3168404Spjd * 4168404Spjd * The contents of this file are subject to the terms of the 5168404Spjd * Common Development and Distribution License (the "License"). 6168404Spjd * You may not use this file except in compliance with the License. 7168404Spjd * 8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9168404Spjd * or http://www.opensolaris.org/os/licensing. 10168404Spjd * See the License for the specific language governing permissions 11168404Spjd * and limitations under the License. 12168404Spjd * 13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each 14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15168404Spjd * If applicable, add the following below this CDDL HEADER, with the 16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying 17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner] 18168404Spjd * 19168404Spjd * CDDL HEADER END 20168404Spjd */ 21168404Spjd 22168404Spjd/* 23219089Spjd * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24307121Smav * Copyright (c) 2013, 2015 by Delphix. All rights reserved. 25297119Smav * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>. 26168404Spjd */ 27168404Spjd 28168404Spjd/* 29168404Spjd * Functions to convert between a list of vdevs and an nvlist representing the 30168404Spjd * configuration. Each entry in the list can be one of: 31168404Spjd * 32168404Spjd * Device vdevs 33168404Spjd * disk=(path=..., devid=...) 34168404Spjd * file=(path=...) 35168404Spjd * 36168404Spjd * Group vdevs 37168404Spjd * raidz[1|2]=(...) 38168404Spjd * mirror=(...) 39168404Spjd * 40168404Spjd * Hot spares 41168404Spjd * 42168404Spjd * While the underlying implementation supports it, group vdevs cannot contain 43168404Spjd * other group vdevs. All userland verification of devices is contained within 44168404Spjd * this file. If successful, the nvlist returned can be passed directly to the 45168404Spjd * kernel; we've done as much verification as possible in userland. 46168404Spjd * 47168404Spjd * Hot spares are a special case, and passed down as an array of disk vdevs, at 48168404Spjd * the same level as the root of the vdev tree. 49168404Spjd * 50185029Spjd * The only function exported by this file is 'make_root_vdev'. The 51185029Spjd * function performs several passes: 52168404Spjd * 53168404Spjd * 1. Construct the vdev specification. Performs syntax validation and 54168404Spjd * makes sure each device is valid. 55168404Spjd * 2. Check for devices in use. Using libdiskmgt, makes sure that no 56168404Spjd * devices are also in use. Some can be overridden using the 'force' 57168404Spjd * flag, others cannot. 58168404Spjd * 3. Check for replication errors if the 'force' flag is not specified. 59168404Spjd * validates that the replication level is consistent across the 60168404Spjd * entire pool. 61185029Spjd * 4. Call libzfs to label any whole disks with an EFI label. 62168404Spjd */ 63168404Spjd 64168404Spjd#include <assert.h> 65168404Spjd#include <devid.h> 66168404Spjd#include <errno.h> 67168404Spjd#include <fcntl.h> 68168404Spjd#include <libintl.h> 69168404Spjd#include <libnvpair.h> 70219089Spjd#include <limits.h> 71168404Spjd#include <stdio.h> 72168404Spjd#include <string.h> 73168404Spjd#include <unistd.h> 74168404Spjd#include <paths.h> 75168404Spjd#include <sys/stat.h> 76168404Spjd#include <sys/disk.h> 77168404Spjd#include <sys/mntent.h> 78168404Spjd#include <libgeom.h> 79168404Spjd 80168404Spjd#include "zpool_util.h" 81168404Spjd 82219089Spjd#define BACKUP_SLICE "s2" 83219089Spjd 84168404Spjd/* 85168404Spjd * For any given vdev specification, we can have multiple errors. The 86168404Spjd * vdev_error() function keeps track of whether we have seen an error yet, and 87168404Spjd * prints out a header if its the first error we've seen. 88168404Spjd */ 89168404Spjdboolean_t error_seen; 90168404Spjdboolean_t is_force; 91168404Spjd 92168404Spjd/*PRINTFLIKE1*/ 93168404Spjdstatic void 94168404Spjdvdev_error(const char *fmt, ...) 95168404Spjd{ 96168404Spjd va_list ap; 97168404Spjd 98168404Spjd if (!error_seen) { 99168404Spjd (void) fprintf(stderr, gettext("invalid vdev specification\n")); 100168404Spjd if (!is_force) 101168404Spjd (void) fprintf(stderr, gettext("use '-f' to override " 102168404Spjd "the following errors:\n")); 103168404Spjd else 104168404Spjd (void) fprintf(stderr, gettext("the following errors " 105168404Spjd "must be manually repaired:\n")); 106168404Spjd error_seen = B_TRUE; 107168404Spjd } 108168404Spjd 109168404Spjd va_start(ap, fmt); 110168404Spjd (void) vfprintf(stderr, fmt, ap); 111168404Spjd va_end(ap); 112168404Spjd} 113168404Spjd 114297077Smav#ifdef illumos 115219089Spjdstatic void 116219089Spjdlibdiskmgt_error(int error) 117219089Spjd{ 118219089Spjd /* 119219089Spjd * ENXIO/ENODEV is a valid error message if the device doesn't live in 120219089Spjd * /dev/dsk. Don't bother printing an error message in this case. 121219089Spjd */ 122219089Spjd if (error == ENXIO || error == ENODEV) 123219089Spjd return; 124219089Spjd 125219089Spjd (void) fprintf(stderr, gettext("warning: device in use checking " 126219089Spjd "failed: %s\n"), strerror(error)); 127219089Spjd} 128219089Spjd 129168404Spjd/* 130219089Spjd * Validate a device, passing the bulk of the work off to libdiskmgt. 131219089Spjd */ 132219089Spjdstatic int 133219089Spjdcheck_slice(const char *path, int force, boolean_t wholedisk, boolean_t isspare) 134219089Spjd{ 135219089Spjd char *msg; 136219089Spjd int error = 0; 137219089Spjd dm_who_type_t who; 138219089Spjd 139219089Spjd if (force) 140219089Spjd who = DM_WHO_ZPOOL_FORCE; 141219089Spjd else if (isspare) 142219089Spjd who = DM_WHO_ZPOOL_SPARE; 143219089Spjd else 144219089Spjd who = DM_WHO_ZPOOL; 145219089Spjd 146219089Spjd if (dm_inuse((char *)path, &msg, who, &error) || error) { 147219089Spjd if (error != 0) { 148219089Spjd libdiskmgt_error(error); 149219089Spjd return (0); 150219089Spjd } else { 151219089Spjd vdev_error("%s", msg); 152219089Spjd free(msg); 153219089Spjd return (-1); 154219089Spjd } 155219089Spjd } 156219089Spjd 157219089Spjd /* 158219089Spjd * If we're given a whole disk, ignore overlapping slices since we're 159219089Spjd * about to label it anyway. 160219089Spjd */ 161219089Spjd error = 0; 162219089Spjd if (!wholedisk && !force && 163219089Spjd (dm_isoverlapping((char *)path, &msg, &error) || error)) { 164219089Spjd if (error == 0) { 165219089Spjd /* dm_isoverlapping returned -1 */ 166219089Spjd vdev_error(gettext("%s overlaps with %s\n"), path, msg); 167219089Spjd free(msg); 168219089Spjd return (-1); 169219089Spjd } else if (error != ENODEV) { 170219089Spjd /* libdiskmgt's devcache only handles physical drives */ 171219089Spjd libdiskmgt_error(error); 172219089Spjd return (0); 173219089Spjd } 174219089Spjd } 175219089Spjd 176219089Spjd return (0); 177219089Spjd} 178219089Spjd 179219089Spjd 180219089Spjd/* 181219089Spjd * Validate a whole disk. Iterate over all slices on the disk and make sure 182219089Spjd * that none is in use by calling check_slice(). 183219089Spjd */ 184219089Spjdstatic int 185219089Spjdcheck_disk(const char *name, dm_descriptor_t disk, int force, int isspare) 186219089Spjd{ 187219089Spjd dm_descriptor_t *drive, *media, *slice; 188219089Spjd int err = 0; 189219089Spjd int i; 190219089Spjd int ret; 191219089Spjd 192219089Spjd /* 193219089Spjd * Get the drive associated with this disk. This should never fail, 194219089Spjd * because we already have an alias handle open for the device. 195219089Spjd */ 196219089Spjd if ((drive = dm_get_associated_descriptors(disk, DM_DRIVE, 197219089Spjd &err)) == NULL || *drive == NULL) { 198219089Spjd if (err) 199219089Spjd libdiskmgt_error(err); 200219089Spjd return (0); 201219089Spjd } 202219089Spjd 203219089Spjd if ((media = dm_get_associated_descriptors(*drive, DM_MEDIA, 204219089Spjd &err)) == NULL) { 205219089Spjd dm_free_descriptors(drive); 206219089Spjd if (err) 207219089Spjd libdiskmgt_error(err); 208219089Spjd return (0); 209219089Spjd } 210219089Spjd 211219089Spjd dm_free_descriptors(drive); 212219089Spjd 213219089Spjd /* 214219089Spjd * It is possible that the user has specified a removable media drive, 215219089Spjd * and the media is not present. 216219089Spjd */ 217219089Spjd if (*media == NULL) { 218219089Spjd dm_free_descriptors(media); 219219089Spjd vdev_error(gettext("'%s' has no media in drive\n"), name); 220219089Spjd return (-1); 221219089Spjd } 222219089Spjd 223219089Spjd if ((slice = dm_get_associated_descriptors(*media, DM_SLICE, 224219089Spjd &err)) == NULL) { 225219089Spjd dm_free_descriptors(media); 226219089Spjd if (err) 227219089Spjd libdiskmgt_error(err); 228219089Spjd return (0); 229219089Spjd } 230219089Spjd 231219089Spjd dm_free_descriptors(media); 232219089Spjd 233219089Spjd ret = 0; 234219089Spjd 235219089Spjd /* 236219089Spjd * Iterate over all slices and report any errors. We don't care about 237219089Spjd * overlapping slices because we are using the whole disk. 238219089Spjd */ 239219089Spjd for (i = 0; slice[i] != NULL; i++) { 240219089Spjd char *name = dm_get_name(slice[i], &err); 241219089Spjd 242219089Spjd if (check_slice(name, force, B_TRUE, isspare) != 0) 243219089Spjd ret = -1; 244219089Spjd 245219089Spjd dm_free_name(name); 246219089Spjd } 247219089Spjd 248219089Spjd dm_free_descriptors(slice); 249219089Spjd return (ret); 250219089Spjd} 251219089Spjd 252219089Spjd/* 253219089Spjd * Validate a device. 254219089Spjd */ 255219089Spjdstatic int 256219089Spjdcheck_device(const char *path, boolean_t force, boolean_t isspare) 257219089Spjd{ 258219089Spjd dm_descriptor_t desc; 259219089Spjd int err; 260219089Spjd char *dev; 261219089Spjd 262219089Spjd /* 263219089Spjd * For whole disks, libdiskmgt does not include the leading dev path. 264219089Spjd */ 265219089Spjd dev = strrchr(path, '/'); 266219089Spjd assert(dev != NULL); 267219089Spjd dev++; 268219089Spjd if ((desc = dm_get_descriptor_by_name(DM_ALIAS, dev, &err)) != NULL) { 269219089Spjd err = check_disk(path, desc, force, isspare); 270219089Spjd dm_free_descriptor(desc); 271219089Spjd return (err); 272219089Spjd } 273219089Spjd 274219089Spjd return (check_slice(path, force, B_FALSE, isspare)); 275219089Spjd} 276297077Smav#endif /* illumos */ 277219089Spjd 278219089Spjd/* 279185029Spjd * Check that a file is valid. All we can do in this case is check that it's 280185029Spjd * not in use by another pool, and not in use by swap. 281168404Spjd */ 282168404Spjdstatic int 283185029Spjdcheck_file(const char *file, boolean_t force, boolean_t isspare) 284168404Spjd{ 285185029Spjd char *name; 286185029Spjd int fd; 287185029Spjd int ret = 0; 288185029Spjd int err; 289185029Spjd pool_state_t state; 290185029Spjd boolean_t inuse; 291168404Spjd 292297077Smav#ifdef illumos 293185029Spjd if (dm_inuse_swap(file, &err)) { 294185029Spjd if (err) 295185029Spjd libdiskmgt_error(err); 296185029Spjd else 297185029Spjd vdev_error(gettext("%s is currently used by swap. " 298185029Spjd "Please see swap(1M).\n"), file); 299185029Spjd return (-1); 300185029Spjd } 301185029Spjd#endif 302168404Spjd 303185029Spjd if ((fd = open(file, O_RDONLY)) < 0) 304185029Spjd return (0); 305168404Spjd 306185029Spjd if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) == 0 && inuse) { 307185029Spjd const char *desc; 308168404Spjd 309185029Spjd switch (state) { 310185029Spjd case POOL_STATE_ACTIVE: 311185029Spjd desc = gettext("active"); 312185029Spjd break; 313185029Spjd 314185029Spjd case POOL_STATE_EXPORTED: 315185029Spjd desc = gettext("exported"); 316185029Spjd break; 317185029Spjd 318185029Spjd case POOL_STATE_POTENTIALLY_ACTIVE: 319185029Spjd desc = gettext("potentially active"); 320185029Spjd break; 321185029Spjd 322185029Spjd default: 323185029Spjd desc = gettext("unknown"); 324185029Spjd break; 325185029Spjd } 326185029Spjd 327185029Spjd /* 328185029Spjd * Allow hot spares to be shared between pools. 329185029Spjd */ 330185029Spjd if (state == POOL_STATE_SPARE && isspare) 331185029Spjd return (0); 332185029Spjd 333185029Spjd if (state == POOL_STATE_ACTIVE || 334185029Spjd state == POOL_STATE_SPARE || !force) { 335185029Spjd switch (state) { 336185029Spjd case POOL_STATE_SPARE: 337185029Spjd vdev_error(gettext("%s is reserved as a hot " 338185029Spjd "spare for pool %s\n"), file, name); 339185029Spjd break; 340185029Spjd default: 341185029Spjd vdev_error(gettext("%s is part of %s pool " 342185029Spjd "'%s'\n"), file, desc, name); 343185029Spjd break; 344168404Spjd } 345185029Spjd ret = -1; 346168404Spjd } 347185029Spjd 348185029Spjd free(name); 349168404Spjd } 350168404Spjd 351185029Spjd (void) close(fd); 352185029Spjd return (ret); 353168404Spjd} 354168404Spjd 355185029Spjdstatic int 356219089Spjdcheck_device(const char *name, boolean_t force, boolean_t isspare) 357185029Spjd{ 358185029Spjd char path[MAXPATHLEN]; 359185029Spjd 360185029Spjd if (strncmp(name, _PATH_DEV, sizeof(_PATH_DEV) - 1) != 0) 361185029Spjd snprintf(path, sizeof(path), "%s%s", _PATH_DEV, name); 362185029Spjd else 363185029Spjd strlcpy(path, name, sizeof(path)); 364185029Spjd 365185029Spjd return (check_file(path, force, isspare)); 366185029Spjd} 367185029Spjd 368185029Spjd/* 369185029Spjd * By "whole disk" we mean an entire physical disk (something we can 370185029Spjd * label, toggle the write cache on, etc.) as opposed to the full 371185029Spjd * capacity of a pseudo-device such as lofi or did. We act as if we 372185029Spjd * are labeling the disk, which should be a pretty good test of whether 373185029Spjd * it's a viable device or not. Returns B_TRUE if it is and B_FALSE if 374185029Spjd * it isn't. 375185029Spjd */ 376168404Spjdstatic boolean_t 377219089Spjdis_whole_disk(const char *arg) 378168404Spjd{ 379297077Smav#ifdef illumos 380219089Spjd struct dk_gpt *label; 381219089Spjd int fd; 382219089Spjd char path[MAXPATHLEN]; 383219089Spjd 384219089Spjd (void) snprintf(path, sizeof (path), "%s%s%s", 385299430Smav ZFS_RDISK_ROOT, strrchr(arg, '/'), BACKUP_SLICE); 386219089Spjd if ((fd = open(path, O_RDWR | O_NDELAY)) < 0) 387219089Spjd return (B_FALSE); 388219089Spjd if (efi_alloc_and_init(fd, EFI_NUMPAR, &label) != 0) { 389219089Spjd (void) close(fd); 390219089Spjd return (B_FALSE); 391219089Spjd } 392219089Spjd efi_free(label); 393219089Spjd (void) close(fd); 394219089Spjd return (B_TRUE); 395219089Spjd#else 396168404Spjd int fd; 397168404Spjd 398219089Spjd fd = g_open(arg, 0); 399169303Spjd if (fd >= 0) { 400169303Spjd g_close(fd); 401169303Spjd return (B_TRUE); 402168404Spjd } 403169303Spjd return (B_FALSE); 404219089Spjd#endif 405185029Spjd} 406168404Spjd 407168404Spjd/* 408219089Spjd * Create a leaf vdev. Determine if this is a file or a device. If it's a 409219089Spjd * device, fill in the device id to make a complete nvlist. Valid forms for a 410219089Spjd * leaf vdev are: 411168404Spjd * 412219089Spjd * /dev/dsk/xxx Complete disk path 413219089Spjd * /xxx Full path to file 414219089Spjd * xxx Shorthand for /dev/dsk/xxx 415168404Spjd */ 416185029Spjdstatic nvlist_t * 417185029Spjdmake_leaf_vdev(const char *arg, uint64_t is_log) 418168404Spjd{ 419185029Spjd char path[MAXPATHLEN]; 420169303Spjd struct stat64 statbuf; 421168404Spjd nvlist_t *vdev = NULL; 422168404Spjd char *type = NULL; 423169303Spjd boolean_t wholedisk = B_FALSE; 424168404Spjd 425185029Spjd /* 426185029Spjd * Determine what type of vdev this is, and put the full path into 427185029Spjd * 'path'. We detect whether this is a device of file afterwards by 428185029Spjd * checking the st_mode of the file. 429185029Spjd */ 430185029Spjd if (arg[0] == '/') { 431185029Spjd /* 432185029Spjd * Complete device or file path. Exact type is determined by 433185029Spjd * examining the file descriptor afterwards. 434185029Spjd */ 435185029Spjd wholedisk = is_whole_disk(arg); 436185029Spjd if (!wholedisk && (stat64(arg, &statbuf) != 0)) { 437185029Spjd (void) fprintf(stderr, 438185029Spjd gettext("cannot open '%s': %s\n"), 439185029Spjd arg, strerror(errno)); 440185029Spjd return (NULL); 441185029Spjd } 442168404Spjd 443185029Spjd (void) strlcpy(path, arg, sizeof (path)); 444185029Spjd } else { 445185029Spjd /* 446185029Spjd * This may be a short path for a device, or it could be total 447185029Spjd * gibberish. Check to see if it's a known device in 448185029Spjd * /dev/dsk/. As part of this check, see if we've been given a 449185029Spjd * an entire disk (minus the slice number). 450185029Spjd */ 451185029Spjd if (strncmp(arg, _PATH_DEV, sizeof(_PATH_DEV) - 1) == 0) 452185029Spjd strlcpy(path, arg, sizeof (path)); 453185029Spjd else 454185029Spjd snprintf(path, sizeof (path), "%s%s", _PATH_DEV, arg); 455185029Spjd wholedisk = is_whole_disk(path); 456185029Spjd if (!wholedisk && (stat64(path, &statbuf) != 0)) { 457185029Spjd /* 458185029Spjd * If we got ENOENT, then the user gave us 459185029Spjd * gibberish, so try to direct them with a 460185029Spjd * reasonable error message. Otherwise, 461185029Spjd * regurgitate strerror() since it's the best we 462185029Spjd * can do. 463185029Spjd */ 464185029Spjd if (errno == ENOENT) { 465185029Spjd (void) fprintf(stderr, 466185029Spjd gettext("cannot open '%s': no such " 467185029Spjd "GEOM provider\n"), arg); 468185029Spjd (void) fprintf(stderr, 469185029Spjd gettext("must be a full path or " 470185029Spjd "shorthand device name\n")); 471185029Spjd return (NULL); 472185029Spjd } else { 473185029Spjd (void) fprintf(stderr, 474185029Spjd gettext("cannot open '%s': %s\n"), 475185029Spjd path, strerror(errno)); 476185029Spjd return (NULL); 477185029Spjd } 478185029Spjd } 479185029Spjd } 480185029Spjd 481219089Spjd#ifdef __FreeBSD__ 482219089Spjd if (S_ISCHR(statbuf.st_mode)) { 483219089Spjd statbuf.st_mode &= ~S_IFCHR; 484219089Spjd statbuf.st_mode |= S_IFBLK; 485219089Spjd wholedisk = B_FALSE; 486219089Spjd } 487219089Spjd#endif 488219089Spjd 489185029Spjd /* 490185029Spjd * Determine whether this is a device or a file. 491185029Spjd */ 492219089Spjd if (wholedisk || S_ISBLK(statbuf.st_mode)) { 493168404Spjd type = VDEV_TYPE_DISK; 494185029Spjd } else if (S_ISREG(statbuf.st_mode)) { 495185029Spjd type = VDEV_TYPE_FILE; 496185029Spjd } else { 497168404Spjd (void) fprintf(stderr, gettext("cannot use '%s': must be a " 498185029Spjd "GEOM provider or regular file\n"), path); 499168404Spjd return (NULL); 500168404Spjd } 501168404Spjd 502168404Spjd /* 503168404Spjd * Finally, we have the complete device or file, and we know that it is 504168404Spjd * acceptable to use. Construct the nvlist to describe this vdev. All 505168404Spjd * vdevs have a 'path' element, and devices also have a 'devid' element. 506168404Spjd */ 507168404Spjd verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0); 508168404Spjd verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0); 509168404Spjd verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0); 510185029Spjd verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_LOG, is_log) == 0); 511168404Spjd if (strcmp(type, VDEV_TYPE_DISK) == 0) 512168404Spjd verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, 513219089Spjd (uint64_t)wholedisk) == 0); 514168404Spjd 515266611Smav#ifdef have_devid 516169303Spjd /* 517169303Spjd * For a whole disk, defer getting its devid until after labeling it. 518169303Spjd */ 519219089Spjd if (S_ISBLK(statbuf.st_mode) && !wholedisk) { 520169303Spjd /* 521169303Spjd * Get the devid for the device. 522169303Spjd */ 523169303Spjd int fd; 524169303Spjd ddi_devid_t devid; 525169303Spjd char *minor = NULL, *devid_str = NULL; 526169303Spjd 527169303Spjd if ((fd = open(path, O_RDONLY)) < 0) { 528169303Spjd (void) fprintf(stderr, gettext("cannot open '%s': " 529169303Spjd "%s\n"), path, strerror(errno)); 530169303Spjd nvlist_free(vdev); 531169303Spjd return (NULL); 532169303Spjd } 533169303Spjd 534169303Spjd if (devid_get(fd, &devid) == 0) { 535169303Spjd if (devid_get_minor_name(fd, &minor) == 0 && 536169303Spjd (devid_str = devid_str_encode(devid, minor)) != 537169303Spjd NULL) { 538169303Spjd verify(nvlist_add_string(vdev, 539169303Spjd ZPOOL_CONFIG_DEVID, devid_str) == 0); 540169303Spjd } 541169303Spjd if (devid_str != NULL) 542169303Spjd devid_str_free(devid_str); 543169303Spjd if (minor != NULL) 544169303Spjd devid_str_free(minor); 545169303Spjd devid_free(devid); 546169303Spjd } 547169303Spjd 548169303Spjd (void) close(fd); 549169303Spjd } 550266611Smav#endif 551169303Spjd 552168404Spjd return (vdev); 553168404Spjd} 554168404Spjd 555168404Spjd/* 556168404Spjd * Go through and verify the replication level of the pool is consistent. 557168404Spjd * Performs the following checks: 558168404Spjd * 559168404Spjd * For the new spec, verifies that devices in mirrors and raidz are the 560168404Spjd * same size. 561168404Spjd * 562168404Spjd * If the current configuration already has inconsistent replication 563168404Spjd * levels, ignore any other potential problems in the new spec. 564168404Spjd * 565168404Spjd * Otherwise, make sure that the current spec (if there is one) and the new 566168404Spjd * spec have consistent replication levels. 567168404Spjd */ 568168404Spjdtypedef struct replication_level { 569168404Spjd char *zprl_type; 570168404Spjd uint64_t zprl_children; 571168404Spjd uint64_t zprl_parity; 572168404Spjd} replication_level_t; 573168404Spjd 574185029Spjd#define ZPOOL_FUZZ (16 * 1024 * 1024) 575185029Spjd 576168404Spjd/* 577168404Spjd * Given a list of toplevel vdevs, return the current replication level. If 578168404Spjd * the config is inconsistent, then NULL is returned. If 'fatal' is set, then 579168404Spjd * an error message will be displayed for each self-inconsistent vdev. 580168404Spjd */ 581185029Spjdstatic replication_level_t * 582168404Spjdget_replication(nvlist_t *nvroot, boolean_t fatal) 583168404Spjd{ 584168404Spjd nvlist_t **top; 585168404Spjd uint_t t, toplevels; 586168404Spjd nvlist_t **child; 587168404Spjd uint_t c, children; 588168404Spjd nvlist_t *nv; 589168404Spjd char *type; 590297119Smav replication_level_t lastrep = {0}; 591297119Smav replication_level_t rep; 592297119Smav replication_level_t *ret; 593168404Spjd boolean_t dontreport; 594168404Spjd 595168404Spjd ret = safe_malloc(sizeof (replication_level_t)); 596168404Spjd 597168404Spjd verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 598168404Spjd &top, &toplevels) == 0); 599168404Spjd 600168404Spjd for (t = 0; t < toplevels; t++) { 601185029Spjd uint64_t is_log = B_FALSE; 602185029Spjd 603168404Spjd nv = top[t]; 604168404Spjd 605185029Spjd /* 606185029Spjd * For separate logs we ignore the top level vdev replication 607185029Spjd * constraints. 608185029Spjd */ 609185029Spjd (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &is_log); 610185029Spjd if (is_log) 611185029Spjd continue; 612168404Spjd 613185029Spjd verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, 614185029Spjd &type) == 0); 615168404Spjd if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 616168404Spjd &child, &children) != 0) { 617168404Spjd /* 618168404Spjd * This is a 'file' or 'disk' vdev. 619168404Spjd */ 620168404Spjd rep.zprl_type = type; 621168404Spjd rep.zprl_children = 1; 622168404Spjd rep.zprl_parity = 0; 623168404Spjd } else { 624168404Spjd uint64_t vdev_size; 625168404Spjd 626168404Spjd /* 627168404Spjd * This is a mirror or RAID-Z vdev. Go through and make 628168404Spjd * sure the contents are all the same (files vs. disks), 629168404Spjd * keeping track of the number of elements in the 630168404Spjd * process. 631168404Spjd * 632168404Spjd * We also check that the size of each vdev (if it can 633168404Spjd * be determined) is the same. 634168404Spjd */ 635168404Spjd rep.zprl_type = type; 636168404Spjd rep.zprl_children = 0; 637168404Spjd 638168404Spjd if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { 639168404Spjd verify(nvlist_lookup_uint64(nv, 640168404Spjd ZPOOL_CONFIG_NPARITY, 641168404Spjd &rep.zprl_parity) == 0); 642168404Spjd assert(rep.zprl_parity != 0); 643168404Spjd } else { 644168404Spjd rep.zprl_parity = 0; 645168404Spjd } 646168404Spjd 647168404Spjd /* 648185029Spjd * The 'dontreport' variable indicates that we've 649168404Spjd * already reported an error for this spec, so don't 650168404Spjd * bother doing it again. 651168404Spjd */ 652168404Spjd type = NULL; 653168404Spjd dontreport = 0; 654168404Spjd vdev_size = -1ULL; 655168404Spjd for (c = 0; c < children; c++) { 656168404Spjd nvlist_t *cnv = child[c]; 657168404Spjd char *path; 658168404Spjd struct stat64 statbuf; 659168404Spjd uint64_t size = -1ULL; 660168404Spjd char *childtype; 661168404Spjd int fd, err; 662168404Spjd 663168404Spjd rep.zprl_children++; 664168404Spjd 665168404Spjd verify(nvlist_lookup_string(cnv, 666168404Spjd ZPOOL_CONFIG_TYPE, &childtype) == 0); 667168404Spjd 668168404Spjd /* 669185029Spjd * If this is a replacing or spare vdev, then 670168404Spjd * get the real first child of the vdev. 671168404Spjd */ 672168404Spjd if (strcmp(childtype, 673168404Spjd VDEV_TYPE_REPLACING) == 0 || 674168404Spjd strcmp(childtype, VDEV_TYPE_SPARE) == 0) { 675168404Spjd nvlist_t **rchild; 676168404Spjd uint_t rchildren; 677168404Spjd 678168404Spjd verify(nvlist_lookup_nvlist_array(cnv, 679168404Spjd ZPOOL_CONFIG_CHILDREN, &rchild, 680168404Spjd &rchildren) == 0); 681168404Spjd assert(rchildren == 2); 682168404Spjd cnv = rchild[0]; 683168404Spjd 684168404Spjd verify(nvlist_lookup_string(cnv, 685168404Spjd ZPOOL_CONFIG_TYPE, 686168404Spjd &childtype) == 0); 687330735Sasomers if (strcmp(childtype, 688330735Sasomers VDEV_TYPE_SPARE) == 0) { 689330735Sasomers /* We have a replacing vdev with 690330735Sasomers * a spare child. Get the first 691330735Sasomers * real child of the spare 692330735Sasomers */ 693330735Sasomers verify( 694330735Sasomers nvlist_lookup_nvlist_array( 695330735Sasomers cnv, 696330735Sasomers ZPOOL_CONFIG_CHILDREN, 697330735Sasomers &rchild, 698330735Sasomers &rchildren) == 0); 699330735Sasomers assert(rchildren >= 2); 700330735Sasomers cnv = rchild[0]; 701330735Sasomers } 702168404Spjd } 703168404Spjd 704168404Spjd verify(nvlist_lookup_string(cnv, 705168404Spjd ZPOOL_CONFIG_PATH, &path) == 0); 706168404Spjd 707168404Spjd /* 708168404Spjd * If we have a raidz/mirror that combines disks 709168404Spjd * with files, report it as an error. 710168404Spjd */ 711168404Spjd if (!dontreport && type != NULL && 712168404Spjd strcmp(type, childtype) != 0) { 713168404Spjd if (ret != NULL) 714168404Spjd free(ret); 715168404Spjd ret = NULL; 716168404Spjd if (fatal) 717168404Spjd vdev_error(gettext( 718168404Spjd "mismatched replication " 719168404Spjd "level: %s contains both " 720168404Spjd "files and devices\n"), 721168404Spjd rep.zprl_type); 722168404Spjd else 723168404Spjd return (NULL); 724168404Spjd dontreport = B_TRUE; 725168404Spjd } 726168404Spjd 727168404Spjd /* 728168404Spjd * According to stat(2), the value of 'st_size' 729168404Spjd * is undefined for block devices and character 730168404Spjd * devices. But there is no effective way to 731168404Spjd * determine the real size in userland. 732168404Spjd * 733168404Spjd * Instead, we'll take advantage of an 734168404Spjd * implementation detail of spec_size(). If the 735168404Spjd * device is currently open, then we (should) 736168404Spjd * return a valid size. 737168404Spjd * 738168404Spjd * If we still don't get a valid size (indicated 739168404Spjd * by a size of 0 or MAXOFFSET_T), then ignore 740168404Spjd * this device altogether. 741168404Spjd */ 742168404Spjd if ((fd = open(path, O_RDONLY)) >= 0) { 743168404Spjd err = fstat64(fd, &statbuf); 744168404Spjd (void) close(fd); 745168404Spjd } else { 746168404Spjd err = stat64(path, &statbuf); 747168404Spjd } 748219089Spjd 749219089Spjd if (err != 0 || 750219089Spjd statbuf.st_size == 0 || 751219089Spjd statbuf.st_size == MAXOFFSET_T) 752168404Spjd continue; 753168404Spjd 754168404Spjd size = statbuf.st_size; 755168404Spjd 756168404Spjd /* 757185029Spjd * Also make sure that devices and 758185029Spjd * slices have a consistent size. If 759185029Spjd * they differ by a significant amount 760185029Spjd * (~16MB) then report an error. 761168404Spjd */ 762185029Spjd if (!dontreport && 763185029Spjd (vdev_size != -1ULL && 764185029Spjd (labs(size - vdev_size) > 765185029Spjd ZPOOL_FUZZ))) { 766168404Spjd if (ret != NULL) 767168404Spjd free(ret); 768168404Spjd ret = NULL; 769168404Spjd if (fatal) 770168404Spjd vdev_error(gettext( 771168404Spjd "%s contains devices of " 772168404Spjd "different sizes\n"), 773168404Spjd rep.zprl_type); 774168404Spjd else 775168404Spjd return (NULL); 776168404Spjd dontreport = B_TRUE; 777168404Spjd } 778168404Spjd 779168404Spjd type = childtype; 780168404Spjd vdev_size = size; 781168404Spjd } 782168404Spjd } 783168404Spjd 784168404Spjd /* 785168404Spjd * At this point, we have the replication of the last toplevel 786168404Spjd * vdev in 'rep'. Compare it to 'lastrep' to see if its 787168404Spjd * different. 788168404Spjd */ 789168404Spjd if (lastrep.zprl_type != NULL) { 790168404Spjd if (strcmp(lastrep.zprl_type, rep.zprl_type) != 0) { 791168404Spjd if (ret != NULL) 792168404Spjd free(ret); 793168404Spjd ret = NULL; 794168404Spjd if (fatal) 795168404Spjd vdev_error(gettext( 796168404Spjd "mismatched replication level: " 797168404Spjd "both %s and %s vdevs are " 798168404Spjd "present\n"), 799168404Spjd lastrep.zprl_type, rep.zprl_type); 800168404Spjd else 801168404Spjd return (NULL); 802168404Spjd } else if (lastrep.zprl_parity != rep.zprl_parity) { 803168404Spjd if (ret) 804168404Spjd free(ret); 805168404Spjd ret = NULL; 806168404Spjd if (fatal) 807168404Spjd vdev_error(gettext( 808168404Spjd "mismatched replication level: " 809168404Spjd "both %llu and %llu device parity " 810168404Spjd "%s vdevs are present\n"), 811168404Spjd lastrep.zprl_parity, 812168404Spjd rep.zprl_parity, 813168404Spjd rep.zprl_type); 814168404Spjd else 815168404Spjd return (NULL); 816168404Spjd } else if (lastrep.zprl_children != rep.zprl_children) { 817168404Spjd if (ret) 818168404Spjd free(ret); 819168404Spjd ret = NULL; 820168404Spjd if (fatal) 821168404Spjd vdev_error(gettext( 822168404Spjd "mismatched replication level: " 823168404Spjd "both %llu-way and %llu-way %s " 824168404Spjd "vdevs are present\n"), 825168404Spjd lastrep.zprl_children, 826168404Spjd rep.zprl_children, 827168404Spjd rep.zprl_type); 828168404Spjd else 829168404Spjd return (NULL); 830168404Spjd } 831168404Spjd } 832168404Spjd lastrep = rep; 833168404Spjd } 834168404Spjd 835168404Spjd if (ret != NULL) 836168404Spjd *ret = rep; 837168404Spjd 838168404Spjd return (ret); 839168404Spjd} 840168404Spjd 841168404Spjd/* 842168404Spjd * Check the replication level of the vdev spec against the current pool. Calls 843168404Spjd * get_replication() to make sure the new spec is self-consistent. If the pool 844168404Spjd * has a consistent replication level, then we ignore any errors. Otherwise, 845168404Spjd * report any difference between the two. 846168404Spjd */ 847185029Spjdstatic int 848168404Spjdcheck_replication(nvlist_t *config, nvlist_t *newroot) 849168404Spjd{ 850185029Spjd nvlist_t **child; 851185029Spjd uint_t children; 852168404Spjd replication_level_t *current = NULL, *new; 853168404Spjd int ret; 854168404Spjd 855168404Spjd /* 856168404Spjd * If we have a current pool configuration, check to see if it's 857168404Spjd * self-consistent. If not, simply return success. 858168404Spjd */ 859168404Spjd if (config != NULL) { 860168404Spjd nvlist_t *nvroot; 861168404Spjd 862168404Spjd verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 863168404Spjd &nvroot) == 0); 864168404Spjd if ((current = get_replication(nvroot, B_FALSE)) == NULL) 865168404Spjd return (0); 866168404Spjd } 867185029Spjd /* 868185029Spjd * for spares there may be no children, and therefore no 869185029Spjd * replication level to check 870185029Spjd */ 871185029Spjd if ((nvlist_lookup_nvlist_array(newroot, ZPOOL_CONFIG_CHILDREN, 872185029Spjd &child, &children) != 0) || (children == 0)) { 873185029Spjd free(current); 874185029Spjd return (0); 875185029Spjd } 876168404Spjd 877168404Spjd /* 878185029Spjd * If all we have is logs then there's no replication level to check. 879185029Spjd */ 880185029Spjd if (num_logs(newroot) == children) { 881185029Spjd free(current); 882185029Spjd return (0); 883185029Spjd } 884185029Spjd 885185029Spjd /* 886168404Spjd * Get the replication level of the new vdev spec, reporting any 887168404Spjd * inconsistencies found. 888168404Spjd */ 889168404Spjd if ((new = get_replication(newroot, B_TRUE)) == NULL) { 890168404Spjd free(current); 891168404Spjd return (-1); 892168404Spjd } 893168404Spjd 894168404Spjd /* 895168404Spjd * Check to see if the new vdev spec matches the replication level of 896168404Spjd * the current pool. 897168404Spjd */ 898168404Spjd ret = 0; 899168404Spjd if (current != NULL) { 900168404Spjd if (strcmp(current->zprl_type, new->zprl_type) != 0) { 901168404Spjd vdev_error(gettext( 902168404Spjd "mismatched replication level: pool uses %s " 903168404Spjd "and new vdev is %s\n"), 904168404Spjd current->zprl_type, new->zprl_type); 905168404Spjd ret = -1; 906168404Spjd } else if (current->zprl_parity != new->zprl_parity) { 907168404Spjd vdev_error(gettext( 908168404Spjd "mismatched replication level: pool uses %llu " 909168404Spjd "device parity and new vdev uses %llu\n"), 910168404Spjd current->zprl_parity, new->zprl_parity); 911168404Spjd ret = -1; 912168404Spjd } else if (current->zprl_children != new->zprl_children) { 913168404Spjd vdev_error(gettext( 914168404Spjd "mismatched replication level: pool uses %llu-way " 915168404Spjd "%s and new vdev uses %llu-way %s\n"), 916168404Spjd current->zprl_children, current->zprl_type, 917168404Spjd new->zprl_children, new->zprl_type); 918168404Spjd ret = -1; 919168404Spjd } 920168404Spjd } 921168404Spjd 922168404Spjd free(new); 923168404Spjd if (current != NULL) 924168404Spjd free(current); 925168404Spjd 926168404Spjd return (ret); 927168404Spjd} 928168404Spjd 929297077Smav#ifdef illumos 930168404Spjd/* 931219089Spjd * Go through and find any whole disks in the vdev specification, labelling them 932219089Spjd * as appropriate. When constructing the vdev spec, we were unable to open this 933219089Spjd * device in order to provide a devid. Now that we have labelled the disk and 934219089Spjd * know that slice 0 is valid, we can construct the devid now. 935219089Spjd * 936219089Spjd * If the disk was already labeled with an EFI label, we will have gotten the 937219089Spjd * devid already (because we were able to open the whole disk). Otherwise, we 938219089Spjd * need to get the devid after we label the disk. 939219089Spjd */ 940219089Spjdstatic int 941219089Spjdmake_disks(zpool_handle_t *zhp, nvlist_t *nv) 942219089Spjd{ 943219089Spjd nvlist_t **child; 944219089Spjd uint_t c, children; 945219089Spjd char *type, *path, *diskname; 946219089Spjd char buf[MAXPATHLEN]; 947219089Spjd uint64_t wholedisk; 948219089Spjd int fd; 949219089Spjd int ret; 950219089Spjd ddi_devid_t devid; 951219089Spjd char *minor = NULL, *devid_str = NULL; 952219089Spjd 953219089Spjd verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); 954219089Spjd 955219089Spjd if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 956219089Spjd &child, &children) != 0) { 957219089Spjd 958219089Spjd if (strcmp(type, VDEV_TYPE_DISK) != 0) 959219089Spjd return (0); 960219089Spjd 961219089Spjd /* 962219089Spjd * We have a disk device. Get the path to the device 963219089Spjd * and see if it's a whole disk by appending the backup 964219089Spjd * slice and stat()ing the device. 965219089Spjd */ 966219089Spjd verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0); 967219089Spjd if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, 968219089Spjd &wholedisk) != 0 || !wholedisk) 969219089Spjd return (0); 970219089Spjd 971219089Spjd diskname = strrchr(path, '/'); 972219089Spjd assert(diskname != NULL); 973219089Spjd diskname++; 974219089Spjd if (zpool_label_disk(g_zfs, zhp, diskname) == -1) 975219089Spjd return (-1); 976219089Spjd 977219089Spjd /* 978219089Spjd * Fill in the devid, now that we've labeled the disk. 979219089Spjd */ 980219089Spjd (void) snprintf(buf, sizeof (buf), "%ss0", path); 981219089Spjd if ((fd = open(buf, O_RDONLY)) < 0) { 982219089Spjd (void) fprintf(stderr, 983219089Spjd gettext("cannot open '%s': %s\n"), 984219089Spjd buf, strerror(errno)); 985219089Spjd return (-1); 986219089Spjd } 987219089Spjd 988219089Spjd if (devid_get(fd, &devid) == 0) { 989219089Spjd if (devid_get_minor_name(fd, &minor) == 0 && 990219089Spjd (devid_str = devid_str_encode(devid, minor)) != 991219089Spjd NULL) { 992219089Spjd verify(nvlist_add_string(nv, 993219089Spjd ZPOOL_CONFIG_DEVID, devid_str) == 0); 994219089Spjd } 995219089Spjd if (devid_str != NULL) 996219089Spjd devid_str_free(devid_str); 997219089Spjd if (minor != NULL) 998219089Spjd devid_str_free(minor); 999219089Spjd devid_free(devid); 1000219089Spjd } 1001219089Spjd 1002219089Spjd /* 1003219089Spjd * Update the path to refer to the 's0' slice. The presence of 1004219089Spjd * the 'whole_disk' field indicates to the CLI that we should 1005219089Spjd * chop off the slice number when displaying the device in 1006219089Spjd * future output. 1007219089Spjd */ 1008219089Spjd verify(nvlist_add_string(nv, ZPOOL_CONFIG_PATH, buf) == 0); 1009219089Spjd 1010219089Spjd (void) close(fd); 1011219089Spjd 1012219089Spjd return (0); 1013219089Spjd } 1014219089Spjd 1015219089Spjd for (c = 0; c < children; c++) 1016219089Spjd if ((ret = make_disks(zhp, child[c])) != 0) 1017219089Spjd return (ret); 1018219089Spjd 1019219089Spjd if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, 1020219089Spjd &child, &children) == 0) 1021219089Spjd for (c = 0; c < children; c++) 1022219089Spjd if ((ret = make_disks(zhp, child[c])) != 0) 1023219089Spjd return (ret); 1024219089Spjd 1025219089Spjd if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, 1026219089Spjd &child, &children) == 0) 1027219089Spjd for (c = 0; c < children; c++) 1028219089Spjd if ((ret = make_disks(zhp, child[c])) != 0) 1029219089Spjd return (ret); 1030219089Spjd 1031219089Spjd return (0); 1032219089Spjd} 1033297077Smav#endif /* illumos */ 1034219089Spjd 1035219089Spjd/* 1036168404Spjd * Determine if the given path is a hot spare within the given configuration. 1037168404Spjd */ 1038168404Spjdstatic boolean_t 1039168404Spjdis_spare(nvlist_t *config, const char *path) 1040168404Spjd{ 1041168404Spjd int fd; 1042168404Spjd pool_state_t state; 1043168404Spjd char *name = NULL; 1044168404Spjd nvlist_t *label; 1045168404Spjd uint64_t guid, spareguid; 1046168404Spjd nvlist_t *nvroot; 1047168404Spjd nvlist_t **spares; 1048168404Spjd uint_t i, nspares; 1049168404Spjd boolean_t inuse; 1050168404Spjd 1051168404Spjd if ((fd = open(path, O_RDONLY)) < 0) 1052168404Spjd return (B_FALSE); 1053168404Spjd 1054168404Spjd if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) != 0 || 1055168404Spjd !inuse || 1056168404Spjd state != POOL_STATE_SPARE || 1057168404Spjd zpool_read_label(fd, &label) != 0) { 1058168404Spjd free(name); 1059168404Spjd (void) close(fd); 1060168404Spjd return (B_FALSE); 1061168404Spjd } 1062168404Spjd free(name); 1063219089Spjd (void) close(fd); 1064168404Spjd 1065168404Spjd verify(nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) == 0); 1066168404Spjd nvlist_free(label); 1067168404Spjd 1068168404Spjd verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 1069168404Spjd &nvroot) == 0); 1070168404Spjd if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1071168404Spjd &spares, &nspares) == 0) { 1072168404Spjd for (i = 0; i < nspares; i++) { 1073168404Spjd verify(nvlist_lookup_uint64(spares[i], 1074168404Spjd ZPOOL_CONFIG_GUID, &spareguid) == 0); 1075168404Spjd if (spareguid == guid) 1076168404Spjd return (B_TRUE); 1077168404Spjd } 1078168404Spjd } 1079168404Spjd 1080168404Spjd return (B_FALSE); 1081168404Spjd} 1082168404Spjd 1083168404Spjd/* 1084168404Spjd * Go through and find any devices that are in use. We rely on libdiskmgt for 1085168404Spjd * the majority of this task. 1086168404Spjd */ 1087272136Sdelphijstatic boolean_t 1088272136Sdelphijis_device_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force, 1089219089Spjd boolean_t replacing, boolean_t isspare) 1090168404Spjd{ 1091168404Spjd nvlist_t **child; 1092168404Spjd uint_t c, children; 1093168404Spjd char *type, *path; 1094297119Smav int ret = 0; 1095168404Spjd char buf[MAXPATHLEN]; 1096168404Spjd uint64_t wholedisk; 1097272136Sdelphij boolean_t anyinuse = B_FALSE; 1098168404Spjd 1099168404Spjd verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); 1100168404Spjd 1101168404Spjd if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 1102168404Spjd &child, &children) != 0) { 1103168404Spjd 1104168404Spjd verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0); 1105168404Spjd 1106168404Spjd /* 1107168404Spjd * As a generic check, we look to see if this is a replace of a 1108168404Spjd * hot spare within the same pool. If so, we allow it 1109168404Spjd * regardless of what libdiskmgt or zpool_in_use() says. 1110168404Spjd */ 1111219089Spjd if (replacing) { 1112297077Smav#ifdef illumos 1113219089Spjd if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, 1114219089Spjd &wholedisk) == 0 && wholedisk) 1115219089Spjd (void) snprintf(buf, sizeof (buf), "%ss0", 1116219089Spjd path); 1117219089Spjd else 1118219089Spjd#endif 1119219089Spjd (void) strlcpy(buf, path, sizeof (buf)); 1120219089Spjd 1121168404Spjd if (is_spare(config, buf)) 1122272136Sdelphij return (B_FALSE); 1123168404Spjd } 1124168404Spjd 1125168404Spjd if (strcmp(type, VDEV_TYPE_DISK) == 0) 1126219089Spjd ret = check_device(path, force, isspare); 1127272136Sdelphij else if (strcmp(type, VDEV_TYPE_FILE) == 0) 1128185029Spjd ret = check_file(path, force, isspare); 1129185029Spjd 1130272136Sdelphij return (ret != 0); 1131168404Spjd } 1132168404Spjd 1133168404Spjd for (c = 0; c < children; c++) 1134272136Sdelphij if (is_device_in_use(config, child[c], force, replacing, 1135272136Sdelphij B_FALSE)) 1136272136Sdelphij anyinuse = B_TRUE; 1137168404Spjd 1138168404Spjd if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, 1139168404Spjd &child, &children) == 0) 1140168404Spjd for (c = 0; c < children; c++) 1141272136Sdelphij if (is_device_in_use(config, child[c], force, replacing, 1142272136Sdelphij B_TRUE)) 1143272136Sdelphij anyinuse = B_TRUE; 1144168404Spjd 1145185029Spjd if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, 1146185029Spjd &child, &children) == 0) 1147185029Spjd for (c = 0; c < children; c++) 1148272136Sdelphij if (is_device_in_use(config, child[c], force, replacing, 1149272136Sdelphij B_FALSE)) 1150272136Sdelphij anyinuse = B_TRUE; 1151185029Spjd 1152272136Sdelphij return (anyinuse); 1153168404Spjd} 1154168404Spjd 1155185029Spjdstatic const char * 1156219089Spjdis_grouping(const char *type, int *mindev, int *maxdev) 1157168404Spjd{ 1158219089Spjd if (strncmp(type, "raidz", 5) == 0) { 1159219089Spjd const char *p = type + 5; 1160219089Spjd char *end; 1161219089Spjd long nparity; 1162168404Spjd 1163219089Spjd if (*p == '\0') { 1164219089Spjd nparity = 1; 1165219089Spjd } else if (*p == '0') { 1166219089Spjd return (NULL); /* no zero prefixes allowed */ 1167219089Spjd } else { 1168219089Spjd errno = 0; 1169219089Spjd nparity = strtol(p, &end, 10); 1170219089Spjd if (errno != 0 || nparity < 1 || nparity >= 255 || 1171219089Spjd *end != '\0') 1172219089Spjd return (NULL); 1173219089Spjd } 1174219089Spjd 1175168404Spjd if (mindev != NULL) 1176219089Spjd *mindev = nparity + 1; 1177219089Spjd if (maxdev != NULL) 1178219089Spjd *maxdev = 255; 1179168404Spjd return (VDEV_TYPE_RAIDZ); 1180168404Spjd } 1181168404Spjd 1182219089Spjd if (maxdev != NULL) 1183219089Spjd *maxdev = INT_MAX; 1184219089Spjd 1185168404Spjd if (strcmp(type, "mirror") == 0) { 1186168404Spjd if (mindev != NULL) 1187168404Spjd *mindev = 2; 1188168404Spjd return (VDEV_TYPE_MIRROR); 1189168404Spjd } 1190168404Spjd 1191168404Spjd if (strcmp(type, "spare") == 0) { 1192168404Spjd if (mindev != NULL) 1193168404Spjd *mindev = 1; 1194168404Spjd return (VDEV_TYPE_SPARE); 1195168404Spjd } 1196168404Spjd 1197185029Spjd if (strcmp(type, "log") == 0) { 1198185029Spjd if (mindev != NULL) 1199185029Spjd *mindev = 1; 1200185029Spjd return (VDEV_TYPE_LOG); 1201185029Spjd } 1202185029Spjd 1203185029Spjd if (strcmp(type, "cache") == 0) { 1204185029Spjd if (mindev != NULL) 1205185029Spjd *mindev = 1; 1206185029Spjd return (VDEV_TYPE_L2CACHE); 1207185029Spjd } 1208185029Spjd 1209168404Spjd return (NULL); 1210168404Spjd} 1211168404Spjd 1212168404Spjd/* 1213168404Spjd * Construct a syntactically valid vdev specification, 1214168404Spjd * and ensure that all devices and files exist and can be opened. 1215168404Spjd * Note: we don't bother freeing anything in the error paths 1216168404Spjd * because the program is just going to exit anyway. 1217168404Spjd */ 1218168404Spjdnvlist_t * 1219168404Spjdconstruct_spec(int argc, char **argv) 1220168404Spjd{ 1221185029Spjd nvlist_t *nvroot, *nv, **top, **spares, **l2cache; 1222219089Spjd int t, toplevels, mindev, maxdev, nspares, nlogs, nl2cache; 1223168404Spjd const char *type; 1224185029Spjd uint64_t is_log; 1225185029Spjd boolean_t seen_logs; 1226168404Spjd 1227168404Spjd top = NULL; 1228168404Spjd toplevels = 0; 1229168404Spjd spares = NULL; 1230185029Spjd l2cache = NULL; 1231168404Spjd nspares = 0; 1232185029Spjd nlogs = 0; 1233185029Spjd nl2cache = 0; 1234185029Spjd is_log = B_FALSE; 1235185029Spjd seen_logs = B_FALSE; 1236168404Spjd 1237168404Spjd while (argc > 0) { 1238168404Spjd nv = NULL; 1239168404Spjd 1240168404Spjd /* 1241168404Spjd * If it's a mirror or raidz, the subsequent arguments are 1242168404Spjd * its leaves -- until we encounter the next mirror or raidz. 1243168404Spjd */ 1244219089Spjd if ((type = is_grouping(argv[0], &mindev, &maxdev)) != NULL) { 1245168404Spjd nvlist_t **child = NULL; 1246168404Spjd int c, children = 0; 1247168404Spjd 1248185029Spjd if (strcmp(type, VDEV_TYPE_SPARE) == 0) { 1249185029Spjd if (spares != NULL) { 1250185029Spjd (void) fprintf(stderr, 1251185029Spjd gettext("invalid vdev " 1252185029Spjd "specification: 'spare' can be " 1253185029Spjd "specified only once\n")); 1254185029Spjd return (NULL); 1255185029Spjd } 1256185029Spjd is_log = B_FALSE; 1257168404Spjd } 1258168404Spjd 1259185029Spjd if (strcmp(type, VDEV_TYPE_LOG) == 0) { 1260185029Spjd if (seen_logs) { 1261185029Spjd (void) fprintf(stderr, 1262185029Spjd gettext("invalid vdev " 1263185029Spjd "specification: 'log' can be " 1264185029Spjd "specified only once\n")); 1265185029Spjd return (NULL); 1266185029Spjd } 1267185029Spjd seen_logs = B_TRUE; 1268185029Spjd is_log = B_TRUE; 1269185029Spjd argc--; 1270185029Spjd argv++; 1271185029Spjd /* 1272185029Spjd * A log is not a real grouping device. 1273185029Spjd * We just set is_log and continue. 1274185029Spjd */ 1275185029Spjd continue; 1276185029Spjd } 1277185029Spjd 1278185029Spjd if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) { 1279185029Spjd if (l2cache != NULL) { 1280185029Spjd (void) fprintf(stderr, 1281185029Spjd gettext("invalid vdev " 1282185029Spjd "specification: 'cache' can be " 1283185029Spjd "specified only once\n")); 1284185029Spjd return (NULL); 1285185029Spjd } 1286185029Spjd is_log = B_FALSE; 1287185029Spjd } 1288185029Spjd 1289185029Spjd if (is_log) { 1290185029Spjd if (strcmp(type, VDEV_TYPE_MIRROR) != 0) { 1291185029Spjd (void) fprintf(stderr, 1292185029Spjd gettext("invalid vdev " 1293185029Spjd "specification: unsupported 'log' " 1294185029Spjd "device: %s\n"), type); 1295185029Spjd return (NULL); 1296185029Spjd } 1297185029Spjd nlogs++; 1298185029Spjd } 1299185029Spjd 1300168404Spjd for (c = 1; c < argc; c++) { 1301219089Spjd if (is_grouping(argv[c], NULL, NULL) != NULL) 1302168404Spjd break; 1303168404Spjd children++; 1304168404Spjd child = realloc(child, 1305168404Spjd children * sizeof (nvlist_t *)); 1306168404Spjd if (child == NULL) 1307168404Spjd zpool_no_memory(); 1308185029Spjd if ((nv = make_leaf_vdev(argv[c], B_FALSE)) 1309185029Spjd == NULL) 1310168404Spjd return (NULL); 1311168404Spjd child[children - 1] = nv; 1312168404Spjd } 1313168404Spjd 1314168404Spjd if (children < mindev) { 1315168404Spjd (void) fprintf(stderr, gettext("invalid vdev " 1316168404Spjd "specification: %s requires at least %d " 1317168404Spjd "devices\n"), argv[0], mindev); 1318168404Spjd return (NULL); 1319168404Spjd } 1320168404Spjd 1321219089Spjd if (children > maxdev) { 1322219089Spjd (void) fprintf(stderr, gettext("invalid vdev " 1323219089Spjd "specification: %s supports no more than " 1324219089Spjd "%d devices\n"), argv[0], maxdev); 1325219089Spjd return (NULL); 1326219089Spjd } 1327219089Spjd 1328168404Spjd argc -= c; 1329168404Spjd argv += c; 1330168404Spjd 1331168404Spjd if (strcmp(type, VDEV_TYPE_SPARE) == 0) { 1332168404Spjd spares = child; 1333168404Spjd nspares = children; 1334168404Spjd continue; 1335185029Spjd } else if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) { 1336185029Spjd l2cache = child; 1337185029Spjd nl2cache = children; 1338185029Spjd continue; 1339168404Spjd } else { 1340168404Spjd verify(nvlist_alloc(&nv, NV_UNIQUE_NAME, 1341168404Spjd 0) == 0); 1342168404Spjd verify(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE, 1343168404Spjd type) == 0); 1344185029Spjd verify(nvlist_add_uint64(nv, 1345185029Spjd ZPOOL_CONFIG_IS_LOG, is_log) == 0); 1346168404Spjd if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { 1347168404Spjd verify(nvlist_add_uint64(nv, 1348168404Spjd ZPOOL_CONFIG_NPARITY, 1349168404Spjd mindev - 1) == 0); 1350168404Spjd } 1351168404Spjd verify(nvlist_add_nvlist_array(nv, 1352168404Spjd ZPOOL_CONFIG_CHILDREN, child, 1353168404Spjd children) == 0); 1354168404Spjd 1355168404Spjd for (c = 0; c < children; c++) 1356168404Spjd nvlist_free(child[c]); 1357168404Spjd free(child); 1358168404Spjd } 1359168404Spjd } else { 1360168404Spjd /* 1361168404Spjd * We have a device. Pass off to make_leaf_vdev() to 1362168404Spjd * construct the appropriate nvlist describing the vdev. 1363168404Spjd */ 1364185029Spjd if ((nv = make_leaf_vdev(argv[0], is_log)) == NULL) 1365168404Spjd return (NULL); 1366185029Spjd if (is_log) 1367185029Spjd nlogs++; 1368168404Spjd argc--; 1369168404Spjd argv++; 1370168404Spjd } 1371168404Spjd 1372168404Spjd toplevels++; 1373168404Spjd top = realloc(top, toplevels * sizeof (nvlist_t *)); 1374168404Spjd if (top == NULL) 1375168404Spjd zpool_no_memory(); 1376168404Spjd top[toplevels - 1] = nv; 1377168404Spjd } 1378168404Spjd 1379185029Spjd if (toplevels == 0 && nspares == 0 && nl2cache == 0) { 1380168404Spjd (void) fprintf(stderr, gettext("invalid vdev " 1381168404Spjd "specification: at least one toplevel vdev must be " 1382168404Spjd "specified\n")); 1383168404Spjd return (NULL); 1384168404Spjd } 1385168404Spjd 1386185029Spjd if (seen_logs && nlogs == 0) { 1387185029Spjd (void) fprintf(stderr, gettext("invalid vdev specification: " 1388185029Spjd "log requires at least 1 device\n")); 1389185029Spjd return (NULL); 1390185029Spjd } 1391185029Spjd 1392168404Spjd /* 1393168404Spjd * Finally, create nvroot and add all top-level vdevs to it. 1394168404Spjd */ 1395168404Spjd verify(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) == 0); 1396168404Spjd verify(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 1397168404Spjd VDEV_TYPE_ROOT) == 0); 1398168404Spjd verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 1399168404Spjd top, toplevels) == 0); 1400168404Spjd if (nspares != 0) 1401168404Spjd verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1402168404Spjd spares, nspares) == 0); 1403185029Spjd if (nl2cache != 0) 1404185029Spjd verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 1405185029Spjd l2cache, nl2cache) == 0); 1406168404Spjd 1407168404Spjd for (t = 0; t < toplevels; t++) 1408168404Spjd nvlist_free(top[t]); 1409168404Spjd for (t = 0; t < nspares; t++) 1410168404Spjd nvlist_free(spares[t]); 1411185029Spjd for (t = 0; t < nl2cache; t++) 1412185029Spjd nvlist_free(l2cache[t]); 1413168404Spjd if (spares) 1414168404Spjd free(spares); 1415185029Spjd if (l2cache) 1416185029Spjd free(l2cache); 1417168404Spjd free(top); 1418168404Spjd 1419168404Spjd return (nvroot); 1420168404Spjd} 1421168404Spjd 1422219089Spjdnvlist_t * 1423219089Spjdsplit_mirror_vdev(zpool_handle_t *zhp, char *newname, nvlist_t *props, 1424219089Spjd splitflags_t flags, int argc, char **argv) 1425219089Spjd{ 1426219089Spjd nvlist_t *newroot = NULL, **child; 1427219089Spjd uint_t c, children; 1428185029Spjd 1429219089Spjd if (argc > 0) { 1430219089Spjd if ((newroot = construct_spec(argc, argv)) == NULL) { 1431219089Spjd (void) fprintf(stderr, gettext("Unable to build a " 1432219089Spjd "pool from the specified devices\n")); 1433219089Spjd return (NULL); 1434219089Spjd } 1435219089Spjd 1436297077Smav#ifdef illumos 1437219089Spjd if (!flags.dryrun && make_disks(zhp, newroot) != 0) { 1438219089Spjd nvlist_free(newroot); 1439219089Spjd return (NULL); 1440219089Spjd } 1441219089Spjd#endif 1442219089Spjd 1443219089Spjd /* avoid any tricks in the spec */ 1444219089Spjd verify(nvlist_lookup_nvlist_array(newroot, 1445219089Spjd ZPOOL_CONFIG_CHILDREN, &child, &children) == 0); 1446219089Spjd for (c = 0; c < children; c++) { 1447219089Spjd char *path; 1448219089Spjd const char *type; 1449219089Spjd int min, max; 1450219089Spjd 1451219089Spjd verify(nvlist_lookup_string(child[c], 1452219089Spjd ZPOOL_CONFIG_PATH, &path) == 0); 1453219089Spjd if ((type = is_grouping(path, &min, &max)) != NULL) { 1454219089Spjd (void) fprintf(stderr, gettext("Cannot use " 1455219089Spjd "'%s' as a device for splitting\n"), type); 1456219089Spjd nvlist_free(newroot); 1457219089Spjd return (NULL); 1458219089Spjd } 1459219089Spjd } 1460219089Spjd } 1461219089Spjd 1462219089Spjd if (zpool_vdev_split(zhp, newname, &newroot, props, flags) != 0) { 1463297115Smav nvlist_free(newroot); 1464219089Spjd return (NULL); 1465219089Spjd } 1466219089Spjd 1467219089Spjd return (newroot); 1468219089Spjd} 1469219089Spjd 1470168404Spjd/* 1471168404Spjd * Get and validate the contents of the given vdev specification. This ensures 1472168404Spjd * that the nvlist returned is well-formed, that all the devices exist, and that 1473168404Spjd * they are not currently in use by any other known consumer. The 'poolconfig' 1474168404Spjd * parameter is the current configuration of the pool when adding devices 1475168404Spjd * existing pool, and is used to perform additional checks, such as changing the 1476168404Spjd * replication level of the pool. It can be 'NULL' to indicate that this is a 1477168404Spjd * new pool. The 'force' flag controls whether devices should be forcefully 1478168404Spjd * added, even if they appear in use. 1479168404Spjd */ 1480168404Spjdnvlist_t * 1481185029Spjdmake_root_vdev(zpool_handle_t *zhp, int force, int check_rep, 1482219089Spjd boolean_t replacing, boolean_t dryrun, int argc, char **argv) 1483168404Spjd{ 1484168404Spjd nvlist_t *newroot; 1485185029Spjd nvlist_t *poolconfig = NULL; 1486168404Spjd is_force = force; 1487168404Spjd 1488168404Spjd /* 1489168404Spjd * Construct the vdev specification. If this is successful, we know 1490168404Spjd * that we have a valid specification, and that all devices can be 1491168404Spjd * opened. 1492168404Spjd */ 1493168404Spjd if ((newroot = construct_spec(argc, argv)) == NULL) 1494168404Spjd return (NULL); 1495168404Spjd 1496185029Spjd if (zhp && ((poolconfig = zpool_get_config(zhp, NULL)) == NULL)) 1497185029Spjd return (NULL); 1498185029Spjd 1499168404Spjd /* 1500168404Spjd * Validate each device to make sure that its not shared with another 1501168404Spjd * subsystem. We do this even if 'force' is set, because there are some 1502168404Spjd * uses (such as a dedicated dump device) that even '-f' cannot 1503168404Spjd * override. 1504168404Spjd */ 1505272136Sdelphij if (is_device_in_use(poolconfig, newroot, force, replacing, B_FALSE)) { 1506168404Spjd nvlist_free(newroot); 1507168404Spjd return (NULL); 1508168404Spjd } 1509168404Spjd 1510168404Spjd /* 1511168404Spjd * Check the replication level of the given vdevs and report any errors 1512168404Spjd * found. We include the existing pool spec, if any, as we need to 1513168404Spjd * catch changes against the existing replication level. 1514168404Spjd */ 1515168404Spjd if (check_rep && check_replication(poolconfig, newroot) != 0) { 1516168404Spjd nvlist_free(newroot); 1517168404Spjd return (NULL); 1518168404Spjd } 1519168404Spjd 1520297077Smav#ifdef illumos 1521219089Spjd /* 1522219089Spjd * Run through the vdev specification and label any whole disks found. 1523219089Spjd */ 1524219089Spjd if (!dryrun && make_disks(zhp, newroot) != 0) { 1525219089Spjd nvlist_free(newroot); 1526219089Spjd return (NULL); 1527219089Spjd } 1528219089Spjd#endif 1529219089Spjd 1530168404Spjd return (newroot); 1531168404Spjd} 1532