/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing a
 * pool.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/ddt.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/dmu_traverse.h>
#include <sys/dmu_objset.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/fs/zfs.h>
#include <sys/arc.h>
#include <sys/callb.h>
#include <sys/systeminfo.h>
#include <sys/spa_boot.h>
#include <sys/zfs_ioctl.h>

#ifdef	_KERNEL
#include <sys/bootprops.h>
#include <sys/callb.h>
#include <sys/cpupart.h>
#include <sys/pool.h>
#include <sys/sysdc.h>
#include <sys/zone.h>
#endif	/* _KERNEL */

#include "zfs_prop.h"
#include "zfs_comutil.h"

typedef enum zti_modes {
	zti_mode_fixed,			/* value is # of threads (min 1) */
	zti_mode_online_percent,	/* value is % of online CPUs */
	zti_mode_batch,			/* cpu-intensive; value is ignored */
	zti_mode_null,			/* don't create a taskq */
	zti_nmodes
} zti_modes_t;

#define	ZTI_FIX(n)	{ zti_mode_fixed, (n) }
#define	ZTI_PCT(n)	{ zti_mode_online_percent, (n) }
#define	ZTI_BATCH	{ zti_mode_batch, 0 }
#define	ZTI_NULL	{ zti_mode_null, 0 }

#define	ZTI_ONE		ZTI_FIX(1)

typedef struct zio_taskq_info {
	enum zti_modes zti_mode;
	uint_t zti_value;
} zio_taskq_info_t;

static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
	"issue", "issue_high", "intr", "intr_high"
};

/*
 * Define the taskq threads for the following I/O types:
 * NULL, READ, WRITE, FREE, CLAIM, and IOCTL
 */
const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
	/* ISSUE	ISSUE_HIGH	INTR		INTR_HIGH */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
	{ ZTI_FIX(8),	ZTI_NULL,	ZTI_BATCH,	ZTI_NULL },
	{ ZTI_BATCH,	ZTI_FIX(5),	ZTI_FIX(8),	ZTI_FIX(5) },
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
};
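/*
 * Reading zio_taskqs[] above: each row is one ZIO type, in the order named
 * in the preceding comment (NULL, READ, WRITE, FREE, CLAIM, IOCTL), and each
 * column is one of the zio_taskq_types[] flavors. For example, the WRITE row
 * { ZTI_BATCH, ZTI_FIX(5), ZTI_FIX(8), ZTI_FIX(5) } issues writes on a
 * cpu-percentage ("batch") taskq and uses 5/8/5 fixed threads for the
 * high-priority issue, interrupt, and high-priority interrupt taskqs.
 * ZTI_NULL entries create no taskq at all.
 */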
static void spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx);
static boolean_t spa_has_active_shared_spare(spa_t *spa);
static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
    spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
    char **ereport);

uint_t		zio_taskq_batch_pct = 100;	/* 1 thread per cpu in pset */
id_t		zio_taskq_psrset_bind = PS_NONE;
boolean_t	zio_taskq_sysdc = B_TRUE;	/* use SDC scheduling class */
uint_t		zio_taskq_basedc = 80;		/* base duty cycle */

boolean_t	spa_create_process = B_TRUE;	/* no process ==> no sysdc */

/*
 * This (illegal) pool name is used when temporarily importing a spa_t in order
 * to get the vdev stats associated with the imported devices.
 */
#define	TRYIMPORT_NAME	"$import"

/*
 * ==========================================================================
 * SPA properties routines
 * ==========================================================================
 */

/*
 * Add a (source=src, propname=propval) list to an nvlist.
 */
static void
spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
    uint64_t intval, zprop_source_t src)
{
	const char *propname = zpool_prop_to_name(prop);
	nvlist_t *propval;

	VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);

	if (strval != NULL)
		VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
	else
		VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);

	VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
	nvlist_free(propval);
}
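/*
 * Illustrative sketch (not compiled) of the nvlist shape built above:
 * adding ZPOOL_PROP_CAPACITY with intval=42 and src=ZPROP_SRC_NONE yields
 *
 *	nvl = {
 *		"capacity" = {
 *			ZPROP_SOURCE = ZPROP_SRC_NONE,
 *			ZPROP_VALUE = 42
 *		}
 *	}
 *
 * String-valued properties carry ZPROP_VALUE as a string instead.
 */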
/*
 * Get property values from the spa configuration.
 */
static void
spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
{
	uint64_t size;
	uint64_t alloc;
	uint64_t cap, version;
	zprop_source_t src = ZPROP_SRC_NONE;
	spa_config_dirent_t *dp;

	ASSERT(MUTEX_HELD(&spa->spa_props_lock));

	if (spa->spa_root_vdev != NULL) {
		alloc = metaslab_class_get_alloc(spa_normal_class(spa));
		size = metaslab_class_get_space(spa_normal_class(spa));
		spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
		    size - alloc, src);

		cap = (size == 0) ? 0 : (alloc * 100 / size);
		spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
		    ddt_get_pool_dedup_ratio(spa), src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
		    spa->spa_root_vdev->vdev_state, src);

		version = spa_version(spa);
		if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
			src = ZPROP_SRC_DEFAULT;
		else
			src = ZPROP_SRC_LOCAL;
		spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
	}

	spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);

	if (spa->spa_root != NULL)
		spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
		    0, ZPROP_SRC_LOCAL);

	if ((dp = list_head(&spa->spa_config_list)) != NULL) {
		if (dp->scd_path == NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    "none", 0, ZPROP_SRC_LOCAL);
		} else if (strcmp(dp->scd_path, spa_config_path) != 0) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    dp->scd_path, 0, ZPROP_SRC_LOCAL);
		}
	}
}
/*
 * Get zpool property values.
 */
int
spa_prop_get(spa_t *spa, nvlist_t **nvp)
{
	objset_t *mos = spa->spa_meta_objset;
	zap_cursor_t zc;
	zap_attribute_t za;
	int err;

	VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);

	mutex_enter(&spa->spa_props_lock);

	/*
	 * Get properties from the spa config.
	 */
	spa_prop_get_config(spa, nvp);

	/* If no pool property object, no more prop to get. */
	if (mos == NULL || spa->spa_pool_props_object == 0) {
		mutex_exit(&spa->spa_props_lock);
		return (0);
	}

	/*
	 * Get properties from the MOS pool property object.
	 */
	for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
	    (err = zap_cursor_retrieve(&zc, &za)) == 0;
	    zap_cursor_advance(&zc)) {
		uint64_t intval = 0;
		char *strval = NULL;
		zprop_source_t src = ZPROP_SRC_DEFAULT;
		zpool_prop_t prop;

		if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL)
			continue;

		switch (za.za_integer_length) {
		case 8:
			/* integer property */
			if (za.za_first_integer !=
			    zpool_prop_default_numeric(prop))
				src = ZPROP_SRC_LOCAL;

			if (prop == ZPOOL_PROP_BOOTFS) {
				dsl_pool_t *dp;
				dsl_dataset_t *ds = NULL;

				dp = spa_get_dsl(spa);
				rw_enter(&dp->dp_config_rwlock, RW_READER);
				if (err = dsl_dataset_hold_obj(dp,
				    za.za_first_integer, FTAG, &ds)) {
					rw_exit(&dp->dp_config_rwlock);
					break;
				}

				strval = kmem_alloc(
				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1,
				    KM_SLEEP);
				dsl_dataset_name(ds, strval);
				dsl_dataset_rele(ds, FTAG);
				rw_exit(&dp->dp_config_rwlock);
			} else {
				strval = NULL;
				intval = za.za_first_integer;
			}

			spa_prop_add_list(*nvp, prop, strval, intval, src);

			if (strval != NULL)
				kmem_free(strval,
				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1);

			break;

		case 1:
			/* string property */
			strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
			err = zap_lookup(mos, spa->spa_pool_props_object,
			    za.za_name, 1, za.za_num_integers, strval);
			if (err) {
				kmem_free(strval, za.za_num_integers);
				break;
			}
			spa_prop_add_list(*nvp, prop, strval, 0, src);
			kmem_free(strval, za.za_num_integers);
			break;

		default:
			break;
		}
	}
	zap_cursor_fini(&zc);
	mutex_exit(&spa->spa_props_lock);
out:
	if (err && err != ENOENT) {
		nvlist_free(*nvp);
		*nvp = NULL;
		return (err);
	}

	return (0);
}
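/*
 * Typical caller pattern for spa_prop_get() (illustrative only): the
 * function allocates the nvlist, and on success the caller owns it.
 *
 *	nvlist_t *props;
 *	if (spa_prop_get(spa, &props) == 0) {
 *		... walk props with nvlist_next_nvpair() ...
 *		nvlist_free(props);
 *	}
 */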
/*
 * Validate the given pool properties nvlist and modify the list
 * for the property values to be set.
 */
static int
spa_prop_validate(spa_t *spa, nvlist_t *props)
{
	nvpair_t *elem;
	int error = 0, reset_bootfs = 0;
	uint64_t objnum;

	elem = NULL;
	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
		zpool_prop_t prop;
		char *propname, *strval;
		uint64_t intval;
		objset_t *os;
		char *slash;

		propname = nvpair_name(elem);

		if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL)
			return (EINVAL);

		switch (prop) {
		case ZPOOL_PROP_VERSION:
			error = nvpair_value_uint64(elem, &intval);
			if (!error &&
			    (intval < spa_version(spa) || intval > SPA_VERSION))
				error = EINVAL;
			break;

		case ZPOOL_PROP_DELEGATION:
		case ZPOOL_PROP_AUTOREPLACE:
		case ZPOOL_PROP_LISTSNAPS:
		case ZPOOL_PROP_AUTOEXPAND:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && intval > 1)
				error = EINVAL;
			break;

		case ZPOOL_PROP_BOOTFS:
			/*
			 * If the pool version is less than SPA_VERSION_BOOTFS,
			 * or the pool is still being created (version == 0),
			 * the bootfs property cannot be set.
			 */
			if (spa_version(spa) < SPA_VERSION_BOOTFS) {
				error = ENOTSUP;
				break;
			}

			/*
			 * Make sure the vdev config is bootable
			 */
			if (!vdev_is_bootable(spa->spa_root_vdev)) {
				error = ENOTSUP;
				break;
			}

			reset_bootfs = 1;

			error = nvpair_value_string(elem, &strval);

			if (!error) {
				uint64_t compress;

				if (strval == NULL || strval[0] == '\0') {
					objnum = zpool_prop_default_numeric(
					    ZPOOL_PROP_BOOTFS);
					break;
				}

				if (error = dmu_objset_hold(strval, FTAG, &os))
					break;

				/* Must be ZPL and not gzip compressed. */

				if (dmu_objset_type(os) != DMU_OST_ZFS) {
					error = ENOTSUP;
				} else if ((error = dsl_prop_get_integer(strval,
				    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
				    &compress, NULL)) == 0 &&
				    !BOOTFS_COMPRESS_VALID(compress)) {
					error = ENOTSUP;
				} else {
					objnum = dmu_objset_id(os);
				}
				dmu_objset_rele(os, FTAG);
			}
			break;

		case ZPOOL_PROP_FAILUREMODE:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
			    intval > ZIO_FAILURE_MODE_PANIC))
				error = EINVAL;

			/*
			 * This is a special case which only occurs when
			 * the pool has completely failed. This allows
			 * the user to change the in-core failmode property
			 * without syncing it out to disk (I/Os might
			 * currently be blocked). We do this by returning
			 * EIO to the caller (spa_prop_set) to trick it
			 * into thinking we encountered a property validation
			 * error.
			 */
			if (!error && spa_suspended(spa)) {
				spa->spa_failmode = intval;
				error = EIO;
			}
			break;

		case ZPOOL_PROP_CACHEFILE:
			if ((error = nvpair_value_string(elem, &strval)) != 0)
				break;

			if (strval[0] == '\0')
				break;

			if (strcmp(strval, "none") == 0)
				break;

			if (strval[0] != '/') {
				error = EINVAL;
				break;
			}

			slash = strrchr(strval, '/');
			ASSERT(slash != NULL);

			if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
			    strcmp(slash, "/..") == 0)
				error = EINVAL;
			break;

		case ZPOOL_PROP_DEDUPDITTO:
			if (spa_version(spa) < SPA_VERSION_DEDUP)
				error = ENOTSUP;
			else
				error = nvpair_value_uint64(elem, &intval);
			if (error == 0 &&
			    intval != 0 && intval < ZIO_DEDUPDITTO_MIN)
				error = EINVAL;
			break;
		}

		if (error)
			break;
	}

	if (!error && reset_bootfs) {
		error = nvlist_remove(props,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);

		if (!error) {
			error = nvlist_add_uint64(props,
			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
		}
	}

	return (error);
}
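/*
 * Note: when a bootfs value survives validation above, the props nvlist
 * no longer carries the dataset name the caller passed in; the string
 * pair is replaced with the dataset's object number, which is the form
 * the sync task (spa_sync_props()) is presumably expected to consume.
 */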
void
spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
{
	char *cachefile;
	spa_config_dirent_t *dp;

	if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
	    &cachefile) != 0)
		return;

	dp = kmem_alloc(sizeof (spa_config_dirent_t),
	    KM_SLEEP);

	if (cachefile[0] == '\0')
		dp->scd_path = spa_strdup(spa_config_path);
	else if (strcmp(cachefile, "none") == 0)
		dp->scd_path = NULL;
	else
		dp->scd_path = spa_strdup(cachefile);

	list_insert_head(&spa->spa_config_list, dp);
	if (need_sync)
		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
}

int
spa_prop_set(spa_t *spa, nvlist_t *nvp)
{
	int error;
	nvpair_t *elem;
	boolean_t need_sync = B_FALSE;
	zpool_prop_t prop;

	if ((error = spa_prop_validate(spa, nvp)) != 0)
		return (error);

	elem = NULL;
	while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
		if ((prop = zpool_name_to_prop(
		    nvpair_name(elem))) == ZPROP_INVAL)
			return (EINVAL);

		if (prop == ZPOOL_PROP_CACHEFILE || prop == ZPOOL_PROP_ALTROOT)
			continue;

		need_sync = B_TRUE;
		break;
	}

	if (need_sync)
		return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props,
		    spa, nvp, 3));
	else
		return (0);
}

/*
 * If the bootfs property value is dsobj, clear it.
 */
void
spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
{
	if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
		VERIFY(zap_remove(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
		spa->spa_bootfs = 0;
	}
}

/*
 * ==========================================================================
 * SPA state manipulation (open/create/destroy/import/export)
 * ==========================================================================
 */

static int
spa_error_entry_compare(const void *a, const void *b)
{
	spa_error_entry_t *sa = (spa_error_entry_t *)a;
	spa_error_entry_t *sb = (spa_error_entry_t *)b;
	int ret;

	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
	    sizeof (zbookmark_t));

	if (ret < 0)
		return (-1);
	else if (ret > 0)
		return (1);
	else
		return (0);
}

/*
 * Utility function which retrieves copies of the current logs and
 * re-initializes them in the process.
 */
void
spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

static taskq_t *
spa_taskq_create(spa_t *spa, const char *name, enum zti_modes mode,
    uint_t value)
{
	uint_t flags = TASKQ_PREPOPULATE;
	boolean_t batch = B_FALSE;

	switch (mode) {
	case zti_mode_null:
		return (NULL);		/* no taskq needed */

	case zti_mode_fixed:
		ASSERT3U(value, >=, 1);
		value = MAX(value, 1);
		break;

	case zti_mode_batch:
		batch = B_TRUE;
		flags |= TASKQ_THREADS_CPU_PCT;
		value = zio_taskq_batch_pct;
		break;

	case zti_mode_online_percent:
		flags |= TASKQ_THREADS_CPU_PCT;
		break;

	default:
		panic("unrecognized mode for %s taskq (%u:%u) in "
		    "spa_activate()",
		    name, mode, value);
		break;
	}

	if (zio_taskq_sysdc && spa->spa_proc != &p0) {
		if (batch)
			flags |= TASKQ_DC_BATCH;

		return (taskq_create_sysdc(name, value, 50, INT_MAX,
		    spa->spa_proc, zio_taskq_basedc, flags));
	}
	return (taskq_create_proc(name, value, maxclsyspri, 50, INT_MAX,
	    spa->spa_proc, flags));
}
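/*
 * Summary of the mode handling above:
 *
 *	zti_mode_fixed		taskq with exactly 'value' threads
 *	zti_mode_batch		TASKQ_THREADS_CPU_PCT at zio_taskq_batch_pct
 *	zti_mode_online_percent	TASKQ_THREADS_CPU_PCT at 'value' percent
 *	zti_mode_null		no taskq (NULL)
 *
 * When the pool has its own process and zio_taskq_sysdc is set, the threads
 * run in the SDC scheduling class with a zio_taskq_basedc duty cycle
 * (plus TASKQ_DC_BATCH for batch taskqs); otherwise they run at maxclsyspri.
 */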
static void
spa_create_zio_taskqs(spa_t *spa)
{
	for (int t = 0; t < ZIO_TYPES; t++) {
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
			enum zti_modes mode = ztip->zti_mode;
			uint_t value = ztip->zti_value;
			char name[32];

			(void) snprintf(name, sizeof (name),
			    "%s_%s", zio_type_name[t], zio_taskq_types[q]);

			spa->spa_zio_taskq[t][q] =
			    spa_taskq_create(spa, name, mode, value);
		}
	}
}

#ifdef _KERNEL
static void
spa_thread(void *arg)
{
	callb_cpr_t cprinfo;

	spa_t *spa = arg;

	CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
	    spa->spa_name);

	ASSERT(curproc != &p0);
#ifdef PORT_SOLARIS
	user_t *pu = PTOU(curproc);

	(void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
	    "zpool-%s", spa->spa_name);
	(void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));

	/* bind this thread to the requested psrset */
	if (zio_taskq_psrset_bind != PS_NONE) {
		pool_lock();
		mutex_enter(&cpu_lock);
		mutex_enter(&pidlock);
		mutex_enter(&curproc->p_lock);

		if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
		    0, NULL, NULL) == 0) {
			curthread->t_bind_pset = zio_taskq_psrset_bind;
		} else {
			cmn_err(CE_WARN,
			    "Couldn't bind process for zfs pool \"%s\" to "
			    "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
		}

		mutex_exit(&curproc->p_lock);
		mutex_exit(&pidlock);
		mutex_exit(&cpu_lock);
		pool_unlock();
	}

	if (zio_taskq_sysdc) {
		sysdc_thread_enter(curthread, 100, 0);
	}
#endif /* PORT_SOLARIS */
	spa->spa_proc = curproc;

	spa_create_zio_taskqs(spa);

	mutex_enter(&spa->spa_proc_lock);
	ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);

	spa->spa_proc_state = SPA_PROC_ACTIVE;
	cv_broadcast(&spa->spa_proc_cv);

	CALLB_CPR_SAFE_BEGIN(&cprinfo);
	while (spa->spa_proc_state == SPA_PROC_ACTIVE)
		cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
	CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);

	ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
	spa->spa_proc_state = SPA_PROC_GONE;
	spa->spa_proc = &p0;
	cv_broadcast(&spa->spa_proc_cv);
	CALLB_CPR_EXIT(&cprinfo);	/* drops spa_proc_lock */

/*	mutex_enter(curproc->p_lock);
	lwp_exit(curproc); */
}
#endif
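/*
 * spa_proc_state transitions, as implemented by spa_thread() above and
 * spa_activate()/spa_deactivate() below:
 *
 *	SPA_PROC_NONE		-> SPA_PROC_CREATED	(spa_activate)
 *	SPA_PROC_CREATED	-> SPA_PROC_ACTIVE	(spa_thread)
 *	SPA_PROC_ACTIVE		-> SPA_PROC_DEACTIVATE	(spa_deactivate)
 *	SPA_PROC_DEACTIVATE	-> SPA_PROC_GONE	(spa_thread)
 *	SPA_PROC_GONE		-> SPA_PROC_NONE	(spa_deactivate)
 *
 * Note that in this port the newproc() call in spa_activate() is compiled
 * out (#if 0), so spa_proc normally stays &p0 and the taskqs are created
 * directly by spa_activate().
 */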
/*
 * Activate an uninitialized pool.
 */
static void
spa_activate(spa_t *spa, int mode)
{
	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_mode = mode;

	spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
	spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);

	/* Try to create a covering process */
	mutex_enter(&spa->spa_proc_lock);
	ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
	ASSERT(spa->spa_proc == &p0);
	spa->spa_did = 0;
#if 0
	/* Only create a process if we're going to be around a while. */
	if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
		if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
		    NULL, 0) == 0) {
			spa->spa_proc_state = SPA_PROC_CREATED;
			while (spa->spa_proc_state == SPA_PROC_CREATED) {
				cv_wait(&spa->spa_proc_cv,
				    &spa->spa_proc_lock);
			}
			ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
			ASSERT(spa->spa_proc != &p0);
			ASSERT(spa->spa_did != 0);
		} else {
#ifdef _KERNEL
			cmn_err(CE_WARN,
			    "Couldn't create process for zfs pool \"%s\"\n",
			    spa->spa_name);
#endif
		}
	}
#endif
	mutex_exit(&spa->spa_proc_lock);

	/* If we didn't create a process, we need to create our taskqs. */
	if (spa->spa_proc == &p0) {
		spa_create_zio_taskqs(spa);
	}

	list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_config_dirty_node));
	list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_state_dirty_node));

	txg_list_create(&spa->spa_vdev_txg_list,
	    offsetof(struct vdev, vdev_txg_node));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Opposite of spa_activate().
 */
static void
spa_deactivate(spa_t *spa)
{
	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);
	ASSERT(spa->spa_async_zio_root == NULL);
	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_config_dirty_list);
	list_destroy(&spa->spa_state_dirty_list);

	for (int t = 0; t < ZIO_TYPES; t++) {
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			if (spa->spa_zio_taskq[t][q] != NULL)
				taskq_destroy(spa->spa_zio_taskq[t][q]);
			spa->spa_zio_taskq[t][q] = NULL;
		}
	}

	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	metaslab_class_destroy(spa->spa_log_class);
	spa->spa_log_class = NULL;

	/*
	 * If this was part of an import or the open otherwise failed, we may
	 * still have errors left in the queues. Empty them just in case.
	 */
	spa_errlog_drain(spa);

	avl_destroy(&spa->spa_errlist_scrub);
	avl_destroy(&spa->spa_errlist_last);

	spa->spa_state = POOL_STATE_UNINITIALIZED;

	mutex_enter(&spa->spa_proc_lock);
	if (spa->spa_proc_state != SPA_PROC_NONE) {
		ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
		spa->spa_proc_state = SPA_PROC_DEACTIVATE;
		cv_broadcast(&spa->spa_proc_cv);
		while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
			ASSERT(spa->spa_proc != &p0);
			cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
		}
		ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
		spa->spa_proc_state = SPA_PROC_NONE;
	}
	ASSERT(spa->spa_proc == &p0);
	mutex_exit(&spa->spa_proc_lock);

	/*
	 * We want to make sure spa_thread() has actually exited the ZFS
	 * module, so that the module can't be unloaded out from underneath
	 * it.
	 */
	if (spa->spa_did != 0) {
		thread_join(spa->spa_did);
		spa->spa_did = 0;
	}
}
/*
 * Verify a pool configuration, and construct the vdev tree appropriately. This
 * will create all the necessary vdevs in the appropriate layout, with each vdev
 * in the CLOSED state. This will prep the pool before open/creation/import.
 * All vdev validation is done by the vdev_alloc() routine.
 */
static int
spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
    uint_t id, int atype)
{
	nvlist_t **child;
	uint_t children;
	int error;

	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
		return (error);

	if ((*vdp)->vdev_ops->vdev_op_leaf)
		return (0);

	error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children);

	if (error == ENOENT)
		return (0);

	if (error) {
		vdev_free(*vdp);
		*vdp = NULL;
		return (EINVAL);
	}

	for (int c = 0; c < children; c++) {
		vdev_t *vd;
		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
		    atype)) != 0) {
			vdev_free(*vdp);
			*vdp = NULL;
			return (error);
		}
	}

	ASSERT(*vdp != NULL);

	return (0);
}
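/*
 * Sketch of the recursion above (illustrative; device paths hypothetical):
 * a mirror of two disks arrives as a nested nvlist and yields one interior
 * vdev with two leaf children, all in the CLOSED state:
 *
 *	{ type: "mirror", children: [
 *		{ type: "disk", path: "/dev/dsk/c0t0d0s0" },
 *		{ type: "disk", path: "/dev/dsk/c0t1d0s0" } ] }
 *
 * Leaf vdevs return immediately; ENOENT on ZPOOL_CONFIG_CHILDREN simply
 * means the vdev is childless, not an error.
 */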
/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
	int i;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	/*
	 * Stop async tasks.
	 */
	spa_async_suspend(spa);

	/*
	 * Stop syncing.
	 */
	if (spa->spa_sync_on) {
		txg_sync_stop(spa->spa_dsl_pool);
		spa->spa_sync_on = B_FALSE;
	}

	/*
	 * Wait for any outstanding async I/O to complete.
	 */
	if (spa->spa_async_zio_root != NULL) {
		(void) zio_wait(spa->spa_async_zio_root);
		spa->spa_async_zio_root = NULL;
	}

	/*
	 * Close the dsl pool.
	 */
	if (spa->spa_dsl_pool) {
		dsl_pool_close(spa->spa_dsl_pool);
		spa->spa_dsl_pool = NULL;
		spa->spa_meta_objset = NULL;
	}

	ddt_unload(spa);

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	/*
	 * Drop and purge level 2 cache
	 */
	spa_l2cache_drop(spa);

	/*
	 * Close all vdevs.
	 */
	if (spa->spa_root_vdev)
		vdev_free(spa->spa_root_vdev);
	ASSERT(spa->spa_root_vdev == NULL);

	for (i = 0; i < spa->spa_spares.sav_count; i++)
		vdev_free(spa->spa_spares.sav_vdevs[i]);
	if (spa->spa_spares.sav_vdevs) {
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));
		spa->spa_spares.sav_vdevs = NULL;
	}
	if (spa->spa_spares.sav_config) {
		nvlist_free(spa->spa_spares.sav_config);
		spa->spa_spares.sav_config = NULL;
	}
	spa->spa_spares.sav_count = 0;

	for (i = 0; i < spa->spa_l2cache.sav_count; i++)
		vdev_free(spa->spa_l2cache.sav_vdevs[i]);
	if (spa->spa_l2cache.sav_vdevs) {
		kmem_free(spa->spa_l2cache.sav_vdevs,
		    spa->spa_l2cache.sav_count * sizeof (void *));
		spa->spa_l2cache.sav_vdevs = NULL;
	}
	if (spa->spa_l2cache.sav_config) {
		nvlist_free(spa->spa_l2cache.sav_config);
		spa->spa_l2cache.sav_config = NULL;
	}
	spa->spa_l2cache.sav_count = 0;

	spa->spa_async_suspended = 0;

	spa_config_exit(spa, SCL_ALL, FTAG);
}

/*
 * Load (or re-load) the current list of vdevs describing the active spares for
 * this pool. When this is called, we have some form of basic information in
 * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 */
static void
spa_load_spares(spa_t *spa)
{
	nvlist_t **spares;
	uint_t nspares;
	int i;
	vdev_t *vd, *tvd;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	/*
	 * First, close and free any existing spare vdevs.
	 */
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		vd = spa->spa_spares.sav_vdevs[i];

		/* Undo the call to spa_activate() below */
		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL && tvd->vdev_isspare)
			spa_spare_remove(tvd);
		vdev_close(vd);
		vdev_free(vd);
	}

	if (spa->spa_spares.sav_vdevs)
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));

	if (spa->spa_spares.sav_config == NULL)
		nspares = 0;
	else
		VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

	spa->spa_spares.sav_count = (int)nspares;
	spa->spa_spares.sav_vdevs = NULL;

	if (nspares == 0)
		return;

	/*
	 * Construct the array of vdevs, opening them to get status in the
	 * process. For each spare, there are potentially two different vdev_t
	 * structures associated with it: one in the list of spares (used only
	 * for basic validation purposes) and one in the active vdev
	 * configuration (if it's spared in). During this phase we open and
	 * validate each vdev on the spare list. If the vdev also exists in the
	 * active configuration, then we also mark this vdev as an active spare.
	 */
	spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
		    VDEV_ALLOC_SPARE) == 0);
		ASSERT(vd != NULL);

		spa->spa_spares.sav_vdevs[i] = vd;

		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL) {
			if (!tvd->vdev_isspare)
				spa_spare_add(tvd);

			/*
			 * We only mark the spare active if we were successfully
			 * able to load the vdev. Otherwise, importing a pool
			 * with a bad active spare would result in strange
			 * behavior, because multiple pools would think the
			 * spare is actively in use.
			 *
			 * There is a vulnerability here to an equally bizarre
			 * circumstance, where a dead active spare is later
			 * brought back to life (onlined or otherwise). Given
			 * the rarity of this scenario, and the extra complexity
			 * it adds, we ignore the possibility.
			 */
			if (!vdev_is_dead(tvd))
				spa_spare_activate(tvd);
		}

		vd->vdev_top = vd;
		vd->vdev_aux = &spa->spa_spares;

		if (vdev_open(vd) != 0)
			continue;

		if (vdev_validate_aux(vd) == 0)
			spa_spare_add(vd);
	}

	/*
	 * Recompute the stashed list of spares, with status information
	 * this time.
	 */
	VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		spares[i] = vdev_config_generate(spa,
		    spa->spa_spares.sav_vdevs[i], B_TRUE, B_TRUE, B_FALSE);
	VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
	    ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		nvlist_free(spares[i]);
	kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
}
/*
 * Load (or re-load) the current list of vdevs describing the active l2cache for
 * this pool. When this is called, we have some form of basic information in
 * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 * Devices which are already active have their details maintained, and are
 * not re-opened.
 */
static void
spa_load_l2cache(spa_t *spa)
{
	nvlist_t **l2cache;
	uint_t nl2cache;
	int i, j, oldnvdevs;
	uint64_t guid;
	vdev_t *vd, **oldvdevs, **newvdevs;
	spa_aux_vdev_t *sav = &spa->spa_l2cache;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	if (sav->sav_config != NULL) {
		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
		newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
	} else {
		nl2cache = 0;
	}

	oldvdevs = sav->sav_vdevs;
	oldnvdevs = sav->sav_count;
	sav->sav_vdevs = NULL;
	sav->sav_count = 0;

	/*
	 * Process new nvlist of vdevs.
	 */
	for (i = 0; i < nl2cache; i++) {
		VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
		    &guid) == 0);

		newvdevs[i] = NULL;
		for (j = 0; j < oldnvdevs; j++) {
			vd = oldvdevs[j];
			if (vd != NULL && guid == vd->vdev_guid) {
				/*
				 * Retain previous vdev for add/remove ops.
				 */
				newvdevs[i] = vd;
				oldvdevs[j] = NULL;
				break;
			}
		}

		if (newvdevs[i] == NULL) {
			/*
			 * Create new vdev
			 */
			VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
			    VDEV_ALLOC_L2CACHE) == 0);
			ASSERT(vd != NULL);
			newvdevs[i] = vd;

			/*
			 * Commit this vdev as an l2cache device,
			 * even if it fails to open.
			 */
			spa_l2cache_add(vd);

			vd->vdev_top = vd;
			vd->vdev_aux = sav;

			spa_l2cache_activate(vd);

			if (vdev_open(vd) != 0)
				continue;

			(void) vdev_validate_aux(vd);

			if (!vdev_is_dead(vd))
				l2arc_add_vdev(spa, vd);
		}
	}

	/*
	 * Purge vdevs that were dropped
	 */
	for (i = 0; i < oldnvdevs; i++) {
		uint64_t pool;

		vd = oldvdevs[i];
		if (vd != NULL) {
			if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
			    pool != 0ULL && l2arc_vdev_present(vd))
				l2arc_remove_vdev(vd);
			(void) vdev_close(vd);
			spa_l2cache_remove(vd);
		}
	}

	if (oldvdevs)
		kmem_free(oldvdevs, oldnvdevs * sizeof (void *));

	if (sav->sav_config == NULL)
		goto out;

	sav->sav_vdevs = newvdevs;
	sav->sav_count = (int)nl2cache;

	/*
	 * Recompute the stashed list of l2cache devices, with status
	 * information this time.
	 */
	VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
	for (i = 0; i < sav->sav_count; i++)
		l2cache[i] = vdev_config_generate(spa,
		    sav->sav_vdevs[i], B_TRUE, B_FALSE, B_TRUE);
	VERIFY(nvlist_add_nvlist_array(sav->sav_config,
	    ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
out:
	for (i = 0; i < sav->sav_count; i++)
		nvlist_free(l2cache[i]);
	if (sav->sav_count)
		kmem_free(l2cache, sav->sav_count * sizeof (void *));
}
static int
load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
{
	dmu_buf_t *db;
	char *packed = NULL;
	size_t nvsize = 0;
	int error;
	*value = NULL;

	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
	nvsize = *(uint64_t *)db->db_data;
	dmu_buf_rele(db, FTAG);

	packed = kmem_alloc(nvsize, KM_SLEEP);
	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
	    DMU_READ_PREFETCH);
	if (error == 0)
		error = nvlist_unpack(packed, nvsize, value, 0);
	kmem_free(packed, nvsize);

	return (error);
}
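/*
 * On-disk layout consumed by load_nvlist() above (sketch): the object's
 * bonus buffer holds a single uint64_t giving the packed size, and the
 * object data holds the packed nvlist itself:
 *
 *	bonus:	[ nvsize ]
 *	data:	[ packed nvlist, nvsize bytes ]
 */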
/*
 * Checks to see if the given vdev could not be opened, in which case we post a
 * sysevent to notify the autoreplace code that the device has been removed.
 */
static void
spa_check_removed(vdev_t *vd)
{
	for (int c = 0; c < vd->vdev_children; c++)
		spa_check_removed(vd->vdev_child[c]);

	if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) {
		zfs_post_autoreplace(vd->vdev_spa, vd);
		spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK);
	}
}

/*
 * Load the slog device state from the config object since it's possible
 * that the label does not contain the most up-to-date information.
 */
void
spa_load_log_state(spa_t *spa, nvlist_t *nv)
{
	vdev_t *ovd, *rvd = spa->spa_root_vdev;

	/*
	 * Load the original root vdev tree from the passed config.
	 */
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	VERIFY(spa_config_parse(spa, &ovd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);

	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *cvd = rvd->vdev_child[c];
		if (cvd->vdev_islog)
			vdev_load_log_state(cvd, ovd->vdev_child[c]);
	}
	vdev_free(ovd);
	spa_config_exit(spa, SCL_ALL, FTAG);
}

/*
 * Check for missing log devices
 */
int
spa_check_logs(spa_t *spa)
{
	switch (spa->spa_log_state) {
	case SPA_LOG_MISSING:
		/* need to recheck in case slog has been restored */
	case SPA_LOG_UNKNOWN:
		if (dmu_objset_find(spa->spa_name, zil_check_log_chain, NULL,
		    DS_FIND_CHILDREN)) {
			spa_set_log_state(spa, SPA_LOG_MISSING);
			return (1);
		}
		break;
	}
	return (0);
}

static boolean_t
spa_passivate_log(spa_t *spa)
{
	vdev_t *rvd = spa->spa_root_vdev;
	boolean_t slog_found = B_FALSE;

	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));

	if (!spa_has_slogs(spa))
		return (B_FALSE);

	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		if (tvd->vdev_islog) {
			metaslab_group_passivate(mg);
			slog_found = B_TRUE;
		}
	}

	return (slog_found);
}

static void
spa_activate_log(spa_t *spa)
{
	vdev_t *rvd = spa->spa_root_vdev;

	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));

	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		if (tvd->vdev_islog)
			metaslab_group_activate(mg);
	}
}

int
spa_offline_log(spa_t *spa)
{
	int error = 0;

	if ((error = dmu_objset_find(spa_name(spa), zil_vdev_offline,
	    NULL, DS_FIND_CHILDREN)) == 0) {

		/*
		 * We successfully offlined the log device, sync out the
		 * current txg so that the "stubby" block can be removed
		 * by zil_sync().
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);
	}
	return (error);
}
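/*
 * spa_passivate_log() and spa_activate_log() above are a pair: the former
 * takes every top-level log vdev's metaslab group out of allocation
 * (returning B_TRUE if any slog was found) and the latter puts them back.
 * Both require SCL_ALLOC held as writer, per their ASSERTs.
 */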
static void
spa_aux_check_removed(spa_aux_vdev_t *sav)
{
	for (int i = 0; i < sav->sav_count; i++)
		spa_check_removed(sav->sav_vdevs[i]);
}

void
spa_claim_notify(zio_t *zio)
{
	spa_t *spa = zio->io_spa;

	if (zio->io_error)
		return;

	mutex_enter(&spa->spa_props_lock);	/* any mutex will do */
	if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
		spa->spa_claim_max_txg = zio->io_bp->blk_birth;
	mutex_exit(&spa->spa_props_lock);
}

typedef struct spa_load_error {
	uint64_t	sle_meta_count;
	uint64_t	sle_data_count;
} spa_load_error_t;

static void
spa_load_verify_done(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	spa_load_error_t *sle = zio->io_private;
	dmu_object_type_t type = BP_GET_TYPE(bp);
	int error = zio->io_error;

	if (error) {
		if ((BP_GET_LEVEL(bp) != 0 || dmu_ot[type].ot_metadata) &&
		    type != DMU_OT_INTENT_LOG)
			atomic_add_64(&sle->sle_meta_count, 1);
		else
			atomic_add_64(&sle->sle_data_count, 1);
	}
	zio_data_buf_free(zio->io_data, zio->io_size);
}

/*ARGSUSED*/
static int
spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
{
	if (bp != NULL) {
		zio_t *rio = arg;
		size_t size = BP_GET_PSIZE(bp);
		void *data = zio_data_buf_alloc(size);

		zio_nowait(zio_read(rio, spa, bp, data, size,
		    spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
		    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
		    ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
	}
	return (0);
}

static int
spa_load_verify(spa_t *spa)
{
	zio_t *rio;
	spa_load_error_t sle = { 0 };
	zpool_rewind_policy_t policy;
	boolean_t verify_ok = B_FALSE;
	int error;

	zpool_get_rewind_policy(spa->spa_config, &policy);

	if (policy.zrp_request & ZPOOL_NEVER_REWIND)
		return (0);

	rio = zio_root(spa, NULL, &sle,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);

	error = traverse_pool(spa, spa->spa_verify_min_txg,
	    TRAVERSE_PRE | TRAVERSE_PREFETCH, spa_load_verify_cb, rio);

	(void) zio_wait(rio);

	spa->spa_load_meta_errors = sle.sle_meta_count;
	spa->spa_load_data_errors = sle.sle_data_count;

	if (!error && sle.sle_meta_count <= policy.zrp_maxmeta &&
	    sle.sle_data_count <= policy.zrp_maxdata) {
		verify_ok = B_TRUE;
		spa->spa_load_txg = spa->spa_uberblock.ub_txg;
		spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;
	} else {
		spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
	}

	if (error) {
		if (error != ENXIO && error != EIO)
			error = EIO;
		return (error);
	}

	return (verify_ok ? 0 : EIO);
}
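/*
 * Rewind policy interaction (sketch): with ZPOOL_NEVER_REWIND the verify
 * pass above is skipped entirely. Otherwise the pool is traversed from
 * spa_verify_min_txg, and the load is judged acceptable only if the
 * metadata and data error counts stay within the policy's zrp_maxmeta
 * and zrp_maxdata limits; on failure, spa_load_max_txg is primed for a
 * possible rewind by spa_load_best().
 */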
/*
 * Find a value in the pool props object.
 */
static void
spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val)
{
	(void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object,
	    zpool_prop_to_name(prop), sizeof (uint64_t), 1, val);
}

/*
 * Find a value in the pool directory object.
 */
static int
spa_dir_prop(spa_t *spa, const char *name, uint64_t *val)
{
	return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    name, sizeof (uint64_t), 1, val));
}

static int
spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
{
	vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux);
	return (err);
}

/*
 * Fix up config after a partly-completed split. This is done with the
 * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off
 * pool have that entry in their config, but only the splitting one contains
 * a list of all the guids of the vdevs that are being split off.
 *
 * This function determines what to do with that list: either rejoin
 * all the disks to the pool, or complete the splitting process. To attempt
 * the rejoin, each disk that is offlined is marked online again, and
 * we do a reopen() call. If the vdev label for every disk that was
 * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL)
 * then we call vdev_split() on each disk, and complete the split.
 *
 * Otherwise we leave the config alone, with all the vdevs in place in
 * the original pool.
 */
static void
spa_try_repair(spa_t *spa, nvlist_t *config)
{
	uint_t extracted;
	uint64_t *glist;
	uint_t i, gcount;
	nvlist_t *nvl;
	vdev_t **vd;
	boolean_t attempt_reopen;

	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0)
		return;

	/* check that the config is complete */
	if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
	    &glist, &gcount) != 0)
		return;

	vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP);

	/* attempt to online all the vdevs & validate */
	attempt_reopen = B_TRUE;
	for (i = 0; i < gcount; i++) {
		if (glist[i] == 0)	/* vdev is hole */
			continue;

		vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE);
		if (vd[i] == NULL) {
			/*
			 * Don't bother attempting to reopen the disks;
			 * just do the split.
			 */
			attempt_reopen = B_FALSE;
		} else {
			/* attempt to re-online it */
			vd[i]->vdev_offline = B_FALSE;
		}
	}

	if (attempt_reopen) {
		vdev_reopen(spa->spa_root_vdev);

		/* check each device to see what state it's in */
		for (extracted = 0, i = 0; i < gcount; i++) {
			if (vd[i] != NULL &&
			    vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL)
				break;
			++extracted;
		}
	}

	/*
	 * If every disk has been moved to the new pool, or if we never
	 * even attempted to look at them, then we split them off for
	 * good.
	 */
	if (!attempt_reopen || gcount == extracted) {
		for (i = 0; i < gcount; i++)
			if (vd[i] != NULL)
				vdev_split(vd[i]);
		vdev_reopen(spa->spa_root_vdev);
	}

	kmem_free(vd, gcount * sizeof (vdev_t *));
}
static int
spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type,
    boolean_t mosconfig)
{
	nvlist_t *config = spa->spa_config;
	char *ereport = FM_EREPORT_ZFS_POOL;
	int error;
	uint64_t pool_guid;
	nvlist_t *nvl;

	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid))
		return (EINVAL);

	/*
	 * Versioning wasn't explicitly added to the label until later, so if
	 * it's not present treat it as the initial version.
	 */
	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
	    &spa->spa_ubsync.ub_version) != 0)
		spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;

	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
	    &spa->spa_config_txg);

	if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
	    spa_guid_exists(pool_guid, 0)) {
		error = EEXIST;
	} else {
		spa->spa_load_guid = pool_guid;

		if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT,
		    &nvl) == 0) {
			VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting,
			    KM_SLEEP) == 0);
		}

		error = spa_load_impl(spa, pool_guid, config, state, type,
		    mosconfig, &ereport);
	}

	spa->spa_minref = refcount_count(&spa->spa_refcount);
	if (error && error != EBADF)
		zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
	spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
	spa->spa_ena = 0;

	return (error);
}

/*
 * Load an existing storage pool, using the pool's builtin spa_config as a
 * source of configuration information.
 */
static int
spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
    spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
    char **ereport)
{
	int error = 0;
	nvlist_t *nvconfig, *nvroot = NULL;
	vdev_t *rvd;
	uberblock_t *ub = &spa->spa_uberblock;
	uint64_t config_cache_txg = spa->spa_config_txg;
	int orig_mode = spa->spa_mode;
	int parse;

	/*
	 * If this is an untrusted config, access the pool in read-only mode.
	 * This prevents things like resilvering recently removed devices.
	 */
	if (!mosconfig)
		spa->spa_mode = FREAD;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	spa->spa_load_state = state;

	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot))
		return (EINVAL);

	parse = (type == SPA_IMPORT_EXISTING ?
	    VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);

	/*
	 * Create "The Godfather" zio to hold all async IOs
	 */
	spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);

	/*
	 * Parse the configuration into a vdev tree. We explicitly set the
	 * value that will be returned by spa_version() since parsing the
	 * configuration requires knowing the version number.
	 */
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse);
	spa_config_exit(spa, SCL_ALL, FTAG);

	if (error != 0)
		return (error);

	ASSERT(spa->spa_root_vdev == rvd);

	if (type != SPA_IMPORT_ASSEMBLE) {
		ASSERT(spa_guid(spa) == pool_guid);
	}

	/*
	 * Try to open all vdevs, loading each label in the process.
	 */
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	error = vdev_open(rvd);
	spa_config_exit(spa, SCL_ALL, FTAG);
	if (error != 0)
		return (error);
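	/*
	 * Note on the "Godfather" zio created earlier: it is a root zio
	 * flagged ZIO_FLAG_GODFATHER that adopts async child I/Os, which
	 * is what lets spa_unload() quiesce them with a single zio_wait()
	 * on spa_async_zio_root.
	 */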
1727 * 1728 * If we're assembling a new pool that's been split off from an 1729 * existing pool, the labels haven't yet been updated so we skip 1730 * validation for now. 1731 */ 1732 if (type != SPA_IMPORT_ASSEMBLE) { 1733 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1734 error = vdev_validate(rvd); 1735 spa_config_exit(spa, SCL_ALL, FTAG); 1736 1737 if (error != 0) 1738 return (error); 1739 1740 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) 1741 return (ENXIO); 1742 } 1743 1744 /* 1745 * Find the best uberblock. 1746 */ 1747 vdev_uberblock_load(NULL, rvd, ub); 1748 1749 /* 1750 * If we weren't able to find a single valid uberblock, return failure. 1751 */ 1752 if (ub->ub_txg == 0) 1753 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); 1754 1755 /* 1756 * If the pool is newer than the code, we can't open it. 1757 */ 1758 if (ub->ub_version > SPA_VERSION) 1759 return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); 1760 1761 /* 1762 * If the vdev guid sum doesn't match the uberblock, we have an 1763 * incomplete configuration. 1764 */ 1765 if (mosconfig && type != SPA_IMPORT_ASSEMBLE && 1766 rvd->vdev_guid_sum != ub->ub_guid_sum) 1767 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); 1768 1769 if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { 1770 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1771 spa_try_repair(spa, config); 1772 spa_config_exit(spa, SCL_ALL, FTAG); 1773 nvlist_free(spa->spa_config_splitting); 1774 spa->spa_config_splitting = NULL; 1775 } 1776 1777 /* 1778 * Initialize internal SPA structures. 1779 */ 1780 spa->spa_state = POOL_STATE_ACTIVE; 1781 spa->spa_ubsync = spa->spa_uberblock; 1782 spa->spa_verify_min_txg = spa->spa_extreme_rewind ? 1783 TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; 1784 spa->spa_first_txg = spa->spa_last_ubsync_txg ? 1785 spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; 1786 spa->spa_claim_max_txg = spa->spa_first_txg; 1787 1788 error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 1789 if (error) 1790 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 1791 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 1792 1793 if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0) 1794 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 1795 1796 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) 1797 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 1798 1799 if (!mosconfig) { 1800 uint64_t hostid; 1801 nvlist_t *policy = NULL; 1802 1803 if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig, 1804 ZPOOL_CONFIG_HOSTID, &hostid) == 0) { 1805 char *hostname; 1806 unsigned long myhostid = 0; 1807 1808 VERIFY(nvlist_lookup_string(nvconfig, 1809 ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); 1810 1811#ifdef _KERNEL 1812 myhostid = zone_get_hostid(NULL); 1813#else /* _KERNEL */ 1814 /* 1815 * We're emulating the system's hostid in userland, so 1816 * we can't use zone_get_hostid(). 1817 */ 1818 (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); 1819#endif /* _KERNEL */ 1820 if (hostid != 0 && myhostid != 0 && 1821 hostid != myhostid) { 1822 cmn_err(CE_WARN, "pool '%s' could not be " 1823 "loaded as it was last accessed by " 1824 "another system (host: %s hostid: 0x%lx). 
" 1825 "See: http://www.sun.com/msg/ZFS-8000-EY", 1826 spa_name(spa), hostname, 1827 (unsigned long)hostid); 1828 return (EBADF); 1829 } 1830 } 1831 if (nvlist_lookup_nvlist(spa->spa_config, 1832 ZPOOL_REWIND_POLICY, &policy) == 0) 1833 VERIFY(nvlist_add_nvlist(nvconfig, 1834 ZPOOL_REWIND_POLICY, policy) == 0); 1835 1836 spa_config_set(spa, nvconfig); 1837 spa_unload(spa); 1838 spa_deactivate(spa); 1839 spa_activate(spa, orig_mode); 1840 1841 return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE)); 1842 } 1843 1844 if (spa_dir_prop(spa, DMU_POOL_SYNC_BPLIST, 1845 &spa->spa_deferred_bplist_obj) != 0) 1846 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 1847 1848 /* 1849 * Load the bit that tells us to use the new accounting function 1850 * (raid-z deflation). If we have an older pool, this will not 1851 * be present. 1852 */ 1853 error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate); 1854 if (error != 0 && error != ENOENT) 1855 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 1856 1857 /* 1858 * Load the persistent error log. If we have an older pool, this will 1859 * not be present. 1860 */ 1861 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last); 1862 if (error != 0 && error != ENOENT) 1863 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 1864 1865 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, 1866 &spa->spa_errlog_scrub); 1867 if (error != 0 && error != ENOENT) 1868 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 1869 1870 /* 1871 * Load the history object. If we have an older pool, this 1872 * will not be present. 1873 */ 1874 error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history); 1875 if (error != 0 && error != ENOENT) 1876 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 1877 1878 /* 1879 * If we're assembling the pool from the split-off vdevs of 1880 * an existing pool, we don't want to attach the spares & cache 1881 * devices. 1882 */ 1883 1884 /* 1885 * Load any hot spares for this pool. 1886 */ 1887 error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object); 1888 if (error != 0 && error != ENOENT) 1889 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 1890 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 1891 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 1892 if (load_nvlist(spa, spa->spa_spares.sav_object, 1893 &spa->spa_spares.sav_config) != 0) 1894 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 1895 1896 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1897 spa_load_spares(spa); 1898 spa_config_exit(spa, SCL_ALL, FTAG); 1899 } else if (error == 0) { 1900 spa->spa_spares.sav_sync = B_TRUE; 1901 } 1902 1903 /* 1904 * Load any level 2 ARC devices for this pool. 
	/*
	 * Load any level 2 ARC devices for this pool.
	 */
	error = spa_dir_prop(spa, DMU_POOL_L2CACHE,
	    &spa->spa_l2cache.sav_object);
	if (error != 0 && error != ENOENT)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
	if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
		ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
		if (load_nvlist(spa, spa->spa_l2cache.sav_object,
		    &spa->spa_l2cache.sav_config) != 0)
			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		spa_load_l2cache(spa);
		spa_config_exit(spa, SCL_ALL, FTAG);
	} else if (error == 0) {
		spa->spa_l2cache.sav_sync = B_TRUE;
	}

	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);

	error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object);
	if (error && error != ENOENT)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

	if (error == 0) {
		uint64_t autoreplace;

		spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs);
		spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace);
		spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation);
		spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode);
		spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand);
		spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO,
		    &spa->spa_dedup_ditto);

		spa->spa_autoreplace = (autoreplace != 0);
	}

	/*
	 * If the 'autoreplace' property is set, then post a resource notifying
	 * the ZFS DE that it should not issue any faults for unopenable
	 * devices. We also iterate over the vdevs, and post a sysevent for any
	 * unopenable vdevs so that the normal autoreplace handler can take
	 * over.
	 */
	if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) {
		spa_check_removed(spa->spa_root_vdev);
		/*
		 * For the import case, this is done in spa_import(), because
		 * at this point we're using the spare definitions from
		 * the MOS config, not necessarily from the userland config.
		 */
		if (state != SPA_LOAD_IMPORT) {
			spa_aux_check_removed(&spa->spa_spares);
			spa_aux_check_removed(&spa->spa_l2cache);
		}
	}

	/*
	 * Load the vdev state for all toplevel vdevs.
	 */
	vdev_load(rvd);

	/*
	 * Propagate the leaf DTLs we just loaded all the way up the tree.
	 */
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
	spa_config_exit(spa, SCL_ALL, FTAG);

	/*
	 * Check the state of the root vdev. If it can't be opened, it
	 * indicates one or more toplevel vdevs are faulted.
	 */
	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
		return (ENXIO);

	/*
	 * Load the DDTs (dedup tables).
	 */
	error = ddt_load(spa);
	if (error != 0)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

	spa_update_dspace(spa);

	if (state != SPA_LOAD_TRYIMPORT) {
		error = spa_load_verify(spa);
		if (error)
			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
			    error));
	}
	 */
	if (type != SPA_IMPORT_ASSEMBLE) {
		VERIFY(nvlist_lookup_nvlist(nvconfig, ZPOOL_CONFIG_VDEV_TREE,
		    &nvroot) == 0);
		spa_load_log_state(spa, nvroot);
		nvlist_free(nvconfig);

		if (spa_check_logs(spa)) {
			*ereport = FM_EREPORT_ZFS_LOG_REPLAY;
			return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO));
		}
	}

	if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER ||
	    spa->spa_load_max_txg == UINT64_MAX)) {
		dmu_tx_t *tx;
		int need_update = B_FALSE;

		ASSERT(state != SPA_LOAD_TRYIMPORT);

		/*
		 * Claim log blocks that haven't been committed yet.
		 * This must all happen in a single txg.
		 * Note: spa_claim_max_txg is updated by spa_claim_notify(),
		 * invoked from zil_claim_log_block()'s i/o done callback.
		 * Price of rollback is that we abandon the log.
		 */
		spa->spa_claiming = B_TRUE;

		tx = dmu_tx_create_assigned(spa_get_dsl(spa),
		    spa_first_txg(spa));
		(void) dmu_objset_find(spa_name(spa),
		    zil_claim, tx, DS_FIND_CHILDREN);
		dmu_tx_commit(tx);

		spa->spa_claiming = B_FALSE;

		spa_set_log_state(spa, SPA_LOG_GOOD);
		spa->spa_sync_on = B_TRUE;
		txg_sync_start(spa->spa_dsl_pool);

		/*
		 * Wait for all claims to sync.  We sync up to the highest
		 * claimed log block birth time so that claimed log blocks
		 * don't appear to be from the future.  spa_claim_max_txg
		 * will have been set for us by either zil_check_log_chain()
		 * (invoked from spa_check_logs()) or zil_claim() above.
		 */
		txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);

		/*
		 * If the config cache is stale, or we have uninitialized
		 * metaslabs (see spa_vdev_add()), then update the config.
		 *
		 * If spa_load_verbatim is true, trust the current
		 * in-core spa_config and update the disk labels.
		 */
		if (config_cache_txg != spa->spa_config_txg ||
		    state == SPA_LOAD_IMPORT || spa->spa_load_verbatim ||
		    state == SPA_LOAD_RECOVER)
			need_update = B_TRUE;

		for (int c = 0; c < rvd->vdev_children; c++)
			if (rvd->vdev_child[c]->vdev_ms_array == 0)
				need_update = B_TRUE;

		/*
		 * Update the config cache asynchronously in case we're the
		 * root pool, in which case the config cache isn't writable
		 * yet.
		 */
		if (need_update)
			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);

		/*
		 * Check all DTLs to see if anything needs resilvering.
		 */
		if (vdev_resilver_needed(rvd, NULL, NULL))
			spa_async_request(spa, SPA_ASYNC_RESILVER);

		/*
		 * Delete any inconsistent datasets.
		 */
		(void) dmu_objset_find(spa_name(spa),
		    dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);

		/*
		 * Clean up any stale temporary dataset userrefs.
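		 * These are holds that were meant to be short-lived (taken,
		 * for example, by an in-progress send or receive) and should
		 * not survive a reboot, so we discard them on every load.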
		 */
		dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
	}

	return (0);
}

static int
spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig)
{
	spa_unload(spa);
	spa_deactivate(spa);

	spa->spa_load_max_txg--;

	spa_activate(spa, spa_mode_global);
	spa_async_suspend(spa);

	return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig));
}

static int
spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig,
    uint64_t max_request, int rewind_flags)
{
	nvlist_t *config = NULL;
	int load_error, rewind_error;
	uint64_t safe_rewind_txg;
	uint64_t min_txg;

	if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) {
		spa->spa_load_max_txg = spa->spa_load_txg;
		spa_set_log_state(spa, SPA_LOG_CLEAR);
	} else {
		spa->spa_load_max_txg = max_request;
	}

	load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING,
	    mosconfig);
	if (load_error == 0)
		return (0);

	if (spa->spa_root_vdev != NULL)
		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);

	spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg;
	spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp;

	if (rewind_flags & ZPOOL_NEVER_REWIND) {
		nvlist_free(config);
		return (load_error);
	}

	/* Price of rolling back is discarding txgs, including log */
	if (state == SPA_LOAD_RECOVER)
		spa_set_log_state(spa, SPA_LOG_CLEAR);

	spa->spa_load_max_txg = spa->spa_last_ubsync_txg;
	safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE;
	min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ?
	    TXG_INITIAL : safe_rewind_txg;

	/*
	 * Continue as long as we're finding errors, we're still within
	 * the acceptable rewind range, and we're still finding uberblocks.
	 */
	while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg &&
	    spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
		if (spa->spa_load_max_txg < safe_rewind_txg)
			spa->spa_extreme_rewind = B_TRUE;
		rewind_error = spa_load_retry(spa, state, mosconfig);
	}

	if (config)
		spa_rewind_data_to_nvlist(spa, config);

	spa->spa_extreme_rewind = B_FALSE;
	spa->spa_load_max_txg = UINT64_MAX;

	if (config && (rewind_error || state != SPA_LOAD_RECOVER))
		spa_config_set(spa, config);

	return (state == SPA_LOAD_RECOVER ? rewind_error : load_error);
}

/*
 * Pool Open/Import
 *
 * The import case is identical to an open except that the configuration is
 * sent down from userland, instead of grabbed from the configuration cache.
 * For the case of an open, the pool configuration will exist in the
 * POOL_STATE_UNINITIALIZED state.
 *
 * The stats information (gen/count/ustats) is used to gather vdev statistics
 * at the same time we open the pool, without having to keep around the spa_t
 * in some ambiguous state.
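 *
 * For illustration, the usual open/close pattern from a consumer looks
 * like this (a sketch only; the pool name and surrounding caller are
 * hypothetical):
 *
 *	spa_t *spa;
 *	int error;
 *
 *	if ((error = spa_open("tank", &spa, FTAG)) != 0)
 *		return (error);
 *	... operate on the pool ...
 *	spa_close(spa, FTAG);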
 */
static int
spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
    nvlist_t **config)
{
	spa_t *spa;
	zpool_rewind_policy_t policy;
	spa_load_state_t state = SPA_LOAD_OPEN;
	int error;
	int locked = B_FALSE;

	*spapp = NULL;

	/*
	 * As disgusting as this is, we need to support recursive calls to this
	 * function because dsl_dir_open() is called during spa_load(), and ends
	 * up calling spa_open() again.  The real fix is to figure out how to
	 * avoid dsl_dir_open() calling this in the first place.
	 */
	if (mutex_owner(&spa_namespace_lock) != curthread) {
		mutex_enter(&spa_namespace_lock);
		locked = B_TRUE;
	}

	if ((spa = spa_lookup(pool)) == NULL) {
		if (locked)
			mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}

	zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config, &policy);
	if (policy.zrp_request & ZPOOL_DO_REWIND)
		state = SPA_LOAD_RECOVER;

	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {

		spa_activate(spa, spa_mode_global);

		if (spa->spa_last_open_failed && (policy.zrp_request &
		    (ZPOOL_NO_REWIND | ZPOOL_NEVER_REWIND))) {
			if (config != NULL && spa->spa_config)
				VERIFY(nvlist_dup(spa->spa_config,
				    config, KM_SLEEP) == 0);
			spa_deactivate(spa);
			if (locked)
				mutex_exit(&spa_namespace_lock);
			return (spa->spa_last_open_failed);
		}

		if (state != SPA_LOAD_RECOVER)
			spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;

		error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg,
		    policy.zrp_request);

		if (error == EBADF) {
			/*
			 * If vdev_validate() returns failure (indicated by
			 * EBADF), it means that one of the vdevs indicates
			 * that the pool has been exported or destroyed.  If
			 * this is the case, the config cache is out of sync
			 * and we should remove the pool from the namespace.
			 */
			spa_unload(spa);
			spa_deactivate(spa);
			spa_config_sync(spa, B_TRUE, B_TRUE);
			spa_remove(spa);
			if (locked)
				mutex_exit(&spa_namespace_lock);
			return (ENOENT);
		}

		if (error) {
			/*
			 * We can't open the pool, but we still have useful
			 * information: the state of each vdev after the
			 * attempted vdev_open().  Return this to the user.
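			 * Callers (the import path, for example) can walk
			 * this config's vdev tree and report per-device
			 * state even though the open itself failed.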
2261 */ 2262 if (config != NULL && spa->spa_config) 2263 VERIFY(nvlist_dup(spa->spa_config, config, 2264 KM_SLEEP) == 0); 2265 spa_unload(spa); 2266 spa_deactivate(spa); 2267 spa->spa_last_open_failed = error; 2268 if (locked) 2269 mutex_exit(&spa_namespace_lock); 2270 *spapp = NULL; 2271 return (error); 2272 } 2273 2274 } 2275 2276 spa_open_ref(spa, tag); 2277 2278 2279 if (config != NULL) 2280 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 2281 2282 if (locked) { 2283 spa->spa_last_open_failed = 0; 2284 spa->spa_last_ubsync_txg = 0; 2285 spa->spa_load_txg = 0; 2286 mutex_exit(&spa_namespace_lock); 2287 } 2288 2289 *spapp = spa; 2290 2291 return (0); 2292} 2293 2294int 2295spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy, 2296 nvlist_t **config) 2297{ 2298 return (spa_open_common(name, spapp, tag, policy, config)); 2299} 2300 2301int 2302spa_open(const char *name, spa_t **spapp, void *tag) 2303{ 2304 return (spa_open_common(name, spapp, tag, NULL, NULL)); 2305} 2306 2307/* 2308 * Lookup the given spa_t, incrementing the inject count in the process, 2309 * preventing it from being exported or destroyed. 2310 */ 2311spa_t * 2312spa_inject_addref(char *name) 2313{ 2314 spa_t *spa; 2315 2316 mutex_enter(&spa_namespace_lock); 2317 if ((spa = spa_lookup(name)) == NULL) { 2318 mutex_exit(&spa_namespace_lock); 2319 return (NULL); 2320 } 2321 spa->spa_inject_ref++; 2322 mutex_exit(&spa_namespace_lock); 2323 2324 return (spa); 2325} 2326 2327void 2328spa_inject_delref(spa_t *spa) 2329{ 2330 mutex_enter(&spa_namespace_lock); 2331 spa->spa_inject_ref--; 2332 mutex_exit(&spa_namespace_lock); 2333} 2334 2335/* 2336 * Add spares device information to the nvlist. 2337 */ 2338static void 2339spa_add_spares(spa_t *spa, nvlist_t *config) 2340{ 2341 nvlist_t **spares; 2342 uint_t i, nspares; 2343 nvlist_t *nvroot; 2344 uint64_t guid; 2345 vdev_stat_t *vs; 2346 uint_t vsc; 2347 uint64_t pool; 2348 2349 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 2350 2351 if (spa->spa_spares.sav_count == 0) 2352 return; 2353 2354 VERIFY(nvlist_lookup_nvlist(config, 2355 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 2356 VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 2357 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 2358 if (nspares != 0) { 2359 VERIFY(nvlist_add_nvlist_array(nvroot, 2360 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 2361 VERIFY(nvlist_lookup_nvlist_array(nvroot, 2362 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 2363 2364 /* 2365 * Go through and find any spares which have since been 2366 * repurposed as an active spare. If this is the case, update 2367 * their status appropriately. 2368 */ 2369 for (i = 0; i < nspares; i++) { 2370 VERIFY(nvlist_lookup_uint64(spares[i], 2371 ZPOOL_CONFIG_GUID, &guid) == 0); 2372 if (spa_spare_exists(guid, &pool, NULL) && 2373 pool != 0ULL) { 2374 VERIFY(nvlist_lookup_uint64_array( 2375 spares[i], ZPOOL_CONFIG_STATS, 2376 (uint64_t **)&vs, &vsc) == 0); 2377 vs->vs_state = VDEV_STATE_CANT_OPEN; 2378 vs->vs_aux = VDEV_AUX_SPARED; 2379 } 2380 } 2381 } 2382} 2383 2384/* 2385 * Add l2cache device information to the nvlist, including vdev stats. 
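 * Unlike spa_add_spares(), which only patches up the state of spares that
 * are in use elsewhere, this refreshes each cache device's stats in place
 * via vdev_get_stats().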
 */
static void
spa_add_l2cache(spa_t *spa, nvlist_t *config)
{
	nvlist_t **l2cache;
	uint_t i, j, nl2cache;
	nvlist_t *nvroot;
	uint64_t guid;
	vdev_t *vd;
	vdev_stat_t *vs;
	uint_t vsc;

	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));

	if (spa->spa_l2cache.sav_count == 0)
		return;

	VERIFY(nvlist_lookup_nvlist(config,
	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
	VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
	if (nl2cache != 0) {
		VERIFY(nvlist_add_nvlist_array(nvroot,
		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
		VERIFY(nvlist_lookup_nvlist_array(nvroot,
		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);

		/*
		 * Update level 2 cache device stats.
		 */

		for (i = 0; i < nl2cache; i++) {
			VERIFY(nvlist_lookup_uint64(l2cache[i],
			    ZPOOL_CONFIG_GUID, &guid) == 0);

			vd = NULL;
			for (j = 0; j < spa->spa_l2cache.sav_count; j++) {
				if (guid ==
				    spa->spa_l2cache.sav_vdevs[j]->vdev_guid) {
					vd = spa->spa_l2cache.sav_vdevs[j];
					break;
				}
			}
			ASSERT(vd != NULL);

			VERIFY(nvlist_lookup_uint64_array(l2cache[i],
			    ZPOOL_CONFIG_STATS, (uint64_t **)&vs, &vsc) == 0);
			vdev_get_stats(vd, vs);
		}
	}
}

int
spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
{
	int error;
	spa_t *spa;

	*config = NULL;
	error = spa_open_common(name, &spa, FTAG, NULL, config);

	if (spa != NULL) {
		/*
		 * This still leaves a window of inconsistency where the spares
		 * or l2cache devices could change and the config would be
		 * self-inconsistent.
		 */
		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

		if (*config != NULL) {
			VERIFY(nvlist_add_uint64(*config,
			    ZPOOL_CONFIG_ERRCOUNT,
			    spa_get_errlog_size(spa)) == 0);

			if (spa_suspended(spa))
				VERIFY(nvlist_add_uint64(*config,
				    ZPOOL_CONFIG_SUSPENDED,
				    spa->spa_failmode) == 0);

			spa_add_spares(spa, *config);
			spa_add_l2cache(spa, *config);
		}
	}

	/*
	 * We want to get the alternate root even for faulted pools, so we cheat
	 * and call spa_lookup() directly.
	 */
	if (altroot) {
		if (spa == NULL) {
			mutex_enter(&spa_namespace_lock);
			spa = spa_lookup(name);
			if (spa)
				spa_altroot(spa, altroot, buflen);
			else
				altroot[0] = '\0';
			spa = NULL;
			mutex_exit(&spa_namespace_lock);
		} else {
			spa_altroot(spa, altroot, buflen);
		}
	}

	if (spa != NULL) {
		spa_config_exit(spa, SCL_CONFIG, FTAG);
		spa_close(spa, FTAG);
	}

	return (error);
}

/*
 * Validate that the auxiliary device array is well formed.  We must have an
 * array of nvlists, each of which describes a valid leaf vdev.  If this is an
 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be
 * specified, as long as they are well-formed.
 */
static int
spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
    spa_aux_vdev_t *sav, const char *config, uint64_t version,
    vdev_labeltype_t label)
{
	nvlist_t **dev;
	uint_t i, ndev;
	vdev_t *vd;
	int error;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	/*
	 * It's acceptable to have no devs specified.
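	 * A missing array simply means the caller isn't adding devices of
	 * this class; an array that is present but empty is rejected below.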
	 */
	if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
		return (0);

	if (ndev == 0)
		return (EINVAL);

	/*
	 * Make sure the pool is formatted with a version that supports this
	 * device type.
	 */
	if (spa_version(spa) < version)
		return (ENOTSUP);

	/*
	 * Set the pending device list so we correctly handle device in-use
	 * checking.
	 */
	sav->sav_pending = dev;
	sav->sav_npending = ndev;

	for (i = 0; i < ndev; i++) {
		if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
		    mode)) != 0)
			goto out;

		if (!vd->vdev_ops->vdev_op_leaf) {
			vdev_free(vd);
			error = EINVAL;
			goto out;
		}

		/*
		 * The L2ARC currently only supports disk devices in
		 * kernel context.  For user-level testing, we allow it.
		 */
#ifdef _KERNEL
		if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) &&
		    strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) {
			error = ENOTBLK;
			goto out;
		}
#endif
		vd->vdev_top = vd;

		if ((error = vdev_open(vd)) == 0 &&
		    (error = vdev_label_init(vd, crtxg, label)) == 0) {
			VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
			    vd->vdev_guid) == 0);
		}

		vdev_free(vd);

		if (error &&
		    (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
			goto out;
		else
			error = 0;
	}

out:
	sav->sav_pending = NULL;
	sav->sav_npending = 0;
	return (error);
}

static int
spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
{
	int error;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
	    &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
	    VDEV_LABEL_SPARE)) != 0) {
		return (error);
	}

	return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
	    &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
	    VDEV_LABEL_L2CACHE));
}

static void
spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
    const char *config)
{
	int i;

	if (sav->sav_config != NULL) {
		nvlist_t **olddevs;
		uint_t oldndevs;
		nvlist_t **newdevs;

		/*
		 * Generate new dev list by concatenating with the
		 * current dev list.
		 */
		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config,
		    &olddevs, &oldndevs) == 0);

		newdevs = kmem_alloc(sizeof (void *) *
		    (ndevs + oldndevs), KM_SLEEP);
		for (i = 0; i < oldndevs; i++)
			VERIFY(nvlist_dup(olddevs[i], &newdevs[i],
			    KM_SLEEP) == 0);
		for (i = 0; i < ndevs; i++)
			VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs],
			    KM_SLEEP) == 0);

		VERIFY(nvlist_remove(sav->sav_config, config,
		    DATA_TYPE_NVLIST_ARRAY) == 0);

		VERIFY(nvlist_add_nvlist_array(sav->sav_config,
		    config, newdevs, ndevs + oldndevs) == 0);
		for (i = 0; i < oldndevs + ndevs; i++)
			nvlist_free(newdevs[i]);
		kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
	} else {
		/*
		 * Generate a new dev list.
2639 */ 2640 VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME, 2641 KM_SLEEP) == 0); 2642 VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, 2643 devs, ndevs) == 0); 2644 } 2645} 2646 2647/* 2648 * Stop and drop level 2 ARC devices 2649 */ 2650void 2651spa_l2cache_drop(spa_t *spa) 2652{ 2653 vdev_t *vd; 2654 int i; 2655 spa_aux_vdev_t *sav = &spa->spa_l2cache; 2656 2657 for (i = 0; i < sav->sav_count; i++) { 2658 uint64_t pool; 2659 2660 vd = sav->sav_vdevs[i]; 2661 ASSERT(vd != NULL); 2662 2663 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 2664 pool != 0ULL && l2arc_vdev_present(vd)) 2665 l2arc_remove_vdev(vd); 2666 if (vd->vdev_isl2cache) 2667 spa_l2cache_remove(vd); 2668 vdev_clear_stats(vd); 2669 (void) vdev_close(vd); 2670 } 2671} 2672 2673/* 2674 * Pool Creation 2675 */ 2676int 2677spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, 2678 const char *history_str, nvlist_t *zplprops) 2679{ 2680 spa_t *spa; 2681 char *altroot = NULL; 2682 vdev_t *rvd; 2683 dsl_pool_t *dp; 2684 dmu_tx_t *tx; 2685 int error = 0; 2686 uint64_t txg = TXG_INITIAL; 2687 nvlist_t **spares, **l2cache; 2688 uint_t nspares, nl2cache; 2689 uint64_t version; 2690 2691 /* 2692 * If this pool already exists, return failure. 2693 */ 2694 mutex_enter(&spa_namespace_lock); 2695 if (spa_lookup(pool) != NULL) { 2696 mutex_exit(&spa_namespace_lock); 2697 return (EEXIST); 2698 } 2699 2700 /* 2701 * Allocate a new spa_t structure. 2702 */ 2703 (void) nvlist_lookup_string(props, 2704 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 2705 spa = spa_add(pool, NULL, altroot); 2706 spa_activate(spa, spa_mode_global); 2707 2708 if (props && (error = spa_prop_validate(spa, props))) { 2709 spa_deactivate(spa); 2710 spa_remove(spa); 2711 mutex_exit(&spa_namespace_lock); 2712 return (error); 2713 } 2714 2715 if (nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), 2716 &version) != 0) 2717 version = SPA_VERSION; 2718 ASSERT(version <= SPA_VERSION); 2719 2720 spa->spa_first_txg = txg; 2721 spa->spa_uberblock.ub_txg = txg - 1; 2722 spa->spa_uberblock.ub_version = version; 2723 spa->spa_ubsync = spa->spa_uberblock; 2724 2725 /* 2726 * Create "The Godfather" zio to hold all async IOs 2727 */ 2728 spa->spa_async_zio_root = zio_root(spa, NULL, NULL, 2729 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); 2730 2731 /* 2732 * Create the root vdev. 2733 */ 2734 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2735 2736 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 2737 2738 ASSERT(error != 0 || rvd != NULL); 2739 ASSERT(error != 0 || spa->spa_root_vdev == rvd); 2740 2741 if (error == 0 && !zfs_allocatable_devs(nvroot)) 2742 error = EINVAL; 2743 2744 if (error == 0 && 2745 (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 2746 (error = spa_validate_aux(spa, nvroot, txg, 2747 VDEV_ALLOC_ADD)) == 0) { 2748 for (int c = 0; c < rvd->vdev_children; c++) { 2749 vdev_metaslab_set_size(rvd->vdev_child[c]); 2750 vdev_expand(rvd->vdev_child[c], txg); 2751 } 2752 } 2753 2754 spa_config_exit(spa, SCL_ALL, FTAG); 2755 2756 if (error != 0) { 2757 spa_unload(spa); 2758 spa_deactivate(spa); 2759 spa_remove(spa); 2760 mutex_exit(&spa_namespace_lock); 2761 return (error); 2762 } 2763 2764 /* 2765 * Get the list of spares, if specified. 
2766 */ 2767 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2768 &spares, &nspares) == 0) { 2769 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, 2770 KM_SLEEP) == 0); 2771 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 2772 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 2773 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2774 spa_load_spares(spa); 2775 spa_config_exit(spa, SCL_ALL, FTAG); 2776 spa->spa_spares.sav_sync = B_TRUE; 2777 } 2778 2779 /* 2780 * Get the list of level 2 cache devices, if specified. 2781 */ 2782 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 2783 &l2cache, &nl2cache) == 0) { 2784 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 2785 NV_UNIQUE_NAME, KM_SLEEP) == 0); 2786 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 2787 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 2788 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2789 spa_load_l2cache(spa); 2790 spa_config_exit(spa, SCL_ALL, FTAG); 2791 spa->spa_l2cache.sav_sync = B_TRUE; 2792 } 2793 2794 spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); 2795 spa->spa_meta_objset = dp->dp_meta_objset; 2796 2797 /* 2798 * Create DDTs (dedup tables). 2799 */ 2800 ddt_create(spa); 2801 2802 spa_update_dspace(spa); 2803 2804 tx = dmu_tx_create_assigned(dp, txg); 2805 2806 /* 2807 * Create the pool config object. 2808 */ 2809 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 2810 DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, 2811 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 2812 2813 if (zap_add(spa->spa_meta_objset, 2814 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 2815 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 2816 cmn_err(CE_PANIC, "failed to add pool config"); 2817 } 2818 2819 /* Newly created pools with the right version are always deflated. */ 2820 if (version >= SPA_VERSION_RAIDZ_DEFLATE) { 2821 spa->spa_deflate = TRUE; 2822 if (zap_add(spa->spa_meta_objset, 2823 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 2824 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 2825 cmn_err(CE_PANIC, "failed to add deflate"); 2826 } 2827 } 2828 2829 /* 2830 * Create the deferred-free bplist object. Turn off compression 2831 * because sync-to-convergence takes longer if the blocksize 2832 * keeps changing. 2833 */ 2834 spa->spa_deferred_bplist_obj = bplist_create(spa->spa_meta_objset, 2835 1 << 14, tx); 2836 dmu_object_set_compress(spa->spa_meta_objset, 2837 spa->spa_deferred_bplist_obj, ZIO_COMPRESS_OFF, tx); 2838 2839 if (zap_add(spa->spa_meta_objset, 2840 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 2841 sizeof (uint64_t), 1, &spa->spa_deferred_bplist_obj, tx) != 0) { 2842 cmn_err(CE_PANIC, "failed to add bplist"); 2843 } 2844 2845 /* 2846 * Create the pool's history object. 2847 */ 2848 if (version >= SPA_VERSION_ZPOOL_HISTORY) 2849 spa_history_create_obj(spa, tx); 2850 2851 /* 2852 * Set pool properties. 
2853 */ 2854 spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 2855 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 2856 spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); 2857 spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); 2858 2859 if (props != NULL) { 2860 spa_configfile_set(spa, props, B_FALSE); 2861 spa_sync_props(spa, props, CRED(), tx); 2862 } 2863 2864 dmu_tx_commit(tx); 2865 2866 spa->spa_sync_on = B_TRUE; 2867 txg_sync_start(spa->spa_dsl_pool); 2868 2869 /* 2870 * We explicitly wait for the first transaction to complete so that our 2871 * bean counters are appropriately updated. 2872 */ 2873 txg_wait_synced(spa->spa_dsl_pool, txg); 2874 2875 spa_config_sync(spa, B_FALSE, B_TRUE); 2876 2877 if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL) 2878 (void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE); 2879 spa_history_log_version(spa, LOG_POOL_CREATE); 2880 2881 spa->spa_minref = refcount_count(&spa->spa_refcount); 2882 2883 mutex_exit(&spa_namespace_lock); 2884 2885 return (0); 2886} 2887 2888#ifdef _KERNEL 2889/* 2890 * Get the root pool information from the root disk, then import the root pool 2891 * during the system boot up time. 2892 */ 2893extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); 2894 2895static nvlist_t * 2896spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid) 2897{ 2898 nvlist_t *config; 2899 nvlist_t *nvtop, *nvroot; 2900 uint64_t pgid; 2901 2902 if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0) 2903 return (NULL); 2904 2905 /* 2906 * Add this top-level vdev to the child array. 2907 */ 2908 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 2909 &nvtop) == 0); 2910 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, 2911 &pgid) == 0); 2912 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0); 2913 2914 /* 2915 * Put this pool's top-level vdevs into a root vdev. 2916 */ 2917 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 2918 VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 2919 VDEV_TYPE_ROOT) == 0); 2920 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 2921 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 2922 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 2923 &nvtop, 1) == 0); 2924 2925 /* 2926 * Replace the existing vdev_tree with the new root vdev in 2927 * this pool's configuration (remove the old, add the new). 2928 */ 2929 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 2930 nvlist_free(nvroot); 2931 return (config); 2932} 2933 2934/* 2935 * Walk the vdev tree and see if we can find a device with "better" 2936 * configuration. A configuration is "better" if the label on that 2937 * device has a more recent txg. 2938 */ 2939static void 2940spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg) 2941{ 2942 for (int c = 0; c < vd->vdev_children; c++) 2943 spa_alt_rootvdev(vd->vdev_child[c], avd, txg); 2944 2945 if (vd->vdev_ops->vdev_op_leaf) { 2946 nvlist_t *label; 2947 uint64_t label_txg; 2948 2949 if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid, 2950 &label) != 0) 2951 return; 2952 2953 VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, 2954 &label_txg) == 0); 2955 2956 /* 2957 * Do we have a better boot device? 2958 */ 2959 if (label_txg > *txg) { 2960 *txg = label_txg; 2961 *avd = vd; 2962 } 2963 nvlist_free(label); 2964 } 2965} 2966 2967/* 2968 * Import a root pool. 
 *
 * For x86, devpath_list will consist of the devid and/or physpath name of
 * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a").
 * The GRUB "findroot" command will return the vdev we should boot.
 *
 * For SPARC, devpath_list consists of the physpath name of the booting
 * device, regardless of whether the root pool is a single-device pool or
 * a mirrored pool, e.g.
 *	"/pci@1f,0/ide@d/disk@0,0:a"
 */
int
spa_import_rootpool(char *devpath, char *devid)
{
	spa_t *spa;
	vdev_t *rvd, *bvd, *avd = NULL;
	nvlist_t *config, *nvtop;
	uint64_t guid, txg;
	char *pname;
	int error;

	/*
	 * Read the label from the boot device and generate a configuration.
	 */
	config = spa_generate_rootconf(devpath, devid, &guid);
#if defined(_OBP) && defined(_KERNEL)
	if (config == NULL) {
		if (strstr(devpath, "/iscsi/ssd") != NULL) {
			/* iscsi boot */
			get_iscsi_bootpath_phy(devpath);
			config = spa_generate_rootconf(devpath, devid, &guid);
		}
	}
#endif
	if (config == NULL) {
		cmn_err(CE_NOTE, "Cannot read the pool label from '%s'",
		    devpath);
		return (EIO);
	}

	VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
	    &pname) == 0);
	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(pname)) != NULL) {
		/*
		 * Remove the existing root pool from the namespace so that we
		 * can replace it with the correct config we just read in.
		 */
		spa_remove(spa);
	}

	spa = spa_add(pname, config, NULL);
	spa->spa_is_root = B_TRUE;
	spa->spa_load_verbatim = B_TRUE;

	/*
	 * Build up a vdev tree based on the boot device's label config.
	 */
	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
	    &nvtop) == 0);
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	error = spa_config_parse(spa, &rvd, nvtop, NULL, 0,
	    VDEV_ALLOC_ROOTPOOL);
	spa_config_exit(spa, SCL_ALL, FTAG);
	if (error) {
		mutex_exit(&spa_namespace_lock);
		nvlist_free(config);
		cmn_err(CE_NOTE, "Cannot parse the config for pool '%s'",
		    pname);
		return (error);
	}

	/*
	 * Get the boot vdev.
	 */
	if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
		cmn_err(CE_NOTE, "Cannot find the boot vdev for guid %llu",
		    (u_longlong_t)guid);
		error = ENOENT;
		goto out;
	}

	/*
	 * Determine if there is a better boot device.
	 */
	avd = bvd;
	spa_alt_rootvdev(rvd, &avd, &txg);
	if (avd != bvd) {
		cmn_err(CE_NOTE, "The boot device is 'degraded'. Please "
		    "try booting from '%s'", avd->vdev_path);
		error = EINVAL;
		goto out;
	}

	/*
	 * If the boot device is part of a spare vdev then ensure that
	 * we're booting off the active spare.
	 */
	if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
	    !bvd->vdev_isspare) {
		cmn_err(CE_NOTE, "The boot device is currently spared.
Please " 3071 "try booting from '%s'", 3072 bvd->vdev_parent->vdev_child[1]->vdev_path); 3073 error = EINVAL; 3074 goto out; 3075 } 3076 3077 error = 0; 3078 spa_history_log_version(spa, LOG_POOL_IMPORT); 3079out: 3080 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3081 vdev_free(rvd); 3082 spa_config_exit(spa, SCL_ALL, FTAG); 3083 mutex_exit(&spa_namespace_lock); 3084 3085 nvlist_free(config); 3086 return (error); 3087} 3088 3089#endif 3090 3091/* 3092 * Take a pool and insert it into the namespace as if it had been loaded at 3093 * boot. 3094 */ 3095int 3096spa_import_verbatim(const char *pool, nvlist_t *config, nvlist_t *props) 3097{ 3098 spa_t *spa; 3099 char *altroot = NULL; 3100 3101 mutex_enter(&spa_namespace_lock); 3102 if (spa_lookup(pool) != NULL) { 3103 mutex_exit(&spa_namespace_lock); 3104 return (EEXIST); 3105 } 3106 3107 (void) nvlist_lookup_string(props, 3108 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 3109 spa = spa_add(pool, config, altroot); 3110 3111 spa->spa_load_verbatim = B_TRUE; 3112 3113 if (props != NULL) 3114 spa_configfile_set(spa, props, B_FALSE); 3115 3116 spa_config_sync(spa, B_FALSE, B_TRUE); 3117 3118 mutex_exit(&spa_namespace_lock); 3119 spa_history_log_version(spa, LOG_POOL_IMPORT); 3120 3121 return (0); 3122} 3123 3124/* 3125 * Import a non-root pool into the system. 3126 */ 3127int 3128spa_import(const char *pool, nvlist_t *config, nvlist_t *props) 3129{ 3130 spa_t *spa; 3131 char *altroot = NULL; 3132 spa_load_state_t state = SPA_LOAD_IMPORT; 3133 zpool_rewind_policy_t policy; 3134 int error; 3135 nvlist_t *nvroot; 3136 nvlist_t **spares, **l2cache; 3137 uint_t nspares, nl2cache; 3138 3139 /* 3140 * If a pool with this name exists, return failure. 3141 */ 3142 mutex_enter(&spa_namespace_lock); 3143 if (spa_lookup(pool) != NULL) { 3144 mutex_exit(&spa_namespace_lock); 3145 return (EEXIST); 3146 } 3147 3148 zpool_get_rewind_policy(config, &policy); 3149 if (policy.zrp_request & ZPOOL_DO_REWIND) 3150 state = SPA_LOAD_RECOVER; 3151 3152 /* 3153 * Create and initialize the spa structure. 3154 */ 3155 (void) nvlist_lookup_string(props, 3156 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 3157 spa = spa_add(pool, config, altroot); 3158 spa_activate(spa, spa_mode_global); 3159 3160 /* 3161 * Don't start async tasks until we know everything is healthy. 3162 */ 3163 spa_async_suspend(spa); 3164 3165 /* 3166 * Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig 3167 * because the user-supplied config is actually the one to trust when 3168 * doing an import. 3169 */ 3170 if (state != SPA_LOAD_RECOVER) 3171 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 3172 error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg, 3173 policy.zrp_request); 3174 3175 /* 3176 * Propagate anything learned about failing or best txgs 3177 * back to caller 3178 */ 3179 spa_rewind_data_to_nvlist(spa, config); 3180 3181 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3182 /* 3183 * Toss any existing sparelist, as it doesn't have any validity 3184 * anymore, and conflicts with spa_has_spare(). 
3185 */ 3186 if (spa->spa_spares.sav_config) { 3187 nvlist_free(spa->spa_spares.sav_config); 3188 spa->spa_spares.sav_config = NULL; 3189 spa_load_spares(spa); 3190 } 3191 if (spa->spa_l2cache.sav_config) { 3192 nvlist_free(spa->spa_l2cache.sav_config); 3193 spa->spa_l2cache.sav_config = NULL; 3194 spa_load_l2cache(spa); 3195 } 3196 3197 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 3198 &nvroot) == 0); 3199 if (error == 0) 3200 error = spa_validate_aux(spa, nvroot, -1ULL, 3201 VDEV_ALLOC_SPARE); 3202 if (error == 0) 3203 error = spa_validate_aux(spa, nvroot, -1ULL, 3204 VDEV_ALLOC_L2CACHE); 3205 spa_config_exit(spa, SCL_ALL, FTAG); 3206 3207 if (props != NULL) 3208 spa_configfile_set(spa, props, B_FALSE); 3209 3210 if (error != 0 || (props && spa_writeable(spa) && 3211 (error = spa_prop_set(spa, props)))) { 3212 spa_unload(spa); 3213 spa_deactivate(spa); 3214 spa_remove(spa); 3215 mutex_exit(&spa_namespace_lock); 3216 return (error); 3217 } 3218 3219 spa_async_resume(spa); 3220 3221 /* 3222 * Override any spares and level 2 cache devices as specified by 3223 * the user, as these may have correct device names/devids, etc. 3224 */ 3225 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 3226 &spares, &nspares) == 0) { 3227 if (spa->spa_spares.sav_config) 3228 VERIFY(nvlist_remove(spa->spa_spares.sav_config, 3229 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 3230 else 3231 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, 3232 NV_UNIQUE_NAME, KM_SLEEP) == 0); 3233 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 3234 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 3235 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3236 spa_load_spares(spa); 3237 spa_config_exit(spa, SCL_ALL, FTAG); 3238 spa->spa_spares.sav_sync = B_TRUE; 3239 } 3240 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 3241 &l2cache, &nl2cache) == 0) { 3242 if (spa->spa_l2cache.sav_config) 3243 VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, 3244 ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); 3245 else 3246 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 3247 NV_UNIQUE_NAME, KM_SLEEP) == 0); 3248 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 3249 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 3250 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3251 spa_load_l2cache(spa); 3252 spa_config_exit(spa, SCL_ALL, FTAG); 3253 spa->spa_l2cache.sav_sync = B_TRUE; 3254 } 3255 3256 /* 3257 * Check for any removed devices. 3258 */ 3259 if (spa->spa_autoreplace) { 3260 spa_aux_check_removed(&spa->spa_spares); 3261 spa_aux_check_removed(&spa->spa_l2cache); 3262 } 3263 3264 if (spa_writeable(spa)) { 3265 /* 3266 * Update the config cache to include the newly-imported pool. 3267 */ 3268 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 3269 } 3270 3271 /* 3272 * It's possible that the pool was expanded while it was exported. 3273 * We kick off an async task to handle this for us. 
3274 */ 3275 spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); 3276 3277 mutex_exit(&spa_namespace_lock); 3278 spa_history_log_version(spa, LOG_POOL_IMPORT); 3279 3280 return (0); 3281} 3282 3283nvlist_t * 3284spa_tryimport(nvlist_t *tryconfig) 3285{ 3286 nvlist_t *config = NULL; 3287 char *poolname; 3288 spa_t *spa; 3289 uint64_t state; 3290 int error; 3291 3292 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 3293 return (NULL); 3294 3295 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 3296 return (NULL); 3297 3298 /* 3299 * Create and initialize the spa structure. 3300 */ 3301 mutex_enter(&spa_namespace_lock); 3302 spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); 3303 spa_activate(spa, FREAD); 3304 3305 /* 3306 * Pass off the heavy lifting to spa_load(). 3307 * Pass TRUE for mosconfig because the user-supplied config 3308 * is actually the one to trust when doing an import. 3309 */ 3310 error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE); 3311 3312 /* 3313 * If 'tryconfig' was at least parsable, return the current config. 3314 */ 3315 if (spa->spa_root_vdev != NULL) { 3316 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 3317 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 3318 poolname) == 0); 3319 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 3320 state) == 0); 3321 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 3322 spa->spa_uberblock.ub_timestamp) == 0); 3323 3324 /* 3325 * If the bootfs property exists on this pool then we 3326 * copy it out so that external consumers can tell which 3327 * pools are bootable. 3328 */ 3329 if ((!error || error == EEXIST) && spa->spa_bootfs) { 3330 char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 3331 3332 /* 3333 * We have to play games with the name since the 3334 * pool was opened as TRYIMPORT_NAME. 3335 */ 3336 if (dsl_dsobj_to_dsname(spa_name(spa), 3337 spa->spa_bootfs, tmpname) == 0) { 3338 char *cp; 3339 char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 3340 3341 cp = strchr(tmpname, '/'); 3342 if (cp == NULL) { 3343 (void) strlcpy(dsname, tmpname, 3344 MAXPATHLEN); 3345 } else { 3346 (void) snprintf(dsname, MAXPATHLEN, 3347 "%s/%s", poolname, ++cp); 3348 } 3349 VERIFY(nvlist_add_string(config, 3350 ZPOOL_CONFIG_BOOTFS, dsname) == 0); 3351 kmem_free(dsname, MAXPATHLEN); 3352 } 3353 kmem_free(tmpname, MAXPATHLEN); 3354 } 3355 3356 /* 3357 * Add the list of hot spares and level 2 cache devices. 3358 */ 3359 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3360 spa_add_spares(spa, config); 3361 spa_add_l2cache(spa, config); 3362 spa_config_exit(spa, SCL_CONFIG, FTAG); 3363 } 3364 3365 spa_unload(spa); 3366 spa_deactivate(spa); 3367 spa_remove(spa); 3368 mutex_exit(&spa_namespace_lock); 3369 3370 return (config); 3371} 3372 3373/* 3374 * Pool export/destroy 3375 * 3376 * The act of destroying or exporting a pool is very simple. We make sure there 3377 * is no more pending I/O and any references to the pool are gone. Then, we 3378 * update the pool state and sync all the labels to disk, removing the 3379 * configuration from the cache afterwards. If the 'hardforce' flag is set, then 3380 * we don't sync the labels or remove the configuration cache. 
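 *
 * spa_export_common() below can also fail with EBUSY (active references
 * to the pool remain) or EXDEV (the pool has an active shared spare and
 * 'force' was not specified).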
 */
static int
spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
    boolean_t force, boolean_t hardforce)
{
	spa_t *spa;

	if (oldconfig)
		*oldconfig = NULL;

	if (!(spa_mode_global & FWRITE))
		return (EROFS);

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(pool)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}

	/*
	 * Put a hold on the pool, drop the namespace lock, stop async tasks,
	 * reacquire the namespace lock, and see if we can export.
	 */
	spa_open_ref(spa, FTAG);
	mutex_exit(&spa_namespace_lock);
	spa_async_suspend(spa);
	mutex_enter(&spa_namespace_lock);
	spa_close(spa, FTAG);

	/*
	 * The pool will be in core if it's openable,
	 * in which case we can modify its state.
	 */
	if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
		/*
		 * Objsets may be open only because they're dirty, so we
		 * have to force the pool to sync before checking spa_refcnt.
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);

		/*
		 * A pool cannot be exported or destroyed if there are active
		 * references.  If we are resetting a pool, allow references by
		 * fault injection handlers.
		 */
		if (!spa_refcount_zero(spa) ||
		    (spa->spa_inject_ref != 0 &&
		    new_state != POOL_STATE_UNINITIALIZED)) {
			spa_async_resume(spa);
			mutex_exit(&spa_namespace_lock);
			return (EBUSY);
		}

		/*
		 * A pool cannot be exported if it has an active shared spare.
		 * This is to prevent other pools from stealing the active
		 * spare from an exported pool.  At the user's explicit
		 * request, such a pool can still be forcibly exported.
		 */
		if (!force && new_state == POOL_STATE_EXPORTED &&
		    spa_has_active_shared_spare(spa)) {
			spa_async_resume(spa);
			mutex_exit(&spa_namespace_lock);
			return (EXDEV);
		}

		/*
		 * We want this to be reflected on every label,
		 * so mark them all dirty.  spa_unload() will do the
		 * final sync that pushes these changes out.
		 */
		if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
			spa->spa_state = new_state;
			spa->spa_final_txg = spa_last_synced_txg(spa) + 1;
			vdev_config_dirty(spa->spa_root_vdev);
			spa_config_exit(spa, SCL_ALL, FTAG);
		}
	}

	spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY);

	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
		spa_unload(spa);
		spa_deactivate(spa);
	}

	if (oldconfig && spa->spa_config)
		VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);

	if (new_state != POOL_STATE_UNINITIALIZED) {
		if (!hardforce)
			spa_config_sync(spa, B_TRUE, B_TRUE);
		spa_remove(spa);
	}
	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Destroy a storage pool.
 */
int
spa_destroy(char *pool)
{
	return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL,
	    B_FALSE, B_FALSE));
}

/*
 * Export a storage pool.
 */
int
spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
    boolean_t hardforce)
{
	return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig,
	    force, hardforce));
}

/*
 * Similar to spa_export(), this unloads the spa_t without actually removing it
 * from the namespace in any way.
3505 */ 3506int 3507spa_reset(char *pool) 3508{ 3509 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, 3510 B_FALSE, B_FALSE)); 3511} 3512 3513/* 3514 * ========================================================================== 3515 * Device manipulation 3516 * ========================================================================== 3517 */ 3518 3519/* 3520 * Add a device to a storage pool. 3521 */ 3522int 3523spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 3524{ 3525 uint64_t txg, id; 3526 int error; 3527 vdev_t *rvd = spa->spa_root_vdev; 3528 vdev_t *vd, *tvd; 3529 nvlist_t **spares, **l2cache; 3530 uint_t nspares, nl2cache; 3531 3532 txg = spa_vdev_enter(spa); 3533 3534 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 3535 VDEV_ALLOC_ADD)) != 0) 3536 return (spa_vdev_exit(spa, NULL, txg, error)); 3537 3538 spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ 3539 3540 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 3541 &nspares) != 0) 3542 nspares = 0; 3543 3544 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, 3545 &nl2cache) != 0) 3546 nl2cache = 0; 3547 3548 if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) 3549 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 3550 3551 if (vd->vdev_children != 0 && 3552 (error = vdev_create(vd, txg, B_FALSE)) != 0) 3553 return (spa_vdev_exit(spa, vd, txg, error)); 3554 3555 /* 3556 * We must validate the spares and l2cache devices after checking the 3557 * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 3558 */ 3559 if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) 3560 return (spa_vdev_exit(spa, vd, txg, error)); 3561 3562 /* 3563 * Transfer each new top-level vdev from vd to rvd. 3564 */ 3565 for (int c = 0; c < vd->vdev_children; c++) { 3566 3567 /* 3568 * Set the vdev id to the first hole, if one exists. 3569 */ 3570 for (id = 0; id < rvd->vdev_children; id++) { 3571 if (rvd->vdev_child[id]->vdev_ishole) { 3572 vdev_free(rvd->vdev_child[id]); 3573 break; 3574 } 3575 } 3576 tvd = vd->vdev_child[c]; 3577 vdev_remove_child(vd, tvd); 3578 tvd->vdev_id = id; 3579 vdev_add_child(rvd, tvd); 3580 vdev_config_dirty(tvd); 3581 } 3582 3583 if (nspares != 0) { 3584 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, 3585 ZPOOL_CONFIG_SPARES); 3586 spa_load_spares(spa); 3587 spa->spa_spares.sav_sync = B_TRUE; 3588 } 3589 3590 if (nl2cache != 0) { 3591 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, 3592 ZPOOL_CONFIG_L2CACHE); 3593 spa_load_l2cache(spa); 3594 spa->spa_l2cache.sav_sync = B_TRUE; 3595 } 3596 3597 /* 3598 * We have to be careful when adding new vdevs to an existing pool. 3599 * If other threads start allocating from these vdevs before we 3600 * sync the config cache, and we lose power, then upon reboot we may 3601 * fail to open the pool because there are DVAs that the config cache 3602 * can't translate. Therefore, we first add the vdevs without 3603 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 3604 * and then let spa_config_update() initialize the new metaslabs. 3605 * 3606 * spa_load() checks for added-but-not-initialized vdevs, so that 3607 * if we lose power at any point in this sequence, the remaining 3608 * steps will be completed the next time we load the pool. 
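	 * In short: the new top-level vdevs are committed to the config
	 * cache before any allocations can land on them, so a crash between
	 * the two steps always leaves the pool recoverable.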
3609 */ 3610 (void) spa_vdev_exit(spa, vd, txg, 0); 3611 3612 mutex_enter(&spa_namespace_lock); 3613 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 3614 mutex_exit(&spa_namespace_lock); 3615 3616 return (0); 3617} 3618 3619/* 3620 * Attach a device to a mirror. The arguments are the path to any device 3621 * in the mirror, and the nvroot for the new device. If the path specifies 3622 * a device that is not mirrored, we automatically insert the mirror vdev. 3623 * 3624 * If 'replacing' is specified, the new device is intended to replace the 3625 * existing device; in this case the two devices are made into their own 3626 * mirror using the 'replacing' vdev, which is functionally identical to 3627 * the mirror vdev (it actually reuses all the same ops) but has a few 3628 * extra rules: you can't attach to it after it's been created, and upon 3629 * completion of resilvering, the first disk (the one being replaced) 3630 * is automatically detached. 3631 */ 3632int 3633spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 3634{ 3635 uint64_t txg, open_txg; 3636 vdev_t *rvd = spa->spa_root_vdev; 3637 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 3638 vdev_ops_t *pvops; 3639 char *oldvdpath, *newvdpath; 3640 int newvd_isspare; 3641 int error; 3642 3643 txg = spa_vdev_enter(spa); 3644 3645 oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); 3646 3647 if (oldvd == NULL) 3648 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 3649 3650 if (!oldvd->vdev_ops->vdev_op_leaf) 3651 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3652 3653 pvd = oldvd->vdev_parent; 3654 3655 if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 3656 VDEV_ALLOC_ADD)) != 0) 3657 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 3658 3659 if (newrootvd->vdev_children != 1) 3660 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 3661 3662 newvd = newrootvd->vdev_child[0]; 3663 3664 if (!newvd->vdev_ops->vdev_op_leaf) 3665 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 3666 3667 if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 3668 return (spa_vdev_exit(spa, newrootvd, txg, error)); 3669 3670 /* 3671 * Spares can't replace logs 3672 */ 3673 if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) 3674 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3675 3676 if (!replacing) { 3677 /* 3678 * For attach, the only allowable parent is a mirror or the root 3679 * vdev. 3680 */ 3681 if (pvd->vdev_ops != &vdev_mirror_ops && 3682 pvd->vdev_ops != &vdev_root_ops) 3683 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3684 3685 pvops = &vdev_mirror_ops; 3686 } else { 3687 /* 3688 * Active hot spares can only be replaced by inactive hot 3689 * spares. 3690 */ 3691 if (pvd->vdev_ops == &vdev_spare_ops && 3692 pvd->vdev_child[1] == oldvd && 3693 !spa_has_spare(spa, newvd->vdev_guid)) 3694 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3695 3696 /* 3697 * If the source is a hot spare, and the parent isn't already a 3698 * spare, then we want to create a new hot spare. Otherwise, we 3699 * want to create a replacing vdev. The user is not allowed to 3700 * attach to a spared vdev child unless the 'isspare' state is 3701 * the same (spare replaces spare, non-spare replaces 3702 * non-spare). 
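		 * Concretely, the checks below reduce to:
		 *
		 *	parent vdev ops		new device		result
		 *	replacing		any			ENOTSUP
		 *	spare			isspare mismatch	ENOTSUP
		 *	anything else		spare			vdev_spare_ops
		 *	anything else		non-spare		vdev_replacing_ops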
3703 */ 3704 if (pvd->vdev_ops == &vdev_replacing_ops) 3705 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3706 else if (pvd->vdev_ops == &vdev_spare_ops && 3707 newvd->vdev_isspare != oldvd->vdev_isspare) 3708 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3709 else if (pvd->vdev_ops != &vdev_spare_ops && 3710 newvd->vdev_isspare) 3711 pvops = &vdev_spare_ops; 3712 else 3713 pvops = &vdev_replacing_ops; 3714 } 3715 3716 /* 3717 * Make sure the new device is big enough. 3718 */ 3719 if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) 3720 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 3721 3722 /* 3723 * The new device cannot have a higher alignment requirement 3724 * than the top-level vdev. 3725 */ 3726 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 3727 return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 3728 3729 /* 3730 * If this is an in-place replacement, update oldvd's path and devid 3731 * to make it distinguishable from newvd, and unopenable from now on. 3732 */ 3733 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 3734 spa_strfree(oldvd->vdev_path); 3735 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 3736 KM_SLEEP); 3737 (void) sprintf(oldvd->vdev_path, "%s/%s", 3738 newvd->vdev_path, "old"); 3739 if (oldvd->vdev_devid != NULL) { 3740 spa_strfree(oldvd->vdev_devid); 3741 oldvd->vdev_devid = NULL; 3742 } 3743 } 3744 3745 /* 3746 * If the parent is not a mirror, or if we're replacing, insert the new 3747 * mirror/replacing/spare vdev above oldvd. 3748 */ 3749 if (pvd->vdev_ops != pvops) 3750 pvd = vdev_add_parent(oldvd, pvops); 3751 3752 ASSERT(pvd->vdev_top->vdev_parent == rvd); 3753 ASSERT(pvd->vdev_ops == pvops); 3754 ASSERT(oldvd->vdev_parent == pvd); 3755 3756 /* 3757 * Extract the new device from its root and add it to pvd. 3758 */ 3759 vdev_remove_child(newrootvd, newvd); 3760 newvd->vdev_id = pvd->vdev_children; 3761 newvd->vdev_crtxg = oldvd->vdev_crtxg; 3762 vdev_add_child(pvd, newvd); 3763 3764 tvd = newvd->vdev_top; 3765 ASSERT(pvd->vdev_top == tvd); 3766 ASSERT(tvd->vdev_parent == rvd); 3767 3768 vdev_config_dirty(tvd); 3769 3770 /* 3771 * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate 3772 * upward when spa_vdev_exit() calls vdev_dtl_reassess(). 3773 */ 3774 open_txg = txg + TXG_CONCURRENT_STATES - 1; 3775 3776 vdev_dtl_dirty(newvd, DTL_MISSING, 3777 TXG_INITIAL, open_txg - TXG_INITIAL + 1); 3778 3779 if (newvd->vdev_isspare) { 3780 spa_spare_activate(newvd); 3781 spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE); 3782 } 3783 3784 oldvdpath = spa_strdup(oldvd->vdev_path); 3785 newvdpath = spa_strdup(newvd->vdev_path); 3786 newvd_isspare = newvd->vdev_isspare; 3787 3788 /* 3789 * Mark newvd's DTL dirty in this txg. 3790 */ 3791 vdev_dirty(tvd, VDD_DTL, newvd, txg); 3792 3793 (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); 3794 3795 spa_history_internal_log(LOG_POOL_VDEV_ATTACH, spa, NULL, 3796 CRED(), "%s vdev=%s %s vdev=%s", 3797 replacing && newvd_isspare ? "spare in" : 3798 replacing ? "replace" : "attach", newvdpath, 3799 replacing ? "for" : "to", oldvdpath); 3800 3801 spa_strfree(oldvdpath); 3802 spa_strfree(newvdpath); 3803 3804 /* 3805 * Kick off a resilver to update newvd. 3806 */ 3807 VERIFY3U(spa_scrub(spa, POOL_SCRUB_RESILVER), ==, 0); 3808 3809 return (0); 3810} 3811 3812/* 3813 * Detach a device from a mirror or replacing vdev. 3814 * If 'replace_done' is specified, only detach if the parent 3815 * is a replacing vdev. 
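 * (The parent may also be a spare vdev, in which case either disk can be
 * detached; see the replace_done handling in spa_vdev_detach() below.)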
3816 */ 3817int 3818spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) 3819{ 3820 uint64_t txg; 3821 int error; 3822 vdev_t *rvd = spa->spa_root_vdev; 3823 vdev_t *vd, *pvd, *cvd, *tvd; 3824 boolean_t unspare = B_FALSE; 3825 uint64_t unspare_guid; 3826 size_t len; 3827 char *vdpath; 3828 3829 txg = spa_vdev_enter(spa); 3830 3831 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 3832 3833 if (vd == NULL) 3834 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 3835 3836 if (!vd->vdev_ops->vdev_op_leaf) 3837 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3838 3839 pvd = vd->vdev_parent; 3840 3841 /* 3842 * If the parent/child relationship is not as expected, don't do it. 3843 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing 3844 * vdev that's replacing B with C. The user's intent in replacing 3845 * is to go from M(A,B) to M(A,C). If the user decides to cancel 3846 * the replace by detaching C, the expected behavior is to end up 3847 * M(A,B). But suppose that right after deciding to detach C, 3848 * the replacement of B completes. We would have M(A,C), and then 3849 * ask to detach C, which would leave us with just A -- not what 3850 * the user wanted. To prevent this, we make sure that the 3851 * parent/child relationship hasn't changed -- in this example, 3852 * that C's parent is still the replacing vdev R. 3853 */ 3854 if (pvd->vdev_guid != pguid && pguid != 0) 3855 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 3856 3857 /* 3858 * If replace_done is specified, only remove this device if it's 3859 * the first child of a replacing vdev. For the 'spare' vdev, either 3860 * disk can be removed. 3861 */ 3862 if (replace_done) { 3863 if (pvd->vdev_ops == &vdev_replacing_ops) { 3864 if (vd->vdev_id != 0) 3865 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3866 } else if (pvd->vdev_ops != &vdev_spare_ops) { 3867 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3868 } 3869 } 3870 3871 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 3872 spa_version(spa) >= SPA_VERSION_SPARES); 3873 3874 /* 3875 * Only mirror, replacing, and spare vdevs support detach. 3876 */ 3877 if (pvd->vdev_ops != &vdev_replacing_ops && 3878 pvd->vdev_ops != &vdev_mirror_ops && 3879 pvd->vdev_ops != &vdev_spare_ops) 3880 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3881 3882 /* 3883 * If this device has the only valid copy of some data, 3884 * we cannot safely detach it. 3885 */ 3886 if (vdev_dtl_required(vd)) 3887 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 3888 3889 ASSERT(pvd->vdev_children >= 2); 3890 3891 /* 3892 * If we are detaching the second disk from a replacing vdev, then 3893 * check to see if we changed the original vdev's path to have "/old" 3894 * at the end in spa_vdev_attach(). If so, undo that change now. 3895 */ 3896 if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id == 1 && 3897 pvd->vdev_child[0]->vdev_path != NULL && 3898 pvd->vdev_child[1]->vdev_path != NULL) { 3899 ASSERT(pvd->vdev_child[1] == vd); 3900 cvd = pvd->vdev_child[0]; 3901 len = strlen(vd->vdev_path); 3902 if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && 3903 strcmp(cvd->vdev_path + len, "/old") == 0) { 3904 spa_strfree(cvd->vdev_path); 3905 cvd->vdev_path = spa_strdup(vd->vdev_path); 3906 } 3907 } 3908 3909 /* 3910 * If we are detaching the original disk from a spare, then it implies 3911 * that the spare should become a real disk, and be removed from the 3912 * active spare list for the pool. 
3913 */ 3914 if (pvd->vdev_ops == &vdev_spare_ops && 3915 vd->vdev_id == 0 && pvd->vdev_child[1]->vdev_isspare) 3916 unspare = B_TRUE; 3917 3918 /* 3919 * Erase the disk labels so the disk can be used for other things. 3920 * This must be done after all other error cases are handled, 3921 * but before we disembowel vd (so we can still do I/O to it). 3922 * But if we can't do it, don't treat the error as fatal -- 3923 * it may be that the unwritability of the disk is the reason 3924 * it's being detached! 3925 */ 3926 error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 3927 3928 /* 3929 * Remove vd from its parent and compact the parent's children. 3930 */ 3931 vdev_remove_child(pvd, vd); 3932 vdev_compact_children(pvd); 3933 3934 /* 3935 * Remember one of the remaining children so we can get tvd below. 3936 */ 3937 cvd = pvd->vdev_child[0]; 3938 3939 /* 3940 * If we need to remove the remaining child from the list of hot spares, 3941 * do it now, marking the vdev as no longer a spare in the process. 3942 * We must do this before vdev_remove_parent(), because that can 3943 * change the GUID if it creates a new toplevel GUID. For a similar 3944 * reason, we must remove the spare now, in the same txg as the detach; 3945 * otherwise someone could attach a new sibling, change the GUID, and 3946 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. 3947 */ 3948 if (unspare) { 3949 ASSERT(cvd->vdev_isspare); 3950 spa_spare_remove(cvd); 3951 unspare_guid = cvd->vdev_guid; 3952 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 3953 } 3954 3955 /* 3956 * If the parent mirror/replacing vdev only has one child, 3957 * the parent is no longer needed. Remove it from the tree. 3958 */ 3959 if (pvd->vdev_children == 1) 3960 vdev_remove_parent(cvd); 3961 3962 /* 3963 * We don't set tvd until now because the parent we just removed 3964 * may have been the previous top-level vdev. 3965 */ 3966 tvd = cvd->vdev_top; 3967 ASSERT(tvd->vdev_parent == rvd); 3968 3969 /* 3970 * Reevaluate the parent vdev state. 3971 */ 3972 vdev_propagate_state(cvd); 3973 3974 /* 3975 * If the 'autoexpand' property is set on the pool then automatically 3976 * try to expand the size of the pool. For example if the device we 3977 * just detached was smaller than the others, it may be possible to 3978 * add metaslabs (i.e. grow the pool). We need to reopen the vdev 3979 * first so that we can obtain the updated sizes of the leaf vdevs. 3980 */ 3981 if (spa->spa_autoexpand) { 3982 vdev_reopen(tvd); 3983 vdev_expand(tvd, txg); 3984 } 3985 3986 vdev_config_dirty(tvd); 3987 3988 /* 3989 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 3990 * vd->vdev_detached is set and free vd's DTL object in syncing context. 3991 * But first make sure we're not on any *other* txg's DTL list, to 3992 * prevent vd from being accessed after it's freed. 3993 */ 3994 vdpath = spa_strdup(vd->vdev_path); 3995 for (int t = 0; t < TXG_SIZE; t++) 3996 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 3997 vd->vdev_detached = B_TRUE; 3998 vdev_dirty(tvd, VDD_DTL, vd, txg); 3999 4000 spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); 4001 4002 error = spa_vdev_exit(spa, vd, txg, 0); 4003 4004 spa_history_internal_log(LOG_POOL_VDEV_DETACH, spa, NULL, CRED(), 4005 "vdev=%s", vdpath); 4006 spa_strfree(vdpath); 4007 4008 /* 4009 * If this was the removal of the original device in a hot spare vdev, 4010 * then we want to go through and remove the device from the hot spare 4011 * list of every other pool. 
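 *
 * (Spares may be shared between pools, so the loop below walks the
 * whole namespace: it holds spa_namespace_lock while iterating, takes
 * a reference on each active pool, and drops the lock around the
 * actual spa_vdev_remove() call.)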
4012 */ 4013 if (unspare) { 4014 spa_t *myspa = spa; 4015 spa = NULL; 4016 mutex_enter(&spa_namespace_lock); 4017 while ((spa = spa_next(spa)) != NULL) { 4018 if (spa->spa_state != POOL_STATE_ACTIVE) 4019 continue; 4020 if (spa == myspa) 4021 continue; 4022 spa_open_ref(spa, FTAG); 4023 mutex_exit(&spa_namespace_lock); 4024 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 4025 mutex_enter(&spa_namespace_lock); 4026 spa_close(spa, FTAG); 4027 } 4028 mutex_exit(&spa_namespace_lock); 4029 } 4030 4031 return (error); 4032} 4033 4034/* 4035 * Split a set of devices from their mirrors, and create a new pool from them. 4036 */ 4037int 4038spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, 4039 nvlist_t *props, boolean_t exp) 4040{ 4041 int error = 0; 4042 uint64_t txg, *glist; 4043 spa_t *newspa; 4044 uint_t c, children, lastlog; 4045 nvlist_t **child, *nvl, *tmp; 4046 dmu_tx_t *tx; 4047 char *altroot = NULL; 4048 vdev_t *rvd, **vml = NULL; /* vdev modify list */ 4049 boolean_t activate_slog; 4050 4051 if (!spa_writeable(spa)) 4052 return (EROFS); 4053 4054 txg = spa_vdev_enter(spa); 4055 4056 /* clear the log and flush everything up to now */ 4057 activate_slog = spa_passivate_log(spa); 4058 (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 4059 error = spa_offline_log(spa); 4060 txg = spa_vdev_config_enter(spa); 4061 4062 if (activate_slog) 4063 spa_activate_log(spa); 4064 4065 if (error != 0) 4066 return (spa_vdev_exit(spa, NULL, txg, error)); 4067 4068 /* check new spa name before going any further */ 4069 if (spa_lookup(newname) != NULL) 4070 return (spa_vdev_exit(spa, NULL, txg, EEXIST)); 4071 4072 /* 4073 * scan through all the children to ensure they're all mirrors 4074 */ 4075 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || 4076 nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, 4077 &children) != 0) 4078 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 4079 4080 /* first, check to ensure we've got the right child count */ 4081 rvd = spa->spa_root_vdev; 4082 lastlog = 0; 4083 for (c = 0; c < rvd->vdev_children; c++) { 4084 vdev_t *vd = rvd->vdev_child[c]; 4085 4086 /* don't count the holes & logs as children */ 4087 if (vd->vdev_islog || vd->vdev_ishole) { 4088 if (lastlog == 0) 4089 lastlog = c; 4090 continue; 4091 } 4092 4093 lastlog = 0; 4094 } 4095 if (children != (lastlog != 0 ? lastlog : rvd->vdev_children)) 4096 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 4097 4098 /* next, ensure no spare or cache devices are part of the split */ 4099 if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || 4100 nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) 4101 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 4102 4103 vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); 4104 glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); 4105 4106 /* then, loop over each vdev and validate it */ 4107 for (c = 0; c < children; c++) { 4108 uint64_t is_hole = 0; 4109 4110 (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, 4111 &is_hole); 4112 4113 if (is_hole != 0) { 4114 if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || 4115 spa->spa_root_vdev->vdev_child[c]->vdev_islog) { 4116 continue; 4117 } else { 4118 error = EINVAL; 4119 break; 4120 } 4121 } 4122 4123 /* which disk is going to be split? 
*/ 4124 if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, 4125 &glist[c]) != 0) { 4126 error = EINVAL; 4127 break; 4128 } 4129 4130 /* look it up in the spa */ 4131 vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); 4132 if (vml[c] == NULL) { 4133 error = ENODEV; 4134 break; 4135 } 4136 4137 /* make sure there's nothing stopping the split */ 4138 if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || 4139 vml[c]->vdev_islog || 4140 vml[c]->vdev_ishole || 4141 vml[c]->vdev_isspare || 4142 vml[c]->vdev_isl2cache || 4143 !vdev_writeable(vml[c]) || 4144 vml[c]->vdev_children != 0 || 4145 vml[c]->vdev_state != VDEV_STATE_HEALTHY || 4146 c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { 4147 error = EINVAL; 4148 break; 4149 } 4150 4151 if (vdev_dtl_required(vml[c])) { 4152 error = EBUSY; 4153 break; 4154 } 4155 4156 /* we need certain info from the top level */ 4157 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, 4158 vml[c]->vdev_top->vdev_ms_array) == 0); 4159 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, 4160 vml[c]->vdev_top->vdev_ms_shift) == 0); 4161 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, 4162 vml[c]->vdev_top->vdev_asize) == 0); 4163 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, 4164 vml[c]->vdev_top->vdev_ashift) == 0); 4165 } 4166 4167 if (error != 0) { 4168 kmem_free(vml, children * sizeof (vdev_t *)); 4169 kmem_free(glist, children * sizeof (uint64_t)); 4170 return (spa_vdev_exit(spa, NULL, txg, error)); 4171 } 4172 4173 /* stop writers from using the disks */ 4174 for (c = 0; c < children; c++) { 4175 if (vml[c] != NULL) 4176 vml[c]->vdev_offline = B_TRUE; 4177 } 4178 vdev_reopen(spa->spa_root_vdev); 4179 4180 /* 4181 * Temporarily record the splitting vdevs in the spa config. This 4182 * will disappear once the config is regenerated. 4183 */ 4184 VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); 4185 VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 4186 glist, children) == 0); 4187 kmem_free(glist, children * sizeof (uint64_t)); 4188 4189 VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, 4190 nvl) == 0); 4191 spa->spa_config_splitting = nvl; 4192 vdev_config_dirty(spa->spa_root_vdev); 4193 4194 /* configure and create the new pool */ 4195 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0); 4196 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 4197 exp ? 
POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0); 4198 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 4199 spa_version(spa)) == 0); 4200 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, 4201 spa->spa_config_txg) == 0); 4202 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, 4203 spa_generate_guid(NULL)) == 0); 4204 (void) nvlist_lookup_string(props, 4205 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 4206 4207 /* add the new pool to the namespace */ 4208 newspa = spa_add(newname, config, altroot); 4209 newspa->spa_config_txg = spa->spa_config_txg; 4210 spa_set_log_state(newspa, SPA_LOG_CLEAR); 4211 4212 /* release the spa config lock, retaining the namespace lock */ 4213 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 4214 4215 if (zio_injection_enabled) 4216 zio_handle_panic_injection(spa, FTAG, 1); 4217 4218 spa_activate(newspa, spa_mode_global); 4219 spa_async_suspend(newspa); 4220 4221 /* create the new pool from the disks of the original pool */ 4222 error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE); 4223 if (error) 4224 goto out; 4225 4226 /* if that worked, generate a real config for the new pool */ 4227 if (newspa->spa_root_vdev != NULL) { 4228 VERIFY(nvlist_alloc(&newspa->spa_config_splitting, 4229 NV_UNIQUE_NAME, KM_SLEEP) == 0); 4230 VERIFY(nvlist_add_uint64(newspa->spa_config_splitting, 4231 ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0); 4232 spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, 4233 B_TRUE)); 4234 } 4235 4236 /* set the props */ 4237 if (props != NULL) { 4238 spa_configfile_set(newspa, props, B_FALSE); 4239 error = spa_prop_set(newspa, props); 4240 if (error) 4241 goto out; 4242 } 4243 4244 /* flush everything */ 4245 txg = spa_vdev_config_enter(newspa); 4246 vdev_config_dirty(newspa->spa_root_vdev); 4247 (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); 4248 4249 if (zio_injection_enabled) 4250 zio_handle_panic_injection(spa, FTAG, 2); 4251 4252 spa_async_resume(newspa); 4253 4254 /* finally, update the original pool's config */ 4255 txg = spa_vdev_config_enter(spa); 4256 tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 4257 error = dmu_tx_assign(tx, TXG_WAIT); 4258 if (error != 0) 4259 dmu_tx_abort(tx); 4260 for (c = 0; c < children; c++) { 4261 if (vml[c] != NULL) { 4262 vdev_split(vml[c]); 4263 if (error == 0) 4264 spa_history_internal_log(LOG_POOL_VDEV_DETACH, 4265 spa, tx, CRED(), "vdev=%s", 4266 vml[c]->vdev_path); 4267 vdev_free(vml[c]); 4268 } 4269 } 4270 vdev_config_dirty(spa->spa_root_vdev); 4271 spa->spa_config_splitting = NULL; 4272 nvlist_free(nvl); 4273 if (error == 0) 4274 dmu_tx_commit(tx); 4275 (void) spa_vdev_exit(spa, NULL, txg, 0); 4276 4277 if (zio_injection_enabled) 4278 zio_handle_panic_injection(spa, FTAG, 3); 4279 4280 /* split is complete; log a history record */ 4281 spa_history_internal_log(LOG_POOL_SPLIT, newspa, NULL, CRED(), 4282 "split new pool %s from pool %s", newname, spa_name(spa)); 4283 4284 kmem_free(vml, children * sizeof (vdev_t *)); 4285 4286 /* if we're not going to mount the filesystems in userland, export */ 4287 if (exp) 4288 error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, 4289 B_FALSE, B_FALSE); 4290 4291 return (error); 4292 4293out: 4294 spa_unload(newspa); 4295 spa_deactivate(newspa); 4296 spa_remove(newspa); 4297 4298 txg = spa_vdev_config_enter(spa); 4299 nvlist_free(spa->spa_config_splitting); 4300 spa->spa_config_splitting = NULL; 4301 (void) spa_vdev_exit(spa, NULL, txg, error); 4302 4303 kmem_free(vml, children * sizeof (vdev_t 
*)); 4304 return (error); 4305} 4306 4307static nvlist_t * 4308spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) 4309{ 4310 for (int i = 0; i < count; i++) { 4311 uint64_t guid; 4312 4313 VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID, 4314 &guid) == 0); 4315 4316 if (guid == target_guid) 4317 return (nvpp[i]); 4318 } 4319 4320 return (NULL); 4321} 4322 4323static void 4324spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, 4325 nvlist_t *dev_to_remove) 4326{ 4327 nvlist_t **newdev = NULL; 4328 4329 if (count > 1) 4330 newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP); 4331 4332 for (int i = 0, j = 0; i < count; i++) { 4333 if (dev[i] == dev_to_remove) 4334 continue; 4335 VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); 4336 } 4337 4338 VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); 4339 VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); 4340 4341 for (int i = 0; i < count - 1; i++) 4342 nvlist_free(newdev[i]); 4343 4344 if (count > 1) 4345 kmem_free(newdev, (count - 1) * sizeof (void *)); 4346} 4347 4348/* 4349 * Removing a device from the vdev namespace requires several steps 4350 * and can take a significant amount of time. As a result we use 4351 * the spa_vdev_config_[enter/exit] functions which allow us to 4352 * grab and release the spa_config_lock while still holding the namespace 4353 * lock. During each step the configuration is synced out. 4354 */ 4355 4356/* 4357 * Evacuate the device. 4358 */ 4359int 4360spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd) 4361{ 4362 int error = 0; 4363 uint64_t txg; 4364 4365 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 4366 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 4367 ASSERT(vd == vd->vdev_top); 4368 4369 /* 4370 * Evacuate the device. We don't hold the config lock as writer 4371 * since we need to do I/O but we do keep the 4372 * spa_namespace_lock held. Once this completes the device 4373 * should no longer have any blocks allocated on it. 4374 */ 4375 if (vd->vdev_islog) { 4376 error = dmu_objset_find(spa_name(spa), zil_vdev_offline, 4377 NULL, DS_FIND_CHILDREN); 4378 } else { 4379 error = ENOTSUP; /* until we have bp rewrite */ 4380 } 4381 4382 txg_wait_synced(spa_get_dsl(spa), 0); 4383 4384 if (error) 4385 return (error); 4386 4387 /* 4388 * The evacuation succeeded. Remove any remaining MOS metadata 4389 * associated with this vdev, and wait for these changes to sync. 4390 */ 4391 txg = spa_vdev_config_enter(spa); 4392 vd->vdev_removing = B_TRUE; 4393 vdev_dirty(vd, 0, NULL, txg); 4394 vdev_config_dirty(vd); 4395 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 4396 4397 return (0); 4398} 4399 4400/* 4401 * Complete the removal by cleaning up the namespace. 
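 *
 * A sketch of the expected calling sequence, matching its use in
 * spa_vdev_remove() below (error handling elided):
 *
 *     metaslab_group_passivate(mg);
 *     error = spa_vdev_remove_evacuate(spa, vd);
 *     ...
 *     spa_vdev_remove_from_namespace(spa, vd);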
4402 */ 4403void 4404spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd) 4405{ 4406 vdev_t *rvd = spa->spa_root_vdev; 4407 uint64_t id = vd->vdev_id; 4408 boolean_t last_vdev = (id == (rvd->vdev_children - 1)); 4409 4410 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 4411 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 4412 ASSERT(vd == vd->vdev_top); 4413 4414 (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 4415 4416 if (list_link_active(&vd->vdev_state_dirty_node)) 4417 vdev_state_clean(vd); 4418 if (list_link_active(&vd->vdev_config_dirty_node)) 4419 vdev_config_clean(vd); 4420 4421 vdev_free(vd); 4422 4423 if (last_vdev) { 4424 vdev_compact_children(rvd); 4425 } else { 4426 vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); 4427 vdev_add_child(rvd, vd); 4428 } 4429 vdev_config_dirty(rvd); 4430 4431 /* 4432 * Reassess the health of our root vdev. 4433 */ 4434 vdev_reopen(rvd); 4435} 4436 4437/* 4438 * Remove a device from the pool. Currently, this supports removing only hot 4439 * spares, slogs, and level 2 ARC devices. 4440 */ 4441int 4442spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 4443{ 4444 vdev_t *vd; 4445 metaslab_group_t *mg; 4446 nvlist_t **spares, **l2cache, *nv; 4447 uint64_t txg = 0; 4448 uint_t nspares, nl2cache; 4449 int error = 0; 4450 boolean_t locked = MUTEX_HELD(&spa_namespace_lock); 4451 4452 if (!locked) 4453 txg = spa_vdev_enter(spa); 4454 4455 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 4456 4457 if (spa->spa_spares.sav_vdevs != NULL && 4458 nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 4459 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 && 4460 (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) { 4461 /* 4462 * Only remove the hot spare if it's not currently in use 4463 * in this pool. 4464 */ 4465 if (vd == NULL || unspare) { 4466 spa_vdev_remove_aux(spa->spa_spares.sav_config, 4467 ZPOOL_CONFIG_SPARES, spares, nspares, nv); 4468 spa_load_spares(spa); 4469 spa->spa_spares.sav_sync = B_TRUE; 4470 } else { 4471 error = EBUSY; 4472 } 4473 } else if (spa->spa_l2cache.sav_vdevs != NULL && 4474 nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 4475 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && 4476 (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) { 4477 /* 4478 * Cache devices can always be removed. 4479 */ 4480 spa_vdev_remove_aux(spa->spa_l2cache.sav_config, 4481 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); 4482 spa_load_l2cache(spa); 4483 spa->spa_l2cache.sav_sync = B_TRUE; 4484 } else if (vd != NULL && vd->vdev_islog) { 4485 ASSERT(!locked); 4486 ASSERT(vd == vd->vdev_top); 4487 4488 /* 4489 * XXX - Once we have bp-rewrite this should 4490 * become the common case. 4491 */ 4492 4493 mg = vd->vdev_mg; 4494 4495 /* 4496 * Stop allocating from this vdev. 4497 */ 4498 metaslab_group_passivate(mg); 4499 4500 /* 4501 * Wait for the youngest allocations and frees to sync, 4502 * and then wait for the deferral of those frees to finish. 4503 */ 4504 spa_vdev_config_exit(spa, NULL, 4505 txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); 4506 4507 /* 4508 * Attempt to evacuate the vdev. 4509 */ 4510 error = spa_vdev_remove_evacuate(spa, vd); 4511 4512 txg = spa_vdev_config_enter(spa); 4513 4514 /* 4515 * If we couldn't evacuate the vdev, unwind. 4516 */ 4517 if (error) { 4518 metaslab_group_activate(mg); 4519 return (spa_vdev_exit(spa, NULL, txg, error)); 4520 } 4521 4522 /* 4523 * Clean up the vdev namespace. 
*/
4525 spa_vdev_remove_from_namespace(spa, vd);
4526
4527 } else if (vd != NULL) {
4528 /*
4529 * Normal vdevs cannot be removed (yet).
4530 */
4531 error = ENOTSUP;
4532 } else {
4533 /*
4534 * There is no vdev of any kind with the specified guid.
4535 */
4536 error = ENOENT;
4537 }
4538
4539 if (!locked)
4540 return (spa_vdev_exit(spa, NULL, txg, error));
4541
4542 return (error);
4543}
4544
4545/*
4546 * Find any device that's done replacing, or a vdev marked 'unspare' that's
4547 * currently spared, so we can detach it.
4548 */
4549static vdev_t *
4550spa_vdev_resilver_done_hunt(vdev_t *vd)
4551{
4552 vdev_t *newvd, *oldvd;
4553
4554 for (int c = 0; c < vd->vdev_children; c++) {
4555 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
4556 if (oldvd != NULL)
4557 return (oldvd);
4558 }
4559
4560 /*
4561 * Check for a completed replacement.
4562 */
4563 if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) {
4564 oldvd = vd->vdev_child[0];
4565 newvd = vd->vdev_child[1];
4566
4567 if (vdev_dtl_empty(newvd, DTL_MISSING) &&
4568 !vdev_dtl_required(oldvd))
4569 return (oldvd);
4570 }
4571
4572 /*
4573 * Check for a completed resilver with the 'unspare' flag set.
4574 */
4575 if (vd->vdev_ops == &vdev_spare_ops && vd->vdev_children == 2) {
4576 newvd = vd->vdev_child[0];
4577 oldvd = vd->vdev_child[1];
4578
4579 if (newvd->vdev_unspare &&
4580 vdev_dtl_empty(newvd, DTL_MISSING) &&
4581 !vdev_dtl_required(oldvd)) {
4582 newvd->vdev_unspare = 0;
4583 return (oldvd);
4584 }
4585 }
4586
4587 return (NULL);
4588}
4589
4590static void
4591spa_vdev_resilver_done(spa_t *spa)
4592{
4593 vdev_t *vd, *pvd, *ppvd;
4594 uint64_t guid, sguid, pguid, ppguid;
4595
4596 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4597
4598 while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
4599 pvd = vd->vdev_parent;
4600 ppvd = pvd->vdev_parent;
4601 guid = vd->vdev_guid;
4602 pguid = pvd->vdev_guid;
4603 ppguid = ppvd->vdev_guid;
4604 sguid = 0;
4605 /*
4606 * If we have just finished replacing a hot spared device, then
4607 * we need to detach the parent's first child (the original hot
4608 * spare) as well.
4609 */
4610 if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0) {
4611 ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
4612 ASSERT(ppvd->vdev_children == 2);
4613 sguid = ppvd->vdev_child[1]->vdev_guid;
4614 }
4615 spa_config_exit(spa, SCL_ALL, FTAG);
4616 if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
4617 return;
4618 if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0)
4619 return;
4620 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4621 }
4622
4623 spa_config_exit(spa, SCL_ALL, FTAG);
4624}
4625
4626/*
4627 * Update the stored path or FRU for this vdev.
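 *
 * The spa_vdev_setpath() and spa_vdev_setfru() wrappers below are the
 * intended entry points; a hypothetical example (the device path shown
 * is illustrative only):
 *
 *     (void) spa_vdev_setpath(spa, guid, "/dev/dsk/c1t2d0s0");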
4628 */ 4629int 4630spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, 4631 boolean_t ispath) 4632{ 4633 vdev_t *vd; 4634 4635 spa_vdev_state_enter(spa, SCL_ALL); 4636 4637 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 4638 return (spa_vdev_state_exit(spa, NULL, ENOENT)); 4639 4640 if (!vd->vdev_ops->vdev_op_leaf) 4641 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 4642 4643 if (ispath) { 4644 spa_strfree(vd->vdev_path); 4645 vd->vdev_path = spa_strdup(value); 4646 } else { 4647 if (vd->vdev_fru != NULL) 4648 spa_strfree(vd->vdev_fru); 4649 vd->vdev_fru = spa_strdup(value); 4650 } 4651 4652 return (spa_vdev_state_exit(spa, vd, 0)); 4653} 4654 4655int 4656spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 4657{ 4658 return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); 4659} 4660 4661int 4662spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) 4663{ 4664 return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); 4665} 4666 4667/* 4668 * ========================================================================== 4669 * SPA Scrubbing 4670 * ========================================================================== 4671 */ 4672 4673int 4674spa_scrub(spa_t *spa, pool_scrub_type_t type) 4675{ 4676 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 4677 4678 if ((uint_t)type >= POOL_SCRUB_TYPES) 4679 return (ENOTSUP); 4680 4681 /* 4682 * If a resilver was requested, but there is no DTL on a 4683 * writeable leaf device, we have nothing to do. 4684 */ 4685 if (type == POOL_SCRUB_RESILVER && 4686 !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 4687 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 4688 return (0); 4689 } 4690 4691 if (type == POOL_SCRUB_EVERYTHING && 4692 spa->spa_dsl_pool->dp_scrub_func != SCRUB_FUNC_NONE && 4693 spa->spa_dsl_pool->dp_scrub_isresilver) 4694 return (EBUSY); 4695 4696 if (type == POOL_SCRUB_EVERYTHING || type == POOL_SCRUB_RESILVER) { 4697 return (dsl_pool_scrub_clean(spa->spa_dsl_pool)); 4698 } else if (type == POOL_SCRUB_NONE) { 4699 return (dsl_pool_scrub_cancel(spa->spa_dsl_pool)); 4700 } else { 4701 return (EINVAL); 4702 } 4703} 4704 4705/* 4706 * ========================================================================== 4707 * SPA async task processing 4708 * ========================================================================== 4709 */ 4710 4711static void 4712spa_async_remove(spa_t *spa, vdev_t *vd) 4713{ 4714 if (vd->vdev_remove_wanted) { 4715 vd->vdev_remove_wanted = 0; 4716 vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); 4717 4718 /* 4719 * We want to clear the stats, but we don't want to do a full 4720 * vdev_clear() as that will cause us to throw away 4721 * degraded/faulted state as well as attempt to reopen the 4722 * device, all of which is a waste. 
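 * Instead, only the per-vdev error counters (read, write and checksum)
 * are cleared by hand below, and the top-level vdev is marked state
 * dirty so the change is recorded.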
4723 */ 4724 vd->vdev_stat.vs_read_errors = 0; 4725 vd->vdev_stat.vs_write_errors = 0; 4726 vd->vdev_stat.vs_checksum_errors = 0; 4727 4728 vdev_state_dirty(vd->vdev_top); 4729 } 4730 4731 for (int c = 0; c < vd->vdev_children; c++) 4732 spa_async_remove(spa, vd->vdev_child[c]); 4733} 4734 4735static void 4736spa_async_probe(spa_t *spa, vdev_t *vd) 4737{ 4738 if (vd->vdev_probe_wanted) { 4739 vd->vdev_probe_wanted = 0; 4740 vdev_reopen(vd); /* vdev_open() does the actual probe */ 4741 } 4742 4743 for (int c = 0; c < vd->vdev_children; c++) 4744 spa_async_probe(spa, vd->vdev_child[c]); 4745} 4746 4747static void 4748spa_async_autoexpand(spa_t *spa, vdev_t *vd) 4749{ 4750 sysevent_id_t eid; 4751 nvlist_t *attr; 4752 char *physpath; 4753 4754 if (!spa->spa_autoexpand) 4755 return; 4756 4757 for (int c = 0; c < vd->vdev_children; c++) { 4758 vdev_t *cvd = vd->vdev_child[c]; 4759 spa_async_autoexpand(spa, cvd); 4760 } 4761 4762 if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) 4763 return; 4764 4765 physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP); 4766 (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath); 4767 4768 VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0); 4769 VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0); 4770 4771 (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS, 4772 ESC_DEV_DLE, attr, &eid, DDI_SLEEP); 4773 4774 nvlist_free(attr); 4775 kmem_free(physpath, MAXPATHLEN); 4776} 4777 4778static void 4779spa_async_thread(void *arg) 4780{ 4781 int tasks; 4782 spa_t *spa = arg; 4783 4784 ASSERT(spa->spa_sync_on); 4785 4786 mutex_enter(&spa->spa_async_lock); 4787 tasks = spa->spa_async_tasks; 4788 spa->spa_async_tasks = 0; 4789 mutex_exit(&spa->spa_async_lock); 4790 4791 /* 4792 * See if the config needs to be updated. 4793 */ 4794 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 4795 uint64_t old_space, new_space; 4796 4797 mutex_enter(&spa_namespace_lock); 4798 old_space = metaslab_class_get_space(spa_normal_class(spa)); 4799 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 4800 new_space = metaslab_class_get_space(spa_normal_class(spa)); 4801 mutex_exit(&spa_namespace_lock); 4802 4803 /* 4804 * If the pool grew as a result of the config update, 4805 * then log an internal history event. 4806 */ 4807 if (new_space != old_space) { 4808 spa_history_internal_log(LOG_POOL_VDEV_ONLINE, 4809 spa, NULL, CRED(), 4810 "pool '%s' size: %llu(+%llu)", 4811 spa_name(spa), new_space, new_space - old_space); 4812 } 4813 } 4814 4815 /* 4816 * See if any devices need to be marked REMOVED. 4817 */ 4818 if (tasks & SPA_ASYNC_REMOVE) { 4819 spa_vdev_state_enter(spa, SCL_NONE); 4820 spa_async_remove(spa, spa->spa_root_vdev); 4821 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 4822 spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); 4823 for (int i = 0; i < spa->spa_spares.sav_count; i++) 4824 spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); 4825 (void) spa_vdev_state_exit(spa, NULL, 0); 4826 } 4827 4828 if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { 4829 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 4830 spa_async_autoexpand(spa, spa->spa_root_vdev); 4831 spa_config_exit(spa, SCL_CONFIG, FTAG); 4832 } 4833 4834 /* 4835 * See if any devices need to be probed. 4836 */ 4837 if (tasks & SPA_ASYNC_PROBE) { 4838 spa_vdev_state_enter(spa, SCL_NONE); 4839 spa_async_probe(spa, spa->spa_root_vdev); 4840 (void) spa_vdev_state_exit(spa, NULL, 0); 4841 } 4842 4843 /* 4844 * If any devices are done replacing, detach them. 
4845 */ 4846 if (tasks & SPA_ASYNC_RESILVER_DONE) 4847 spa_vdev_resilver_done(spa); 4848 4849 /* 4850 * Kick off a resilver. 4851 */ 4852 if (tasks & SPA_ASYNC_RESILVER) 4853 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER) == 0); 4854 4855 /* 4856 * Let the world know that we're done. 4857 */ 4858 mutex_enter(&spa->spa_async_lock); 4859 spa->spa_async_thread = NULL; 4860 cv_broadcast(&spa->spa_async_cv); 4861 mutex_exit(&spa->spa_async_lock); 4862 thread_exit(); 4863} 4864 4865void 4866spa_async_suspend(spa_t *spa) 4867{ 4868 mutex_enter(&spa->spa_async_lock); 4869 spa->spa_async_suspended++; 4870 while (spa->spa_async_thread != NULL) 4871 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 4872 mutex_exit(&spa->spa_async_lock); 4873} 4874 4875void 4876spa_async_resume(spa_t *spa) 4877{ 4878 mutex_enter(&spa->spa_async_lock); 4879 ASSERT(spa->spa_async_suspended != 0); 4880 spa->spa_async_suspended--; 4881 mutex_exit(&spa->spa_async_lock); 4882} 4883 4884static void 4885spa_async_dispatch(spa_t *spa) 4886{ 4887 mutex_enter(&spa->spa_async_lock); 4888 if (spa->spa_async_tasks && !spa->spa_async_suspended && 4889 spa->spa_async_thread == NULL && 4890 rootdir != NULL && !vn_is_readonly(rootdir)) 4891 spa->spa_async_thread = thread_create(NULL, 0, 4892 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 4893 mutex_exit(&spa->spa_async_lock); 4894} 4895 4896void 4897spa_async_request(spa_t *spa, int task) 4898{ 4899 mutex_enter(&spa->spa_async_lock); 4900 spa->spa_async_tasks |= task; 4901 mutex_exit(&spa->spa_async_lock); 4902} 4903 4904/* 4905 * ========================================================================== 4906 * SPA syncing routines 4907 * ========================================================================== 4908 */ 4909static void 4910spa_sync_deferred_bplist(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx, uint64_t txg) 4911{ 4912 blkptr_t blk; 4913 uint64_t itor = 0; 4914 uint8_t c = 1; 4915 4916 while (bplist_iterate(bpl, &itor, &blk) == 0) { 4917 ASSERT(blk.blk_birth < txg); 4918 zio_free(spa, txg, &blk); 4919 } 4920 4921 bplist_vacate(bpl, tx); 4922 4923 /* 4924 * Pre-dirty the first block so we sync to convergence faster. 4925 * (Usually only the first block is needed.) 4926 */ 4927 dmu_write(bpl->bpl_mos, spa->spa_deferred_bplist_obj, 0, 1, &c, tx); 4928} 4929 4930static void 4931spa_sync_free(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 4932{ 4933 zio_t *zio = arg; 4934 4935 zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp, 4936 zio->io_flags)); 4937} 4938 4939static void 4940spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 4941{ 4942 char *packed = NULL; 4943 size_t bufsize; 4944 size_t nvsize = 0; 4945 dmu_buf_t *db; 4946 4947 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 4948 4949 /* 4950 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration 4951 * information. This avoids the dbuf_will_dirty() path and 4952 * saves us a pre-read to get data we don't actually care about. 
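 *
 * For example (assuming SPA_CONFIG_BLOCKSIZE is 16K, per its definition
 * in spa.h), a 5000-byte packed nvlist is rounded up by P2ROUNDUP() and
 * written as a single 16K block, with the unused tail zeroed below.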
4953 */ 4954 bufsize = P2ROUNDUP(nvsize, SPA_CONFIG_BLOCKSIZE); 4955 packed = kmem_alloc(bufsize, KM_SLEEP); 4956 4957 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 4958 KM_SLEEP) == 0); 4959 bzero(packed + nvsize, bufsize - nvsize); 4960 4961 dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); 4962 4963 kmem_free(packed, bufsize); 4964 4965 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 4966 dmu_buf_will_dirty(db, tx); 4967 *(uint64_t *)db->db_data = nvsize; 4968 dmu_buf_rele(db, FTAG); 4969} 4970 4971static void 4972spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, 4973 const char *config, const char *entry) 4974{ 4975 nvlist_t *nvroot; 4976 nvlist_t **list; 4977 int i; 4978 4979 if (!sav->sav_sync) 4980 return; 4981 4982 /* 4983 * Update the MOS nvlist describing the list of available devices. 4984 * spa_validate_aux() will have already made sure this nvlist is 4985 * valid and the vdevs are labeled appropriately. 4986 */ 4987 if (sav->sav_object == 0) { 4988 sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, 4989 DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, 4990 sizeof (uint64_t), tx); 4991 VERIFY(zap_update(spa->spa_meta_objset, 4992 DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, 4993 &sav->sav_object, tx) == 0); 4994 } 4995 4996 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 4997 if (sav->sav_count == 0) { 4998 VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); 4999 } else { 5000 list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 5001 for (i = 0; i < sav->sav_count; i++) 5002 list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], 5003 B_FALSE, B_FALSE, B_TRUE); 5004 VERIFY(nvlist_add_nvlist_array(nvroot, config, list, 5005 sav->sav_count) == 0); 5006 for (i = 0; i < sav->sav_count; i++) 5007 nvlist_free(list[i]); 5008 kmem_free(list, sav->sav_count * sizeof (void *)); 5009 } 5010 5011 spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); 5012 nvlist_free(nvroot); 5013 5014 sav->sav_sync = B_FALSE; 5015} 5016 5017static void 5018spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 5019{ 5020 nvlist_t *config; 5021 5022 if (list_is_empty(&spa->spa_config_dirty_list)) 5023 return; 5024 5025 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 5026 5027 config = spa_config_generate(spa, spa->spa_root_vdev, 5028 dmu_tx_get_txg(tx), B_FALSE); 5029 5030 spa_config_exit(spa, SCL_STATE, FTAG); 5031 5032 if (spa->spa_config_syncing) 5033 nvlist_free(spa->spa_config_syncing); 5034 spa->spa_config_syncing = config; 5035 5036 spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 5037} 5038 5039/* 5040 * Set zpool properties. 5041 */ 5042static void 5043spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) 5044{ 5045 spa_t *spa = arg1; 5046 objset_t *mos = spa->spa_meta_objset; 5047 nvlist_t *nvp = arg2; 5048 nvpair_t *elem; 5049 uint64_t intval; 5050 char *strval; 5051 zpool_prop_t prop; 5052 const char *propname; 5053 zprop_type_t proptype; 5054 5055 mutex_enter(&spa->spa_props_lock); 5056 5057 elem = NULL; 5058 while ((elem = nvlist_next_nvpair(nvp, elem))) { 5059 switch (prop = zpool_name_to_prop(nvpair_name(elem))) { 5060 case ZPOOL_PROP_VERSION: 5061 /* 5062 * Only set version for non-zpool-creation cases 5063 * (set/import). spa_create() needs special care 5064 * for version setting. 
5065 */
5066 if (tx->tx_txg != TXG_INITIAL) {
5067 VERIFY(nvpair_value_uint64(elem,
5068 &intval) == 0);
5069 ASSERT(intval <= SPA_VERSION);
5070 ASSERT(intval >= spa_version(spa));
5071 spa->spa_uberblock.ub_version = intval;
5072 vdev_config_dirty(spa->spa_root_vdev);
5073 }
5074 break;
5075
5076 case ZPOOL_PROP_ALTROOT:
5077 /*
5078 * 'altroot' is a non-persistent property. It should
5079 * have been set temporarily at creation or import time.
5080 */
5081 ASSERT(spa->spa_root != NULL);
5082 break;
5083
5084 case ZPOOL_PROP_CACHEFILE:
5085 /*
5086 * 'cachefile' is also a non-persistent property.
5087 */
5088 break;
5089 default:
5090 /*
5091 * Set pool property values in the poolprops mos object.
5092 */
5093 if (spa->spa_pool_props_object == 0) {
5094 VERIFY((spa->spa_pool_props_object =
5095 zap_create(mos, DMU_OT_POOL_PROPS,
5096 DMU_OT_NONE, 0, tx)) > 0);
5097
5098 VERIFY(zap_update(mos,
5099 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
5100 8, 1, &spa->spa_pool_props_object, tx)
5101 == 0);
5102 }
5103
5104 /* normalize the property name */
5105 propname = zpool_prop_to_name(prop);
5106 proptype = zpool_prop_get_type(prop);
5107
5108 if (nvpair_type(elem) == DATA_TYPE_STRING) {
5109 ASSERT(proptype == PROP_TYPE_STRING);
5110 VERIFY(nvpair_value_string(elem, &strval) == 0);
5111 VERIFY(zap_update(mos,
5112 spa->spa_pool_props_object, propname,
5113 1, strlen(strval) + 1, strval, tx) == 0);
5114
5115 } else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
5116 VERIFY(nvpair_value_uint64(elem, &intval) == 0);
5117
5118 if (proptype == PROP_TYPE_INDEX) {
5119 const char *unused;
5120 VERIFY(zpool_prop_index_to_string(
5121 prop, intval, &unused) == 0);
5122 }
5123 VERIFY(zap_update(mos,
5124 spa->spa_pool_props_object, propname,
5125 8, 1, &intval, tx) == 0);
5126 } else {
5127 ASSERT(0); /* not allowed */
5128 }
5129
5130 switch (prop) {
5131 case ZPOOL_PROP_DELEGATION:
5132 spa->spa_delegation = intval;
5133 break;
5134 case ZPOOL_PROP_BOOTFS:
5135 spa->spa_bootfs = intval;
5136 break;
5137 case ZPOOL_PROP_FAILUREMODE:
5138 spa->spa_failmode = intval;
5139 break;
5140 case ZPOOL_PROP_AUTOEXPAND:
5141 spa->spa_autoexpand = intval;
5142 spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
5143 break;
5144 case ZPOOL_PROP_DEDUPDITTO:
5145 spa->spa_dedup_ditto = intval;
5146 break;
5147 default:
5148 break;
5149 }
5150 }
5151
5152 /* log internal history if this is not a zpool create */
5153 if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY &&
5154 tx->tx_txg != TXG_INITIAL) {
5155 spa_history_internal_log(LOG_POOL_PROPSET,
5156 spa, tx, cr, "%s %lld %s",
5157 nvpair_name(elem), intval, spa_name(spa));
5158 }
5159 }
5160
5161 mutex_exit(&spa->spa_props_lock);
5162}
5163
5164/*
5165 * Sync the specified transaction group. New blocks may be dirtied as
5166 * part of the process, so we iterate until it converges.
5167 */
5168void
5169spa_sync(spa_t *spa, uint64_t txg)
5170{
5171 dsl_pool_t *dp = spa->spa_dsl_pool;
5172 objset_t *mos = spa->spa_meta_objset;
5173 bplist_t *defer_bpl = &spa->spa_deferred_bplist;
5174 bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
5175 vdev_t *rvd = spa->spa_root_vdev;
5176 vdev_t *vd;
5177 dmu_tx_t *tx;
5178 int error;
5179
5180 /*
5181 * Lock out configuration changes.
5182 */
5183 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
5184
5185 spa->spa_syncing_txg = txg;
5186 spa->spa_sync_pass = 0;
5187
5188 /*
5189 * If there are any pending vdev state changes, convert them
5190 * into config changes that go out with this transaction group.
5191 */ 5192 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 5193 while (list_head(&spa->spa_state_dirty_list) != NULL) { 5194 /* 5195 * We need the write lock here because, for aux vdevs, 5196 * calling vdev_config_dirty() modifies sav_config. 5197 * This is ugly and will become unnecessary when we 5198 * eliminate the aux vdev wart by integrating all vdevs 5199 * into the root vdev tree. 5200 */ 5201 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 5202 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); 5203 while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { 5204 vdev_state_clean(vd); 5205 vdev_config_dirty(vd); 5206 } 5207 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 5208 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 5209 } 5210 spa_config_exit(spa, SCL_STATE, FTAG); 5211 5212 VERIFY(0 == bplist_open(defer_bpl, mos, spa->spa_deferred_bplist_obj)); 5213 5214 tx = dmu_tx_create_assigned(dp, txg); 5215 5216 /* 5217 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, 5218 * set spa_deflate if we have no raid-z vdevs. 5219 */ 5220 if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && 5221 spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { 5222 int i; 5223 5224 for (i = 0; i < rvd->vdev_children; i++) { 5225 vd = rvd->vdev_child[i]; 5226 if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 5227 break; 5228 } 5229 if (i == rvd->vdev_children) { 5230 spa->spa_deflate = TRUE; 5231 VERIFY(0 == zap_add(spa->spa_meta_objset, 5232 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 5233 sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 5234 } 5235 } 5236 5237 if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && 5238 spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { 5239 dsl_pool_create_origin(dp, tx); 5240 5241 /* Keeping the origin open increases spa_minref */ 5242 spa->spa_minref += 3; 5243 } 5244 5245 if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && 5246 spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { 5247 dsl_pool_upgrade_clones(dp, tx); 5248 } 5249 5250 /* 5251 * If anything has changed in this txg, push the deferred frees 5252 * from the previous txg. If not, leave them alone so that we 5253 * don't generate work on an otherwise idle system. 5254 */ 5255 if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || 5256 !txg_list_empty(&dp->dp_dirty_dirs, txg) || 5257 !txg_list_empty(&dp->dp_sync_tasks, txg)) 5258 spa_sync_deferred_bplist(spa, defer_bpl, tx, txg); 5259 5260 /* 5261 * Iterate to convergence. 
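 *
 * Each pass may dirty new blocks (e.g. while syncing the config or
 * the DDT), so the loop below repeats until the MOS is no longer
 * dirty in this txg. Note that passes beyond SYNC_PASS_DEFERRED_FREE
 * enqueue frees onto the deferred bplist rather than issuing them,
 * which helps the iteration converge.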
5262 */ 5263 do { 5264 int pass = ++spa->spa_sync_pass; 5265 5266 spa_sync_config_object(spa, tx); 5267 spa_sync_aux_dev(spa, &spa->spa_spares, tx, 5268 ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); 5269 spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, 5270 ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); 5271 spa_errlog_sync(spa, txg); 5272 dsl_pool_sync(dp, txg); 5273 5274 if (pass <= SYNC_PASS_DEFERRED_FREE) { 5275 zio_t *zio = zio_root(spa, NULL, NULL, 0); 5276 bplist_sync(free_bpl, spa_sync_free, zio, tx); 5277 VERIFY(zio_wait(zio) == 0); 5278 } else { 5279 bplist_sync(free_bpl, bplist_enqueue_cb, defer_bpl, tx); 5280 } 5281 5282 ddt_sync(spa, txg); 5283 5284 mutex_enter(&spa->spa_scrub_lock); 5285 while (spa->spa_scrub_inflight > 0) 5286 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 5287 mutex_exit(&spa->spa_scrub_lock); 5288 5289 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) 5290 vdev_sync(vd, txg); 5291 5292 } while (dmu_objset_is_dirty(mos, txg)); 5293 5294 ASSERT(free_bpl->bpl_queue == NULL); 5295 5296 bplist_close(defer_bpl); 5297 5298 /* 5299 * Rewrite the vdev configuration (which includes the uberblock) 5300 * to commit the transaction group. 5301 * 5302 * If there are no dirty vdevs, we sync the uberblock to a few 5303 * random top-level vdevs that are known to be visible in the 5304 * config cache (see spa_vdev_add() for a complete description). 5305 * If there *are* dirty vdevs, sync the uberblock to all vdevs. 5306 */ 5307 for (;;) { 5308 /* 5309 * We hold SCL_STATE to prevent vdev open/close/etc. 5310 * while we're attempting to write the vdev labels. 5311 */ 5312 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 5313 5314 if (list_is_empty(&spa->spa_config_dirty_list)) { 5315 vdev_t *svd[SPA_DVAS_PER_BP]; 5316 int svdcount = 0; 5317 int children = rvd->vdev_children; 5318 int c0 = spa_get_random(children); 5319 5320 for (int c = 0; c < children; c++) { 5321 vd = rvd->vdev_child[(c0 + c) % children]; 5322 if (vd->vdev_ms_array == 0 || vd->vdev_islog) 5323 continue; 5324 svd[svdcount++] = vd; 5325 if (svdcount == SPA_DVAS_PER_BP) 5326 break; 5327 } 5328 error = vdev_config_sync(svd, svdcount, txg, B_FALSE); 5329 if (error != 0) 5330 error = vdev_config_sync(svd, svdcount, txg, 5331 B_TRUE); 5332 } else { 5333 error = vdev_config_sync(rvd->vdev_child, 5334 rvd->vdev_children, txg, B_FALSE); 5335 if (error != 0) 5336 error = vdev_config_sync(rvd->vdev_child, 5337 rvd->vdev_children, txg, B_TRUE); 5338 } 5339 5340 spa_config_exit(spa, SCL_STATE, FTAG); 5341 5342 if (error == 0) 5343 break; 5344 zio_suspend(spa, NULL); 5345 zio_resume_wait(spa); 5346 } 5347 dmu_tx_commit(tx); 5348 5349 /* 5350 * Clear the dirty config list. 5351 */ 5352 while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) 5353 vdev_config_clean(vd); 5354 5355 /* 5356 * Now that the new config has synced transactionally, 5357 * let it become visible to the config cache. 5358 */ 5359 if (spa->spa_config_syncing != NULL) { 5360 spa_config_set(spa, spa->spa_config_syncing); 5361 spa->spa_config_txg = txg; 5362 spa->spa_config_syncing = NULL; 5363 } 5364 5365 spa->spa_ubsync = spa->spa_uberblock; 5366 5367 dsl_pool_sync_done(dp, txg); 5368 5369 /* 5370 * Update usable space statistics. 5371 */ 5372 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 5373 vdev_sync_done(vd, txg); 5374 5375 spa_update_dspace(spa); 5376 5377 /* 5378 * It had better be the case that we didn't dirty anything 5379 * since vdev_config_sync(). 
5380 */ 5381 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 5382 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 5383 ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 5384 ASSERT(defer_bpl->bpl_queue == NULL); 5385 ASSERT(free_bpl->bpl_queue == NULL); 5386 5387 spa->spa_sync_pass = 0; 5388 5389 spa_config_exit(spa, SCL_CONFIG, FTAG); 5390 5391 spa_handle_ignored_writes(spa); 5392 5393 /* 5394 * If any async tasks have been requested, kick them off. 5395 */ 5396 spa_async_dispatch(spa); 5397} 5398 5399/* 5400 * Sync all pools. We don't want to hold the namespace lock across these 5401 * operations, so we take a reference on the spa_t and drop the lock during the 5402 * sync. 5403 */ 5404void 5405spa_sync_allpools(void) 5406{ 5407 spa_t *spa = NULL; 5408 mutex_enter(&spa_namespace_lock); 5409 while ((spa = spa_next(spa)) != NULL) { 5410 if (spa_state(spa) != POOL_STATE_ACTIVE || spa_suspended(spa)) 5411 continue; 5412 spa_open_ref(spa, FTAG); 5413 mutex_exit(&spa_namespace_lock); 5414 txg_wait_synced(spa_get_dsl(spa), 0); 5415 mutex_enter(&spa_namespace_lock); 5416 spa_close(spa, FTAG); 5417 } 5418 mutex_exit(&spa_namespace_lock); 5419} 5420 5421/* 5422 * ========================================================================== 5423 * Miscellaneous routines 5424 * ========================================================================== 5425 */ 5426 5427/* 5428 * Remove all pools in the system. 5429 */ 5430void 5431spa_evict_all(void) 5432{ 5433 spa_t *spa; 5434 5435 /* 5436 * Remove all cached state. All pools should be closed now, 5437 * so every spa in the AVL tree should be unreferenced. 5438 */ 5439 mutex_enter(&spa_namespace_lock); 5440 while ((spa = spa_next(NULL)) != NULL) { 5441 /* 5442 * Stop async tasks. The async thread may need to detach 5443 * a device that's been replaced, which requires grabbing 5444 * spa_namespace_lock, so we must drop it here. 5445 */ 5446 spa_open_ref(spa, FTAG); 5447 mutex_exit(&spa_namespace_lock); 5448 spa_async_suspend(spa); 5449 mutex_enter(&spa_namespace_lock); 5450 spa_close(spa, FTAG); 5451 5452 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 5453 spa_unload(spa); 5454 spa_deactivate(spa); 5455 } 5456 spa_remove(spa); 5457 } 5458 mutex_exit(&spa_namespace_lock); 5459} 5460 5461vdev_t * 5462spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) 5463{ 5464 vdev_t *vd; 5465 int i; 5466 5467 if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) 5468 return (vd); 5469 5470 if (aux) { 5471 for (i = 0; i < spa->spa_l2cache.sav_count; i++) { 5472 vd = spa->spa_l2cache.sav_vdevs[i]; 5473 if (vd->vdev_guid == guid) 5474 return (vd); 5475 } 5476 5477 for (i = 0; i < spa->spa_spares.sav_count; i++) { 5478 vd = spa->spa_spares.sav_vdevs[i]; 5479 if (vd->vdev_guid == guid) 5480 return (vd); 5481 } 5482 } 5483 5484 return (NULL); 5485} 5486 5487void 5488spa_upgrade(spa_t *spa, uint64_t version) 5489{ 5490 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5491 5492 /* 5493 * This should only be called for a non-faulted pool, and since a 5494 * future version would result in an unopenable pool, this shouldn't be 5495 * possible. 
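5495 * possible.
 *
 * A typical call upgrades the pool to the current software version
 * (sketch):
 *
 *     spa_upgrade(spa, SPA_VERSION);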
5496 */
5497 ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION);
5498 ASSERT(version >= spa->spa_uberblock.ub_version);
5499
5500 spa->spa_uberblock.ub_version = version;
5501 vdev_config_dirty(spa->spa_root_vdev);
5502
5503 spa_config_exit(spa, SCL_ALL, FTAG);
5504
5505 txg_wait_synced(spa_get_dsl(spa), 0);
5506}
5507
5508boolean_t
5509spa_has_spare(spa_t *spa, uint64_t guid)
5510{
5511 int i;
5512 uint64_t spareguid;
5513 spa_aux_vdev_t *sav = &spa->spa_spares;
5514
5515 for (i = 0; i < sav->sav_count; i++)
5516 if (sav->sav_vdevs[i]->vdev_guid == guid)
5517 return (B_TRUE);
5518
5519 for (i = 0; i < sav->sav_npending; i++) {
5520 if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
5521 &spareguid) == 0 && spareguid == guid)
5522 return (B_TRUE);
5523 }
5524
5525 return (B_FALSE);
5526}
5527
5528/*
5529 * Check if a pool has an active shared spare device.
5530 * Note: reference count of an active spare is 2, as a spare and as a replacement.
5531 */
5532static boolean_t
5533spa_has_active_shared_spare(spa_t *spa)
5534{
5535 int i, refcnt;
5536 uint64_t pool;
5537 spa_aux_vdev_t *sav = &spa->spa_spares;
5538
5539 for (i = 0; i < sav->sav_count; i++) {
5540 if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool,
5541 &refcnt) && pool != 0ULL && pool == spa_guid(spa) &&
5542 refcnt > 2)
5543 return (B_TRUE);
5544 }
5545
5546 return (B_FALSE);
5547}
5548
5549/*
5550 * Post a sysevent corresponding to the given event. The 'name' must be one of
5551 * the event definitions in sys/sysevent/eventdefs.h. The payload will be
5552 * filled in from the spa and (optionally) the vdev. This doesn't do anything
5553 * in the userland libzpool, as we don't want consumers to misinterpret ztest
5554 * or zdb as real changes.
5555 */
5556void
5557spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
5558{
5559#ifndef __NetBSD__
5560#ifdef _KERNEL
5561 sysevent_t *ev;
5562 sysevent_attr_list_t *attr = NULL;
5563 sysevent_value_t value;
5564 sysevent_id_t eid;
5565
5566 ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
5567 SE_SLEEP);
5568
5569 value.value_type = SE_DATA_TYPE_STRING;
5570 value.value.sv_string = spa_name(spa);
5571 if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
5572 goto done;
5573
5574 value.value_type = SE_DATA_TYPE_UINT64;
5575 value.value.sv_uint64 = spa_guid(spa);
5576 if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
5577 goto done;
5578
5579 if (vd) {
5580 value.value_type = SE_DATA_TYPE_UINT64;
5581 value.value.sv_uint64 = vd->vdev_guid;
5582 if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
5583 SE_SLEEP) != 0)
5584 goto done;
5585
5586 if (vd->vdev_path) {
5587 value.value_type = SE_DATA_TYPE_STRING;
5588 value.value.sv_string = vd->vdev_path;
5589 if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
5590 &value, SE_SLEEP) != 0)
5591 goto done;
5592 }
5593 }
5594
5595 if (sysevent_attach_attributes(ev, attr) != 0)
5596 goto done;
5597 attr = NULL;
5598
5599 (void) log_sysevent(ev, SE_SLEEP, &eid);
5600
5601done:
5602 if (attr)
5603 sysevent_free_attr(attr);
5604 sysevent_free(ev);
5605#endif
5606#endif /* __NetBSD__ */
5607}
5608
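/*
 * Usage sketch for spa_event_notify(), mirroring the call made from
 * spa_vdev_detach() above:
 *
 *     spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
 */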