1/*- 2 * Copyright (c) 2007 Doug Rabson 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 */ 26 27#include <sys/cdefs.h> 28__FBSDID("$FreeBSD$"); 29 30/* 31 * Stand-alone ZFS file reader. 32 */ 33 34#include <sys/endian.h> 35#include <sys/stat.h> 36#include <sys/stdint.h> 37#include <sys/list.h> 38#include <machine/_inttypes.h> 39 40#include "zfsimpl.h" 41#include "zfssubr.c" 42 43 44struct zfsmount { 45 const spa_t *spa; 46 objset_phys_t objset; 47 uint64_t rootobj; 48}; 49static struct zfsmount zfsmount __unused; 50 51/* 52 * The indirect_child_t represents the vdev that we will read from, when we 53 * need to read all copies of the data (e.g. for scrub or reconstruction). 
54 * For plain (non-mirror) top-level vdevs (i.e. is_vdev is not a mirror), 55 * ic_vdev is the same as is_vdev. However, for mirror top-level vdevs, 56 * ic_vdev is a child of the mirror. 57 */ 58typedef struct indirect_child { 59 void *ic_data; 60 vdev_t *ic_vdev; 61} indirect_child_t; 62 63/* 64 * The indirect_split_t represents one mapped segment of an i/o to the 65 * indirect vdev. For non-split (contiguously-mapped) blocks, there will be 66 * only one indirect_split_t, with is_split_offset==0 and is_size==io_size. 67 * For split blocks, there will be several of these. 68 */ 69typedef struct indirect_split { 70 list_node_t is_node; /* link on iv_splits */ 71 72 /* 73 * is_split_offset is the offset into the i/o. 74 * This is the sum of the previous splits' is_size's. 75 */ 76 uint64_t is_split_offset; 77 78 vdev_t *is_vdev; /* top-level vdev */ 79 uint64_t is_target_offset; /* offset on is_vdev */ 80 uint64_t is_size; 81 int is_children; /* number of entries in is_child[] */ 82 83 /* 84 * is_good_child is the child that we are currently using to 85 * attempt reconstruction. 86 */ 87 int is_good_child; 88 89 indirect_child_t is_child[1]; /* variable-length */ 90} indirect_split_t; 91 92/* 93 * The indirect_vsd_t is associated with each i/o to the indirect vdev. 94 * It is the "Vdev-Specific Data" in the zio_t's io_vsd. 95 */ 96typedef struct indirect_vsd { 97 boolean_t iv_split_block; 98 boolean_t iv_reconstruct; 99 100 list_t iv_splits; /* list of indirect_split_t's */ 101} indirect_vsd_t; 102 103/* 104 * List of all vdevs, chained through v_alllink. 
105 */ 106static vdev_list_t zfs_vdevs; 107 108 /* 109 * List of ZFS features supported for read 110 */ 111static const char *features_for_read[] = { 112 "org.illumos:lz4_compress", 113 "com.delphix:hole_birth", 114 "com.delphix:extensible_dataset", 115 "com.delphix:embedded_data", 116 "org.open-zfs:large_blocks", 117 "org.illumos:sha512", 118 "org.illumos:skein", 119 "org.zfsonlinux:large_dnode", 120 "com.joyent:multi_vdev_crash_dump", 121 "com.delphix:spacemap_histogram", 122 "com.delphix:zpool_checkpoint", 123 "com.delphix:spacemap_v2", 124 "com.datto:encryption", 125 "org.zfsonlinux:allocation_classes", 126 "com.datto:resilver_defer", 127 "com.delphix:device_removal", 128 "com.delphix:obsolete_counts", 129 "com.intel:allocation_classes", 130 "org.freebsd:zstd_compress", 131 "com.datto:encryption", 132 NULL 133}; 134 135/* 136 * List of all pools, chained through spa_link. 137 */ 138static spa_list_t zfs_pools; 139 140static const dnode_phys_t *dnode_cache_obj; 141static uint64_t dnode_cache_bn; 142static char *dnode_cache_buf; 143static char *zap_scratch; 144static char *zfs_temp_buf, *zfs_temp_end, *zfs_temp_ptr; 145 146#define TEMP_SIZE (1024 * 1024) 147 148static int zio_read(const spa_t *spa, const blkptr_t *bp, void *buf); 149static int zfs_get_root(const spa_t *spa, uint64_t *objid); 150static int zfs_rlookup(const spa_t *spa, uint64_t objnum, char *result); 151static int zap_lookup(const spa_t *spa, const dnode_phys_t *dnode, 152 const char *name, uint64_t integer_size, uint64_t num_integers, 153 void *value); 154static int objset_get_dnode(const spa_t *, const objset_phys_t *, uint64_t, 155 dnode_phys_t *); 156static int dnode_read(const spa_t *, const dnode_phys_t *, off_t, void *, 157 size_t); 158static int vdev_indirect_read(vdev_t *, const blkptr_t *, void *, off_t, 159 size_t); 160static int vdev_mirror_read(vdev_t *, const blkptr_t *, void *, off_t, size_t); 161vdev_indirect_mapping_t *vdev_indirect_mapping_open(spa_t *, objset_phys_t *, 162 
uint64_t); 163vdev_indirect_mapping_entry_phys_t * 164 vdev_indirect_mapping_duplicate_adjacent_entries(vdev_t *, uint64_t, 165 uint64_t, uint64_t *); 166 167static void 168zfs_init(void) 169{ 170 STAILQ_INIT(&zfs_vdevs); 171 STAILQ_INIT(&zfs_pools); 172 173 zfs_temp_buf = malloc(TEMP_SIZE); 174 zfs_temp_end = zfs_temp_buf + TEMP_SIZE; 175 zfs_temp_ptr = zfs_temp_buf; 176 dnode_cache_buf = malloc(SPA_MAXBLOCKSIZE); 177 zap_scratch = malloc(SPA_MAXBLOCKSIZE); 178 179 zfs_init_crc(); 180} 181 182static void * 183zfs_alloc(size_t size) 184{ 185 char *ptr; 186 187 if (zfs_temp_ptr + size > zfs_temp_end) { 188 panic("ZFS: out of temporary buffer space"); 189 } 190 ptr = zfs_temp_ptr; 191 zfs_temp_ptr += size; 192 193 return (ptr); 194} 195 196static void 197zfs_free(void *ptr, size_t size) 198{ 199 200 zfs_temp_ptr -= size; 201 if (zfs_temp_ptr != ptr) { 202 panic("ZFS: zfs_alloc()/zfs_free() mismatch"); 203 } 204} 205 206static int 207xdr_int(const unsigned char **xdr, int *ip) 208{ 209 *ip = be32dec(*xdr); 210 (*xdr) += 4; 211 return (0); 212} 213 214static int 215xdr_u_int(const unsigned char **xdr, u_int *ip) 216{ 217 *ip = be32dec(*xdr); 218 (*xdr) += 4; 219 return (0); 220} 221 222static int 223xdr_uint64_t(const unsigned char **xdr, uint64_t *lp) 224{ 225 u_int hi, lo; 226 227 xdr_u_int(xdr, &hi); 228 xdr_u_int(xdr, &lo); 229 *lp = (((uint64_t) hi) << 32) | lo; 230 return (0); 231} 232 233static int 234nvlist_find(const unsigned char *nvlist, const char *name, int type, 235 int *elementsp, void *valuep) 236{ 237 const unsigned char *p, *pair; 238 int junk; 239 int encoded_size, decoded_size; 240 241 p = nvlist; 242 xdr_int(&p, &junk); 243 xdr_int(&p, &junk); 244 245 pair = p; 246 xdr_int(&p, &encoded_size); 247 xdr_int(&p, &decoded_size); 248 while (encoded_size && decoded_size) { 249 int namelen, pairtype, elements; 250 const char *pairname; 251 252 xdr_int(&p, &namelen); 253 pairname = (const char *)p; 254 p += roundup(namelen, 4); 255 xdr_int(&p, &pairtype); 
256 257 if (!memcmp(name, pairname, namelen) && type == pairtype) { 258 xdr_int(&p, &elements); 259 if (elementsp) 260 *elementsp = elements; 261 if (type == DATA_TYPE_UINT64) { 262 xdr_uint64_t(&p, (uint64_t *) valuep); 263 return (0); 264 } else if (type == DATA_TYPE_STRING) { 265 int len; 266 xdr_int(&p, &len); 267 (*(const char **)valuep) = (const char *)p; 268 return (0); 269 } else if (type == DATA_TYPE_NVLIST || 270 type == DATA_TYPE_NVLIST_ARRAY) { 271 (*(const unsigned char **)valuep) = 272 (const unsigned char *)p; 273 return (0); 274 } else { 275 return (EIO); 276 } 277 } else { 278 /* 279 * Not the pair we are looking for, skip to the next one. 280 */ 281 p = pair + encoded_size; 282 } 283 284 pair = p; 285 xdr_int(&p, &encoded_size); 286 xdr_int(&p, &decoded_size); 287 } 288 289 return (EIO); 290} 291 292static int 293nvlist_check_features_for_read(const unsigned char *nvlist) 294{ 295 const unsigned char *p, *pair; 296 int junk; 297 int encoded_size, decoded_size; 298 int rc; 299 300 rc = 0; 301 302 p = nvlist; 303 xdr_int(&p, &junk); 304 xdr_int(&p, &junk); 305 306 pair = p; 307 xdr_int(&p, &encoded_size); 308 xdr_int(&p, &decoded_size); 309 while (encoded_size && decoded_size) { 310 int namelen, pairtype; 311 const char *pairname; 312 int i, found; 313 314 found = 0; 315 316 xdr_int(&p, &namelen); 317 pairname = (const char *)p; 318 p += roundup(namelen, 4); 319 xdr_int(&p, &pairtype); 320 321 for (i = 0; features_for_read[i] != NULL; i++) { 322 if (!memcmp(pairname, features_for_read[i], namelen)) { 323 found = 1; 324 break; 325 } 326 } 327 328 if (!found) { 329 printf("ZFS: unsupported feature: %s\n", pairname); 330 rc = EIO; 331 } 332 333 p = pair + encoded_size; 334 335 pair = p; 336 xdr_int(&p, &encoded_size); 337 xdr_int(&p, &decoded_size); 338 } 339 340 return (rc); 341} 342 343/* 344 * Return the next nvlist in an nvlist array. 
345 */ 346static const unsigned char * 347nvlist_next(const unsigned char *nvlist) 348{ 349 const unsigned char *p, *pair; 350 int junk; 351 int encoded_size, decoded_size; 352 353 p = nvlist; 354 xdr_int(&p, &junk); 355 xdr_int(&p, &junk); 356 357 pair = p; 358 xdr_int(&p, &encoded_size); 359 xdr_int(&p, &decoded_size); 360 while (encoded_size && decoded_size) { 361 p = pair + encoded_size; 362 363 pair = p; 364 xdr_int(&p, &encoded_size); 365 xdr_int(&p, &decoded_size); 366 } 367 368 return p; 369} 370 371#ifdef TEST 372 373static const unsigned char * 374nvlist_print(const unsigned char *nvlist, unsigned int indent) 375{ 376 static const char* typenames[] = { 377 "DATA_TYPE_UNKNOWN", 378 "DATA_TYPE_BOOLEAN", 379 "DATA_TYPE_BYTE", 380 "DATA_TYPE_INT16", 381 "DATA_TYPE_UINT16", 382 "DATA_TYPE_INT32", 383 "DATA_TYPE_UINT32", 384 "DATA_TYPE_INT64", 385 "DATA_TYPE_UINT64", 386 "DATA_TYPE_STRING", 387 "DATA_TYPE_BYTE_ARRAY", 388 "DATA_TYPE_INT16_ARRAY", 389 "DATA_TYPE_UINT16_ARRAY", 390 "DATA_TYPE_INT32_ARRAY", 391 "DATA_TYPE_UINT32_ARRAY", 392 "DATA_TYPE_INT64_ARRAY", 393 "DATA_TYPE_UINT64_ARRAY", 394 "DATA_TYPE_STRING_ARRAY", 395 "DATA_TYPE_HRTIME", 396 "DATA_TYPE_NVLIST", 397 "DATA_TYPE_NVLIST_ARRAY", 398 "DATA_TYPE_BOOLEAN_VALUE", 399 "DATA_TYPE_INT8", 400 "DATA_TYPE_UINT8", 401 "DATA_TYPE_BOOLEAN_ARRAY", 402 "DATA_TYPE_INT8_ARRAY", 403 "DATA_TYPE_UINT8_ARRAY" 404 }; 405 406 unsigned int i, j; 407 const unsigned char *p, *pair; 408 int junk; 409 int encoded_size, decoded_size; 410 411 p = nvlist; 412 xdr_int(&p, &junk); 413 xdr_int(&p, &junk); 414 415 pair = p; 416 xdr_int(&p, &encoded_size); 417 xdr_int(&p, &decoded_size); 418 while (encoded_size && decoded_size) { 419 int namelen, pairtype, elements; 420 const char *pairname; 421 422 xdr_int(&p, &namelen); 423 pairname = (const char *)p; 424 p += roundup(namelen, 4); 425 xdr_int(&p, &pairtype); 426 427 for (i = 0; i < indent; i++) 428 printf(" "); 429 printf("%s %s", typenames[pairtype], pairname); 430 431 
xdr_int(&p, &elements); 432 switch (pairtype) { 433 case DATA_TYPE_UINT64: { 434 uint64_t val; 435 xdr_uint64_t(&p, &val); 436 printf(" = 0x%jx\n", (uintmax_t)val); 437 break; 438 } 439 440 case DATA_TYPE_STRING: { 441 int len; 442 xdr_int(&p, &len); 443 printf(" = \"%s\"\n", p); 444 break; 445 } 446 447 case DATA_TYPE_NVLIST: 448 printf("\n"); 449 nvlist_print(p, indent + 1); 450 break; 451 452 case DATA_TYPE_NVLIST_ARRAY: 453 for (j = 0; j < elements; j++) { 454 printf("[%d]\n", j); 455 p = nvlist_print(p, indent + 1); 456 if (j != elements - 1) { 457 for (i = 0; i < indent; i++) 458 printf(" "); 459 printf("%s %s", typenames[pairtype], pairname); 460 } 461 } 462 break; 463 464 default: 465 printf("\n"); 466 } 467 468 p = pair + encoded_size; 469 470 pair = p; 471 xdr_int(&p, &encoded_size); 472 xdr_int(&p, &decoded_size); 473 } 474 475 return p; 476} 477 478#endif 479 480static int 481vdev_read_phys(vdev_t *vdev, const blkptr_t *bp, void *buf, 482 off_t offset, size_t size) 483{ 484 size_t psize; 485 int rc; 486 487 if (!vdev->v_phys_read) 488 return (EIO); 489 490 if (bp) { 491 psize = BP_GET_PSIZE(bp); 492 } else { 493 psize = size; 494 } 495 496 /*printf("ZFS: reading %zu bytes at 0x%jx to %p\n", psize, (uintmax_t)offset, buf);*/ 497 rc = vdev->v_phys_read(vdev, vdev->v_read_priv, offset, buf, psize); 498 if (rc == 0) { 499 if (bp != NULL) 500 rc = zio_checksum_verify(vdev->v_spa, bp, buf); 501 } 502 503 return (rc); 504} 505 506typedef struct remap_segment { 507 vdev_t *rs_vd; 508 uint64_t rs_offset; 509 uint64_t rs_asize; 510 uint64_t rs_split_offset; 511 list_node_t rs_node; 512} remap_segment_t; 513 514static remap_segment_t * 515rs_alloc(vdev_t *vd, uint64_t offset, uint64_t asize, uint64_t split_offset) 516{ 517 remap_segment_t *rs = malloc(sizeof (remap_segment_t)); 518 519 if (rs != NULL) { 520 rs->rs_vd = vd; 521 rs->rs_offset = offset; 522 rs->rs_asize = asize; 523 rs->rs_split_offset = split_offset; 524 } 525 526 return (rs); 527} 528 
529vdev_indirect_mapping_t * 530vdev_indirect_mapping_open(spa_t *spa, objset_phys_t *os, 531 uint64_t mapping_object) 532{ 533 vdev_indirect_mapping_t *vim; 534 vdev_indirect_mapping_phys_t *vim_phys; 535 int rc; 536 537 vim = calloc(1, sizeof (*vim)); 538 if (vim == NULL) 539 return (NULL); 540 541 vim->vim_dn = calloc(1, sizeof (*vim->vim_dn)); 542 if (vim->vim_dn == NULL) { 543 free(vim); 544 return (NULL); 545 } 546 547 rc = objset_get_dnode(spa, os, mapping_object, vim->vim_dn); 548 if (rc != 0) { 549 free(vim->vim_dn); 550 free(vim); 551 return (NULL); 552 } 553 554 vim->vim_spa = spa; 555 vim->vim_phys = malloc(sizeof (*vim->vim_phys)); 556 if (vim->vim_phys == NULL) { 557 free(vim->vim_dn); 558 free(vim); 559 return (NULL); 560 } 561 562 vim_phys = (vdev_indirect_mapping_phys_t *)DN_BONUS(vim->vim_dn); 563 *vim->vim_phys = *vim_phys; 564 565 vim->vim_objset = os; 566 vim->vim_object = mapping_object; 567 vim->vim_entries = NULL; 568 569 vim->vim_havecounts = 570 (vim->vim_dn->dn_bonuslen > VDEV_INDIRECT_MAPPING_SIZE_V0); 571 572 return (vim); 573} 574 575/* 576 * Compare an offset with an indirect mapping entry; there are three 577 * possible scenarios: 578 * 579 * 1. The offset is "less than" the mapping entry; meaning the 580 * offset is less than the source offset of the mapping entry. In 581 * this case, there is no overlap between the offset and the 582 * mapping entry and -1 will be returned. 583 * 584 * 2. The offset is "greater than" the mapping entry; meaning the 585 * offset is greater than the mapping entry's source offset plus 586 * the entry's size. In this case, there is no overlap between 587 * the offset and the mapping entry and 1 will be returned. 588 * 589 * NOTE: If the offset is actually equal to the entry's offset 590 * plus size, this is considered to be "greater" than the entry, 591 * and this case applies (i.e. 1 will be returned). 
Thus, the 592 * entry's "range" can be considered to be inclusive at its 593 * start, but exclusive at its end: e.g. [src, src + size). 594 * 595 * 3. The last case to consider is if the offset actually falls 596 * within the mapping entry's range. If this is the case, the 597 * offset is considered to be "equal to" the mapping entry and 598 * 0 will be returned. 599 * 600 * NOTE: If the offset is equal to the entry's source offset, 601 * this case applies and 0 will be returned. If the offset is 602 * equal to the entry's source plus its size, this case does 603 * *not* apply (see "NOTE" above for scenario 2), and 1 will be 604 * returned. 605 */ 606static int 607dva_mapping_overlap_compare(const void *v_key, const void *v_array_elem) 608{ 609 const uint64_t *key = v_key; 610 const vdev_indirect_mapping_entry_phys_t *array_elem = 611 v_array_elem; 612 uint64_t src_offset = DVA_MAPPING_GET_SRC_OFFSET(array_elem); 613 614 if (*key < src_offset) { 615 return (-1); 616 } else if (*key < src_offset + DVA_GET_ASIZE(&array_elem->vimep_dst)) { 617 return (0); 618 } else { 619 return (1); 620 } 621} 622 623/* 624 * Return array entry. 
625 */ 626static vdev_indirect_mapping_entry_phys_t * 627vdev_indirect_mapping_entry(vdev_indirect_mapping_t *vim, uint64_t index) 628{ 629 uint64_t size; 630 off_t offset = 0; 631 int rc; 632 633 if (vim->vim_phys->vimp_num_entries == 0) 634 return (NULL); 635 636 if (vim->vim_entries == NULL) { 637 uint64_t bsize; 638 639 bsize = vim->vim_dn->dn_datablkszsec << SPA_MINBLOCKSHIFT; 640 size = vim->vim_phys->vimp_num_entries * 641 sizeof (*vim->vim_entries); 642 if (size > bsize) { 643 size = bsize / sizeof (*vim->vim_entries); 644 size *= sizeof (*vim->vim_entries); 645 } 646 vim->vim_entries = malloc(size); 647 if (vim->vim_entries == NULL) 648 return (NULL); 649 vim->vim_num_entries = size / sizeof (*vim->vim_entries); 650 offset = index * sizeof (*vim->vim_entries); 651 } 652 653 /* We have data in vim_entries */ 654 if (offset == 0) { 655 if (index >= vim->vim_entry_offset && 656 index <= vim->vim_entry_offset + vim->vim_num_entries) { 657 index -= vim->vim_entry_offset; 658 return (&vim->vim_entries[index]); 659 } 660 offset = index * sizeof (*vim->vim_entries); 661 } 662 663 vim->vim_entry_offset = index; 664 size = vim->vim_num_entries * sizeof (*vim->vim_entries); 665 rc = dnode_read(vim->vim_spa, vim->vim_dn, offset, vim->vim_entries, 666 size); 667 if (rc != 0) { 668 /* Read error, invalidate vim_entries. */ 669 free(vim->vim_entries); 670 vim->vim_entries = NULL; 671 return (NULL); 672 } 673 index -= vim->vim_entry_offset; 674 return (&vim->vim_entries[index]); 675} 676 677/* 678 * Returns the mapping entry for the given offset. 679 * 680 * It's possible that the given offset will not be in the mapping table 681 * (i.e. no mapping entries contain this offset), in which case, the 682 * return value value depends on the "next_if_missing" parameter. 683 * 684 * If the offset is not found in the table and "next_if_missing" is 685 * B_FALSE, then NULL will always be returned. 
The behavior is intended 686 * to allow consumers to get the entry corresponding to the offset 687 * parameter, iff the offset overlaps with an entry in the table. 688 * 689 * If the offset is not found in the table and "next_if_missing" is 690 * B_TRUE, then the entry nearest to the given offset will be returned, 691 * such that the entry's source offset is greater than the offset 692 * passed in (i.e. the "next" mapping entry in the table is returned, if 693 * the offset is missing from the table). If there are no entries whose 694 * source offset is greater than the passed in offset, NULL is returned. 695 */ 696static vdev_indirect_mapping_entry_phys_t * 697vdev_indirect_mapping_entry_for_offset(vdev_indirect_mapping_t *vim, 698 uint64_t offset) 699{ 700 ASSERT(vim->vim_phys->vimp_num_entries > 0); 701 702 vdev_indirect_mapping_entry_phys_t *entry; 703 704 uint64_t last = vim->vim_phys->vimp_num_entries - 1; 705 uint64_t base = 0; 706 707 /* 708 * We don't define these inside of the while loop because we use 709 * their value in the case that offset isn't in the mapping. 710 */ 711 uint64_t mid; 712 int result; 713 714 while (last >= base) { 715 mid = base + ((last - base) >> 1); 716 717 entry = vdev_indirect_mapping_entry(vim, mid); 718 if (entry == NULL) 719 break; 720 result = dva_mapping_overlap_compare(&offset, entry); 721 722 if (result == 0) { 723 break; 724 } else if (result < 0) { 725 last = mid - 1; 726 } else { 727 base = mid + 1; 728 } 729 } 730 return (entry); 731} 732 733/* 734 * Given an indirect vdev and an extent on that vdev, it duplicates the 735 * physical entries of the indirect mapping that correspond to the extent 736 * to a new array and returns a pointer to it. In addition, copied_entries 737 * is populated with the number of mapping entries that were duplicated. 738 * 739 * Finally, since we are doing an allocation, it is up to the caller to 740 * free the array allocated in this function. 
741 */ 742vdev_indirect_mapping_entry_phys_t * 743vdev_indirect_mapping_duplicate_adjacent_entries(vdev_t *vd, uint64_t offset, 744 uint64_t asize, uint64_t *copied_entries) 745{ 746 vdev_indirect_mapping_entry_phys_t *duplicate_mappings = NULL; 747 vdev_indirect_mapping_t *vim = vd->v_mapping; 748 uint64_t entries = 0; 749 750 vdev_indirect_mapping_entry_phys_t *first_mapping = 751 vdev_indirect_mapping_entry_for_offset(vim, offset); 752 ASSERT3P(first_mapping, !=, NULL); 753 754 vdev_indirect_mapping_entry_phys_t *m = first_mapping; 755 while (asize > 0) { 756 uint64_t size = DVA_GET_ASIZE(&m->vimep_dst); 757 uint64_t inner_offset = offset - DVA_MAPPING_GET_SRC_OFFSET(m); 758 uint64_t inner_size = MIN(asize, size - inner_offset); 759 760 offset += inner_size; 761 asize -= inner_size; 762 entries++; 763 m++; 764 } 765 766 size_t copy_length = entries * sizeof (*first_mapping); 767 duplicate_mappings = malloc(copy_length); 768 if (duplicate_mappings != NULL) 769 bcopy(first_mapping, duplicate_mappings, copy_length); 770 else 771 entries = 0; 772 773 *copied_entries = entries; 774 775 return (duplicate_mappings); 776} 777 778static vdev_t * 779vdev_lookup_top(spa_t *spa, uint64_t vdev) 780{ 781 vdev_t *rvd; 782 vdev_list_t *vlist; 783 784 vlist = &spa->spa_root_vdev->v_children; 785 STAILQ_FOREACH(rvd, vlist, v_childlink) 786 if (rvd->v_id == vdev) 787 break; 788 789 return (rvd); 790} 791 792/* 793 * This is a callback for vdev_indirect_remap() which allocates an 794 * indirect_split_t for each split segment and adds it to iv_splits. 
795 */ 796static void 797vdev_indirect_gather_splits(uint64_t split_offset, vdev_t *vd, uint64_t offset, 798 uint64_t size, void *arg) 799{ 800 int n = 1; 801 zio_t *zio = arg; 802 indirect_vsd_t *iv = zio->io_vsd; 803 804 if (vd->v_read == vdev_indirect_read) 805 return; 806 807 if (vd->v_read == vdev_mirror_read) 808 n = vd->v_nchildren; 809 810 indirect_split_t *is = 811 malloc(offsetof(indirect_split_t, is_child[n])); 812 if (is == NULL) { 813 zio->io_error = ENOMEM; 814 return; 815 } 816 bzero(is, offsetof(indirect_split_t, is_child[n])); 817 818 is->is_children = n; 819 is->is_size = size; 820 is->is_split_offset = split_offset; 821 is->is_target_offset = offset; 822 is->is_vdev = vd; 823 824 /* 825 * Note that we only consider multiple copies of the data for 826 * *mirror* vdevs. We don't for "replacing" or "spare" vdevs, even 827 * though they use the same ops as mirror, because there's only one 828 * "good" copy under the replacing/spare. 829 */ 830 if (vd->v_read == vdev_mirror_read) { 831 int i = 0; 832 vdev_t *kid; 833 834 STAILQ_FOREACH(kid, &vd->v_children, v_childlink) { 835 is->is_child[i++].ic_vdev = kid; 836 } 837 } else { 838 is->is_child[0].ic_vdev = vd; 839 } 840 841 list_insert_tail(&iv->iv_splits, is); 842} 843 844static void 845vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint64_t asize, void *arg) 846{ 847 list_t stack; 848 spa_t *spa = vd->v_spa; 849 zio_t *zio = arg; 850 remap_segment_t *rs; 851 852 list_create(&stack, sizeof (remap_segment_t), 853 offsetof(remap_segment_t, rs_node)); 854 855 rs = rs_alloc(vd, offset, asize, 0); 856 if (rs == NULL) { 857 printf("vdev_indirect_remap: out of memory.\n"); 858 zio->io_error = ENOMEM; 859 } 860 for ( ; rs != NULL; rs = list_remove_head(&stack)) { 861 vdev_t *v = rs->rs_vd; 862 uint64_t num_entries = 0; 863 /* vdev_indirect_mapping_t *vim = v->v_mapping; */ 864 vdev_indirect_mapping_entry_phys_t *mapping = 865 vdev_indirect_mapping_duplicate_adjacent_entries(v, 866 rs->rs_offset, 
rs->rs_asize, &num_entries); 867 868 if (num_entries == 0) 869 zio->io_error = ENOMEM; 870 871 for (uint64_t i = 0; i < num_entries; i++) { 872 vdev_indirect_mapping_entry_phys_t *m = &mapping[i]; 873 uint64_t size = DVA_GET_ASIZE(&m->vimep_dst); 874 uint64_t dst_offset = DVA_GET_OFFSET(&m->vimep_dst); 875 uint64_t dst_vdev = DVA_GET_VDEV(&m->vimep_dst); 876 uint64_t inner_offset = rs->rs_offset - 877 DVA_MAPPING_GET_SRC_OFFSET(m); 878 uint64_t inner_size = 879 MIN(rs->rs_asize, size - inner_offset); 880 vdev_t *dst_v = vdev_lookup_top(spa, dst_vdev); 881 882 if (dst_v->v_read == vdev_indirect_read) { 883 remap_segment_t *o; 884 885 o = rs_alloc(dst_v, dst_offset + inner_offset, 886 inner_size, rs->rs_split_offset); 887 if (o == NULL) { 888 printf("vdev_indirect_remap: " 889 "out of memory.\n"); 890 zio->io_error = ENOMEM; 891 break; 892 } 893 894 list_insert_head(&stack, o); 895 } 896 vdev_indirect_gather_splits(rs->rs_split_offset, dst_v, 897 dst_offset + inner_offset, 898 inner_size, arg); 899 900 /* 901 * vdev_indirect_gather_splits can have memory 902 * allocation error, we can not recover from it. 
903 */ 904 if (zio->io_error != 0) 905 break; 906 rs->rs_offset += inner_size; 907 rs->rs_asize -= inner_size; 908 rs->rs_split_offset += inner_size; 909 } 910 911 free(mapping); 912 free(rs); 913 if (zio->io_error != 0) 914 break; 915 } 916 917 list_destroy(&stack); 918} 919 920static void 921vdev_indirect_map_free(zio_t *zio) 922{ 923 indirect_vsd_t *iv = zio->io_vsd; 924 indirect_split_t *is; 925 926 while ((is = list_head(&iv->iv_splits)) != NULL) { 927 for (int c = 0; c < is->is_children; c++) { 928 indirect_child_t *ic = &is->is_child[c]; 929 free(ic->ic_data); 930 } 931 list_remove(&iv->iv_splits, is); 932 free(is); 933 } 934 free(iv); 935} 936 937static int 938vdev_indirect_read(vdev_t *vdev, const blkptr_t *bp, void *buf, 939 off_t offset, size_t bytes) 940{ 941 zio_t zio; 942 spa_t *spa = vdev->v_spa; 943 indirect_vsd_t *iv; 944 indirect_split_t *first; 945 int rc = EIO; 946 947 iv = calloc(1, sizeof(*iv)); 948 if (iv == NULL) 949 return (ENOMEM); 950 951 list_create(&iv->iv_splits, 952 sizeof (indirect_split_t), offsetof(indirect_split_t, is_node)); 953 954 bzero(&zio, sizeof(zio)); 955 zio.io_spa = spa; 956 zio.io_bp = (blkptr_t *)bp; 957 zio.io_data = buf; 958 zio.io_size = bytes; 959 zio.io_offset = offset; 960 zio.io_vd = vdev; 961 zio.io_vsd = iv; 962 963 if (vdev->v_mapping == NULL) { 964 vdev_indirect_config_t *vic; 965 966 vic = &vdev->vdev_indirect_config; 967 vdev->v_mapping = vdev_indirect_mapping_open(spa, 968 &spa->spa_mos, vic->vic_mapping_object); 969 } 970 971 vdev_indirect_remap(vdev, offset, bytes, &zio); 972 if (zio.io_error != 0) 973 return (zio.io_error); 974 975 first = list_head(&iv->iv_splits); 976 if (first->is_size == zio.io_size) { 977 /* 978 * This is not a split block; we are pointing to the entire 979 * data, which will checksum the same as the original data. 980 * Pass the BP down so that the child i/o can verify the 981 * checksum, and try a different location if available 982 * (e.g. on a mirror). 
983 * 984 * While this special case could be handled the same as the 985 * general (split block) case, doing it this way ensures 986 * that the vast majority of blocks on indirect vdevs 987 * (which are not split) are handled identically to blocks 988 * on non-indirect vdevs. This allows us to be less strict 989 * about performance in the general (but rare) case. 990 */ 991 rc = first->is_vdev->v_read(first->is_vdev, zio.io_bp, 992 zio.io_data, first->is_target_offset, bytes); 993 } else { 994 iv->iv_split_block = B_TRUE; 995 /* 996 * Read one copy of each split segment, from the 997 * top-level vdev. Since we don't know the 998 * checksum of each split individually, the child 999 * zio can't ensure that we get the right data. 1000 * E.g. if it's a mirror, it will just read from a 1001 * random (healthy) leaf vdev. We have to verify 1002 * the checksum in vdev_indirect_io_done(). 1003 */ 1004 for (indirect_split_t *is = list_head(&iv->iv_splits); 1005 is != NULL; is = list_next(&iv->iv_splits, is)) { 1006 char *ptr = zio.io_data; 1007 1008 rc = is->is_vdev->v_read(is->is_vdev, zio.io_bp, 1009 ptr + is->is_split_offset, is->is_target_offset, 1010 is->is_size); 1011 } 1012 if (zio_checksum_verify(spa, zio.io_bp, zio.io_data)) 1013 rc = ECKSUM; 1014 else 1015 rc = 0; 1016 } 1017 1018 vdev_indirect_map_free(&zio); 1019 if (rc == 0) 1020 rc = zio.io_error; 1021 1022 return (rc); 1023} 1024 1025static int 1026vdev_disk_read(vdev_t *vdev, const blkptr_t *bp, void *buf, 1027 off_t offset, size_t bytes) 1028{ 1029 1030 return (vdev_read_phys(vdev, bp, buf, 1031 offset + VDEV_LABEL_START_SIZE, bytes)); 1032} 1033 1034 1035static int 1036vdev_mirror_read(vdev_t *vdev, const blkptr_t *bp, void *buf, 1037 off_t offset, size_t bytes) 1038{ 1039 vdev_t *kid; 1040 int rc; 1041 1042 rc = EIO; 1043 STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) { 1044 if (kid->v_state != VDEV_STATE_HEALTHY) 1045 continue; 1046 rc = kid->v_read(kid, bp, buf, offset, bytes); 1047 if (!rc) 1048 
return (0); 1049 } 1050 1051 return (rc); 1052} 1053 1054static int 1055vdev_replacing_read(vdev_t *vdev, const blkptr_t *bp, void *buf, 1056 off_t offset, size_t bytes) 1057{ 1058 vdev_t *kid; 1059 1060 /* 1061 * Here we should have two kids: 1062 * First one which is the one we are replacing and we can trust 1063 * only this one to have valid data, but it might not be present. 1064 * Second one is that one we are replacing with. It is most likely 1065 * healthy, but we can't trust it has needed data, so we won't use it. 1066 */ 1067 kid = STAILQ_FIRST(&vdev->v_children); 1068 if (kid == NULL) 1069 return (EIO); 1070 if (kid->v_state != VDEV_STATE_HEALTHY) 1071 return (EIO); 1072 return (kid->v_read(kid, bp, buf, offset, bytes)); 1073} 1074 1075static vdev_t * 1076vdev_find(uint64_t guid) 1077{ 1078 vdev_t *vdev; 1079 1080 STAILQ_FOREACH(vdev, &zfs_vdevs, v_alllink) 1081 if (vdev->v_guid == guid) 1082 return (vdev); 1083 1084 return (0); 1085} 1086 1087static vdev_t * 1088vdev_create(uint64_t guid, vdev_read_t *_read) 1089{ 1090 vdev_t *vdev; 1091 vdev_indirect_config_t *vic; 1092 1093 vdev = calloc(1, sizeof(vdev_t)); 1094 if (vdev != NULL) { 1095 STAILQ_INIT(&vdev->v_children); 1096 vdev->v_guid = guid; 1097 vdev->v_read = _read; 1098 1099 /* 1100 * root vdev has no read function. 1101 * We only point root vdev from spa. 
1102 */ 1103 if (_read != NULL) { 1104 vic = &vdev->vdev_indirect_config; 1105 vic->vic_prev_indirect_vdev = UINT64_MAX; 1106 STAILQ_INSERT_TAIL(&zfs_vdevs, vdev, v_alllink); 1107 } 1108 } 1109 1110 return (vdev); 1111} 1112 1113static void 1114vdev_set_initial_state(vdev_t *vdev, const unsigned char *nvlist) 1115{ 1116 uint64_t is_offline, is_faulted, is_degraded, is_removed, isnt_present; 1117 uint64_t is_log; 1118 1119 is_offline = is_removed = is_faulted = is_degraded = isnt_present = 0; 1120 is_log = 0; 1121 (void) nvlist_find(nvlist, ZPOOL_CONFIG_OFFLINE, DATA_TYPE_UINT64, NULL, 1122 &is_offline); 1123 (void) nvlist_find(nvlist, ZPOOL_CONFIG_REMOVED, DATA_TYPE_UINT64, NULL, 1124 &is_removed); 1125 (void) nvlist_find(nvlist, ZPOOL_CONFIG_FAULTED, DATA_TYPE_UINT64, NULL, 1126 &is_faulted); 1127 (void) nvlist_find(nvlist, ZPOOL_CONFIG_DEGRADED, DATA_TYPE_UINT64, 1128 NULL, &is_degraded); 1129 (void) nvlist_find(nvlist, ZPOOL_CONFIG_NOT_PRESENT, DATA_TYPE_UINT64, 1130 NULL, &isnt_present); 1131 (void) nvlist_find(nvlist, ZPOOL_CONFIG_IS_LOG, DATA_TYPE_UINT64, NULL, 1132 &is_log); 1133 1134 if (is_offline != 0) 1135 vdev->v_state = VDEV_STATE_OFFLINE; 1136 else if (is_removed != 0) 1137 vdev->v_state = VDEV_STATE_REMOVED; 1138 else if (is_faulted != 0) 1139 vdev->v_state = VDEV_STATE_FAULTED; 1140 else if (is_degraded != 0) 1141 vdev->v_state = VDEV_STATE_DEGRADED; 1142 else if (isnt_present != 0) 1143 vdev->v_state = VDEV_STATE_CANT_OPEN; 1144 1145 vdev->v_islog = is_log == 1; 1146} 1147 1148static int 1149vdev_init(uint64_t guid, const unsigned char *nvlist, vdev_t **vdevp) 1150{ 1151 uint64_t id, ashift, asize, nparity; 1152 const char *path; 1153 const char *type; 1154 vdev_t *vdev; 1155 1156 if (nvlist_find(nvlist, ZPOOL_CONFIG_ID, DATA_TYPE_UINT64, NULL, &id) || 1157 nvlist_find(nvlist, ZPOOL_CONFIG_TYPE, DATA_TYPE_STRING, 1158 NULL, &type)) { 1159 return (ENOENT); 1160 } 1161 1162 if (strcmp(type, VDEV_TYPE_MIRROR) 1163 && strcmp(type, VDEV_TYPE_DISK) 
#ifdef ZFS_TEST
	    && strcmp(type, VDEV_TYPE_FILE)
#endif
	    && strcmp(type, VDEV_TYPE_RAIDZ)
	    && strcmp(type, VDEV_TYPE_INDIRECT)
	    && strcmp(type, VDEV_TYPE_REPLACING)) {
		printf("ZFS: can only boot from disk, mirror, raidz1, raidz2 and raidz3 vdevs\n");
		return (EIO);
	}

	/* Pick the read routine matching the vdev type. */
	if (strcmp(type, VDEV_TYPE_MIRROR) == 0)
		vdev = vdev_create(guid, vdev_mirror_read);
	else if (strcmp(type, VDEV_TYPE_RAIDZ) == 0)
		vdev = vdev_create(guid, vdev_raidz_read);
	else if (strcmp(type, VDEV_TYPE_REPLACING) == 0)
		vdev = vdev_create(guid, vdev_replacing_read);
	else if (strcmp(type, VDEV_TYPE_INDIRECT) == 0) {
		vdev_indirect_config_t *vic;

		vdev = vdev_create(guid, vdev_indirect_read);
		if (vdev != NULL) {
			vdev->v_state = VDEV_STATE_HEALTHY;
			vic = &vdev->vdev_indirect_config;

			/*
			 * Lookup failures are tolerated here; missing keys
			 * leave the calloc'ed zero defaults in place
			 * (vic_prev_indirect_vdev was preset to UINT64_MAX
			 * by vdev_create).
			 */
			nvlist_find(nvlist,
			    ZPOOL_CONFIG_INDIRECT_OBJECT,
			    DATA_TYPE_UINT64,
			    NULL, &vic->vic_mapping_object);
			nvlist_find(nvlist,
			    ZPOOL_CONFIG_INDIRECT_BIRTHS,
			    DATA_TYPE_UINT64,
			    NULL, &vic->vic_births_object);
			nvlist_find(nvlist,
			    ZPOOL_CONFIG_PREV_INDIRECT_VDEV,
			    DATA_TYPE_UINT64,
			    NULL, &vic->vic_prev_indirect_vdev);
		}
	} else {
		vdev = vdev_create(guid, vdev_disk_read);
	}

	if (vdev == NULL)
		return (ENOMEM);

	vdev_set_initial_state(vdev, nvlist);
	vdev->v_id = id;
	if (nvlist_find(nvlist, ZPOOL_CONFIG_ASHIFT,
	    DATA_TYPE_UINT64, NULL, &ashift) == 0)
		vdev->v_ashift = ashift;

	if (nvlist_find(nvlist, ZPOOL_CONFIG_ASIZE,
	    DATA_TYPE_UINT64, NULL, &asize) == 0) {
		/* asize excludes the labels; add them back for psize. */
		vdev->v_psize = asize +
		    VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
	}

	if (nvlist_find(nvlist, ZPOOL_CONFIG_NPARITY,
	    DATA_TYPE_UINT64, NULL, &nparity) == 0)
		vdev->v_nparity = nparity;

	if (nvlist_find(nvlist, ZPOOL_CONFIG_PATH,
	    DATA_TYPE_STRING, NULL, &path) == 0) {
		/* Strip a leading "/dev/" for display purposes. */
		if (strncmp(path, "/dev/", 5) == 0)
			path += 5;
		vdev->v_name = strdup(path);
	} else {
		char *name;

		name = NULL;
		if (strcmp(type, "raidz") == 0) {
			/*
			 * NOTE(review): on this error path the vdev has
			 * already been created and linked into zfs_vdevs;
			 * it is not freed here.  Tolerable in boot code,
			 * but worth confirming upstream.
			 */
			if (vdev->v_nparity < 1 ||
			    vdev->v_nparity > 3) {
				printf("ZFS: can only boot from disk, "
				    "mirror, raidz1, raidz2 and raidz3 "
				    "vdevs\n");
				return (EIO);
			}
			/* e.g. "raidz2-3"; asprintf failure leaves NULL. */
			(void) asprintf(&name, "%s%d-%" PRIu64, type,
			    vdev->v_nparity, id);
		} else {
			(void) asprintf(&name, "%s-%" PRIu64, type, id);
		}
		vdev->v_name = name;
	}
	*vdevp = vdev;
	return (0);
}

/*
 * Find slot for vdev. We return either NULL to signal to use
 * STAILQ_INSERT_HEAD, or we return link element to be used with
 * STAILQ_INSERT_AFTER.
 */
static vdev_t *
vdev_find_previous(vdev_t *top_vdev, vdev_t *vdev)
{
	vdev_t *v, *previous;

	if (STAILQ_EMPTY(&top_vdev->v_children))
		return (NULL);

	previous = NULL;
	STAILQ_FOREACH(v, &top_vdev->v_children, v_childlink) {
		if (v->v_id > vdev->v_id)
			return (previous);

		/* Exact id match: caller detects the duplicate. */
		if (v->v_id == vdev->v_id)
			return (v);

		if (v->v_id < vdev->v_id)
			previous = v;
	}
	return (previous);
}

/* Count the entries on a vdev's children list. */
static size_t
vdev_child_count(vdev_t *vdev)
{
	vdev_t *v;
	size_t count;

	count = 0;
	STAILQ_FOREACH(v, &vdev->v_children, v_childlink) {
		count++;
	}
	return (count);
}

/*
 * Insert vdev into top_vdev children list. List is ordered by v_id.
 */
static void
vdev_insert(vdev_t *top_vdev, vdev_t *vdev)
{
	vdev_t *previous;
	size_t count;

	/*
	 * The top level vdev can appear in random order, depending how
	 * the firmware is presenting the disk devices.
	 * However, we will insert vdev to create list ordered by v_id,
	 * so we can use either STAILQ_INSERT_HEAD or STAILQ_INSERT_AFTER
	 * as STAILQ does not have insert before.
	 */
	previous = vdev_find_previous(top_vdev, vdev);

	if (previous == NULL) {
		STAILQ_INSERT_HEAD(&top_vdev->v_children, vdev, v_childlink);
		/* Track the largest child count ever observed. */
		count = vdev_child_count(top_vdev);
		if (top_vdev->v_nchildren < count)
			top_vdev->v_nchildren = count;
		return;
	}

	/* Duplicate id: the vdev is already on the list; do nothing. */
	if (previous->v_id == vdev->v_id)
		return;

	STAILQ_INSERT_AFTER(&top_vdev->v_children, previous, vdev, v_childlink);
	count = vdev_child_count(top_vdev);
	if (top_vdev->v_nchildren < count)
		top_vdev->v_nchildren = count;
}

/*
 * Build (or find) the top-level vdev identified by top_guid from its
 * config nvlist, then create and attach all of its children.
 * Returns 0 on success or an errno from vdev_init()/nvlist_find().
 */
static int
vdev_from_nvlist(spa_t *spa, uint64_t top_guid, const unsigned char *nvlist)
{
	vdev_t *top_vdev, *vdev;
	const unsigned char *kids;
	int rc, nkids;

	/* Get top vdev. */
	top_vdev = vdev_find(top_guid);
	if (top_vdev == NULL) {
		rc = vdev_init(top_guid, nvlist, &top_vdev);
		if (rc != 0)
			return (rc);
		top_vdev->v_spa = spa;
		top_vdev->v_top = top_vdev;
		vdev_insert(spa->spa_root_vdev, top_vdev);
	}

	/* Add children if there are any.
 */
	rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY,
	    &nkids, &kids);
	if (rc == 0) {
		for (int i = 0; i < nkids; i++) {
			uint64_t guid;

			rc = nvlist_find(kids, ZPOOL_CONFIG_GUID,
			    DATA_TYPE_UINT64, NULL, &guid);
			if (rc != 0)
				return (rc);
			rc = vdev_init(guid, kids, &vdev);
			if (rc != 0)
				return (rc);

			vdev->v_spa = spa;
			vdev->v_top = top_vdev;
			vdev_insert(top_vdev, vdev);

			/* Advance to the next element of the nvlist array. */
			kids = nvlist_next(kids);
		}
	} else {
		/* A leaf vdev has no children; that is not an error. */
		rc = 0;
	}

	return (rc);
}

/*
 * Build the vdev tree for this device from an on-disk label nvlist.
 * Returns ENOENT if the label lacks pool/top guid or the vdev tree.
 */
static int
vdev_init_from_label(spa_t *spa, const unsigned char *nvlist)
{
	uint64_t pool_guid, top_guid;
	const unsigned char *vdevs;

	if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64,
	    NULL, &pool_guid) ||
	    nvlist_find(nvlist, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64,
	    NULL, &top_guid) ||
	    nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST,
	    NULL, &vdevs)) {
		printf("ZFS: can't find vdev details\n");
		return (ENOENT);
	}

	return (vdev_from_nvlist(spa, top_guid, vdevs));
}

/*
 * Recursively recompute the state of vdev and all of its descendants
 * from the states of their children.
 */
static void
vdev_set_state(vdev_t *vdev)
{
	vdev_t *kid;
	int good_kids;
	int bad_kids;

	/* Fix the children's state first, bottom-up. */
	STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
		vdev_set_state(kid);
	}

	/*
	 * A mirror or raidz is healthy if all its kids are healthy. A
	 * mirror is degraded if any of its kids is healthy; a raidz
	 * is degraded if at most nparity kids are offline.
	 */
	if (STAILQ_FIRST(&vdev->v_children)) {
		good_kids = 0;
		bad_kids = 0;
		STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
			if (kid->v_state == VDEV_STATE_HEALTHY)
				good_kids++;
			else
				bad_kids++;
		}
		if (bad_kids == 0) {
			vdev->v_state = VDEV_STATE_HEALTHY;
		} else {
			if (vdev->v_read == vdev_mirror_read) {
				/* Mirror survives with any healthy child. */
				if (good_kids) {
					vdev->v_state = VDEV_STATE_DEGRADED;
				} else {
					vdev->v_state = VDEV_STATE_OFFLINE;
				}
			} else if (vdev->v_read == vdev_raidz_read) {
				/* Raidz survives up to nparity failures. */
				if (bad_kids > vdev->v_nparity) {
					vdev->v_state = VDEV_STATE_OFFLINE;
				} else {
					vdev->v_state = VDEV_STATE_DEGRADED;
				}
			}
		}
	}
}

/*
 * Refresh the states of an already-known top-level vdev and its children
 * from a (newer) config nvlist.  Unknown vdevs are silently skipped.
 */
static int
vdev_update_from_nvlist(uint64_t top_guid, const unsigned char *nvlist)
{
	vdev_t *vdev;
	const unsigned char *kids;
	int rc, nkids;

	/* Update top vdev. */
	vdev = vdev_find(top_guid);
	if (vdev != NULL)
		vdev_set_initial_state(vdev, nvlist);

	/* Update children if there are any.
 */
	rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY,
	    &nkids, &kids);
	if (rc == 0) {
		for (int i = 0; i < nkids; i++) {
			uint64_t guid;

			rc = nvlist_find(kids, ZPOOL_CONFIG_GUID,
			    DATA_TYPE_UINT64, NULL, &guid);
			if (rc != 0)
				break;

			vdev = vdev_find(guid);
			if (vdev != NULL)
				vdev_set_initial_state(vdev, kids);

			kids = nvlist_next(kids);
		}
	} else {
		/* Missing children array is fine for a leaf. */
		rc = 0;
	}

	return (rc);
}

/*
 * (Re)build the whole vdev tree for spa from the pool's MOS config
 * nvlist.  Missing top-level vdevs are created; existing ones are
 * updated in place.  Returns EIO on guid mismatch or malformed config.
 */
static int
vdev_init_from_nvlist(spa_t *spa, const unsigned char *nvlist)
{
	uint64_t pool_guid, vdev_children;
	const unsigned char *vdevs, *kids;
	int rc, nkids;

	if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64,
	    NULL, &pool_guid) ||
	    nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_CHILDREN, DATA_TYPE_UINT64,
	    NULL, &vdev_children) ||
	    nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST,
	    NULL, &vdevs)) {
		printf("ZFS: can't find vdev details\n");
		return (ENOENT);
	}

	/* Wrong guid?! */
	if (spa->spa_guid != pool_guid)
		return (EIO);

	spa->spa_root_vdev->v_nchildren = vdev_children;

	rc = nvlist_find(vdevs, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY,
	    &nkids, &kids);

	/*
	 * MOS config has at least one child for root vdev.
	 */
	if (rc != 0)
		return (EIO);

	for (int i = 0; i < nkids; i++) {
		uint64_t guid;
		vdev_t *vdev;

		rc = nvlist_find(kids, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64,
		    NULL, &guid);
		if (rc != 0)
			break;
		vdev = vdev_find(guid);
		/*
		 * Top level vdev is missing, create it.
		 */
		if (vdev == NULL)
			rc = vdev_from_nvlist(spa, guid, kids);
		else
			rc = vdev_update_from_nvlist(guid, kids);
		if (rc != 0)
			break;
		kids = nvlist_next(kids);
	}

	/*
	 * Re-evaluate top-level vdev state.
1529 */ 1530 vdev_set_state(spa->spa_root_vdev); 1531 1532 return (rc); 1533} 1534 1535static spa_t * 1536spa_find_by_guid(uint64_t guid) 1537{ 1538 spa_t *spa; 1539 1540 STAILQ_FOREACH(spa, &zfs_pools, spa_link) 1541 if (spa->spa_guid == guid) 1542 return (spa); 1543 1544 return (0); 1545} 1546 1547static spa_t * 1548spa_find_by_name(const char *name) 1549{ 1550 spa_t *spa; 1551 1552 STAILQ_FOREACH(spa, &zfs_pools, spa_link) 1553 if (!strcmp(spa->spa_name, name)) 1554 return (spa); 1555 1556 return (0); 1557} 1558 1559#ifdef BOOT2 1560static spa_t * 1561spa_get_primary(void) 1562{ 1563 1564 return (STAILQ_FIRST(&zfs_pools)); 1565} 1566 1567static vdev_t * 1568spa_get_primary_vdev(const spa_t *spa) 1569{ 1570 vdev_t *vdev; 1571 vdev_t *kid; 1572 1573 if (spa == NULL) 1574 spa = spa_get_primary(); 1575 if (spa == NULL) 1576 return (NULL); 1577 vdev = spa->spa_root_vdev; 1578 if (vdev == NULL) 1579 return (NULL); 1580 for (kid = STAILQ_FIRST(&vdev->v_children); kid != NULL; 1581 kid = STAILQ_FIRST(&vdev->v_children)) 1582 vdev = kid; 1583 return (vdev); 1584} 1585#endif 1586 1587static spa_t * 1588spa_create(uint64_t guid, const char *name) 1589{ 1590 spa_t *spa; 1591 1592 if ((spa = calloc(1, sizeof(spa_t))) == NULL) 1593 return (NULL); 1594 if ((spa->spa_name = strdup(name)) == NULL) { 1595 free(spa); 1596 return (NULL); 1597 } 1598 spa->spa_guid = guid; 1599 spa->spa_root_vdev = vdev_create(guid, NULL); 1600 if (spa->spa_root_vdev == NULL) { 1601 free(spa->spa_name); 1602 free(spa); 1603 return (NULL); 1604 } 1605 spa->spa_root_vdev->v_name = strdup("root"); 1606 STAILQ_INSERT_TAIL(&zfs_pools, spa, spa_link); 1607 1608 return (spa); 1609} 1610 1611static const char * 1612state_name(vdev_state_t state) 1613{ 1614 static const char* names[] = { 1615 "UNKNOWN", 1616 "CLOSED", 1617 "OFFLINE", 1618 "REMOVED", 1619 "CANT_OPEN", 1620 "FAULTED", 1621 "DEGRADED", 1622 "ONLINE" 1623 }; 1624 return names[state]; 1625} 1626 1627#ifdef BOOT2 1628 1629#define pager_printf 
printf 1630 1631#else 1632 1633static int 1634pager_printf(const char *fmt, ...) 1635{ 1636 char line[80]; 1637 va_list args; 1638 1639 va_start(args, fmt); 1640 vsnprintf(line, sizeof(line), fmt, args); 1641 va_end(args); 1642 return (pager_output(line)); 1643} 1644 1645#endif 1646 1647#define STATUS_FORMAT " %s %s\n" 1648 1649static int 1650print_state(int indent, const char *name, vdev_state_t state) 1651{ 1652 int i; 1653 char buf[512]; 1654 1655 buf[0] = 0; 1656 for (i = 0; i < indent; i++) 1657 strcat(buf, " "); 1658 strcat(buf, name); 1659 return (pager_printf(STATUS_FORMAT, buf, state_name(state))); 1660} 1661 1662static int 1663vdev_status(vdev_t *vdev, int indent) 1664{ 1665 vdev_t *kid; 1666 int ret; 1667 1668 if (vdev->v_islog) { 1669 (void)pager_output(" logs\n"); 1670 indent++; 1671 } 1672 1673 ret = print_state(indent, vdev->v_name, vdev->v_state); 1674 if (ret != 0) 1675 return (ret); 1676 1677 STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) { 1678 ret = vdev_status(kid, indent + 1); 1679 if (ret != 0) 1680 return (ret); 1681 } 1682 return (ret); 1683} 1684 1685static int 1686spa_status(spa_t *spa) 1687{ 1688 static char bootfs[ZFS_MAXNAMELEN]; 1689 uint64_t rootid; 1690 vdev_list_t *vlist; 1691 vdev_t *vdev; 1692 int good_kids, bad_kids, degraded_kids, ret; 1693 vdev_state_t state; 1694 1695 ret = pager_printf(" pool: %s\n", spa->spa_name); 1696 if (ret != 0) 1697 return (ret); 1698 1699 if (zfs_get_root(spa, &rootid) == 0 && 1700 zfs_rlookup(spa, rootid, bootfs) == 0) { 1701 if (bootfs[0] == '\0') 1702 ret = pager_printf("bootfs: %s\n", spa->spa_name); 1703 else 1704 ret = pager_printf("bootfs: %s/%s\n", spa->spa_name, 1705 bootfs); 1706 if (ret != 0) 1707 return (ret); 1708 } 1709 ret = pager_printf("config:\n\n"); 1710 if (ret != 0) 1711 return (ret); 1712 ret = pager_printf(STATUS_FORMAT, "NAME", "STATE"); 1713 if (ret != 0) 1714 return (ret); 1715 1716 good_kids = 0; 1717 degraded_kids = 0; 1718 bad_kids = 0; 1719 vlist = 
	    &spa->spa_root_vdev->v_children;
	STAILQ_FOREACH(vdev, vlist, v_childlink) {
		if (vdev->v_state == VDEV_STATE_HEALTHY)
			good_kids++;
		else if (vdev->v_state == VDEV_STATE_DEGRADED)
			degraded_kids++;
		else
			bad_kids++;
	}

	/* Pool is healthy only if every top-level vdev is healthy. */
	state = VDEV_STATE_CLOSED;
	if (good_kids > 0 && (degraded_kids + bad_kids) == 0)
		state = VDEV_STATE_HEALTHY;
	else if ((good_kids + degraded_kids) > 0)
		state = VDEV_STATE_DEGRADED;

	ret = print_state(0, spa->spa_name, state);
	if (ret != 0)
		return (ret);

	STAILQ_FOREACH(vdev, vlist, v_childlink) {
		ret = vdev_status(vdev, 1);
		if (ret != 0)
			return (ret);
	}
	return (ret);
}

/*
 * Print status for every known pool, blank-line separated.
 * Stops on the first non-zero pager return (user quit).
 */
static int
spa_all_status(void)
{
	spa_t *spa;
	int first = 1, ret = 0;

	STAILQ_FOREACH(spa, &zfs_pools, spa_link) {
		if (!first) {
			ret = pager_printf("\n");
			if (ret != 0)
				return (ret);
		}
		first = 0;
		ret = spa_status(spa);
		if (ret != 0)
			return (ret);
	}
	return (ret);
}

/*
 * Compute the absolute byte offset of label l on a device of size psize.
 * Labels 0 and 1 live at the front of the device, labels 2 and 3 at the
 * end (mirrored for redundancy).
 */
static uint64_t
vdev_label_offset(uint64_t psize, int l, uint64_t offset)
{
	uint64_t label_offset;

	if (l < VDEV_LABELS / 2)
		label_offset = 0;
	else
		label_offset = psize - VDEV_LABELS * sizeof (vdev_label_t);

	return (offset + l * sizeof (vdev_label_t) + label_offset);
}

/*
 * Three-way compare of two uberblocks: by txg, then timestamp, then
 * (when multihost/MMP data is valid) MMP sequence number.
 */
static int
vdev_uberblock_compare(const uberblock_t *ub1, const uberblock_t *ub2)
{
	unsigned int seq1 = 0;
	unsigned int seq2 = 0;
	int cmp = AVL_CMP(ub1->ub_txg, ub2->ub_txg);

	if (cmp != 0)
		return (cmp);

	cmp = AVL_CMP(ub1->ub_timestamp, ub2->ub_timestamp);
	if (cmp != 0)
		return (cmp);

	if (MMP_VALID(ub1) && MMP_SEQ_VALID(ub1))
		seq1 = MMP_SEQ(ub1);

	if (MMP_VALID(ub2) && MMP_SEQ_VALID(ub2))
		seq2 = MMP_SEQ(ub2);

	return (AVL_CMP(seq1, seq2));
}

/*
 * Validate an uberblock, byte-swapping it in place first if it was
 * written with the opposite endianness.  Returns EINVAL for a bad magic
 * or unsupported SPA version, 0 if usable.
 */
static int
uberblock_verify(uberblock_t *ub)
{
	if (ub->ub_magic == BSWAP_64((uint64_t)UBERBLOCK_MAGIC)) {
		byteswap_uint64_array(ub, sizeof (uberblock_t));
	}

	if (ub->ub_magic != UBERBLOCK_MAGIC ||
	    !SPA_VERSION_IS_SUPPORTED(ub->ub_version))
		return (EINVAL);

	return (0);
}

/*
 * Read `size` bytes at `offset` within label l of vdev vd, building an
 * artificial blkptr so vdev_read_phys() can verify the label checksum
 * (ZIO_CHECKSUM_LABEL embeds the device offset in the checksum salt).
 */
static int
vdev_label_read(vdev_t *vd, int l, void *buf, uint64_t offset,
    size_t size)
{
	blkptr_t bp;
	off_t off;

	off = vdev_label_offset(vd->v_psize, l, offset);

	BP_ZERO(&bp);
	BP_SET_LSIZE(&bp, size);
	BP_SET_PSIZE(&bp, size);
	BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
	BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
	DVA_SET_OFFSET(BP_IDENTITY(&bp), off);
	ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);

	return (vdev_read_phys(vd, &bp, buf, off, size));
}

/*
 * Scan all four labels of vd and return a malloc'ed copy of the config
 * nvlist from the best label whose txg is <= the requested txg.
 * Returns NULL if no usable label was found; caller frees the result.
 */
static unsigned char *
vdev_label_read_config(vdev_t *vd, uint64_t txg)
{
	vdev_phys_t *label;
	uint64_t best_txg = 0;
	uint64_t label_txg = 0;
	uint64_t asize;
	unsigned char *nvl;
	size_t nvl_size;
	int error;

	label = malloc(sizeof (vdev_phys_t));
	if (label == NULL)
		return (NULL);

	/* Skip the 4-byte XDR encoding header and the trailing checksum. */
	nvl_size = VDEV_PHYS_SIZE - sizeof (zio_eck_t) - 4;
	nvl = malloc(nvl_size);
	if (nvl == NULL)
		goto done;

	for (int l = 0; l < VDEV_LABELS; l++) {
		const unsigned char *nvlist;

		if (vdev_label_read(vd, l, label,
		    offsetof(vdev_label_t, vl_vdev_phys),
		    sizeof (vdev_phys_t)))
			continue;

		if (label->vp_nvlist[0] != NV_ENCODE_XDR)
			continue;

		nvlist = (const unsigned char *) label->vp_nvlist + 4;
		error = nvlist_find(nvlist, ZPOOL_CONFIG_POOL_TXG,
		    DATA_TYPE_UINT64, NULL, &label_txg);
		if (error != 0 || label_txg == 0) {
			/* Label without a txg: accept it as-is. */
			memcpy(nvl, nvlist, nvl_size);
			goto done;
		}

		if (label_txg <= txg && label_txg > best_txg) {
			best_txg = label_txg;
			memcpy(nvl, nvlist,
			    nvl_size);

			/*
			 * Use asize from pool config. We need this
			 * because we can get bad value from BIOS.
			 */
			if (nvlist_find(nvlist, ZPOOL_CONFIG_ASIZE,
			    DATA_TYPE_UINT64, NULL, &asize) == 0) {
				vd->v_psize = asize +
				    VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
			}
		}
	}

	if (best_txg == 0) {
		/* No label qualified; report failure to the caller. */
		free(nvl);
		nvl = NULL;
	}
done:
	free(label);
	return (nvl);
}

/*
 * Scan every uberblock slot in every label of vd and keep the best one
 * (per vdev_uberblock_compare) in *ub.  Allocation failure silently
 * leaves *ub unchanged.
 */
static void
vdev_uberblock_load(vdev_t *vd, uberblock_t *ub)
{
	uberblock_t *buf;

	buf = malloc(VDEV_UBERBLOCK_SIZE(vd));
	if (buf == NULL)
		return;

	for (int l = 0; l < VDEV_LABELS; l++) {
		for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
			if (vdev_label_read(vd, l, buf,
			    VDEV_UBERBLOCK_OFFSET(vd, n),
			    VDEV_UBERBLOCK_SIZE(vd)))
				continue;
			if (uberblock_verify(buf) != 0)
				continue;

			if (vdev_uberblock_compare(buf, ub) > 0)
				*ub = *buf;
		}
	}
	free(buf);
}

/*
 * Probe a physical device: read and validate its vdev label, create or
 * update the owning pool's in-core state, and wire this device's read
 * routine into the matching vdev.  On success optionally returns the
 * pool via *spap.  Returns EIO for anything that is not a usable pool
 * member (too small, no label, unsupported version/features, destroyed
 * pool, cache/spare device).
 */
static int
vdev_probe(vdev_phys_read_t *_read, void *read_priv, spa_t **spap)
{
	vdev_t vtmp;
	spa_t *spa;
	vdev_t *vdev;
	unsigned char *nvlist;
	uint64_t val;
	uint64_t guid, vdev_children;
	uint64_t pool_txg, pool_guid;
	const char *pool_name;
	const unsigned char *features;
	int rc;

	/*
	 * Load the vdev label and figure out which
	 * uberblock is most current.
	 */
	memset(&vtmp, 0, sizeof(vtmp));
	vtmp.v_phys_read = _read;
	vtmp.v_read_priv = read_priv;
	/* Round the media size down to a whole number of labels. */
	vtmp.v_psize = P2ALIGN(ldi_get_size(read_priv),
	    (uint64_t)sizeof (vdev_label_t));

	/* Test for minimum device size.
 */
	if (vtmp.v_psize < SPA_MINDEVSIZE)
		return (EIO);

	/* UINT64_MAX: accept the newest label regardless of txg. */
	nvlist = vdev_label_read_config(&vtmp, UINT64_MAX);
	if (nvlist == NULL)
		return (EIO);

	if (nvlist_find(nvlist, ZPOOL_CONFIG_VERSION, DATA_TYPE_UINT64,
	    NULL, &val) != 0) {
		free(nvlist);
		return (EIO);
	}

	if (!SPA_VERSION_IS_SUPPORTED(val)) {
		printf("ZFS: unsupported ZFS version %u (should be %u)\n",
		    (unsigned) val, (unsigned) SPA_VERSION);
		free(nvlist);
		return (EIO);
	}

	/* Check ZFS features for read */
	if (nvlist_find(nvlist, ZPOOL_CONFIG_FEATURES_FOR_READ,
	    DATA_TYPE_NVLIST, NULL, &features) == 0 &&
	    nvlist_check_features_for_read(features) != 0) {
		free(nvlist);
		return (EIO);
	}

	if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_STATE, DATA_TYPE_UINT64,
	    NULL, &val) != 0) {
		free(nvlist);
		return (EIO);
	}

	if (val == POOL_STATE_DESTROYED) {
		/* We don't boot only from destroyed pools. */
		free(nvlist);
		return (EIO);
	}

	if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_TXG, DATA_TYPE_UINT64,
	    NULL, &pool_txg) != 0 ||
	    nvlist_find(nvlist, ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64,
	    NULL, &pool_guid) != 0 ||
	    nvlist_find(nvlist, ZPOOL_CONFIG_POOL_NAME, DATA_TYPE_STRING,
	    NULL, &pool_name) != 0) {
		/*
		 * Cache and spare devices end up here - just ignore
		 * them.
		 */
		free(nvlist);
		return (EIO);
	}

	/*
	 * Create the pool if this is the first time we've seen it.
2007 */ 2008 spa = spa_find_by_guid(pool_guid); 2009 if (spa == NULL) { 2010 nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_CHILDREN, 2011 DATA_TYPE_UINT64, NULL, &vdev_children); 2012 spa = spa_create(pool_guid, pool_name); 2013 if (spa == NULL) { 2014 free(nvlist); 2015 return (ENOMEM); 2016 } 2017 spa->spa_root_vdev->v_nchildren = vdev_children; 2018 } 2019 if (pool_txg > spa->spa_txg) 2020 spa->spa_txg = pool_txg; 2021 2022 /* 2023 * Get the vdev tree and create our in-core copy of it. 2024 * If we already have a vdev with this guid, this must 2025 * be some kind of alias (overlapping slices, dangerously dedicated 2026 * disks etc). 2027 */ 2028 if (nvlist_find(nvlist, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64, 2029 NULL, &guid) != 0) { 2030 free(nvlist); 2031 return (EIO); 2032 } 2033 vdev = vdev_find(guid); 2034 /* Has this vdev already been inited? */ 2035 if (vdev && vdev->v_phys_read) { 2036 free(nvlist); 2037 return (EIO); 2038 } 2039 2040 rc = vdev_init_from_label(spa, nvlist); 2041 free(nvlist); 2042 if (rc != 0) 2043 return (rc); 2044 2045 /* 2046 * We should already have created an incomplete vdev for this 2047 * vdev. Find it and initialise it with our read proc. 2048 */ 2049 vdev = vdev_find(guid); 2050 if (vdev != NULL) { 2051 vdev->v_phys_read = _read; 2052 vdev->v_read_priv = read_priv; 2053 vdev->v_psize = vtmp.v_psize; 2054 /* 2055 * If no other state is set, mark vdev healthy. 2056 */ 2057 if (vdev->v_state == VDEV_STATE_UNKNOWN) 2058 vdev->v_state = VDEV_STATE_HEALTHY; 2059 } else { 2060 printf("ZFS: inconsistent nvlist contents\n"); 2061 return (EIO); 2062 } 2063 2064 if (vdev->v_islog) 2065 spa->spa_with_log = vdev->v_islog; 2066 2067 /* 2068 * Re-evaluate top-level vdev state. 2069 */ 2070 vdev_set_state(vdev->v_top); 2071 2072 /* 2073 * Ok, we are happy with the pool so far. Lets find 2074 * the best uberblock and then we can actually access 2075 * the contents of the pool. 
2076 */ 2077 vdev_uberblock_load(vdev, &spa->spa_uberblock); 2078 2079 if (spap != NULL) 2080 *spap = spa; 2081 return (0); 2082} 2083 2084static int 2085ilog2(int n) 2086{ 2087 int v; 2088 2089 for (v = 0; v < 32; v++) 2090 if (n == (1 << v)) 2091 return v; 2092 return -1; 2093} 2094 2095static int 2096zio_read_gang(const spa_t *spa, const blkptr_t *bp, void *buf) 2097{ 2098 blkptr_t gbh_bp; 2099 zio_gbh_phys_t zio_gb; 2100 char *pbuf; 2101 int i; 2102 2103 /* Artificial BP for gang block header. */ 2104 gbh_bp = *bp; 2105 BP_SET_PSIZE(&gbh_bp, SPA_GANGBLOCKSIZE); 2106 BP_SET_LSIZE(&gbh_bp, SPA_GANGBLOCKSIZE); 2107 BP_SET_CHECKSUM(&gbh_bp, ZIO_CHECKSUM_GANG_HEADER); 2108 BP_SET_COMPRESS(&gbh_bp, ZIO_COMPRESS_OFF); 2109 for (i = 0; i < SPA_DVAS_PER_BP; i++) 2110 DVA_SET_GANG(&gbh_bp.blk_dva[i], 0); 2111 2112 /* Read gang header block using the artificial BP. */ 2113 if (zio_read(spa, &gbh_bp, &zio_gb)) 2114 return (EIO); 2115 2116 pbuf = buf; 2117 for (i = 0; i < SPA_GBH_NBLKPTRS; i++) { 2118 blkptr_t *gbp = &zio_gb.zg_blkptr[i]; 2119 2120 if (BP_IS_HOLE(gbp)) 2121 continue; 2122 if (zio_read(spa, gbp, pbuf)) 2123 return (EIO); 2124 pbuf += BP_GET_PSIZE(gbp); 2125 } 2126 2127 if (zio_checksum_verify(spa, bp, buf)) 2128 return (EIO); 2129 return (0); 2130} 2131 2132static int 2133zio_read(const spa_t *spa, const blkptr_t *bp, void *buf) 2134{ 2135 int cpfunc = BP_GET_COMPRESS(bp); 2136 uint64_t align, size; 2137 void *pbuf; 2138 int i, error; 2139 2140 /* 2141 * Process data embedded in block pointer 2142 */ 2143 if (BP_IS_EMBEDDED(bp)) { 2144 ASSERT(BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA); 2145 2146 size = BPE_GET_PSIZE(bp); 2147 ASSERT(size <= BPE_PAYLOAD_SIZE); 2148 2149 if (cpfunc != ZIO_COMPRESS_OFF) 2150 pbuf = zfs_alloc(size); 2151 else 2152 pbuf = buf; 2153 2154 decode_embedded_bp_compressed(bp, pbuf); 2155 error = 0; 2156 2157 if (cpfunc != ZIO_COMPRESS_OFF) { 2158 error = zio_decompress_data(cpfunc, pbuf, 2159 size, buf, BP_GET_LSIZE(bp)); 2160 
			zfs_free(pbuf, size);
		}
		if (error != 0)
			printf("ZFS: i/o error - unable to decompress block pointer data, error %d\n",
			    error);
		return (error);
	}

	error = EIO;

	/* Try each of the (up to 3) block copies until one reads cleanly. */
	for (i = 0; i < SPA_DVAS_PER_BP; i++) {
		const dva_t *dva = &bp->blk_dva[i];
		vdev_t *vdev;
		vdev_list_t *vlist;
		uint64_t vdevid;
		off_t offset;

		if (!dva->dva_word[0] && !dva->dva_word[1])
			continue;

		vdevid = DVA_GET_VDEV(dva);
		offset = DVA_GET_OFFSET(dva);
		vlist = &spa->spa_root_vdev->v_children;
		STAILQ_FOREACH(vdev, vlist, v_childlink) {
			if (vdev->v_id == vdevid)
				break;
		}
		if (!vdev || !vdev->v_read)
			continue;

		size = BP_GET_PSIZE(bp);
		if (vdev->v_read == vdev_raidz_read) {
			/* Raidz reads must be padded to the ashift. */
			align = 1ULL << vdev->v_ashift;
			if (P2PHASE(size, align) != 0)
				size = P2ROUNDUP(size, align);
		}
		/* Use a bounce buffer when padding or decompressing. */
		if (size != BP_GET_PSIZE(bp) || cpfunc != ZIO_COMPRESS_OFF)
			pbuf = zfs_alloc(size);
		else
			pbuf = buf;

		if (DVA_GET_GANG(dva))
			error = zio_read_gang(spa, bp, pbuf);
		else
			error = vdev->v_read(vdev, bp, pbuf, offset, size);
		if (error == 0) {
			if (cpfunc != ZIO_COMPRESS_OFF)
				error = zio_decompress_data(cpfunc, pbuf,
				    BP_GET_PSIZE(bp), buf, BP_GET_LSIZE(bp));
			else if (size != BP_GET_PSIZE(bp))
				bcopy(pbuf, buf, BP_GET_PSIZE(bp));
		}
		if (buf != pbuf)
			zfs_free(pbuf, size);
		if (error == 0)
			break;
	}
	if (error != 0)
		printf("ZFS: i/o error - all block copies unavailable\n");
	return (error);
}

/*
 * Read buflen bytes at logical offset `offset` of a dnode's data into
 * buf, walking the indirect block tree level by level.  A one-entry
 * cache (dnode_cache_obj/dnode_cache_bn/dnode_cache_buf) short-circuits
 * repeated reads of the same data block.  Returns 0 or an errno.
 */
static int
dnode_read(const spa_t *spa, const dnode_phys_t *dnode, off_t offset, void *buf, size_t buflen)
{
	int ibshift = dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
	int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
	int nlevels = dnode->dn_nlevels;
	int i, rc;

	if (bsize > SPA_MAXBLOCKSIZE) {
		/*
		 * NOTE(review): %llu assumes SPA_MAXBLOCKSIZE expands to an
		 * unsigned long long-compatible constant - verify the macro's
		 * type in zfsimpl.h to rule out a format mismatch.
		 */
		printf("ZFS: I/O error - blocks larger than %llu are not "
		    "supported\n", SPA_MAXBLOCKSIZE);
		return (EIO);
	}

	/*
	 * Note: bsize may not be a power of two here so we need to do an
	 * actual divide rather than a bitshift.
	 */
	while (buflen > 0) {
		uint64_t bn = offset / bsize;
		int boff = offset % bsize;
		int ibn;
		const blkptr_t *indbp;
		blkptr_t bp;

		if (bn > dnode->dn_maxblkid)
			return (EIO);

		if (dnode == dnode_cache_obj && bn == dnode_cache_bn)
			goto cached;

		indbp = dnode->dn_blkptr;
		for (i = 0; i < nlevels; i++) {
			/*
			 * Copy the bp from the indirect array so that
			 * we can re-use the scratch buffer for multi-level
			 * objects.
			 */
			ibn = bn >> ((nlevels - i - 1) * ibshift);
			ibn &= ((1 << ibshift) - 1);
			bp = indbp[ibn];
			if (BP_IS_HOLE(&bp)) {
				/* Hole: the block reads as zeros. */
				memset(dnode_cache_buf, 0, bsize);
				break;
			}
			rc = zio_read(spa, &bp, dnode_cache_buf);
			if (rc)
				return (rc);
			indbp = (const blkptr_t *) dnode_cache_buf;
		}
		dnode_cache_obj = dnode;
		dnode_cache_bn = bn;
	cached:

		/*
		 * The buffer contains our data block. Copy what we
		 * need from it and loop.
		 */
		i = bsize - boff;
		if (i > buflen)
			i = buflen;
		memcpy(buf, &dnode_cache_buf[boff], i);
		buf = ((char *)buf) + i;
		offset += i;
		buflen -= i;
	}

	return (0);
}

/*
 * Lookup a value in a microzap directory. Assumes that the zap
 * scratch buffer contains the directory contents.
 */
static int
mzap_lookup(const dnode_phys_t *dnode, const char *name, uint64_t *value)
{
	const mzap_phys_t *mz;
	const mzap_ent_phys_t *mze;
	size_t size;
	int chunks, i;

	/*
	 * Microzap objects use exactly one block. Read the whole
	 * thing.
	 */
	/* 512 == SPA_MINBLOCKSIZE: dn_datablkszsec counts 512-byte sectors. */
	size = dnode->dn_datablkszsec * 512;

	mz = (const mzap_phys_t *) zap_scratch;
	/* First MZAP_ENT_LEN-sized chunk is the mzap header, hence -1. */
	chunks = size / MZAP_ENT_LEN - 1;

	for (i = 0; i < chunks; i++) {
		mze = &mz->mz_chunk[i];
		if (!strcmp(mze->mze_name, name)) {
			*value = mze->mze_value;
			return (0);
		}
	}

	return (ENOENT);
}

/*
 * Compare a name with a zap leaf entry. Return non-zero if the name
 * matches.
 */
static int
fzap_name_equal(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc, const char *name)
{
	size_t namelen;
	const zap_leaf_chunk_t *nc;
	const char *p;

	/* le_name_numints includes the terminating NUL byte. */
	namelen = zc->l_entry.le_name_numints;

	nc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_name_chunk);
	p = name;
	/* The stored name is split across chained array chunks. */
	while (namelen > 0) {
		size_t len;

		len = namelen;
		if (len > ZAP_LEAF_ARRAY_BYTES)
			len = ZAP_LEAF_ARRAY_BYTES;
		if (memcmp(p, nc->l_array.la_array, len))
			return (0);
		p += len;
		namelen -= len;
		nc = &ZAP_LEAF_CHUNK(zl, nc->l_array.la_next);
	}

	return 1;
}

/*
 * Extract a uint64_t value from a zap leaf entry.
 */
static uint64_t
fzap_leaf_value(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc)
{
	const zap_leaf_chunk_t *vc;
	int i;
	uint64_t value;
	const uint8_t *p;

	/* Values are stored big-endian, one byte per array slot. */
	vc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_value_chunk);
	for (i = 0, value = 0, p = vc->l_array.la_array; i < 8; i++) {
		value = (value << 8) | p[i];
	}

	return value;
}

/*
 * Store an integer of the given byte width at addr.
 * Unsupported widths are silently ignored (callers validate via
 * fzap_check_size first).
 */
static void
stv(int len, void *addr, uint64_t value)
{
	switch (len) {
	case 1:
		*(uint8_t *)addr = value;
		return;
	case 2:
		*(uint16_t *)addr = value;
		return;
	case 4:
		*(uint32_t *)addr = value;
		return;
	case 8:
		*(uint64_t *)addr = value;
		return;
	}
}

/*
 * Extract a array from a zap leaf entry.
 */
static void
fzap_leaf_array(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc,
    uint64_t integer_size, uint64_t num_integers, void *buf)
{
	uint64_t array_int_len = zc->l_entry.le_value_intlen;
	uint64_t value = 0;
	uint64_t *u64 = buf;
	char *p = buf;
	int len = MIN(zc->l_entry.le_value_numints, num_integers);
	int chunk = zc->l_entry.le_value_chunk;
	int byten = 0;

	/* Fast path for the common single-uint64 case. */
	if (integer_size == 8 && len == 1) {
		*u64 = fzap_leaf_value(zl, zc);
		return;
	}

	/*
	 * Walk the chained array chunks, accumulating big-endian bytes
	 * into `value` and emitting one integer every array_int_len bytes.
	 */
	while (len > 0) {
		struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(zl, chunk).l_array;
		int i;

		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(zl));
		for (i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) {
			value = (value << 8) | la->la_array[i];
			byten++;
			if (byten == array_int_len) {
				stv(integer_size, p, value);
				byten = 0;
				len--;
				if (len == 0)
					return;
				p += integer_size;
			}
		}
		chunk = la->la_next;
	}
}

/*
 * Validate a caller-supplied integer size and count for a zap lookup.
 * Returns EINVAL for an unsupported width, E2BIG when the total exceeds
 * ZAP_MAXVALUELEN, 0 otherwise.
 */
static int
fzap_check_size(uint64_t integer_size, uint64_t num_integers)
{

	switch (integer_size) {
	case 1:
	case 2:
	case 4:
	case 8:
		break;
	default:
		return (EINVAL);
	}

	if (integer_size * num_integers > ZAP_MAXVALUELEN)
		return (E2BIG);

	return (0);
}

/*
 * Lookup a value in a fatzap directory. Assumes that the zap scratch
 * buffer contains the directory header.
2455 */ 2456static int 2457fzap_lookup(const spa_t *spa, const dnode_phys_t *dnode, const char *name, 2458 uint64_t integer_size, uint64_t num_integers, void *value) 2459{ 2460 int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT; 2461 zap_phys_t zh = *(zap_phys_t *) zap_scratch; 2462 fat_zap_t z; 2463 uint64_t *ptrtbl; 2464 uint64_t hash; 2465 int rc; 2466 2467 if (zh.zap_magic != ZAP_MAGIC) 2468 return (EIO); 2469 2470 if ((rc = fzap_check_size(integer_size, num_integers)) != 0) 2471 return (rc); 2472 2473 z.zap_block_shift = ilog2(bsize); 2474 z.zap_phys = (zap_phys_t *) zap_scratch; 2475 2476 /* 2477 * Figure out where the pointer table is and read it in if necessary. 2478 */ 2479 if (zh.zap_ptrtbl.zt_blk) { 2480 rc = dnode_read(spa, dnode, zh.zap_ptrtbl.zt_blk * bsize, 2481 zap_scratch, bsize); 2482 if (rc) 2483 return (rc); 2484 ptrtbl = (uint64_t *) zap_scratch; 2485 } else { 2486 ptrtbl = &ZAP_EMBEDDED_PTRTBL_ENT(&z, 0); 2487 } 2488 2489 hash = zap_hash(zh.zap_salt, name); 2490 2491 zap_leaf_t zl; 2492 zl.l_bs = z.zap_block_shift; 2493 2494 off_t off = ptrtbl[hash >> (64 - zh.zap_ptrtbl.zt_shift)] << zl.l_bs; 2495 zap_leaf_chunk_t *zc; 2496 2497 rc = dnode_read(spa, dnode, off, zap_scratch, bsize); 2498 if (rc) 2499 return (rc); 2500 2501 zl.l_phys = (zap_leaf_phys_t *) zap_scratch; 2502 2503 /* 2504 * Make sure this chunk matches our hash. 2505 */ 2506 if (zl.l_phys->l_hdr.lh_prefix_len > 0 2507 && zl.l_phys->l_hdr.lh_prefix 2508 != hash >> (64 - zl.l_phys->l_hdr.lh_prefix_len)) 2509 return (ENOENT); 2510 2511 /* 2512 * Hash within the chunk to find our entry. 
2513 */ 2514 int shift = (64 - ZAP_LEAF_HASH_SHIFT(&zl) - zl.l_phys->l_hdr.lh_prefix_len); 2515 int h = (hash >> shift) & ((1 << ZAP_LEAF_HASH_SHIFT(&zl)) - 1); 2516 h = zl.l_phys->l_hash[h]; 2517 if (h == 0xffff) 2518 return (ENOENT); 2519 zc = &ZAP_LEAF_CHUNK(&zl, h); 2520 while (zc->l_entry.le_hash != hash) { 2521 if (zc->l_entry.le_next == 0xffff) 2522 return (ENOENT); 2523 zc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_next); 2524 } 2525 if (fzap_name_equal(&zl, zc, name)) { 2526 if (zc->l_entry.le_value_intlen * zc->l_entry.le_value_numints > 2527 integer_size * num_integers) 2528 return (E2BIG); 2529 fzap_leaf_array(&zl, zc, integer_size, num_integers, value); 2530 return (0); 2531 } 2532 2533 return (ENOENT); 2534} 2535 2536/* 2537 * Lookup a name in a zap object and return its value as a uint64_t. 2538 */ 2539static int 2540zap_lookup(const spa_t *spa, const dnode_phys_t *dnode, const char *name, 2541 uint64_t integer_size, uint64_t num_integers, void *value) 2542{ 2543 int rc; 2544 uint64_t zap_type; 2545 size_t size = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT; 2546 2547 rc = dnode_read(spa, dnode, 0, zap_scratch, size); 2548 if (rc) 2549 return (rc); 2550 2551 zap_type = *(uint64_t *) zap_scratch; 2552 if (zap_type == ZBT_MICRO) 2553 return mzap_lookup(dnode, name, value); 2554 else if (zap_type == ZBT_HEADER) { 2555 return fzap_lookup(spa, dnode, name, integer_size, 2556 num_integers, value); 2557 } 2558 printf("ZFS: invalid zap_type=%d\n", (int)zap_type); 2559 return (EIO); 2560} 2561 2562/* 2563 * List a microzap directory. Assumes that the zap scratch buffer contains 2564 * the directory contents. 2565 */ 2566static int 2567mzap_list(const dnode_phys_t *dnode, int (*callback)(const char *, uint64_t)) 2568{ 2569 const mzap_phys_t *mz; 2570 const mzap_ent_phys_t *mze; 2571 size_t size; 2572 int chunks, i, rc; 2573 2574 /* 2575 * Microzap objects use exactly one block. Read the whole 2576 * thing. 
2577 */ 2578 size = dnode->dn_datablkszsec * 512; 2579 mz = (const mzap_phys_t *) zap_scratch; 2580 chunks = size / MZAP_ENT_LEN - 1; 2581 2582 for (i = 0; i < chunks; i++) { 2583 mze = &mz->mz_chunk[i]; 2584 if (mze->mze_name[0]) { 2585 rc = callback(mze->mze_name, mze->mze_value); 2586 if (rc != 0) 2587 return (rc); 2588 } 2589 } 2590 2591 return (0); 2592} 2593 2594/* 2595 * List a fatzap directory. Assumes that the zap scratch buffer contains 2596 * the directory header. 2597 */ 2598static int 2599fzap_list(const spa_t *spa, const dnode_phys_t *dnode, int (*callback)(const char *, uint64_t)) 2600{ 2601 int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT; 2602 zap_phys_t zh = *(zap_phys_t *) zap_scratch; 2603 fat_zap_t z; 2604 int i, j, rc; 2605 2606 if (zh.zap_magic != ZAP_MAGIC) 2607 return (EIO); 2608 2609 z.zap_block_shift = ilog2(bsize); 2610 z.zap_phys = (zap_phys_t *) zap_scratch; 2611 2612 /* 2613 * This assumes that the leaf blocks start at block 1. The 2614 * documentation isn't exactly clear on this. 2615 */ 2616 zap_leaf_t zl; 2617 zl.l_bs = z.zap_block_shift; 2618 for (i = 0; i < zh.zap_num_leafs; i++) { 2619 off_t off = ((off_t)(i + 1)) << zl.l_bs; 2620 char name[256], *p; 2621 uint64_t value; 2622 2623 if (dnode_read(spa, dnode, off, zap_scratch, bsize)) 2624 return (EIO); 2625 2626 zl.l_phys = (zap_leaf_phys_t *) zap_scratch; 2627 2628 for (j = 0; j < ZAP_LEAF_NUMCHUNKS(&zl); j++) { 2629 zap_leaf_chunk_t *zc, *nc; 2630 int namelen; 2631 2632 zc = &ZAP_LEAF_CHUNK(&zl, j); 2633 if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY) 2634 continue; 2635 namelen = zc->l_entry.le_name_numints; 2636 if (namelen > sizeof(name)) 2637 namelen = sizeof(name); 2638 2639 /* 2640 * Paste the name back together. 
2641 */ 2642 nc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_name_chunk); 2643 p = name; 2644 while (namelen > 0) { 2645 int len; 2646 len = namelen; 2647 if (len > ZAP_LEAF_ARRAY_BYTES) 2648 len = ZAP_LEAF_ARRAY_BYTES; 2649 memcpy(p, nc->l_array.la_array, len); 2650 p += len; 2651 namelen -= len; 2652 nc = &ZAP_LEAF_CHUNK(&zl, nc->l_array.la_next); 2653 } 2654 2655 /* 2656 * Assume the first eight bytes of the value are 2657 * a uint64_t. 2658 */ 2659 value = fzap_leaf_value(&zl, zc); 2660 2661 //printf("%s 0x%jx\n", name, (uintmax_t)value); 2662 rc = callback((const char *)name, value); 2663 if (rc != 0) 2664 return (rc); 2665 } 2666 } 2667 2668 return (0); 2669} 2670 2671static int zfs_printf(const char *name, uint64_t value __unused) 2672{ 2673 2674 printf("%s\n", name); 2675 2676 return (0); 2677} 2678 2679/* 2680 * List a zap directory. 2681 */ 2682static int 2683zap_list(const spa_t *spa, const dnode_phys_t *dnode) 2684{ 2685 uint64_t zap_type; 2686 size_t size = dnode->dn_datablkszsec * 512; 2687 2688 if (dnode_read(spa, dnode, 0, zap_scratch, size)) 2689 return (EIO); 2690 2691 zap_type = *(uint64_t *) zap_scratch; 2692 if (zap_type == ZBT_MICRO) 2693 return mzap_list(dnode, zfs_printf); 2694 else 2695 return fzap_list(spa, dnode, zfs_printf); 2696} 2697 2698static int 2699objset_get_dnode(const spa_t *spa, const objset_phys_t *os, uint64_t objnum, dnode_phys_t *dnode) 2700{ 2701 off_t offset; 2702 2703 offset = objnum * sizeof(dnode_phys_t); 2704 return dnode_read(spa, &os->os_meta_dnode, offset, 2705 dnode, sizeof(dnode_phys_t)); 2706} 2707 2708static int 2709mzap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, char *name, uint64_t value) 2710{ 2711 const mzap_phys_t *mz; 2712 const mzap_ent_phys_t *mze; 2713 size_t size; 2714 int chunks, i; 2715 2716 /* 2717 * Microzap objects use exactly one block. Read the whole 2718 * thing. 
2719 */ 2720 size = dnode->dn_datablkszsec * 512; 2721 2722 mz = (const mzap_phys_t *) zap_scratch; 2723 chunks = size / MZAP_ENT_LEN - 1; 2724 2725 for (i = 0; i < chunks; i++) { 2726 mze = &mz->mz_chunk[i]; 2727 if (value == mze->mze_value) { 2728 strcpy(name, mze->mze_name); 2729 return (0); 2730 } 2731 } 2732 2733 return (ENOENT); 2734} 2735 2736static void 2737fzap_name_copy(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc, char *name) 2738{ 2739 size_t namelen; 2740 const zap_leaf_chunk_t *nc; 2741 char *p; 2742 2743 namelen = zc->l_entry.le_name_numints; 2744 2745 nc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_name_chunk); 2746 p = name; 2747 while (namelen > 0) { 2748 size_t len; 2749 len = namelen; 2750 if (len > ZAP_LEAF_ARRAY_BYTES) 2751 len = ZAP_LEAF_ARRAY_BYTES; 2752 memcpy(p, nc->l_array.la_array, len); 2753 p += len; 2754 namelen -= len; 2755 nc = &ZAP_LEAF_CHUNK(zl, nc->l_array.la_next); 2756 } 2757 2758 *p = '\0'; 2759} 2760 2761static int 2762fzap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, char *name, uint64_t value) 2763{ 2764 int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT; 2765 zap_phys_t zh = *(zap_phys_t *)zap_scratch; 2766 fat_zap_t z; 2767 int i, j; 2768 2769 if (zh.zap_magic != ZAP_MAGIC) 2770 return (EIO); 2771 2772 z.zap_block_shift = ilog2(bsize); 2773 z.zap_phys = (zap_phys_t *) zap_scratch; 2774 2775 /* 2776 * This assumes that the leaf blocks start at block 1. The 2777 * documentation isn't exactly clear on this. 
2778 */ 2779 zap_leaf_t zl; 2780 zl.l_bs = z.zap_block_shift; 2781 for (i = 0; i < zh.zap_num_leafs; i++) { 2782 off_t off = ((off_t)(i + 1)) << zl.l_bs; 2783 2784 if (dnode_read(spa, dnode, off, zap_scratch, bsize)) 2785 return (EIO); 2786 2787 zl.l_phys = (zap_leaf_phys_t *) zap_scratch; 2788 2789 for (j = 0; j < ZAP_LEAF_NUMCHUNKS(&zl); j++) { 2790 zap_leaf_chunk_t *zc; 2791 2792 zc = &ZAP_LEAF_CHUNK(&zl, j); 2793 if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY) 2794 continue; 2795 if (zc->l_entry.le_value_intlen != 8 || 2796 zc->l_entry.le_value_numints != 1) 2797 continue; 2798 2799 if (fzap_leaf_value(&zl, zc) == value) { 2800 fzap_name_copy(&zl, zc, name); 2801 return (0); 2802 } 2803 } 2804 } 2805 2806 return (ENOENT); 2807} 2808 2809static int 2810zap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, char *name, uint64_t value) 2811{ 2812 int rc; 2813 uint64_t zap_type; 2814 size_t size = dnode->dn_datablkszsec * 512; 2815 2816 rc = dnode_read(spa, dnode, 0, zap_scratch, size); 2817 if (rc) 2818 return (rc); 2819 2820 zap_type = *(uint64_t *) zap_scratch; 2821 if (zap_type == ZBT_MICRO) 2822 return mzap_rlookup(spa, dnode, name, value); 2823 else 2824 return fzap_rlookup(spa, dnode, name, value); 2825} 2826 2827static int 2828zfs_rlookup(const spa_t *spa, uint64_t objnum, char *result) 2829{ 2830 char name[256]; 2831 char component[256]; 2832 uint64_t dir_obj, parent_obj, child_dir_zapobj; 2833 dnode_phys_t child_dir_zap, dataset, dir, parent; 2834 dsl_dir_phys_t *dd; 2835 dsl_dataset_phys_t *ds; 2836 char *p; 2837 int len; 2838 2839 p = &name[sizeof(name) - 1]; 2840 *p = '\0'; 2841 2842 if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) { 2843 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum); 2844 return (EIO); 2845 } 2846 ds = (dsl_dataset_phys_t *)&dataset.dn_bonus; 2847 dir_obj = ds->ds_dir_obj; 2848 2849 for (;;) { 2850 if (objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir) != 0) 2851 return (EIO); 2852 dd = (dsl_dir_phys_t 
*)&dir.dn_bonus; 2853 2854 /* Actual loop condition. */ 2855 parent_obj = dd->dd_parent_obj; 2856 if (parent_obj == 0) 2857 break; 2858 2859 if (objset_get_dnode(spa, &spa->spa_mos, parent_obj, &parent) != 0) 2860 return (EIO); 2861 dd = (dsl_dir_phys_t *)&parent.dn_bonus; 2862 child_dir_zapobj = dd->dd_child_dir_zapobj; 2863 if (objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj, &child_dir_zap) != 0) 2864 return (EIO); 2865 if (zap_rlookup(spa, &child_dir_zap, component, dir_obj) != 0) 2866 return (EIO); 2867 2868 len = strlen(component); 2869 p -= len; 2870 memcpy(p, component, len); 2871 --p; 2872 *p = '/'; 2873 2874 /* Actual loop iteration. */ 2875 dir_obj = parent_obj; 2876 } 2877 2878 if (*p != '\0') 2879 ++p; 2880 strcpy(result, p); 2881 2882 return (0); 2883} 2884 2885static int 2886zfs_lookup_dataset(const spa_t *spa, const char *name, uint64_t *objnum) 2887{ 2888 char element[256]; 2889 uint64_t dir_obj, child_dir_zapobj; 2890 dnode_phys_t child_dir_zap, dir; 2891 dsl_dir_phys_t *dd; 2892 const char *p, *q; 2893 2894 if (objset_get_dnode(spa, &spa->spa_mos, DMU_POOL_DIRECTORY_OBJECT, &dir)) 2895 return (EIO); 2896 if (zap_lookup(spa, &dir, DMU_POOL_ROOT_DATASET, sizeof (dir_obj), 2897 1, &dir_obj)) 2898 return (EIO); 2899 2900 p = name; 2901 for (;;) { 2902 if (objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir)) 2903 return (EIO); 2904 dd = (dsl_dir_phys_t *)&dir.dn_bonus; 2905 2906 while (*p == '/') 2907 p++; 2908 /* Actual loop condition #1. */ 2909 if (*p == '\0') 2910 break; 2911 2912 q = strchr(p, '/'); 2913 if (q) { 2914 memcpy(element, p, q - p); 2915 element[q - p] = '\0'; 2916 p = q + 1; 2917 } else { 2918 strcpy(element, p); 2919 p += strlen(p); 2920 } 2921 2922 child_dir_zapobj = dd->dd_child_dir_zapobj; 2923 if (objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj, &child_dir_zap) != 0) 2924 return (EIO); 2925 2926 /* Actual loop condition #2. 
*/ 2927 if (zap_lookup(spa, &child_dir_zap, element, sizeof (dir_obj), 2928 1, &dir_obj) != 0) 2929 return (ENOENT); 2930 } 2931 2932 *objnum = dd->dd_head_dataset_obj; 2933 return (0); 2934} 2935 2936#ifndef BOOT2 2937static int 2938zfs_list_dataset(const spa_t *spa, uint64_t objnum/*, int pos, char *entry*/) 2939{ 2940 uint64_t dir_obj, child_dir_zapobj; 2941 dnode_phys_t child_dir_zap, dir, dataset; 2942 dsl_dataset_phys_t *ds; 2943 dsl_dir_phys_t *dd; 2944 2945 if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) { 2946 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum); 2947 return (EIO); 2948 } 2949 ds = (dsl_dataset_phys_t *) &dataset.dn_bonus; 2950 dir_obj = ds->ds_dir_obj; 2951 2952 if (objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir)) { 2953 printf("ZFS: can't find dirobj %ju\n", (uintmax_t)dir_obj); 2954 return (EIO); 2955 } 2956 dd = (dsl_dir_phys_t *)&dir.dn_bonus; 2957 2958 child_dir_zapobj = dd->dd_child_dir_zapobj; 2959 if (objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj, &child_dir_zap) != 0) { 2960 printf("ZFS: can't find child zap %ju\n", (uintmax_t)dir_obj); 2961 return (EIO); 2962 } 2963 2964 return (zap_list(spa, &child_dir_zap) != 0); 2965} 2966 2967int 2968zfs_callback_dataset(const spa_t *spa, uint64_t objnum, int (*callback)(const char *, uint64_t)) 2969{ 2970 uint64_t dir_obj, child_dir_zapobj, zap_type; 2971 dnode_phys_t child_dir_zap, dir, dataset; 2972 dsl_dataset_phys_t *ds; 2973 dsl_dir_phys_t *dd; 2974 int err; 2975 2976 err = objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset); 2977 if (err != 0) { 2978 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum); 2979 return (err); 2980 } 2981 ds = (dsl_dataset_phys_t *) &dataset.dn_bonus; 2982 dir_obj = ds->ds_dir_obj; 2983 2984 err = objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir); 2985 if (err != 0) { 2986 printf("ZFS: can't find dirobj %ju\n", (uintmax_t)dir_obj); 2987 return (err); 2988 } 2989 dd = (dsl_dir_phys_t *)&dir.dn_bonus; 2990 2991 
child_dir_zapobj = dd->dd_child_dir_zapobj; 2992 err = objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj, &child_dir_zap); 2993 if (err != 0) { 2994 printf("ZFS: can't find child zap %ju\n", (uintmax_t)dir_obj); 2995 return (err); 2996 } 2997 2998 err = dnode_read(spa, &child_dir_zap, 0, zap_scratch, child_dir_zap.dn_datablkszsec * 512); 2999 if (err != 0) 3000 return (err); 3001 3002 zap_type = *(uint64_t *) zap_scratch; 3003 if (zap_type == ZBT_MICRO) 3004 return mzap_list(&child_dir_zap, callback); 3005 else 3006 return fzap_list(spa, &child_dir_zap, callback); 3007} 3008#endif 3009 3010/* 3011 * Find the object set given the object number of its dataset object 3012 * and return its details in *objset 3013 */ 3014static int 3015zfs_mount_dataset(const spa_t *spa, uint64_t objnum, objset_phys_t *objset) 3016{ 3017 dnode_phys_t dataset; 3018 dsl_dataset_phys_t *ds; 3019 3020 if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) { 3021 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum); 3022 return (EIO); 3023 } 3024 3025 ds = (dsl_dataset_phys_t *) &dataset.dn_bonus; 3026 if (zio_read(spa, &ds->ds_bp, objset)) { 3027 printf("ZFS: can't read object set for dataset %ju\n", 3028 (uintmax_t)objnum); 3029 return (EIO); 3030 } 3031 3032 return (0); 3033} 3034 3035/* 3036 * Find the object set pointed to by the BOOTFS property or the root 3037 * dataset if there is none and return its details in *objset 3038 */ 3039static int 3040zfs_get_root(const spa_t *spa, uint64_t *objid) 3041{ 3042 dnode_phys_t dir, propdir; 3043 uint64_t props, bootfs, root; 3044 3045 *objid = 0; 3046 3047 /* 3048 * Start with the MOS directory object. 3049 */ 3050 if (objset_get_dnode(spa, &spa->spa_mos, DMU_POOL_DIRECTORY_OBJECT, &dir)) { 3051 printf("ZFS: can't read MOS object directory\n"); 3052 return (EIO); 3053 } 3054 3055 /* 3056 * Lookup the pool_props and see if we can find a bootfs. 
3057 */ 3058 if (zap_lookup(spa, &dir, DMU_POOL_PROPS, 3059 sizeof(props), 1, &props) == 0 && 3060 objset_get_dnode(spa, &spa->spa_mos, props, &propdir) == 0 && 3061 zap_lookup(spa, &propdir, "bootfs", 3062 sizeof(bootfs), 1, &bootfs) == 0 && bootfs != 0) { 3063 *objid = bootfs; 3064 return (0); 3065 } 3066 /* 3067 * Lookup the root dataset directory 3068 */ 3069 if (zap_lookup(spa, &dir, DMU_POOL_ROOT_DATASET, sizeof (root), 1, &root) 3070 || objset_get_dnode(spa, &spa->spa_mos, root, &dir)) { 3071 printf("ZFS: can't find root dsl_dir\n"); 3072 return (EIO); 3073 } 3074 3075 /* 3076 * Use the information from the dataset directory's bonus buffer 3077 * to find the dataset object and from that the object set itself. 3078 */ 3079 dsl_dir_phys_t *dd = (dsl_dir_phys_t *) &dir.dn_bonus; 3080 *objid = dd->dd_head_dataset_obj; 3081 return (0); 3082} 3083 3084static int 3085zfs_mount(const spa_t *spa, uint64_t rootobj, struct zfsmount *mount) 3086{ 3087 3088 mount->spa = spa; 3089 3090 /* 3091 * Find the root object set if not explicitly provided 3092 */ 3093 if (rootobj == 0 && zfs_get_root(spa, &rootobj)) { 3094 printf("ZFS: can't find root filesystem\n"); 3095 return (EIO); 3096 } 3097 3098 if (zfs_mount_dataset(spa, rootobj, &mount->objset)) { 3099 printf("ZFS: can't open root filesystem\n"); 3100 return (EIO); 3101 } 3102 3103 mount->rootobj = rootobj; 3104 3105 return (0); 3106} 3107 3108/* 3109 * callback function for feature name checks. 3110 */ 3111static int 3112check_feature(const char *name, uint64_t value) 3113{ 3114 int i; 3115 3116 if (value == 0) 3117 return (0); 3118 if (name[0] == '\0') 3119 return (0); 3120 3121 for (i = 0; features_for_read[i] != NULL; i++) { 3122 if (strcmp(name, features_for_read[i]) == 0) 3123 return (0); 3124 } 3125 printf("ZFS: unsupported feature: %s\n", name); 3126 return (EIO); 3127} 3128 3129/* 3130 * Checks whether the MOS features that are active are supported. 
 */
static int
check_mos_features(const spa_t *spa)
{
	dnode_phys_t dir;
	uint64_t objnum, zap_type;
	size_t size;
	int rc;

	/*
	 * NOTE(review): DMU_OT_OBJECT_DIRECTORY (an object-type constant)
	 * is passed as an object number here; presumably it coincides
	 * with DMU_POOL_DIRECTORY_OBJECT — confirm.
	 */
	if ((rc = objset_get_dnode(spa, &spa->spa_mos, DMU_OT_OBJECT_DIRECTORY,
	    &dir)) != 0)
		return (rc);
	if ((rc = zap_lookup(spa, &dir, DMU_POOL_FEATURES_FOR_READ,
	    sizeof (objnum), 1, &objnum)) != 0) {
		/*
		 * It is older pool without features. As we have already
		 * tested the label, just return without raising the error.
		 */
		return (0);
	}

	if ((rc = objset_get_dnode(spa, &spa->spa_mos, objnum, &dir)) != 0)
		return (rc);

	if (dir.dn_type != DMU_OTN_ZAP_METADATA)
		return (EIO);

	/* Walk the features zap; check_feature rejects unknown names. */
	size = dir.dn_datablkszsec * 512;
	if (dnode_read(spa, &dir, 0, zap_scratch, size))
		return (EIO);

	zap_type = *(uint64_t *) zap_scratch;
	if (zap_type == ZBT_MICRO)
		rc = mzap_list(&dir, check_feature);
	else
		rc = fzap_list(spa, &dir, check_feature);

	return (rc);
}

/*
 * Read a packed nvlist object from the MOS into a freshly allocated
 * buffer, returned via '*value'.  The caller owns (and must free())
 * the buffer.  The object's bonus holds the packed size.
 */
static int
load_nvlist(spa_t *spa, uint64_t obj, unsigned char **value)
{
	dnode_phys_t dir;
	size_t size;
	int rc;
	unsigned char *nv;

	*value = NULL;
	if ((rc = objset_get_dnode(spa, &spa->spa_mos, obj, &dir)) != 0)
		return (rc);
	/*
	 * NOTE(review): this rejects the dnode only when BOTH the type
	 * and the bonus type are wrong — confirm '&&' (rather than '||')
	 * is intended.
	 */
	if (dir.dn_type != DMU_OT_PACKED_NVLIST &&
	    dir.dn_bonustype != DMU_OT_PACKED_NVLIST_SIZE) {
		return (EIO);
	}

	if (dir.dn_bonuslen != sizeof (uint64_t))
		return (EIO);

	/* The bonus buffer holds the packed nvlist's size in bytes. */
	size = *(uint64_t *)DN_BONUS(&dir);
	nv = malloc(size);
	if (nv == NULL)
		return (ENOMEM);

	rc = dnode_read(spa, &dir, 0, nv, size);
	if (rc != 0) {
		free(nv);
		nv = NULL;
		return (rc);
	}
	*value = nv;
	return (rc);
}

/*
 * Late pool initialization: read the MOS from the active uberblock,
 * verify supported features, and refresh the vdev tree from the
 * pool's MOS config nvlist.
 */
static int
zfs_spa_init(spa_t *spa)
{
	dnode_phys_t dir;
	uint64_t config_object;
	unsigned char *nvlist;
	int rc;

	if (zio_read(spa, &spa->spa_uberblock.ub_rootbp, &spa->spa_mos)) {
		printf("ZFS: can't read MOS of pool %s\n", spa->spa_name);
		return (EIO);
	}
	if (spa->spa_mos.os_type != DMU_OST_META) {
		printf("ZFS: corrupted MOS of pool %s\n", spa->spa_name);
		return (EIO);
	}

	if (objset_get_dnode(spa, &spa->spa_mos, DMU_POOL_DIRECTORY_OBJECT,
	    &dir)) {
		printf("ZFS: failed to read pool %s directory object\n",
		    spa->spa_name);
		return (EIO);
	}
	/* this is allowed to fail, older pools do not have salt */
	rc = zap_lookup(spa, &dir, DMU_POOL_CHECKSUM_SALT, 1,
	    sizeof (spa->spa_cksum_salt.zcs_bytes),
	    spa->spa_cksum_salt.zcs_bytes);

	rc = check_mos_features(spa);
	if (rc != 0) {
		printf("ZFS: pool %s is not supported\n", spa->spa_name);
		return (rc);
	}

	rc = zap_lookup(spa, &dir, DMU_POOL_CONFIG,
	    sizeof (config_object), 1, &config_object);
	if (rc != 0) {
		printf("ZFS: can not read MOS %s\n", DMU_POOL_CONFIG);
		return (EIO);
	}
	rc = load_nvlist(spa, config_object, &nvlist);
	if (rc != 0)
		return (rc);

	/*
	 * Update vdevs from MOS config.  The +4 presumably skips the
	 * packed nvlist stream header bytes — confirm against the
	 * nvlist encoding used here.
	 */
	rc = vdev_init_from_nvlist(spa, nvlist + 4);
	free(nvlist);
	return (rc);
}

/*
 * Fill in mode/uid/gid/size of 'sb' from a file dnode's bonus buffer,
 * handling both legacy znode_phys_t bonuses and system-attribute (SA)
 * bonuses, including SA data spilled into a separate block.
 */
static int
zfs_dnode_stat(const spa_t *spa, dnode_phys_t *dn, struct stat *sb)
{

	if (dn->dn_bonustype != DMU_OT_SA) {
		/* Legacy layout: attributes live in a znode_phys_t. */
		znode_phys_t *zp = (znode_phys_t *)dn->dn_bonus;

		sb->st_mode = zp->zp_mode;
		sb->st_uid = zp->zp_uid;
		sb->st_gid = zp->zp_gid;
		sb->st_size = zp->zp_size;
	} else {
		sa_hdr_phys_t *sahdrp;
		int hdrsize;
		size_t size = 0;
		void *buf = NULL;

		if (dn->dn_bonuslen != 0)
			sahdrp = (sa_hdr_phys_t *)DN_BONUS(dn);
		else {
			/* SA data is in a spill block; read it in. */
			if ((dn->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0) {
				blkptr_t *bp = DN_SPILL_BLKPTR(dn);
				int error;

				size = BP_GET_LSIZE(bp);
				buf = zfs_alloc(size);
				error = zio_read(spa, bp, buf);
				if (error != 0) {
					zfs_free(buf, size);
					return (error);
				}
				sahdrp = buf;
			} else {
				return (EIO);
			}
		}
		/* Attributes sit at fixed offsets past the SA header. */
		hdrsize = SA_HDR_SIZE(sahdrp);
		sb->st_mode = *(uint64_t *)((char *)sahdrp + hdrsize +
		    SA_MODE_OFFSET);
		sb->st_uid = *(uint64_t *)((char *)sahdrp + hdrsize +
		    SA_UID_OFFSET);
		sb->st_gid = *(uint64_t *)((char *)sahdrp + hdrsize +
		    SA_GID_OFFSET);
		sb->st_size = *(uint64_t *)((char *)sahdrp + hdrsize +
		    SA_SIZE_OFFSET);
		if (buf != NULL)
			zfs_free(buf, size);
	}

	return (0);
}

/*
 * Copy the target of a symlink dnode into 'path' (psize bytes).
 * Handles SA bonuses (including spill blocks), legacy bonus-embedded
 * targets, and targets stored in the file's data blocks.
 */
static int
zfs_dnode_readlink(const spa_t *spa, dnode_phys_t *dn, char *path, size_t psize)
{
	int rc = 0;

	if (dn->dn_bonustype == DMU_OT_SA) {
		sa_hdr_phys_t *sahdrp = NULL;
		size_t size = 0;
		void *buf = NULL;
		int hdrsize;
		char *p;

		if (dn->dn_bonuslen != 0)
			sahdrp = (sa_hdr_phys_t *)DN_BONUS(dn);
		else {
			blkptr_t *bp;

			if ((dn->dn_flags & DNODE_FLAG_SPILL_BLKPTR) == 0)
				return (EIO);
			bp = DN_SPILL_BLKPTR(dn);

			size = BP_GET_LSIZE(bp);
			buf = zfs_alloc(size);
			rc = zio_read(spa, bp, buf);
			if (rc != 0) {
				zfs_free(buf, size);
				return (rc);
			}
			sahdrp = buf;
		}
		hdrsize = SA_HDR_SIZE(sahdrp);
		p = (char *)((uintptr_t)sahdrp + hdrsize + SA_SYMLINK_OFFSET);
		memcpy(path, p, psize);
		if (buf != NULL)
			zfs_free(buf, size);
		return (0);
	}
	/*
	 * Second test is purely to silence bogus compiler
	 * warning about accessing past the end of dn_bonus.
	 */
	if (psize + sizeof(znode_phys_t) <= dn->dn_bonuslen &&
	    sizeof(znode_phys_t) <= sizeof(dn->dn_bonus)) {
		/* Short target embedded right after the znode in the bonus. */
		memcpy(path, &dn->dn_bonus[sizeof(znode_phys_t)], psize);
	} else {
		/* Long target stored in the file's data blocks. */
		rc = dnode_read(spa, dn, 0, path, psize);
	}
	return (rc);
}

/*
 * Node of the directory-object stack kept by zfs_lookup() so that
 * ".." components can step back to the parent directory.
 */
struct obj_list {
	uint64_t objnum;		/* directory object number */
	STAILQ_ENTRY(obj_list) entry;
};

/*
 * Lookup a file and return its dnode.
 *
 * Resolves 'upath' relative to the mount's root, handling ".", "..",
 * and up to 10 levels of symlinks; the visited directory objects are
 * kept on 'on_cache' so ".." can pop back to the parent.
 */
static int
zfs_lookup(const struct zfsmount *mount, const char *upath, dnode_phys_t *dnode)
{
	int rc;
	uint64_t objnum;
	const spa_t *spa;
	dnode_phys_t dn;
	const char *p, *q;
	char element[256];
	char path[1024];
	int symlinks_followed = 0;
	struct stat sb;
	struct obj_list *entry, *tentry;
	STAILQ_HEAD(, obj_list) on_cache = STAILQ_HEAD_INITIALIZER(on_cache);

	spa = mount->spa;
	if (mount->objset.os_type != DMU_OST_ZFS) {
		printf("ZFS: unexpected object set type %ju\n",
		    (uintmax_t)mount->objset.os_type);
		return (EIO);
	}

	if ((entry = malloc(sizeof(struct obj_list))) == NULL)
		return (ENOMEM);

	/*
	 * Get the root directory dnode.
	 */
	rc = objset_get_dnode(spa, &mount->objset, MASTER_NODE_OBJ, &dn);
	if (rc) {
		free(entry);
		return (rc);
	}

	rc = zap_lookup(spa, &dn, ZFS_ROOT_OBJ, sizeof (objnum), 1, &objnum);
	if (rc) {
		free(entry);
		return (rc);
	}
	/* Seed the directory stack with the filesystem root. */
	entry->objnum = objnum;
	STAILQ_INSERT_HEAD(&on_cache, entry, entry);

	rc = objset_get_dnode(spa, &mount->objset, objnum, &dn);
	if (rc != 0)
		goto done;

	p = upath;
	while (p && *p) {
		rc = objset_get_dnode(spa, &mount->objset, objnum, &dn);
		if (rc != 0)
			goto done;

		while (*p == '/')
			p++;
		if (*p == '\0')
			break;
		q = p;
		while (*q != '\0' && *q != '/')
			q++;

		/* skip dot */
		if (p + 1 == q && p[0] == '.') {
			p++;
			continue;
		}
		/* double dot */
		if (p + 2 == q && p[0] == '.' && p[1] == '.') {
			p += 2;
			/* Can't go above the filesystem root. */
			if (STAILQ_FIRST(&on_cache) ==
			    STAILQ_LAST(&on_cache, obj_list, entry)) {
				rc = ENOENT;
				goto done;
			}
			entry = STAILQ_FIRST(&on_cache);
			STAILQ_REMOVE_HEAD(&on_cache, entry);
			free(entry);
			objnum = (STAILQ_FIRST(&on_cache))->objnum;
			continue;
		}
		if (q - p + 1 > sizeof(element)) {
			rc = ENAMETOOLONG;
			goto done;
		}
		memcpy(element, p, q - p);
		element[q - p] = 0;
		p = q;

		if ((rc = zfs_dnode_stat(spa, &dn, &sb)) != 0)
			goto done;
		if (!S_ISDIR(sb.st_mode)) {
			rc = ENOTDIR;
			goto done;
		}

		rc = zap_lookup(spa, &dn, element, sizeof (objnum), 1, &objnum);
		if (rc)
			goto done;
		/* Directory entries also encode a type; keep the objnum. */
		objnum = ZFS_DIRENT_OBJ(objnum);

		if ((entry = malloc(sizeof(struct obj_list))) == NULL) {
			rc = ENOMEM;
			goto done;
		}
		entry->objnum = objnum;
		STAILQ_INSERT_HEAD(&on_cache, entry, entry);
		rc = objset_get_dnode(spa, &mount->objset, objnum, &dn);
		if (rc)
			goto done;

		/*
		 * Check for symlink.
		 */
		rc = zfs_dnode_stat(spa, &dn, &sb);
		if (rc)
			goto done;
		if (S_ISLNK(sb.st_mode)) {
			if (symlinks_followed > 10) {
				rc = EMLINK;
				goto done;
			}
			symlinks_followed++;

			/*
			 * Read the link value and copy the tail of our
			 * current path onto the end.
			 */
			if (sb.st_size + strlen(p) + 1 > sizeof(path)) {
				rc = ENAMETOOLONG;
				goto done;
			}
			strcpy(&path[sb.st_size], p);

			rc = zfs_dnode_readlink(spa, &dn, path, sb.st_size);
			if (rc != 0)
				goto done;

			/*
			 * Restart with the new path, starting either at
			 * the root or at the parent depending whether or
			 * not the link is relative.
			 */
			p = path;
			if (*p == '/') {
				/* Absolute target: pop back to the root. */
				while (STAILQ_FIRST(&on_cache) !=
				    STAILQ_LAST(&on_cache, obj_list, entry)) {
					entry = STAILQ_FIRST(&on_cache);
					STAILQ_REMOVE_HEAD(&on_cache, entry);
					free(entry);
				}
			} else {
				/* Relative target: pop the link itself. */
				entry = STAILQ_FIRST(&on_cache);
				STAILQ_REMOVE_HEAD(&on_cache, entry);
				free(entry);
			}
			objnum = (STAILQ_FIRST(&on_cache))->objnum;
		}
	}

	*dnode = dn;
done:
	STAILQ_FOREACH_SAFE(entry, &on_cache, entry, tentry)
		free(entry);
	return (rc);
}