/*-
 * Copyright (c) 2007 Doug Rabson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

/*
 *	Stand-alone ZFS file reader.
 */

#include <sys/endian.h>
#include <sys/stat.h>
#include <sys/stdint.h>
#include <sys/list.h>
#include <machine/_inttypes.h>

#include "zfsimpl.h"
#include "zfssubr.c"


struct zfsmount {
	const spa_t	*spa;
	objset_phys_t	objset;
	uint64_t	rootobj;
};
static struct zfsmount zfsmount __unused;

/*
 * The indirect_child_t represents the vdev that we will read from, when we
 * need to read all copies of the data (e.g. for scrub or reconstruction).
 * For plain (non-mirror) top-level vdevs (i.e. is_vdev is not a mirror),
 * ic_vdev is the same as is_vdev.  However, for mirror top-level vdevs,
 * ic_vdev is a child of the mirror.
 */
typedef struct indirect_child {
	void *ic_data;
	vdev_t *ic_vdev;
} indirect_child_t;

/*
 * The indirect_split_t represents one mapped segment of an i/o to the
 * indirect vdev. For non-split (contiguously-mapped) blocks, there will be
 * only one indirect_split_t, with is_split_offset==0 and is_size==io_size.
 * For split blocks, there will be several of these.
 */
typedef struct indirect_split {
	list_node_t is_node; /* link on iv_splits */

	/*
	 * is_split_offset is the offset into the i/o.
	 * This is the sum of the previous splits' is_size's.
	 */
	uint64_t is_split_offset;

	vdev_t *is_vdev; /* top-level vdev */
	uint64_t is_target_offset; /* offset on is_vdev */
	uint64_t is_size;
	int is_children; /* number of entries in is_child[] */

	/*
	 * is_good_child is the child that we are currently using to
	 * attempt reconstruction.
	 */
	int is_good_child;

	indirect_child_t is_child[1]; /* variable-length */
} indirect_split_t;

/*
 * The indirect_vsd_t is associated with each i/o to the indirect vdev.
 * It is the "Vdev-Specific Data" in the zio_t's io_vsd.
 */
typedef struct indirect_vsd {
	boolean_t iv_split_block;
	boolean_t iv_reconstruct;

	list_t iv_splits; /* list of indirect_split_t's */
} indirect_vsd_t;

/*
 * List of all vdevs, chained through v_alllink.
 */
static vdev_list_t zfs_vdevs;

/*
 * List of ZFS features supported for read.
 */
static const char *features_for_read[] = {
	"org.illumos:lz4_compress",
	"com.delphix:hole_birth",
	"com.delphix:extensible_dataset",
	"com.delphix:embedded_data",
	"org.open-zfs:large_blocks",
	"org.illumos:sha512",
	"org.illumos:skein",
	"org.zfsonlinux:large_dnode",
	"com.joyent:multi_vdev_crash_dump",
	"com.delphix:spacemap_histogram",
	"com.delphix:zpool_checkpoint",
	"com.delphix:spacemap_v2",
	"com.datto:encryption",
	"org.zfsonlinux:allocation_classes",
	"com.datto:resilver_defer",
	"com.delphix:device_removal",
	"com.delphix:obsolete_counts",
	"com.intel:allocation_classes",
	"org.freebsd:zstd_compress",
	NULL
};

/*
 * List of all pools, chained through spa_link.
 */
static spa_list_t zfs_pools;

static const dnode_phys_t *dnode_cache_obj;
static uint64_t dnode_cache_bn;
static char *dnode_cache_buf;
static char *zap_scratch;
static char *zfs_temp_buf, *zfs_temp_end, *zfs_temp_ptr;

#define TEMP_SIZE	(1024 * 1024)

static int zio_read(const spa_t *spa, const blkptr_t *bp, void *buf);
static int zfs_get_root(const spa_t *spa, uint64_t *objid);
static int zfs_rlookup(const spa_t *spa, uint64_t objnum, char *result);
static int zap_lookup(const spa_t *spa, const dnode_phys_t *dnode,
    const char *name, uint64_t integer_size, uint64_t num_integers,
    void *value);
static int objset_get_dnode(const spa_t *, const objset_phys_t *, uint64_t,
    dnode_phys_t *);
static int dnode_read(const spa_t *, const dnode_phys_t *, off_t, void *,
    size_t);
static int vdev_indirect_read(vdev_t *, const blkptr_t *, void *, off_t,
    size_t);
static int vdev_mirror_read(vdev_t *, const blkptr_t *, void *, off_t, size_t);
vdev_indirect_mapping_t *vdev_indirect_mapping_open(spa_t *, objset_phys_t *,
    uint64_t);
vdev_indirect_mapping_entry_phys_t *
    vdev_indirect_mapping_duplicate_adjacent_entries(vdev_t *, uint64_t,
    uint64_t, uint64_t *);

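/*
 * Initialise the global vdev and pool lists and allocate the scratch
 * buffers used by the reader.
 */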
static void
zfs_init(void)
{
	STAILQ_INIT(&zfs_vdevs);
	STAILQ_INIT(&zfs_pools);

	zfs_temp_buf = malloc(TEMP_SIZE);
	zfs_temp_end = zfs_temp_buf + TEMP_SIZE;
	zfs_temp_ptr = zfs_temp_buf;
	dnode_cache_buf = malloc(SPA_MAXBLOCKSIZE);
	zap_scratch = malloc(SPA_MAXBLOCKSIZE);

	zfs_init_crc();
}

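/*
 * Trivial bump allocator for temporary buffers.  zfs_alloc()/zfs_free()
 * pairs must nest in strict LIFO order, e.g.:
 *
 *	pbuf = zfs_alloc(size);
 *	...
 *	zfs_free(pbuf, size);
 */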
static void *
zfs_alloc(size_t size)
{
	char *ptr;

	if (zfs_temp_ptr + size > zfs_temp_end) {
		panic("ZFS: out of temporary buffer space");
	}
	ptr = zfs_temp_ptr;
	zfs_temp_ptr += size;

	return (ptr);
}

static void
zfs_free(void *ptr, size_t size)
{

	zfs_temp_ptr -= size;
	if (zfs_temp_ptr != ptr) {
		panic("ZFS: zfs_alloc()/zfs_free() mismatch");
	}
}

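/*
 * Minimal XDR decoding helpers for the big-endian, XDR-encoded nvlists
 * found in vdev labels.  Each one advances the cursor past the value it
 * has read.
 */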
static int
xdr_int(const unsigned char **xdr, int *ip)
{
	*ip = be32dec(*xdr);
	(*xdr) += 4;
	return (0);
}

static int
xdr_u_int(const unsigned char **xdr, u_int *ip)
{
	*ip = be32dec(*xdr);
	(*xdr) += 4;
	return (0);
}

static int
xdr_uint64_t(const unsigned char **xdr, uint64_t *lp)
{
	u_int hi, lo;

	xdr_u_int(xdr, &hi);
	xdr_u_int(xdr, &lo);
	*lp = (((uint64_t) hi) << 32) | lo;
	return (0);
}

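/*
 * Walk the XDR-encoded name/value pairs of an nvlist looking for a pair
 * with the given name and type.  On success the value (and optionally
 * the element count) is returned through valuep/elementsp, e.g.:
 *
 *	uint64_t guid;
 *	if (nvlist_find(nvlist, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64,
 *	    NULL, &guid) == 0)
 *		...
 */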
static int
nvlist_find(const unsigned char *nvlist, const char *name, int type,
	    int *elementsp, void *valuep)
{
	const unsigned char *p, *pair;
	int junk;
	int encoded_size, decoded_size;

	p = nvlist;
	xdr_int(&p, &junk);
	xdr_int(&p, &junk);

	pair = p;
	xdr_int(&p, &encoded_size);
	xdr_int(&p, &decoded_size);
	while (encoded_size && decoded_size) {
		int namelen, pairtype, elements;
		const char *pairname;

		xdr_int(&p, &namelen);
		pairname = (const char *)p;
		p += roundup(namelen, 4);
		xdr_int(&p, &pairtype);

		if (!memcmp(name, pairname, namelen) && type == pairtype) {
			xdr_int(&p, &elements);
			if (elementsp)
				*elementsp = elements;
			if (type == DATA_TYPE_UINT64) {
				xdr_uint64_t(&p, (uint64_t *) valuep);
				return (0);
			} else if (type == DATA_TYPE_STRING) {
				int len;
				xdr_int(&p, &len);
				(*(const char **)valuep) = (const char *)p;
				return (0);
			} else if (type == DATA_TYPE_NVLIST ||
			    type == DATA_TYPE_NVLIST_ARRAY) {
				(*(const unsigned char **)valuep) =
				    (const unsigned char *)p;
				return (0);
			} else {
				return (EIO);
			}
		} else {
			/*
			 * Not the pair we are looking for, skip to the
			 * next one.
			 */
			p = pair + encoded_size;
		}

		pair = p;
		xdr_int(&p, &encoded_size);
		xdr_int(&p, &decoded_size);
	}

	return (EIO);
}

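/*
 * Scan a features-for-read nvlist and fail if it names a feature that is
 * not in the features_for_read[] table above.
 */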
static int
nvlist_check_features_for_read(const unsigned char *nvlist)
{
	const unsigned char *p, *pair;
	int junk;
	int encoded_size, decoded_size;
	int rc;

	rc = 0;

	p = nvlist;
	xdr_int(&p, &junk);
	xdr_int(&p, &junk);

	pair = p;
	xdr_int(&p, &encoded_size);
	xdr_int(&p, &decoded_size);
	while (encoded_size && decoded_size) {
		int namelen, pairtype;
		const char *pairname;
		int i, found;

		found = 0;

		xdr_int(&p, &namelen);
		pairname = (const char *)p;
		p += roundup(namelen, 4);
		xdr_int(&p, &pairtype);

		for (i = 0; features_for_read[i] != NULL; i++) {
			if (!memcmp(pairname, features_for_read[i], namelen)) {
				found = 1;
				break;
			}
		}

		if (!found) {
			printf("ZFS: unsupported feature: %s\n", pairname);
			rc = EIO;
		}

		p = pair + encoded_size;

		pair = p;
		xdr_int(&p, &encoded_size);
		xdr_int(&p, &decoded_size);
	}

	return (rc);
}

/*
 * Return the next nvlist in an nvlist array.
 */
static const unsigned char *
nvlist_next(const unsigned char *nvlist)
{
	const unsigned char *p, *pair;
	int junk;
	int encoded_size, decoded_size;

	p = nvlist;
	xdr_int(&p, &junk);
	xdr_int(&p, &junk);

	pair = p;
	xdr_int(&p, &encoded_size);
	xdr_int(&p, &decoded_size);
	while (encoded_size && decoded_size) {
		p = pair + encoded_size;

		pair = p;
		xdr_int(&p, &encoded_size);
		xdr_int(&p, &decoded_size);
	}

	return (p);
}

#ifdef TEST

static const unsigned char *
nvlist_print(const unsigned char *nvlist, unsigned int indent)
{
	static const char* typenames[] = {
		"DATA_TYPE_UNKNOWN",
		"DATA_TYPE_BOOLEAN",
		"DATA_TYPE_BYTE",
		"DATA_TYPE_INT16",
		"DATA_TYPE_UINT16",
		"DATA_TYPE_INT32",
		"DATA_TYPE_UINT32",
		"DATA_TYPE_INT64",
		"DATA_TYPE_UINT64",
		"DATA_TYPE_STRING",
		"DATA_TYPE_BYTE_ARRAY",
		"DATA_TYPE_INT16_ARRAY",
		"DATA_TYPE_UINT16_ARRAY",
		"DATA_TYPE_INT32_ARRAY",
		"DATA_TYPE_UINT32_ARRAY",
		"DATA_TYPE_INT64_ARRAY",
		"DATA_TYPE_UINT64_ARRAY",
		"DATA_TYPE_STRING_ARRAY",
		"DATA_TYPE_HRTIME",
		"DATA_TYPE_NVLIST",
		"DATA_TYPE_NVLIST_ARRAY",
		"DATA_TYPE_BOOLEAN_VALUE",
		"DATA_TYPE_INT8",
		"DATA_TYPE_UINT8",
		"DATA_TYPE_BOOLEAN_ARRAY",
		"DATA_TYPE_INT8_ARRAY",
		"DATA_TYPE_UINT8_ARRAY"
	};

	unsigned int i, j;
	const unsigned char *p, *pair;
	int junk;
	int encoded_size, decoded_size;

	p = nvlist;
	xdr_int(&p, &junk);
	xdr_int(&p, &junk);

	pair = p;
	xdr_int(&p, &encoded_size);
	xdr_int(&p, &decoded_size);
	while (encoded_size && decoded_size) {
		int namelen, pairtype, elements;
		const char *pairname;

		xdr_int(&p, &namelen);
		pairname = (const char *)p;
		p += roundup(namelen, 4);
		xdr_int(&p, &pairtype);

		for (i = 0; i < indent; i++)
			printf(" ");
		printf("%s %s", typenames[pairtype], pairname);

		xdr_int(&p, &elements);
		switch (pairtype) {
		case DATA_TYPE_UINT64: {
			uint64_t val;
			xdr_uint64_t(&p, &val);
			printf(" = 0x%jx\n", (uintmax_t)val);
			break;
		}

		case DATA_TYPE_STRING: {
			int len;
			xdr_int(&p, &len);
			printf(" = \"%s\"\n", p);
			break;
		}

		case DATA_TYPE_NVLIST:
			printf("\n");
			nvlist_print(p, indent + 1);
			break;

		case DATA_TYPE_NVLIST_ARRAY:
			for (j = 0; j < elements; j++) {
				printf("[%d]\n", j);
				p = nvlist_print(p, indent + 1);
				if (j != elements - 1) {
					for (i = 0; i < indent; i++)
						printf(" ");
					printf("%s %s", typenames[pairtype],
					    pairname);
				}
			}
			break;

		default:
			printf("\n");
		}

		p = pair + encoded_size;

		pair = p;
		xdr_int(&p, &encoded_size);
		xdr_int(&p, &decoded_size);
	}

	return (p);
}

#endif

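/*
 * Read physical data from a vdev using its registered read routine and,
 * when a block pointer is supplied, verify the checksum of the result.
 */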
static int
vdev_read_phys(vdev_t *vdev, const blkptr_t *bp, void *buf,
    off_t offset, size_t size)
{
	size_t psize;
	int rc;

	if (!vdev->v_phys_read)
		return (EIO);

	if (bp) {
		psize = BP_GET_PSIZE(bp);
	} else {
		psize = size;
	}

	/*printf("ZFS: reading %zu bytes at 0x%jx to %p\n", psize, (uintmax_t)offset, buf);*/
	rc = vdev->v_phys_read(vdev, vdev->v_read_priv, offset, buf, psize);
	if (rc == 0) {
		if (bp != NULL)
			rc = zio_checksum_verify(vdev->v_spa, bp, buf);
	}

	return (rc);
}

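/*
 * A remap_segment_t describes one extent of an indirect-vdev i/o that
 * still has to be remapped; segments are kept on a stack while we walk
 * the chain of indirect mappings.
 */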
typedef struct remap_segment {
	vdev_t *rs_vd;
	uint64_t rs_offset;
	uint64_t rs_asize;
	uint64_t rs_split_offset;
	list_node_t rs_node;
} remap_segment_t;

static remap_segment_t *
rs_alloc(vdev_t *vd, uint64_t offset, uint64_t asize, uint64_t split_offset)
{
	remap_segment_t *rs = malloc(sizeof (remap_segment_t));

	if (rs != NULL) {
		rs->rs_vd = vd;
		rs->rs_offset = offset;
		rs->rs_asize = asize;
		rs->rs_split_offset = split_offset;
	}

	return (rs);
}

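/*
 * Open the indirect mapping object and cache a copy of its phys data.
 * Returns NULL on allocation or read failure.
 */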
vdev_indirect_mapping_t *
vdev_indirect_mapping_open(spa_t *spa, objset_phys_t *os,
    uint64_t mapping_object)
{
	vdev_indirect_mapping_t *vim;
	vdev_indirect_mapping_phys_t *vim_phys;
	int rc;

	vim = calloc(1, sizeof (*vim));
	if (vim == NULL)
		return (NULL);

	vim->vim_dn = calloc(1, sizeof (*vim->vim_dn));
	if (vim->vim_dn == NULL) {
		free(vim);
		return (NULL);
	}

	rc = objset_get_dnode(spa, os, mapping_object, vim->vim_dn);
	if (rc != 0) {
		free(vim->vim_dn);
		free(vim);
		return (NULL);
	}

	vim->vim_spa = spa;
	vim->vim_phys = malloc(sizeof (*vim->vim_phys));
	if (vim->vim_phys == NULL) {
		free(vim->vim_dn);
		free(vim);
		return (NULL);
	}

	vim_phys = (vdev_indirect_mapping_phys_t *)DN_BONUS(vim->vim_dn);
	*vim->vim_phys = *vim_phys;

	vim->vim_objset = os;
	vim->vim_object = mapping_object;
	vim->vim_entries = NULL;

	vim->vim_havecounts =
	    (vim->vim_dn->dn_bonuslen > VDEV_INDIRECT_MAPPING_SIZE_V0);

	return (vim);
}

/*
 * Compare an offset with an indirect mapping entry; there are three
 * possible scenarios:
 *
 *     1. The offset is "less than" the mapping entry; meaning the
 *        offset is less than the source offset of the mapping entry. In
 *        this case, there is no overlap between the offset and the
 *        mapping entry and -1 will be returned.
 *
 *     2. The offset is "greater than" the mapping entry; meaning the
 *        offset is greater than the mapping entry's source offset plus
 *        the entry's size. In this case, there is no overlap between
 *        the offset and the mapping entry and 1 will be returned.
 *
 *        NOTE: If the offset is actually equal to the entry's offset
 *        plus size, this is considered to be "greater" than the entry,
 *        and this case applies (i.e. 1 will be returned). Thus, the
 *        entry's "range" can be considered to be inclusive at its
 *        start, but exclusive at its end: e.g. [src, src + size).
 *
 *     3. The last case to consider is if the offset actually falls
 *        within the mapping entry's range. If this is the case, the
 *        offset is considered to be "equal to" the mapping entry and
 *        0 will be returned.
 *
 *        NOTE: If the offset is equal to the entry's source offset,
 *        this case applies and 0 will be returned. If the offset is
 *        equal to the entry's source plus its size, this case does
 *        *not* apply (see "NOTE" above for scenario 2), and 1 will be
 *        returned.
 */
static int
dva_mapping_overlap_compare(const void *v_key, const void *v_array_elem)
{
	const uint64_t *key = v_key;
	const vdev_indirect_mapping_entry_phys_t *array_elem =
	    v_array_elem;
	uint64_t src_offset = DVA_MAPPING_GET_SRC_OFFSET(array_elem);

	if (*key < src_offset) {
		return (-1);
	} else if (*key < src_offset + DVA_GET_ASIZE(&array_elem->vimep_dst)) {
		return (0);
	} else {
		return (1);
	}
}

/*
 * Return array entry.
 */
static vdev_indirect_mapping_entry_phys_t *
vdev_indirect_mapping_entry(vdev_indirect_mapping_t *vim, uint64_t index)
{
	uint64_t size;
	off_t offset = 0;
	int rc;

	if (vim->vim_phys->vimp_num_entries == 0)
		return (NULL);

	if (vim->vim_entries == NULL) {
		uint64_t bsize;

		bsize = vim->vim_dn->dn_datablkszsec << SPA_MINBLOCKSHIFT;
		size = vim->vim_phys->vimp_num_entries *
		    sizeof (*vim->vim_entries);
		if (size > bsize) {
			size = bsize / sizeof (*vim->vim_entries);
			size *= sizeof (*vim->vim_entries);
		}
		vim->vim_entries = malloc(size);
		if (vim->vim_entries == NULL)
			return (NULL);
		vim->vim_num_entries = size / sizeof (*vim->vim_entries);
		offset = index * sizeof (*vim->vim_entries);
	}

	/* We have data in vim_entries */
	if (offset == 0) {
		if (index >= vim->vim_entry_offset &&
		    index <= vim->vim_entry_offset + vim->vim_num_entries) {
			index -= vim->vim_entry_offset;
			return (&vim->vim_entries[index]);
		}
		offset = index * sizeof (*vim->vim_entries);
	}

	vim->vim_entry_offset = index;
	size = vim->vim_num_entries * sizeof (*vim->vim_entries);
	rc = dnode_read(vim->vim_spa, vim->vim_dn, offset, vim->vim_entries,
	    size);
	if (rc != 0) {
		/* Read error, invalidate vim_entries. */
		free(vim->vim_entries);
		vim->vim_entries = NULL;
		return (NULL);
	}
	index -= vim->vim_entry_offset;
	return (&vim->vim_entries[index]);
}

/*
 * Returns the mapping entry for the given offset.
 *
 * It's possible that the given offset will not be in the mapping table
 * (i.e. no mapping entries contain this offset), in which case, the
 * return value depends on the "next_if_missing" parameter.
 *
 * If the offset is not found in the table and "next_if_missing" is
 * B_FALSE, then NULL will always be returned. The behavior is intended
 * to allow consumers to get the entry corresponding to the offset
 * parameter, iff the offset overlaps with an entry in the table.
 *
 * If the offset is not found in the table and "next_if_missing" is
 * B_TRUE, then the entry nearest to the given offset will be returned,
 * such that the entry's source offset is greater than the offset
 * passed in (i.e. the "next" mapping entry in the table is returned, if
 * the offset is missing from the table). If there are no entries whose
 * source offset is greater than the passed in offset, NULL is returned.
 */
static vdev_indirect_mapping_entry_phys_t *
vdev_indirect_mapping_entry_for_offset(vdev_indirect_mapping_t *vim,
    uint64_t offset)
{
	ASSERT(vim->vim_phys->vimp_num_entries > 0);

	vdev_indirect_mapping_entry_phys_t *entry;

	uint64_t last = vim->vim_phys->vimp_num_entries - 1;
	uint64_t base = 0;

	/*
	 * We don't define these inside of the while loop because we use
	 * their value in the case that offset isn't in the mapping.
	 */
	uint64_t mid;
	int result;

	while (last >= base) {
		mid = base + ((last - base) >> 1);

		entry = vdev_indirect_mapping_entry(vim, mid);
		if (entry == NULL)
			break;
		result = dva_mapping_overlap_compare(&offset, entry);

		if (result == 0) {
			break;
		} else if (result < 0) {
			last = mid - 1;
		} else {
			base = mid + 1;
		}
	}
	return (entry);
}

/*
 * Given an indirect vdev and an extent on that vdev, it duplicates the
 * physical entries of the indirect mapping that correspond to the extent
 * to a new array and returns a pointer to it. In addition, copied_entries
 * is populated with the number of mapping entries that were duplicated.
 *
 * Finally, since we are doing an allocation, it is up to the caller to
 * free the array allocated in this function.
 */
vdev_indirect_mapping_entry_phys_t *
vdev_indirect_mapping_duplicate_adjacent_entries(vdev_t *vd, uint64_t offset,
    uint64_t asize, uint64_t *copied_entries)
{
	vdev_indirect_mapping_entry_phys_t *duplicate_mappings = NULL;
	vdev_indirect_mapping_t *vim = vd->v_mapping;
	uint64_t entries = 0;

	vdev_indirect_mapping_entry_phys_t *first_mapping =
	    vdev_indirect_mapping_entry_for_offset(vim, offset);
	ASSERT3P(first_mapping, !=, NULL);

	vdev_indirect_mapping_entry_phys_t *m = first_mapping;
	while (asize > 0) {
		uint64_t size = DVA_GET_ASIZE(&m->vimep_dst);
		uint64_t inner_offset = offset - DVA_MAPPING_GET_SRC_OFFSET(m);
		uint64_t inner_size = MIN(asize, size - inner_offset);

		offset += inner_size;
		asize -= inner_size;
		entries++;
		m++;
	}

	size_t copy_length = entries * sizeof (*first_mapping);
	duplicate_mappings = malloc(copy_length);
	if (duplicate_mappings != NULL)
		bcopy(first_mapping, duplicate_mappings, copy_length);
	else
		entries = 0;

	*copied_entries = entries;

	return (duplicate_mappings);
}

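/*
 * Look up a top-level vdev by its id in the pool's child list.
 */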
static vdev_t *
vdev_lookup_top(spa_t *spa, uint64_t vdev)
{
	vdev_t *rvd;
	vdev_list_t *vlist;

	vlist = &spa->spa_root_vdev->v_children;
	STAILQ_FOREACH(rvd, vlist, v_childlink)
		if (rvd->v_id == vdev)
			break;

	return (rvd);
}

/*
 * This is a callback for vdev_indirect_remap() which allocates an
 * indirect_split_t for each split segment and adds it to iv_splits.
 */
static void
vdev_indirect_gather_splits(uint64_t split_offset, vdev_t *vd, uint64_t offset,
    uint64_t size, void *arg)
{
	int n = 1;
	zio_t *zio = arg;
	indirect_vsd_t *iv = zio->io_vsd;

	if (vd->v_read == vdev_indirect_read)
		return;

	if (vd->v_read == vdev_mirror_read)
		n = vd->v_nchildren;

	indirect_split_t *is =
	    malloc(offsetof(indirect_split_t, is_child[n]));
	if (is == NULL) {
		zio->io_error = ENOMEM;
		return;
	}
	bzero(is, offsetof(indirect_split_t, is_child[n]));

	is->is_children = n;
	is->is_size = size;
	is->is_split_offset = split_offset;
	is->is_target_offset = offset;
	is->is_vdev = vd;

	/*
	 * Note that we only consider multiple copies of the data for
	 * *mirror* vdevs.  We don't for "replacing" or "spare" vdevs, even
	 * though they use the same ops as mirror, because there's only one
	 * "good" copy under the replacing/spare.
	 */
	if (vd->v_read == vdev_mirror_read) {
		int i = 0;
		vdev_t *kid;

		STAILQ_FOREACH(kid, &vd->v_children, v_childlink) {
			is->is_child[i++].ic_vdev = kid;
		}
	} else {
		is->is_child[0].ic_vdev = vd;
	}

	list_insert_tail(&iv->iv_splits, is);
}

static void
vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint64_t asize, void *arg)
{
	list_t stack;
	spa_t *spa = vd->v_spa;
	zio_t *zio = arg;
	remap_segment_t *rs;

	list_create(&stack, sizeof (remap_segment_t),
	    offsetof(remap_segment_t, rs_node));

	rs = rs_alloc(vd, offset, asize, 0);
	if (rs == NULL) {
		printf("vdev_indirect_remap: out of memory.\n");
		zio->io_error = ENOMEM;
	}
	for ( ; rs != NULL; rs = list_remove_head(&stack)) {
		vdev_t *v = rs->rs_vd;
		uint64_t num_entries = 0;
		/* vdev_indirect_mapping_t *vim = v->v_mapping; */
		vdev_indirect_mapping_entry_phys_t *mapping =
		    vdev_indirect_mapping_duplicate_adjacent_entries(v,
		    rs->rs_offset, rs->rs_asize, &num_entries);

		if (num_entries == 0)
			zio->io_error = ENOMEM;

		for (uint64_t i = 0; i < num_entries; i++) {
			vdev_indirect_mapping_entry_phys_t *m = &mapping[i];
			uint64_t size = DVA_GET_ASIZE(&m->vimep_dst);
			uint64_t dst_offset = DVA_GET_OFFSET(&m->vimep_dst);
			uint64_t dst_vdev = DVA_GET_VDEV(&m->vimep_dst);
			uint64_t inner_offset = rs->rs_offset -
			    DVA_MAPPING_GET_SRC_OFFSET(m);
			uint64_t inner_size =
			    MIN(rs->rs_asize, size - inner_offset);
			vdev_t *dst_v = vdev_lookup_top(spa, dst_vdev);

			if (dst_v->v_read == vdev_indirect_read) {
				remap_segment_t *o;

				o = rs_alloc(dst_v, dst_offset + inner_offset,
				    inner_size, rs->rs_split_offset);
				if (o == NULL) {
					printf("vdev_indirect_remap: "
					    "out of memory.\n");
					zio->io_error = ENOMEM;
					break;
				}

				list_insert_head(&stack, o);
			}
			vdev_indirect_gather_splits(rs->rs_split_offset, dst_v,
			    dst_offset + inner_offset,
			    inner_size, arg);

			/*
			 * vdev_indirect_gather_splits can fail with a
			 * memory allocation error; we cannot recover
			 * from that.
			 */
			if (zio->io_error != 0)
				break;
			rs->rs_offset += inner_size;
			rs->rs_asize -= inner_size;
			rs->rs_split_offset += inner_size;
		}

		free(mapping);
		free(rs);
		if (zio->io_error != 0)
			break;
	}

	list_destroy(&stack);
}

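/*
 * Release the per-i/o indirect state: every split segment, each child's
 * data copy, and finally the indirect_vsd_t itself.
 */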
static void
vdev_indirect_map_free(zio_t *zio)
{
	indirect_vsd_t *iv = zio->io_vsd;
	indirect_split_t *is;

	while ((is = list_head(&iv->iv_splits)) != NULL) {
		for (int c = 0; c < is->is_children; c++) {
			indirect_child_t *ic = &is->is_child[c];
			free(ic->ic_data);
		}
		list_remove(&iv->iv_splits, is);
		free(is);
	}
	free(iv);
}

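/*
 * Read from an indirect vdev by remapping the request onto the
 * underlying top-level vdevs and reassembling the (possibly split)
 * result.
 */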
static int
vdev_indirect_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
    off_t offset, size_t bytes)
{
	zio_t zio;
	spa_t *spa = vdev->v_spa;
	indirect_vsd_t *iv;
	indirect_split_t *first;
	int rc = EIO;

	iv = calloc(1, sizeof(*iv));
	if (iv == NULL)
		return (ENOMEM);

	list_create(&iv->iv_splits,
	    sizeof (indirect_split_t), offsetof(indirect_split_t, is_node));

	bzero(&zio, sizeof(zio));
	zio.io_spa = spa;
	zio.io_bp = (blkptr_t *)bp;
	zio.io_data = buf;
	zio.io_size = bytes;
	zio.io_offset = offset;
	zio.io_vd = vdev;
	zio.io_vsd = iv;

	if (vdev->v_mapping == NULL) {
		vdev_indirect_config_t *vic;

		vic = &vdev->vdev_indirect_config;
		vdev->v_mapping = vdev_indirect_mapping_open(spa,
		    &spa->spa_mos, vic->vic_mapping_object);
	}

	vdev_indirect_remap(vdev, offset, bytes, &zio);
	if (zio.io_error != 0) {
		/* Free the split list before bailing out. */
		vdev_indirect_map_free(&zio);
		return (zio.io_error);
	}

	first = list_head(&iv->iv_splits);
	if (first->is_size == zio.io_size) {
		/*
		 * This is not a split block; we are pointing to the entire
		 * data, which will checksum the same as the original data.
		 * Pass the BP down so that the child i/o can verify the
		 * checksum, and try a different location if available
		 * (e.g. on a mirror).
		 *
		 * While this special case could be handled the same as the
		 * general (split block) case, doing it this way ensures
		 * that the vast majority of blocks on indirect vdevs
		 * (which are not split) are handled identically to blocks
		 * on non-indirect vdevs.  This allows us to be less strict
		 * about performance in the general (but rare) case.
		 */
		rc = first->is_vdev->v_read(first->is_vdev, zio.io_bp,
		    zio.io_data, first->is_target_offset, bytes);
	} else {
		iv->iv_split_block = B_TRUE;
		/*
		 * Read one copy of each split segment, from the
		 * top-level vdev.  Since we don't know the
		 * checksum of each split individually, the child
		 * zio can't ensure that we get the right data.
		 * E.g. if it's a mirror, it will just read from a
		 * random (healthy) leaf vdev.  We have to verify
		 * the checksum in vdev_indirect_io_done().
		 */
		for (indirect_split_t *is = list_head(&iv->iv_splits);
		    is != NULL; is = list_next(&iv->iv_splits, is)) {
			char *ptr = zio.io_data;

			rc = is->is_vdev->v_read(is->is_vdev, zio.io_bp,
			    ptr + is->is_split_offset, is->is_target_offset,
			    is->is_size);
		}
		if (zio_checksum_verify(spa, zio.io_bp, zio.io_data))
			rc = ECKSUM;
		else
			rc = 0;
	}

	vdev_indirect_map_free(&zio);
	if (rc == 0)
		rc = zio.io_error;

	return (rc);
}

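/*
 * Read from a leaf (disk) vdev; the offset is shifted past the front
 * labels.
 */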
static int
vdev_disk_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
    off_t offset, size_t bytes)
{

	return (vdev_read_phys(vdev, bp, buf,
		offset + VDEV_LABEL_START_SIZE, bytes));
}

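/*
 * Read from a mirror by trying each healthy child in turn until one
 * read succeeds.
 */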
static int
vdev_mirror_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
    off_t offset, size_t bytes)
{
	vdev_t *kid;
	int rc;

	rc = EIO;
	STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
		if (kid->v_state != VDEV_STATE_HEALTHY)
			continue;
		rc = kid->v_read(kid, bp, buf, offset, bytes);
		if (!rc)
			return (0);
	}

	return (rc);
}

static int
vdev_replacing_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
    off_t offset, size_t bytes)
{
	vdev_t *kid;

	/*
	 * Here we should have two kids:
	 * The first one is the vdev we are replacing; it is the only one
	 * we can trust to have valid data, but it might not be present.
	 * The second one is the vdev we are replacing with.  It is most
	 * likely healthy, but we can't trust it has the data we need, so
	 * we won't use it.
	 */
	kid = STAILQ_FIRST(&vdev->v_children);
	if (kid == NULL)
		return (EIO);
	if (kid->v_state != VDEV_STATE_HEALTHY)
		return (EIO);
	return (kid->v_read(kid, bp, buf, offset, bytes));
}

static vdev_t *
vdev_find(uint64_t guid)
{
	vdev_t *vdev;

	STAILQ_FOREACH(vdev, &zfs_vdevs, v_alllink)
		if (vdev->v_guid == guid)
			return (vdev);

	return (NULL);
}

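/*
 * Allocate a new vdev with the given guid and read routine and, unless
 * it is the root vdev, put it on the global vdev list.
 */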
static vdev_t *
vdev_create(uint64_t guid, vdev_read_t *_read)
{
	vdev_t *vdev;
	vdev_indirect_config_t *vic;

	vdev = calloc(1, sizeof(vdev_t));
	if (vdev != NULL) {
		STAILQ_INIT(&vdev->v_children);
		vdev->v_guid = guid;
		vdev->v_read = _read;

		/*
		 * The root vdev has no read function; it is only
		 * referenced from the spa.
		 */
		if (_read != NULL) {
			vic = &vdev->vdev_indirect_config;
			vic->vic_prev_indirect_vdev = UINT64_MAX;
			STAILQ_INSERT_TAIL(&zfs_vdevs, vdev, v_alllink);
		}
	}

	return (vdev);
}

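/*
 * Set the initial vdev state from the offline/removed/faulted/degraded/
 * not-present flags in the config nvlist.
 */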
static void
vdev_set_initial_state(vdev_t *vdev, const unsigned char *nvlist)
{
	uint64_t is_offline, is_faulted, is_degraded, is_removed, isnt_present;
	uint64_t is_log;

	is_offline = is_removed = is_faulted = is_degraded = isnt_present = 0;
	is_log = 0;
	(void) nvlist_find(nvlist, ZPOOL_CONFIG_OFFLINE, DATA_TYPE_UINT64, NULL,
	    &is_offline);
	(void) nvlist_find(nvlist, ZPOOL_CONFIG_REMOVED, DATA_TYPE_UINT64, NULL,
	    &is_removed);
	(void) nvlist_find(nvlist, ZPOOL_CONFIG_FAULTED, DATA_TYPE_UINT64, NULL,
	    &is_faulted);
	(void) nvlist_find(nvlist, ZPOOL_CONFIG_DEGRADED, DATA_TYPE_UINT64,
	    NULL, &is_degraded);
	(void) nvlist_find(nvlist, ZPOOL_CONFIG_NOT_PRESENT, DATA_TYPE_UINT64,
	    NULL, &isnt_present);
	(void) nvlist_find(nvlist, ZPOOL_CONFIG_IS_LOG, DATA_TYPE_UINT64, NULL,
	    &is_log);

	if (is_offline != 0)
		vdev->v_state = VDEV_STATE_OFFLINE;
	else if (is_removed != 0)
		vdev->v_state = VDEV_STATE_REMOVED;
	else if (is_faulted != 0)
		vdev->v_state = VDEV_STATE_FAULTED;
	else if (is_degraded != 0)
		vdev->v_state = VDEV_STATE_DEGRADED;
	else if (isnt_present != 0)
		vdev->v_state = VDEV_STATE_CANT_OPEN;

	vdev->v_islog = is_log == 1;
}

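/*
 * Create an in-core vdev from its config nvlist, choosing the read
 * routine from the vdev type and filling in ashift, asize, nparity and
 * a printable name.
 */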
static int
vdev_init(uint64_t guid, const unsigned char *nvlist, vdev_t **vdevp)
{
	uint64_t id, ashift, asize, nparity;
	const char *path;
	const char *type;
	vdev_t *vdev;

	if (nvlist_find(nvlist, ZPOOL_CONFIG_ID, DATA_TYPE_UINT64, NULL, &id) ||
	    nvlist_find(nvlist, ZPOOL_CONFIG_TYPE, DATA_TYPE_STRING,
	    NULL, &type)) {
		return (ENOENT);
	}

	if (strcmp(type, VDEV_TYPE_MIRROR)
	    && strcmp(type, VDEV_TYPE_DISK)
#ifdef ZFS_TEST
	    && strcmp(type, VDEV_TYPE_FILE)
#endif
	    && strcmp(type, VDEV_TYPE_RAIDZ)
	    && strcmp(type, VDEV_TYPE_INDIRECT)
	    && strcmp(type, VDEV_TYPE_REPLACING)) {
		printf("ZFS: can only boot from disk, mirror, raidz1, "
		    "raidz2 and raidz3 vdevs\n");
		return (EIO);
	}

	if (strcmp(type, VDEV_TYPE_MIRROR) == 0)
		vdev = vdev_create(guid, vdev_mirror_read);
	else if (strcmp(type, VDEV_TYPE_RAIDZ) == 0)
		vdev = vdev_create(guid, vdev_raidz_read);
	else if (strcmp(type, VDEV_TYPE_REPLACING) == 0)
		vdev = vdev_create(guid, vdev_replacing_read);
	else if (strcmp(type, VDEV_TYPE_INDIRECT) == 0) {
		vdev_indirect_config_t *vic;

		vdev = vdev_create(guid, vdev_indirect_read);
		if (vdev != NULL) {
			vdev->v_state = VDEV_STATE_HEALTHY;
			vic = &vdev->vdev_indirect_config;

			nvlist_find(nvlist,
			    ZPOOL_CONFIG_INDIRECT_OBJECT,
			    DATA_TYPE_UINT64,
			    NULL, &vic->vic_mapping_object);
			nvlist_find(nvlist,
			    ZPOOL_CONFIG_INDIRECT_BIRTHS,
			    DATA_TYPE_UINT64,
			    NULL, &vic->vic_births_object);
			nvlist_find(nvlist,
			    ZPOOL_CONFIG_PREV_INDIRECT_VDEV,
			    DATA_TYPE_UINT64,
			    NULL, &vic->vic_prev_indirect_vdev);
		}
	} else {
		vdev = vdev_create(guid, vdev_disk_read);
	}

	if (vdev == NULL)
		return (ENOMEM);

	vdev_set_initial_state(vdev, nvlist);
	vdev->v_id = id;
	if (nvlist_find(nvlist, ZPOOL_CONFIG_ASHIFT,
	    DATA_TYPE_UINT64, NULL, &ashift) == 0)
		vdev->v_ashift = ashift;

	if (nvlist_find(nvlist, ZPOOL_CONFIG_ASIZE,
	    DATA_TYPE_UINT64, NULL, &asize) == 0) {
		vdev->v_psize = asize +
		    VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
	}

	if (nvlist_find(nvlist, ZPOOL_CONFIG_NPARITY,
	    DATA_TYPE_UINT64, NULL, &nparity) == 0)
		vdev->v_nparity = nparity;

	if (nvlist_find(nvlist, ZPOOL_CONFIG_PATH,
	    DATA_TYPE_STRING, NULL, &path) == 0) {
		if (strncmp(path, "/dev/", 5) == 0)
			path += 5;
		vdev->v_name = strdup(path);
	} else {
		char *name;

		name = NULL;
		if (strcmp(type, "raidz") == 0) {
			if (vdev->v_nparity < 1 ||
			    vdev->v_nparity > 3) {
				printf("ZFS: can only boot from disk, "
				    "mirror, raidz1, raidz2 and raidz3 "
				    "vdevs\n");
				return (EIO);
			}
			(void) asprintf(&name, "%s%d-%" PRIu64, type,
			    vdev->v_nparity, id);
		} else {
			(void) asprintf(&name, "%s-%" PRIu64, type, id);
		}
		vdev->v_name = name;
	}
	*vdevp = vdev;
	return (0);
}

/*
 * Find slot for vdev. We return either NULL to signal to use
 * STAILQ_INSERT_HEAD, or we return link element to be used with
 * STAILQ_INSERT_AFTER.
 */
static vdev_t *
vdev_find_previous(vdev_t *top_vdev, vdev_t *vdev)
{
	vdev_t *v, *previous;

	if (STAILQ_EMPTY(&top_vdev->v_children))
		return (NULL);

	previous = NULL;
	STAILQ_FOREACH(v, &top_vdev->v_children, v_childlink) {
		if (v->v_id > vdev->v_id)
			return (previous);

		if (v->v_id == vdev->v_id)
			return (v);

		if (v->v_id < vdev->v_id)
			previous = v;
	}
	return (previous);
}

static size_t
vdev_child_count(vdev_t *vdev)
{
	vdev_t *v;
	size_t count;

	count = 0;
	STAILQ_FOREACH(v, &vdev->v_children, v_childlink) {
		count++;
	}
	return (count);
}

/*
 * Insert vdev into top_vdev children list. List is ordered by v_id.
 */
static void
vdev_insert(vdev_t *top_vdev, vdev_t *vdev)
{
	vdev_t *previous;
	size_t count;

	/*
	 * Top-level vdevs can appear in random order, depending on how
	 * the firmware presents the disk devices.  However, we insert
	 * each vdev so that the list stays ordered by v_id, which lets
	 * us use either STAILQ_INSERT_HEAD or STAILQ_INSERT_AFTER, as
	 * STAILQ has no insert-before operation.
	 */
	previous = vdev_find_previous(top_vdev, vdev);

	if (previous == NULL) {
		STAILQ_INSERT_HEAD(&top_vdev->v_children, vdev, v_childlink);
		count = vdev_child_count(top_vdev);
		if (top_vdev->v_nchildren < count)
			top_vdev->v_nchildren = count;
		return;
	}

	if (previous->v_id == vdev->v_id)
		return;

	STAILQ_INSERT_AFTER(&top_vdev->v_children, previous, vdev,
	    v_childlink);
	count = vdev_child_count(top_vdev);
	if (top_vdev->v_nchildren < count)
		top_vdev->v_nchildren = count;
}

static int
vdev_from_nvlist(spa_t *spa, uint64_t top_guid, const unsigned char *nvlist)
{
	vdev_t *top_vdev, *vdev;
	const unsigned char *kids;
	int rc, nkids;

	/* Get top vdev. */
	top_vdev = vdev_find(top_guid);
	if (top_vdev == NULL) {
		rc = vdev_init(top_guid, nvlist, &top_vdev);
		if (rc != 0)
			return (rc);
		top_vdev->v_spa = spa;
		top_vdev->v_top = top_vdev;
		vdev_insert(spa->spa_root_vdev, top_vdev);
	}

	/* Add children if there are any. */
	rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY,
	    &nkids, &kids);
	if (rc == 0) {
		for (int i = 0; i < nkids; i++) {
			uint64_t guid;

			rc = nvlist_find(kids, ZPOOL_CONFIG_GUID,
			    DATA_TYPE_UINT64, NULL, &guid);
			if (rc != 0)
				return (rc);
			rc = vdev_init(guid, kids, &vdev);
			if (rc != 0)
				return (rc);

			vdev->v_spa = spa;
			vdev->v_top = top_vdev;
			vdev_insert(top_vdev, vdev);

			kids = nvlist_next(kids);
		}
	} else {
		rc = 0;
	}

	return (rc);
}

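/*
 * Build the in-core vdev tree from the config nvlist found in a vdev
 * label.
 */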
static int
vdev_init_from_label(spa_t *spa, const unsigned char *nvlist)
{
	uint64_t pool_guid, top_guid;
	const unsigned char *vdevs;

	if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64,
	    NULL, &pool_guid) ||
	    nvlist_find(nvlist, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64,
	    NULL, &top_guid) ||
	    nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST,
	    NULL, &vdevs)) {
		printf("ZFS: can't find vdev details\n");
		return (ENOENT);
	}

	return (vdev_from_nvlist(spa, top_guid, vdevs));
}

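/*
 * Recompute the state of a vdev and all of its children from the
 * states of the children.
 */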
static void
vdev_set_state(vdev_t *vdev)
{
	vdev_t *kid;
	int good_kids;
	int bad_kids;

	STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
		vdev_set_state(kid);
	}

	/*
	 * A mirror or raidz is healthy if all its kids are healthy.  A
	 * mirror with at least one healthy kid is degraded; a raidz is
	 * degraded if at most nparity kids are offline.
	 */
	if (STAILQ_FIRST(&vdev->v_children)) {
		good_kids = 0;
		bad_kids = 0;
		STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
			if (kid->v_state == VDEV_STATE_HEALTHY)
				good_kids++;
			else
				bad_kids++;
		}
		if (bad_kids == 0) {
			vdev->v_state = VDEV_STATE_HEALTHY;
		} else {
			if (vdev->v_read == vdev_mirror_read) {
				if (good_kids) {
					vdev->v_state = VDEV_STATE_DEGRADED;
				} else {
					vdev->v_state = VDEV_STATE_OFFLINE;
				}
			} else if (vdev->v_read == vdev_raidz_read) {
				if (bad_kids > vdev->v_nparity) {
					vdev->v_state = VDEV_STATE_OFFLINE;
				} else {
					vdev->v_state = VDEV_STATE_DEGRADED;
				}
			}
		}
	}
}

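/*
 * Refresh the states of an existing top-level vdev and its children
 * from a (newer) config nvlist.
 */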
static int
vdev_update_from_nvlist(uint64_t top_guid, const unsigned char *nvlist)
{
	vdev_t *vdev;
	const unsigned char *kids;
	int rc, nkids;

	/* Update top vdev. */
	vdev = vdev_find(top_guid);
	if (vdev != NULL)
		vdev_set_initial_state(vdev, nvlist);

	/* Update children if there are any. */
	rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY,
	    &nkids, &kids);
	if (rc == 0) {
		for (int i = 0; i < nkids; i++) {
			uint64_t guid;

			rc = nvlist_find(kids, ZPOOL_CONFIG_GUID,
			    DATA_TYPE_UINT64, NULL, &guid);
			if (rc != 0)
				break;

			vdev = vdev_find(guid);
			if (vdev != NULL)
				vdev_set_initial_state(vdev, kids);

			kids = nvlist_next(kids);
		}
	} else {
		rc = 0;
	}

	return (rc);
}

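/*
 * Build or update the in-core vdev tree from the config nvlist stored
 * in the MOS, which is authoritative for the pool layout.
 */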
static int
vdev_init_from_nvlist(spa_t *spa, const unsigned char *nvlist)
{
	uint64_t pool_guid, vdev_children;
	const unsigned char *vdevs, *kids;
	int rc, nkids;

	if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64,
	    NULL, &pool_guid) ||
	    nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_CHILDREN, DATA_TYPE_UINT64,
	    NULL, &vdev_children) ||
	    nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST,
	    NULL, &vdevs)) {
		printf("ZFS: can't find vdev details\n");
		return (ENOENT);
	}

	/* Wrong guid?! */
	if (spa->spa_guid != pool_guid)
		return (EIO);

	spa->spa_root_vdev->v_nchildren = vdev_children;

	rc = nvlist_find(vdevs, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY,
	    &nkids, &kids);

	/*
	 * The MOS config has at least one child for the root vdev.
	 */
	if (rc != 0)
		return (EIO);

	for (int i = 0; i < nkids; i++) {
		uint64_t guid;
		vdev_t *vdev;

		rc = nvlist_find(kids, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64,
		    NULL, &guid);
		if (rc != 0)
			break;
		vdev = vdev_find(guid);
		/*
		 * The top-level vdev is missing, create it.
		 */
		if (vdev == NULL)
			rc = vdev_from_nvlist(spa, guid, kids);
		else
			rc = vdev_update_from_nvlist(guid, kids);
		if (rc != 0)
			break;
		kids = nvlist_next(kids);
	}

	/*
	 * Re-evaluate top-level vdev state.
	 */
	vdev_set_state(spa->spa_root_vdev);

	return (rc);
}

static spa_t *
spa_find_by_guid(uint64_t guid)
{
	spa_t *spa;

	STAILQ_FOREACH(spa, &zfs_pools, spa_link)
		if (spa->spa_guid == guid)
			return (spa);

	return (NULL);
}

static spa_t *
spa_find_by_name(const char *name)
{
	spa_t *spa;

	STAILQ_FOREACH(spa, &zfs_pools, spa_link)
		if (!strcmp(spa->spa_name, name))
			return (spa);

	return (NULL);
}

1558
1559#ifdef BOOT2
1560static spa_t *
1561spa_get_primary(void)
1562{
1563
1564	return (STAILQ_FIRST(&zfs_pools));
1565}
1566
1567static vdev_t *
1568spa_get_primary_vdev(const spa_t *spa)
1569{
1570	vdev_t *vdev;
1571	vdev_t *kid;
1572
1573	if (spa == NULL)
1574		spa = spa_get_primary();
1575	if (spa == NULL)
1576		return (NULL);
1577	vdev = spa->spa_root_vdev;
1578	if (vdev == NULL)
1579		return (NULL);
1580	for (kid = STAILQ_FIRST(&vdev->v_children); kid != NULL;
1581	     kid = STAILQ_FIRST(&vdev->v_children))
1582		vdev = kid;
1583	return (vdev);
1584}
1585#endif
1586
1587static spa_t *
1588spa_create(uint64_t guid, const char *name)
1589{
1590	spa_t *spa;
1591
1592	if ((spa = calloc(1, sizeof(spa_t))) == NULL)
1593		return (NULL);
1594	if ((spa->spa_name = strdup(name)) == NULL) {
1595		free(spa);
1596		return (NULL);
1597	}
1598	spa->spa_guid = guid;
1599	spa->spa_root_vdev = vdev_create(guid, NULL);
1600	if (spa->spa_root_vdev == NULL) {
1601		free(spa->spa_name);
1602		free(spa);
1603		return (NULL);
1604	}
1605	spa->spa_root_vdev->v_name = strdup("root");
1606	STAILQ_INSERT_TAIL(&zfs_pools, spa, spa_link);
1607
1608	return (spa);
1609}
1610
1611static const char *
1612state_name(vdev_state_t state)
1613{
1614	static const char* names[] = {
1615		"UNKNOWN",
1616		"CLOSED",
1617		"OFFLINE",
1618		"REMOVED",
1619		"CANT_OPEN",
1620		"FAULTED",
1621		"DEGRADED",
1622		"ONLINE"
1623	};
1624	return names[state];
1625}
1626
1627#ifdef BOOT2
1628
1629#define pager_printf printf
1630
1631#else
1632
1633static int
1634pager_printf(const char *fmt, ...)
1635{
1636	char line[80];
1637	va_list args;
1638
1639	va_start(args, fmt);
1640	vsnprintf(line, sizeof(line), fmt, args);
1641	va_end(args);
1642	return (pager_output(line));
1643}
1644
1645#endif
1646
1647#define STATUS_FORMAT	"        %s %s\n"
1648
1649static int
1650print_state(int indent, const char *name, vdev_state_t state)
1651{
1652	int i;
1653	char buf[512];
1654
1655	buf[0] = 0;
1656	for (i = 0; i < indent; i++)
1657		strcat(buf, "  ");
1658	strcat(buf, name);
1659	return (pager_printf(STATUS_FORMAT, buf, state_name(state)));
1660}
1661
1662static int
1663vdev_status(vdev_t *vdev, int indent)
1664{
1665	vdev_t *kid;
1666	int ret;
1667
1668	if (vdev->v_islog) {
1669		(void)pager_output("        logs\n");
1670		indent++;
1671	}
1672
1673	ret = print_state(indent, vdev->v_name, vdev->v_state);
1674	if (ret != 0)
1675		return (ret);
1676
1677	STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
1678		ret = vdev_status(kid, indent + 1);
1679		if (ret != 0)
1680			return (ret);
1681	}
1682	return (ret);
1683}
1684
1685static int
1686spa_status(spa_t *spa)
1687{
1688	static char bootfs[ZFS_MAXNAMELEN];
1689	uint64_t rootid;
1690	vdev_list_t *vlist;
1691	vdev_t *vdev;
1692	int good_kids, bad_kids, degraded_kids, ret;
1693	vdev_state_t state;
1694
1695	ret = pager_printf("  pool: %s\n", spa->spa_name);
1696	if (ret != 0)
1697		return (ret);
1698
1699	if (zfs_get_root(spa, &rootid) == 0 &&
1700	    zfs_rlookup(spa, rootid, bootfs) == 0) {
1701		if (bootfs[0] == '\0')
1702			ret = pager_printf("bootfs: %s\n", spa->spa_name);
1703		else
1704			ret = pager_printf("bootfs: %s/%s\n", spa->spa_name,
1705			    bootfs);
1706		if (ret != 0)
1707			return (ret);
1708	}
1709	ret = pager_printf("config:\n\n");
1710	if (ret != 0)
1711		return (ret);
1712	ret = pager_printf(STATUS_FORMAT, "NAME", "STATE");
1713	if (ret != 0)
1714		return (ret);
1715
1716	good_kids = 0;
1717	degraded_kids = 0;
1718	bad_kids = 0;
1719	vlist = &spa->spa_root_vdev->v_children;
1720	STAILQ_FOREACH(vdev, vlist, v_childlink) {
1721		if (vdev->v_state == VDEV_STATE_HEALTHY)
1722			good_kids++;
1723		else if (vdev->v_state == VDEV_STATE_DEGRADED)
1724			degraded_kids++;
1725		else
1726			bad_kids++;
1727	}
1728
1729	state = VDEV_STATE_CLOSED;
1730	if (good_kids > 0 && (degraded_kids + bad_kids) == 0)
1731		state = VDEV_STATE_HEALTHY;
1732	else if ((good_kids + degraded_kids) > 0)
1733		state = VDEV_STATE_DEGRADED;
1734
1735	ret = print_state(0, spa->spa_name, state);
1736	if (ret != 0)
1737		return (ret);
1738
1739	STAILQ_FOREACH(vdev, vlist, v_childlink) {
1740		ret = vdev_status(vdev, 1);
1741		if (ret != 0)
1742			return (ret);
1743	}
1744	return (ret);
1745}
1746
1747static int
1748spa_all_status(void)
1749{
1750	spa_t *spa;
1751	int first = 1, ret = 0;
1752
1753	STAILQ_FOREACH(spa, &zfs_pools, spa_link) {
1754		if (!first) {
1755			ret = pager_printf("\n");
1756			if (ret != 0)
1757				return (ret);
1758		}
1759		first = 0;
1760		ret = spa_status(spa);
1761		if (ret != 0)
1762			return (ret);
1763	}
1764	return (ret);
1765}
1766
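/*
 * Compute the on-disk offset of label l: the first two labels live at
 * the front of the device, the last two at the end.
 */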
static uint64_t
vdev_label_offset(uint64_t psize, int l, uint64_t offset)
{
	uint64_t label_offset;

	if (l < VDEV_LABELS / 2)
		label_offset = 0;
	else
		label_offset = psize - VDEV_LABELS * sizeof (vdev_label_t);

	return (offset + l * sizeof (vdev_label_t) + label_offset);
}

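/*
 * Order two uberblocks by txg, then timestamp, then MMP sequence
 * number; the "greater" one is the more recent.
 */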
static int
vdev_uberblock_compare(const uberblock_t *ub1, const uberblock_t *ub2)
{
	unsigned int seq1 = 0;
	unsigned int seq2 = 0;
	int cmp = AVL_CMP(ub1->ub_txg, ub2->ub_txg);

	if (cmp != 0)
		return (cmp);

	cmp = AVL_CMP(ub1->ub_timestamp, ub2->ub_timestamp);
	if (cmp != 0)
		return (cmp);

	if (MMP_VALID(ub1) && MMP_SEQ_VALID(ub1))
		seq1 = MMP_SEQ(ub1);

	if (MMP_VALID(ub2) && MMP_SEQ_VALID(ub2))
		seq2 = MMP_SEQ(ub2);

	return (AVL_CMP(seq1, seq2));
}

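/*
 * Byte-swap a foreign-endian uberblock and check its magic and version.
 */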
static int
uberblock_verify(uberblock_t *ub)
{
	if (ub->ub_magic == BSWAP_64((uint64_t)UBERBLOCK_MAGIC)) {
		byteswap_uint64_array(ub, sizeof (uberblock_t));
	}

	if (ub->ub_magic != UBERBLOCK_MAGIC ||
	    !SPA_VERSION_IS_SUPPORTED(ub->ub_version))
		return (EINVAL);

	return (0);
}

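/*
 * Read part of a vdev label using an artificial block pointer so the
 * embedded label checksum can be verified.
 */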
static int
vdev_label_read(vdev_t *vd, int l, void *buf, uint64_t offset,
    size_t size)
{
	blkptr_t bp;
	off_t off;

	off = vdev_label_offset(vd->v_psize, l, offset);

	BP_ZERO(&bp);
	BP_SET_LSIZE(&bp, size);
	BP_SET_PSIZE(&bp, size);
	BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
	BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
	DVA_SET_OFFSET(BP_IDENTITY(&bp), off);
	ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);

	return (vdev_read_phys(vd, &bp, buf, off, size));
}

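/*
 * Read the config nvlist from the vdev labels, preferring the copy with
 * the highest txg not exceeding the given one.  The caller frees the
 * returned buffer.
 */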
static unsigned char *
vdev_label_read_config(vdev_t *vd, uint64_t txg)
{
	vdev_phys_t *label;
	uint64_t best_txg = 0;
	uint64_t label_txg = 0;
	uint64_t asize;
	unsigned char *nvl;
	size_t nvl_size;
	int error;

	label = malloc(sizeof (vdev_phys_t));
	if (label == NULL)
		return (NULL);

	nvl_size = VDEV_PHYS_SIZE - sizeof (zio_eck_t) - 4;
	nvl = malloc(nvl_size);
	if (nvl == NULL)
		goto done;

	for (int l = 0; l < VDEV_LABELS; l++) {
		const unsigned char *nvlist;

		if (vdev_label_read(vd, l, label,
		    offsetof(vdev_label_t, vl_vdev_phys),
		    sizeof (vdev_phys_t)))
			continue;

		if (label->vp_nvlist[0] != NV_ENCODE_XDR)
			continue;

		nvlist = (const unsigned char *) label->vp_nvlist + 4;
		error = nvlist_find(nvlist, ZPOOL_CONFIG_POOL_TXG,
		    DATA_TYPE_UINT64, NULL, &label_txg);
		if (error != 0 || label_txg == 0) {
			memcpy(nvl, nvlist, nvl_size);
			goto done;
		}

		if (label_txg <= txg && label_txg > best_txg) {
			best_txg = label_txg;
			memcpy(nvl, nvlist, nvl_size);

			/*
			 * Use asize from the pool config.  We need this
			 * because the BIOS can report a bad value.
			 */
			if (nvlist_find(nvlist, ZPOOL_CONFIG_ASIZE,
			    DATA_TYPE_UINT64, NULL, &asize) == 0) {
				vd->v_psize = asize +
				    VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
			}
		}
	}

	if (best_txg == 0) {
		free(nvl);
		nvl = NULL;
	}
done:
	free(label);
	return (nvl);
}

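/*
 * Scan every uberblock slot in every label and keep the most recent
 * valid uberblock in *ub.
 */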
static void
vdev_uberblock_load(vdev_t *vd, uberblock_t *ub)
{
	uberblock_t *buf;

	buf = malloc(VDEV_UBERBLOCK_SIZE(vd));
	if (buf == NULL)
		return;

	for (int l = 0; l < VDEV_LABELS; l++) {
		for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
			if (vdev_label_read(vd, l, buf,
			    VDEV_UBERBLOCK_OFFSET(vd, n),
			    VDEV_UBERBLOCK_SIZE(vd)))
				continue;
			if (uberblock_verify(buf) != 0)
				continue;

			if (vdev_uberblock_compare(buf, ub) > 0)
				*ub = *buf;
		}
	}
	free(buf);
}

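/*
 * Probe a device: read its label, create or update the pool and vdev
 * structures it describes, and pick the pool's best uberblock.
 */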
static int
vdev_probe(vdev_phys_read_t *_read, void *read_priv, spa_t **spap)
{
	vdev_t vtmp;
	spa_t *spa;
	vdev_t *vdev;
	unsigned char *nvlist;
	uint64_t val;
	uint64_t guid, vdev_children;
	uint64_t pool_txg, pool_guid;
	const char *pool_name;
	const unsigned char *features;
	int rc;

	/*
	 * Load the vdev label and figure out which
	 * uberblock is most current.
	 */
	memset(&vtmp, 0, sizeof(vtmp));
	vtmp.v_phys_read = _read;
	vtmp.v_read_priv = read_priv;
	vtmp.v_psize = P2ALIGN(ldi_get_size(read_priv),
	    (uint64_t)sizeof (vdev_label_t));

	/* Test for minimum device size. */
	if (vtmp.v_psize < SPA_MINDEVSIZE)
		return (EIO);

	nvlist = vdev_label_read_config(&vtmp, UINT64_MAX);
	if (nvlist == NULL)
		return (EIO);

	if (nvlist_find(nvlist, ZPOOL_CONFIG_VERSION, DATA_TYPE_UINT64,
	    NULL, &val) != 0) {
		free(nvlist);
		return (EIO);
	}

	if (!SPA_VERSION_IS_SUPPORTED(val)) {
		printf("ZFS: unsupported ZFS version %u (should be %u)\n",
		    (unsigned) val, (unsigned) SPA_VERSION);
		free(nvlist);
		return (EIO);
	}

	/* Check ZFS features for read */
	if (nvlist_find(nvlist, ZPOOL_CONFIG_FEATURES_FOR_READ,
	    DATA_TYPE_NVLIST, NULL, &features) == 0 &&
	    nvlist_check_features_for_read(features) != 0) {
		free(nvlist);
		return (EIO);
	}

	if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_STATE, DATA_TYPE_UINT64,
	    NULL, &val) != 0) {
		free(nvlist);
		return (EIO);
	}

	if (val == POOL_STATE_DESTROYED) {
		/* We don't boot from destroyed pools. */
		free(nvlist);
		return (EIO);
	}

	if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_TXG, DATA_TYPE_UINT64,
	    NULL, &pool_txg) != 0 ||
	    nvlist_find(nvlist, ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64,
	    NULL, &pool_guid) != 0 ||
	    nvlist_find(nvlist, ZPOOL_CONFIG_POOL_NAME, DATA_TYPE_STRING,
	    NULL, &pool_name) != 0) {
		/*
		 * Cache and spare devices end up here - just ignore
		 * them.
		 */
		free(nvlist);
		return (EIO);
	}

	/*
	 * Create the pool if this is the first time we've seen it.
	 */
	spa = spa_find_by_guid(pool_guid);
	if (spa == NULL) {
		nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_CHILDREN,
		    DATA_TYPE_UINT64, NULL, &vdev_children);
		spa = spa_create(pool_guid, pool_name);
		if (spa == NULL) {
			free(nvlist);
			return (ENOMEM);
		}
		spa->spa_root_vdev->v_nchildren = vdev_children;
	}
	if (pool_txg > spa->spa_txg)
		spa->spa_txg = pool_txg;

	/*
	 * Get the vdev tree and create our in-core copy of it.
	 * If we already have a vdev with this guid, this must
	 * be some kind of alias (overlapping slices, dangerously dedicated
	 * disks etc).
	 */
	if (nvlist_find(nvlist, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64,
	    NULL, &guid) != 0) {
		free(nvlist);
		return (EIO);
	}
	vdev = vdev_find(guid);
	/* Has this vdev already been inited? */
	if (vdev && vdev->v_phys_read) {
		free(nvlist);
		return (EIO);
	}

	rc = vdev_init_from_label(spa, nvlist);
	free(nvlist);
	if (rc != 0)
		return (rc);

	/*
	 * We should already have created an incomplete vdev for this
	 * vdev. Find it and initialise it with our read proc.
	 */
	vdev = vdev_find(guid);
	if (vdev != NULL) {
		vdev->v_phys_read = _read;
		vdev->v_read_priv = read_priv;
		vdev->v_psize = vtmp.v_psize;
		/*
		 * If no other state is set, mark vdev healthy.
		 */
		if (vdev->v_state == VDEV_STATE_UNKNOWN)
			vdev->v_state = VDEV_STATE_HEALTHY;
	} else {
		printf("ZFS: inconsistent nvlist contents\n");
		return (EIO);
	}

	if (vdev->v_islog)
		spa->spa_with_log = vdev->v_islog;

	/*
	 * Re-evaluate top-level vdev state.
	 */
	vdev_set_state(vdev->v_top);

	/*
	 * OK, we are happy with the pool so far.  Let's find the best
	 * uberblock and then we can actually access the contents of the
	 * pool.
	 */
	vdev_uberblock_load(vdev, &spa->spa_uberblock);

	if (spap != NULL)
		*spap = spa;
	return (0);
}

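/*
 * Integer log2 of an exact power of two, or -1.
 */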
static int
ilog2(int n)
{
	int v;

	for (v = 0; v < 32; v++)
		if (n == (1 << v))
			return (v);
	return (-1);
}

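/*
 * Read a gang block. The DVA points at a small gang header
 * (zio_gbh_phys_t, SPA_GANGBLOCKSIZE bytes) holding up to
 * SPA_GBH_NBLKPTRS block pointers whose payloads concatenate to form
 * the logical block. The header is read through a synthetic blkptr with
 * the gang bits cleared so that zio_read() does not recurse back here,
 * then each constituent block is read into place.
 */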
static int
zio_read_gang(const spa_t *spa, const blkptr_t *bp, void *buf)
{
	blkptr_t gbh_bp;
	zio_gbh_phys_t zio_gb;
	char *pbuf;
	int i;

	/* Artificial BP for gang block header. */
	gbh_bp = *bp;
	BP_SET_PSIZE(&gbh_bp, SPA_GANGBLOCKSIZE);
	BP_SET_LSIZE(&gbh_bp, SPA_GANGBLOCKSIZE);
	BP_SET_CHECKSUM(&gbh_bp, ZIO_CHECKSUM_GANG_HEADER);
	BP_SET_COMPRESS(&gbh_bp, ZIO_COMPRESS_OFF);
	for (i = 0; i < SPA_DVAS_PER_BP; i++)
		DVA_SET_GANG(&gbh_bp.blk_dva[i], 0);

	/* Read gang header block using the artificial BP. */
	if (zio_read(spa, &gbh_bp, &zio_gb))
		return (EIO);

	pbuf = buf;
	for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
		blkptr_t *gbp = &zio_gb.zg_blkptr[i];

		if (BP_IS_HOLE(gbp))
			continue;
		if (zio_read(spa, gbp, pbuf))
			return (EIO);
		pbuf += BP_GET_PSIZE(gbp);
	}

	if (zio_checksum_verify(spa, bp, buf))
		return (EIO);
	return (0);
}

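/*
 * Read the block described by bp into buf. Data embedded in the block
 * pointer itself is decoded (and decompressed) directly; otherwise each
 * valid DVA is tried in turn until one copy reads and decompresses
 * successfully, dispatching to zio_read_gang() for gang blocks and
 * rounding the read size up to the vdev's ashift for raidz reads.
 */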
static int
zio_read(const spa_t *spa, const blkptr_t *bp, void *buf)
{
	int cpfunc = BP_GET_COMPRESS(bp);
	uint64_t align, size;
	void *pbuf;
	int i, error;

	/*
	 * Process data embedded in block pointer
	 */
	if (BP_IS_EMBEDDED(bp)) {
		ASSERT(BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);

		size = BPE_GET_PSIZE(bp);
		ASSERT(size <= BPE_PAYLOAD_SIZE);

		if (cpfunc != ZIO_COMPRESS_OFF)
			pbuf = zfs_alloc(size);
		else
			pbuf = buf;

		decode_embedded_bp_compressed(bp, pbuf);
		error = 0;

		if (cpfunc != ZIO_COMPRESS_OFF) {
			error = zio_decompress_data(cpfunc, pbuf,
			    size, buf, BP_GET_LSIZE(bp));
			zfs_free(pbuf, size);
		}
		if (error != 0)
			printf("ZFS: i/o error - unable to decompress "
			    "block pointer data, error %d\n", error);
		return (error);
	}

	error = EIO;

	for (i = 0; i < SPA_DVAS_PER_BP; i++) {
		const dva_t *dva = &bp->blk_dva[i];
		vdev_t *vdev;
		vdev_list_t *vlist;
		uint64_t vdevid;
		off_t offset;

		if (!dva->dva_word[0] && !dva->dva_word[1])
			continue;

		vdevid = DVA_GET_VDEV(dva);
		offset = DVA_GET_OFFSET(dva);
		vlist = &spa->spa_root_vdev->v_children;
		STAILQ_FOREACH(vdev, vlist, v_childlink) {
			if (vdev->v_id == vdevid)
				break;
		}
		if (!vdev || !vdev->v_read)
			continue;

		size = BP_GET_PSIZE(bp);
		if (vdev->v_read == vdev_raidz_read) {
			align = 1ULL << vdev->v_ashift;
			if (P2PHASE(size, align) != 0)
				size = P2ROUNDUP(size, align);
		}
		if (size != BP_GET_PSIZE(bp) || cpfunc != ZIO_COMPRESS_OFF)
			pbuf = zfs_alloc(size);
		else
			pbuf = buf;

		if (DVA_GET_GANG(dva))
			error = zio_read_gang(spa, bp, pbuf);
		else
			error = vdev->v_read(vdev, bp, pbuf, offset, size);
		if (error == 0) {
			if (cpfunc != ZIO_COMPRESS_OFF)
				error = zio_decompress_data(cpfunc, pbuf,
				    BP_GET_PSIZE(bp), buf, BP_GET_LSIZE(bp));
			else if (size != BP_GET_PSIZE(bp))
				bcopy(pbuf, buf, BP_GET_PSIZE(bp));
		}
		if (buf != pbuf)
			zfs_free(pbuf, size);
		if (error == 0)
			break;
	}
	if (error != 0)
		printf("ZFS: i/o error - all block copies unavailable\n");
	return (error);
}

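/*
 * Read buflen bytes at offset from the object described by dnode,
 * descending the indirect block tree from dn_blkptr through dn_nlevels
 * levels. ibshift is log2 of the number of block pointers per indirect
 * block; e.g. with dn_indblkshift == 17 and 128-byte block pointers
 * (SPA_BLKPTRSHIFT == 7), each indirect block holds 1024 pointers. A
 * one-entry cache (dnode_cache_obj/dnode_cache_bn) avoids re-reading
 * the same data block on consecutive calls.
 */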
static int
dnode_read(const spa_t *spa, const dnode_phys_t *dnode, off_t offset,
    void *buf, size_t buflen)
{
	int ibshift = dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
	int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
	int nlevels = dnode->dn_nlevels;
	int i, rc;

	if (bsize > SPA_MAXBLOCKSIZE) {
		printf("ZFS: I/O error - blocks larger than %llu are not "
		    "supported\n", SPA_MAXBLOCKSIZE);
		return (EIO);
	}

	/*
	 * Note: bsize may not be a power of two here so we need to do an
	 * actual divide rather than a bitshift.
	 */
	while (buflen > 0) {
		uint64_t bn = offset / bsize;
		int boff = offset % bsize;
		int ibn;
		const blkptr_t *indbp;
		blkptr_t bp;

		if (bn > dnode->dn_maxblkid)
			return (EIO);

		if (dnode == dnode_cache_obj && bn == dnode_cache_bn)
			goto cached;

		indbp = dnode->dn_blkptr;
		for (i = 0; i < nlevels; i++) {
			/*
			 * Copy the bp from the indirect array so that
			 * we can re-use the scratch buffer for multi-level
			 * objects.
			 */
			ibn = bn >> ((nlevels - i - 1) * ibshift);
			ibn &= ((1 << ibshift) - 1);
			bp = indbp[ibn];
			if (BP_IS_HOLE(&bp)) {
				memset(dnode_cache_buf, 0, bsize);
				break;
			}
			rc = zio_read(spa, &bp, dnode_cache_buf);
			if (rc)
				return (rc);
			indbp = (const blkptr_t *) dnode_cache_buf;
		}
		dnode_cache_obj = dnode;
		dnode_cache_bn = bn;
	cached:

		/*
		 * The buffer contains our data block. Copy what we
		 * need from it and loop.
		 */
		i = bsize - boff;
		if (i > buflen)
			i = buflen;
		memcpy(buf, &dnode_cache_buf[boff], i);
		buf = ((char *)buf) + i;
		offset += i;
		buflen -= i;
	}

	return (0);
}

/*
 * Lookup a value in a microzap directory. Assumes that the zap
 * scratch buffer contains the directory contents.
 */
static int
mzap_lookup(const dnode_phys_t *dnode, const char *name, uint64_t *value)
{
	const mzap_phys_t *mz;
	const mzap_ent_phys_t *mze;
	size_t size;
	int chunks, i;

	/*
	 * Microzap objects use exactly one block. Read the whole
	 * thing.
	 */
	size = dnode->dn_datablkszsec * 512;

	mz = (const mzap_phys_t *) zap_scratch;
	chunks = size / MZAP_ENT_LEN - 1;

	for (i = 0; i < chunks; i++) {
		mze = &mz->mz_chunk[i];
		if (!strcmp(mze->mze_name, name)) {
			*value = mze->mze_value;
			return (0);
		}
	}

	return (ENOENT);
}

/*
 * Compare a name with a zap leaf entry. Return non-zero if the name
 * matches.
 */
static int
fzap_name_equal(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc,
    const char *name)
{
	size_t namelen;
	const zap_leaf_chunk_t *nc;
	const char *p;

	namelen = zc->l_entry.le_name_numints;

	nc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_name_chunk);
	p = name;
	while (namelen > 0) {
		size_t len;
		len = namelen;
		if (len > ZAP_LEAF_ARRAY_BYTES)
			len = ZAP_LEAF_ARRAY_BYTES;
		if (memcmp(p, nc->l_array.la_array, len))
			return (0);
		p += len;
		namelen -= len;
		nc = &ZAP_LEAF_CHUNK(zl, nc->l_array.la_next);
	}

	return (1);
}

/*
 * Extract a uint64_t value from a zap leaf entry.
 */
static uint64_t
fzap_leaf_value(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc)
{
	const zap_leaf_chunk_t *vc;
	int i;
	uint64_t value;
	const uint8_t *p;

	vc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_value_chunk);
	for (i = 0, value = 0, p = vc->l_array.la_array; i < 8; i++) {
		value = (value << 8) | p[i];
	}

	return (value);
}

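/*
 * Store a value of the given byte width (1, 2, 4 or 8) at addr; used to
 * deposit zap array integers into a caller buffer whose integer size
 * may differ from the on-disk one.
 */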
static void
stv(int len, void *addr, uint64_t value)
{
	switch (len) {
	case 1:
		*(uint8_t *)addr = value;
		return;
	case 2:
		*(uint16_t *)addr = value;
		return;
	case 4:
		*(uint32_t *)addr = value;
		return;
	case 8:
		*(uint64_t *)addr = value;
		return;
	}
}

/*
 * Extract an array from a zap leaf entry. Bytes are accumulated
 * big-endian into integers of le_value_intlen bytes and stored into buf
 * as integers of integer_size bytes.
 */
static void
fzap_leaf_array(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc,
    uint64_t integer_size, uint64_t num_integers, void *buf)
{
	uint64_t array_int_len = zc->l_entry.le_value_intlen;
	uint64_t value = 0;
	uint64_t *u64 = buf;
	char *p = buf;
	int len = MIN(zc->l_entry.le_value_numints, num_integers);
	int chunk = zc->l_entry.le_value_chunk;
	int byten = 0;

	if (integer_size == 8 && len == 1) {
		*u64 = fzap_leaf_value(zl, zc);
		return;
	}

	while (len > 0) {
		struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(zl, chunk).l_array;
		int i;

		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(zl));
		for (i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) {
			value = (value << 8) | la->la_array[i];
			byten++;
			if (byten == array_int_len) {
				stv(integer_size, p, value);
				byten = 0;
				len--;
				if (len == 0)
					return;
				p += integer_size;
			}
		}
		chunk = la->la_next;
	}
}

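/*
 * Validate an (integer_size, num_integers) pair for a fatzap lookup:
 * the integer size must be 1, 2, 4 or 8 bytes and the total value size
 * must not exceed ZAP_MAXVALUELEN.
 */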
static int
fzap_check_size(uint64_t integer_size, uint64_t num_integers)
{

	switch (integer_size) {
	case 1:
	case 2:
	case 4:
	case 8:
		break;
	default:
		return (EINVAL);
	}

	if (integer_size * num_integers > ZAP_MAXVALUELEN)
		return (E2BIG);

	return (0);
}

/*
 * Lookup a value in a fatzap directory. Assumes that the zap scratch
 * buffer contains the directory header.
 */
static int
fzap_lookup(const spa_t *spa, const dnode_phys_t *dnode, const char *name,
    uint64_t integer_size, uint64_t num_integers, void *value)
{
	int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
	zap_phys_t zh = *(zap_phys_t *) zap_scratch;
	fat_zap_t z;
	uint64_t *ptrtbl;
	uint64_t hash;
	int rc;

	if (zh.zap_magic != ZAP_MAGIC)
		return (EIO);

	if ((rc = fzap_check_size(integer_size, num_integers)) != 0)
		return (rc);

	z.zap_block_shift = ilog2(bsize);
	z.zap_phys = (zap_phys_t *) zap_scratch;

	/*
	 * Figure out where the pointer table is and read it in if necessary.
	 */
	if (zh.zap_ptrtbl.zt_blk) {
		rc = dnode_read(spa, dnode, zh.zap_ptrtbl.zt_blk * bsize,
		    zap_scratch, bsize);
		if (rc)
			return (rc);
		ptrtbl = (uint64_t *) zap_scratch;
	} else {
		ptrtbl = &ZAP_EMBEDDED_PTRTBL_ENT(&z, 0);
	}

	hash = zap_hash(zh.zap_salt, name);

	zap_leaf_t zl;
	zl.l_bs = z.zap_block_shift;

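	/*
	 * The top zt_shift bits of the hash index the pointer table,
	 * yielding the block number of the leaf covering this hash
	 * prefix.
	 */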
	off_t off = ptrtbl[hash >> (64 - zh.zap_ptrtbl.zt_shift)] << zl.l_bs;
	zap_leaf_chunk_t *zc;

	rc = dnode_read(spa, dnode, off, zap_scratch, bsize);
	if (rc)
		return (rc);

	zl.l_phys = (zap_leaf_phys_t *) zap_scratch;

	/*
	 * Make sure this leaf matches our hash.
	 */
	if (zl.l_phys->l_hdr.lh_prefix_len > 0 &&
	    zl.l_phys->l_hdr.lh_prefix !=
	    hash >> (64 - zl.l_phys->l_hdr.lh_prefix_len))
		return (ENOENT);

	/*
	 * Hash within the leaf to find our entry, then follow the
	 * entry chain (terminated by 0xffff) until the hashes match.
	 */
	int shift = (64 - ZAP_LEAF_HASH_SHIFT(&zl) -
	    zl.l_phys->l_hdr.lh_prefix_len);
	int h = (hash >> shift) & ((1 << ZAP_LEAF_HASH_SHIFT(&zl)) - 1);
	h = zl.l_phys->l_hash[h];
	if (h == 0xffff)
		return (ENOENT);
	zc = &ZAP_LEAF_CHUNK(&zl, h);
	while (zc->l_entry.le_hash != hash) {
		if (zc->l_entry.le_next == 0xffff)
			return (ENOENT);
		zc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_next);
	}
	if (fzap_name_equal(&zl, zc, name)) {
		if (zc->l_entry.le_value_intlen * zc->l_entry.le_value_numints >
		    integer_size * num_integers)
			return (E2BIG);
		fzap_leaf_array(&zl, zc, integer_size, num_integers, value);
		return (0);
	}

	return (ENOENT);
}

/*
 * Lookup a name in a zap object and return its value as a uint64_t.
 */
static int
zap_lookup(const spa_t *spa, const dnode_phys_t *dnode, const char *name,
    uint64_t integer_size, uint64_t num_integers, void *value)
{
	int rc;
	uint64_t zap_type;
	size_t size = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;

	rc = dnode_read(spa, dnode, 0, zap_scratch, size);
	if (rc)
		return (rc);

	zap_type = *(uint64_t *) zap_scratch;
	if (zap_type == ZBT_MICRO)
		return (mzap_lookup(dnode, name, value));
	else if (zap_type == ZBT_HEADER) {
		return (fzap_lookup(spa, dnode, name, integer_size,
		    num_integers, value));
	}
	printf("ZFS: invalid zap_type=%d\n", (int)zap_type);
	return (EIO);
}

/*
 * List a microzap directory. Assumes that the zap scratch buffer contains
 * the directory contents.
 */
static int
mzap_list(const dnode_phys_t *dnode, int (*callback)(const char *, uint64_t))
{
	const mzap_phys_t *mz;
	const mzap_ent_phys_t *mze;
	size_t size;
	int chunks, i, rc;

	/*
	 * Microzap objects use exactly one block. Read the whole
	 * thing.
	 */
	size = dnode->dn_datablkszsec * 512;
	mz = (const mzap_phys_t *) zap_scratch;
	chunks = size / MZAP_ENT_LEN - 1;

	for (i = 0; i < chunks; i++) {
		mze = &mz->mz_chunk[i];
		if (mze->mze_name[0]) {
			rc = callback(mze->mze_name, mze->mze_value);
			if (rc != 0)
				return (rc);
		}
	}

	return (0);
}

/*
 * List a fatzap directory. Assumes that the zap scratch buffer contains
 * the directory header.
 */
static int
fzap_list(const spa_t *spa, const dnode_phys_t *dnode,
    int (*callback)(const char *, uint64_t))
{
	int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
	zap_phys_t zh = *(zap_phys_t *) zap_scratch;
	fat_zap_t z;
	int i, j, rc;

	if (zh.zap_magic != ZAP_MAGIC)
		return (EIO);

	z.zap_block_shift = ilog2(bsize);
	z.zap_phys = (zap_phys_t *) zap_scratch;

	/*
	 * This assumes that the leaf blocks start at block 1. The
	 * documentation isn't exactly clear on this.
	 */
	zap_leaf_t zl;
	zl.l_bs = z.zap_block_shift;
	for (i = 0; i < zh.zap_num_leafs; i++) {
		off_t off = ((off_t)(i + 1)) << zl.l_bs;
		char name[256], *p;
		uint64_t value;

		if (dnode_read(spa, dnode, off, zap_scratch, bsize))
			return (EIO);

		zl.l_phys = (zap_leaf_phys_t *) zap_scratch;

		for (j = 0; j < ZAP_LEAF_NUMCHUNKS(&zl); j++) {
			zap_leaf_chunk_t *zc, *nc;
			int namelen;

			zc = &ZAP_LEAF_CHUNK(&zl, j);
			if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY)
				continue;
			namelen = zc->l_entry.le_name_numints;
			if (namelen > sizeof(name))
				namelen = sizeof(name);

			/*
			 * Paste the name back together.
			 */
			nc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_name_chunk);
			p = name;
			while (namelen > 0) {
				int len;
				len = namelen;
				if (len > ZAP_LEAF_ARRAY_BYTES)
					len = ZAP_LEAF_ARRAY_BYTES;
				memcpy(p, nc->l_array.la_array, len);
				p += len;
				namelen -= len;
				nc = &ZAP_LEAF_CHUNK(&zl, nc->l_array.la_next);
			}

			/*
			 * Assume the first eight bytes of the value are
			 * a uint64_t.
			 */
			value = fzap_leaf_value(&zl, zc);

			//printf("%s 0x%jx\n", name, (uintmax_t)value);
			rc = callback((const char *)name, value);
			if (rc != 0)
				return (rc);
		}
	}

	return (0);
}

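/*
 * zap_list() callback that just prints each entry's name.
 */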
static int
zfs_printf(const char *name, uint64_t value __unused)
{

	printf("%s\n", name);

	return (0);
}

/*
 * List a zap directory.
 */
static int
zap_list(const spa_t *spa, const dnode_phys_t *dnode)
{
	uint64_t zap_type;
	size_t size = dnode->dn_datablkszsec * 512;

	if (dnode_read(spa, dnode, 0, zap_scratch, size))
		return (EIO);

	zap_type = *(uint64_t *) zap_scratch;
	if (zap_type == ZBT_MICRO)
		return (mzap_list(dnode, zfs_printf));
	else
		return (fzap_list(spa, dnode, zfs_printf));
}

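/*
 * Fetch the dnode for object objnum from an object set. Dnodes are
 * fixed-size records packed into the object set's meta-dnode, so the
 * object number is simply an index into it.
 */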
static int
objset_get_dnode(const spa_t *spa, const objset_phys_t *os, uint64_t objnum,
    dnode_phys_t *dnode)
{
	off_t offset;

	offset = objnum * sizeof(dnode_phys_t);
	return (dnode_read(spa, &os->os_meta_dnode, offset,
	    dnode, sizeof(dnode_phys_t)));
}

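/*
 * Reverse-lookup a value in a microzap directory, returning the name of
 * the first entry whose value matches. Assumes that the zap scratch
 * buffer contains the directory contents.
 */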
static int
mzap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, char *name,
    uint64_t value)
{
	const mzap_phys_t *mz;
	const mzap_ent_phys_t *mze;
	size_t size;
	int chunks, i;

	/*
	 * Microzap objects use exactly one block. Read the whole
	 * thing.
	 */
	size = dnode->dn_datablkszsec * 512;

	mz = (const mzap_phys_t *) zap_scratch;
	chunks = size / MZAP_ENT_LEN - 1;

	for (i = 0; i < chunks; i++) {
		mze = &mz->mz_chunk[i];
		if (value == mze->mze_value) {
			strcpy(name, mze->mze_name);
			return (0);
		}
	}

	return (ENOENT);
}

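/*
 * Copy the name of a zap leaf entry into name, walking the chain of
 * name chunks and NUL-terminating the result.
 */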
static void
fzap_name_copy(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc, char *name)
{
	size_t namelen;
	const zap_leaf_chunk_t *nc;
	char *p;

	namelen = zc->l_entry.le_name_numints;

	nc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_name_chunk);
	p = name;
	while (namelen > 0) {
		size_t len;
		len = namelen;
		if (len > ZAP_LEAF_ARRAY_BYTES)
			len = ZAP_LEAF_ARRAY_BYTES;
		memcpy(p, nc->l_array.la_array, len);
		p += len;
		namelen -= len;
		nc = &ZAP_LEAF_CHUNK(zl, nc->l_array.la_next);
	}

	*p = '\0';
}

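/*
 * Reverse-lookup a value in a fatzap directory by scanning every leaf
 * for a single-uint64_t entry with a matching value. Assumes that the
 * zap scratch buffer contains the directory header.
 */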
static int
fzap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, char *name,
    uint64_t value)
{
	int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
	zap_phys_t zh = *(zap_phys_t *)zap_scratch;
	fat_zap_t z;
	int i, j;

	if (zh.zap_magic != ZAP_MAGIC)
		return (EIO);

	z.zap_block_shift = ilog2(bsize);
	z.zap_phys = (zap_phys_t *) zap_scratch;

	/*
	 * This assumes that the leaf blocks start at block 1. The
	 * documentation isn't exactly clear on this.
	 */
	zap_leaf_t zl;
	zl.l_bs = z.zap_block_shift;
	for (i = 0; i < zh.zap_num_leafs; i++) {
		off_t off = ((off_t)(i + 1)) << zl.l_bs;

		if (dnode_read(spa, dnode, off, zap_scratch, bsize))
			return (EIO);

		zl.l_phys = (zap_leaf_phys_t *) zap_scratch;

		for (j = 0; j < ZAP_LEAF_NUMCHUNKS(&zl); j++) {
			zap_leaf_chunk_t *zc;

			zc = &ZAP_LEAF_CHUNK(&zl, j);
			if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY)
				continue;
			if (zc->l_entry.le_value_intlen != 8 ||
			    zc->l_entry.le_value_numints != 1)
				continue;

			if (fzap_leaf_value(&zl, zc) == value) {
				fzap_name_copy(&zl, zc, name);
				return (0);
			}
		}
	}

	return (ENOENT);
}

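/*
 * Reverse-lookup a value in a zap object, dispatching on the zap type.
 */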
static int
zap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, char *name,
    uint64_t value)
{
	int rc;
	uint64_t zap_type;
	size_t size = dnode->dn_datablkszsec * 512;

	rc = dnode_read(spa, dnode, 0, zap_scratch, size);
	if (rc)
		return (rc);

	zap_type = *(uint64_t *) zap_scratch;
	if (zap_type == ZBT_MICRO)
		return (mzap_rlookup(spa, dnode, name, value));
	else
		return (fzap_rlookup(spa, dnode, name, value));
}

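/*
 * Build the full name of the dataset with object number objnum by
 * walking dd_parent_obj links up to the pool root, reverse-looking-up
 * each component in the parent's child-directory zap. The name is
 * assembled back-to-front in a local buffer and copied into result.
 */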
static int
zfs_rlookup(const spa_t *spa, uint64_t objnum, char *result)
{
	char name[256];
	char component[256];
	uint64_t dir_obj, parent_obj, child_dir_zapobj;
	dnode_phys_t child_dir_zap, dataset, dir, parent;
	dsl_dir_phys_t *dd;
	dsl_dataset_phys_t *ds;
	char *p;
	int len;

	p = &name[sizeof(name) - 1];
	*p = '\0';

	if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) {
		printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum);
		return (EIO);
	}
	ds = (dsl_dataset_phys_t *)&dataset.dn_bonus;
	dir_obj = ds->ds_dir_obj;

	for (;;) {
		if (objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir) != 0)
			return (EIO);
		dd = (dsl_dir_phys_t *)&dir.dn_bonus;

		/* Actual loop condition. */
		parent_obj = dd->dd_parent_obj;
		if (parent_obj == 0)
			break;

		if (objset_get_dnode(spa, &spa->spa_mos, parent_obj,
		    &parent) != 0)
			return (EIO);
		dd = (dsl_dir_phys_t *)&parent.dn_bonus;
		child_dir_zapobj = dd->dd_child_dir_zapobj;
		if (objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj,
		    &child_dir_zap) != 0)
			return (EIO);
		if (zap_rlookup(spa, &child_dir_zap, component, dir_obj) != 0)
			return (EIO);

		len = strlen(component);
		p -= len;
		memcpy(p, component, len);
		--p;
		*p = '/';

		/* Actual loop iteration. */
		dir_obj = parent_obj;
	}

	if (*p != '\0')
		++p;
	strcpy(result, p);

	return (0);
}

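/*
 * Resolve a slash-separated dataset name to the object number of its
 * head dataset, descending one component at a time through each
 * dsl_dir's child-directory zap starting from the pool's root dataset.
 */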
static int
zfs_lookup_dataset(const spa_t *spa, const char *name, uint64_t *objnum)
{
	char element[256];
	uint64_t dir_obj, child_dir_zapobj;
	dnode_phys_t child_dir_zap, dir;
	dsl_dir_phys_t *dd;
	const char *p, *q;

	if (objset_get_dnode(spa, &spa->spa_mos, DMU_POOL_DIRECTORY_OBJECT,
	    &dir))
		return (EIO);
	if (zap_lookup(spa, &dir, DMU_POOL_ROOT_DATASET, sizeof (dir_obj),
	    1, &dir_obj))
		return (EIO);

	p = name;
	for (;;) {
		if (objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir))
			return (EIO);
		dd = (dsl_dir_phys_t *)&dir.dn_bonus;

		while (*p == '/')
			p++;
		/* Actual loop condition #1. */
		if (*p == '\0')
			break;

		q = strchr(p, '/');
		if (q) {
			memcpy(element, p, q - p);
			element[q - p] = '\0';
			p = q + 1;
		} else {
			strcpy(element, p);
			p += strlen(p);
		}

		child_dir_zapobj = dd->dd_child_dir_zapobj;
		if (objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj,
		    &child_dir_zap) != 0)
			return (EIO);

		/* Actual loop condition #2. */
		if (zap_lookup(spa, &child_dir_zap, element, sizeof (dir_obj),
		    1, &dir_obj) != 0)
			return (ENOENT);
	}

	*objnum = dd->dd_head_dataset_obj;
	return (0);
}

#ifndef BOOT2
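/*
 * List the children of the dataset with object number objnum by listing
 * its dsl_dir's child-directory zap.
 */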
static int
zfs_list_dataset(const spa_t *spa, uint64_t objnum/*, int pos, char *entry*/)
{
	uint64_t dir_obj, child_dir_zapobj;
	dnode_phys_t child_dir_zap, dir, dataset;
	dsl_dataset_phys_t *ds;
	dsl_dir_phys_t *dd;

	if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) {
		printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum);
		return (EIO);
	}
	ds = (dsl_dataset_phys_t *) &dataset.dn_bonus;
	dir_obj = ds->ds_dir_obj;

	if (objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir)) {
		printf("ZFS: can't find dirobj %ju\n", (uintmax_t)dir_obj);
		return (EIO);
	}
	dd = (dsl_dir_phys_t *)&dir.dn_bonus;

	child_dir_zapobj = dd->dd_child_dir_zapobj;
	if (objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj,
	    &child_dir_zap) != 0) {
		printf("ZFS: can't find child zap %ju\n", (uintmax_t)dir_obj);
		return (EIO);
	}

	return (zap_list(spa, &child_dir_zap) != 0);
}

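/*
 * Like zfs_list_dataset(), but invoke a caller-supplied callback for
 * every entry in the dataset's child-directory zap.
 */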
int
zfs_callback_dataset(const spa_t *spa, uint64_t objnum,
    int (*callback)(const char *, uint64_t))
{
	uint64_t dir_obj, child_dir_zapobj, zap_type;
	dnode_phys_t child_dir_zap, dir, dataset;
	dsl_dataset_phys_t *ds;
	dsl_dir_phys_t *dd;
	int err;

	err = objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset);
	if (err != 0) {
		printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum);
		return (err);
	}
	ds = (dsl_dataset_phys_t *) &dataset.dn_bonus;
	dir_obj = ds->ds_dir_obj;

	err = objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir);
	if (err != 0) {
		printf("ZFS: can't find dirobj %ju\n", (uintmax_t)dir_obj);
		return (err);
	}
	dd = (dsl_dir_phys_t *)&dir.dn_bonus;

	child_dir_zapobj = dd->dd_child_dir_zapobj;
	err = objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj,
	    &child_dir_zap);
	if (err != 0) {
		printf("ZFS: can't find child zap %ju\n", (uintmax_t)dir_obj);
		return (err);
	}

	err = dnode_read(spa, &child_dir_zap, 0, zap_scratch,
	    child_dir_zap.dn_datablkszsec * 512);
	if (err != 0)
		return (err);

	zap_type = *(uint64_t *) zap_scratch;
	if (zap_type == ZBT_MICRO)
		return (mzap_list(&child_dir_zap, callback));
	else
		return (fzap_list(spa, &child_dir_zap, callback));
}
#endif

/*
 * Find the object set given the object number of its dataset object
 * and return its details in *objset.
 */
static int
zfs_mount_dataset(const spa_t *spa, uint64_t objnum, objset_phys_t *objset)
{
	dnode_phys_t dataset;
	dsl_dataset_phys_t *ds;

	if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) {
		printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum);
		return (EIO);
	}

	ds = (dsl_dataset_phys_t *) &dataset.dn_bonus;
	if (zio_read(spa, &ds->ds_bp, objset)) {
		printf("ZFS: can't read object set for dataset %ju\n",
		    (uintmax_t)objnum);
		return (EIO);
	}

	return (0);
}

/*
 * Find the object set pointed to by the BOOTFS property or, failing
 * that, the root dataset, and return its object number in *objid.
 */
static int
zfs_get_root(const spa_t *spa, uint64_t *objid)
{
	dnode_phys_t dir, propdir;
	uint64_t props, bootfs, root;

	*objid = 0;

	/*
	 * Start with the MOS directory object.
	 */
	if (objset_get_dnode(spa, &spa->spa_mos, DMU_POOL_DIRECTORY_OBJECT,
	    &dir)) {
		printf("ZFS: can't read MOS object directory\n");
		return (EIO);
	}

	/*
	 * Lookup the pool_props and see if we can find a bootfs.
	 */
	if (zap_lookup(spa, &dir, DMU_POOL_PROPS,
	    sizeof(props), 1, &props) == 0 &&
	    objset_get_dnode(spa, &spa->spa_mos, props, &propdir) == 0 &&
	    zap_lookup(spa, &propdir, "bootfs",
	    sizeof(bootfs), 1, &bootfs) == 0 && bootfs != 0) {
		*objid = bootfs;
		return (0);
	}
	/*
	 * Lookup the root dataset directory.
	 */
	if (zap_lookup(spa, &dir, DMU_POOL_ROOT_DATASET, sizeof (root), 1,
	    &root) ||
	    objset_get_dnode(spa, &spa->spa_mos, root, &dir)) {
		printf("ZFS: can't find root dsl_dir\n");
		return (EIO);
	}

	/*
	 * Use the information from the dataset directory's bonus buffer
	 * to find the dataset object and from that the object set itself.
	 */
	dsl_dir_phys_t *dd = (dsl_dir_phys_t *) &dir.dn_bonus;
	*objid = dd->dd_head_dataset_obj;
	return (0);
}

static int
zfs_mount(const spa_t *spa, uint64_t rootobj, struct zfsmount *mount)
{

	mount->spa = spa;

	/*
	 * Find the root object set if not explicitly provided.
	 */
	if (rootobj == 0 && zfs_get_root(spa, &rootobj)) {
		printf("ZFS: can't find root filesystem\n");
		return (EIO);
	}

	if (zfs_mount_dataset(spa, rootobj, &mount->objset)) {
		printf("ZFS: can't open root filesystem\n");
		return (EIO);
	}

	mount->rootobj = rootobj;

	return (0);
}

/*
 * Callback for feature name checks: reject any active (nonzero) feature
 * that is not in the features_for_read list.
 */
static int
check_feature(const char *name, uint64_t value)
{
	int i;

	if (value == 0)
		return (0);
	if (name[0] == '\0')
		return (0);

	for (i = 0; features_for_read[i] != NULL; i++) {
		if (strcmp(name, features_for_read[i]) == 0)
			return (0);
	}
	printf("ZFS: unsupported feature: %s\n", name);
	return (EIO);
}

/*
 * Checks whether the MOS features that are active are supported.
 */
static int
check_mos_features(const spa_t *spa)
{
	dnode_phys_t dir;
	uint64_t objnum, zap_type;
	size_t size;
	int rc;

	if ((rc = objset_get_dnode(spa, &spa->spa_mos, DMU_OT_OBJECT_DIRECTORY,
	    &dir)) != 0)
		return (rc);
	if ((rc = zap_lookup(spa, &dir, DMU_POOL_FEATURES_FOR_READ,
	    sizeof (objnum), 1, &objnum)) != 0) {
		/*
		 * This is an older pool without features. As we have
		 * already tested the label, just return without raising
		 * an error.
		 */
		return (0);
	}

	if ((rc = objset_get_dnode(spa, &spa->spa_mos, objnum, &dir)) != 0)
		return (rc);

	if (dir.dn_type != DMU_OTN_ZAP_METADATA)
		return (EIO);

	size = dir.dn_datablkszsec * 512;
	if (dnode_read(spa, &dir, 0, zap_scratch, size))
		return (EIO);

	zap_type = *(uint64_t *) zap_scratch;
	if (zap_type == ZBT_MICRO)
		rc = mzap_list(&dir, check_feature);
	else
		rc = fzap_list(spa, &dir, check_feature);

	return (rc);
}

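/*
 * Load a packed nvlist object (DMU_OT_PACKED_NVLIST) from the MOS into
 * a malloc'ed buffer; the packed size is kept in the dnode's bonus
 * buffer (DMU_OT_PACKED_NVLIST_SIZE). The caller must free *value.
 */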
static int
load_nvlist(spa_t *spa, uint64_t obj, unsigned char **value)
{
	dnode_phys_t dir;
	size_t size;
	int rc;
	unsigned char *nv;

	*value = NULL;
	if ((rc = objset_get_dnode(spa, &spa->spa_mos, obj, &dir)) != 0)
		return (rc);
	if (dir.dn_type != DMU_OT_PACKED_NVLIST &&
	    dir.dn_bonustype != DMU_OT_PACKED_NVLIST_SIZE) {
		return (EIO);
	}

	if (dir.dn_bonuslen != sizeof (uint64_t))
		return (EIO);

	size = *(uint64_t *)DN_BONUS(&dir);
	nv = malloc(size);
	if (nv == NULL)
		return (ENOMEM);

	rc = dnode_read(spa, &dir, 0, nv, size);
	if (rc != 0) {
		free(nv);
		nv = NULL;
		return (rc);
	}
	*value = nv;
	return (rc);
}

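/*
 * Bring a pool up once the best uberblock has been selected: read and
 * sanity-check the MOS, pick up the optional checksum salt, verify the
 * read-incompatible features, and re-initialise the vdev tree from the
 * authoritative copy of the pool config stored in the MOS.
 */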
static int
zfs_spa_init(spa_t *spa)
{
	dnode_phys_t dir;
	uint64_t config_object;
	unsigned char *nvlist;
	int rc;

	if (zio_read(spa, &spa->spa_uberblock.ub_rootbp, &spa->spa_mos)) {
		printf("ZFS: can't read MOS of pool %s\n", spa->spa_name);
		return (EIO);
	}
	if (spa->spa_mos.os_type != DMU_OST_META) {
		printf("ZFS: corrupted MOS of pool %s\n", spa->spa_name);
		return (EIO);
	}

	if (objset_get_dnode(spa, &spa->spa_mos, DMU_POOL_DIRECTORY_OBJECT,
	    &dir)) {
		printf("ZFS: failed to read pool %s directory object\n",
		    spa->spa_name);
		return (EIO);
	}
	/* This is allowed to fail; older pools do not have a salt. */
	rc = zap_lookup(spa, &dir, DMU_POOL_CHECKSUM_SALT, 1,
	    sizeof (spa->spa_cksum_salt.zcs_bytes),
	    spa->spa_cksum_salt.zcs_bytes);

	rc = check_mos_features(spa);
	if (rc != 0) {
		printf("ZFS: pool %s is not supported\n", spa->spa_name);
		return (rc);
	}

	rc = zap_lookup(spa, &dir, DMU_POOL_CONFIG,
	    sizeof (config_object), 1, &config_object);
	if (rc != 0) {
		printf("ZFS: can not read MOS %s\n", DMU_POOL_CONFIG);
		return (EIO);
	}
	rc = load_nvlist(spa, config_object, &nvlist);
	if (rc != 0)
		return (rc);

	/*
	 * Update vdevs from MOS config, skipping the 4-byte packed
	 * nvlist header.
	 */
	rc = vdev_init_from_nvlist(spa, nvlist + 4);
	free(nvlist);
	return (rc);
}

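/*
 * Fill in the interesting fields of a stat structure from a dnode's
 * bonus data. Old-style znodes (znode_phys_t) keep mode/uid/gid/size at
 * fixed offsets; system-attribute (DMU_OT_SA) bonuses are decoded via
 * the sa_hdr_phys_t header and may spill into a separate spill block.
 */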
static int
zfs_dnode_stat(const spa_t *spa, dnode_phys_t *dn, struct stat *sb)
{

	if (dn->dn_bonustype != DMU_OT_SA) {
		znode_phys_t *zp = (znode_phys_t *)dn->dn_bonus;

		sb->st_mode = zp->zp_mode;
		sb->st_uid = zp->zp_uid;
		sb->st_gid = zp->zp_gid;
		sb->st_size = zp->zp_size;
	} else {
		sa_hdr_phys_t *sahdrp;
		int hdrsize;
		size_t size = 0;
		void *buf = NULL;

		if (dn->dn_bonuslen != 0)
			sahdrp = (sa_hdr_phys_t *)DN_BONUS(dn);
		else {
			if ((dn->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0) {
				blkptr_t *bp = DN_SPILL_BLKPTR(dn);
				int error;

				size = BP_GET_LSIZE(bp);
				buf = zfs_alloc(size);
				error = zio_read(spa, bp, buf);
				if (error != 0) {
					zfs_free(buf, size);
					return (error);
				}
				sahdrp = buf;
			} else {
				return (EIO);
			}
		}
		hdrsize = SA_HDR_SIZE(sahdrp);
		sb->st_mode = *(uint64_t *)((char *)sahdrp + hdrsize +
		    SA_MODE_OFFSET);
		sb->st_uid = *(uint64_t *)((char *)sahdrp + hdrsize +
		    SA_UID_OFFSET);
		sb->st_gid = *(uint64_t *)((char *)sahdrp + hdrsize +
		    SA_GID_OFFSET);
		sb->st_size = *(uint64_t *)((char *)sahdrp + hdrsize +
		    SA_SIZE_OFFSET);
		if (buf != NULL)
			zfs_free(buf, size);
	}

	return (0);
}

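/*
 * Read a symlink's target into path. Short targets are stored inline in
 * the bonus buffer (after the znode, or behind the SA header, possibly
 * in a spill block); longer targets live in the object's data blocks.
 */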
static int
zfs_dnode_readlink(const spa_t *spa, dnode_phys_t *dn, char *path,
    size_t psize)
{
	int rc = 0;

	if (dn->dn_bonustype == DMU_OT_SA) {
		sa_hdr_phys_t *sahdrp = NULL;
		size_t size = 0;
		void *buf = NULL;
		int hdrsize;
		char *p;

		if (dn->dn_bonuslen != 0)
			sahdrp = (sa_hdr_phys_t *)DN_BONUS(dn);
		else {
			blkptr_t *bp;

			if ((dn->dn_flags & DNODE_FLAG_SPILL_BLKPTR) == 0)
				return (EIO);
			bp = DN_SPILL_BLKPTR(dn);

			size = BP_GET_LSIZE(bp);
			buf = zfs_alloc(size);
			rc = zio_read(spa, bp, buf);
			if (rc != 0) {
				zfs_free(buf, size);
				return (rc);
			}
			sahdrp = buf;
		}
		hdrsize = SA_HDR_SIZE(sahdrp);
		p = (char *)((uintptr_t)sahdrp + hdrsize + SA_SYMLINK_OFFSET);
		memcpy(path, p, psize);
		if (buf != NULL)
			zfs_free(buf, size);
		return (0);
	}
	/*
	 * Second test is purely to silence bogus compiler
	 * warning about accessing past the end of dn_bonus.
	 */
	if (psize + sizeof(znode_phys_t) <= dn->dn_bonuslen &&
	    sizeof(znode_phys_t) <= sizeof(dn->dn_bonus)) {
		memcpy(path, &dn->dn_bonus[sizeof(znode_phys_t)], psize);
	} else {
		rc = dnode_read(spa, dn, 0, path, psize);
	}
	return (rc);
}

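/*
 * Stack of directory object numbers visited during a path lookup;
 * zfs_lookup() pushes an entry per component so that ".." and absolute
 * symlinks can unwind to the correct parent directory.
 */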
struct obj_list {
	uint64_t		objnum;
	STAILQ_ENTRY(obj_list)	entry;
};

/*
 * Lookup a file and return its dnode.
 */
static int
zfs_lookup(const struct zfsmount *mount, const char *upath,
    dnode_phys_t *dnode)
{
	int rc;
	uint64_t objnum;
	const spa_t *spa;
	dnode_phys_t dn;
	const char *p, *q;
	char element[256];
	char path[1024];
	int symlinks_followed = 0;
	struct stat sb;
	struct obj_list *entry, *tentry;
	STAILQ_HEAD(, obj_list) on_cache = STAILQ_HEAD_INITIALIZER(on_cache);

	spa = mount->spa;
	if (mount->objset.os_type != DMU_OST_ZFS) {
		printf("ZFS: unexpected object set type %ju\n",
		    (uintmax_t)mount->objset.os_type);
		return (EIO);
	}

	if ((entry = malloc(sizeof(struct obj_list))) == NULL)
		return (ENOMEM);

	/*
	 * Get the root directory dnode.
	 */
	rc = objset_get_dnode(spa, &mount->objset, MASTER_NODE_OBJ, &dn);
	if (rc) {
		free(entry);
		return (rc);
	}

	rc = zap_lookup(spa, &dn, ZFS_ROOT_OBJ, sizeof (objnum), 1, &objnum);
	if (rc) {
		free(entry);
		return (rc);
	}
	entry->objnum = objnum;
	STAILQ_INSERT_HEAD(&on_cache, entry, entry);

	rc = objset_get_dnode(spa, &mount->objset, objnum, &dn);
	if (rc != 0)
		goto done;

	p = upath;
	while (p && *p) {
		rc = objset_get_dnode(spa, &mount->objset, objnum, &dn);
		if (rc != 0)
			goto done;

		while (*p == '/')
			p++;
		if (*p == '\0')
			break;
		q = p;
		while (*q != '\0' && *q != '/')
			q++;

		/* skip dot */
		if (p + 1 == q && p[0] == '.') {
			p++;
			continue;
		}
		/* double dot */
		if (p + 2 == q && p[0] == '.' && p[1] == '.') {
			p += 2;
			if (STAILQ_FIRST(&on_cache) ==
			    STAILQ_LAST(&on_cache, obj_list, entry)) {
				rc = ENOENT;
				goto done;
			}
			entry = STAILQ_FIRST(&on_cache);
			STAILQ_REMOVE_HEAD(&on_cache, entry);
			free(entry);
			objnum = (STAILQ_FIRST(&on_cache))->objnum;
			continue;
		}
		if (q - p + 1 > sizeof(element)) {
			rc = ENAMETOOLONG;
			goto done;
		}
		memcpy(element, p, q - p);
		element[q - p] = 0;
		p = q;

		if ((rc = zfs_dnode_stat(spa, &dn, &sb)) != 0)
			goto done;
		if (!S_ISDIR(sb.st_mode)) {
			rc = ENOTDIR;
			goto done;
		}

		rc = zap_lookup(spa, &dn, element, sizeof (objnum), 1, &objnum);
		if (rc)
			goto done;
		objnum = ZFS_DIRENT_OBJ(objnum);

		if ((entry = malloc(sizeof(struct obj_list))) == NULL) {
			rc = ENOMEM;
			goto done;
		}
		entry->objnum = objnum;
		STAILQ_INSERT_HEAD(&on_cache, entry, entry);
		rc = objset_get_dnode(spa, &mount->objset, objnum, &dn);
		if (rc)
			goto done;

		/*
		 * Check for symlink.
		 */
		rc = zfs_dnode_stat(spa, &dn, &sb);
		if (rc)
			goto done;
		if (S_ISLNK(sb.st_mode)) {
			if (symlinks_followed > 10) {
				rc = EMLINK;
				goto done;
			}
			symlinks_followed++;

			/*
			 * Read the link value and copy the tail of our
			 * current path onto the end.
			 */
			if (sb.st_size + strlen(p) + 1 > sizeof(path)) {
				rc = ENAMETOOLONG;
				goto done;
			}
			strcpy(&path[sb.st_size], p);

			rc = zfs_dnode_readlink(spa, &dn, path, sb.st_size);
			if (rc != 0)
				goto done;

			/*
			 * Restart with the new path, starting either at
			 * the root or at the parent depending whether or
			 * not the link is relative.
			 */
			p = path;
			if (*p == '/') {
				while (STAILQ_FIRST(&on_cache) !=
				    STAILQ_LAST(&on_cache, obj_list, entry)) {
					entry = STAILQ_FIRST(&on_cache);
					STAILQ_REMOVE_HEAD(&on_cache, entry);
					free(entry);
				}
			} else {
				entry = STAILQ_FIRST(&on_cache);
				STAILQ_REMOVE_HEAD(&on_cache, entry);
				free(entry);
			}
			objnum = (STAILQ_FIRST(&on_cache))->objnum;
		}
	}

	*dnode = dn;
done:
	STAILQ_FOREACH_SAFE(entry, &on_cache, entry, tentry)
		free(entry);
	return (rc);
}
