1/*-
2 * Copyright (c) 2007 Doug Rabson
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27#include <sys/cdefs.h>
28__FBSDID("$FreeBSD: stable/11/stand/libsa/zfs/zfsimpl.c 346477 2019-04-21 03:43:27Z kevans $");
29
30/*
31 *	Stand-alone ZFS file reader.
32 */
33
34#include <sys/stat.h>
35#include <sys/stdint.h>
36
37#include "zfsimpl.h"
38#include "zfssubr.c"
39
40
41struct zfsmount {
42	const spa_t	*spa;
43	objset_phys_t	objset;
44	uint64_t	rootobj;
45};
46static struct zfsmount zfsmount __unused;
47
48/*
49 * List of all vdevs, chained through v_alllink.
50 */
51static vdev_list_t zfs_vdevs;
52
53 /*
54 * List of ZFS features supported for read
55 */
56static const char *features_for_read[] = {
57	"org.illumos:lz4_compress",
58	"com.delphix:hole_birth",
59	"com.delphix:extensible_dataset",
60	"com.delphix:embedded_data",
61	"org.open-zfs:large_blocks",
62	"org.illumos:sha512",
63	"org.illumos:skein",
64	"org.zfsonlinux:large_dnode",
65	"com.joyent:multi_vdev_crash_dump",
66	NULL
67};
68
69/*
70 * List of all pools, chained through spa_link.
71 */
72static spa_list_t zfs_pools;
73
74static const dnode_phys_t *dnode_cache_obj;
75static uint64_t dnode_cache_bn;
76static char *dnode_cache_buf;
77static char *zap_scratch;
78static char *zfs_temp_buf, *zfs_temp_end, *zfs_temp_ptr;
79
80#define TEMP_SIZE	(1024 * 1024)
81
82static int zio_read(const spa_t *spa, const blkptr_t *bp, void *buf);
83static int zfs_get_root(const spa_t *spa, uint64_t *objid);
84static int zfs_rlookup(const spa_t *spa, uint64_t objnum, char *result);
85static int zap_lookup(const spa_t *spa, const dnode_phys_t *dnode,
86    const char *name, uint64_t integer_size, uint64_t num_integers,
87    void *value);
88
89static void
90zfs_init(void)
91{
92	STAILQ_INIT(&zfs_vdevs);
93	STAILQ_INIT(&zfs_pools);
94
95	zfs_temp_buf = malloc(TEMP_SIZE);
96	zfs_temp_end = zfs_temp_buf + TEMP_SIZE;
97	zfs_temp_ptr = zfs_temp_buf;
98	dnode_cache_buf = malloc(SPA_MAXBLOCKSIZE);
99	zap_scratch = malloc(SPA_MAXBLOCKSIZE);
100
101	zfs_init_crc();
102}
103
104static void *
105zfs_alloc(size_t size)
106{
107	char *ptr;
108
109	if (zfs_temp_ptr + size > zfs_temp_end) {
110		printf("ZFS: out of temporary buffer space\n");
111		for (;;) ;
112	}
113	ptr = zfs_temp_ptr;
114	zfs_temp_ptr += size;
115
116	return (ptr);
117}
118
119static void
120zfs_free(void *ptr, size_t size)
121{
122
123	zfs_temp_ptr -= size;
124	if (zfs_temp_ptr != ptr) {
125		printf("ZFS: zfs_alloc()/zfs_free() mismatch\n");
126		for (;;) ;
127	}
128}
129
/*
 * Decode one big-endian 32-bit integer from the byte stream at *xdr into
 * *ip and advance *xdr by four bytes.  Always returns 0.
 */
static int
xdr_int(const unsigned char **xdr, int *ip)
{
	const unsigned char *p = *xdr;
	uint32_t v;

	/*
	 * Assemble the value as unsigned: a byte promoted to int and shifted
	 * left by 24 overflows (undefined behaviour) when its top bit is set.
	 */
	v = ((uint32_t)p[0] << 24)
		| ((uint32_t)p[1] << 16)
		| ((uint32_t)p[2] << 8)
		| ((uint32_t)p[3] << 0);
	*ip = (int)v;
	*xdr = p + 4;
	return (0);
}
140
141static int
142xdr_u_int(const unsigned char **xdr, u_int *ip)
143{
144	*ip = ((*xdr)[0] << 24)
145		| ((*xdr)[1] << 16)
146		| ((*xdr)[2] << 8)
147		| ((*xdr)[3] << 0);
148	(*xdr) += 4;
149	return (0);
150}
151
152static int
153xdr_uint64_t(const unsigned char **xdr, uint64_t *lp)
154{
155	u_int hi, lo;
156
157	xdr_u_int(xdr, &hi);
158	xdr_u_int(xdr, &lo);
159	*lp = (((uint64_t) hi) << 32) | lo;
160	return (0);
161}
162
163static int
164nvlist_find(const unsigned char *nvlist, const char *name, int type,
165	    int* elementsp, void *valuep)
166{
167	const unsigned char *p, *pair;
168	int junk;
169	int encoded_size, decoded_size;
170
171	p = nvlist;
172	xdr_int(&p, &junk);
173	xdr_int(&p, &junk);
174
175	pair = p;
176	xdr_int(&p, &encoded_size);
177	xdr_int(&p, &decoded_size);
178	while (encoded_size && decoded_size) {
179		int namelen, pairtype, elements;
180		const char *pairname;
181
182		xdr_int(&p, &namelen);
183		pairname = (const char*) p;
184		p += roundup(namelen, 4);
185		xdr_int(&p, &pairtype);
186
187		if (!memcmp(name, pairname, namelen) && type == pairtype) {
188			xdr_int(&p, &elements);
189			if (elementsp)
190				*elementsp = elements;
191			if (type == DATA_TYPE_UINT64) {
192				xdr_uint64_t(&p, (uint64_t *) valuep);
193				return (0);
194			} else if (type == DATA_TYPE_STRING) {
195				int len;
196				xdr_int(&p, &len);
197				(*(const char**) valuep) = (const char*) p;
198				return (0);
199			} else if (type == DATA_TYPE_NVLIST
200				   || type == DATA_TYPE_NVLIST_ARRAY) {
201				(*(const unsigned char**) valuep) =
202					 (const unsigned char*) p;
203				return (0);
204			} else {
205				return (EIO);
206			}
207		} else {
208			/*
209			 * Not the pair we are looking for, skip to the next one.
210			 */
211			p = pair + encoded_size;
212		}
213
214		pair = p;
215		xdr_int(&p, &encoded_size);
216		xdr_int(&p, &decoded_size);
217	}
218
219	return (EIO);
220}
221
222static int
223nvlist_check_features_for_read(const unsigned char *nvlist)
224{
225	const unsigned char *p, *pair;
226	int junk;
227	int encoded_size, decoded_size;
228	int rc;
229
230	rc = 0;
231
232	p = nvlist;
233	xdr_int(&p, &junk);
234	xdr_int(&p, &junk);
235
236	pair = p;
237	xdr_int(&p, &encoded_size);
238	xdr_int(&p, &decoded_size);
239	while (encoded_size && decoded_size) {
240		int namelen, pairtype;
241		const char *pairname;
242		int i, found;
243
244		found = 0;
245
246		xdr_int(&p, &namelen);
247		pairname = (const char*) p;
248		p += roundup(namelen, 4);
249		xdr_int(&p, &pairtype);
250
251		for (i = 0; features_for_read[i] != NULL; i++) {
252			if (!memcmp(pairname, features_for_read[i], namelen)) {
253				found = 1;
254				break;
255			}
256		}
257
258		if (!found) {
259			printf("ZFS: unsupported feature: %s\n", pairname);
260			rc = EIO;
261		}
262
263		p = pair + encoded_size;
264
265		pair = p;
266		xdr_int(&p, &encoded_size);
267		xdr_int(&p, &decoded_size);
268	}
269
270	return (rc);
271}
272
/*
 * Step over one encoded nvlist and return the address just past its
 * terminating pair header, which is where the next list of an nvlist
 * array starts.
 */
static const unsigned char *
nvlist_next(const unsigned char *nvlist)
{
	const unsigned char *cursor, *pair_start;
	int ignored;
	int enc_len, dec_len;

	/* Two leading integers are read and discarded. */
	cursor = nvlist;
	xdr_int(&cursor, &ignored);
	xdr_int(&cursor, &ignored);

	for (;;) {
		pair_start = cursor;
		xdr_int(&cursor, &enc_len);
		xdr_int(&cursor, &dec_len);

		/* A zero size marks the end of the list. */
		if (enc_len == 0 || dec_len == 0)
			break;

		cursor = pair_start + enc_len;
	}

	return cursor;
}
300
301#ifdef TEST
302
303static const unsigned char *
304nvlist_print(const unsigned char *nvlist, unsigned int indent)
305{
306	static const char* typenames[] = {
307		"DATA_TYPE_UNKNOWN",
308		"DATA_TYPE_BOOLEAN",
309		"DATA_TYPE_BYTE",
310		"DATA_TYPE_INT16",
311		"DATA_TYPE_UINT16",
312		"DATA_TYPE_INT32",
313		"DATA_TYPE_UINT32",
314		"DATA_TYPE_INT64",
315		"DATA_TYPE_UINT64",
316		"DATA_TYPE_STRING",
317		"DATA_TYPE_BYTE_ARRAY",
318		"DATA_TYPE_INT16_ARRAY",
319		"DATA_TYPE_UINT16_ARRAY",
320		"DATA_TYPE_INT32_ARRAY",
321		"DATA_TYPE_UINT32_ARRAY",
322		"DATA_TYPE_INT64_ARRAY",
323		"DATA_TYPE_UINT64_ARRAY",
324		"DATA_TYPE_STRING_ARRAY",
325		"DATA_TYPE_HRTIME",
326		"DATA_TYPE_NVLIST",
327		"DATA_TYPE_NVLIST_ARRAY",
328		"DATA_TYPE_BOOLEAN_VALUE",
329		"DATA_TYPE_INT8",
330		"DATA_TYPE_UINT8",
331		"DATA_TYPE_BOOLEAN_ARRAY",
332		"DATA_TYPE_INT8_ARRAY",
333		"DATA_TYPE_UINT8_ARRAY"
334	};
335
336	unsigned int i, j;
337	const unsigned char *p, *pair;
338	int junk;
339	int encoded_size, decoded_size;
340
341	p = nvlist;
342	xdr_int(&p, &junk);
343	xdr_int(&p, &junk);
344
345	pair = p;
346	xdr_int(&p, &encoded_size);
347	xdr_int(&p, &decoded_size);
348	while (encoded_size && decoded_size) {
349		int namelen, pairtype, elements;
350		const char *pairname;
351
352		xdr_int(&p, &namelen);
353		pairname = (const char*) p;
354		p += roundup(namelen, 4);
355		xdr_int(&p, &pairtype);
356
357		for (i = 0; i < indent; i++)
358			printf(" ");
359		printf("%s %s", typenames[pairtype], pairname);
360
361		xdr_int(&p, &elements);
362		switch (pairtype) {
363		case DATA_TYPE_UINT64: {
364			uint64_t val;
365			xdr_uint64_t(&p, &val);
366			printf(" = 0x%jx\n", (uintmax_t)val);
367			break;
368		}
369
370		case DATA_TYPE_STRING: {
371			int len;
372			xdr_int(&p, &len);
373			printf(" = \"%s\"\n", p);
374			break;
375		}
376
377		case DATA_TYPE_NVLIST:
378			printf("\n");
379			nvlist_print(p, indent + 1);
380			break;
381
382		case DATA_TYPE_NVLIST_ARRAY:
383			for (j = 0; j < elements; j++) {
384				printf("[%d]\n", j);
385				p = nvlist_print(p, indent + 1);
386				if (j != elements - 1) {
387					for (i = 0; i < indent; i++)
388						printf(" ");
389					printf("%s %s", typenames[pairtype], pairname);
390				}
391			}
392			break;
393
394		default:
395			printf("\n");
396		}
397
398		p = pair + encoded_size;
399
400		pair = p;
401		xdr_int(&p, &encoded_size);
402		xdr_int(&p, &decoded_size);
403	}
404
405	return p;
406}
407
408#endif
409
410static int
411vdev_read_phys(vdev_t *vdev, const blkptr_t *bp, void *buf,
412    off_t offset, size_t size)
413{
414	size_t psize;
415	int rc;
416
417	if (!vdev->v_phys_read)
418		return (EIO);
419
420	if (bp) {
421		psize = BP_GET_PSIZE(bp);
422	} else {
423		psize = size;
424	}
425
426	/*printf("ZFS: reading %zu bytes at 0x%jx to %p\n", psize, (uintmax_t)offset, buf);*/
427	rc = vdev->v_phys_read(vdev, vdev->v_read_priv, offset, buf, psize);
428	if (rc)
429		return (rc);
430	if (bp && zio_checksum_verify(vdev->spa, bp, buf))
431		return (EIO);
432
433	return (0);
434}
435
436static int
437vdev_disk_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
438    off_t offset, size_t bytes)
439{
440
441	return (vdev_read_phys(vdev, bp, buf,
442		offset + VDEV_LABEL_START_SIZE, bytes));
443}
444
445
446static int
447vdev_mirror_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
448    off_t offset, size_t bytes)
449{
450	vdev_t *kid;
451	int rc;
452
453	rc = EIO;
454	STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
455		if (kid->v_state != VDEV_STATE_HEALTHY)
456			continue;
457		rc = kid->v_read(kid, bp, buf, offset, bytes);
458		if (!rc)
459			return (0);
460	}
461
462	return (rc);
463}
464
465static int
466vdev_replacing_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
467    off_t offset, size_t bytes)
468{
469	vdev_t *kid;
470
471	/*
472	 * Here we should have two kids:
473	 * First one which is the one we are replacing and we can trust
474	 * only this one to have valid data, but it might not be present.
475	 * Second one is that one we are replacing with. It is most likely
476	 * healthy, but we can't trust it has needed data, so we won't use it.
477	 */
478	kid = STAILQ_FIRST(&vdev->v_children);
479	if (kid == NULL)
480		return (EIO);
481	if (kid->v_state != VDEV_STATE_HEALTHY)
482		return (EIO);
483	return (kid->v_read(kid, bp, buf, offset, bytes));
484}
485
486static vdev_t *
487vdev_find(uint64_t guid)
488{
489	vdev_t *vdev;
490
491	STAILQ_FOREACH(vdev, &zfs_vdevs, v_alllink)
492		if (vdev->v_guid == guid)
493			return (vdev);
494
495	return (0);
496}
497
498static vdev_t *
499vdev_create(uint64_t guid, vdev_read_t *_read)
500{
501	vdev_t *vdev;
502
503	vdev = malloc(sizeof(vdev_t));
504	memset(vdev, 0, sizeof(vdev_t));
505	STAILQ_INIT(&vdev->v_children);
506	vdev->v_guid = guid;
507	vdev->v_state = VDEV_STATE_OFFLINE;
508	vdev->v_read = _read;
509	vdev->v_phys_read = 0;
510	vdev->v_read_priv = 0;
511	STAILQ_INSERT_TAIL(&zfs_vdevs, vdev, v_alllink);
512
513	return (vdev);
514}
515
516static int
517vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t *pvdev,
518    vdev_t **vdevp, int is_newer)
519{
520	int rc;
521	uint64_t guid, id, ashift, nparity;
522	const char *type;
523	const char *path;
524	vdev_t *vdev, *kid;
525	const unsigned char *kids;
526	int nkids, i, is_new;
527	uint64_t is_offline, is_faulted, is_degraded, is_removed, isnt_present;
528
529	if (nvlist_find(nvlist, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64,
530	    NULL, &guid)
531	    || nvlist_find(nvlist, ZPOOL_CONFIG_ID, DATA_TYPE_UINT64, NULL, &id)
532	    || nvlist_find(nvlist, ZPOOL_CONFIG_TYPE, DATA_TYPE_STRING,
533	    NULL, &type)) {
534		printf("ZFS: can't find vdev details\n");
535		return (ENOENT);
536	}
537
538	if (strcmp(type, VDEV_TYPE_MIRROR)
539	    && strcmp(type, VDEV_TYPE_DISK)
540#ifdef ZFS_TEST
541	    && strcmp(type, VDEV_TYPE_FILE)
542#endif
543	    && strcmp(type, VDEV_TYPE_RAIDZ)
544	    && strcmp(type, VDEV_TYPE_REPLACING)) {
545		printf("ZFS: can only boot from disk, mirror, raidz1, raidz2 and raidz3 vdevs\n");
546		return (EIO);
547	}
548
549	is_offline = is_removed = is_faulted = is_degraded = isnt_present = 0;
550
551	nvlist_find(nvlist, ZPOOL_CONFIG_OFFLINE, DATA_TYPE_UINT64, NULL,
552			&is_offline);
553	nvlist_find(nvlist, ZPOOL_CONFIG_REMOVED, DATA_TYPE_UINT64, NULL,
554			&is_removed);
555	nvlist_find(nvlist, ZPOOL_CONFIG_FAULTED, DATA_TYPE_UINT64, NULL,
556			&is_faulted);
557	nvlist_find(nvlist, ZPOOL_CONFIG_DEGRADED, DATA_TYPE_UINT64, NULL,
558			&is_degraded);
559	nvlist_find(nvlist, ZPOOL_CONFIG_NOT_PRESENT, DATA_TYPE_UINT64, NULL,
560			&isnt_present);
561
562	vdev = vdev_find(guid);
563	if (!vdev) {
564		is_new = 1;
565
566		if (!strcmp(type, VDEV_TYPE_MIRROR))
567			vdev = vdev_create(guid, vdev_mirror_read);
568		else if (!strcmp(type, VDEV_TYPE_RAIDZ))
569			vdev = vdev_create(guid, vdev_raidz_read);
570		else if (!strcmp(type, VDEV_TYPE_REPLACING))
571			vdev = vdev_create(guid, vdev_replacing_read);
572		else
573			vdev = vdev_create(guid, vdev_disk_read);
574
575		vdev->v_id = id;
576		vdev->v_top = pvdev != NULL ? pvdev : vdev;
577		if (nvlist_find(nvlist, ZPOOL_CONFIG_ASHIFT,
578			DATA_TYPE_UINT64, NULL, &ashift) == 0) {
579			vdev->v_ashift = ashift;
580		} else {
581			vdev->v_ashift = 0;
582		}
583		if (nvlist_find(nvlist, ZPOOL_CONFIG_NPARITY,
584			DATA_TYPE_UINT64, NULL, &nparity) == 0) {
585			vdev->v_nparity = nparity;
586		} else {
587			vdev->v_nparity = 0;
588		}
589		if (nvlist_find(nvlist, ZPOOL_CONFIG_PATH,
590				DATA_TYPE_STRING, NULL, &path) == 0) {
591			if (strncmp(path, "/dev/", 5) == 0)
592				path += 5;
593			vdev->v_name = strdup(path);
594		} else {
595			if (!strcmp(type, "raidz")) {
596				if (vdev->v_nparity == 1)
597					vdev->v_name = "raidz1";
598				else if (vdev->v_nparity == 2)
599					vdev->v_name = "raidz2";
600				else if (vdev->v_nparity == 3)
601					vdev->v_name = "raidz3";
602				else {
603					printf("ZFS: can only boot from disk, mirror, raidz1, raidz2 and raidz3 vdevs\n");
604					return (EIO);
605				}
606			} else {
607				vdev->v_name = strdup(type);
608			}
609		}
610	} else {
611		is_new = 0;
612	}
613
614	if (is_new || is_newer) {
615		/*
616		 * This is either new vdev or we've already seen this vdev,
617		 * but from an older vdev label, so let's refresh its state
618		 * from the newer label.
619		 */
620		if (is_offline)
621			vdev->v_state = VDEV_STATE_OFFLINE;
622		else if (is_removed)
623			vdev->v_state = VDEV_STATE_REMOVED;
624		else if (is_faulted)
625			vdev->v_state = VDEV_STATE_FAULTED;
626		else if (is_degraded)
627			vdev->v_state = VDEV_STATE_DEGRADED;
628		else if (isnt_present)
629			vdev->v_state = VDEV_STATE_CANT_OPEN;
630	}
631
632	rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY,
633	    &nkids, &kids);
634	/*
635	 * Its ok if we don't have any kids.
636	 */
637	if (rc == 0) {
638		vdev->v_nchildren = nkids;
639		for (i = 0; i < nkids; i++) {
640			rc = vdev_init_from_nvlist(kids, vdev, &kid, is_newer);
641			if (rc)
642				return (rc);
643			if (is_new)
644				STAILQ_INSERT_TAIL(&vdev->v_children, kid,
645						   v_childlink);
646			kids = nvlist_next(kids);
647		}
648	} else {
649		vdev->v_nchildren = 0;
650	}
651
652	if (vdevp)
653		*vdevp = vdev;
654	return (0);
655}
656
657static void
658vdev_set_state(vdev_t *vdev)
659{
660	vdev_t *kid;
661	int good_kids;
662	int bad_kids;
663
664	/*
665	 * A mirror or raidz is healthy if all its kids are healthy. A
666	 * mirror is degraded if any of its kids is healthy; a raidz
667	 * is degraded if at most nparity kids are offline.
668	 */
669	if (STAILQ_FIRST(&vdev->v_children)) {
670		good_kids = 0;
671		bad_kids = 0;
672		STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
673			if (kid->v_state == VDEV_STATE_HEALTHY)
674				good_kids++;
675			else
676				bad_kids++;
677		}
678		if (bad_kids == 0) {
679			vdev->v_state = VDEV_STATE_HEALTHY;
680		} else {
681			if (vdev->v_read == vdev_mirror_read) {
682				if (good_kids) {
683					vdev->v_state = VDEV_STATE_DEGRADED;
684				} else {
685					vdev->v_state = VDEV_STATE_OFFLINE;
686				}
687			} else if (vdev->v_read == vdev_raidz_read) {
688				if (bad_kids > vdev->v_nparity) {
689					vdev->v_state = VDEV_STATE_OFFLINE;
690				} else {
691					vdev->v_state = VDEV_STATE_DEGRADED;
692				}
693			}
694		}
695	}
696}
697
698static spa_t *
699spa_find_by_guid(uint64_t guid)
700{
701	spa_t *spa;
702
703	STAILQ_FOREACH(spa, &zfs_pools, spa_link)
704		if (spa->spa_guid == guid)
705			return (spa);
706
707	return (0);
708}
709
710static spa_t *
711spa_find_by_name(const char *name)
712{
713	spa_t *spa;
714
715	STAILQ_FOREACH(spa, &zfs_pools, spa_link)
716		if (!strcmp(spa->spa_name, name))
717			return (spa);
718
719	return (0);
720}
721
722#ifdef BOOT2
723static spa_t *
724spa_get_primary(void)
725{
726
727	return (STAILQ_FIRST(&zfs_pools));
728}
729
730static vdev_t *
731spa_get_primary_vdev(const spa_t *spa)
732{
733	vdev_t *vdev;
734	vdev_t *kid;
735
736	if (spa == NULL)
737		spa = spa_get_primary();
738	if (spa == NULL)
739		return (NULL);
740	vdev = STAILQ_FIRST(&spa->spa_vdevs);
741	if (vdev == NULL)
742		return (NULL);
743	for (kid = STAILQ_FIRST(&vdev->v_children); kid != NULL;
744	     kid = STAILQ_FIRST(&vdev->v_children))
745		vdev = kid;
746	return (vdev);
747}
748#endif
749
750static spa_t *
751spa_create(uint64_t guid, const char *name)
752{
753	spa_t *spa;
754
755	if ((spa = malloc(sizeof(spa_t))) == NULL)
756		return (NULL);
757	memset(spa, 0, sizeof(spa_t));
758	if ((spa->spa_name = strdup(name)) == NULL) {
759		free(spa);
760		return (NULL);
761	}
762	STAILQ_INIT(&spa->spa_vdevs);
763	spa->spa_guid = guid;
764	STAILQ_INSERT_TAIL(&zfs_pools, spa, spa_link);
765
766	return (spa);
767}
768
769static const char *
770state_name(vdev_state_t state)
771{
772	static const char* names[] = {
773		"UNKNOWN",
774		"CLOSED",
775		"OFFLINE",
776		"REMOVED",
777		"CANT_OPEN",
778		"FAULTED",
779		"DEGRADED",
780		"ONLINE"
781	};
782	return names[state];
783}
784
785#ifdef BOOT2
786
787#define pager_printf printf
788
789#else
790
/*
 * printf-style front end to pager_output(): format one line and hand it
 * to the pager.  Output longer than the line buffer is truncated.
 * Returns whatever pager_output() returns.
 */
static int
pager_printf(const char *fmt, ...)
{
	char line[80];
	va_list args;

	va_start(args, fmt);
	/*
	 * Bound the formatting: names taken from on-disk labels can make the
	 * result longer than the buffer, which vsprintf() would overrun.
	 */
	vsnprintf(line, sizeof(line), fmt, args);
	va_end(args);

	return (pager_output(line));
}
803
804#endif
805
806#define STATUS_FORMAT	"        %s %s\n"
807
808static int
809print_state(int indent, const char *name, vdev_state_t state)
810{
811	char buf[512];
812	int i;
813
814	buf[0] = 0;
815	for (i = 0; i < indent; i++)
816		strcat(buf, "  ");
817	strcat(buf, name);
818
819	return (pager_printf(STATUS_FORMAT, buf, state_name(state)));
820}
821
822static int
823vdev_status(vdev_t *vdev, int indent)
824{
825	vdev_t *kid;
826	int ret;
827	ret = print_state(indent, vdev->v_name, vdev->v_state);
828	if (ret != 0)
829		return (ret);
830
831	STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
832		ret = vdev_status(kid, indent + 1);
833		if (ret != 0)
834			return (ret);
835	}
836	return (ret);
837}
838
839static int
840spa_status(spa_t *spa)
841{
842	static char bootfs[ZFS_MAXNAMELEN];
843	uint64_t rootid;
844	vdev_t *vdev;
845	int good_kids, bad_kids, degraded_kids, ret;
846	vdev_state_t state;
847
848	ret = pager_printf("  pool: %s\n", spa->spa_name);
849	if (ret != 0)
850		return (ret);
851
852	if (zfs_get_root(spa, &rootid) == 0 &&
853	    zfs_rlookup(spa, rootid, bootfs) == 0) {
854		if (bootfs[0] == '\0')
855			ret = pager_printf("bootfs: %s\n", spa->spa_name);
856		else
857			ret = pager_printf("bootfs: %s/%s\n", spa->spa_name,
858			    bootfs);
859		if (ret != 0)
860			return (ret);
861	}
862	ret = pager_printf("config:\n\n");
863	if (ret != 0)
864		return (ret);
865	ret = pager_printf(STATUS_FORMAT, "NAME", "STATE");
866	if (ret != 0)
867		return (ret);
868
869	good_kids = 0;
870	degraded_kids = 0;
871	bad_kids = 0;
872	STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
873		if (vdev->v_state == VDEV_STATE_HEALTHY)
874			good_kids++;
875		else if (vdev->v_state == VDEV_STATE_DEGRADED)
876			degraded_kids++;
877		else
878			bad_kids++;
879	}
880
881	state = VDEV_STATE_CLOSED;
882	if (good_kids > 0 && (degraded_kids + bad_kids) == 0)
883		state = VDEV_STATE_HEALTHY;
884	else if ((good_kids + degraded_kids) > 0)
885		state = VDEV_STATE_DEGRADED;
886
887	ret = print_state(0, spa->spa_name, state);
888	if (ret != 0)
889		return (ret);
890	STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
891		ret = vdev_status(vdev, 1);
892		if (ret != 0)
893			return (ret);
894	}
895	return (ret);
896}
897
898static int
899spa_all_status(void)
900{
901	spa_t *spa;
902	int first = 1, ret = 0;
903
904	STAILQ_FOREACH(spa, &zfs_pools, spa_link) {
905		if (!first) {
906			ret = pager_printf("\n");
907			if (ret != 0)
908				return (ret);
909		}
910		first = 0;
911		ret = spa_status(spa);
912		if (ret != 0)
913			return (ret);
914	}
915	return (ret);
916}
917
918static uint64_t
919vdev_label_offset(uint64_t psize, int l, uint64_t offset)
920{
921	uint64_t label_offset;
922
923	if (l < VDEV_LABELS / 2)
924		label_offset = 0;
925	else
926		label_offset = psize - VDEV_LABELS * sizeof (vdev_label_t);
927
928	return (offset + l * sizeof (vdev_label_t) + label_offset);
929}
930
931static int
932vdev_probe(vdev_phys_read_t *_read, void *read_priv, spa_t **spap)
933{
934	vdev_t vtmp;
935	vdev_phys_t *vdev_label = (vdev_phys_t *) zap_scratch;
936	vdev_phys_t *tmp_label;
937	spa_t *spa;
938	vdev_t *vdev, *top_vdev, *pool_vdev;
939	off_t off;
940	blkptr_t bp;
941	const unsigned char *nvlist = NULL;
942	uint64_t val;
943	uint64_t guid;
944	uint64_t best_txg = 0;
945	uint64_t pool_txg, pool_guid;
946	uint64_t psize;
947	const char *pool_name;
948	const unsigned char *vdevs;
949	const unsigned char *features;
950	int i, l, rc, is_newer;
951	char *upbuf;
952	const struct uberblock *up;
953
954	/*
955	 * Load the vdev label and figure out which
956	 * uberblock is most current.
957	 */
958	memset(&vtmp, 0, sizeof(vtmp));
959	vtmp.v_phys_read = _read;
960	vtmp.v_read_priv = read_priv;
961	psize = P2ALIGN(ldi_get_size(read_priv),
962	    (uint64_t)sizeof (vdev_label_t));
963
964	/* Test for minimum pool size. */
965	if (psize < SPA_MINDEVSIZE)
966		return (EIO);
967
968	tmp_label = zfs_alloc(sizeof(vdev_phys_t));
969
970	for (l = 0; l < VDEV_LABELS; l++) {
971		off = vdev_label_offset(psize, l,
972		    offsetof(vdev_label_t, vl_vdev_phys));
973
974		BP_ZERO(&bp);
975		BP_SET_LSIZE(&bp, sizeof(vdev_phys_t));
976		BP_SET_PSIZE(&bp, sizeof(vdev_phys_t));
977		BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
978		BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
979		DVA_SET_OFFSET(BP_IDENTITY(&bp), off);
980		ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
981
982		if (vdev_read_phys(&vtmp, &bp, tmp_label, off, 0))
983			continue;
984
985		if (tmp_label->vp_nvlist[0] != NV_ENCODE_XDR)
986			continue;
987
988		nvlist = (const unsigned char *) tmp_label->vp_nvlist + 4;
989		if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_TXG,
990		    DATA_TYPE_UINT64, NULL, &pool_txg) != 0)
991			continue;
992
993		if (best_txg <= pool_txg) {
994			best_txg = pool_txg;
995			memcpy(vdev_label, tmp_label, sizeof (vdev_phys_t));
996		}
997	}
998
999	zfs_free(tmp_label, sizeof (vdev_phys_t));
1000
1001	if (best_txg == 0)
1002		return (EIO);
1003
1004	if (vdev_label->vp_nvlist[0] != NV_ENCODE_XDR)
1005		return (EIO);
1006
1007	nvlist = (const unsigned char *) vdev_label->vp_nvlist + 4;
1008
1009	if (nvlist_find(nvlist, ZPOOL_CONFIG_VERSION, DATA_TYPE_UINT64,
1010	    NULL, &val) != 0) {
1011		return (EIO);
1012	}
1013
1014	if (!SPA_VERSION_IS_SUPPORTED(val)) {
1015		printf("ZFS: unsupported ZFS version %u (should be %u)\n",
1016		    (unsigned) val, (unsigned) SPA_VERSION);
1017		return (EIO);
1018	}
1019
1020	/* Check ZFS features for read */
1021	if (nvlist_find(nvlist, ZPOOL_CONFIG_FEATURES_FOR_READ,
1022	    DATA_TYPE_NVLIST, NULL, &features) == 0 &&
1023	    nvlist_check_features_for_read(features) != 0) {
1024		return (EIO);
1025	}
1026
1027	if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_STATE, DATA_TYPE_UINT64,
1028	    NULL, &val) != 0) {
1029		return (EIO);
1030	}
1031
1032	if (val == POOL_STATE_DESTROYED) {
1033		/* We don't boot only from destroyed pools. */
1034		return (EIO);
1035	}
1036
1037	if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_TXG, DATA_TYPE_UINT64,
1038	    NULL, &pool_txg) != 0 ||
1039	    nvlist_find(nvlist, ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64,
1040	    NULL, &pool_guid) != 0 ||
1041	    nvlist_find(nvlist, ZPOOL_CONFIG_POOL_NAME, DATA_TYPE_STRING,
1042	    NULL, &pool_name) != 0) {
1043		/*
1044		 * Cache and spare devices end up here - just ignore
1045		 * them.
1046		 */
1047		/*printf("ZFS: can't find pool details\n");*/
1048		return (EIO);
1049	}
1050
1051	if (nvlist_find(nvlist, ZPOOL_CONFIG_IS_LOG, DATA_TYPE_UINT64,
1052	    NULL, &val) == 0 && val != 0) {
1053		return (EIO);
1054	}
1055
1056	/*
1057	 * Create the pool if this is the first time we've seen it.
1058	 */
1059	spa = spa_find_by_guid(pool_guid);
1060	if (spa == NULL) {
1061		spa = spa_create(pool_guid, pool_name);
1062		if (spa == NULL)
1063			return (ENOMEM);
1064	}
1065	if (pool_txg > spa->spa_txg) {
1066		spa->spa_txg = pool_txg;
1067		is_newer = 1;
1068	} else {
1069		is_newer = 0;
1070	}
1071
1072	/*
1073	 * Get the vdev tree and create our in-core copy of it.
1074	 * If we already have a vdev with this guid, this must
1075	 * be some kind of alias (overlapping slices, dangerously dedicated
1076	 * disks etc).
1077	 */
1078	if (nvlist_find(nvlist, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64,
1079	    NULL, &guid) != 0) {
1080		return (EIO);
1081	}
1082	vdev = vdev_find(guid);
1083	if (vdev && vdev->v_phys_read)	/* Has this vdev already been inited? */
1084		return (EIO);
1085
1086	if (nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST,
1087	    NULL, &vdevs)) {
1088		return (EIO);
1089	}
1090
1091	rc = vdev_init_from_nvlist(vdevs, NULL, &top_vdev, is_newer);
1092	if (rc != 0)
1093		return (rc);
1094
1095	/*
1096	 * Add the toplevel vdev to the pool if its not already there.
1097	 */
1098	STAILQ_FOREACH(pool_vdev, &spa->spa_vdevs, v_childlink)
1099		if (top_vdev == pool_vdev)
1100			break;
1101	if (!pool_vdev && top_vdev) {
1102		top_vdev->spa = spa;
1103		STAILQ_INSERT_TAIL(&spa->spa_vdevs, top_vdev, v_childlink);
1104	}
1105
1106	/*
1107	 * We should already have created an incomplete vdev for this
1108	 * vdev. Find it and initialise it with our read proc.
1109	 */
1110	vdev = vdev_find(guid);
1111	if (vdev) {
1112		vdev->v_phys_read = _read;
1113		vdev->v_read_priv = read_priv;
1114		vdev->v_state = VDEV_STATE_HEALTHY;
1115	} else {
1116		printf("ZFS: inconsistent nvlist contents\n");
1117		return (EIO);
1118	}
1119
1120	/*
1121	 * Re-evaluate top-level vdev state.
1122	 */
1123	vdev_set_state(top_vdev);
1124
1125	/*
1126	 * Ok, we are happy with the pool so far. Lets find
1127	 * the best uberblock and then we can actually access
1128	 * the contents of the pool.
1129	 */
1130	upbuf = zfs_alloc(VDEV_UBERBLOCK_SIZE(vdev));
1131	up = (const struct uberblock *)upbuf;
1132	for (l = 0; l < VDEV_LABELS; l++) {
1133		for (i = 0; i < VDEV_UBERBLOCK_COUNT(vdev); i++) {
1134			off = vdev_label_offset(psize, l,
1135			    VDEV_UBERBLOCK_OFFSET(vdev, i));
1136			BP_ZERO(&bp);
1137			DVA_SET_OFFSET(&bp.blk_dva[0], off);
1138			BP_SET_LSIZE(&bp, VDEV_UBERBLOCK_SIZE(vdev));
1139			BP_SET_PSIZE(&bp, VDEV_UBERBLOCK_SIZE(vdev));
1140			BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
1141			BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
1142			ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
1143
1144			if (vdev_read_phys(vdev, &bp, upbuf, off, 0))
1145				continue;
1146
1147			if (up->ub_magic != UBERBLOCK_MAGIC)
1148				continue;
1149			if (up->ub_txg < spa->spa_txg)
1150				continue;
1151			if (up->ub_txg > spa->spa_uberblock.ub_txg ||
1152			    (up->ub_txg == spa->spa_uberblock.ub_txg &&
1153			    up->ub_timestamp >
1154			    spa->spa_uberblock.ub_timestamp)) {
1155				spa->spa_uberblock = *up;
1156			}
1157		}
1158	}
1159	zfs_free(upbuf, VDEV_UBERBLOCK_SIZE(vdev));
1160
1161	vdev->spa = spa;
1162	if (spap != NULL)
1163		*spap = spa;
1164	return (0);
1165}
1166
/*
 * Return the base-2 logarithm of `n' when its bit pattern is an exact
 * power of two, and -1 otherwise (including for zero).
 */
static int
ilog2(int n)
{
	int v;

	/*
	 * Work on unsigned values: 1 << 31 overflows a signed int, which is
	 * undefined behaviour.
	 */
	for (v = 0; v < 32; v++)
		if ((unsigned int)n == (1U << v))
			return v;
	return -1;
}
1177
1178static int
1179zio_read_gang(const spa_t *spa, const blkptr_t *bp, void *buf)
1180{
1181	blkptr_t gbh_bp;
1182	zio_gbh_phys_t zio_gb;
1183	char *pbuf;
1184	int i;
1185
1186	/* Artificial BP for gang block header. */
1187	gbh_bp = *bp;
1188	BP_SET_PSIZE(&gbh_bp, SPA_GANGBLOCKSIZE);
1189	BP_SET_LSIZE(&gbh_bp, SPA_GANGBLOCKSIZE);
1190	BP_SET_CHECKSUM(&gbh_bp, ZIO_CHECKSUM_GANG_HEADER);
1191	BP_SET_COMPRESS(&gbh_bp, ZIO_COMPRESS_OFF);
1192	for (i = 0; i < SPA_DVAS_PER_BP; i++)
1193		DVA_SET_GANG(&gbh_bp.blk_dva[i], 0);
1194
1195	/* Read gang header block using the artificial BP. */
1196	if (zio_read(spa, &gbh_bp, &zio_gb))
1197		return (EIO);
1198
1199	pbuf = buf;
1200	for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
1201		blkptr_t *gbp = &zio_gb.zg_blkptr[i];
1202
1203		if (BP_IS_HOLE(gbp))
1204			continue;
1205		if (zio_read(spa, gbp, pbuf))
1206			return (EIO);
1207		pbuf += BP_GET_PSIZE(gbp);
1208	}
1209
1210	if (zio_checksum_verify(spa, bp, buf))
1211		return (EIO);
1212	return (0);
1213}
1214
1215static int
1216zio_read(const spa_t *spa, const blkptr_t *bp, void *buf)
1217{
1218	int cpfunc = BP_GET_COMPRESS(bp);
1219	uint64_t align, size;
1220	void *pbuf;
1221	int i, error;
1222
1223	/*
1224	 * Process data embedded in block pointer
1225	 */
1226	if (BP_IS_EMBEDDED(bp)) {
1227		ASSERT(BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
1228
1229		size = BPE_GET_PSIZE(bp);
1230		ASSERT(size <= BPE_PAYLOAD_SIZE);
1231
1232		if (cpfunc != ZIO_COMPRESS_OFF)
1233			pbuf = zfs_alloc(size);
1234		else
1235			pbuf = buf;
1236
1237		decode_embedded_bp_compressed(bp, pbuf);
1238		error = 0;
1239
1240		if (cpfunc != ZIO_COMPRESS_OFF) {
1241			error = zio_decompress_data(cpfunc, pbuf,
1242			    size, buf, BP_GET_LSIZE(bp));
1243			zfs_free(pbuf, size);
1244		}
1245		if (error != 0)
1246			printf("ZFS: i/o error - unable to decompress block pointer data, error %d\n",
1247			    error);
1248		return (error);
1249	}
1250
1251	error = EIO;
1252
1253	for (i = 0; i < SPA_DVAS_PER_BP; i++) {
1254		const dva_t *dva = &bp->blk_dva[i];
1255		vdev_t *vdev;
1256		int vdevid;
1257		off_t offset;
1258
1259		if (!dva->dva_word[0] && !dva->dva_word[1])
1260			continue;
1261
1262		vdevid = DVA_GET_VDEV(dva);
1263		offset = DVA_GET_OFFSET(dva);
1264		STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
1265			if (vdev->v_id == vdevid)
1266				break;
1267		}
1268		if (!vdev || !vdev->v_read)
1269			continue;
1270
1271		size = BP_GET_PSIZE(bp);
1272		if (vdev->v_read == vdev_raidz_read) {
1273			align = 1ULL << vdev->v_top->v_ashift;
1274			if (P2PHASE(size, align) != 0)
1275				size = P2ROUNDUP(size, align);
1276		}
1277		if (size != BP_GET_PSIZE(bp) || cpfunc != ZIO_COMPRESS_OFF)
1278			pbuf = zfs_alloc(size);
1279		else
1280			pbuf = buf;
1281
1282		if (DVA_GET_GANG(dva))
1283			error = zio_read_gang(spa, bp, pbuf);
1284		else
1285			error = vdev->v_read(vdev, bp, pbuf, offset, size);
1286		if (error == 0) {
1287			if (cpfunc != ZIO_COMPRESS_OFF)
1288				error = zio_decompress_data(cpfunc, pbuf,
1289				    BP_GET_PSIZE(bp), buf, BP_GET_LSIZE(bp));
1290			else if (size != BP_GET_PSIZE(bp))
1291				bcopy(pbuf, buf, BP_GET_PSIZE(bp));
1292		}
1293		if (buf != pbuf)
1294			zfs_free(pbuf, size);
1295		if (error == 0)
1296			break;
1297	}
1298	if (error != 0)
1299		printf("ZFS: i/o error - all block copies unavailable\n");
1300	return (error);
1301}
1302
1303static int
1304dnode_read(const spa_t *spa, const dnode_phys_t *dnode, off_t offset, void *buf, size_t buflen)
1305{
1306	int ibshift = dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
1307	int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1308	int nlevels = dnode->dn_nlevels;
1309	int i, rc;
1310
1311	if (bsize > SPA_MAXBLOCKSIZE) {
1312		printf("ZFS: I/O error - blocks larger than %llu are not "
1313		    "supported\n", SPA_MAXBLOCKSIZE);
1314		return (EIO);
1315	}
1316
1317	/*
1318	 * Note: bsize may not be a power of two here so we need to do an
1319	 * actual divide rather than a bitshift.
1320	 */
1321	while (buflen > 0) {
1322		uint64_t bn = offset / bsize;
1323		int boff = offset % bsize;
1324		int ibn;
1325		const blkptr_t *indbp;
1326		blkptr_t bp;
1327
1328		if (bn > dnode->dn_maxblkid)
1329			return (EIO);
1330
1331		if (dnode == dnode_cache_obj && bn == dnode_cache_bn)
1332			goto cached;
1333
1334		indbp = dnode->dn_blkptr;
1335		for (i = 0; i < nlevels; i++) {
1336			/*
1337			 * Copy the bp from the indirect array so that
1338			 * we can re-use the scratch buffer for multi-level
1339			 * objects.
1340			 */
1341			ibn = bn >> ((nlevels - i - 1) * ibshift);
1342			ibn &= ((1 << ibshift) - 1);
1343			bp = indbp[ibn];
1344			if (BP_IS_HOLE(&bp)) {
1345				memset(dnode_cache_buf, 0, bsize);
1346				break;
1347			}
1348			rc = zio_read(spa, &bp, dnode_cache_buf);
1349			if (rc)
1350				return (rc);
1351			indbp = (const blkptr_t *) dnode_cache_buf;
1352		}
1353		dnode_cache_obj = dnode;
1354		dnode_cache_bn = bn;
1355	cached:
1356
1357		/*
1358		 * The buffer contains our data block. Copy what we
1359		 * need from it and loop.
1360		 */
1361		i = bsize - boff;
1362		if (i > buflen) i = buflen;
1363		memcpy(buf, &dnode_cache_buf[boff], i);
1364		buf = ((char*) buf) + i;
1365		offset += i;
1366		buflen -= i;
1367	}
1368
1369	return (0);
1370}
1371
1372/*
1373 * Lookup a value in a microzap directory. Assumes that the zap
1374 * scratch buffer contains the directory contents.
1375 */
1376static int
1377mzap_lookup(const dnode_phys_t *dnode, const char *name, uint64_t *value)
1378{
1379	const mzap_phys_t *mz;
1380	const mzap_ent_phys_t *mze;
1381	size_t size;
1382	int chunks, i;
1383
1384	/*
1385	 * Microzap objects use exactly one block. Read the whole
1386	 * thing.
1387	 */
1388	size = dnode->dn_datablkszsec * 512;
1389
1390	mz = (const mzap_phys_t *) zap_scratch;
1391	chunks = size / MZAP_ENT_LEN - 1;
1392
1393	for (i = 0; i < chunks; i++) {
1394		mze = &mz->mz_chunk[i];
1395		if (!strcmp(mze->mze_name, name)) {
1396			*value = mze->mze_value;
1397			return (0);
1398		}
1399	}
1400
1401	return (ENOENT);
1402}
1403
1404/*
1405 * Compare a name with a zap leaf entry. Return non-zero if the name
1406 * matches.
1407 */
1408static int
1409fzap_name_equal(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc, const char *name)
1410{
1411	size_t namelen;
1412	const zap_leaf_chunk_t *nc;
1413	const char *p;
1414
1415	namelen = zc->l_entry.le_name_numints;
1416
1417	nc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_name_chunk);
1418	p = name;
1419	while (namelen > 0) {
1420		size_t len;
1421		len = namelen;
1422		if (len > ZAP_LEAF_ARRAY_BYTES)
1423			len = ZAP_LEAF_ARRAY_BYTES;
1424		if (memcmp(p, nc->l_array.la_array, len))
1425			return (0);
1426		p += len;
1427		namelen -= len;
1428		nc = &ZAP_LEAF_CHUNK(zl, nc->l_array.la_next);
1429	}
1430
1431	return 1;
1432}
1433
1434/*
1435 * Extract a uint64_t value from a zap leaf entry.
1436 */
1437static uint64_t
1438fzap_leaf_value(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc)
1439{
1440	const zap_leaf_chunk_t *vc;
1441	int i;
1442	uint64_t value;
1443	const uint8_t *p;
1444
1445	vc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_value_chunk);
1446	for (i = 0, value = 0, p = vc->l_array.la_array; i < 8; i++) {
1447		value = (value << 8) | p[i];
1448	}
1449
1450	return value;
1451}
1452
/*
 * Store `value', truncated to `len' bytes, at `addr'.  Only widths of
 * 1, 2, 4 and 8 bytes are handled; any other width stores nothing.
 */
static void
stv(int len, void *addr, uint64_t value)
{
	if (len == 1)
		*(uint8_t *)addr = value;
	else if (len == 2)
		*(uint16_t *)addr = value;
	else if (len == 4)
		*(uint32_t *)addr = value;
	else if (len == 8)
		*(uint64_t *)addr = value;
}
1471
1472/*
1473 * Extract a array from a zap leaf entry.
1474 */
1475static void
1476fzap_leaf_array(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc,
1477    uint64_t integer_size, uint64_t num_integers, void *buf)
1478{
1479	uint64_t array_int_len = zc->l_entry.le_value_intlen;
1480	uint64_t value = 0;
1481	uint64_t *u64 = buf;
1482	char *p = buf;
1483	int len = MIN(zc->l_entry.le_value_numints, num_integers);
1484	int chunk = zc->l_entry.le_value_chunk;
1485	int byten = 0;
1486
1487	if (integer_size == 8 && len == 1) {
1488		*u64 = fzap_leaf_value(zl, zc);
1489		return;
1490	}
1491
1492	while (len > 0) {
1493		struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(zl, chunk).l_array;
1494		int i;
1495
1496		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(zl));
1497		for (i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) {
1498			value = (value << 8) | la->la_array[i];
1499			byten++;
1500			if (byten == array_int_len) {
1501				stv(integer_size, p, value);
1502				byten = 0;
1503				len--;
1504				if (len == 0)
1505					return;
1506				p += integer_size;
1507			}
1508		}
1509		chunk = la->la_next;
1510	}
1511}
1512
1513/*
1514 * Lookup a value in a fatzap directory. Assumes that the zap scratch
1515 * buffer contains the directory header.
1516 */
1517static int
1518fzap_lookup(const spa_t *spa, const dnode_phys_t *dnode, const char *name,
1519    uint64_t integer_size, uint64_t num_integers, void *value)
1520{
1521	int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1522	zap_phys_t zh = *(zap_phys_t *) zap_scratch;
1523	fat_zap_t z;
1524	uint64_t *ptrtbl;
1525	uint64_t hash;
1526	int rc;
1527
1528	if (zh.zap_magic != ZAP_MAGIC)
1529		return (EIO);
1530
1531	z.zap_block_shift = ilog2(bsize);
1532	z.zap_phys = (zap_phys_t *) zap_scratch;
1533
1534	/*
1535	 * Figure out where the pointer table is and read it in if necessary.
1536	 */
1537	if (zh.zap_ptrtbl.zt_blk) {
1538		rc = dnode_read(spa, dnode, zh.zap_ptrtbl.zt_blk * bsize,
1539			       zap_scratch, bsize);
1540		if (rc)
1541			return (rc);
1542		ptrtbl = (uint64_t *) zap_scratch;
1543	} else {
1544		ptrtbl = &ZAP_EMBEDDED_PTRTBL_ENT(&z, 0);
1545	}
1546
1547	hash = zap_hash(zh.zap_salt, name);
1548
1549	zap_leaf_t zl;
1550	zl.l_bs = z.zap_block_shift;
1551
1552	off_t off = ptrtbl[hash >> (64 - zh.zap_ptrtbl.zt_shift)] << zl.l_bs;
1553	zap_leaf_chunk_t *zc;
1554
1555	rc = dnode_read(spa, dnode, off, zap_scratch, bsize);
1556	if (rc)
1557		return (rc);
1558
1559	zl.l_phys = (zap_leaf_phys_t *) zap_scratch;
1560
1561	/*
1562	 * Make sure this chunk matches our hash.
1563	 */
1564	if (zl.l_phys->l_hdr.lh_prefix_len > 0
1565	    && zl.l_phys->l_hdr.lh_prefix
1566	    != hash >> (64 - zl.l_phys->l_hdr.lh_prefix_len))
1567		return (ENOENT);
1568
1569	/*
1570	 * Hash within the chunk to find our entry.
1571	 */
1572	int shift = (64 - ZAP_LEAF_HASH_SHIFT(&zl) - zl.l_phys->l_hdr.lh_prefix_len);
1573	int h = (hash >> shift) & ((1 << ZAP_LEAF_HASH_SHIFT(&zl)) - 1);
1574	h = zl.l_phys->l_hash[h];
1575	if (h == 0xffff)
1576		return (ENOENT);
1577	zc = &ZAP_LEAF_CHUNK(&zl, h);
1578	while (zc->l_entry.le_hash != hash) {
1579		if (zc->l_entry.le_next == 0xffff) {
1580			zc = NULL;
1581			break;
1582		}
1583		zc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_next);
1584	}
1585	if (fzap_name_equal(&zl, zc, name)) {
1586		if (zc->l_entry.le_value_intlen * zc->l_entry.le_value_numints >
1587		    integer_size * num_integers)
1588			return (E2BIG);
1589		fzap_leaf_array(&zl, zc, integer_size, num_integers, value);
1590		return (0);
1591	}
1592
1593	return (ENOENT);
1594}
1595
1596/*
1597 * Lookup a name in a zap object and return its value as a uint64_t.
1598 */
1599static int
1600zap_lookup(const spa_t *spa, const dnode_phys_t *dnode, const char *name,
1601    uint64_t integer_size, uint64_t num_integers, void *value)
1602{
1603	int rc;
1604	uint64_t zap_type;
1605	size_t size = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1606
1607	rc = dnode_read(spa, dnode, 0, zap_scratch, size);
1608	if (rc)
1609		return (rc);
1610
1611	zap_type = *(uint64_t *) zap_scratch;
1612	if (zap_type == ZBT_MICRO)
1613		return mzap_lookup(dnode, name, value);
1614	else if (zap_type == ZBT_HEADER) {
1615		return fzap_lookup(spa, dnode, name, integer_size,
1616		    num_integers, value);
1617	}
1618	printf("ZFS: invalid zap_type=%d\n", (int)zap_type);
1619	return (EIO);
1620}
1621
1622/*
1623 * List a microzap directory. Assumes that the zap scratch buffer contains
1624 * the directory contents.
1625 */
1626static int
1627mzap_list(const dnode_phys_t *dnode, int (*callback)(const char *, uint64_t))
1628{
1629	const mzap_phys_t *mz;
1630	const mzap_ent_phys_t *mze;
1631	size_t size;
1632	int chunks, i, rc;
1633
1634	/*
1635	 * Microzap objects use exactly one block. Read the whole
1636	 * thing.
1637	 */
1638	size = dnode->dn_datablkszsec * 512;
1639	mz = (const mzap_phys_t *) zap_scratch;
1640	chunks = size / MZAP_ENT_LEN - 1;
1641
1642	for (i = 0; i < chunks; i++) {
1643		mze = &mz->mz_chunk[i];
1644		if (mze->mze_name[0]) {
1645			rc = callback(mze->mze_name, mze->mze_value);
1646			if (rc != 0)
1647				return (rc);
1648		}
1649	}
1650
1651	return (0);
1652}
1653
1654/*
1655 * List a fatzap directory. Assumes that the zap scratch buffer contains
1656 * the directory header.
1657 */
1658static int
1659fzap_list(const spa_t *spa, const dnode_phys_t *dnode, int (*callback)(const char *, uint64_t))
1660{
1661	int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1662	zap_phys_t zh = *(zap_phys_t *) zap_scratch;
1663	fat_zap_t z;
1664	int i, j, rc;
1665
1666	if (zh.zap_magic != ZAP_MAGIC)
1667		return (EIO);
1668
1669	z.zap_block_shift = ilog2(bsize);
1670	z.zap_phys = (zap_phys_t *) zap_scratch;
1671
1672	/*
1673	 * This assumes that the leaf blocks start at block 1. The
1674	 * documentation isn't exactly clear on this.
1675	 */
1676	zap_leaf_t zl;
1677	zl.l_bs = z.zap_block_shift;
1678	for (i = 0; i < zh.zap_num_leafs; i++) {
1679		off_t off = (i + 1) << zl.l_bs;
1680		char name[256], *p;
1681		uint64_t value;
1682
1683		if (dnode_read(spa, dnode, off, zap_scratch, bsize))
1684			return (EIO);
1685
1686		zl.l_phys = (zap_leaf_phys_t *) zap_scratch;
1687
1688		for (j = 0; j < ZAP_LEAF_NUMCHUNKS(&zl); j++) {
1689			zap_leaf_chunk_t *zc, *nc;
1690			int namelen;
1691
1692			zc = &ZAP_LEAF_CHUNK(&zl, j);
1693			if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY)
1694				continue;
1695			namelen = zc->l_entry.le_name_numints;
1696			if (namelen > sizeof(name))
1697				namelen = sizeof(name);
1698
1699			/*
1700			 * Paste the name back together.
1701			 */
1702			nc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_name_chunk);
1703			p = name;
1704			while (namelen > 0) {
1705				int len;
1706				len = namelen;
1707				if (len > ZAP_LEAF_ARRAY_BYTES)
1708					len = ZAP_LEAF_ARRAY_BYTES;
1709				memcpy(p, nc->l_array.la_array, len);
1710				p += len;
1711				namelen -= len;
1712				nc = &ZAP_LEAF_CHUNK(&zl, nc->l_array.la_next);
1713			}
1714
1715			/*
1716			 * Assume the first eight bytes of the value are
1717			 * a uint64_t.
1718			 */
1719			value = fzap_leaf_value(&zl, zc);
1720
1721			//printf("%s 0x%jx\n", name, (uintmax_t)value);
1722			rc = callback((const char *)name, value);
1723			if (rc != 0)
1724				return (rc);
1725		}
1726	}
1727
1728	return (0);
1729}
1730
1731static int zfs_printf(const char *name, uint64_t value __unused)
1732{
1733
1734	printf("%s\n", name);
1735
1736	return (0);
1737}
1738
1739/*
1740 * List a zap directory.
1741 */
1742static int
1743zap_list(const spa_t *spa, const dnode_phys_t *dnode)
1744{
1745	uint64_t zap_type;
1746	size_t size = dnode->dn_datablkszsec * 512;
1747
1748	if (dnode_read(spa, dnode, 0, zap_scratch, size))
1749		return (EIO);
1750
1751	zap_type = *(uint64_t *) zap_scratch;
1752	if (zap_type == ZBT_MICRO)
1753		return mzap_list(dnode, zfs_printf);
1754	else
1755		return fzap_list(spa, dnode, zfs_printf);
1756}
1757
1758static int
1759objset_get_dnode(const spa_t *spa, const objset_phys_t *os, uint64_t objnum, dnode_phys_t *dnode)
1760{
1761	off_t offset;
1762
1763	offset = objnum * sizeof(dnode_phys_t);
1764	return dnode_read(spa, &os->os_meta_dnode, offset,
1765		dnode, sizeof(dnode_phys_t));
1766}
1767
1768static int
1769mzap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, char *name, uint64_t value)
1770{
1771	const mzap_phys_t *mz;
1772	const mzap_ent_phys_t *mze;
1773	size_t size;
1774	int chunks, i;
1775
1776	/*
1777	 * Microzap objects use exactly one block. Read the whole
1778	 * thing.
1779	 */
1780	size = dnode->dn_datablkszsec * 512;
1781
1782	mz = (const mzap_phys_t *) zap_scratch;
1783	chunks = size / MZAP_ENT_LEN - 1;
1784
1785	for (i = 0; i < chunks; i++) {
1786		mze = &mz->mz_chunk[i];
1787		if (value == mze->mze_value) {
1788			strcpy(name, mze->mze_name);
1789			return (0);
1790		}
1791	}
1792
1793	return (ENOENT);
1794}
1795
1796static void
1797fzap_name_copy(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc, char *name)
1798{
1799	size_t namelen;
1800	const zap_leaf_chunk_t *nc;
1801	char *p;
1802
1803	namelen = zc->l_entry.le_name_numints;
1804
1805	nc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_name_chunk);
1806	p = name;
1807	while (namelen > 0) {
1808		size_t len;
1809		len = namelen;
1810		if (len > ZAP_LEAF_ARRAY_BYTES)
1811			len = ZAP_LEAF_ARRAY_BYTES;
1812		memcpy(p, nc->l_array.la_array, len);
1813		p += len;
1814		namelen -= len;
1815		nc = &ZAP_LEAF_CHUNK(zl, nc->l_array.la_next);
1816	}
1817
1818	*p = '\0';
1819}
1820
1821static int
1822fzap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, char *name, uint64_t value)
1823{
1824	int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1825	zap_phys_t zh = *(zap_phys_t *) zap_scratch;
1826	fat_zap_t z;
1827	int i, j;
1828
1829	if (zh.zap_magic != ZAP_MAGIC)
1830		return (EIO);
1831
1832	z.zap_block_shift = ilog2(bsize);
1833	z.zap_phys = (zap_phys_t *) zap_scratch;
1834
1835	/*
1836	 * This assumes that the leaf blocks start at block 1. The
1837	 * documentation isn't exactly clear on this.
1838	 */
1839	zap_leaf_t zl;
1840	zl.l_bs = z.zap_block_shift;
1841	for (i = 0; i < zh.zap_num_leafs; i++) {
1842		off_t off = (i + 1) << zl.l_bs;
1843
1844		if (dnode_read(spa, dnode, off, zap_scratch, bsize))
1845			return (EIO);
1846
1847		zl.l_phys = (zap_leaf_phys_t *) zap_scratch;
1848
1849		for (j = 0; j < ZAP_LEAF_NUMCHUNKS(&zl); j++) {
1850			zap_leaf_chunk_t *zc;
1851
1852			zc = &ZAP_LEAF_CHUNK(&zl, j);
1853			if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY)
1854				continue;
1855			if (zc->l_entry.le_value_intlen != 8 ||
1856			    zc->l_entry.le_value_numints != 1)
1857				continue;
1858
1859			if (fzap_leaf_value(&zl, zc) == value) {
1860				fzap_name_copy(&zl, zc, name);
1861				return (0);
1862			}
1863		}
1864	}
1865
1866	return (ENOENT);
1867}
1868
1869static int
1870zap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, char *name, uint64_t value)
1871{
1872	int rc;
1873	uint64_t zap_type;
1874	size_t size = dnode->dn_datablkszsec * 512;
1875
1876	rc = dnode_read(spa, dnode, 0, zap_scratch, size);
1877	if (rc)
1878		return (rc);
1879
1880	zap_type = *(uint64_t *) zap_scratch;
1881	if (zap_type == ZBT_MICRO)
1882		return mzap_rlookup(spa, dnode, name, value);
1883	else
1884		return fzap_rlookup(spa, dnode, name, value);
1885}
1886
1887static int
1888zfs_rlookup(const spa_t *spa, uint64_t objnum, char *result)
1889{
1890	char name[256];
1891	char component[256];
1892	uint64_t dir_obj, parent_obj, child_dir_zapobj;
1893	dnode_phys_t child_dir_zap, dataset, dir, parent;
1894	dsl_dir_phys_t *dd;
1895	dsl_dataset_phys_t *ds;
1896	char *p;
1897	int len;
1898
1899	p = &name[sizeof(name) - 1];
1900	*p = '\0';
1901
1902	if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) {
1903		printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum);
1904		return (EIO);
1905	}
1906	ds = (dsl_dataset_phys_t *)&dataset.dn_bonus;
1907	dir_obj = ds->ds_dir_obj;
1908
1909	for (;;) {
1910		if (objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir) != 0)
1911			return (EIO);
1912		dd = (dsl_dir_phys_t *)&dir.dn_bonus;
1913
1914		/* Actual loop condition. */
1915		parent_obj  = dd->dd_parent_obj;
1916		if (parent_obj == 0)
1917			break;
1918
1919		if (objset_get_dnode(spa, &spa->spa_mos, parent_obj, &parent) != 0)
1920			return (EIO);
1921		dd = (dsl_dir_phys_t *)&parent.dn_bonus;
1922		child_dir_zapobj = dd->dd_child_dir_zapobj;
1923		if (objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj, &child_dir_zap) != 0)
1924			return (EIO);
1925		if (zap_rlookup(spa, &child_dir_zap, component, dir_obj) != 0)
1926			return (EIO);
1927
1928		len = strlen(component);
1929		p -= len;
1930		memcpy(p, component, len);
1931		--p;
1932		*p = '/';
1933
1934		/* Actual loop iteration. */
1935		dir_obj = parent_obj;
1936	}
1937
1938	if (*p != '\0')
1939		++p;
1940	strcpy(result, p);
1941
1942	return (0);
1943}
1944
1945static int
1946zfs_lookup_dataset(const spa_t *spa, const char *name, uint64_t *objnum)
1947{
1948	char element[256];
1949	uint64_t dir_obj, child_dir_zapobj;
1950	dnode_phys_t child_dir_zap, dir;
1951	dsl_dir_phys_t *dd;
1952	const char *p, *q;
1953
1954	if (objset_get_dnode(spa, &spa->spa_mos, DMU_POOL_DIRECTORY_OBJECT, &dir))
1955		return (EIO);
1956	if (zap_lookup(spa, &dir, DMU_POOL_ROOT_DATASET, sizeof (dir_obj),
1957	    1, &dir_obj))
1958		return (EIO);
1959
1960	p = name;
1961	for (;;) {
1962		if (objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir))
1963			return (EIO);
1964		dd = (dsl_dir_phys_t *)&dir.dn_bonus;
1965
1966		while (*p == '/')
1967			p++;
1968		/* Actual loop condition #1. */
1969		if (*p == '\0')
1970			break;
1971
1972		q = strchr(p, '/');
1973		if (q) {
1974			memcpy(element, p, q - p);
1975			element[q - p] = '\0';
1976			p = q + 1;
1977		} else {
1978			strcpy(element, p);
1979			p += strlen(p);
1980		}
1981
1982		child_dir_zapobj = dd->dd_child_dir_zapobj;
1983		if (objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj, &child_dir_zap) != 0)
1984			return (EIO);
1985
1986		/* Actual loop condition #2. */
1987		if (zap_lookup(spa, &child_dir_zap, element, sizeof (dir_obj),
1988		    1, &dir_obj) != 0)
1989			return (ENOENT);
1990	}
1991
1992	*objnum = dd->dd_head_dataset_obj;
1993	return (0);
1994}
1995
1996#ifndef BOOT2
1997static int
1998zfs_list_dataset(const spa_t *spa, uint64_t objnum/*, int pos, char *entry*/)
1999{
2000	uint64_t dir_obj, child_dir_zapobj;
2001	dnode_phys_t child_dir_zap, dir, dataset;
2002	dsl_dataset_phys_t *ds;
2003	dsl_dir_phys_t *dd;
2004
2005	if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) {
2006		printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum);
2007		return (EIO);
2008	}
2009	ds = (dsl_dataset_phys_t *) &dataset.dn_bonus;
2010	dir_obj = ds->ds_dir_obj;
2011
2012	if (objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir)) {
2013		printf("ZFS: can't find dirobj %ju\n", (uintmax_t)dir_obj);
2014		return (EIO);
2015	}
2016	dd = (dsl_dir_phys_t *)&dir.dn_bonus;
2017
2018	child_dir_zapobj = dd->dd_child_dir_zapobj;
2019	if (objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj, &child_dir_zap) != 0) {
2020		printf("ZFS: can't find child zap %ju\n", (uintmax_t)dir_obj);
2021		return (EIO);
2022	}
2023
2024	return (zap_list(spa, &child_dir_zap) != 0);
2025}
2026
2027int
2028zfs_callback_dataset(const spa_t *spa, uint64_t objnum, int (*callback)(const char *, uint64_t))
2029{
2030	uint64_t dir_obj, child_dir_zapobj, zap_type;
2031	dnode_phys_t child_dir_zap, dir, dataset;
2032	dsl_dataset_phys_t *ds;
2033	dsl_dir_phys_t *dd;
2034	int err;
2035
2036	err = objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset);
2037	if (err != 0) {
2038		printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum);
2039		return (err);
2040	}
2041	ds = (dsl_dataset_phys_t *) &dataset.dn_bonus;
2042	dir_obj = ds->ds_dir_obj;
2043
2044	err = objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir);
2045	if (err != 0) {
2046		printf("ZFS: can't find dirobj %ju\n", (uintmax_t)dir_obj);
2047		return (err);
2048	}
2049	dd = (dsl_dir_phys_t *)&dir.dn_bonus;
2050
2051	child_dir_zapobj = dd->dd_child_dir_zapobj;
2052	err = objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj, &child_dir_zap);
2053	if (err != 0) {
2054		printf("ZFS: can't find child zap %ju\n", (uintmax_t)dir_obj);
2055		return (err);
2056	}
2057
2058	err = dnode_read(spa, &child_dir_zap, 0, zap_scratch, child_dir_zap.dn_datablkszsec * 512);
2059	if (err != 0)
2060		return (err);
2061
2062	zap_type = *(uint64_t *) zap_scratch;
2063	if (zap_type == ZBT_MICRO)
2064		return mzap_list(&child_dir_zap, callback);
2065	else
2066		return fzap_list(spa, &child_dir_zap, callback);
2067}
2068#endif
2069
2070/*
2071 * Find the object set given the object number of its dataset object
2072 * and return its details in *objset
2073 */
2074static int
2075zfs_mount_dataset(const spa_t *spa, uint64_t objnum, objset_phys_t *objset)
2076{
2077	dnode_phys_t dataset;
2078	dsl_dataset_phys_t *ds;
2079
2080	if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) {
2081		printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum);
2082		return (EIO);
2083	}
2084
2085	ds = (dsl_dataset_phys_t *) &dataset.dn_bonus;
2086	if (zio_read(spa, &ds->ds_bp, objset)) {
2087		printf("ZFS: can't read object set for dataset %ju\n",
2088		    (uintmax_t)objnum);
2089		return (EIO);
2090	}
2091
2092	return (0);
2093}
2094
2095/*
2096 * Find the object set pointed to by the BOOTFS property or the root
2097 * dataset if there is none and return its details in *objset
2098 */
2099static int
2100zfs_get_root(const spa_t *spa, uint64_t *objid)
2101{
2102	dnode_phys_t dir, propdir;
2103	uint64_t props, bootfs, root;
2104
2105	*objid = 0;
2106
2107	/*
2108	 * Start with the MOS directory object.
2109	 */
2110	if (objset_get_dnode(spa, &spa->spa_mos, DMU_POOL_DIRECTORY_OBJECT, &dir)) {
2111		printf("ZFS: can't read MOS object directory\n");
2112		return (EIO);
2113	}
2114
2115	/*
2116	 * Lookup the pool_props and see if we can find a bootfs.
2117	 */
2118	if (zap_lookup(spa, &dir, DMU_POOL_PROPS, sizeof (props), 1, &props) == 0
2119	     && objset_get_dnode(spa, &spa->spa_mos, props, &propdir) == 0
2120	     && zap_lookup(spa, &propdir, "bootfs", sizeof (bootfs), 1, &bootfs) == 0
2121	     && bootfs != 0)
2122	{
2123		*objid = bootfs;
2124		return (0);
2125	}
2126	/*
2127	 * Lookup the root dataset directory
2128	 */
2129	if (zap_lookup(spa, &dir, DMU_POOL_ROOT_DATASET, sizeof (root), 1, &root)
2130	    || objset_get_dnode(spa, &spa->spa_mos, root, &dir)) {
2131		printf("ZFS: can't find root dsl_dir\n");
2132		return (EIO);
2133	}
2134
2135	/*
2136	 * Use the information from the dataset directory's bonus buffer
2137	 * to find the dataset object and from that the object set itself.
2138	 */
2139	dsl_dir_phys_t *dd = (dsl_dir_phys_t *) &dir.dn_bonus;
2140	*objid = dd->dd_head_dataset_obj;
2141	return (0);
2142}
2143
2144static int
2145zfs_mount(const spa_t *spa, uint64_t rootobj, struct zfsmount *mount)
2146{
2147
2148	mount->spa = spa;
2149
2150	/*
2151	 * Find the root object set if not explicitly provided
2152	 */
2153	if (rootobj == 0 && zfs_get_root(spa, &rootobj)) {
2154		printf("ZFS: can't find root filesystem\n");
2155		return (EIO);
2156	}
2157
2158	if (zfs_mount_dataset(spa, rootobj, &mount->objset)) {
2159		printf("ZFS: can't open root filesystem\n");
2160		return (EIO);
2161	}
2162
2163	mount->rootobj = rootobj;
2164
2165	return (0);
2166}
2167
2168/*
2169 * callback function for feature name checks.
2170 */
2171static int
2172check_feature(const char *name, uint64_t value)
2173{
2174	int i;
2175
2176	if (value == 0)
2177		return (0);
2178	if (name[0] == '\0')
2179		return (0);
2180
2181	for (i = 0; features_for_read[i] != NULL; i++) {
2182		if (strcmp(name, features_for_read[i]) == 0)
2183			return (0);
2184	}
2185	printf("ZFS: unsupported feature: %s\n", name);
2186	return (EIO);
2187}
2188
2189/*
2190 * Checks whether the MOS features that are active are supported.
2191 */
2192static int
2193check_mos_features(const spa_t *spa)
2194{
2195	dnode_phys_t dir;
2196	uint64_t objnum, zap_type;
2197	size_t size;
2198	int rc;
2199
2200	if ((rc = objset_get_dnode(spa, &spa->spa_mos, DMU_OT_OBJECT_DIRECTORY,
2201	    &dir)) != 0)
2202		return (rc);
2203	if ((rc = zap_lookup(spa, &dir, DMU_POOL_FEATURES_FOR_READ,
2204	    sizeof (objnum), 1, &objnum)) != 0) {
2205		/*
2206		 * It is older pool without features. As we have already
2207		 * tested the label, just return without raising the error.
2208		 */
2209		return (0);
2210	}
2211
2212	if ((rc = objset_get_dnode(spa, &spa->spa_mos, objnum, &dir)) != 0)
2213		return (rc);
2214
2215	if (dir.dn_type != DMU_OTN_ZAP_METADATA)
2216		return (EIO);
2217
2218	size = dir.dn_datablkszsec * 512;
2219	if (dnode_read(spa, &dir, 0, zap_scratch, size))
2220		return (EIO);
2221
2222	zap_type = *(uint64_t *) zap_scratch;
2223	if (zap_type == ZBT_MICRO)
2224		rc = mzap_list(&dir, check_feature);
2225	else
2226		rc = fzap_list(spa, &dir, check_feature);
2227
2228	return (rc);
2229}
2230
2231static int
2232zfs_spa_init(spa_t *spa)
2233{
2234	dnode_phys_t dir;
2235	int rc;
2236
2237	if (zio_read(spa, &spa->spa_uberblock.ub_rootbp, &spa->spa_mos)) {
2238		printf("ZFS: can't read MOS of pool %s\n", spa->spa_name);
2239		return (EIO);
2240	}
2241	if (spa->spa_mos.os_type != DMU_OST_META) {
2242		printf("ZFS: corrupted MOS of pool %s\n", spa->spa_name);
2243		return (EIO);
2244	}
2245
2246	if (objset_get_dnode(spa, &spa->spa_mos, DMU_POOL_DIRECTORY_OBJECT,
2247	    &dir)) {
2248		printf("ZFS: failed to read pool %s directory object\n",
2249		    spa->spa_name);
2250		return (EIO);
2251	}
2252	/* this is allowed to fail, older pools do not have salt */
2253	rc = zap_lookup(spa, &dir, DMU_POOL_CHECKSUM_SALT, 1,
2254	    sizeof (spa->spa_cksum_salt.zcs_bytes),
2255	    spa->spa_cksum_salt.zcs_bytes);
2256
2257	rc = check_mos_features(spa);
2258	if (rc != 0) {
2259		printf("ZFS: pool %s is not supported\n", spa->spa_name);
2260	}
2261
2262	return (rc);
2263}
2264
2265static int
2266zfs_dnode_stat(const spa_t *spa, dnode_phys_t *dn, struct stat *sb)
2267{
2268
2269	if (dn->dn_bonustype != DMU_OT_SA) {
2270		znode_phys_t *zp = (znode_phys_t *)dn->dn_bonus;
2271
2272		sb->st_mode = zp->zp_mode;
2273		sb->st_uid = zp->zp_uid;
2274		sb->st_gid = zp->zp_gid;
2275		sb->st_size = zp->zp_size;
2276	} else {
2277		sa_hdr_phys_t *sahdrp;
2278		int hdrsize;
2279		size_t size = 0;
2280		void *buf = NULL;
2281
2282		if (dn->dn_bonuslen != 0)
2283			sahdrp = (sa_hdr_phys_t *)DN_BONUS(dn);
2284		else {
2285			if ((dn->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0) {
2286				blkptr_t *bp = DN_SPILL_BLKPTR(dn);
2287				int error;
2288
2289				size = BP_GET_LSIZE(bp);
2290				buf = zfs_alloc(size);
2291				error = zio_read(spa, bp, buf);
2292				if (error != 0) {
2293					zfs_free(buf, size);
2294					return (error);
2295				}
2296				sahdrp = buf;
2297			} else {
2298				return (EIO);
2299			}
2300		}
2301		hdrsize = SA_HDR_SIZE(sahdrp);
2302		sb->st_mode = *(uint64_t *)((char *)sahdrp + hdrsize +
2303		    SA_MODE_OFFSET);
2304		sb->st_uid = *(uint64_t *)((char *)sahdrp + hdrsize +
2305		    SA_UID_OFFSET);
2306		sb->st_gid = *(uint64_t *)((char *)sahdrp + hdrsize +
2307		    SA_GID_OFFSET);
2308		sb->st_size = *(uint64_t *)((char *)sahdrp + hdrsize +
2309		    SA_SIZE_OFFSET);
2310		if (buf != NULL)
2311			zfs_free(buf, size);
2312	}
2313
2314	return (0);
2315}
2316
2317static int
2318zfs_dnode_readlink(const spa_t *spa, dnode_phys_t *dn, char *path, size_t psize)
2319{
2320	int rc = 0;
2321
2322	if (dn->dn_bonustype == DMU_OT_SA) {
2323		sa_hdr_phys_t *sahdrp = NULL;
2324		size_t size = 0;
2325		void *buf = NULL;
2326		int hdrsize;
2327		char *p;
2328
2329		if (dn->dn_bonuslen != 0)
2330			sahdrp = (sa_hdr_phys_t *)DN_BONUS(dn);
2331		else {
2332			blkptr_t *bp;
2333
2334			if ((dn->dn_flags & DNODE_FLAG_SPILL_BLKPTR) == 0)
2335				return (EIO);
2336			bp = DN_SPILL_BLKPTR(dn);
2337
2338			size = BP_GET_LSIZE(bp);
2339			buf = zfs_alloc(size);
2340			rc = zio_read(spa, bp, buf);
2341			if (rc != 0) {
2342				zfs_free(buf, size);
2343				return (rc);
2344			}
2345			sahdrp = buf;
2346		}
2347		hdrsize = SA_HDR_SIZE(sahdrp);
2348		p = (char *)((uintptr_t)sahdrp + hdrsize + SA_SYMLINK_OFFSET);
2349		memcpy(path, p, psize);
2350		if (buf != NULL)
2351			zfs_free(buf, size);
2352		return (0);
2353	}
2354	/*
2355	 * Second test is purely to silence bogus compiler
2356	 * warning about accessing past the end of dn_bonus.
2357	 */
2358	if (psize + sizeof(znode_phys_t) <= dn->dn_bonuslen &&
2359	    sizeof(znode_phys_t) <= sizeof(dn->dn_bonus)) {
2360		memcpy(path, &dn->dn_bonus[sizeof(znode_phys_t)], psize);
2361	} else {
2362		rc = dnode_read(spa, dn, 0, path, psize);
2363	}
2364	return (rc);
2365}
2366
/*
 * One element of a singly-linked tail queue of object numbers.
 * zfs_lookup() pushes the object number of each path component it
 * descends into onto such a list so that ".." can pop back to the
 * previous one.
 */
struct obj_list {
	uint64_t		objnum;	/* object number held by this element */
	STAILQ_ENTRY(obj_list)	entry;	/* tail queue linkage */
};
2371
2372/*
2373 * Lookup a file and return its dnode.
2374 */
2375static int
2376zfs_lookup(const struct zfsmount *mount, const char *upath, dnode_phys_t *dnode)
2377{
2378	int rc;
2379	uint64_t objnum;
2380	const spa_t *spa;
2381	dnode_phys_t dn;
2382	const char *p, *q;
2383	char element[256];
2384	char path[1024];
2385	int symlinks_followed = 0;
2386	struct stat sb;
2387	struct obj_list *entry, *tentry;
2388	STAILQ_HEAD(, obj_list) on_cache = STAILQ_HEAD_INITIALIZER(on_cache);
2389
2390	spa = mount->spa;
2391	if (mount->objset.os_type != DMU_OST_ZFS) {
2392		printf("ZFS: unexpected object set type %ju\n",
2393		    (uintmax_t)mount->objset.os_type);
2394		return (EIO);
2395	}
2396
2397	if ((entry = malloc(sizeof(struct obj_list))) == NULL)
2398		return (ENOMEM);
2399
2400	/*
2401	 * Get the root directory dnode.
2402	 */
2403	rc = objset_get_dnode(spa, &mount->objset, MASTER_NODE_OBJ, &dn);
2404	if (rc) {
2405		free(entry);
2406		return (rc);
2407	}
2408
2409	rc = zap_lookup(spa, &dn, ZFS_ROOT_OBJ, sizeof (objnum), 1, &objnum);
2410	if (rc) {
2411		free(entry);
2412		return (rc);
2413	}
2414	entry->objnum = objnum;
2415	STAILQ_INSERT_HEAD(&on_cache, entry, entry);
2416
2417	rc = objset_get_dnode(spa, &mount->objset, objnum, &dn);
2418	if (rc != 0)
2419		goto done;
2420
2421	p = upath;
2422	while (p && *p) {
2423		rc = objset_get_dnode(spa, &mount->objset, objnum, &dn);
2424		if (rc != 0)
2425			goto done;
2426
2427		while (*p == '/')
2428			p++;
2429		if (*p == '\0')
2430			break;
2431		q = p;
2432		while (*q != '\0' && *q != '/')
2433			q++;
2434
2435		/* skip dot */
2436		if (p + 1 == q && p[0] == '.') {
2437			p++;
2438			continue;
2439		}
2440		/* double dot */
2441		if (p + 2 == q && p[0] == '.' && p[1] == '.') {
2442			p += 2;
2443			if (STAILQ_FIRST(&on_cache) ==
2444			    STAILQ_LAST(&on_cache, obj_list, entry)) {
2445				rc = ENOENT;
2446				goto done;
2447			}
2448			entry = STAILQ_FIRST(&on_cache);
2449			STAILQ_REMOVE_HEAD(&on_cache, entry);
2450			free(entry);
2451			objnum = (STAILQ_FIRST(&on_cache))->objnum;
2452			continue;
2453		}
2454		if (q - p + 1 > sizeof(element)) {
2455			rc = ENAMETOOLONG;
2456			goto done;
2457		}
2458		memcpy(element, p, q - p);
2459		element[q - p] = 0;
2460		p = q;
2461
2462		if ((rc = zfs_dnode_stat(spa, &dn, &sb)) != 0)
2463			goto done;
2464		if (!S_ISDIR(sb.st_mode)) {
2465			rc = ENOTDIR;
2466			goto done;
2467		}
2468
2469		rc = zap_lookup(spa, &dn, element, sizeof (objnum), 1, &objnum);
2470		if (rc)
2471			goto done;
2472		objnum = ZFS_DIRENT_OBJ(objnum);
2473
2474		if ((entry = malloc(sizeof(struct obj_list))) == NULL) {
2475			rc = ENOMEM;
2476			goto done;
2477		}
2478		entry->objnum = objnum;
2479		STAILQ_INSERT_HEAD(&on_cache, entry, entry);
2480		rc = objset_get_dnode(spa, &mount->objset, objnum, &dn);
2481		if (rc)
2482			goto done;
2483
2484		/*
2485		 * Check for symlink.
2486		 */
2487		rc = zfs_dnode_stat(spa, &dn, &sb);
2488		if (rc)
2489			goto done;
2490		if (S_ISLNK(sb.st_mode)) {
2491			if (symlinks_followed > 10) {
2492				rc = EMLINK;
2493				goto done;
2494			}
2495			symlinks_followed++;
2496
2497			/*
2498			 * Read the link value and copy the tail of our
2499			 * current path onto the end.
2500			 */
2501			if (sb.st_size + strlen(p) + 1 > sizeof(path)) {
2502				rc = ENAMETOOLONG;
2503				goto done;
2504			}
2505			strcpy(&path[sb.st_size], p);
2506
2507			rc = zfs_dnode_readlink(spa, &dn, path, sb.st_size);
2508			if (rc != 0)
2509				goto done;
2510
2511			/*
2512			 * Restart with the new path, starting either at
2513			 * the root or at the parent depending whether or
2514			 * not the link is relative.
2515			 */
2516			p = path;
2517			if (*p == '/') {
2518				while (STAILQ_FIRST(&on_cache) !=
2519				    STAILQ_LAST(&on_cache, obj_list, entry)) {
2520					entry = STAILQ_FIRST(&on_cache);
2521					STAILQ_REMOVE_HEAD(&on_cache, entry);
2522					free(entry);
2523				}
2524			} else {
2525				entry = STAILQ_FIRST(&on_cache);
2526				STAILQ_REMOVE_HEAD(&on_cache, entry);
2527				free(entry);
2528			}
2529			objnum = (STAILQ_FIRST(&on_cache))->objnum;
2530		}
2531	}
2532
2533	*dnode = dn;
2534done:
2535	STAILQ_FOREACH_SAFE(entry, &on_cache, entry, tentry)
2536		free(entry);
2537	return (rc);
2538}
2539