vdev_label.c revision 168404
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#pragma ident	"%Z%%M%	%I%	%E% SMI"
27
28/*
29 * Virtual Device Labels
30 * ---------------------
31 *
32 * The vdev label serves several distinct purposes:
33 *
34 *	1. Uniquely identify this device as part of a ZFS pool and confirm its
35 *	   identity within the pool.
36 *
37 * 	2. Verify that all the devices given in a configuration are present
38 *         within the pool.
39 *
40 * 	3. Determine the uberblock for the pool.
41 *
42 * 	4. In case of an import operation, determine the configuration of the
43 *         top-level vdev of which it is a part.
44 *
45 * 	5. If an import operation cannot find all the devices in the pool,
46 *         provide enough information to the administrator to determine which
47 *         devices are missing.
48 *
49 * It is important to note that while the kernel is responsible for writing the
50 * label, it only consumes the information in the first three cases.  The
51 * latter information is only consumed in userland when determining the
52 * configuration to import a pool.
53 *
54 *
55 * Label Organization
56 * ------------------
57 *
58 * Before describing the contents of the label, it's important to understand how
59 * the labels are written and updated with respect to the uberblock.
60 *
61 * When the pool configuration is altered, either because it was newly created
62 * or a device was added, we want to update all the labels such that we can deal
63 * with fatal failure at any point.  To this end, each disk has two labels which
64 * are updated before and after the uberblock is synced.  Assuming we have
65 * labels and an uberblock with the following transaction groups:
66 *
67 *              L1          UB          L2
68 *           +------+    +------+    +------+
69 *           |      |    |      |    |      |
70 *           | t10  |    | t10  |    | t10  |
71 *           |      |    |      |    |      |
72 *           +------+    +------+    +------+
73 *
74 * In this stable state, the labels and the uberblock were all updated within
75 * the same transaction group (10).  Each label is mirrored and checksummed, so
76 * that we can detect when we fail partway through writing the label.
77 *
78 * In order to identify which labels are valid, the labels are written in the
79 * following manner:
80 *
81 * 	1. For each vdev, update 'L1' to the new label
82 * 	2. Update the uberblock
83 * 	3. For each vdev, update 'L2' to the new label
84 *
85 * Given arbitrary failure, we can determine the correct label to use based on
86 * the transaction group.  If we fail after updating L1 but before updating the
87 * UB, we will notice that L1's transaction group is greater than the uberblock's,
88 * so L2 must be valid.  If we fail after writing the uberblock but before
89 * writing L2, we will notice that L2's transaction group is less than L1, and
90 * therefore L1 is valid.
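 *
 * For example, suppose L1 has been rewritten at txg 11 but the system fails
 * before the uberblock is updated: L1 reads txg 11 while the uberblock and
 * L2 still read txg 10, so L2 (whose txg matches the uberblock) is the valid
 * label.  If instead the failure occurs after the uberblock is written but
 * before L2, then L1 and the uberblock read txg 11 while L2 reads txg 10,
 * and L1 is the valid label.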
91 *
92 * Another added complexity is that not every label is updated when the config
93 * is synced.  If we add a single device, we do not want to have to re-write
94 * every label for every device in the pool.  This means that both L1 and L2 may
95 * be older than the pool uberblock, because the necessary information is stored
96 * on another vdev.
97 *
98 *
99 * On-disk Format
100 * --------------
101 *
102 * The vdev label consists of two distinct parts, and is wrapped within the
103 * vdev_label_t structure.  The label includes 8k of padding to permit legacy
104 * VTOC disk labels; this padding is otherwise ignored.
105 *
106 * The first half of the label is a packed nvlist which contains pool wide
107 * properties, per-vdev properties, and configuration information.  It is
108 * described in more detail below.
109 *
110 * The latter half of the label consists of a redundant array of uberblocks.
111 * These uberblocks are updated whenever a transaction group is committed,
112 * or when the configuration is updated.  When a pool is loaded, we scan each
113 * vdev for the 'best' uberblock.
114 *
115 *
116 * Configuration Information
117 * -------------------------
118 *
119 * The nvlist describing the pool and vdev contains the following elements:
120 *
121 * 	version		ZFS on-disk version
122 * 	name		Pool name
123 * 	state		Pool state
124 * 	txg		Transaction group in which this label was written
125 * 	pool_guid	Unique identifier for this pool
126 * 	vdev_tree	An nvlist describing the vdev tree.
127 *
128 * Each leaf device label also contains the following:
129 *
130 * 	top_guid	Unique ID for top-level vdev in which this is contained
131 * 	guid		Unique ID for the leaf vdev
132 *
133 * The 'vs' configuration follows the format described in 'spa_config.c'.
134 */
135
136#include <sys/zfs_context.h>
137#include <sys/spa.h>
138#include <sys/spa_impl.h>
139#include <sys/dmu.h>
140#include <sys/zap.h>
141#include <sys/vdev.h>
142#include <sys/vdev_impl.h>
143#include <sys/uberblock_impl.h>
144#include <sys/metaslab.h>
145#include <sys/zio.h>
146#include <sys/fs/zfs.h>
147
148/*
149 * Basic routines to read and write from a vdev label.
150 * Used throughout the rest of this file.
151 */
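/*
 * vdev_label_offset() maps a (label index, offset-within-label) pair to a
 * physical offset on the device.  With VDEV_LABELS == 4 and
 * S = sizeof (vdev_label_t), labels 0 and 1 start at offsets 0 and S at the
 * front of the device, and labels 2 and 3 start at psize - 2*S and psize - S
 * at the end.
 */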
152uint64_t
153vdev_label_offset(uint64_t psize, int l, uint64_t offset)
154{
155	ASSERT(offset < sizeof (vdev_label_t));
156
157	return (offset + l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ?
158	    0 : psize - VDEV_LABELS * sizeof (vdev_label_t)));
159}
160
161static void
162vdev_label_read(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset,
163	uint64_t size, zio_done_func_t *done, void *private)
164{
165	ASSERT(vd->vdev_children == 0);
166
167	zio_nowait(zio_read_phys(zio, vd,
168	    vdev_label_offset(vd->vdev_psize, l, offset),
169	    size, buf, ZIO_CHECKSUM_LABEL, done, private,
170	    ZIO_PRIORITY_SYNC_READ,
171	    ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE));
172}
173
174static void
175vdev_label_write(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset,
176	uint64_t size, zio_done_func_t *done, void *private)
177{
178	ASSERT(vd->vdev_children == 0);
179
180	zio_nowait(zio_write_phys(zio, vd,
181	    vdev_label_offset(vd->vdev_psize, l, offset),
182	    size, buf, ZIO_CHECKSUM_LABEL, done, private,
183	    ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL));
184}
185
186/*
187 * Generate the nvlist representing this vdev's config.
188 */
189nvlist_t *
190vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
191    boolean_t isspare)
192{
193	nvlist_t *nv = NULL;
194
195	VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
196
197	VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE,
198	    vd->vdev_ops->vdev_op_type) == 0);
199	if (!isspare)
200		VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id)
201		    == 0);
202	VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0);
203
204	if (vd->vdev_path != NULL)
205		VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_PATH,
206		    vd->vdev_path) == 0);
207
208	if (vd->vdev_devid != NULL)
209		VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_DEVID,
210		    vd->vdev_devid) == 0);
211
212	if (vd->vdev_nparity != 0) {
213		ASSERT(strcmp(vd->vdev_ops->vdev_op_type,
214		    VDEV_TYPE_RAIDZ) == 0);
215
216		/*
217		 * Make sure someone hasn't managed to sneak a fancy new vdev
218		 * into a crufty old storage pool.
219		 */
220		ASSERT(vd->vdev_nparity == 1 ||
221		    (vd->vdev_nparity == 2 &&
222		    spa_version(spa) >= ZFS_VERSION_RAID6));
223
224		/*
225		 * Note that we'll add the nparity tag even on storage pools
226		 * that only support a single parity device -- older software
227		 * will just ignore it.
228		 */
229		VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY,
230		    vd->vdev_nparity) == 0);
231	}
232
233	if (vd->vdev_wholedisk != -1ULL)
234		VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
235		    vd->vdev_wholedisk) == 0);
236
237	if (vd->vdev_not_present)
238		VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1) == 0);
239
240	if (vd->vdev_isspare)
241		VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1) == 0);
242
243	if (!isspare && vd == vd->vdev_top) {
244		VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
245		    vd->vdev_ms_array) == 0);
246		VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
247		    vd->vdev_ms_shift) == 0);
248		VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT,
249		    vd->vdev_ashift) == 0);
250		VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE,
251		    vd->vdev_asize) == 0);
252	}
253
254	if (vd->vdev_dtl.smo_object != 0)
255		VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_DTL,
256		    vd->vdev_dtl.smo_object) == 0);
257
258	if (getstats) {
259		vdev_stat_t vs;
260		vdev_get_stats(vd, &vs);
261		VERIFY(nvlist_add_uint64_array(nv, ZPOOL_CONFIG_STATS,
262		    (uint64_t *)&vs, sizeof (vs) / sizeof (uint64_t)) == 0);
263	}
264
265	if (!vd->vdev_ops->vdev_op_leaf) {
266		nvlist_t **child;
267		int c;
268
269		child = kmem_alloc(vd->vdev_children * sizeof (nvlist_t *),
270		    KM_SLEEP);
271
272		for (c = 0; c < vd->vdev_children; c++)
273			child[c] = vdev_config_generate(spa, vd->vdev_child[c],
274			    getstats, isspare);
275
276		VERIFY(nvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
277		    child, vd->vdev_children) == 0);
278
279		for (c = 0; c < vd->vdev_children; c++)
280			nvlist_free(child[c]);
281
282		kmem_free(child, vd->vdev_children * sizeof (nvlist_t *));
283
284	} else {
285		if (vd->vdev_offline && !vd->vdev_tmpoffline)
286			VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE,
287			    B_TRUE) == 0);
288		else
289			(void) nvlist_remove(nv, ZPOOL_CONFIG_OFFLINE,
290			    DATA_TYPE_UINT64);
291	}
292
293	return (nv);
294}
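
/*
 * Illustrative use (hypothetical caller, not part of the original code):
 * the returned nvlist is owned by the caller and should be freed with
 * nvlist_free() once it has been packed or examined, e.g.:
 *
 *	nvlist_t *nv = vdev_config_generate(spa, vd, B_TRUE, B_FALSE);
 *	... pack nv into a label or hand it to userland ...
 *	nvlist_free(nv);
 */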
295
296nvlist_t *
297vdev_label_read_config(vdev_t *vd)
298{
299	spa_t *spa = vd->vdev_spa;
300	nvlist_t *config = NULL;
301	vdev_phys_t *vp;
302	zio_t *zio;
303	int l;
304
305	ASSERT(spa_config_held(spa, RW_READER));
306
307	if (vdev_is_dead(vd))
308		return (NULL);
309
310	vp = zio_buf_alloc(sizeof (vdev_phys_t));
311
312	for (l = 0; l < VDEV_LABELS; l++) {
313
314		zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL |
315		    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CONFIG_HELD);
316
317		vdev_label_read(zio, vd, l, vp,
318		    offsetof(vdev_label_t, vl_vdev_phys),
319		    sizeof (vdev_phys_t), NULL, NULL);
320
321		if (zio_wait(zio) == 0 &&
322		    nvlist_unpack(vp->vp_nvlist, sizeof (vp->vp_nvlist),
323		    &config, 0) == 0)
324			break;
325
326		if (config != NULL) {
327			nvlist_free(config);
328			config = NULL;
329		}
330	}
331
332	zio_buf_free(vp, sizeof (vdev_phys_t));
333
334	return (config);
335}
336
337/*
338 * Determine if a device is in use.  The 'spare_guid' parameter will be filled
339 * in with the device guid if this spare is active elsewhere on the system.
340 */
341static boolean_t
342vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason,
343    uint64_t *spare_guid)
344{
345	spa_t *spa = vd->vdev_spa;
346	uint64_t state, pool_guid, device_guid, txg, spare_pool;
347	uint64_t vdtxg = 0;
348	nvlist_t *label;
349
350	if (spare_guid)
351		*spare_guid = 0ULL;
352
353	/*
354	 * Read the label, if any, and perform some basic sanity checks.
355	 */
356	if ((label = vdev_label_read_config(vd)) == NULL)
357		return (B_FALSE);
358
359	(void) nvlist_lookup_uint64(label, ZPOOL_CONFIG_CREATE_TXG,
360	    &vdtxg);
361
362	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
363	    &state) != 0 ||
364	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID,
365	    &device_guid) != 0) {
366		nvlist_free(label);
367		return (B_FALSE);
368	}
369
370	if (state != POOL_STATE_SPARE &&
371	    (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID,
372	    &pool_guid) != 0 ||
373	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
374	    &txg) != 0)) {
375		nvlist_free(label);
376		return (B_FALSE);
377	}
378
379	nvlist_free(label);
380
381	/*
382	 * Check to see if this device indeed belongs to the pool it claims to
383	 * be a part of.  The only way this is allowed is if the device is a hot
384	 * spare (which we check for later on).
385	 */
386	if (state != POOL_STATE_SPARE &&
387	    !spa_guid_exists(pool_guid, device_guid) &&
388	    !spa_spare_exists(device_guid, NULL))
389		return (B_FALSE);
390
391	/*
392	 * If the transaction group is zero, then this is an initialized (but
393	 * unused) label.  This is only an error if the create transaction
394	 * on-disk is the same as the one we're using now, in which case the
395	 * user has attempted to add the same vdev multiple times in the same
396	 * transaction.
397	 */
398	if (state != POOL_STATE_SPARE && txg == 0 && vdtxg == crtxg)
399		return (B_TRUE);
400
401	/*
402	 * Check to see if this is a spare device.  We do an explicit check for
403	 * spa_has_spare() here because it may be on our pending list of spares
404	 * to add.
405	 */
406	if (spa_spare_exists(device_guid, &spare_pool) ||
407	    spa_has_spare(spa, device_guid)) {
408		if (spare_guid)
409			*spare_guid = device_guid;
410
411		switch (reason) {
412		case VDEV_LABEL_CREATE:
413			return (B_TRUE);
414
415		case VDEV_LABEL_REPLACE:
416			return (!spa_has_spare(spa, device_guid) ||
417			    spare_pool != 0ULL);
418
419		case VDEV_LABEL_SPARE:
420			return (spa_has_spare(spa, device_guid));
421		}
422	}
423
424	/*
425	 * If the device is marked ACTIVE, then this device is in use by another
426	 * pool on the system.
427	 */
428	return (state == POOL_STATE_ACTIVE);
429}
430
431/*
432 * Initialize a vdev label.  We check to make sure each leaf device is not in
433 * use, and writable.  We put down an initial label which we will later
434 * overwrite with a complete label.  Note that it's important to do this
435 * sequentially, not in parallel, so that we catch cases of multiple use of the
436 * same leaf vdev in the vdev we're creating -- e.g. mirroring a disk with
437 * itself.
438 */
439int
440vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
441{
442	spa_t *spa = vd->vdev_spa;
443	nvlist_t *label;
444	vdev_phys_t *vp;
445	vdev_boot_header_t *vb;
446	uberblock_t *ub;
447	zio_t *zio;
448	int l, c, n;
449	char *buf;
450	size_t buflen;
451	int error;
452	uint64_t spare_guid;
453
454	ASSERT(spa_config_held(spa, RW_WRITER));
455
456	for (c = 0; c < vd->vdev_children; c++)
457		if ((error = vdev_label_init(vd->vdev_child[c],
458		    crtxg, reason)) != 0)
459			return (error);
460
461	if (!vd->vdev_ops->vdev_op_leaf)
462		return (0);
463
464	/*
465	 * Dead vdevs cannot be initialized.
466	 */
467	if (vdev_is_dead(vd))
468		return (EIO);
469
470	/*
471	 * Determine if the vdev is in use.
472	 */
473	if (reason != VDEV_LABEL_REMOVE &&
474	    vdev_inuse(vd, crtxg, reason, &spare_guid))
475		return (EBUSY);
476
477	ASSERT(reason != VDEV_LABEL_REMOVE ||
478	    vdev_inuse(vd, crtxg, reason, NULL));
479
480	/*
481	 * If this is a request to add or replace a spare that is in use
482	 * elsewhere on the system, then we must update the guid (which was
483	 * initialized to a random value) to reflect the actual GUID (which is
484	 * shared between multiple pools).
485	 */
486	if (reason != VDEV_LABEL_REMOVE && spare_guid != 0ULL) {
487		vdev_t *pvd = vd->vdev_parent;
488
489		for (; pvd != NULL; pvd = pvd->vdev_parent) {
490			pvd->vdev_guid_sum -= vd->vdev_guid;
491			pvd->vdev_guid_sum += spare_guid;
492		}
493
494		vd->vdev_guid = vd->vdev_guid_sum = spare_guid;
495
496		/*
497		 * If this is a replacement, then we want to fall through to the
498		 * rest of the code.  If we're adding a spare, then it's already
499		 * labelled appropriately and we can just return.
500		 */
501		if (reason == VDEV_LABEL_SPARE)
502			return (0);
503		ASSERT(reason == VDEV_LABEL_REPLACE);
504	}
505
506	/*
507	 * Initialize its label.
508	 */
509	vp = zio_buf_alloc(sizeof (vdev_phys_t));
510	bzero(vp, sizeof (vdev_phys_t));
511
512	/*
513	 * Generate a label describing the pool and our top-level vdev.
514	 * We mark it as being from txg 0 to indicate that it's not
515	 * really part of an active pool just yet.  The labels will
516	 * be written again with a meaningful txg by spa_sync().
517	 */
518	if (reason == VDEV_LABEL_SPARE ||
519	    (reason == VDEV_LABEL_REMOVE && vd->vdev_isspare)) {
520		/*
521		 * For inactive hot spares, we generate a special label that
522		 * identifies it as a mutually shared hot spare.  We write the
523		 * label if we are adding a hot spare, or if we are removing an
524		 * active hot spare (in which case we want to revert the
525		 * labels).
526		 */
527		VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_SLEEP) == 0);
528
529		VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_VERSION,
530		    spa_version(spa)) == 0);
531		VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_POOL_STATE,
532		    POOL_STATE_SPARE) == 0);
533		VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID,
534		    vd->vdev_guid) == 0);
535	} else {
536		label = spa_config_generate(spa, vd, 0ULL, B_FALSE);
537
538		/*
539		 * Add our creation time.  This allows us to detect multiple
540		 * vdev uses as described above, and automatically expires if we
541		 * fail.
542		 */
543		VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_CREATE_TXG,
544		    crtxg) == 0);
545	}
546
547	buf = vp->vp_nvlist;
548	buflen = sizeof (vp->vp_nvlist);
549
550	error = nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP);
551	if (error != 0) {
552		nvlist_free(label);
553		zio_buf_free(vp, sizeof (vdev_phys_t));
554		/* EFAULT means nvlist_pack ran out of room */
555		return (error == EFAULT ? ENAMETOOLONG : EINVAL);
556	}
557
558	/*
559	 * Initialize boot block header.
560	 */
561	vb = zio_buf_alloc(sizeof (vdev_boot_header_t));
562	bzero(vb, sizeof (vdev_boot_header_t));
563	vb->vb_magic = VDEV_BOOT_MAGIC;
564	vb->vb_version = VDEV_BOOT_VERSION;
565	vb->vb_offset = VDEV_BOOT_OFFSET;
566	vb->vb_size = VDEV_BOOT_SIZE;
567
568	/*
569	 * Initialize uberblock template.
570	 */
571	ub = zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd));
572	bzero(ub, VDEV_UBERBLOCK_SIZE(vd));
573	*ub = spa->spa_uberblock;
574	ub->ub_txg = 0;
575
576	/*
577	 * Write everything in parallel.
578	 */
579	zio = zio_root(spa, NULL, NULL,
580	    ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL);
581
582	for (l = 0; l < VDEV_LABELS; l++) {
583
584		vdev_label_write(zio, vd, l, vp,
585		    offsetof(vdev_label_t, vl_vdev_phys),
586		    sizeof (vdev_phys_t), NULL, NULL);
587
588		vdev_label_write(zio, vd, l, vb,
589		    offsetof(vdev_label_t, vl_boot_header),
590		    sizeof (vdev_boot_header_t), NULL, NULL);
591
592		for (n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
593			vdev_label_write(zio, vd, l, ub,
594			    VDEV_UBERBLOCK_OFFSET(vd, n),
595			    VDEV_UBERBLOCK_SIZE(vd), NULL, NULL);
596		}
597	}
598
599	error = zio_wait(zio);
600
601	nvlist_free(label);
602	zio_buf_free(ub, VDEV_UBERBLOCK_SIZE(vd));
603	zio_buf_free(vb, sizeof (vdev_boot_header_t));
604	zio_buf_free(vp, sizeof (vdev_phys_t));
605
606	/*
607	 * If this vdev hasn't been previously identified as a spare, then we
608	 * mark it as such only if a) we are labelling it as a spare, or b) it
609	 * exists as a spare elsewhere in the system.
610	 */
611	if (error == 0 && !vd->vdev_isspare &&
612	    (reason == VDEV_LABEL_SPARE ||
613	    spa_spare_exists(vd->vdev_guid, NULL)))
614		spa_spare_add(vd);
615
616	return (error);
617}
618
619/*
620 * ==========================================================================
621 * uberblock load/sync
622 * ==========================================================================
623 */
624
625/*
626 * Consider the following situation: txg is safely synced to disk.  We've
627 * written the first uberblock for txg + 1, and then we lose power.  When we
628 * come back up, we fail to see the uberblock for txg + 1 because, say,
629 * it was on a mirrored device and the replica to which we wrote txg + 1
630 * is now offline.  If we then make some changes and sync txg + 1, and then
631 * the missing replica comes back, then for a few seconds we'll have two
632 * conflicting uberblocks on disk with the same txg.  The solution is simple:
633 * among uberblocks with equal txg, choose the one with the latest timestamp.
634 */
635static int
636vdev_uberblock_compare(uberblock_t *ub1, uberblock_t *ub2)
637{
638	if (ub1->ub_txg < ub2->ub_txg)
639		return (-1);
640	if (ub1->ub_txg > ub2->ub_txg)
641		return (1);
642
643	if (ub1->ub_timestamp < ub2->ub_timestamp)
644		return (-1);
645	if (ub1->ub_timestamp > ub2->ub_timestamp)
646		return (1);
647
648	return (0);
649}
650
651static void
652vdev_uberblock_load_done(zio_t *zio)
653{
654	uberblock_t *ub = zio->io_data;
655	uberblock_t *ubbest = zio->io_private;
656	spa_t *spa = zio->io_spa;
657
658	ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(zio->io_vd));
659
660	if (zio->io_error == 0 && uberblock_verify(ub) == 0) {
661		mutex_enter(&spa->spa_uberblock_lock);
662		if (vdev_uberblock_compare(ub, ubbest) > 0)
663			*ubbest = *ub;
664		mutex_exit(&spa->spa_uberblock_lock);
665	}
666
667	zio_buf_free(zio->io_data, zio->io_size);
668}
669
670void
671vdev_uberblock_load(zio_t *zio, vdev_t *vd, uberblock_t *ubbest)
672{
673	int l, c, n;
674
675	for (c = 0; c < vd->vdev_children; c++)
676		vdev_uberblock_load(zio, vd->vdev_child[c], ubbest);
677
678	if (!vd->vdev_ops->vdev_op_leaf)
679		return;
680
681	if (vdev_is_dead(vd))
682		return;
683
684	for (l = 0; l < VDEV_LABELS; l++) {
685		for (n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
686			vdev_label_read(zio, vd, l,
687			    zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd)),
688			    VDEV_UBERBLOCK_OFFSET(vd, n),
689			    VDEV_UBERBLOCK_SIZE(vd),
690			    vdev_uberblock_load_done, ubbest);
691		}
692	}
693}
694
695/*
696 * Write the uberblock to both labels of all leaves of the specified vdev.
697 * We only get credit for writes to known-visible vdevs; see spa_vdev_add().
698 */
699static void
700vdev_uberblock_sync_done(zio_t *zio)
701{
702	uint64_t *good_writes = zio->io_root->io_private;
703
704	if (zio->io_error == 0 && zio->io_vd->vdev_top->vdev_ms_array != 0)
705		atomic_add_64(good_writes, 1);
706}
707
708static void
709vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, uint64_t txg)
710{
711	int l, c, n;
712
713	for (c = 0; c < vd->vdev_children; c++)
714		vdev_uberblock_sync(zio, ub, vd->vdev_child[c], txg);
715
716	if (!vd->vdev_ops->vdev_op_leaf)
717		return;
718
719	if (vdev_is_dead(vd))
720		return;
721
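	/*
	 * VDEV_UBERBLOCK_COUNT() is a power of two, so masking the txg by
	 * (count - 1) selects the uberblock slot in the ring, i.e. txg % count.
	 */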
722	n = txg & (VDEV_UBERBLOCK_COUNT(vd) - 1);
723
724	ASSERT(ub->ub_txg == txg);
725
726	for (l = 0; l < VDEV_LABELS; l++)
727		vdev_label_write(zio, vd, l, ub,
728		    VDEV_UBERBLOCK_OFFSET(vd, n),
729		    VDEV_UBERBLOCK_SIZE(vd),
730		    vdev_uberblock_sync_done, NULL);
731
732	dprintf("vdev %s in txg %llu\n", vdev_description(vd), txg);
733}
734
735static int
736vdev_uberblock_sync_tree(spa_t *spa, uberblock_t *ub, vdev_t *vd, uint64_t txg)
737{
738	uberblock_t *ubbuf;
739	size_t size = vd->vdev_top ? VDEV_UBERBLOCK_SIZE(vd) : SPA_MAXBLOCKSIZE;
740	uint64_t *good_writes;
741	zio_t *zio;
742	int error;
743
744	ubbuf = zio_buf_alloc(size);
745	bzero(ubbuf, size);
746	*ubbuf = *ub;
747
748	good_writes = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
749
750	zio = zio_root(spa, NULL, good_writes,
751	    ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL);
752
753	vdev_uberblock_sync(zio, ubbuf, vd, txg);
754
755	error = zio_wait(zio);
756
757	if (error && *good_writes != 0) {
758		dprintf("partial success: good_writes = %llu\n", *good_writes);
759		error = 0;
760	}
761
762	/*
763	 * It's possible to have no good writes and no error if every vdev is in
764	 * the CANT_OPEN state.
765	 */
766	if (*good_writes == 0 && error == 0)
767		error = EIO;
768
769	kmem_free(good_writes, sizeof (uint64_t));
770	zio_buf_free(ubbuf, size);
771
772	return (error);
773}
774
775/*
776 * Sync out an individual vdev.
777 */
778static void
779vdev_sync_label_done(zio_t *zio)
780{
781	uint64_t *good_writes = zio->io_root->io_private;
782
783	if (zio->io_error == 0)
784		atomic_add_64(good_writes, 1);
785}
786
787static void
788vdev_sync_label(zio_t *zio, vdev_t *vd, int l, uint64_t txg)
789{
790	nvlist_t *label;
791	vdev_phys_t *vp;
792	char *buf;
793	size_t buflen;
794	int c;
795
796	for (c = 0; c < vd->vdev_children; c++)
797		vdev_sync_label(zio, vd->vdev_child[c], l, txg);
798
799	if (!vd->vdev_ops->vdev_op_leaf)
800		return;
801
802	if (vdev_is_dead(vd))
803		return;
804
805	/*
806	 * Generate a label describing the top-level config to which we belong.
807	 */
808	label = spa_config_generate(vd->vdev_spa, vd, txg, B_FALSE);
809
810	vp = zio_buf_alloc(sizeof (vdev_phys_t));
811	bzero(vp, sizeof (vdev_phys_t));
812
813	buf = vp->vp_nvlist;
814	buflen = sizeof (vp->vp_nvlist);
815
816	if (nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP) == 0)
817		vdev_label_write(zio, vd, l, vp,
818		    offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t),
819		    vdev_sync_label_done, NULL);
820
821	zio_buf_free(vp, sizeof (vdev_phys_t));
822	nvlist_free(label);
823
824	dprintf("%s label %d txg %llu\n", vdev_description(vd), l, txg);
825}
826
827static int
828vdev_sync_labels(vdev_t *vd, int l, uint64_t txg)
829{
830	uint64_t *good_writes;
831	zio_t *zio;
832	int error;
833
834	ASSERT(vd == vd->vdev_top);
835
836	good_writes = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
837
838	zio = zio_root(vd->vdev_spa, NULL, good_writes,
839	    ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL);
840
841	/*
842	 * Recursively kick off writes to all labels.
843	 */
844	vdev_sync_label(zio, vd, l, txg);
845
846	error = zio_wait(zio);
847
848	if (error && *good_writes != 0) {
849		dprintf("partial success: good_writes = %llu\n", *good_writes);
850		error = 0;
851	}
852
853	if (*good_writes == 0 && error == 0)
854		error = ENODEV;
855
856	kmem_free(good_writes, sizeof (uint64_t));
857
858	return (error);
859}
860
861/*
862 * Sync the entire vdev configuration.
863 *
864 * The order of operations is carefully crafted to ensure that
865 * if the system panics or loses power at any time, the state on disk
866 * is still transactionally consistent.  The in-line comments below
867 * describe the failure semantics at each stage.
868 *
869 * Moreover, it is designed to be idempotent: if vdev_config_sync() fails
870 * at any time, you can just call it again, and it will resume its work.
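 *
 * At a high level the ordering below is: flush the write caches of every
 * vdev written in this txg; write the even labels (L0, L2) for dirty vdevs;
 * flush; write the new uberblocks; flush; write the odd labels (L1, L3);
 * flush.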
871 */
872int
873vdev_config_sync(vdev_t *uvd, uint64_t txg)
874{
875	spa_t *spa = uvd->vdev_spa;
876	uberblock_t *ub = &spa->spa_uberblock;
877	vdev_t *rvd = spa->spa_root_vdev;
878	vdev_t *vd;
879	zio_t *zio;
880	int l, error;
881
882	ASSERT(ub->ub_txg <= txg);
883
884	/*
885	 * If this isn't a resync due to I/O errors, and nothing changed
886	 * in this transaction group, and the vdev configuration hasn't changed,
887	 * then there's nothing to do.
888	 */
889	if (ub->ub_txg < txg && uberblock_update(ub, rvd, txg) == B_FALSE &&
890	    list_is_empty(&spa->spa_dirty_list)) {
891		dprintf("nothing to sync in %s in txg %llu\n",
892		    spa_name(spa), txg);
893		return (0);
894	}
895
896	if (txg > spa_freeze_txg(spa))
897		return (0);
898
899	ASSERT(txg <= spa->spa_final_txg);
900
901	dprintf("syncing %s txg %llu\n", spa_name(spa), txg);
902
903	/*
904	 * Flush the write cache of every disk that's been written to
905	 * in this transaction group.  This ensures that all blocks
906	 * written in this txg will be committed to stable storage
907	 * before any uberblock that references them.
908	 */
909	zio = zio_root(spa, NULL, NULL,
910	    ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL);
911	for (vd = txg_list_head(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)); vd;
912	    vd = txg_list_next(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg))) {
913		zio_nowait(zio_ioctl(zio, spa, vd, DKIOCFLUSHWRITECACHE,
914		    NULL, NULL, ZIO_PRIORITY_NOW,
915		    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY));
916	}
917	(void) zio_wait(zio);
918
919	/*
920	 * Sync out the even labels (L0, L2) for every dirty vdev.  If the
921	 * system dies in the middle of this process, that's OK: all of the
922	 * even labels that made it to disk will be newer than any uberblock,
923	 * and will therefore be considered invalid.  The odd labels (L1, L3),
924	 * which have not yet been touched, will still be valid.
925	 */
926	for (vd = list_head(&spa->spa_dirty_list); vd != NULL;
927	    vd = list_next(&spa->spa_dirty_list, vd)) {
928		for (l = 0; l < VDEV_LABELS; l++) {
929			if (l & 1)
930				continue;
931			if ((error = vdev_sync_labels(vd, l, txg)) != 0)
932				return (error);
933		}
934	}
935
936	/*
937	 * Flush the new labels to disk.  This ensures that all even-label
938	 * updates are committed to stable storage before the uberblock update.
939	 */
940	zio = zio_root(spa, NULL, NULL,
941	    ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL);
942	for (vd = list_head(&spa->spa_dirty_list); vd != NULL;
943	    vd = list_next(&spa->spa_dirty_list, vd)) {
944		zio_nowait(zio_ioctl(zio, spa, vd, DKIOCFLUSHWRITECACHE,
945		    NULL, NULL, ZIO_PRIORITY_NOW,
946		    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY));
947	}
948	(void) zio_wait(zio);
949
950	/*
951	 * Sync the uberblocks to all vdevs in the tree specified by uvd.
952	 * If the system dies in the middle of this step, there are two cases
953	 * to consider, and the on-disk state is consistent either way:
954	 *
955	 * (1)	If none of the new uberblocks made it to disk, then the
956	 *	previous uberblock will be the newest, and the odd labels
957	 *	(which had not yet been touched) will be valid with respect
958	 *	to that uberblock.
959	 *
960	 * (2)	If one or more new uberblocks made it to disk, then they
961	 *	will be the newest, and the even labels (which had all
962	 *	been successfully committed) will be valid with respect
963	 *	to the new uberblocks.
964	 */
965	if ((error = vdev_uberblock_sync_tree(spa, ub, uvd, txg)) != 0)
966		return (error);
967
968	/*
969	 * Flush the uberblocks to disk.  This ensures that the odd labels
970	 * are no longer needed (because the new uberblocks and the even
971	 * labels are safely on disk), so it is safe to overwrite them.
972	 */
973	(void) zio_wait(zio_ioctl(NULL, spa, uvd, DKIOCFLUSHWRITECACHE,
974	    NULL, NULL, ZIO_PRIORITY_NOW,
975	    ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY));
976
977	/*
978	 * Sync out odd labels for every dirty vdev.  If the system dies
979	 * in the middle of this process, the even labels and the new
980	 * uberblocks will suffice to open the pool.  The next time
981	 * the pool is opened, the first thing we'll do -- before any
982	 * user data is modified -- is mark every vdev dirty so that
983	 * all labels will be brought up to date.
984	 */
985	for (vd = list_head(&spa->spa_dirty_list); vd != NULL;
986	    vd = list_next(&spa->spa_dirty_list, vd)) {
987		for (l = 0; l < VDEV_LABELS; l++) {
988			if ((l & 1) == 0)
989				continue;
990			if ((error = vdev_sync_labels(vd, l, txg)) != 0)
991				return (error);
992		}
993	}
994
995	/*
996	 * Flush the new labels to disk.  This ensures that all odd-label
997	 * updates are committed to stable storage before the next
998	 * transaction group begins.
999	 */
1000	zio = zio_root(spa, NULL, NULL,
1001	    ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL);
1002	for (vd = list_head(&spa->spa_dirty_list); vd != NULL;
1003	    vd = list_next(&spa->spa_dirty_list, vd)) {
1004		zio_nowait(zio_ioctl(zio, spa, vd, DKIOCFLUSHWRITECACHE,
1005		    NULL, NULL, ZIO_PRIORITY_NOW,
1006		    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY));
1007	}
1008	(void) zio_wait(zio);
1009
1010	return (0);
1011}
1012