/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
#include <sys/fs/zfs.h>

/*
 * Virtual device vector for mirroring.
 */

typedef struct mirror_child {
	vdev_t		*mc_vd;
	uint64_t	mc_offset;
	int		mc_error;
	uint8_t		mc_tried;
	uint8_t		mc_skipped;
	uint8_t		mc_speculative;
} mirror_child_t;

typedef struct mirror_map {
	int		mm_children;
	int		mm_replacing;
	int		mm_preferred;
	int		mm_root;
	mirror_child_t	mm_child[1];
} mirror_map_t;

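/*
 * Reads rotate among mirror children in (1 << vdev_mirror_shift)-byte
 * (2 MB) windows of offset space, so nearby reads tend to land on the
 * same child and can stream sequentially.
 */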
int vdev_mirror_shift = 21;

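/*
 * Free the per-zio mirror map; it is allocated with a flexible child
 * array, so size the free to match the allocation.
 */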
static void
vdev_mirror_map_free(zio_t *zio)
{
	mirror_map_t *mm = zio->io_vsd;

	kmem_free(mm, offsetof(mirror_map_t, mm_child[mm->mm_children]));
}

static const zio_vsd_ops_t vdev_mirror_vsd_ops = {
	vdev_mirror_map_free,
	zio_vsd_default_cksum_report
};

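/*
 * Build the mirror map for this zio.  A "root" zio (io_vd == NULL)
 * mirrors across the block's DVA copies (ditto blocks); otherwise the
 * children of the mirror, replacing, or spare vdev are used.
 */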
static mirror_map_t *
vdev_mirror_map_alloc(zio_t *zio)
{
	mirror_map_t *mm = NULL;
	mirror_child_t *mc;
	vdev_t *vd = zio->io_vd;
	int c, d;

	if (vd == NULL) {
		dva_t *dva = zio->io_bp->blk_dva;
		spa_t *spa = zio->io_spa;

		c = BP_GET_NDVAS(zio->io_bp);

		mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP);
		mm->mm_children = c;
		mm->mm_replacing = B_FALSE;
		mm->mm_preferred = spa_get_random(c);
		mm->mm_root = B_TRUE;

		/*
		 * Check the other, lower-index DVAs to see if they're on
		 * the same vdev as the child we picked.  If they are, use
		 * them since they are likely to have been allocated from
		 * the primary metaslab in use at the time, and hence are
		 * more likely to have locality with single-copy data.
		 */
		for (c = mm->mm_preferred, d = c - 1; d >= 0; d--) {
			if (DVA_GET_VDEV(&dva[d]) == DVA_GET_VDEV(&dva[c]))
				mm->mm_preferred = d;
		}

		for (c = 0; c < mm->mm_children; c++) {
			mc = &mm->mm_child[c];

			mc->mc_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[c]));
			mc->mc_offset = DVA_GET_OFFSET(&dva[c]);
		}
	} else {
		c = vd->vdev_children;

		mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP);
		mm->mm_children = c;
		mm->mm_replacing = (vd->vdev_ops == &vdev_replacing_ops ||
		    vd->vdev_ops == &vdev_spare_ops);
		mm->mm_preferred = mm->mm_replacing ? 0 :
		    (zio->io_offset >> vdev_mirror_shift) % c;
		mm->mm_root = B_FALSE;

		for (c = 0; c < mm->mm_children; c++) {
			mc = &mm->mm_child[c];
			mc->mc_vd = vd->vdev_child[c];
			mc->mc_offset = zio->io_offset;
		}
	}

	zio->io_vsd = mm;
	zio->io_vsd_ops = &vdev_mirror_vsd_ops;
	return (mm);
}

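/*
 * Open every child.  The mirror's usable size is that of its smallest
 * child and its alignment shift is that of its largest; the open
 * succeeds as long as at least one child is usable.
 */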
static int
vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
{
	int numerrors = 0;
	int lasterror = 0;

	if (vd->vdev_children == 0) {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (EINVAL);
	}

	vdev_open_children(vd);

	for (int c = 0; c < vd->vdev_children; c++) {
		vdev_t *cvd = vd->vdev_child[c];

		if (cvd->vdev_open_error) {
			lasterror = cvd->vdev_open_error;
			numerrors++;
			continue;
		}

		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
		*ashift = MAX(*ashift, cvd->vdev_ashift);
	}

	if (numerrors == vd->vdev_children) {
		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
		return (lasterror);
	}

	return (0);
}

static void
vdev_mirror_close(vdev_t *vd)
{
	for (int c = 0; c < vd->vdev_children; c++)
		vdev_close(vd->vdev_child[c]);
}

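/*
 * Per-child completion for ordinary reads and writes: record the
 * result so vdev_mirror_io_done can tally good copies.
 */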
static void
vdev_mirror_child_done(zio_t *zio)
{
	mirror_child_t *mc = zio->io_private;

	mc->mc_error = zio->io_error;
	mc->mc_tried = 1;
	mc->mc_skipped = 0;
}

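/*
 * Per-child completion for scrub reads.  Each child read into its own
 * buffer; on success, copy that buffer up into every parent zio, then
 * free it.
 */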
static void
vdev_mirror_scrub_done(zio_t *zio)
{
	mirror_child_t *mc = zio->io_private;

	if (zio->io_error == 0) {
		zio_t *pio;

		mutex_enter(&zio->io_lock);
		while ((pio = zio_walk_parents(zio)) != NULL) {
			mutex_enter(&pio->io_lock);
			ASSERT3U(zio->io_size, >=, pio->io_size);
			bcopy(zio->io_data, pio->io_data, pio->io_size);
			mutex_exit(&pio->io_lock);
		}
		mutex_exit(&zio->io_lock);
	}

	zio_buf_free(zio->io_data, zio->io_size);

	mc->mc_error = zio->io_error;
	mc->mc_tried = 1;
	mc->mc_skipped = 0;
}

/*
 * Try to find a child whose DTL doesn't contain the block we want to read.
 * If we can't, try the read on any vdev we haven't already tried.
 */
static int
vdev_mirror_child_select(zio_t *zio)
{
	mirror_map_t *mm = zio->io_vsd;
	mirror_child_t *mc;
	uint64_t txg = zio->io_txg;
	int i, c;

	ASSERT(zio->io_bp == NULL || BP_PHYSICAL_BIRTH(zio->io_bp) == txg);

	/*
	 * Try to find a child whose DTL doesn't contain the block to read.
	 * If a child is known to be completely inaccessible (indicated by
	 * vdev_readable() returning B_FALSE), don't even try.
	 */
	for (i = 0, c = mm->mm_preferred; i < mm->mm_children; i++, c++) {
		if (c >= mm->mm_children)
			c = 0;
		mc = &mm->mm_child[c];
		if (mc->mc_tried || mc->mc_skipped)
			continue;
		if (!vdev_readable(mc->mc_vd)) {
			mc->mc_error = ENXIO;
			mc->mc_tried = 1;	/* don't even try */
			mc->mc_skipped = 1;
			continue;
		}
		if (!vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1))
			return (c);
		mc->mc_error = ESTALE;
		mc->mc_skipped = 1;
		mc->mc_speculative = 1;
	}

	/*
	 * Every device is either missing or has this txg in its DTL.
	 * Look for any child we haven't already tried before giving up.
	 */
	for (c = 0; c < mm->mm_children; c++)
		if (!mm->mm_child[c].mc_tried)
			return (c);

	/*
	 * Every child failed.  There's no place left to look.
	 */
	return (-1);
}

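/*
 * Issue the child I/Os.  Scrub reads fan out to every child, each with
 * its own buffer; ordinary reads go to a single selected child; writes
 * go to all children.
 */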
static int
vdev_mirror_io_start(zio_t *zio)
{
	mirror_map_t *mm;
	mirror_child_t *mc;
	int c, children;

	mm = vdev_mirror_map_alloc(zio);

	if (zio->io_type == ZIO_TYPE_READ) {
		if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_replacing) {
			/*
			 * For scrubbing reads we need to allocate a read
			 * buffer for each child and issue reads to all
			 * children.  If any child succeeds, it will copy its
			 * data into zio->io_data in vdev_mirror_scrub_done.
			 */
			for (c = 0; c < mm->mm_children; c++) {
				mc = &mm->mm_child[c];
				zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
				    mc->mc_vd, mc->mc_offset,
				    zio_buf_alloc(zio->io_size), zio->io_size,
				    zio->io_type, zio->io_priority, 0,
				    vdev_mirror_scrub_done, mc));
			}
			return (ZIO_PIPELINE_CONTINUE);
		}
		/*
		 * For normal reads just pick one child.
		 */
		c = vdev_mirror_child_select(zio);
		children = (c >= 0);
	} else {
		ASSERT(zio->io_type == ZIO_TYPE_WRITE);

		/*
		 * Writes go to all children.
		 */
		c = 0;
		children = mm->mm_children;
	}

	while (children--) {
		mc = &mm->mm_child[c];
		zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
		    mc->mc_vd, mc->mc_offset, zio->io_data, zio->io_size,
		    zio->io_type, zio->io_priority, 0,
		    vdev_mirror_child_done, mc));
		c++;
	}

	return (ZIO_PIPELINE_CONTINUE);
}

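/*
 * Report the most interesting error across children, preferring errors
 * from children we expected to work over those from speculative
 * attempts (children whose DTLs suggested the data might be stale).
 */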
static int
vdev_mirror_worst_error(mirror_map_t *mm)
{
	int error[2] = { 0, 0 };

	for (int c = 0; c < mm->mm_children; c++) {
		mirror_child_t *mc = &mm->mm_child[c];
		int s = mc->mc_speculative;
		error[s] = zio_worst_error(error[s], mc->mc_error);
	}

	return (error[0] ? error[0] : error[1]);
}

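/*
 * Parent I/O completion: tally good copies, apply the write/read
 * failure policy, retry reads on other children if needed, and use
 * good data to repair damaged children.
 */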
static void
vdev_mirror_io_done(zio_t *zio)
{
	mirror_map_t *mm = zio->io_vsd;
	mirror_child_t *mc;
	int c;
	int good_copies = 0;
	int unexpected_errors = 0;

	for (c = 0; c < mm->mm_children; c++) {
		mc = &mm->mm_child[c];

		if (mc->mc_error) {
			if (!mc->mc_skipped)
				unexpected_errors++;
		} else if (mc->mc_tried) {
			good_copies++;
		}
	}

	if (zio->io_type == ZIO_TYPE_WRITE) {
		/*
		 * XXX -- for now, treat partial writes as success.
		 *
		 * Now that we support write reallocation, it would be better
		 * to treat partial failure as real failure unless there are
		 * no non-degraded top-level vdevs left, and not update DTLs
		 * if we intend to reallocate.
		 */
		/* XXPOLICY */
		if (good_copies != mm->mm_children) {
			/*
			 * Always require at least one good copy.
			 *
			 * For ditto blocks (io_vd == NULL), require
			 * all copies to be good.
			 *
			 * XXX -- for replacing vdevs, there's no great answer.
			 * If the old device is really dead, we may not even
			 * be able to access it -- so we only want to
			 * require good writes to the new device.  But if
			 * the new device turns out to be flaky, we want
			 * to be able to detach it -- which requires all
			 * writes to the old device to have succeeded.
			 */
			if (good_copies == 0 || zio->io_vd == NULL)
				zio->io_error = vdev_mirror_worst_error(mm);
		}
		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	/*
	 * If we don't have a good copy yet, keep trying other children.
	 */
	/* XXPOLICY */
	if (good_copies == 0 && (c = vdev_mirror_child_select(zio)) != -1) {
		ASSERT(c >= 0 && c < mm->mm_children);
		mc = &mm->mm_child[c];
		zio_vdev_io_redone(zio);
		zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
		    mc->mc_vd, mc->mc_offset, zio->io_data, zio->io_size,
		    ZIO_TYPE_READ, zio->io_priority, 0,
		    vdev_mirror_child_done, mc));
		return;
	}

	/* XXPOLICY */
	if (good_copies == 0) {
		zio->io_error = vdev_mirror_worst_error(mm);
		ASSERT(zio->io_error != 0);
	}

	if (good_copies && spa_writeable(zio->io_spa) &&
	    (unexpected_errors ||
	    (zio->io_flags & ZIO_FLAG_RESILVER) ||
	    ((zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_replacing))) {
		/*
		 * Use the good data we have in hand to repair damaged children.
		 */
		for (c = 0; c < mm->mm_children; c++) {
			/*
			 * Don't rewrite known good children.
			 * Not only is it unnecessary, it could
			 * actually be harmful: if the system lost
			 * power while rewriting the only good copy,
			 * there would be no good copies left!
			 */
			mc = &mm->mm_child[c];

			if (mc->mc_error == 0) {
				if (mc->mc_tried)
					continue;
				if (!(zio->io_flags & ZIO_FLAG_SCRUB) &&
				    !vdev_dtl_contains(mc->mc_vd, DTL_PARTIAL,
				    zio->io_txg, 1))
					continue;
				mc->mc_error = ESTALE;
			}

			zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
			    mc->mc_vd, mc->mc_offset,
			    zio->io_data, zio->io_size,
			    ZIO_TYPE_WRITE, zio->io_priority,
			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
		}
	}
}

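/*
 * Roll up child state changes: every child faulted means the mirror
 * can't open; any child faulted or degraded degrades the mirror;
 * otherwise the mirror is healthy.
 */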
static void
vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded)
{
	if (faulted == vd->vdev_children)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_NO_REPLICAS);
	else if (degraded + faulted != 0)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
	else
		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
}

vdev_ops_t vdev_mirror_ops = {
	vdev_mirror_open,
	vdev_mirror_close,
	vdev_default_asize,
	vdev_mirror_io_start,
	vdev_mirror_io_done,
	vdev_mirror_state_change,
	VDEV_TYPE_MIRROR,	/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};

vdev_ops_t vdev_replacing_ops = {
	vdev_mirror_open,
	vdev_mirror_close,
	vdev_default_asize,
	vdev_mirror_io_start,
	vdev_mirror_io_done,
	vdev_mirror_state_change,
	VDEV_TYPE_REPLACING,	/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};

vdev_ops_t vdev_spare_ops = {
	vdev_mirror_open,
	vdev_mirror_close,
	vdev_default_asize,
	vdev_mirror_io_start,
	vdev_mirror_io_done,
	vdev_mirror_state_change,
	VDEV_TYPE_SPARE,	/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};