1/*-
2 * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27#include <sys/cdefs.h>
28__FBSDID("$FreeBSD$");
29
30#include <sys/param.h>
31#include <sys/bio.h>
32#include <sys/endian.h>
33#include <sys/kernel.h>
34#include <sys/kobj.h>
35#include <sys/limits.h>
36#include <sys/lock.h>
37#include <sys/malloc.h>
38#include <sys/mutex.h>
39#include <sys/sysctl.h>
40#include <sys/systm.h>
41#include <geom/geom.h>
#include "g_raid.h"
43#include "g_raid_tr_if.h"
44
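/*
 * Number of copies kept of every strip.  RAID1E stripes data over all of
 * the disks in the volume while writing each strip to N consecutive disks
 * (modulo the disk count); V2P()/P2V() below encode exactly this layout.
 */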
45#define N	2
46
47SYSCTL_DECL(_kern_geom_raid_raid1e);
48
#define RAID1E_REBUILD_SLAB	(1 << 20) /* One transaction in a rebuild */
50static int g_raid1e_rebuild_slab = RAID1E_REBUILD_SLAB;
51SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_slab_size, CTLFLAG_RWTUN,
52    &g_raid1e_rebuild_slab, 0,
53    "Amount of the disk to rebuild each read/write cycle of the rebuild.");
54
55#define RAID1E_REBUILD_FAIR_IO 20 /* use 1/x of the available I/O */
56static int g_raid1e_rebuild_fair_io = RAID1E_REBUILD_FAIR_IO;
57SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_fair_io, CTLFLAG_RWTUN,
58    &g_raid1e_rebuild_fair_io, 0,
59    "Fraction of the I/O bandwidth to use when disk busy for rebuild.");
60
61#define RAID1E_REBUILD_CLUSTER_IDLE 100
62static int g_raid1e_rebuild_cluster_idle = RAID1E_REBUILD_CLUSTER_IDLE;
63SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_cluster_idle, CTLFLAG_RWTUN,
64    &g_raid1e_rebuild_cluster_idle, 0,
65    "Number of slabs to do each time we trigger a rebuild cycle");
66
67#define RAID1E_REBUILD_META_UPDATE 1024 /* update meta data every 1GB or so */
68static int g_raid1e_rebuild_meta_update = RAID1E_REBUILD_META_UPDATE;
69SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_meta_update, CTLFLAG_RWTUN,
70    &g_raid1e_rebuild_meta_update, 0,
71    "When to update the meta data.");
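
/*
 * The tunables above are exported under kern.geom.raid.raid1e and, being
 * CTLFLAG_RWTUN, can also be preset from loader.conf, e.g.:
 *   sysctl kern.geom.raid.raid1e.rebuild_slab_size=2097152
 */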
72
73static MALLOC_DEFINE(M_TR_RAID1E, "tr_raid1e_data", "GEOM_RAID RAID1E data");
74
75#define TR_RAID1E_NONE 0
76#define TR_RAID1E_REBUILD 1
77#define TR_RAID1E_RESYNC 2
78
79#define TR_RAID1E_F_DOING_SOME	0x1
80#define TR_RAID1E_F_LOCKED	0x2
81#define TR_RAID1E_F_ABORT	0x4
82
83struct g_raid_tr_raid1e_object {
84	struct g_raid_tr_object	 trso_base;
85	int			 trso_starting;
86	int			 trso_stopping;
87	int			 trso_type;
88	int			 trso_recover_slabs; /* slabs before rest */
89	int			 trso_fair_io;
90	int			 trso_meta_update;
91	int			 trso_flags;
92	struct g_raid_subdisk	*trso_failed_sd; /* like per volume */
93	void			*trso_buffer;	 /* Buffer space */
94	off_t			 trso_lock_pos; /* Locked range start. */
95	off_t			 trso_lock_len; /* Locked range length. */
96	struct bio		 trso_bio;
97};
98
99static g_raid_tr_taste_t g_raid_tr_taste_raid1e;
100static g_raid_tr_event_t g_raid_tr_event_raid1e;
101static g_raid_tr_start_t g_raid_tr_start_raid1e;
102static g_raid_tr_stop_t g_raid_tr_stop_raid1e;
103static g_raid_tr_iostart_t g_raid_tr_iostart_raid1e;
104static g_raid_tr_iodone_t g_raid_tr_iodone_raid1e;
105static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid1e;
106static g_raid_tr_locked_t g_raid_tr_locked_raid1e;
107static g_raid_tr_idle_t g_raid_tr_idle_raid1e;
108static g_raid_tr_free_t g_raid_tr_free_raid1e;
109
110static kobj_method_t g_raid_tr_raid1e_methods[] = {
111	KOBJMETHOD(g_raid_tr_taste,	g_raid_tr_taste_raid1e),
112	KOBJMETHOD(g_raid_tr_event,	g_raid_tr_event_raid1e),
113	KOBJMETHOD(g_raid_tr_start,	g_raid_tr_start_raid1e),
114	KOBJMETHOD(g_raid_tr_stop,	g_raid_tr_stop_raid1e),
115	KOBJMETHOD(g_raid_tr_iostart,	g_raid_tr_iostart_raid1e),
116	KOBJMETHOD(g_raid_tr_iodone,	g_raid_tr_iodone_raid1e),
117	KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid1e),
118	KOBJMETHOD(g_raid_tr_locked,	g_raid_tr_locked_raid1e),
119	KOBJMETHOD(g_raid_tr_idle,	g_raid_tr_idle_raid1e),
120	KOBJMETHOD(g_raid_tr_free,	g_raid_tr_free_raid1e),
121	{ 0, 0 }
122};
123
124static struct g_raid_tr_class g_raid_tr_raid1e_class = {
125	"RAID1E",
126	g_raid_tr_raid1e_methods,
127	sizeof(struct g_raid_tr_raid1e_object),
128	.trc_enable = 1,
129	.trc_priority = 200,
130	.trc_accept_unmapped = 1
131};
132
133static void g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr);
134static void g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
135    struct g_raid_subdisk *sd);
136static int g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
137    int no, off_t off, off_t len, u_int mask);
138
139static inline void
140V2P(struct g_raid_volume *vol, off_t virt,
141    int *disk, off_t *offset, off_t *start)
142{
143	off_t nstrip;
144	u_int strip_size;
145
146	strip_size = vol->v_strip_size;
147	/* Strip number. */
148	nstrip = virt / strip_size;
149	/* Start position in strip. */
150	*start = virt % strip_size;
151	/* Disk number. */
152	*disk = (nstrip * N) % vol->v_disks_count;
153	/* Strip start position in disk. */
154	*offset = ((nstrip * N) / vol->v_disks_count) * strip_size;
155}
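
/*
 * Worked example (3 disks, 64KB strip, N = 2): virtual offset 0x23000 is in
 * virtual strip 2 at start 0x3000; copy 0 lands on disk (2 * 2) % 3 = 1 at
 * physical strip offset (4 / 3) * 0x10000 = 0x10000, and copy 1 on the next
 * disk (2) at the same offset.
 */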
156
157static inline void
158P2V(struct g_raid_volume *vol, int disk, off_t offset,
159    off_t *virt, int *copy)
160{
161	off_t nstrip, start;
162	u_int strip_size;
163
164	strip_size = vol->v_strip_size;
165	/* Start position in strip. */
166	start = offset % strip_size;
167	/* Physical strip number. */
168	nstrip = (offset / strip_size) * vol->v_disks_count + disk;
169	/* Number of physical strip (copy) inside virtual strip. */
170	*copy = nstrip % N;
171	/* Offset in virtual space. */
172	*virt = (nstrip / N) * strip_size + start;
173}
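
/*
 * P2V() is the inverse of V2P(): continuing the example above, disk 2 at
 * physical offset 0x13000 yields physical strip 1 * 3 + 2 = 5, copy
 * 5 % 2 = 1, and virtual offset (5 / 2) * 0x10000 + 0x3000 = 0x23000 again.
 */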
174
175static int
176g_raid_tr_taste_raid1e(struct g_raid_tr_object *tr, struct g_raid_volume *vol)
177{
178	struct g_raid_tr_raid1e_object *trs;
179
180	trs = (struct g_raid_tr_raid1e_object *)tr;
181	if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID1E ||
182	    tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1EA)
183		return (G_RAID_TR_TASTE_FAIL);
184	trs->trso_starting = 1;
185	return (G_RAID_TR_TASTE_SUCCEED);
186}
187
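/*
 * With an even number of disks every virtual strip maps to a fixed group of
 * N subdisks, so the volume state is simply the state of its worst mirror
 * group.
 */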
188static int
189g_raid_tr_update_state_raid1e_even(struct g_raid_volume *vol)
190{
191	struct g_raid_softc *sc;
192	struct g_raid_subdisk *sd, *bestsd, *worstsd;
193	int i, j, state, sstate;
194
195	sc = vol->v_softc;
196	state = G_RAID_VOLUME_S_OPTIMAL;
197	for (i = 0; i < vol->v_disks_count / N; i++) {
198		bestsd = &vol->v_subdisks[i * N];
199		for (j = 1; j < N; j++) {
200			sd = &vol->v_subdisks[i * N + j];
201			if (sd->sd_state > bestsd->sd_state)
202				bestsd = sd;
203			else if (sd->sd_state == bestsd->sd_state &&
204			    (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
205			     sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
206			    sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
207				bestsd = sd;
208		}
209		if (bestsd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED &&
210		    bestsd->sd_state != G_RAID_SUBDISK_S_ACTIVE) {
			/* We found a reasonable candidate. */
212			G_RAID_DEBUG1(1, sc,
213			    "Promote subdisk %s:%d from %s to ACTIVE.",
214			    vol->v_name, bestsd->sd_pos,
215			    g_raid_subdisk_state2str(bestsd->sd_state));
216			g_raid_change_subdisk_state(bestsd,
217			    G_RAID_SUBDISK_S_ACTIVE);
218			g_raid_write_metadata(sc,
219			    vol, bestsd, bestsd->sd_disk);
220		}
221		worstsd = &vol->v_subdisks[i * N];
222		for (j = 1; j < N; j++) {
223			sd = &vol->v_subdisks[i * N + j];
224			if (sd->sd_state < worstsd->sd_state)
225				worstsd = sd;
226		}
227		if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
228			sstate = G_RAID_VOLUME_S_OPTIMAL;
229		else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
230			sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
231		else if (bestsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
232			sstate = G_RAID_VOLUME_S_DEGRADED;
233		else
234			sstate = G_RAID_VOLUME_S_BROKEN;
235		if (sstate < state)
236			state = sstate;
237	}
238	return (state);
239}
240
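/*
 * With an odd number of disks the copies rotate over all of the disks, so
 * each subdisk has to be evaluated together with its following neighbours.
 */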
241static int
242g_raid_tr_update_state_raid1e_odd(struct g_raid_volume *vol)
243{
244	struct g_raid_softc *sc;
245	struct g_raid_subdisk *sd, *bestsd, *worstsd;
246	int i, j, state, sstate;
247
248	sc = vol->v_softc;
249	if (g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) ==
250	    vol->v_disks_count)
251		return (G_RAID_VOLUME_S_OPTIMAL);
252	for (i = 0; i < vol->v_disks_count; i++) {
253		sd = &vol->v_subdisks[i];
254		if (sd->sd_state == G_RAID_SUBDISK_S_UNINITIALIZED) {
			/* We found a reasonable candidate. */
256			G_RAID_DEBUG1(1, sc,
257			    "Promote subdisk %s:%d from %s to STALE.",
258			    vol->v_name, sd->sd_pos,
259			    g_raid_subdisk_state2str(sd->sd_state));
260			g_raid_change_subdisk_state(sd,
261			    G_RAID_SUBDISK_S_STALE);
262			g_raid_write_metadata(sc, vol, sd, sd->sd_disk);
263		}
264	}
265	state = G_RAID_VOLUME_S_OPTIMAL;
266	for (i = 0; i < vol->v_disks_count; i++) {
267		bestsd = &vol->v_subdisks[i];
268		worstsd = &vol->v_subdisks[i];
269		for (j = 1; j < N; j++) {
270			sd = &vol->v_subdisks[(i + j) % vol->v_disks_count];
271			if (sd->sd_state > bestsd->sd_state)
272				bestsd = sd;
273			else if (sd->sd_state == bestsd->sd_state &&
274			    (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
275			     sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
276			    sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
277				bestsd = sd;
278			if (sd->sd_state < worstsd->sd_state)
279				worstsd = sd;
280		}
281		if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
282			sstate = G_RAID_VOLUME_S_OPTIMAL;
283		else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
284			sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
285		else if (bestsd->sd_state >= G_RAID_SUBDISK_S_STALE)
286			sstate = G_RAID_VOLUME_S_DEGRADED;
287		else
288			sstate = G_RAID_VOLUME_S_BROKEN;
289		if (sstate < state)
290			state = sstate;
291	}
292	return (state);
293}
294
295static int
296g_raid_tr_update_state_raid1e(struct g_raid_volume *vol,
297    struct g_raid_subdisk *sd)
298{
299	struct g_raid_tr_raid1e_object *trs;
300	struct g_raid_softc *sc;
301	u_int s;
302
303	sc = vol->v_softc;
304	trs = (struct g_raid_tr_raid1e_object *)vol->v_tr;
305	if (trs->trso_stopping &&
306	    (trs->trso_flags & TR_RAID1E_F_DOING_SOME) == 0)
307		s = G_RAID_VOLUME_S_STOPPED;
308	else if (trs->trso_starting)
309		s = G_RAID_VOLUME_S_STARTING;
310	else {
311		if ((vol->v_disks_count % N) == 0)
312			s = g_raid_tr_update_state_raid1e_even(vol);
313		else
314			s = g_raid_tr_update_state_raid1e_odd(vol);
315	}
316	if (s != vol->v_state) {
317		g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ?
318		    G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN,
319		    G_RAID_EVENT_VOLUME);
320		g_raid_change_volume_state(vol, s);
321		if (!trs->trso_starting && !trs->trso_stopping)
322			g_raid_write_metadata(sc, vol, NULL, NULL);
323	}
324	if (!trs->trso_starting && !trs->trso_stopping)
325		g_raid_tr_raid1e_maybe_rebuild(vol->v_tr, sd);
326	return (0);
327}
328
329static void
330g_raid_tr_raid1e_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd,
331    struct g_raid_disk *disk)
332{
333	struct g_raid_volume *vol;
334
335	vol = sd->sd_volume;
336	/*
337	 * We don't fail the last disk in the pack, since it still has decent
338	 * data on it and that's better than failing the disk if it is the root
339	 * file system.
340	 *
341	 * XXX should this be controlled via a tunable?  It makes sense for
342	 * the volume that has / on it.  I can't think of a case where we'd
343	 * want the volume to go away on this kind of event.
344	 */
345	if ((g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) +
346	     g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC) +
347	     g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
348	     g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED) <
349	     vol->v_disks_count) &&
350	    (sd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED))
351		return;
352	g_raid_fail_disk(sc, sd, disk);
353}
354
355static void
356g_raid_tr_raid1e_rebuild_done(struct g_raid_tr_raid1e_object *trs)
357{
358	struct g_raid_volume *vol;
359	struct g_raid_subdisk *sd;
360
361	vol = trs->trso_base.tro_volume;
362	sd = trs->trso_failed_sd;
363	g_raid_write_metadata(vol->v_softc, vol, sd, sd->sd_disk);
364	free(trs->trso_buffer, M_TR_RAID1E);
365	trs->trso_buffer = NULL;
366	trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
367	trs->trso_type = TR_RAID1E_NONE;
368	trs->trso_recover_slabs = 0;
369	trs->trso_failed_sd = NULL;
370	g_raid_tr_update_state_raid1e(vol, NULL);
371}
372
373static void
374g_raid_tr_raid1e_rebuild_finish(struct g_raid_tr_object *tr)
375{
376	struct g_raid_tr_raid1e_object *trs;
377	struct g_raid_subdisk *sd;
378
379	trs = (struct g_raid_tr_raid1e_object *)tr;
380	sd = trs->trso_failed_sd;
381	G_RAID_DEBUG1(0, tr->tro_volume->v_softc,
382	    "Subdisk %s:%d-%s rebuild completed.",
383	    sd->sd_volume->v_name, sd->sd_pos,
384	    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
385	g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE);
386	sd->sd_rebuild_pos = 0;
387	g_raid_tr_raid1e_rebuild_done(trs);
388}
389
390static void
391g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr)
392{
393	struct g_raid_tr_raid1e_object *trs;
394	struct g_raid_subdisk *sd;
395	struct g_raid_volume *vol;
396
397	vol = tr->tro_volume;
398	trs = (struct g_raid_tr_raid1e_object *)tr;
399	sd = trs->trso_failed_sd;
400	if (trs->trso_flags & TR_RAID1E_F_DOING_SOME) {
401		G_RAID_DEBUG1(1, vol->v_softc,
402		    "Subdisk %s:%d-%s rebuild is aborting.",
403		    sd->sd_volume->v_name, sd->sd_pos,
404		    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
405		trs->trso_flags |= TR_RAID1E_F_ABORT;
406	} else {
407		G_RAID_DEBUG1(0, vol->v_softc,
408		    "Subdisk %s:%d-%s rebuild aborted.",
409		    sd->sd_volume->v_name, sd->sd_pos,
410		    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
411		trs->trso_flags &= ~TR_RAID1E_F_ABORT;
412		if (trs->trso_flags & TR_RAID1E_F_LOCKED) {
413			trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
414			g_raid_unlock_range(tr->tro_volume,
415			    trs->trso_lock_pos, trs->trso_lock_len);
416		}
417		g_raid_tr_raid1e_rebuild_done(trs);
418	}
419}
420
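/*
 * Copy the next chunk of data onto the rebuilding subdisk: pick the most
 * up-to-date copy, read it into trso_buffer, and let the iodone handler
 * turn the completed read into a write at sd_rebuild_pos.
 */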
421static void
422g_raid_tr_raid1e_rebuild_some(struct g_raid_tr_object *tr)
423{
424	struct g_raid_tr_raid1e_object *trs;
425	struct g_raid_softc *sc;
426	struct g_raid_volume *vol;
427	struct g_raid_subdisk *sd;
428	struct bio *bp;
429	off_t len, virtual, vend, offset, start;
430	int disk, copy, best;
431
432	trs = (struct g_raid_tr_raid1e_object *)tr;
433	if (trs->trso_flags & TR_RAID1E_F_DOING_SOME)
434		return;
435	vol = tr->tro_volume;
436	sc = vol->v_softc;
437	sd = trs->trso_failed_sd;
438
439	while (1) {
440		if (sd->sd_rebuild_pos >= sd->sd_size) {
441			g_raid_tr_raid1e_rebuild_finish(tr);
442			return;
443		}
444		/* Get virtual offset from physical rebuild position. */
445		P2V(vol, sd->sd_pos, sd->sd_rebuild_pos, &virtual, &copy);
446		/* Get physical offset back to get first stripe position. */
447		V2P(vol, virtual, &disk, &offset, &start);
		/* Calculate contiguous data length. */
449		len = MIN(g_raid1e_rebuild_slab,
450		    sd->sd_size - sd->sd_rebuild_pos);
451		if ((vol->v_disks_count % N) != 0)
452			len = MIN(len, vol->v_strip_size - start);
453		/* Find disk with most accurate data. */
454		best = g_raid_tr_raid1e_select_read_disk(vol, disk,
455		    offset + start, len, 0);
456		if (best < 0) {
			/* There is no valid disk. */
458			g_raid_tr_raid1e_rebuild_abort(tr);
459			return;
460		} else if (best != copy) {
461			/* Some other disk has better data. */
462			break;
463		}
464		/* We have the most accurate data. Skip the range. */
465		G_RAID_DEBUG1(3, sc, "Skipping rebuild for range %ju - %ju",
466		    sd->sd_rebuild_pos, sd->sd_rebuild_pos + len);
467		sd->sd_rebuild_pos += len;
468	}
469
470	bp = &trs->trso_bio;
471	memset(bp, 0, sizeof(*bp));
472	bp->bio_offset = offset + start +
473	    ((disk + best >= vol->v_disks_count) ? vol->v_strip_size : 0);
474	bp->bio_length = len;
475	bp->bio_data = trs->trso_buffer;
476	bp->bio_cmd = BIO_READ;
477	bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
478	bp->bio_caller1 = &vol->v_subdisks[(disk + best) % vol->v_disks_count];
479	G_RAID_LOGREQ(3, bp, "Queueing rebuild read");
480	/*
481	 * If we are crossing stripe boundary, correct affected virtual
482	 * range we should lock.
483	 */
484	if (start + len > vol->v_strip_size) {
485		P2V(vol, sd->sd_pos, sd->sd_rebuild_pos + len, &vend, &copy);
486		len = vend - virtual;
487	}
488	trs->trso_flags |= TR_RAID1E_F_DOING_SOME;
489	trs->trso_flags |= TR_RAID1E_F_LOCKED;
490	trs->trso_lock_pos = virtual;
491	trs->trso_lock_len = len;
492	/* Lock callback starts I/O */
493	g_raid_lock_range(sd->sd_volume, virtual, len, NULL, bp);
494}
495
496static void
497g_raid_tr_raid1e_rebuild_start(struct g_raid_tr_object *tr)
498{
499	struct g_raid_volume *vol;
500	struct g_raid_tr_raid1e_object *trs;
501	struct g_raid_subdisk *sd;
502
503	vol = tr->tro_volume;
504	trs = (struct g_raid_tr_raid1e_object *)tr;
505	if (trs->trso_failed_sd) {
506		G_RAID_DEBUG1(1, vol->v_softc,
		    "Already rebuilding in start rebuild. pos %jd\n",
508		    (intmax_t)trs->trso_failed_sd->sd_rebuild_pos);
509		return;
510	}
511	sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_RESYNC);
512	if (sd == NULL)
513		sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_REBUILD);
514	if (sd == NULL) {
515		sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_STALE);
516		if (sd != NULL) {
517			sd->sd_rebuild_pos = 0;
518			g_raid_change_subdisk_state(sd,
519			    G_RAID_SUBDISK_S_RESYNC);
520			g_raid_write_metadata(vol->v_softc, vol, sd, NULL);
521		} else {
522			sd = g_raid_get_subdisk(vol,
523			    G_RAID_SUBDISK_S_UNINITIALIZED);
524			if (sd == NULL)
525				sd = g_raid_get_subdisk(vol,
526				    G_RAID_SUBDISK_S_NEW);
527			if (sd != NULL) {
528				sd->sd_rebuild_pos = 0;
529				g_raid_change_subdisk_state(sd,
530				    G_RAID_SUBDISK_S_REBUILD);
531				g_raid_write_metadata(vol->v_softc,
532				    vol, sd, NULL);
533			}
534		}
535	}
536	if (sd == NULL) {
537		G_RAID_DEBUG1(1, vol->v_softc,
538		    "No failed disk to rebuild.  night night.");
539		return;
540	}
541	trs->trso_failed_sd = sd;
542	G_RAID_DEBUG1(0, vol->v_softc,
543	    "Subdisk %s:%d-%s rebuild start at %jd.",
544	    sd->sd_volume->v_name, sd->sd_pos,
545	    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]",
546	    trs->trso_failed_sd->sd_rebuild_pos);
547	trs->trso_type = TR_RAID1E_REBUILD;
548	trs->trso_buffer = malloc(g_raid1e_rebuild_slab, M_TR_RAID1E, M_WAITOK);
549	trs->trso_meta_update = g_raid1e_rebuild_meta_update;
550	g_raid_tr_raid1e_rebuild_some(tr);
551}
552
553static void
554g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
555    struct g_raid_subdisk *sd)
556{
557	struct g_raid_volume *vol;
558	struct g_raid_tr_raid1e_object *trs;
559	int nr;
560
561	vol = tr->tro_volume;
562	trs = (struct g_raid_tr_raid1e_object *)tr;
563	if (trs->trso_stopping)
564		return;
565	nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_REBUILD) +
566	    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC);
567	switch(trs->trso_type) {
568	case TR_RAID1E_NONE:
569		if (vol->v_state < G_RAID_VOLUME_S_DEGRADED)
570			return;
571		if (nr == 0) {
572			nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_NEW) +
573			    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
574			    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED);
575			if (nr == 0)
576				return;
577		}
578		g_raid_tr_raid1e_rebuild_start(tr);
579		break;
580	case TR_RAID1E_REBUILD:
581		if (vol->v_state < G_RAID_VOLUME_S_DEGRADED || nr == 0 ||
582		    trs->trso_failed_sd == sd)
583			g_raid_tr_raid1e_rebuild_abort(tr);
584		break;
585	case TR_RAID1E_RESYNC:
586		break;
587	}
588}
589
590static int
591g_raid_tr_event_raid1e(struct g_raid_tr_object *tr,
592    struct g_raid_subdisk *sd, u_int event)
593{
594
595	g_raid_tr_update_state_raid1e(tr->tro_volume, sd);
596	return (0);
597}
598
599static int
600g_raid_tr_start_raid1e(struct g_raid_tr_object *tr)
601{
602	struct g_raid_tr_raid1e_object *trs;
603	struct g_raid_volume *vol;
604
605	trs = (struct g_raid_tr_raid1e_object *)tr;
606	vol = tr->tro_volume;
607	trs->trso_starting = 0;
608	g_raid_tr_update_state_raid1e(vol, NULL);
609	return (0);
610}
611
612static int
613g_raid_tr_stop_raid1e(struct g_raid_tr_object *tr)
614{
615	struct g_raid_tr_raid1e_object *trs;
616	struct g_raid_volume *vol;
617
618	trs = (struct g_raid_tr_raid1e_object *)tr;
619	vol = tr->tro_volume;
620	trs->trso_starting = 0;
621	trs->trso_stopping = 1;
622	g_raid_tr_update_state_raid1e(vol, NULL);
623	return (0);
624}
625
626/*
627 * Select the disk to read from.  Take into account: subdisk state, running
628 * error recovery, average disk load, head position and possible cache hits.
629 */
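/*
 * Lower priority wins: the subdisk's current load forms the base value,
 * running recoveries add min(sd_recovery, 255) << 16, stale copies add the
 * copy index << 24, and one or two G_RAID_SUBDISK_LOAD_SCALE units are
 * subtracted when the disk head is already at or near the requested offset.
 */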
630#define ABS(x)		(((x) >= 0) ? (x) : (-(x)))
631static int
632g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
633    int no, off_t off, off_t len, u_int mask)
634{
635	struct g_raid_subdisk *sd;
636	off_t offset;
637	int i, best, prio, bestprio;
638
639	best = -1;
640	bestprio = INT_MAX;
641	for (i = 0; i < N; i++) {
642		sd = &vol->v_subdisks[(no + i) % vol->v_disks_count];
643		offset = off;
644		if (no + i >= vol->v_disks_count)
645			offset += vol->v_strip_size;
646
647		prio = G_RAID_SUBDISK_LOAD(sd);
648		if ((mask & (1 << sd->sd_pos)) != 0)
649			continue;
650		switch (sd->sd_state) {
651		case G_RAID_SUBDISK_S_ACTIVE:
652			break;
653		case G_RAID_SUBDISK_S_RESYNC:
			if (offset + len < sd->sd_rebuild_pos)
655				break;
656			/* FALLTHROUGH */
657		case G_RAID_SUBDISK_S_STALE:
658			prio += i << 24;
659			break;
660		case G_RAID_SUBDISK_S_REBUILD:
			if (offset + len < sd->sd_rebuild_pos)
662				break;
663			/* FALLTHROUGH */
664		default:
665			continue;
666		}
667		prio += min(sd->sd_recovery, 255) << 16;
668		/* If disk head is precisely in position - highly prefer it. */
669		if (G_RAID_SUBDISK_POS(sd) == offset)
670			prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE;
671		else
672		/* If disk head is close to position - prefer it. */
673		if (ABS(G_RAID_SUBDISK_POS(sd) - offset) <
674		    G_RAID_SUBDISK_TRACK_SIZE)
675			prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE;
676		if (prio < bestprio) {
677			bestprio = prio;
678			best = i;
679		}
680	}
681	return (best);
682}
683
684static void
685g_raid_tr_iostart_raid1e_read(struct g_raid_tr_object *tr, struct bio *bp)
686{
687	struct g_raid_volume *vol;
688	struct g_raid_subdisk *sd;
689	struct bio_queue_head queue;
690	struct bio *cbp;
691	char *addr;
692	off_t offset, start, length, remain;
693	u_int no, strip_size;
694	int best;
695
696	vol = tr->tro_volume;
697	if ((bp->bio_flags & BIO_UNMAPPED) != 0)
698		addr = NULL;
699	else
700		addr = bp->bio_data;
701	strip_size = vol->v_strip_size;
702	V2P(vol, bp->bio_offset, &no, &offset, &start);
703	remain = bp->bio_length;
704	bioq_init(&queue);
705	while (remain > 0) {
706		length = MIN(strip_size - start, remain);
707		best = g_raid_tr_raid1e_select_read_disk(vol,
708		    no, offset, length, 0);
709		KASSERT(best >= 0, ("No readable disk in volume %s!",
710		    vol->v_name));
711		no += best;
712		if (no >= vol->v_disks_count) {
713			no -= vol->v_disks_count;
714			offset += strip_size;
715		}
716		cbp = g_clone_bio(bp);
717		if (cbp == NULL)
718			goto failure;
719		cbp->bio_offset = offset + start;
720		cbp->bio_length = length;
721		if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
722			cbp->bio_ma_offset += (uintptr_t)addr;
723			cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
724			cbp->bio_ma_offset %= PAGE_SIZE;
725			cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
726			    cbp->bio_length) / PAGE_SIZE;
727		} else
728			cbp->bio_data = addr;
729		cbp->bio_caller1 = &vol->v_subdisks[no];
730		bioq_insert_tail(&queue, cbp);
731		no += N - best;
732		if (no >= vol->v_disks_count) {
733			no -= vol->v_disks_count;
734			offset += strip_size;
735		}
736		remain -= length;
737		addr += length;
738		start = 0;
739	}
740	while ((cbp = bioq_takefirst(&queue)) != NULL) {
741		sd = cbp->bio_caller1;
742		cbp->bio_caller1 = NULL;
743		g_raid_subdisk_iostart(sd, cbp);
744	}
745	return;
746failure:
747	while ((cbp = bioq_takefirst(&queue)) != NULL)
748		g_destroy_bio(cbp);
749	if (bp->bio_error == 0)
750		bp->bio_error = ENOMEM;
751	g_raid_iodone(bp, bp->bio_error);
752}
753
754static void
755g_raid_tr_iostart_raid1e_write(struct g_raid_tr_object *tr, struct bio *bp)
756{
757	struct g_raid_volume *vol;
758	struct g_raid_subdisk *sd;
759	struct bio_queue_head queue;
760	struct bio *cbp;
761	char *addr;
762	off_t offset, start, length, remain;
763	u_int no, strip_size;
764	int i;
765
766	vol = tr->tro_volume;
767	if ((bp->bio_flags & BIO_UNMAPPED) != 0)
768		addr = NULL;
769	else
770		addr = bp->bio_data;
771	strip_size = vol->v_strip_size;
772	V2P(vol, bp->bio_offset, &no, &offset, &start);
773	remain = bp->bio_length;
774	bioq_init(&queue);
775	while (remain > 0) {
776		length = MIN(strip_size - start, remain);
777		for (i = 0; i < N; i++) {
778			sd = &vol->v_subdisks[no];
779			switch (sd->sd_state) {
780			case G_RAID_SUBDISK_S_ACTIVE:
781			case G_RAID_SUBDISK_S_STALE:
782			case G_RAID_SUBDISK_S_RESYNC:
783				break;
784			case G_RAID_SUBDISK_S_REBUILD:
785				if (offset + start >= sd->sd_rebuild_pos)
786					goto nextdisk;
787				break;
788			default:
789				goto nextdisk;
790			}
791			cbp = g_clone_bio(bp);
792			if (cbp == NULL)
793				goto failure;
794			cbp->bio_offset = offset + start;
795			cbp->bio_length = length;
796			if ((bp->bio_flags & BIO_UNMAPPED) != 0 &&
797			    bp->bio_cmd != BIO_DELETE) {
798				cbp->bio_ma_offset += (uintptr_t)addr;
799				cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
800				cbp->bio_ma_offset %= PAGE_SIZE;
801				cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
802				    cbp->bio_length) / PAGE_SIZE;
803			} else
804				cbp->bio_data = addr;
805			cbp->bio_caller1 = sd;
806			bioq_insert_tail(&queue, cbp);
807nextdisk:
808			if (++no >= vol->v_disks_count) {
809				no = 0;
810				offset += strip_size;
811			}
812		}
813		remain -= length;
814		if (bp->bio_cmd != BIO_DELETE)
815			addr += length;
816		start = 0;
817	}
818	while ((cbp = bioq_takefirst(&queue)) != NULL) {
819		sd = cbp->bio_caller1;
820		cbp->bio_caller1 = NULL;
821		g_raid_subdisk_iostart(sd, cbp);
822	}
823	return;
824failure:
825	while ((cbp = bioq_takefirst(&queue)) != NULL)
826		g_destroy_bio(cbp);
827	if (bp->bio_error == 0)
828		bp->bio_error = ENOMEM;
829	g_raid_iodone(bp, bp->bio_error);
830}
831
832static void
833g_raid_tr_iostart_raid1e(struct g_raid_tr_object *tr, struct bio *bp)
834{
835	struct g_raid_volume *vol;
836	struct g_raid_tr_raid1e_object *trs;
837
838	vol = tr->tro_volume;
839	trs = (struct g_raid_tr_raid1e_object *)tr;
840	if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL &&
841	    vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL &&
842	    vol->v_state != G_RAID_VOLUME_S_DEGRADED) {
843		g_raid_iodone(bp, EIO);
844		return;
845	}
846	/*
847	 * If we're rebuilding, squeeze in rebuild activity every so often,
848	 * even when the disk is busy.  Be sure to only count real I/O
849	 * to the disk.  All 'SPECIAL' I/O is traffic generated to the disk
850	 * by this module.
851	 */
852	if (trs->trso_failed_sd != NULL &&
853	    !(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL)) {
		/* Cut the new or currently running rebuild round short. */
855		trs->trso_recover_slabs = 0;
856		if (--trs->trso_fair_io <= 0) {
857			trs->trso_fair_io = g_raid1e_rebuild_fair_io;
858			g_raid_tr_raid1e_rebuild_some(tr);
859		}
860	}
861	switch (bp->bio_cmd) {
862	case BIO_READ:
863		g_raid_tr_iostart_raid1e_read(tr, bp);
864		break;
865	case BIO_WRITE:
866	case BIO_DELETE:
867		g_raid_tr_iostart_raid1e_write(tr, bp);
868		break;
869	case BIO_FLUSH:
870		g_raid_tr_flush_common(tr, bp);
871		break;
872	default:
873		KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)",
874		    bp->bio_cmd, vol->v_name));
875		break;
876	}
877}
878
879static void
880g_raid_tr_iodone_raid1e(struct g_raid_tr_object *tr,
881    struct g_raid_subdisk *sd, struct bio *bp)
882{
883	struct bio *cbp;
884	struct g_raid_subdisk *nsd;
885	struct g_raid_volume *vol;
886	struct bio *pbp;
887	struct g_raid_tr_raid1e_object *trs;
888	off_t virtual, offset, start;
889	uintptr_t mask;
890	int error, do_write, copy, disk, best;
891
892	trs = (struct g_raid_tr_raid1e_object *)tr;
893	vol = tr->tro_volume;
894	if (bp->bio_cflags & G_RAID_BIO_FLAG_SYNC) {
895		if (trs->trso_type == TR_RAID1E_REBUILD) {
896			nsd = trs->trso_failed_sd;
897			if (bp->bio_cmd == BIO_READ) {
898
899				/* Immediately abort rebuild, if requested. */
900				if (trs->trso_flags & TR_RAID1E_F_ABORT) {
901					trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
902					g_raid_tr_raid1e_rebuild_abort(tr);
903					return;
904				}
905
906				/* On read error, skip and cross fingers. */
907				if (bp->bio_error != 0) {
908					G_RAID_LOGREQ(0, bp,
909					    "Read error during rebuild (%d), "
910					    "possible data loss!",
911					    bp->bio_error);
912					goto rebuild_round_done;
913				}
914
915				/*
916				 * The read operation finished, queue the
917				 * write and get out.
918				 */
919				G_RAID_LOGREQ(3, bp, "Rebuild read done: %d",
920				    bp->bio_error);
921				bp->bio_cmd = BIO_WRITE;
922				bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
923				bp->bio_offset = nsd->sd_rebuild_pos;
924				G_RAID_LOGREQ(3, bp, "Queueing rebuild write.");
925				g_raid_subdisk_iostart(nsd, bp);
926			} else {
927				/*
928				 * The write operation just finished.  Do
929				 * another.  We keep cloning the master bio
930				 * since it has the right buffers allocated to
931				 * it.
932				 */
933				G_RAID_LOGREQ(3, bp, "Rebuild write done: %d",
934				    bp->bio_error);
935				if (bp->bio_error != 0 ||
936				    trs->trso_flags & TR_RAID1E_F_ABORT) {
937					if ((trs->trso_flags &
938					    TR_RAID1E_F_ABORT) == 0) {
939						g_raid_tr_raid1e_fail_disk(sd->sd_softc,
940						    nsd, nsd->sd_disk);
941					}
942					trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
943					g_raid_tr_raid1e_rebuild_abort(tr);
944					return;
945				}
946rebuild_round_done:
947				trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
948				g_raid_unlock_range(tr->tro_volume,
949				    trs->trso_lock_pos, trs->trso_lock_len);
950				nsd->sd_rebuild_pos += bp->bio_length;
951				if (nsd->sd_rebuild_pos >= nsd->sd_size) {
952					g_raid_tr_raid1e_rebuild_finish(tr);
953					return;
954				}
955
956				/* Abort rebuild if we are stopping */
957				if (trs->trso_stopping) {
958					trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
959					g_raid_tr_raid1e_rebuild_abort(tr);
960					return;
961				}
962
963				if (--trs->trso_meta_update <= 0) {
964					g_raid_write_metadata(vol->v_softc,
965					    vol, nsd, nsd->sd_disk);
966					trs->trso_meta_update =
967					    g_raid1e_rebuild_meta_update;
968					/* Compensate short rebuild I/Os. */
969					if ((vol->v_disks_count % N) != 0 &&
970					    vol->v_strip_size <
971					     g_raid1e_rebuild_slab) {
972						trs->trso_meta_update *=
973						    g_raid1e_rebuild_slab;
974						trs->trso_meta_update /=
975						    vol->v_strip_size;
976					}
977				}
978				trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
979				if (--trs->trso_recover_slabs <= 0)
980					return;
981				/* Run next rebuild iteration. */
982				g_raid_tr_raid1e_rebuild_some(tr);
983			}
984		} else if (trs->trso_type == TR_RAID1E_RESYNC) {
985			/*
986			 * read good sd, read bad sd in parallel.  when both
987			 * done, compare the buffers.  write good to the bad
988			 * if different.  do the next bit of work.
989			 */
990			panic("Somehow, we think we're doing a resync");
991		}
992		return;
993	}
994	pbp = bp->bio_parent;
995	pbp->bio_inbed++;
	mask = (uintptr_t)bp->bio_caller2;
997	if (bp->bio_cmd == BIO_READ && bp->bio_error != 0) {
998		/*
999		 * Read failed on first drive.  Retry the read error on
1000		 * another disk drive, if available, before erroring out the
1001		 * read.
1002		 */
1003		sd->sd_disk->d_read_errs++;
1004		G_RAID_LOGREQ(0, bp,
1005		    "Read error (%d), %d read errors total",
1006		    bp->bio_error, sd->sd_disk->d_read_errs);
1007
1008		/*
1009		 * If there are too many read errors, we move to degraded.
1010		 * XXX Do we want to FAIL the drive (eg, make the user redo
1011		 * everything to get it back in sync), or just degrade the
1012		 * drive, which kicks off a resync?
1013		 */
1014		do_write = 0;
1015		if (sd->sd_disk->d_read_errs > g_raid_read_err_thresh)
1016			g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
1017		else if (mask == 0)
1018			do_write = 1;
1019
1020		/* Restore what we were doing. */
1021		P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
1022		V2P(vol, virtual, &disk, &offset, &start);
1023
1024		/* Find the other disk, and try to do the I/O to it. */
1025		mask |= 1 << copy;
1026		best = g_raid_tr_raid1e_select_read_disk(vol,
1027		    disk, offset, start, mask);
1028		if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
1029			disk += best;
1030			if (disk >= vol->v_disks_count) {
1031				disk -= vol->v_disks_count;
1032				offset += vol->v_strip_size;
1033			}
1034			cbp->bio_offset = offset + start;
1035			cbp->bio_length = bp->bio_length;
1036			cbp->bio_data = bp->bio_data;
1037			cbp->bio_ma = bp->bio_ma;
1038			cbp->bio_ma_offset = bp->bio_ma_offset;
1039			cbp->bio_ma_n = bp->bio_ma_n;
1040			g_destroy_bio(bp);
1041			nsd = &vol->v_subdisks[disk];
1042			G_RAID_LOGREQ(2, cbp, "Retrying read from %d",
1043			    nsd->sd_pos);
1044			if (do_write)
				mask |= 1U << 31;
1046			if ((mask & (1U << 31)) != 0)
1047				sd->sd_recovery++;
1048			cbp->bio_caller2 = (void *)mask;
1049			if (do_write) {
1050				cbp->bio_caller1 = nsd;
1051				/* Lock callback starts I/O */
1052				g_raid_lock_range(sd->sd_volume,
1053				    virtual, cbp->bio_length, pbp, cbp);
1054			} else {
1055				g_raid_subdisk_iostart(nsd, cbp);
1056			}
1057			return;
1058		}
1059		/*
1060		 * We can't retry.  Return the original error by falling
1061		 * through.  This will happen when there's only one good disk.
1062		 * We don't need to fail the raid, since its actual state is
1063		 * based on the state of the subdisks.
1064		 */
1065		G_RAID_LOGREQ(2, bp, "Couldn't retry read, failing it");
1066	}
1067	if (bp->bio_cmd == BIO_READ &&
1068	    bp->bio_error == 0 &&
1069	    (mask & (1U << 31)) != 0) {
1070		G_RAID_LOGREQ(3, bp, "Recovered data from other drive");
1071
1072		/* Restore what we were doing. */
1073		P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
1074		V2P(vol, virtual, &disk, &offset, &start);
1075
1076		/* Find best disk to write. */
1077		best = g_raid_tr_raid1e_select_read_disk(vol,
1078		    disk, offset, start, ~mask);
1079		if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
1080			disk += best;
1081			if (disk >= vol->v_disks_count) {
1082				disk -= vol->v_disks_count;
1083				offset += vol->v_strip_size;
1084			}
1085			cbp->bio_offset = offset + start;
1086			cbp->bio_cmd = BIO_WRITE;
1087			cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP;
1088			cbp->bio_caller2 = (void *)mask;
1089			g_destroy_bio(bp);
1090			G_RAID_LOGREQ(2, cbp,
1091			    "Attempting bad sector remap on failing drive.");
1092			g_raid_subdisk_iostart(&vol->v_subdisks[disk], cbp);
1093			return;
1094		}
1095	}
1096	if ((mask & (1U << 31)) != 0) {
1097		/*
1098		 * We're done with a recovery, mark the range as unlocked.
1099		 * For any write errors, we aggressively fail the disk since
1100		 * there was both a READ and a WRITE error at this location.
1101		 * Both types of errors generally indicates the drive is on
1102		 * the verge of total failure anyway.  Better to stop trusting
1103		 * it now.  However, we need to reset error to 0 in that case
1104		 * because we're not failing the original I/O which succeeded.
1105		 */
1106
1107		/* Restore what we were doing. */
1108		P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
1109		V2P(vol, virtual, &disk, &offset, &start);
1110
1111		for (copy = 0; copy < N; copy++) {
			if ((mask & (1 << copy)) != 0)
1113				vol->v_subdisks[(disk + copy) %
1114				    vol->v_disks_count].sd_recovery--;
1115		}
1116
1117		if (bp->bio_cmd == BIO_WRITE && bp->bio_error) {
1118			G_RAID_LOGREQ(0, bp, "Remap write failed: "
1119			    "failing subdisk.");
1120			g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
1121			bp->bio_error = 0;
1122		}
1123		G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error);
1124		g_raid_unlock_range(sd->sd_volume, virtual, bp->bio_length);
1125	}
1126	if (pbp->bio_cmd != BIO_READ) {
1127		if (pbp->bio_inbed == 1 || pbp->bio_error != 0)
1128			pbp->bio_error = bp->bio_error;
1129		if (pbp->bio_cmd == BIO_WRITE && bp->bio_error != 0) {
1130			G_RAID_LOGREQ(0, bp, "Write failed: failing subdisk.");
1131			g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
1132		}
1133		error = pbp->bio_error;
1134	} else
1135		error = bp->bio_error;
1136	g_destroy_bio(bp);
1137	if (pbp->bio_children == pbp->bio_inbed) {
1138		pbp->bio_completed = pbp->bio_length;
1139		g_raid_iodone(pbp, error);
1140	}
1141}
1142
1143static int
1144g_raid_tr_kerneldump_raid1e(struct g_raid_tr_object *tr,
1145    void *virtual, vm_offset_t physical, off_t boffset, size_t blength)
1146{
1147	struct g_raid_volume *vol;
1148	struct g_raid_subdisk *sd;
1149	struct bio_queue_head queue;
1150	char *addr;
1151	off_t offset, start, length, remain;
1152	u_int no, strip_size;
1153	int i, error;
1154
1155	vol = tr->tro_volume;
1156	addr = virtual;
1157	strip_size = vol->v_strip_size;
1158	V2P(vol, boffset, &no, &offset, &start);
1159	remain = blength;
1160	bioq_init(&queue);
1161	while (remain > 0) {
1162		length = MIN(strip_size - start, remain);
1163		for (i = 0; i < N; i++) {
1164			sd = &vol->v_subdisks[no];
1165			switch (sd->sd_state) {
1166			case G_RAID_SUBDISK_S_ACTIVE:
1167			case G_RAID_SUBDISK_S_STALE:
1168			case G_RAID_SUBDISK_S_RESYNC:
1169				break;
1170			case G_RAID_SUBDISK_S_REBUILD:
1171				if (offset + start >= sd->sd_rebuild_pos)
1172					goto nextdisk;
1173				break;
1174			default:
1175				goto nextdisk;
1176			}
1177			error = g_raid_subdisk_kerneldump(sd,
1178			    addr, 0, offset + start, length);
1179			if (error != 0)
1180				return (error);
1181nextdisk:
1182			if (++no >= vol->v_disks_count) {
1183				no = 0;
1184				offset += strip_size;
1185			}
1186		}
1187		remain -= length;
1188		addr += length;
1189		start = 0;
1190	}
1191	return (0);
1192}
1193
1194static int
1195g_raid_tr_locked_raid1e(struct g_raid_tr_object *tr, void *argp)
1196{
1197	struct bio *bp;
1198	struct g_raid_subdisk *sd;
1199
1200	bp = (struct bio *)argp;
1201	sd = (struct g_raid_subdisk *)bp->bio_caller1;
1202	g_raid_subdisk_iostart(sd, bp);
1203
1204	return (0);
1205}
1206
1207static int
1208g_raid_tr_idle_raid1e(struct g_raid_tr_object *tr)
1209{
1210	struct g_raid_tr_raid1e_object *trs;
1211	struct g_raid_volume *vol;
1212
1213	vol = tr->tro_volume;
1214	trs = (struct g_raid_tr_raid1e_object *)tr;
1215	trs->trso_fair_io = g_raid1e_rebuild_fair_io;
1216	trs->trso_recover_slabs = g_raid1e_rebuild_cluster_idle;
1217	/* Compensate short rebuild I/Os. */
1218	if ((vol->v_disks_count % N) != 0 &&
1219	    vol->v_strip_size < g_raid1e_rebuild_slab) {
1220		trs->trso_recover_slabs *= g_raid1e_rebuild_slab;
1221		trs->trso_recover_slabs /= vol->v_strip_size;
1222	}
1223	if (trs->trso_type == TR_RAID1E_REBUILD)
1224		g_raid_tr_raid1e_rebuild_some(tr);
1225	return (0);
1226}
1227
1228static int
1229g_raid_tr_free_raid1e(struct g_raid_tr_object *tr)
1230{
1231	struct g_raid_tr_raid1e_object *trs;
1232
1233	trs = (struct g_raid_tr_raid1e_object *)tr;
1234
1235	if (trs->trso_buffer != NULL) {
1236		free(trs->trso_buffer, M_TR_RAID1E);
1237		trs->trso_buffer = NULL;
1238	}
1239	return (0);
1240}
1241
1242G_RAID_TR_DECLARE(raid1e, "RAID1E");
1243