/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/endian.h>
#include <sys/kernel.h>
#include <sys/kobj.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <geom/geom.h>
#include <geom/geom_dbg.h>
#include "g_raid.h"
#include "g_raid_tr_if.h"

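/*
 * Number of copies kept of every data strip.  RAID1E-ADJACENT always
 * mirrors each strip onto N adjacent disks.
 */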
#define N	2

SYSCTL_DECL(_kern_geom_raid_raid1e);

#define RAID1E_REBUILD_SLAB	(1 << 20) /* One transaction in a rebuild */
static int g_raid1e_rebuild_slab = RAID1E_REBUILD_SLAB;
SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_slab_size, CTLFLAG_RWTUN,
    &g_raid1e_rebuild_slab, 0,
    "Amount of the disk to rebuild each read/write cycle of the rebuild.");

#define RAID1E_REBUILD_FAIR_IO 20 /* use 1/x of the available I/O */
static int g_raid1e_rebuild_fair_io = RAID1E_REBUILD_FAIR_IO;
SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_fair_io, CTLFLAG_RWTUN,
    &g_raid1e_rebuild_fair_io, 0,
    "Fraction of the I/O bandwidth to use when the disk is busy for rebuild.");

#define RAID1E_REBUILD_CLUSTER_IDLE 100
static int g_raid1e_rebuild_cluster_idle = RAID1E_REBUILD_CLUSTER_IDLE;
SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_cluster_idle, CTLFLAG_RWTUN,
    &g_raid1e_rebuild_cluster_idle, 0,
    "Number of slabs to do each time we trigger a rebuild cycle.");

#define RAID1E_REBUILD_META_UPDATE 1024 /* update meta data every 1GB or so */
static int g_raid1e_rebuild_meta_update = RAID1E_REBUILD_META_UPDATE;
SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_meta_update, CTLFLAG_RWTUN,
    &g_raid1e_rebuild_meta_update, 0,
    "When to update the meta data.");

static MALLOC_DEFINE(M_TR_RAID1E, "tr_raid1e_data", "GEOM_RAID RAID1E data");

#define TR_RAID1E_NONE 0
#define TR_RAID1E_REBUILD 1
#define TR_RAID1E_RESYNC 2

#define TR_RAID1E_F_DOING_SOME	0x1
#define TR_RAID1E_F_LOCKED	0x2
#define TR_RAID1E_F_ABORT	0x4

struct g_raid_tr_raid1e_object {
	struct g_raid_tr_object	 trso_base;
	int			 trso_starting;
	int			 trso_stopping;
	int			 trso_type;
	int			 trso_recover_slabs; /* slabs before rest */
	int			 trso_fair_io;
	int			 trso_meta_update;
	int			 trso_flags;
	struct g_raid_subdisk	*trso_failed_sd; /* like per volume */
	void			*trso_buffer;	 /* Buffer space */
	off_t			 trso_lock_pos; /* Locked range start. */
	off_t			 trso_lock_len; /* Locked range length. */
	struct bio		 trso_bio;
};

static g_raid_tr_taste_t g_raid_tr_taste_raid1e;
static g_raid_tr_event_t g_raid_tr_event_raid1e;
static g_raid_tr_start_t g_raid_tr_start_raid1e;
static g_raid_tr_stop_t g_raid_tr_stop_raid1e;
static g_raid_tr_iostart_t g_raid_tr_iostart_raid1e;
static g_raid_tr_iodone_t g_raid_tr_iodone_raid1e;
static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid1e;
static g_raid_tr_locked_t g_raid_tr_locked_raid1e;
static g_raid_tr_idle_t g_raid_tr_idle_raid1e;
static g_raid_tr_free_t g_raid_tr_free_raid1e;

static kobj_method_t g_raid_tr_raid1e_methods[] = {
	KOBJMETHOD(g_raid_tr_taste,	g_raid_tr_taste_raid1e),
	KOBJMETHOD(g_raid_tr_event,	g_raid_tr_event_raid1e),
	KOBJMETHOD(g_raid_tr_start,	g_raid_tr_start_raid1e),
	KOBJMETHOD(g_raid_tr_stop,	g_raid_tr_stop_raid1e),
	KOBJMETHOD(g_raid_tr_iostart,	g_raid_tr_iostart_raid1e),
	KOBJMETHOD(g_raid_tr_iodone,	g_raid_tr_iodone_raid1e),
	KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid1e),
	KOBJMETHOD(g_raid_tr_locked,	g_raid_tr_locked_raid1e),
	KOBJMETHOD(g_raid_tr_idle,	g_raid_tr_idle_raid1e),
	KOBJMETHOD(g_raid_tr_free,	g_raid_tr_free_raid1e),
	{ 0, 0 }
};

static struct g_raid_tr_class g_raid_tr_raid1e_class = {
	"RAID1E",
	g_raid_tr_raid1e_methods,
	sizeof(struct g_raid_tr_raid1e_object),
	.trc_enable = 1,
	.trc_priority = 200,
	.trc_accept_unmapped = 1
};

static void g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr);
static void g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
    struct g_raid_subdisk *sd);
static int g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
    int no, off_t off, off_t len, u_int mask);

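/*
 * Address translation between the volume's virtual (linear) space and the
 * physical subdisk space.  Virtual strip number s is stored as N adjacent
 * copies, so the physical strip index is s * N + copy and the copies are
 * laid out round-robin across all disks:
 *   disk   = (s * N + copy) % v_disks_count
 *   offset = ((s * N + copy) / v_disks_count) * v_strip_size
 * For example, with N = 2, three disks and strip size S, virtual strip 0
 * lands on disks 0 and 1 at offset 0, while virtual strip 1 lands on disk 2
 * at offset 0 and on disk 0 at offset S.  V2P() returns the location of
 * copy 0; P2V() is the inverse and also reports which copy a physical
 * strip holds.
 */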
static inline void
V2P(struct g_raid_volume *vol, off_t virt,
    int *disk, off_t *offset, off_t *start)
{
	off_t nstrip;
	u_int strip_size;

	strip_size = vol->v_strip_size;
	/* Strip number. */
	nstrip = virt / strip_size;
	/* Start position in strip. */
	*start = virt % strip_size;
	/* Disk number. */
	*disk = (nstrip * N) % vol->v_disks_count;
	/* Strip start position in disk. */
	*offset = ((nstrip * N) / vol->v_disks_count) * strip_size;
}

static inline void
P2V(struct g_raid_volume *vol, int disk, off_t offset,
    off_t *virt, int *copy)
{
	off_t nstrip, start;
	u_int strip_size;

	strip_size = vol->v_strip_size;
	/* Start position in strip. */
	start = offset % strip_size;
	/* Physical strip number. */
	nstrip = (offset / strip_size) * vol->v_disks_count + disk;
	/* Index of the physical strip (copy) inside the virtual strip. */
	*copy = nstrip % N;
	/* Offset in virtual space. */
	*virt = (nstrip / N) * strip_size + start;
}

static int
g_raid_tr_taste_raid1e(struct g_raid_tr_object *tr, struct g_raid_volume *vol)
{
	struct g_raid_tr_raid1e_object *trs;

	trs = (struct g_raid_tr_raid1e_object *)tr;
	if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID1E ||
	    tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1EA)
		return (G_RAID_TR_TASTE_FAIL);
	trs->trso_starting = 1;
	return (G_RAID_TR_TASTE_SUCCEED);
}

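/*
 * Compute the volume state when the disk count is a multiple of N.  In that
 * case the subdisks form independent groups of N exact mirrors; each group
 * is rated on its own (its best usable member is promoted to ACTIVE when no
 * member is ACTIVE) and the volume takes the state of the worst group.
 */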
static int
g_raid_tr_update_state_raid1e_even(struct g_raid_volume *vol)
{
	struct g_raid_softc *sc;
	struct g_raid_subdisk *sd, *bestsd, *worstsd;
	int i, j, state, sstate;

	sc = vol->v_softc;
	state = G_RAID_VOLUME_S_OPTIMAL;
	for (i = 0; i < vol->v_disks_count / N; i++) {
		bestsd = &vol->v_subdisks[i * N];
		for (j = 1; j < N; j++) {
			sd = &vol->v_subdisks[i * N + j];
			if (sd->sd_state > bestsd->sd_state)
				bestsd = sd;
			else if (sd->sd_state == bestsd->sd_state &&
			    (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
			     sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
			    sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
				bestsd = sd;
		}
		if (bestsd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED &&
		    bestsd->sd_state != G_RAID_SUBDISK_S_ACTIVE) {
			/* We found a reasonable candidate. */
			G_RAID_DEBUG1(1, sc,
			    "Promote subdisk %s:%d from %s to ACTIVE.",
			    vol->v_name, bestsd->sd_pos,
			    g_raid_subdisk_state2str(bestsd->sd_state));
			g_raid_change_subdisk_state(bestsd,
			    G_RAID_SUBDISK_S_ACTIVE);
			g_raid_write_metadata(sc,
			    vol, bestsd, bestsd->sd_disk);
		}
		worstsd = &vol->v_subdisks[i * N];
		for (j = 1; j < N; j++) {
			sd = &vol->v_subdisks[i * N + j];
			if (sd->sd_state < worstsd->sd_state)
				worstsd = sd;
		}
		if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
			sstate = G_RAID_VOLUME_S_OPTIMAL;
		else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
			sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
		else if (bestsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
			sstate = G_RAID_VOLUME_S_DEGRADED;
		else
			sstate = G_RAID_VOLUME_S_BROKEN;
		if (sstate < state)
			state = sstate;
	}
	return (state);
}

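/*
 * Compute the volume state when the disk count is not a multiple of N.
 * Copies then wrap around the disk set, so every disk shares strips with
 * its neighbours.  UNINITIALIZED subdisks are first promoted to STALE, then
 * each disk is rated together with the following N - 1 disks and the volume
 * again takes the worst result.
 */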
static int
g_raid_tr_update_state_raid1e_odd(struct g_raid_volume *vol)
{
	struct g_raid_softc *sc;
	struct g_raid_subdisk *sd, *bestsd, *worstsd;
	int i, j, state, sstate;

	sc = vol->v_softc;
	if (g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) ==
	    vol->v_disks_count)
		return (G_RAID_VOLUME_S_OPTIMAL);
	for (i = 0; i < vol->v_disks_count; i++) {
		sd = &vol->v_subdisks[i];
		if (sd->sd_state == G_RAID_SUBDISK_S_UNINITIALIZED) {
			/* We found a reasonable candidate. */
			G_RAID_DEBUG1(1, sc,
			    "Promote subdisk %s:%d from %s to STALE.",
			    vol->v_name, sd->sd_pos,
			    g_raid_subdisk_state2str(sd->sd_state));
			g_raid_change_subdisk_state(sd,
			    G_RAID_SUBDISK_S_STALE);
			g_raid_write_metadata(sc, vol, sd, sd->sd_disk);
		}
	}
	state = G_RAID_VOLUME_S_OPTIMAL;
	for (i = 0; i < vol->v_disks_count; i++) {
		bestsd = &vol->v_subdisks[i];
		worstsd = &vol->v_subdisks[i];
		for (j = 1; j < N; j++) {
			sd = &vol->v_subdisks[(i + j) % vol->v_disks_count];
			if (sd->sd_state > bestsd->sd_state)
				bestsd = sd;
			else if (sd->sd_state == bestsd->sd_state &&
			    (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
			     sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
			    sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
				bestsd = sd;
			if (sd->sd_state < worstsd->sd_state)
				worstsd = sd;
		}
		if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
			sstate = G_RAID_VOLUME_S_OPTIMAL;
		else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
			sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
		else if (bestsd->sd_state >= G_RAID_SUBDISK_S_STALE)
			sstate = G_RAID_VOLUME_S_DEGRADED;
		else
			sstate = G_RAID_VOLUME_S_BROKEN;
		if (sstate < state)
			state = sstate;
	}
	return (state);
}

static int
g_raid_tr_update_state_raid1e(struct g_raid_volume *vol,
    struct g_raid_subdisk *sd)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_softc *sc;
	u_int s;

	sc = vol->v_softc;
	trs = (struct g_raid_tr_raid1e_object *)vol->v_tr;
	if (trs->trso_stopping &&
	    (trs->trso_flags & TR_RAID1E_F_DOING_SOME) == 0)
		s = G_RAID_VOLUME_S_STOPPED;
	else if (trs->trso_starting)
		s = G_RAID_VOLUME_S_STARTING;
	else {
		if ((vol->v_disks_count % N) == 0)
			s = g_raid_tr_update_state_raid1e_even(vol);
		else
			s = g_raid_tr_update_state_raid1e_odd(vol);
	}
	if (s != vol->v_state) {
		g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ?
		    G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN,
		    G_RAID_EVENT_VOLUME);
		g_raid_change_volume_state(vol, s);
		if (!trs->trso_starting && !trs->trso_stopping)
			g_raid_write_metadata(sc, vol, NULL, NULL);
	}
	if (!trs->trso_starting && !trs->trso_stopping)
		g_raid_tr_raid1e_maybe_rebuild(vol->v_tr, sd);
	return (0);
}

static void
g_raid_tr_raid1e_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd,
    struct g_raid_disk *disk)
{
	struct g_raid_volume *vol;

	vol = sd->sd_volume;
	/*
	 * We don't fail the last disk in the pack, since it still has decent
	 * data on it and that's better than failing the disk if it is the root
	 * file system.
	 *
	 * XXX should this be controlled via a tunable?  It makes sense for
	 * the volume that has / on it.  I can't think of a case where we'd
	 * want the volume to go away on this kind of event.
	 */
	if ((g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) +
	     g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC) +
	     g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
	     g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED) <
	     vol->v_disks_count) &&
	    (sd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED))
		return;
	g_raid_fail_disk(sc, sd, disk);
}

static void
g_raid_tr_raid1e_rebuild_done(struct g_raid_tr_raid1e_object *trs)
{
	struct g_raid_volume *vol;
	struct g_raid_subdisk *sd;

	vol = trs->trso_base.tro_volume;
	sd = trs->trso_failed_sd;
	g_raid_write_metadata(vol->v_softc, vol, sd, sd->sd_disk);
	free(trs->trso_buffer, M_TR_RAID1E);
	trs->trso_buffer = NULL;
	trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
	trs->trso_type = TR_RAID1E_NONE;
	trs->trso_recover_slabs = 0;
	trs->trso_failed_sd = NULL;
	g_raid_tr_update_state_raid1e(vol, NULL);
}

static void
g_raid_tr_raid1e_rebuild_finish(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_subdisk *sd;

	trs = (struct g_raid_tr_raid1e_object *)tr;
	sd = trs->trso_failed_sd;
	G_RAID_DEBUG1(0, tr->tro_volume->v_softc,
	    "Subdisk %s:%d-%s rebuild completed.",
	    sd->sd_volume->v_name, sd->sd_pos,
	    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
	g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE);
	sd->sd_rebuild_pos = 0;
	g_raid_tr_raid1e_rebuild_done(trs);
}

static void
g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_subdisk *sd;
	struct g_raid_volume *vol;

	vol = tr->tro_volume;
	trs = (struct g_raid_tr_raid1e_object *)tr;
	sd = trs->trso_failed_sd;
	if (trs->trso_flags & TR_RAID1E_F_DOING_SOME) {
		G_RAID_DEBUG1(1, vol->v_softc,
		    "Subdisk %s:%d-%s rebuild is aborting.",
		    sd->sd_volume->v_name, sd->sd_pos,
		    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
		trs->trso_flags |= TR_RAID1E_F_ABORT;
	} else {
		G_RAID_DEBUG1(0, vol->v_softc,
		    "Subdisk %s:%d-%s rebuild aborted.",
		    sd->sd_volume->v_name, sd->sd_pos,
		    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
		trs->trso_flags &= ~TR_RAID1E_F_ABORT;
		if (trs->trso_flags & TR_RAID1E_F_LOCKED) {
			trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
			g_raid_unlock_range(tr->tro_volume,
			    trs->trso_lock_pos, trs->trso_lock_len);
		}
		g_raid_tr_raid1e_rebuild_done(trs);
	}
}

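/*
 * Run one rebuild iteration: translate the failed subdisk's rebuild position
 * into virtual space, pick the most up-to-date copy to read from, and queue
 * a locked read of up to g_raid1e_rebuild_slab bytes.  Ranges for which the
 * failed subdisk itself already holds the best copy are simply skipped.  The
 * matching write is issued from the I/O done handler once the read completes.
 */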
static void
g_raid_tr_raid1e_rebuild_some(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_softc *sc;
	struct g_raid_volume *vol;
	struct g_raid_subdisk *sd;
	struct bio *bp;
	off_t len, virtual, vend, offset, start;
	int disk, copy, best;

	trs = (struct g_raid_tr_raid1e_object *)tr;
	if (trs->trso_flags & TR_RAID1E_F_DOING_SOME)
		return;
	vol = tr->tro_volume;
	sc = vol->v_softc;
	sd = trs->trso_failed_sd;

	while (1) {
		if (sd->sd_rebuild_pos >= sd->sd_size) {
			g_raid_tr_raid1e_rebuild_finish(tr);
			return;
		}
		/* Get virtual offset from physical rebuild position. */
		P2V(vol, sd->sd_pos, sd->sd_rebuild_pos, &virtual, &copy);
		/* Get physical offset back to get first stripe position. */
		V2P(vol, virtual, &disk, &offset, &start);
		/* Calculate contiguous data length. */
		len = MIN(g_raid1e_rebuild_slab,
		    sd->sd_size - sd->sd_rebuild_pos);
		if ((vol->v_disks_count % N) != 0)
			len = MIN(len, vol->v_strip_size - start);
		/* Find disk with most accurate data. */
		best = g_raid_tr_raid1e_select_read_disk(vol, disk,
		    offset + start, len, 0);
		if (best < 0) {
			/* There is no valid disk. */
			g_raid_tr_raid1e_rebuild_abort(tr);
			return;
		} else if (best != copy) {
			/* Some other disk has better data. */
			break;
		}
		/* We have the most accurate data. Skip the range. */
		G_RAID_DEBUG1(3, sc, "Skipping rebuild for range %ju - %ju",
		    sd->sd_rebuild_pos, sd->sd_rebuild_pos + len);
		sd->sd_rebuild_pos += len;
	}

	bp = &trs->trso_bio;
	memset(bp, 0, sizeof(*bp));
	bp->bio_offset = offset + start +
	    ((disk + best >= vol->v_disks_count) ? vol->v_strip_size : 0);
	bp->bio_length = len;
	bp->bio_data = trs->trso_buffer;
	bp->bio_cmd = BIO_READ;
	bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
	bp->bio_caller1 = &vol->v_subdisks[(disk + best) % vol->v_disks_count];
	G_RAID_LOGREQ(3, bp, "Queueing rebuild read");
	/*
	 * If we are crossing a strip boundary, correct the affected virtual
	 * range we should lock.
	 */
	if (start + len > vol->v_strip_size) {
		P2V(vol, sd->sd_pos, sd->sd_rebuild_pos + len, &vend, &copy);
		len = vend - virtual;
	}
	trs->trso_flags |= TR_RAID1E_F_DOING_SOME;
	trs->trso_flags |= TR_RAID1E_F_LOCKED;
	trs->trso_lock_pos = virtual;
	trs->trso_lock_len = len;
	/* Lock callback starts I/O */
	g_raid_lock_range(sd->sd_volume, virtual, len, NULL, bp);
}

static void
g_raid_tr_raid1e_rebuild_start(struct g_raid_tr_object *tr)
{
	struct g_raid_volume *vol;
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_subdisk *sd;

	vol = tr->tro_volume;
	trs = (struct g_raid_tr_raid1e_object *)tr;
	if (trs->trso_failed_sd) {
		G_RAID_DEBUG1(1, vol->v_softc,
		    "Already rebuilding in start rebuild. pos %jd\n",
		    (intmax_t)trs->trso_failed_sd->sd_rebuild_pos);
		return;
	}
	sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_RESYNC);
	if (sd == NULL)
		sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_REBUILD);
	if (sd == NULL) {
		sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_STALE);
		if (sd != NULL) {
			sd->sd_rebuild_pos = 0;
			g_raid_change_subdisk_state(sd,
			    G_RAID_SUBDISK_S_RESYNC);
			g_raid_write_metadata(vol->v_softc, vol, sd, NULL);
		} else {
			sd = g_raid_get_subdisk(vol,
			    G_RAID_SUBDISK_S_UNINITIALIZED);
			if (sd == NULL)
				sd = g_raid_get_subdisk(vol,
				    G_RAID_SUBDISK_S_NEW);
			if (sd != NULL) {
				sd->sd_rebuild_pos = 0;
				g_raid_change_subdisk_state(sd,
				    G_RAID_SUBDISK_S_REBUILD);
				g_raid_write_metadata(vol->v_softc,
				    vol, sd, NULL);
			}
		}
	}
	if (sd == NULL) {
		G_RAID_DEBUG1(1, vol->v_softc,
		    "No failed disk to rebuild.  night night.");
		return;
	}
	trs->trso_failed_sd = sd;
	G_RAID_DEBUG1(0, vol->v_softc,
	    "Subdisk %s:%d-%s rebuild start at %jd.",
	    sd->sd_volume->v_name, sd->sd_pos,
	    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]",
	    trs->trso_failed_sd->sd_rebuild_pos);
	trs->trso_type = TR_RAID1E_REBUILD;
	trs->trso_buffer = malloc(g_raid1e_rebuild_slab, M_TR_RAID1E, M_WAITOK);
	trs->trso_meta_update = g_raid1e_rebuild_meta_update;
	g_raid_tr_raid1e_rebuild_some(tr);
}

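/*
 * Decide whether a rebuild should be started or aborted after a state
 * change: start one when the volume is at least DEGRADED and a NEW, STALE,
 * UNINITIALIZED, REBUILD or RESYNC subdisk exists; abort a running rebuild
 * when the volume falls below DEGRADED, no rebuilding or resyncing subdisk
 * remains, or the subdisk that changed state is the rebuild target itself.
 */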
static void
g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
    struct g_raid_subdisk *sd)
{
	struct g_raid_volume *vol;
	struct g_raid_tr_raid1e_object *trs;
	int nr;

	vol = tr->tro_volume;
	trs = (struct g_raid_tr_raid1e_object *)tr;
	if (trs->trso_stopping)
		return;
	nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_REBUILD) +
	    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC);
	switch(trs->trso_type) {
	case TR_RAID1E_NONE:
		if (vol->v_state < G_RAID_VOLUME_S_DEGRADED)
			return;
		if (nr == 0) {
			nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_NEW) +
			    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
			    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED);
			if (nr == 0)
				return;
		}
		g_raid_tr_raid1e_rebuild_start(tr);
		break;
	case TR_RAID1E_REBUILD:
		if (vol->v_state < G_RAID_VOLUME_S_DEGRADED || nr == 0 ||
		    trs->trso_failed_sd == sd)
			g_raid_tr_raid1e_rebuild_abort(tr);
		break;
	case TR_RAID1E_RESYNC:
		break;
	}
}

static int
g_raid_tr_event_raid1e(struct g_raid_tr_object *tr,
    struct g_raid_subdisk *sd, u_int event)
{

	g_raid_tr_update_state_raid1e(tr->tro_volume, sd);
	return (0);
}

static int
g_raid_tr_start_raid1e(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_volume *vol;

	trs = (struct g_raid_tr_raid1e_object *)tr;
	vol = tr->tro_volume;
	trs->trso_starting = 0;
	g_raid_tr_update_state_raid1e(vol, NULL);
	return (0);
}

static int
g_raid_tr_stop_raid1e(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_volume *vol;

	trs = (struct g_raid_tr_raid1e_object *)tr;
	vol = tr->tro_volume;
	trs->trso_starting = 0;
	trs->trso_stopping = 1;
	g_raid_tr_update_state_raid1e(vol, NULL);
	return (0);
}

/*
 * Select the disk to read from.  Take into account: subdisk state, running
 * error recovery, average disk load, head position and possible cache hits.
 */
#define ABS(x)		(((x) >= 0) ? (x) : (-(x)))
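/*
 * Priorities are composed so that more important concerns live in higher
 * bits: STALE and not-yet-resynced copies are penalized by their copy index
 * in bits 24+, the number of recovery operations in flight occupies bits
 * 16+, and the low bits carry the subdisk's current load; a head positioned
 * at or near the request subtracts whole load-scale units.  The copy with
 * the lowest priority wins; -1 is returned when no copy can serve the range.
 */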
static int
g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
    int no, off_t off, off_t len, u_int mask)
{
	struct g_raid_subdisk *sd;
	off_t offset;
	int i, best, prio, bestprio;

	best = -1;
	bestprio = INT_MAX;
	for (i = 0; i < N; i++) {
		sd = &vol->v_subdisks[(no + i) % vol->v_disks_count];
		offset = off;
		if (no + i >= vol->v_disks_count)
			offset += vol->v_strip_size;

		prio = G_RAID_SUBDISK_LOAD(sd);
		if ((mask & (1 << sd->sd_pos)) != 0)
			continue;
		switch (sd->sd_state) {
		case G_RAID_SUBDISK_S_ACTIVE:
			break;
		case G_RAID_SUBDISK_S_RESYNC:
			if (offset + off < sd->sd_rebuild_pos)
				break;
			/* FALLTHROUGH */
		case G_RAID_SUBDISK_S_STALE:
			prio += i << 24;
			break;
		case G_RAID_SUBDISK_S_REBUILD:
			if (offset + off < sd->sd_rebuild_pos)
				break;
			/* FALLTHROUGH */
		default:
			continue;
		}
		prio += min(sd->sd_recovery, 255) << 16;
		/* If disk head is precisely in position - highly prefer it. */
		if (G_RAID_SUBDISK_POS(sd) == offset)
			prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE;
		else
		/* If disk head is close to position - prefer it. */
		if (ABS(G_RAID_SUBDISK_POS(sd) - offset) <
		    G_RAID_SUBDISK_TRACK_SIZE)
			prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE;
		if (prio < bestprio) {
			bestprio = prio;
			best = i;
		}
	}
	return (best);
}

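/*
 * Split a read along strip boundaries.  For every strip the best copy is
 * chosen by g_raid_tr_raid1e_select_read_disk() and a clone of the parent
 * bio, mapped or unmapped, is queued to that subdisk.
 */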
static void
g_raid_tr_iostart_raid1e_read(struct g_raid_tr_object *tr, struct bio *bp)
{
	struct g_raid_volume *vol;
	struct g_raid_subdisk *sd;
	struct bio_queue_head queue;
	struct bio *cbp;
	char *addr;
	off_t offset, start, length, remain;
	u_int strip_size;
	int no, best;

	vol = tr->tro_volume;
	if ((bp->bio_flags & BIO_UNMAPPED) != 0)
		addr = NULL;
	else
		addr = bp->bio_data;
	strip_size = vol->v_strip_size;
	V2P(vol, bp->bio_offset, &no, &offset, &start);
	remain = bp->bio_length;
	bioq_init(&queue);
	while (remain > 0) {
		length = MIN(strip_size - start, remain);
		best = g_raid_tr_raid1e_select_read_disk(vol,
		    no, offset, length, 0);
		KASSERT(best >= 0, ("No readable disk in volume %s!",
		    vol->v_name));
		no += best;
		if (no >= vol->v_disks_count) {
			no -= vol->v_disks_count;
			offset += strip_size;
		}
		cbp = g_clone_bio(bp);
		if (cbp == NULL)
			goto failure;
		cbp->bio_offset = offset + start;
		cbp->bio_length = length;
		if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
			cbp->bio_ma_offset += (uintptr_t)addr;
			cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
			cbp->bio_ma_offset %= PAGE_SIZE;
			cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
			    cbp->bio_length) / PAGE_SIZE;
		} else
			cbp->bio_data = addr;
		cbp->bio_caller1 = &vol->v_subdisks[no];
		bioq_insert_tail(&queue, cbp);
		no += N - best;
		if (no >= vol->v_disks_count) {
			no -= vol->v_disks_count;
			offset += strip_size;
		}
		remain -= length;
		addr += length;
		start = 0;
	}
	while ((cbp = bioq_takefirst(&queue)) != NULL) {
		sd = cbp->bio_caller1;
		cbp->bio_caller1 = NULL;
		g_raid_subdisk_iostart(sd, cbp);
	}
	return;
failure:
	while ((cbp = bioq_takefirst(&queue)) != NULL)
		g_destroy_bio(cbp);
	if (bp->bio_error == 0)
		bp->bio_error = ENOMEM;
	g_raid_iodone(bp, bp->bio_error);
}

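/*
 * Split a write (or delete) along strip boundaries and clone it to every
 * copy that can accept it: ACTIVE, STALE and RESYNC subdisks always get the
 * data, REBUILD subdisks only below their current rebuild position.
 */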
static void
g_raid_tr_iostart_raid1e_write(struct g_raid_tr_object *tr, struct bio *bp)
{
	struct g_raid_volume *vol;
	struct g_raid_subdisk *sd;
	struct bio_queue_head queue;
	struct bio *cbp;
	char *addr;
	off_t offset, start, length, remain;
	u_int strip_size;
	int no, i;

	vol = tr->tro_volume;
	if ((bp->bio_flags & BIO_UNMAPPED) != 0)
		addr = NULL;
	else
		addr = bp->bio_data;
	strip_size = vol->v_strip_size;
	V2P(vol, bp->bio_offset, &no, &offset, &start);
	remain = bp->bio_length;
	bioq_init(&queue);
	while (remain > 0) {
		length = MIN(strip_size - start, remain);
		for (i = 0; i < N; i++) {
			sd = &vol->v_subdisks[no];
			switch (sd->sd_state) {
			case G_RAID_SUBDISK_S_ACTIVE:
			case G_RAID_SUBDISK_S_STALE:
			case G_RAID_SUBDISK_S_RESYNC:
				break;
			case G_RAID_SUBDISK_S_REBUILD:
				if (offset + start >= sd->sd_rebuild_pos)
					goto nextdisk;
				break;
			default:
				goto nextdisk;
			}
			cbp = g_clone_bio(bp);
			if (cbp == NULL)
				goto failure;
			cbp->bio_offset = offset + start;
			cbp->bio_length = length;
			if ((bp->bio_flags & BIO_UNMAPPED) != 0 &&
			    bp->bio_cmd != BIO_DELETE) {
				cbp->bio_ma_offset += (uintptr_t)addr;
				cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
				cbp->bio_ma_offset %= PAGE_SIZE;
				cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
				    cbp->bio_length) / PAGE_SIZE;
			} else
				cbp->bio_data = addr;
			cbp->bio_caller1 = sd;
			bioq_insert_tail(&queue, cbp);
nextdisk:
			if (++no >= vol->v_disks_count) {
				no = 0;
				offset += strip_size;
			}
		}
		remain -= length;
		if (bp->bio_cmd != BIO_DELETE)
			addr += length;
		start = 0;
	}
	while ((cbp = bioq_takefirst(&queue)) != NULL) {
		sd = cbp->bio_caller1;
		cbp->bio_caller1 = NULL;
		g_raid_subdisk_iostart(sd, cbp);
	}
	return;
failure:
	while ((cbp = bioq_takefirst(&queue)) != NULL)
		g_destroy_bio(cbp);
	if (bp->bio_error == 0)
		bp->bio_error = ENOMEM;
	g_raid_iodone(bp, bp->bio_error);
}

static void
g_raid_tr_iostart_raid1e(struct g_raid_tr_object *tr, struct bio *bp)
{
	struct g_raid_volume *vol;
	struct g_raid_tr_raid1e_object *trs;

	vol = tr->tro_volume;
	trs = (struct g_raid_tr_raid1e_object *)tr;
	if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL &&
	    vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL &&
	    vol->v_state != G_RAID_VOLUME_S_DEGRADED) {
		g_raid_iodone(bp, EIO);
		return;
	}
	/*
	 * If we're rebuilding, squeeze in rebuild activity every so often,
	 * even when the disk is busy.  Be sure to only count real I/O
	 * to the disk.  All 'SPECIAL' I/O is traffic generated to the disk
	 * by this module.
	 */
	if (trs->trso_failed_sd != NULL &&
	    !(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL)) {
		/* Make this new or currently running round short. */
		trs->trso_recover_slabs = 0;
		if (--trs->trso_fair_io <= 0) {
			trs->trso_fair_io = g_raid1e_rebuild_fair_io;
			g_raid_tr_raid1e_rebuild_some(tr);
		}
	}
	switch (bp->bio_cmd) {
	case BIO_READ:
		g_raid_tr_iostart_raid1e_read(tr, bp);
		break;
	case BIO_WRITE:
	case BIO_DELETE:
		g_raid_tr_iostart_raid1e_write(tr, bp);
		break;
	case BIO_SPEEDUP:
	case BIO_FLUSH:
		g_raid_tr_flush_common(tr, bp);
		break;
	default:
		KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)",
		    bp->bio_cmd, vol->v_name));
		break;
	}
}

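/*
 * I/O completion handler.  Rebuild/resync requests (marked SYNC) turn a
 * finished read into the corresponding write and advance the rebuild
 * position.  For regular reads that failed, the request is retried on
 * another copy and, if that succeeds, the recovered data is written back
 * to the failing copy to remap the bad sector.  Write errors fail the
 * subdisk.  The parent bio is completed once all clones have returned.
 */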
static void
g_raid_tr_iodone_raid1e(struct g_raid_tr_object *tr,
    struct g_raid_subdisk *sd, struct bio *bp)
{
	struct bio *cbp;
	struct g_raid_subdisk *nsd;
	struct g_raid_volume *vol;
	struct bio *pbp;
	struct g_raid_tr_raid1e_object *trs;
	off_t virtual, offset, start;
	uintptr_t mask;
	int error, do_write, copy, disk, best;

	trs = (struct g_raid_tr_raid1e_object *)tr;
	vol = tr->tro_volume;
	if (bp->bio_cflags & G_RAID_BIO_FLAG_SYNC) {
		if (trs->trso_type == TR_RAID1E_REBUILD) {
			nsd = trs->trso_failed_sd;
			if (bp->bio_cmd == BIO_READ) {
				/* Immediately abort rebuild, if requested. */
				if (trs->trso_flags & TR_RAID1E_F_ABORT) {
					trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
					g_raid_tr_raid1e_rebuild_abort(tr);
					return;
				}

				/* On read error, skip and cross fingers. */
				if (bp->bio_error != 0) {
					G_RAID_LOGREQ(0, bp,
					    "Read error during rebuild (%d), "
					    "possible data loss!",
					    bp->bio_error);
					goto rebuild_round_done;
				}

				/*
				 * The read operation finished, queue the
				 * write and get out.
				 */
				G_RAID_LOGREQ(3, bp, "Rebuild read done: %d",
				    bp->bio_error);
				bp->bio_cmd = BIO_WRITE;
				bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
				bp->bio_offset = nsd->sd_rebuild_pos;
				G_RAID_LOGREQ(3, bp, "Queueing rebuild write.");
				g_raid_subdisk_iostart(nsd, bp);
			} else {
				/*
				 * The write operation just finished.  Do
				 * another.  We keep cloning the master bio
				 * since it has the right buffers allocated to
				 * it.
				 */
				G_RAID_LOGREQ(3, bp, "Rebuild write done: %d",
				    bp->bio_error);
				if (bp->bio_error != 0 ||
				    trs->trso_flags & TR_RAID1E_F_ABORT) {
					if ((trs->trso_flags &
					    TR_RAID1E_F_ABORT) == 0) {
						g_raid_tr_raid1e_fail_disk(sd->sd_softc,
						    nsd, nsd->sd_disk);
					}
					trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
					g_raid_tr_raid1e_rebuild_abort(tr);
					return;
				}
rebuild_round_done:
				trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
				g_raid_unlock_range(tr->tro_volume,
				    trs->trso_lock_pos, trs->trso_lock_len);
				nsd->sd_rebuild_pos += bp->bio_length;
				if (nsd->sd_rebuild_pos >= nsd->sd_size) {
					g_raid_tr_raid1e_rebuild_finish(tr);
					return;
				}

				/* Abort rebuild if we are stopping */
				if (trs->trso_stopping) {
					trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
					g_raid_tr_raid1e_rebuild_abort(tr);
					return;
				}

				if (--trs->trso_meta_update <= 0) {
					g_raid_write_metadata(vol->v_softc,
					    vol, nsd, nsd->sd_disk);
					trs->trso_meta_update =
					    g_raid1e_rebuild_meta_update;
					/* Compensate short rebuild I/Os. */
					if ((vol->v_disks_count % N) != 0 &&
					    vol->v_strip_size <
					     g_raid1e_rebuild_slab) {
						trs->trso_meta_update *=
						    g_raid1e_rebuild_slab;
						trs->trso_meta_update /=
						    vol->v_strip_size;
					}
				}
				trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
				if (--trs->trso_recover_slabs <= 0)
					return;
				/* Run next rebuild iteration. */
				g_raid_tr_raid1e_rebuild_some(tr);
			}
		} else if (trs->trso_type == TR_RAID1E_RESYNC) {
			/*
			 * read good sd, read bad sd in parallel.  when both
			 * done, compare the buffers.  write good to the bad
			 * if different.  do the next bit of work.
			 */
			panic("Somehow, we think we're doing a resync");
		}
		return;
	}
	pbp = bp->bio_parent;
	pbp->bio_inbed++;
	mask = (intptr_t)bp->bio_caller2;
	if (bp->bio_cmd == BIO_READ && bp->bio_error != 0) {
		/*
		 * Read failed on first drive.  Retry the read error on
		 * another disk drive, if available, before erroring out the
		 * read.
		 */
		sd->sd_disk->d_read_errs++;
		G_RAID_LOGREQ(0, bp,
		    "Read error (%d), %d read errors total",
		    bp->bio_error, sd->sd_disk->d_read_errs);

		/*
		 * If there are too many read errors, we move to degraded.
		 * XXX Do we want to FAIL the drive (e.g., make the user redo
		 * everything to get it back in sync), or just degrade the
		 * drive, which kicks off a resync?
		 */
		do_write = 0;
		if (sd->sd_disk->d_read_errs > g_raid_read_err_thresh)
			g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
		else if (mask == 0)
			do_write = 1;

		/* Restore what we were doing. */
		P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
		V2P(vol, virtual, &disk, &offset, &start);

		/* Find the other disk, and try to do the I/O to it. */
		mask |= 1 << copy;
		best = g_raid_tr_raid1e_select_read_disk(vol,
		    disk, offset, start, mask);
		if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
			disk += best;
			if (disk >= vol->v_disks_count) {
				disk -= vol->v_disks_count;
				offset += vol->v_strip_size;
			}
			cbp->bio_offset = offset + start;
			cbp->bio_length = bp->bio_length;
			cbp->bio_data = bp->bio_data;
			cbp->bio_ma = bp->bio_ma;
			cbp->bio_ma_offset = bp->bio_ma_offset;
			cbp->bio_ma_n = bp->bio_ma_n;
			g_destroy_bio(bp);
			nsd = &vol->v_subdisks[disk];
			G_RAID_LOGREQ(2, cbp, "Retrying read from %d",
			    nsd->sd_pos);
			if (do_write)
				mask |= 1 << 31;
			if ((mask & (1U << 31)) != 0)
				sd->sd_recovery++;
			cbp->bio_caller2 = (void *)mask;
			if (do_write) {
				cbp->bio_caller1 = nsd;
				/* Lock callback starts I/O */
				g_raid_lock_range(sd->sd_volume,
				    virtual, cbp->bio_length, pbp, cbp);
			} else {
				g_raid_subdisk_iostart(nsd, cbp);
			}
			return;
		}
		/*
		 * We can't retry.  Return the original error by falling
		 * through.  This will happen when there's only one good disk.
		 * We don't need to fail the raid, since its actual state is
		 * based on the state of the subdisks.
		 */
		G_RAID_LOGREQ(2, bp, "Couldn't retry read, failing it");
	}
	if (bp->bio_cmd == BIO_READ &&
	    bp->bio_error == 0 &&
	    (mask & (1U << 31)) != 0) {
		G_RAID_LOGREQ(3, bp, "Recovered data from other drive");

		/* Restore what we were doing. */
		P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
		V2P(vol, virtual, &disk, &offset, &start);

		/* Find best disk to write. */
		best = g_raid_tr_raid1e_select_read_disk(vol,
		    disk, offset, start, ~mask);
		if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
			disk += best;
			if (disk >= vol->v_disks_count) {
				disk -= vol->v_disks_count;
				offset += vol->v_strip_size;
			}
			cbp->bio_offset = offset + start;
			cbp->bio_cmd = BIO_WRITE;
			cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP;
			cbp->bio_caller2 = (void *)mask;
			g_destroy_bio(bp);
			G_RAID_LOGREQ(2, cbp,
			    "Attempting bad sector remap on failing drive.");
			g_raid_subdisk_iostart(&vol->v_subdisks[disk], cbp);
			return;
		}
	}
	if ((mask & (1U << 31)) != 0) {
		/*
		 * We're done with a recovery, mark the range as unlocked.
		 * For any write errors, we aggressively fail the disk since
		 * there was both a READ and a WRITE error at this location.
		 * Both types of errors generally indicate the drive is on
		 * the verge of total failure anyway.  Better to stop trusting
		 * it now.  However, we need to reset error to 0 in that case
		 * because we're not failing the original I/O which succeeded.
		 */

		/* Restore what we were doing. */
		P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
		V2P(vol, virtual, &disk, &offset, &start);

		for (copy = 0; copy < N; copy++) {
			if ((mask & (1 << copy)) != 0)
				vol->v_subdisks[(disk + copy) %
				    vol->v_disks_count].sd_recovery--;
		}

		if (bp->bio_cmd == BIO_WRITE && bp->bio_error) {
			G_RAID_LOGREQ(0, bp, "Remap write failed: "
			    "failing subdisk.");
			g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
			bp->bio_error = 0;
		}
		G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error);
		g_raid_unlock_range(sd->sd_volume, virtual, bp->bio_length);
	}
	if (pbp->bio_cmd != BIO_READ) {
		if (pbp->bio_inbed == 1 || pbp->bio_error != 0)
			pbp->bio_error = bp->bio_error;
		if (pbp->bio_cmd == BIO_WRITE && bp->bio_error != 0) {
			G_RAID_LOGREQ(0, bp, "Write failed: failing subdisk.");
			g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
		}
		error = pbp->bio_error;
	} else
		error = bp->bio_error;
	g_destroy_bio(bp);
	if (pbp->bio_children == pbp->bio_inbed) {
		pbp->bio_completed = pbp->bio_length;
		g_raid_iodone(pbp, error);
	}
}

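/*
 * Kernel dump support: walk the dump region strip by strip and write it
 * synchronously to every copy that would also receive a normal write.
 */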
static int
g_raid_tr_kerneldump_raid1e(struct g_raid_tr_object *tr,
    void *virtual, vm_offset_t physical, off_t boffset, size_t blength)
{
	struct g_raid_volume *vol;
	struct g_raid_subdisk *sd;
	struct bio_queue_head queue;
	char *addr;
	off_t offset, start, length, remain;
	u_int strip_size;
	int no, i, error;

	vol = tr->tro_volume;
	addr = virtual;
	strip_size = vol->v_strip_size;
	V2P(vol, boffset, &no, &offset, &start);
	remain = blength;
	bioq_init(&queue);
	while (remain > 0) {
		length = MIN(strip_size - start, remain);
		for (i = 0; i < N; i++) {
			sd = &vol->v_subdisks[no];
			switch (sd->sd_state) {
			case G_RAID_SUBDISK_S_ACTIVE:
			case G_RAID_SUBDISK_S_STALE:
			case G_RAID_SUBDISK_S_RESYNC:
				break;
			case G_RAID_SUBDISK_S_REBUILD:
				if (offset + start >= sd->sd_rebuild_pos)
					goto nextdisk;
				break;
			default:
				goto nextdisk;
			}
			error = g_raid_subdisk_kerneldump(sd,
			    addr, 0, offset + start, length);
			if (error != 0)
				return (error);
nextdisk:
			if (++no >= vol->v_disks_count) {
				no = 0;
				offset += strip_size;
			}
		}
		remain -= length;
		addr += length;
		start = 0;
	}
	return (0);
}

static int
g_raid_tr_locked_raid1e(struct g_raid_tr_object *tr, void *argp)
{
	struct bio *bp;
	struct g_raid_subdisk *sd;

	bp = (struct bio *)argp;
	sd = (struct g_raid_subdisk *)bp->bio_caller1;
	g_raid_subdisk_iostart(sd, bp);

	return (0);
}

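/*
 * Called when the volume goes idle: refill the fair-I/O and cluster budgets
 * and, if a rebuild is pending, continue it at the larger idle pace.
 */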
static int
g_raid_tr_idle_raid1e(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_volume *vol;

	vol = tr->tro_volume;
	trs = (struct g_raid_tr_raid1e_object *)tr;
	trs->trso_fair_io = g_raid1e_rebuild_fair_io;
	trs->trso_recover_slabs = g_raid1e_rebuild_cluster_idle;
	/* Compensate short rebuild I/Os. */
	if ((vol->v_disks_count % N) != 0 &&
	    vol->v_strip_size < g_raid1e_rebuild_slab) {
		trs->trso_recover_slabs *= g_raid1e_rebuild_slab;
		trs->trso_recover_slabs /= vol->v_strip_size;
	}
	if (trs->trso_type == TR_RAID1E_REBUILD)
		g_raid_tr_raid1e_rebuild_some(tr);
	return (0);
}

static int
g_raid_tr_free_raid1e(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;

	trs = (struct g_raid_tr_raid1e_object *)tr;

	if (trs->trso_buffer != NULL) {
		free(trs->trso_buffer, M_TR_RAID1E);
		trs->trso_buffer = NULL;
	}
	return (0);
}

G_RAID_TR_DECLARE(raid1e, "RAID1E");