/*-
 * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/geom/raid/tr_raid1e.c 240465 2012-09-13 13:27:09Z mav $");

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/endian.h>
#include <sys/kernel.h>
#include <sys/kobj.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <geom/geom.h>
#include "geom/raid/g_raid.h"
#include "g_raid_tr_if.h"

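/*
 * Number of copies kept of each data strip.  The RAID1E layout used here
 * interleaves N copies of every strip across all disks of the volume.
 */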
#define N	2

SYSCTL_DECL(_kern_geom_raid_raid1e);

#define RAID1E_REBUILD_SLAB	(1 << 20) /* One transaction in a rebuild */
static int g_raid1e_rebuild_slab = RAID1E_REBUILD_SLAB;
TUNABLE_INT("kern.geom.raid.raid1e.rebuild_slab_size",
    &g_raid1e_rebuild_slab);
SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_slab_size, CTLFLAG_RW,
    &g_raid1e_rebuild_slab, 0,
    "Amount of the disk to rebuild each read/write cycle of the rebuild.");

#define RAID1E_REBUILD_FAIR_IO 20 /* use 1/x of the available I/O */
static int g_raid1e_rebuild_fair_io = RAID1E_REBUILD_FAIR_IO;
TUNABLE_INT("kern.geom.raid.raid1e.rebuild_fair_io",
    &g_raid1e_rebuild_fair_io);
SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_fair_io, CTLFLAG_RW,
    &g_raid1e_rebuild_fair_io, 0,
    "Fraction of the I/O bandwidth to use when disk busy for rebuild.");

#define RAID1E_REBUILD_CLUSTER_IDLE 100
static int g_raid1e_rebuild_cluster_idle = RAID1E_REBUILD_CLUSTER_IDLE;
TUNABLE_INT("kern.geom.raid.raid1e.rebuild_cluster_idle",
    &g_raid1e_rebuild_cluster_idle);
SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_cluster_idle, CTLFLAG_RW,
    &g_raid1e_rebuild_cluster_idle, 0,
    "Number of slabs to do each time we trigger a rebuild cycle");

#define RAID1E_REBUILD_META_UPDATE 1024 /* update metadata every 1GB or so */
static int g_raid1e_rebuild_meta_update = RAID1E_REBUILD_META_UPDATE;
TUNABLE_INT("kern.geom.raid.raid1e.rebuild_meta_update",
    &g_raid1e_rebuild_meta_update);
SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_meta_update, CTLFLAG_RW,
    &g_raid1e_rebuild_meta_update, 0,
    "When to update the meta data.");
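
/*
 * The four rebuild knobs above are registered both as loader tunables
 * (TUNABLE_INT) and as writable sysctls (CTLFLAG_RW), so they can be preset
 * in loader.conf or adjusted at run time, for example:
 *	sysctl kern.geom.raid.raid1e.rebuild_fair_io=10
 */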

static MALLOC_DEFINE(M_TR_RAID1E, "tr_raid1e_data", "GEOM_RAID RAID1E data");

#define TR_RAID1E_NONE 0
#define TR_RAID1E_REBUILD 1
#define TR_RAID1E_RESYNC 2

#define TR_RAID1E_F_DOING_SOME	0x1
#define TR_RAID1E_F_LOCKED	0x2
#define TR_RAID1E_F_ABORT	0x4

struct g_raid_tr_raid1e_object {
	struct g_raid_tr_object	 trso_base;
	int			 trso_starting;
	int			 trso_stopping;
	int			 trso_type;
	int			 trso_recover_slabs; /* slabs before rest */
	int			 trso_fair_io;
	int			 trso_meta_update;
	int			 trso_flags;
	struct g_raid_subdisk	*trso_failed_sd; /* like per volume */
	void			*trso_buffer;	 /* Buffer space */
	off_t			 trso_lock_pos; /* Locked range start. */
	off_t			 trso_lock_len; /* Locked range length. */
	struct bio		 trso_bio;
};

static g_raid_tr_taste_t g_raid_tr_taste_raid1e;
static g_raid_tr_event_t g_raid_tr_event_raid1e;
static g_raid_tr_start_t g_raid_tr_start_raid1e;
static g_raid_tr_stop_t g_raid_tr_stop_raid1e;
static g_raid_tr_iostart_t g_raid_tr_iostart_raid1e;
static g_raid_tr_iodone_t g_raid_tr_iodone_raid1e;
static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid1e;
static g_raid_tr_locked_t g_raid_tr_locked_raid1e;
static g_raid_tr_idle_t g_raid_tr_idle_raid1e;
static g_raid_tr_free_t g_raid_tr_free_raid1e;

static kobj_method_t g_raid_tr_raid1e_methods[] = {
	KOBJMETHOD(g_raid_tr_taste,	g_raid_tr_taste_raid1e),
	KOBJMETHOD(g_raid_tr_event,	g_raid_tr_event_raid1e),
	KOBJMETHOD(g_raid_tr_start,	g_raid_tr_start_raid1e),
	KOBJMETHOD(g_raid_tr_stop,	g_raid_tr_stop_raid1e),
	KOBJMETHOD(g_raid_tr_iostart,	g_raid_tr_iostart_raid1e),
	KOBJMETHOD(g_raid_tr_iodone,	g_raid_tr_iodone_raid1e),
	KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid1e),
	KOBJMETHOD(g_raid_tr_locked,	g_raid_tr_locked_raid1e),
	KOBJMETHOD(g_raid_tr_idle,	g_raid_tr_idle_raid1e),
	KOBJMETHOD(g_raid_tr_free,	g_raid_tr_free_raid1e),
	{ 0, 0 }
};

static struct g_raid_tr_class g_raid_tr_raid1e_class = {
	"RAID1E",
	g_raid_tr_raid1e_methods,
	sizeof(struct g_raid_tr_raid1e_object),
	.trc_enable = 1,
	.trc_priority = 200
};

static void g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr);
static void g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
    struct g_raid_subdisk *sd);
static int g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
    int no, off_t off, off_t len, u_int mask);

static inline void
V2P(struct g_raid_volume *vol, off_t virt,
    int *disk, off_t *offset, off_t *start)
{
	off_t nstrip;
	u_int strip_size;

	strip_size = vol->v_strip_size;
	/* Strip number. */
	nstrip = virt / strip_size;
	/* Start position in strip. */
	*start = virt % strip_size;
	/* Disk number. */
	*disk = (nstrip * N) % vol->v_disks_count;
	/* Strip start position in disk. */
	*offset = ((nstrip * N) / vol->v_disks_count) * strip_size;
}

static inline void
P2V(struct g_raid_volume *vol, int disk, off_t offset,
    off_t *virt, int *copy)
{
	off_t nstrip, start;
	u_int strip_size;

	strip_size = vol->v_strip_size;
	/* Start position in strip. */
	start = offset % strip_size;
	/* Physical strip number. */
	nstrip = (offset / strip_size) * vol->v_disks_count + disk;
	/* Number of physical strip (copy) inside virtual strip. */
	*copy = nstrip % N;
	/* Offset in virtual space. */
	*virt = (nstrip / N) * strip_size + start;
}
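
/*
 * Worked example of the translations above (illustration only; the numbers
 * describe an assumed configuration, not anything read from metadata): with
 * N = 2 copies, 3 disks and a 64 KiB strip, virtual offset 200 KiB falls in
 * virtual strip 3 at in-strip offset 8 KiB.  V2P() maps it to physical strip
 * 3 * N = 6, i.e. disk (6 % 3) = 0 at disk offset (6 / 3) * 64 KiB = 128 KiB;
 * the second copy is the next physical strip, disk 1 at the same offset.
 * P2V(disk 0, 128 KiB + 8 KiB) inverts this back to virtual offset 200 KiB
 * with copy number 0.
 */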

static int
g_raid_tr_taste_raid1e(struct g_raid_tr_object *tr, struct g_raid_volume *vol)
{
	struct g_raid_tr_raid1e_object *trs;

	trs = (struct g_raid_tr_raid1e_object *)tr;
	if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID1E ||
	    tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1EA)
		return (G_RAID_TR_TASTE_FAIL);
	trs->trso_starting = 1;
	return (G_RAID_TR_TASTE_SUCCEED);
}

static int
g_raid_tr_update_state_raid1e_even(struct g_raid_volume *vol)
{
	struct g_raid_softc *sc;
	struct g_raid_subdisk *sd, *bestsd, *worstsd;
	int i, j, state, sstate;

	sc = vol->v_softc;
	state = G_RAID_VOLUME_S_OPTIMAL;
	for (i = 0; i < vol->v_disks_count / N; i++) {
		bestsd = &vol->v_subdisks[i * N];
		for (j = 1; j < N; j++) {
			sd = &vol->v_subdisks[i * N + j];
			if (sd->sd_state > bestsd->sd_state)
				bestsd = sd;
			else if (sd->sd_state == bestsd->sd_state &&
			    (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
			     sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
			    sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
				bestsd = sd;
		}
		if (bestsd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED &&
		    bestsd->sd_state != G_RAID_SUBDISK_S_ACTIVE) {
			/* We found a reasonable candidate. */
			G_RAID_DEBUG1(1, sc,
			    "Promote subdisk %s:%d from %s to ACTIVE.",
			    vol->v_name, bestsd->sd_pos,
			    g_raid_subdisk_state2str(bestsd->sd_state));
			g_raid_change_subdisk_state(bestsd,
			    G_RAID_SUBDISK_S_ACTIVE);
			g_raid_write_metadata(sc,
			    vol, bestsd, bestsd->sd_disk);
		}
		worstsd = &vol->v_subdisks[i * N];
		for (j = 1; j < N; j++) {
			sd = &vol->v_subdisks[i * N + j];
			if (sd->sd_state < worstsd->sd_state)
				worstsd = sd;
		}
		if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
			sstate = G_RAID_VOLUME_S_OPTIMAL;
		else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
			sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
		else if (bestsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
			sstate = G_RAID_VOLUME_S_DEGRADED;
		else
			sstate = G_RAID_VOLUME_S_BROKEN;
		if (sstate < state)
			state = sstate;
	}
	return (state);
}

static int
g_raid_tr_update_state_raid1e_odd(struct g_raid_volume *vol)
{
	struct g_raid_softc *sc;
	struct g_raid_subdisk *sd, *bestsd, *worstsd;
	int i, j, state, sstate;

	sc = vol->v_softc;
	if (g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) ==
	    vol->v_disks_count)
		return (G_RAID_VOLUME_S_OPTIMAL);
	for (i = 0; i < vol->v_disks_count; i++) {
		sd = &vol->v_subdisks[i];
		if (sd->sd_state == G_RAID_SUBDISK_S_UNINITIALIZED) {
			/* We found a reasonable candidate. */
			G_RAID_DEBUG1(1, sc,
			    "Promote subdisk %s:%d from %s to STALE.",
			    vol->v_name, sd->sd_pos,
			    g_raid_subdisk_state2str(sd->sd_state));
			g_raid_change_subdisk_state(sd,
			    G_RAID_SUBDISK_S_STALE);
			g_raid_write_metadata(sc, vol, sd, sd->sd_disk);
		}
	}
	state = G_RAID_VOLUME_S_OPTIMAL;
	for (i = 0; i < vol->v_disks_count; i++) {
		bestsd = &vol->v_subdisks[i];
		worstsd = &vol->v_subdisks[i];
		for (j = 1; j < N; j++) {
			sd = &vol->v_subdisks[(i + j) % vol->v_disks_count];
			if (sd->sd_state > bestsd->sd_state)
				bestsd = sd;
			else if (sd->sd_state == bestsd->sd_state &&
			    (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
			     sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
			    sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
				bestsd = sd;
			if (sd->sd_state < worstsd->sd_state)
				worstsd = sd;
		}
		if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
			sstate = G_RAID_VOLUME_S_OPTIMAL;
		else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
			sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
		else if (bestsd->sd_state >= G_RAID_SUBDISK_S_STALE)
			sstate = G_RAID_VOLUME_S_DEGRADED;
		else
			sstate = G_RAID_VOLUME_S_BROKEN;
		if (sstate < state)
			state = sstate;
	}
	return (state);
}

static int
g_raid_tr_update_state_raid1e(struct g_raid_volume *vol,
    struct g_raid_subdisk *sd)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_softc *sc;
	u_int s;

	sc = vol->v_softc;
	trs = (struct g_raid_tr_raid1e_object *)vol->v_tr;
	if (trs->trso_stopping &&
	    (trs->trso_flags & TR_RAID1E_F_DOING_SOME) == 0)
		s = G_RAID_VOLUME_S_STOPPED;
	else if (trs->trso_starting)
		s = G_RAID_VOLUME_S_STARTING;
	else {
		if ((vol->v_disks_count % N) == 0)
			s = g_raid_tr_update_state_raid1e_even(vol);
		else
			s = g_raid_tr_update_state_raid1e_odd(vol);
	}
	if (s != vol->v_state) {
		g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ?
		    G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN,
		    G_RAID_EVENT_VOLUME);
		g_raid_change_volume_state(vol, s);
		if (!trs->trso_starting && !trs->trso_stopping)
			g_raid_write_metadata(sc, vol, NULL, NULL);
	}
	if (!trs->trso_starting && !trs->trso_stopping)
		g_raid_tr_raid1e_maybe_rebuild(vol->v_tr, sd);
	return (0);
}

static void
g_raid_tr_raid1e_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd,
    struct g_raid_disk *disk)
{
	struct g_raid_volume *vol;

	vol = sd->sd_volume;
	/*
	 * We don't fail the last disk in the pack, since it still has decent
	 * data on it and that's better than failing the disk if it is the root
	 * file system.
	 *
	 * XXX should this be controlled via a tunable?  It makes sense for
	 * the volume that has / on it.  I can't think of a case where we'd
	 * want the volume to go away on this kind of event.
	 */
	if ((g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) +
	     g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC) +
	     g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
	     g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED) <
	     vol->v_disks_count) &&
	    (sd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED))
		return;
	g_raid_fail_disk(sc, sd, disk);
}

static void
g_raid_tr_raid1e_rebuild_done(struct g_raid_tr_raid1e_object *trs)
{
	struct g_raid_volume *vol;
	struct g_raid_subdisk *sd;

	vol = trs->trso_base.tro_volume;
	sd = trs->trso_failed_sd;
	g_raid_write_metadata(vol->v_softc, vol, sd, sd->sd_disk);
	free(trs->trso_buffer, M_TR_RAID1E);
	trs->trso_buffer = NULL;
	trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
	trs->trso_type = TR_RAID1E_NONE;
	trs->trso_recover_slabs = 0;
	trs->trso_failed_sd = NULL;
	g_raid_tr_update_state_raid1e(vol, NULL);
}

static void
g_raid_tr_raid1e_rebuild_finish(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_subdisk *sd;

	trs = (struct g_raid_tr_raid1e_object *)tr;
	sd = trs->trso_failed_sd;
	G_RAID_DEBUG1(0, tr->tro_volume->v_softc,
	    "Subdisk %s:%d-%s rebuild completed.",
	    sd->sd_volume->v_name, sd->sd_pos,
	    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
	g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE);
	sd->sd_rebuild_pos = 0;
	g_raid_tr_raid1e_rebuild_done(trs);
}

static void
g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_subdisk *sd;
	struct g_raid_volume *vol;

	vol = tr->tro_volume;
	trs = (struct g_raid_tr_raid1e_object *)tr;
	sd = trs->trso_failed_sd;
	if (trs->trso_flags & TR_RAID1E_F_DOING_SOME) {
		G_RAID_DEBUG1(1, vol->v_softc,
		    "Subdisk %s:%d-%s rebuild is aborting.",
		    sd->sd_volume->v_name, sd->sd_pos,
		    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
		trs->trso_flags |= TR_RAID1E_F_ABORT;
	} else {
		G_RAID_DEBUG1(0, vol->v_softc,
		    "Subdisk %s:%d-%s rebuild aborted.",
		    sd->sd_volume->v_name, sd->sd_pos,
		    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
		trs->trso_flags &= ~TR_RAID1E_F_ABORT;
		if (trs->trso_flags & TR_RAID1E_F_LOCKED) {
			trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
			g_raid_unlock_range(tr->tro_volume,
			    trs->trso_lock_pos, trs->trso_lock_len);
		}
		g_raid_tr_raid1e_rebuild_done(trs);
	}
}

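/*
 * Copy one slab of data for the rebuilding subdisk.  Summary of the flow
 * below (descriptive only): translate the rebuild position to the virtual
 * address space with P2V(), pick the most up-to-date copy with
 * g_raid_tr_raid1e_select_read_disk(), skip ranges where the rebuilding copy
 * already holds the freshest data, then lock the affected virtual range and
 * queue a read into trso_buffer; the matching write to the rebuild target is
 * issued from the iodone path once the read completes.
 */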
static void
g_raid_tr_raid1e_rebuild_some(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_softc *sc;
	struct g_raid_volume *vol;
	struct g_raid_subdisk *sd;
	struct bio *bp;
	off_t len, virtual, vend, offset, start;
	int disk, copy, best;

	trs = (struct g_raid_tr_raid1e_object *)tr;
	if (trs->trso_flags & TR_RAID1E_F_DOING_SOME)
		return;
	vol = tr->tro_volume;
	sc = vol->v_softc;
	sd = trs->trso_failed_sd;

	while (1) {
		if (sd->sd_rebuild_pos >= sd->sd_size) {
			g_raid_tr_raid1e_rebuild_finish(tr);
			return;
		}
		/* Get virtual offset from physical rebuild position. */
		P2V(vol, sd->sd_pos, sd->sd_rebuild_pos, &virtual, &copy);
		/* Get physical offset back to get first stripe position. */
		V2P(vol, virtual, &disk, &offset, &start);
		/* Calculate contiguous data length. */
		len = MIN(g_raid1e_rebuild_slab,
		    sd->sd_size - sd->sd_rebuild_pos);
		if ((vol->v_disks_count % N) != 0)
			len = MIN(len, vol->v_strip_size - start);
		/* Find disk with most accurate data. */
		best = g_raid_tr_raid1e_select_read_disk(vol, disk,
		    offset + start, len, 0);
		if (best < 0) {
			/* There is no valid disk. */
			g_raid_tr_raid1e_rebuild_abort(tr);
			return;
		} else if (best != copy) {
			/* Some other disk has better data. */
			break;
		}
		/* We have the most accurate data. Skip the range. */
		G_RAID_DEBUG1(3, sc, "Skipping rebuild for range %ju - %ju",
		    sd->sd_rebuild_pos, sd->sd_rebuild_pos + len);
		sd->sd_rebuild_pos += len;
	}

	bp = &trs->trso_bio;
	memset(bp, 0, sizeof(*bp));
	bp->bio_offset = offset + start +
	    ((disk + best >= vol->v_disks_count) ? vol->v_strip_size : 0);
	bp->bio_length = len;
	bp->bio_data = trs->trso_buffer;
	bp->bio_cmd = BIO_READ;
	bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
	bp->bio_caller1 = &vol->v_subdisks[(disk + best) % vol->v_disks_count];
	G_RAID_LOGREQ(3, bp, "Queueing rebuild read");
	/*
	 * If we are crossing a stripe boundary, correct the affected
	 * virtual range we should lock.
	 */
	if (start + len > vol->v_strip_size) {
		P2V(vol, sd->sd_pos, sd->sd_rebuild_pos + len, &vend, &copy);
		len = vend - virtual;
	}
	trs->trso_flags |= TR_RAID1E_F_DOING_SOME;
	trs->trso_flags |= TR_RAID1E_F_LOCKED;
	trs->trso_lock_pos = virtual;
	trs->trso_lock_len = len;
	/* Lock callback starts I/O */
	g_raid_lock_range(sd->sd_volume, virtual, len, NULL, bp);
}

static void
g_raid_tr_raid1e_rebuild_start(struct g_raid_tr_object *tr)
{
	struct g_raid_volume *vol;
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_subdisk *sd;

	vol = tr->tro_volume;
	trs = (struct g_raid_tr_raid1e_object *)tr;
	if (trs->trso_failed_sd) {
		G_RAID_DEBUG1(1, vol->v_softc,
		    "Rebuild already in progress. pos %jd\n",
		    (intmax_t)trs->trso_failed_sd->sd_rebuild_pos);
		return;
	}
	sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_RESYNC);
	if (sd == NULL)
		sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_REBUILD);
	if (sd == NULL) {
		sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_STALE);
		if (sd != NULL) {
			sd->sd_rebuild_pos = 0;
			g_raid_change_subdisk_state(sd,
			    G_RAID_SUBDISK_S_RESYNC);
			g_raid_write_metadata(vol->v_softc, vol, sd, NULL);
		} else {
			sd = g_raid_get_subdisk(vol,
			    G_RAID_SUBDISK_S_UNINITIALIZED);
			if (sd == NULL)
				sd = g_raid_get_subdisk(vol,
				    G_RAID_SUBDISK_S_NEW);
			if (sd != NULL) {
				sd->sd_rebuild_pos = 0;
				g_raid_change_subdisk_state(sd,
				    G_RAID_SUBDISK_S_REBUILD);
				g_raid_write_metadata(vol->v_softc,
				    vol, sd, NULL);
			}
		}
	}
	if (sd == NULL) {
		G_RAID_DEBUG1(1, vol->v_softc,
		    "No failed disk to rebuild.  night night.");
		return;
	}
	trs->trso_failed_sd = sd;
	G_RAID_DEBUG1(0, vol->v_softc,
	    "Subdisk %s:%d-%s rebuild start at %jd.",
	    sd->sd_volume->v_name, sd->sd_pos,
	    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]",
	    trs->trso_failed_sd->sd_rebuild_pos);
	trs->trso_type = TR_RAID1E_REBUILD;
	trs->trso_buffer = malloc(g_raid1e_rebuild_slab, M_TR_RAID1E, M_WAITOK);
	trs->trso_meta_update = g_raid1e_rebuild_meta_update;
	g_raid_tr_raid1e_rebuild_some(tr);
}

static void
g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
    struct g_raid_subdisk *sd)
{
	struct g_raid_volume *vol;
	struct g_raid_tr_raid1e_object *trs;
	int nr;

	vol = tr->tro_volume;
	trs = (struct g_raid_tr_raid1e_object *)tr;
	if (trs->trso_stopping)
		return;
	nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_REBUILD) +
	    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC);
	switch(trs->trso_type) {
	case TR_RAID1E_NONE:
		if (vol->v_state < G_RAID_VOLUME_S_DEGRADED)
			return;
		if (nr == 0) {
			nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_NEW) +
			    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
			    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED);
			if (nr == 0)
				return;
		}
		g_raid_tr_raid1e_rebuild_start(tr);
		break;
	case TR_RAID1E_REBUILD:
		if (vol->v_state < G_RAID_VOLUME_S_DEGRADED || nr == 0 ||
		    trs->trso_failed_sd == sd)
			g_raid_tr_raid1e_rebuild_abort(tr);
		break;
	case TR_RAID1E_RESYNC:
		break;
	}
}

static int
g_raid_tr_event_raid1e(struct g_raid_tr_object *tr,
    struct g_raid_subdisk *sd, u_int event)
{

	g_raid_tr_update_state_raid1e(tr->tro_volume, sd);
	return (0);
}

static int
g_raid_tr_start_raid1e(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_volume *vol;

	trs = (struct g_raid_tr_raid1e_object *)tr;
	vol = tr->tro_volume;
	trs->trso_starting = 0;
	g_raid_tr_update_state_raid1e(vol, NULL);
	return (0);
}

static int
g_raid_tr_stop_raid1e(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_volume *vol;

	trs = (struct g_raid_tr_raid1e_object *)tr;
	vol = tr->tro_volume;
	trs->trso_starting = 0;
	trs->trso_stopping = 1;
	g_raid_tr_update_state_raid1e(vol, NULL);
	return (0);
}

/*
 * Select the disk to read from.  Take into account: subdisk state, running
 * error recovery, average disk load, head position and possible cache hits.
 */
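/*
 * Rough sketch of the scoring below (an illustration, not part of the
 * original comments): a lower prio wins.  Each candidate copy starts from
 * its current load; STALE copies, and RESYNC copies read beyond their synced
 * region, are pushed back by (i << 24); copies busy with error recovery by
 * (min(recovery, 255) << 16); and a copy whose head already sits at the
 * requested offset gets a bonus of two G_RAID_SUBDISK_LOAD_SCALE units, or
 * one if it is merely within a track of it.
 */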
#define ABS(x)		(((x) >= 0) ? (x) : (-(x)))
static int
g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
    int no, off_t off, off_t len, u_int mask)
{
	struct g_raid_subdisk *sd;
	off_t offset;
	int i, best, prio, bestprio;

	best = -1;
	bestprio = INT_MAX;
	for (i = 0; i < N; i++) {
		sd = &vol->v_subdisks[(no + i) % vol->v_disks_count];
		offset = off;
		if (no + i >= vol->v_disks_count)
			offset += vol->v_strip_size;

		prio = G_RAID_SUBDISK_LOAD(sd);
		if ((mask & (1 << sd->sd_pos)) != 0)
			continue;
		switch (sd->sd_state) {
		case G_RAID_SUBDISK_S_ACTIVE:
			break;
		case G_RAID_SUBDISK_S_RESYNC:
			if (offset + off < sd->sd_rebuild_pos)
				break;
			/* FALLTHROUGH */
		case G_RAID_SUBDISK_S_STALE:
			prio += i << 24;
			break;
		case G_RAID_SUBDISK_S_REBUILD:
			if (offset + off < sd->sd_rebuild_pos)
				break;
			/* FALLTHROUGH */
		default:
			continue;
		}
		prio += min(sd->sd_recovery, 255) << 16;
		/* If disk head is precisely in position - highly prefer it. */
		if (G_RAID_SUBDISK_POS(sd) == offset)
			prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE;
		else
		/* If disk head is close to position - prefer it. */
		if (ABS(G_RAID_SUBDISK_POS(sd) - offset) <
		    G_RAID_SUBDISK_TRACK_SIZE)
			prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE;
		if (prio < bestprio) {
			bestprio = prio;
			best = i;
		}
	}
	return (best);
}

static void
g_raid_tr_iostart_raid1e_read(struct g_raid_tr_object *tr, struct bio *bp)
{
	struct g_raid_volume *vol;
	struct g_raid_subdisk *sd;
	struct bio_queue_head queue;
	struct bio *cbp;
	char *addr;
	off_t offset, start, length, remain;
	u_int no, strip_size;
	int best;

	vol = tr->tro_volume;
	addr = bp->bio_data;
	strip_size = vol->v_strip_size;
	V2P(vol, bp->bio_offset, &no, &offset, &start);
	remain = bp->bio_length;
	bioq_init(&queue);
	while (remain > 0) {
		length = MIN(strip_size - start, remain);
		best = g_raid_tr_raid1e_select_read_disk(vol,
		    no, offset, length, 0);
		KASSERT(best >= 0, ("No readable disk in volume %s!",
		    vol->v_name));
		no += best;
		if (no >= vol->v_disks_count) {
			no -= vol->v_disks_count;
			offset += strip_size;
		}
		cbp = g_clone_bio(bp);
		if (cbp == NULL)
			goto failure;
		cbp->bio_offset = offset + start;
		cbp->bio_data = addr;
		cbp->bio_length = length;
		cbp->bio_caller1 = &vol->v_subdisks[no];
		bioq_insert_tail(&queue, cbp);
		no += N - best;
		if (no >= vol->v_disks_count) {
			no -= vol->v_disks_count;
			offset += strip_size;
		}
		remain -= length;
		addr += length;
		start = 0;
	}
	for (cbp = bioq_first(&queue); cbp != NULL;
	    cbp = bioq_first(&queue)) {
		bioq_remove(&queue, cbp);
		sd = cbp->bio_caller1;
		cbp->bio_caller1 = NULL;
		g_raid_subdisk_iostart(sd, cbp);
	}
	return;
failure:
	for (cbp = bioq_first(&queue); cbp != NULL;
	    cbp = bioq_first(&queue)) {
		bioq_remove(&queue, cbp);
		g_destroy_bio(cbp);
	}
	if (bp->bio_error == 0)
		bp->bio_error = ENOMEM;
	g_raid_iodone(bp, bp->bio_error);
}

static void
g_raid_tr_iostart_raid1e_write(struct g_raid_tr_object *tr, struct bio *bp)
{
	struct g_raid_volume *vol;
	struct g_raid_subdisk *sd;
	struct bio_queue_head queue;
	struct bio *cbp;
	char *addr;
	off_t offset, start, length, remain;
	u_int no, strip_size;
	int i;

	vol = tr->tro_volume;
	addr = bp->bio_data;
	strip_size = vol->v_strip_size;
	V2P(vol, bp->bio_offset, &no, &offset, &start);
	remain = bp->bio_length;
	bioq_init(&queue);
	while (remain > 0) {
		length = MIN(strip_size - start, remain);
		for (i = 0; i < N; i++) {
			sd = &vol->v_subdisks[no];
			switch (sd->sd_state) {
			case G_RAID_SUBDISK_S_ACTIVE:
			case G_RAID_SUBDISK_S_STALE:
			case G_RAID_SUBDISK_S_RESYNC:
				break;
			case G_RAID_SUBDISK_S_REBUILD:
				if (offset + start >= sd->sd_rebuild_pos)
					goto nextdisk;
				break;
			default:
				goto nextdisk;
			}
			cbp = g_clone_bio(bp);
			if (cbp == NULL)
				goto failure;
			cbp->bio_offset = offset + start;
			cbp->bio_data = addr;
			cbp->bio_length = length;
			cbp->bio_caller1 = sd;
			bioq_insert_tail(&queue, cbp);
nextdisk:
			if (++no >= vol->v_disks_count) {
				no = 0;
				offset += strip_size;
			}
		}
		remain -= length;
		addr += length;
		start = 0;
	}
	for (cbp = bioq_first(&queue); cbp != NULL;
	    cbp = bioq_first(&queue)) {
		bioq_remove(&queue, cbp);
		sd = cbp->bio_caller1;
		cbp->bio_caller1 = NULL;
		g_raid_subdisk_iostart(sd, cbp);
	}
	return;
failure:
	for (cbp = bioq_first(&queue); cbp != NULL;
	    cbp = bioq_first(&queue)) {
		bioq_remove(&queue, cbp);
		g_destroy_bio(cbp);
	}
	if (bp->bio_error == 0)
		bp->bio_error = ENOMEM;
	g_raid_iodone(bp, bp->bio_error);
}

static void
g_raid_tr_iostart_raid1e(struct g_raid_tr_object *tr, struct bio *bp)
{
	struct g_raid_volume *vol;
	struct g_raid_tr_raid1e_object *trs;

	vol = tr->tro_volume;
	trs = (struct g_raid_tr_raid1e_object *)tr;
	if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL &&
	    vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL &&
	    vol->v_state != G_RAID_VOLUME_S_DEGRADED) {
		g_raid_iodone(bp, EIO);
		return;
	}
	/*
	 * If we're rebuilding, squeeze in rebuild activity every so often,
	 * even when the disk is busy.  Be sure to only count real I/O
	 * to the disk.  All 'SPECIAL' I/O is traffic generated to the disk
	 * by this module.
	 */
	if (trs->trso_failed_sd != NULL &&
	    !(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL)) {
		/* Make this new or currently running round short. */
		trs->trso_recover_slabs = 0;
		if (--trs->trso_fair_io <= 0) {
			trs->trso_fair_io = g_raid1e_rebuild_fair_io;
			g_raid_tr_raid1e_rebuild_some(tr);
		}
	}
	switch (bp->bio_cmd) {
	case BIO_READ:
		g_raid_tr_iostart_raid1e_read(tr, bp);
		break;
	case BIO_WRITE:
		g_raid_tr_iostart_raid1e_write(tr, bp);
		break;
	case BIO_DELETE:
		g_raid_iodone(bp, EIO);
		break;
	case BIO_FLUSH:
		g_raid_tr_flush_common(tr, bp);
		break;
	default:
		KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)",
		    bp->bio_cmd, vol->v_name));
		break;
	}
}

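/*
 * Note on the bio_caller2 "mask" used below (summary of the logic, added
 * for clarity): for regular reads it accumulates the copies that have
 * already been tried, so a retry after a read error is directed at a
 * different copy, while bit 31 marks an in-progress bad-sector recovery,
 * i.e. a locked range whose good data is being written back over the
 * failing copy.
 */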
static void
g_raid_tr_iodone_raid1e(struct g_raid_tr_object *tr,
    struct g_raid_subdisk *sd, struct bio *bp)
{
	struct bio *cbp;
	struct g_raid_subdisk *nsd;
	struct g_raid_volume *vol;
	struct bio *pbp;
	struct g_raid_tr_raid1e_object *trs;
	off_t virtual, offset, start;
	uintptr_t mask;
	int error, do_write, copy, disk, best;

	trs = (struct g_raid_tr_raid1e_object *)tr;
	vol = tr->tro_volume;
	if (bp->bio_cflags & G_RAID_BIO_FLAG_SYNC) {
		if (trs->trso_type == TR_RAID1E_REBUILD) {
			nsd = trs->trso_failed_sd;
			if (bp->bio_cmd == BIO_READ) {

				/* Immediately abort rebuild, if requested. */
				if (trs->trso_flags & TR_RAID1E_F_ABORT) {
					trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
					g_raid_tr_raid1e_rebuild_abort(tr);
					return;
				}

				/* On read error, skip and cross fingers. */
				if (bp->bio_error != 0) {
					G_RAID_LOGREQ(0, bp,
					    "Read error during rebuild (%d), "
					    "possible data loss!",
					    bp->bio_error);
					goto rebuild_round_done;
				}

				/*
				 * The read operation finished, queue the
				 * write and get out.
				 */
				G_RAID_LOGREQ(3, bp, "Rebuild read done: %d",
				    bp->bio_error);
				bp->bio_cmd = BIO_WRITE;
				bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
				bp->bio_offset = nsd->sd_rebuild_pos;
				G_RAID_LOGREQ(3, bp, "Queueing rebuild write.");
				g_raid_subdisk_iostart(nsd, bp);
			} else {
				/*
				 * The write operation just finished.  Do
				 * another.  We keep cloning the master bio
				 * since it has the right buffers allocated to
				 * it.
				 */
				G_RAID_LOGREQ(3, bp, "Rebuild write done: %d",
				    bp->bio_error);
				if (bp->bio_error != 0 ||
				    trs->trso_flags & TR_RAID1E_F_ABORT) {
					if ((trs->trso_flags &
					    TR_RAID1E_F_ABORT) == 0) {
						g_raid_tr_raid1e_fail_disk(sd->sd_softc,
						    nsd, nsd->sd_disk);
					}
					trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
					g_raid_tr_raid1e_rebuild_abort(tr);
					return;
				}
rebuild_round_done:
				trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
				g_raid_unlock_range(tr->tro_volume,
				    trs->trso_lock_pos, trs->trso_lock_len);
				nsd->sd_rebuild_pos += bp->bio_length;
				if (nsd->sd_rebuild_pos >= nsd->sd_size) {
					g_raid_tr_raid1e_rebuild_finish(tr);
					return;
				}

				/* Abort rebuild if we are stopping */
				if (trs->trso_stopping) {
					trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
					g_raid_tr_raid1e_rebuild_abort(tr);
					return;
				}

				if (--trs->trso_meta_update <= 0) {
					g_raid_write_metadata(vol->v_softc,
					    vol, nsd, nsd->sd_disk);
					trs->trso_meta_update =
					    g_raid1e_rebuild_meta_update;
					/* Compensate short rebuild I/Os. */
					if ((vol->v_disks_count % N) != 0 &&
					    vol->v_strip_size <
					     g_raid1e_rebuild_slab) {
						trs->trso_meta_update *=
						    g_raid1e_rebuild_slab;
						trs->trso_meta_update /=
						    vol->v_strip_size;
					}
				}
				trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
				if (--trs->trso_recover_slabs <= 0)
					return;
				/* Run next rebuild iteration. */
				g_raid_tr_raid1e_rebuild_some(tr);
			}
		} else if (trs->trso_type == TR_RAID1E_RESYNC) {
			/*
			 * read good sd, read bad sd in parallel.  when both
			 * done, compare the buffers.  write good to the bad
			 * if different.  do the next bit of work.
			 */
			panic("Somehow, we think we're doing a resync");
		}
		return;
	}
	pbp = bp->bio_parent;
	pbp->bio_inbed++;
	mask = (intptr_t)bp->bio_caller2;
	if (bp->bio_cmd == BIO_READ && bp->bio_error != 0) {
		/*
		 * Read failed on first drive.  Retry the read error on
		 * another disk drive, if available, before erroring out the
		 * read.
		 */
		sd->sd_disk->d_read_errs++;
		G_RAID_LOGREQ(0, bp,
		    "Read error (%d), %d read errors total",
		    bp->bio_error, sd->sd_disk->d_read_errs);

		/*
		 * If there are too many read errors, we move to degraded.
		 * XXX Do we want to FAIL the drive (eg, make the user redo
		 * everything to get it back in sync), or just degrade the
		 * drive, which kicks off a resync?
		 */
		do_write = 0;
		if (sd->sd_disk->d_read_errs > g_raid_read_err_thresh)
			g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
		else if (mask == 0)
			do_write = 1;

		/* Restore what we were doing. */
		P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
		V2P(vol, virtual, &disk, &offset, &start);

		/* Find the other disk, and try to do the I/O to it. */
		mask |= 1 << copy;
		best = g_raid_tr_raid1e_select_read_disk(vol,
		    disk, offset, start, mask);
		if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
			disk += best;
			if (disk >= vol->v_disks_count) {
				disk -= vol->v_disks_count;
				offset += vol->v_strip_size;
			}
			cbp->bio_offset = offset + start;
			cbp->bio_length = bp->bio_length;
			cbp->bio_data = bp->bio_data;
			g_destroy_bio(bp);
			nsd = &vol->v_subdisks[disk];
			G_RAID_LOGREQ(2, cbp, "Retrying read from %d",
			    nsd->sd_pos);
			if (do_write)
				mask |= 1 << 31;
			if ((mask & (1 << 31)) != 0)
				sd->sd_recovery++;
			cbp->bio_caller2 = (void *)mask;
			if (do_write) {
				cbp->bio_caller1 = nsd;
				/* Lock callback starts I/O */
				g_raid_lock_range(sd->sd_volume,
				    virtual, cbp->bio_length, pbp, cbp);
			} else {
				g_raid_subdisk_iostart(nsd, cbp);
			}
			return;
		}
		/*
		 * We can't retry.  Return the original error by falling
		 * through.  This will happen when there's only one good disk.
		 * We don't need to fail the raid, since its actual state is
		 * based on the state of the subdisks.
		 */
		G_RAID_LOGREQ(2, bp, "Couldn't retry read, failing it");
	}
	if (bp->bio_cmd == BIO_READ &&
	    bp->bio_error == 0 &&
	    (mask & (1 << 31)) != 0) {
		G_RAID_LOGREQ(3, bp, "Recovered data from other drive");

		/* Restore what we were doing. */
		P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
		V2P(vol, virtual, &disk, &offset, &start);

		/* Find best disk to write. */
		best = g_raid_tr_raid1e_select_read_disk(vol,
		    disk, offset, start, ~mask);
		if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
			disk += best;
			if (disk >= vol->v_disks_count) {
				disk -= vol->v_disks_count;
				offset += vol->v_strip_size;
			}
			cbp->bio_offset = offset + start;
			cbp->bio_length = bp->bio_length;
			cbp->bio_data = bp->bio_data;
			cbp->bio_cmd = BIO_WRITE;
			cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP;
			cbp->bio_caller2 = (void *)mask;
			g_destroy_bio(bp);
			G_RAID_LOGREQ(2, cbp,
			    "Attempting bad sector remap on failing drive.");
			g_raid_subdisk_iostart(&vol->v_subdisks[disk], cbp);
			return;
		}
	}
	if ((mask & (1 << 31)) != 0) {
		/*
		 * We're done with a recovery, mark the range as unlocked.
		 * For any write errors, we aggressively fail the disk since
		 * there was both a READ and a WRITE error at this location.
		 * Both types of errors generally indicate the drive is on
		 * the verge of total failure anyway.  Better to stop trusting
		 * it now.  However, we need to reset error to 0 in that case
		 * because we're not failing the original I/O which succeeded.
		 */

		/* Restore what we were doing. */
		P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
		V2P(vol, virtual, &disk, &offset, &start);

		for (copy = 0; copy < N; copy++) {
			if ((mask & (1 << copy) ) != 0)
				vol->v_subdisks[(disk + copy) %
				    vol->v_disks_count].sd_recovery--;
		}

		if (bp->bio_cmd == BIO_WRITE && bp->bio_error) {
			G_RAID_LOGREQ(0, bp, "Remap write failed: "
			    "failing subdisk.");
			g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
			bp->bio_error = 0;
		}
		G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error);
		g_raid_unlock_range(sd->sd_volume, virtual, bp->bio_length);
	}
	if (pbp->bio_cmd != BIO_READ) {
		if (pbp->bio_inbed == 1 || pbp->bio_error != 0)
			pbp->bio_error = bp->bio_error;
		if (bp->bio_error != 0) {
			G_RAID_LOGREQ(0, bp, "Write failed: failing subdisk.");
			g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
		}
		error = pbp->bio_error;
	} else
		error = bp->bio_error;
	g_destroy_bio(bp);
	if (pbp->bio_children == pbp->bio_inbed) {
		pbp->bio_completed = pbp->bio_length;
		g_raid_iodone(pbp, error);
	}
}

static int
g_raid_tr_kerneldump_raid1e(struct g_raid_tr_object *tr,
    void *virtual, vm_offset_t physical, off_t boffset, size_t blength)
{
	struct g_raid_volume *vol;
	struct g_raid_subdisk *sd;
	struct bio_queue_head queue;
	char *addr;
	off_t offset, start, length, remain;
	u_int no, strip_size;
	int i, error;

	vol = tr->tro_volume;
	addr = virtual;
	strip_size = vol->v_strip_size;
	V2P(vol, boffset, &no, &offset, &start);
	remain = blength;
	bioq_init(&queue);
	while (remain > 0) {
		length = MIN(strip_size - start, remain);
		for (i = 0; i < N; i++) {
			sd = &vol->v_subdisks[no];
			switch (sd->sd_state) {
			case G_RAID_SUBDISK_S_ACTIVE:
			case G_RAID_SUBDISK_S_STALE:
			case G_RAID_SUBDISK_S_RESYNC:
				break;
			case G_RAID_SUBDISK_S_REBUILD:
				if (offset + start >= sd->sd_rebuild_pos)
					goto nextdisk;
				break;
			default:
				goto nextdisk;
			}
			error = g_raid_subdisk_kerneldump(sd,
			    addr, 0, offset + start, length);
			if (error != 0)
				return (error);
nextdisk:
			if (++no >= vol->v_disks_count) {
				no = 0;
				offset += strip_size;
			}
		}
		remain -= length;
		addr += length;
		start = 0;
	}
	return (0);
}

static int
g_raid_tr_locked_raid1e(struct g_raid_tr_object *tr, void *argp)
{
	struct bio *bp;
	struct g_raid_subdisk *sd;

	bp = (struct bio *)argp;
	sd = (struct g_raid_subdisk *)bp->bio_caller1;
	g_raid_subdisk_iostart(sd, bp);

	return (0);
}

static int
g_raid_tr_idle_raid1e(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_volume *vol;

	vol = tr->tro_volume;
	trs = (struct g_raid_tr_raid1e_object *)tr;
	trs->trso_fair_io = g_raid1e_rebuild_fair_io;
	trs->trso_recover_slabs = g_raid1e_rebuild_cluster_idle;
	/* Compensate short rebuild I/Os. */
	if ((vol->v_disks_count % N) != 0 &&
	    vol->v_strip_size < g_raid1e_rebuild_slab) {
		trs->trso_recover_slabs *= g_raid1e_rebuild_slab;
		trs->trso_recover_slabs /= vol->v_strip_size;
	}
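	/*
	 * For example (numbers assumed for illustration): with the default
	 * 1 MiB slab and a 64 KiB strip, the idle budget above is scaled by
	 * 16, because on odd-disk layouts each rebuild I/O covers at most a
	 * single strip rather than a whole slab.
	 */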
	if (trs->trso_type == TR_RAID1E_REBUILD)
		g_raid_tr_raid1e_rebuild_some(tr);
	return (0);
}

static int
g_raid_tr_free_raid1e(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;

	trs = (struct g_raid_tr_raid1e_object *)tr;

	if (trs->trso_buffer != NULL) {
		free(trs->trso_buffer, M_TR_RAID1E);
		trs->trso_buffer = NULL;
	}
	return (0);
}

G_RAID_TR_DECLARE(raid1e, "RAID1E");