1219974Smav/*-
2219974Smav * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
3219974Smav * All rights reserved.
4219974Smav *
5219974Smav * Redistribution and use in source and binary forms, with or without
6219974Smav * modification, are permitted provided that the following conditions
7219974Smav * are met:
8219974Smav * 1. Redistributions of source code must retain the above copyright
9219974Smav *    notice, this list of conditions and the following disclaimer.
10219974Smav * 2. Redistributions in binary form must reproduce the above copyright
11219974Smav *    notice, this list of conditions and the following disclaimer in the
12219974Smav *    documentation and/or other materials provided with the distribution.
13219974Smav *
14219974Smav * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
15219974Smav * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16219974Smav * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17219974Smav * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
18219974Smav * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19219974Smav * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20219974Smav * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21219974Smav * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22219974Smav * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23219974Smav * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24219974Smav * SUCH DAMAGE.
25219974Smav */
26219974Smav
27219974Smav#include <sys/cdefs.h>
28219974Smav__FBSDID("$FreeBSD$");
29219974Smav
30219974Smav#include <sys/param.h>
31219974Smav#include <sys/bio.h>
32219974Smav#include <sys/endian.h>
33219974Smav#include <sys/kernel.h>
34219974Smav#include <sys/kobj.h>
35219974Smav#include <sys/limits.h>
36219974Smav#include <sys/lock.h>
37219974Smav#include <sys/malloc.h>
38219974Smav#include <sys/mutex.h>
39219974Smav#include <sys/sysctl.h>
40219974Smav#include <sys/systm.h>
41219974Smav#include <geom/geom.h>
42219974Smav#include "geom/raid/g_raid.h"
43219974Smav#include "g_raid_tr_if.h"
44219974Smav
45219974Smav#define N	2
46219974Smav
47240465SmavSYSCTL_DECL(_kern_geom_raid_raid1e);
48219974Smav
49219974Smav#define RAID1E_REBUILD_SLAB	(1 << 20) /* One transation in a rebuild */
50219974Smavstatic int g_raid1e_rebuild_slab = RAID1E_REBUILD_SLAB;
51267992ShselaskySYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_slab_size, CTLFLAG_RWTUN,
52219974Smav    &g_raid1e_rebuild_slab, 0,
53219974Smav    "Amount of the disk to rebuild each read/write cycle of the rebuild.");
54219974Smav
55219974Smav#define RAID1E_REBUILD_FAIR_IO 20 /* use 1/x of the available I/O */
56219974Smavstatic int g_raid1e_rebuild_fair_io = RAID1E_REBUILD_FAIR_IO;
57267992ShselaskySYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_fair_io, CTLFLAG_RWTUN,
58219974Smav    &g_raid1e_rebuild_fair_io, 0,
59219974Smav    "Fraction of the I/O bandwidth to use when disk busy for rebuild.");
60219974Smav
61219974Smav#define RAID1E_REBUILD_CLUSTER_IDLE 100
62219974Smavstatic int g_raid1e_rebuild_cluster_idle = RAID1E_REBUILD_CLUSTER_IDLE;
63267992ShselaskySYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_cluster_idle, CTLFLAG_RWTUN,
64219974Smav    &g_raid1e_rebuild_cluster_idle, 0,
65219974Smav    "Number of slabs to do each time we trigger a rebuild cycle");
66219974Smav
67219974Smav#define RAID1E_REBUILD_META_UPDATE 1024 /* update meta data every 1GB or so */
68219974Smavstatic int g_raid1e_rebuild_meta_update = RAID1E_REBUILD_META_UPDATE;
69267992ShselaskySYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_meta_update, CTLFLAG_RWTUN,
70219974Smav    &g_raid1e_rebuild_meta_update, 0,
71219974Smav    "When to update the meta data.");
72219974Smav
73219974Smavstatic MALLOC_DEFINE(M_TR_RAID1E, "tr_raid1e_data", "GEOM_RAID RAID1E data");
74219974Smav
75219974Smav#define TR_RAID1E_NONE 0
76219974Smav#define TR_RAID1E_REBUILD 1
77219974Smav#define TR_RAID1E_RESYNC 2
78219974Smav
79219974Smav#define TR_RAID1E_F_DOING_SOME	0x1
80219974Smav#define TR_RAID1E_F_LOCKED	0x2
81219974Smav#define TR_RAID1E_F_ABORT	0x4
82219974Smav
83219974Smavstruct g_raid_tr_raid1e_object {
84219974Smav	struct g_raid_tr_object	 trso_base;
85219974Smav	int			 trso_starting;
86219974Smav	int			 trso_stopping;
87219974Smav	int			 trso_type;
88219974Smav	int			 trso_recover_slabs; /* slabs before rest */
89219974Smav	int			 trso_fair_io;
90219974Smav	int			 trso_meta_update;
91219974Smav	int			 trso_flags;
92219974Smav	struct g_raid_subdisk	*trso_failed_sd; /* like per volume */
93219974Smav	void			*trso_buffer;	 /* Buffer space */
94219974Smav	off_t			 trso_lock_pos; /* Locked range start. */
95219974Smav	off_t			 trso_lock_len; /* Locked range length. */
96219974Smav	struct bio		 trso_bio;
97219974Smav};
98219974Smav
99219974Smavstatic g_raid_tr_taste_t g_raid_tr_taste_raid1e;
100219974Smavstatic g_raid_tr_event_t g_raid_tr_event_raid1e;
101219974Smavstatic g_raid_tr_start_t g_raid_tr_start_raid1e;
102219974Smavstatic g_raid_tr_stop_t g_raid_tr_stop_raid1e;
103219974Smavstatic g_raid_tr_iostart_t g_raid_tr_iostart_raid1e;
104219974Smavstatic g_raid_tr_iodone_t g_raid_tr_iodone_raid1e;
105219974Smavstatic g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid1e;
106219974Smavstatic g_raid_tr_locked_t g_raid_tr_locked_raid1e;
107219974Smavstatic g_raid_tr_idle_t g_raid_tr_idle_raid1e;
108219974Smavstatic g_raid_tr_free_t g_raid_tr_free_raid1e;
109219974Smav
110219974Smavstatic kobj_method_t g_raid_tr_raid1e_methods[] = {
111219974Smav	KOBJMETHOD(g_raid_tr_taste,	g_raid_tr_taste_raid1e),
112219974Smav	KOBJMETHOD(g_raid_tr_event,	g_raid_tr_event_raid1e),
113219974Smav	KOBJMETHOD(g_raid_tr_start,	g_raid_tr_start_raid1e),
114219974Smav	KOBJMETHOD(g_raid_tr_stop,	g_raid_tr_stop_raid1e),
115219974Smav	KOBJMETHOD(g_raid_tr_iostart,	g_raid_tr_iostart_raid1e),
116219974Smav	KOBJMETHOD(g_raid_tr_iodone,	g_raid_tr_iodone_raid1e),
117219974Smav	KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid1e),
118219974Smav	KOBJMETHOD(g_raid_tr_locked,	g_raid_tr_locked_raid1e),
119219974Smav	KOBJMETHOD(g_raid_tr_idle,	g_raid_tr_idle_raid1e),
120219974Smav	KOBJMETHOD(g_raid_tr_free,	g_raid_tr_free_raid1e),
121219974Smav	{ 0, 0 }
122219974Smav};
123219974Smav
124219974Smavstatic struct g_raid_tr_class g_raid_tr_raid1e_class = {
125219974Smav	"RAID1E",
126219974Smav	g_raid_tr_raid1e_methods,
127219974Smav	sizeof(struct g_raid_tr_raid1e_object),
128240465Smav	.trc_enable = 1,
129256610Smav	.trc_priority = 200,
130256610Smav	.trc_accept_unmapped = 1
131219974Smav};
132219974Smav
133219974Smavstatic void g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr);
134219974Smavstatic void g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
135219974Smav    struct g_raid_subdisk *sd);
136219974Smavstatic int g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
137219974Smav    int no, off_t off, off_t len, u_int mask);
138219974Smav
139219974Smavstatic inline void
140219974SmavV2P(struct g_raid_volume *vol, off_t virt,
141219974Smav    int *disk, off_t *offset, off_t *start)
142219974Smav{
143219974Smav	off_t nstrip;
144219974Smav	u_int strip_size;
145219974Smav
146219974Smav	strip_size = vol->v_strip_size;
147219974Smav	/* Strip number. */
148219974Smav	nstrip = virt / strip_size;
149219974Smav	/* Start position in strip. */
150219974Smav	*start = virt % strip_size;
151219974Smav	/* Disk number. */
152219974Smav	*disk = (nstrip * N) % vol->v_disks_count;
153219974Smav	/* Strip start position in disk. */
154219974Smav	*offset = ((nstrip * N) / vol->v_disks_count) * strip_size;
155219974Smav}
156219974Smav
157219974Smavstatic inline void
158219974SmavP2V(struct g_raid_volume *vol, int disk, off_t offset,
159219974Smav    off_t *virt, int *copy)
160219974Smav{
161219974Smav	off_t nstrip, start;
162219974Smav	u_int strip_size;
163219974Smav
164219974Smav	strip_size = vol->v_strip_size;
165219974Smav	/* Start position in strip. */
166219974Smav	start = offset % strip_size;
167219974Smav	/* Physical strip number. */
168219974Smav	nstrip = (offset / strip_size) * vol->v_disks_count + disk;
169219974Smav	/* Number of physical strip (copy) inside virtual strip. */
170219974Smav	*copy = nstrip % N;
171219974Smav	/* Offset in virtual space. */
172219974Smav	*virt = (nstrip / N) * strip_size + start;
173219974Smav}
174219974Smav
175219974Smavstatic int
176219974Smavg_raid_tr_taste_raid1e(struct g_raid_tr_object *tr, struct g_raid_volume *vol)
177219974Smav{
178219974Smav	struct g_raid_tr_raid1e_object *trs;
179219974Smav
180219974Smav	trs = (struct g_raid_tr_raid1e_object *)tr;
181219974Smav	if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID1E ||
182234603Smav	    tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1EA)
183219974Smav		return (G_RAID_TR_TASTE_FAIL);
184219974Smav	trs->trso_starting = 1;
185219974Smav	return (G_RAID_TR_TASTE_SUCCEED);
186219974Smav}
187219974Smav
188219974Smavstatic int
189219974Smavg_raid_tr_update_state_raid1e_even(struct g_raid_volume *vol)
190219974Smav{
191219974Smav	struct g_raid_softc *sc;
192219974Smav	struct g_raid_subdisk *sd, *bestsd, *worstsd;
193219974Smav	int i, j, state, sstate;
194219974Smav
195219974Smav	sc = vol->v_softc;
196219974Smav	state = G_RAID_VOLUME_S_OPTIMAL;
197219974Smav	for (i = 0; i < vol->v_disks_count / N; i++) {
198219974Smav		bestsd = &vol->v_subdisks[i * N];
199219974Smav		for (j = 1; j < N; j++) {
200219974Smav			sd = &vol->v_subdisks[i * N + j];
201219974Smav			if (sd->sd_state > bestsd->sd_state)
202219974Smav				bestsd = sd;
203219974Smav			else if (sd->sd_state == bestsd->sd_state &&
204219974Smav			    (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
205219974Smav			     sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
206219974Smav			    sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
207219974Smav				bestsd = sd;
208219974Smav		}
209219974Smav		if (bestsd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED &&
210219974Smav		    bestsd->sd_state != G_RAID_SUBDISK_S_ACTIVE) {
211219974Smav			/* We found reasonable candidate. */
212219974Smav			G_RAID_DEBUG1(1, sc,
213219974Smav			    "Promote subdisk %s:%d from %s to ACTIVE.",
214219974Smav			    vol->v_name, bestsd->sd_pos,
215219974Smav			    g_raid_subdisk_state2str(bestsd->sd_state));
216219974Smav			g_raid_change_subdisk_state(bestsd,
217219974Smav			    G_RAID_SUBDISK_S_ACTIVE);
218219974Smav			g_raid_write_metadata(sc,
219219974Smav			    vol, bestsd, bestsd->sd_disk);
220219974Smav		}
221219974Smav		worstsd = &vol->v_subdisks[i * N];
222219974Smav		for (j = 1; j < N; j++) {
223219974Smav			sd = &vol->v_subdisks[i * N + j];
224219974Smav			if (sd->sd_state < worstsd->sd_state)
225219974Smav				worstsd = sd;
226219974Smav		}
227219974Smav		if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
228219974Smav			sstate = G_RAID_VOLUME_S_OPTIMAL;
229219974Smav		else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
230219974Smav			sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
231219974Smav		else if (bestsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
232219974Smav			sstate = G_RAID_VOLUME_S_DEGRADED;
233219974Smav		else
234219974Smav			sstate = G_RAID_VOLUME_S_BROKEN;
235219974Smav		if (sstate < state)
236219974Smav			state = sstate;
237219974Smav	}
238219974Smav	return (state);
239219974Smav}
240219974Smav
241219974Smavstatic int
242219974Smavg_raid_tr_update_state_raid1e_odd(struct g_raid_volume *vol)
243219974Smav{
244219974Smav	struct g_raid_softc *sc;
245219974Smav	struct g_raid_subdisk *sd, *bestsd, *worstsd;
246219974Smav	int i, j, state, sstate;
247219974Smav
248219974Smav	sc = vol->v_softc;
249219974Smav	if (g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) ==
250219974Smav	    vol->v_disks_count)
251219974Smav		return (G_RAID_VOLUME_S_OPTIMAL);
252219974Smav	for (i = 0; i < vol->v_disks_count; i++) {
253219974Smav		sd = &vol->v_subdisks[i];
254219974Smav		if (sd->sd_state == G_RAID_SUBDISK_S_UNINITIALIZED) {
255219974Smav			/* We found reasonable candidate. */
256219974Smav			G_RAID_DEBUG1(1, sc,
257219974Smav			    "Promote subdisk %s:%d from %s to STALE.",
258219974Smav			    vol->v_name, sd->sd_pos,
259219974Smav			    g_raid_subdisk_state2str(sd->sd_state));
260219974Smav			g_raid_change_subdisk_state(sd,
261219974Smav			    G_RAID_SUBDISK_S_STALE);
262219974Smav			g_raid_write_metadata(sc, vol, sd, sd->sd_disk);
263219974Smav		}
264219974Smav	}
265219974Smav	state = G_RAID_VOLUME_S_OPTIMAL;
266219974Smav	for (i = 0; i < vol->v_disks_count; i++) {
267219974Smav		bestsd = &vol->v_subdisks[i];
268219974Smav		worstsd = &vol->v_subdisks[i];
269219974Smav		for (j = 1; j < N; j++) {
270219974Smav			sd = &vol->v_subdisks[(i + j) % vol->v_disks_count];
271219974Smav			if (sd->sd_state > bestsd->sd_state)
272219974Smav				bestsd = sd;
273219974Smav			else if (sd->sd_state == bestsd->sd_state &&
274219974Smav			    (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
275219974Smav			     sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
276219974Smav			    sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
277219974Smav				bestsd = sd;
278219974Smav			if (sd->sd_state < worstsd->sd_state)
279219974Smav				worstsd = sd;
280219974Smav		}
281219974Smav		if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
282219974Smav			sstate = G_RAID_VOLUME_S_OPTIMAL;
283219974Smav		else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
284219974Smav			sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
285219974Smav		else if (bestsd->sd_state >= G_RAID_SUBDISK_S_STALE)
286219974Smav			sstate = G_RAID_VOLUME_S_DEGRADED;
287219974Smav		else
288219974Smav			sstate = G_RAID_VOLUME_S_BROKEN;
289219974Smav		if (sstate < state)
290219974Smav			state = sstate;
291219974Smav	}
292219974Smav	return (state);
293219974Smav}
294219974Smav
295219974Smavstatic int
296219974Smavg_raid_tr_update_state_raid1e(struct g_raid_volume *vol,
297219974Smav    struct g_raid_subdisk *sd)
298219974Smav{
299219974Smav	struct g_raid_tr_raid1e_object *trs;
300219974Smav	struct g_raid_softc *sc;
301219974Smav	u_int s;
302219974Smav
303219974Smav	sc = vol->v_softc;
304219974Smav	trs = (struct g_raid_tr_raid1e_object *)vol->v_tr;
305219974Smav	if (trs->trso_stopping &&
306219974Smav	    (trs->trso_flags & TR_RAID1E_F_DOING_SOME) == 0)
307219974Smav		s = G_RAID_VOLUME_S_STOPPED;
308219974Smav	else if (trs->trso_starting)
309219974Smav		s = G_RAID_VOLUME_S_STARTING;
310219974Smav	else {
311219974Smav		if ((vol->v_disks_count % N) == 0)
312219974Smav			s = g_raid_tr_update_state_raid1e_even(vol);
313219974Smav		else
314219974Smav			s = g_raid_tr_update_state_raid1e_odd(vol);
315219974Smav	}
316219974Smav	if (s != vol->v_state) {
317219974Smav		g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ?
318219974Smav		    G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN,
319219974Smav		    G_RAID_EVENT_VOLUME);
320219974Smav		g_raid_change_volume_state(vol, s);
321219974Smav		if (!trs->trso_starting && !trs->trso_stopping)
322219974Smav			g_raid_write_metadata(sc, vol, NULL, NULL);
323219974Smav	}
324219974Smav	if (!trs->trso_starting && !trs->trso_stopping)
325219974Smav		g_raid_tr_raid1e_maybe_rebuild(vol->v_tr, sd);
326219974Smav	return (0);
327219974Smav}
328219974Smav
329219974Smavstatic void
330219974Smavg_raid_tr_raid1e_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd,
331219974Smav    struct g_raid_disk *disk)
332219974Smav{
333235270Smav	struct g_raid_volume *vol;
334235270Smav
335235270Smav	vol = sd->sd_volume;
336219974Smav	/*
337219974Smav	 * We don't fail the last disk in the pack, since it still has decent
338219974Smav	 * data on it and that's better than failing the disk if it is the root
339219974Smav	 * file system.
340219974Smav	 *
341219974Smav	 * XXX should this be controlled via a tunable?  It makes sense for
342219974Smav	 * the volume that has / on it.  I can't think of a case where we'd
343219974Smav	 * want the volume to go away on this kind of event.
344219974Smav	 */
345235270Smav	if ((g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) +
346235270Smav	     g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC) +
347235270Smav	     g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
348235270Smav	     g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED) <
349235270Smav	     vol->v_disks_count) &&
350235270Smav	    (sd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED))
351219974Smav		return;
352219974Smav	g_raid_fail_disk(sc, sd, disk);
353219974Smav}
354219974Smav
355219974Smavstatic void
356219974Smavg_raid_tr_raid1e_rebuild_done(struct g_raid_tr_raid1e_object *trs)
357219974Smav{
358219974Smav	struct g_raid_volume *vol;
359219974Smav	struct g_raid_subdisk *sd;
360219974Smav
361219974Smav	vol = trs->trso_base.tro_volume;
362219974Smav	sd = trs->trso_failed_sd;
363219974Smav	g_raid_write_metadata(vol->v_softc, vol, sd, sd->sd_disk);
364219974Smav	free(trs->trso_buffer, M_TR_RAID1E);
365219974Smav	trs->trso_buffer = NULL;
366219974Smav	trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
367219974Smav	trs->trso_type = TR_RAID1E_NONE;
368219974Smav	trs->trso_recover_slabs = 0;
369219974Smav	trs->trso_failed_sd = NULL;
370219974Smav	g_raid_tr_update_state_raid1e(vol, NULL);
371219974Smav}
372219974Smav
373219974Smavstatic void
374219974Smavg_raid_tr_raid1e_rebuild_finish(struct g_raid_tr_object *tr)
375219974Smav{
376219974Smav	struct g_raid_tr_raid1e_object *trs;
377219974Smav	struct g_raid_subdisk *sd;
378219974Smav
379219974Smav	trs = (struct g_raid_tr_raid1e_object *)tr;
380219974Smav	sd = trs->trso_failed_sd;
381219974Smav	G_RAID_DEBUG1(0, tr->tro_volume->v_softc,
382219974Smav	    "Subdisk %s:%d-%s rebuild completed.",
383219974Smav	    sd->sd_volume->v_name, sd->sd_pos,
384219974Smav	    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
385219974Smav	g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE);
386219974Smav	sd->sd_rebuild_pos = 0;
387219974Smav	g_raid_tr_raid1e_rebuild_done(trs);
388219974Smav}
389219974Smav
390219974Smavstatic void
391219974Smavg_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr)
392219974Smav{
393219974Smav	struct g_raid_tr_raid1e_object *trs;
394219974Smav	struct g_raid_subdisk *sd;
395219974Smav	struct g_raid_volume *vol;
396219974Smav
397219974Smav	vol = tr->tro_volume;
398219974Smav	trs = (struct g_raid_tr_raid1e_object *)tr;
399219974Smav	sd = trs->trso_failed_sd;
400219974Smav	if (trs->trso_flags & TR_RAID1E_F_DOING_SOME) {
401219974Smav		G_RAID_DEBUG1(1, vol->v_softc,
402219974Smav		    "Subdisk %s:%d-%s rebuild is aborting.",
403219974Smav		    sd->sd_volume->v_name, sd->sd_pos,
404219974Smav		    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
405219974Smav		trs->trso_flags |= TR_RAID1E_F_ABORT;
406219974Smav	} else {
407219974Smav		G_RAID_DEBUG1(0, vol->v_softc,
408219974Smav		    "Subdisk %s:%d-%s rebuild aborted.",
409219974Smav		    sd->sd_volume->v_name, sd->sd_pos,
410219974Smav		    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
411219974Smav		trs->trso_flags &= ~TR_RAID1E_F_ABORT;
412219974Smav		if (trs->trso_flags & TR_RAID1E_F_LOCKED) {
413219974Smav			trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
414219974Smav			g_raid_unlock_range(tr->tro_volume,
415219974Smav			    trs->trso_lock_pos, trs->trso_lock_len);
416219974Smav		}
417219974Smav		g_raid_tr_raid1e_rebuild_done(trs);
418219974Smav	}
419219974Smav}
420219974Smav
421219974Smavstatic void
422219974Smavg_raid_tr_raid1e_rebuild_some(struct g_raid_tr_object *tr)
423219974Smav{
424219974Smav	struct g_raid_tr_raid1e_object *trs;
425219974Smav	struct g_raid_softc *sc;
426219974Smav	struct g_raid_volume *vol;
427219974Smav	struct g_raid_subdisk *sd;
428219974Smav	struct bio *bp;
429219974Smav	off_t len, virtual, vend, offset, start;
430219974Smav	int disk, copy, best;
431219974Smav
432219974Smav	trs = (struct g_raid_tr_raid1e_object *)tr;
433219974Smav	if (trs->trso_flags & TR_RAID1E_F_DOING_SOME)
434219974Smav		return;
435219974Smav	vol = tr->tro_volume;
436219974Smav	sc = vol->v_softc;
437219974Smav	sd = trs->trso_failed_sd;
438219974Smav
439219974Smav	while (1) {
440219974Smav		if (sd->sd_rebuild_pos >= sd->sd_size) {
441219974Smav			g_raid_tr_raid1e_rebuild_finish(tr);
442219974Smav			return;
443219974Smav		}
444219974Smav		/* Get virtual offset from physical rebuild position. */
445219974Smav		P2V(vol, sd->sd_pos, sd->sd_rebuild_pos, &virtual, &copy);
446219974Smav		/* Get physical offset back to get first stripe position. */
447219974Smav		V2P(vol, virtual, &disk, &offset, &start);
448219974Smav		/* Calculate contignous data length. */
449219974Smav		len = MIN(g_raid1e_rebuild_slab,
450219974Smav		    sd->sd_size - sd->sd_rebuild_pos);
451219974Smav		if ((vol->v_disks_count % N) != 0)
452219974Smav			len = MIN(len, vol->v_strip_size - start);
453219974Smav		/* Find disk with most accurate data. */
454219974Smav		best = g_raid_tr_raid1e_select_read_disk(vol, disk,
455219974Smav		    offset + start, len, 0);
456219974Smav		if (best < 0) {
457219974Smav			/* There is no any valid disk. */
458219974Smav			g_raid_tr_raid1e_rebuild_abort(tr);
459219974Smav			return;
460219974Smav		} else if (best != copy) {
461219974Smav			/* Some other disk has better data. */
462219974Smav			break;
463219974Smav		}
464219974Smav		/* We have the most accurate data. Skip the range. */
465219974Smav		G_RAID_DEBUG1(3, sc, "Skipping rebuild for range %ju - %ju",
466219974Smav		    sd->sd_rebuild_pos, sd->sd_rebuild_pos + len);
467219974Smav		sd->sd_rebuild_pos += len;
468219974Smav	}
469219974Smav
470219974Smav	bp = &trs->trso_bio;
471219974Smav	memset(bp, 0, sizeof(*bp));
472219974Smav	bp->bio_offset = offset + start +
473219974Smav	    ((disk + best >= vol->v_disks_count) ? vol->v_strip_size : 0);
474219974Smav	bp->bio_length = len;
475219974Smav	bp->bio_data = trs->trso_buffer;
476219974Smav	bp->bio_cmd = BIO_READ;
477219974Smav	bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
478219974Smav	bp->bio_caller1 = &vol->v_subdisks[(disk + best) % vol->v_disks_count];
479219974Smav	G_RAID_LOGREQ(3, bp, "Queueing rebuild read");
480219974Smav	/*
481219974Smav	 * If we are crossing stripe boundary, correct affected virtual
482219974Smav	 * range we should lock.
483219974Smav	 */
484219974Smav	if (start + len > vol->v_strip_size) {
485219974Smav		P2V(vol, sd->sd_pos, sd->sd_rebuild_pos + len, &vend, &copy);
486219974Smav		len = vend - virtual;
487219974Smav	}
488219974Smav	trs->trso_flags |= TR_RAID1E_F_DOING_SOME;
489219974Smav	trs->trso_flags |= TR_RAID1E_F_LOCKED;
490219974Smav	trs->trso_lock_pos = virtual;
491219974Smav	trs->trso_lock_len = len;
492219974Smav	/* Lock callback starts I/O */
493219974Smav	g_raid_lock_range(sd->sd_volume, virtual, len, NULL, bp);
494219974Smav}
495219974Smav
496219974Smavstatic void
497219974Smavg_raid_tr_raid1e_rebuild_start(struct g_raid_tr_object *tr)
498219974Smav{
499219974Smav	struct g_raid_volume *vol;
500219974Smav	struct g_raid_tr_raid1e_object *trs;
501219974Smav	struct g_raid_subdisk *sd;
502219974Smav
503219974Smav	vol = tr->tro_volume;
504219974Smav	trs = (struct g_raid_tr_raid1e_object *)tr;
505219974Smav	if (trs->trso_failed_sd) {
506219974Smav		G_RAID_DEBUG1(1, vol->v_softc,
507219974Smav		    "Already rebuild in start rebuild. pos %jd\n",
508219974Smav		    (intmax_t)trs->trso_failed_sd->sd_rebuild_pos);
509219974Smav		return;
510219974Smav	}
511219974Smav	sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_RESYNC);
512219974Smav	if (sd == NULL)
513219974Smav		sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_REBUILD);
514219974Smav	if (sd == NULL) {
515219974Smav		sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_STALE);
516219974Smav		if (sd != NULL) {
517219974Smav			sd->sd_rebuild_pos = 0;
518219974Smav			g_raid_change_subdisk_state(sd,
519219974Smav			    G_RAID_SUBDISK_S_RESYNC);
520219974Smav			g_raid_write_metadata(vol->v_softc, vol, sd, NULL);
521219974Smav		} else {
522219974Smav			sd = g_raid_get_subdisk(vol,
523219974Smav			    G_RAID_SUBDISK_S_UNINITIALIZED);
524219974Smav			if (sd == NULL)
525219974Smav				sd = g_raid_get_subdisk(vol,
526219974Smav				    G_RAID_SUBDISK_S_NEW);
527219974Smav			if (sd != NULL) {
528219974Smav				sd->sd_rebuild_pos = 0;
529219974Smav				g_raid_change_subdisk_state(sd,
530219974Smav				    G_RAID_SUBDISK_S_REBUILD);
531219974Smav				g_raid_write_metadata(vol->v_softc,
532219974Smav				    vol, sd, NULL);
533219974Smav			}
534219974Smav		}
535219974Smav	}
536219974Smav	if (sd == NULL) {
537219974Smav		G_RAID_DEBUG1(1, vol->v_softc,
538219974Smav		    "No failed disk to rebuild.  night night.");
539219974Smav		return;
540219974Smav	}
541219974Smav	trs->trso_failed_sd = sd;
542219974Smav	G_RAID_DEBUG1(0, vol->v_softc,
543219974Smav	    "Subdisk %s:%d-%s rebuild start at %jd.",
544219974Smav	    sd->sd_volume->v_name, sd->sd_pos,
545219974Smav	    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]",
546219974Smav	    trs->trso_failed_sd->sd_rebuild_pos);
547219974Smav	trs->trso_type = TR_RAID1E_REBUILD;
548219974Smav	trs->trso_buffer = malloc(g_raid1e_rebuild_slab, M_TR_RAID1E, M_WAITOK);
549219974Smav	trs->trso_meta_update = g_raid1e_rebuild_meta_update;
550219974Smav	g_raid_tr_raid1e_rebuild_some(tr);
551219974Smav}
552219974Smav
553219974Smavstatic void
554219974Smavg_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
555219974Smav    struct g_raid_subdisk *sd)
556219974Smav{
557219974Smav	struct g_raid_volume *vol;
558219974Smav	struct g_raid_tr_raid1e_object *trs;
559219974Smav	int nr;
560219974Smav
561219974Smav	vol = tr->tro_volume;
562219974Smav	trs = (struct g_raid_tr_raid1e_object *)tr;
563219974Smav	if (trs->trso_stopping)
564219974Smav		return;
565219974Smav	nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_REBUILD) +
566219974Smav	    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC);
567219974Smav	switch(trs->trso_type) {
568219974Smav	case TR_RAID1E_NONE:
569219974Smav		if (vol->v_state < G_RAID_VOLUME_S_DEGRADED)
570219974Smav			return;
571219974Smav		if (nr == 0) {
572219974Smav			nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_NEW) +
573219974Smav			    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
574219974Smav			    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED);
575219974Smav			if (nr == 0)
576219974Smav				return;
577219974Smav		}
578219974Smav		g_raid_tr_raid1e_rebuild_start(tr);
579219974Smav		break;
580219974Smav	case TR_RAID1E_REBUILD:
581219974Smav		if (vol->v_state < G_RAID_VOLUME_S_DEGRADED || nr == 0 ||
582219974Smav		    trs->trso_failed_sd == sd)
583219974Smav			g_raid_tr_raid1e_rebuild_abort(tr);
584219974Smav		break;
585219974Smav	case TR_RAID1E_RESYNC:
586219974Smav		break;
587219974Smav	}
588219974Smav}
589219974Smav
590219974Smavstatic int
591219974Smavg_raid_tr_event_raid1e(struct g_raid_tr_object *tr,
592219974Smav    struct g_raid_subdisk *sd, u_int event)
593219974Smav{
594219974Smav
595219974Smav	g_raid_tr_update_state_raid1e(tr->tro_volume, sd);
596219974Smav	return (0);
597219974Smav}
598219974Smav
599219974Smavstatic int
600219974Smavg_raid_tr_start_raid1e(struct g_raid_tr_object *tr)
601219974Smav{
602219974Smav	struct g_raid_tr_raid1e_object *trs;
603219974Smav	struct g_raid_volume *vol;
604219974Smav
605219974Smav	trs = (struct g_raid_tr_raid1e_object *)tr;
606219974Smav	vol = tr->tro_volume;
607219974Smav	trs->trso_starting = 0;
608219974Smav	g_raid_tr_update_state_raid1e(vol, NULL);
609219974Smav	return (0);
610219974Smav}
611219974Smav
612219974Smavstatic int
613219974Smavg_raid_tr_stop_raid1e(struct g_raid_tr_object *tr)
614219974Smav{
615219974Smav	struct g_raid_tr_raid1e_object *trs;
616219974Smav	struct g_raid_volume *vol;
617219974Smav
618219974Smav	trs = (struct g_raid_tr_raid1e_object *)tr;
619219974Smav	vol = tr->tro_volume;
620219974Smav	trs->trso_starting = 0;
621219974Smav	trs->trso_stopping = 1;
622219974Smav	g_raid_tr_update_state_raid1e(vol, NULL);
623219974Smav	return (0);
624219974Smav}
625219974Smav
626219974Smav/*
627219974Smav * Select the disk to read from.  Take into account: subdisk state, running
628219974Smav * error recovery, average disk load, head position and possible cache hits.
629219974Smav */
630219974Smav#define ABS(x)		(((x) >= 0) ? (x) : (-(x)))
631219974Smavstatic int
632219974Smavg_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
633219974Smav    int no, off_t off, off_t len, u_int mask)
634219974Smav{
635219974Smav	struct g_raid_subdisk *sd;
636219974Smav	off_t offset;
637219974Smav	int i, best, prio, bestprio;
638219974Smav
639219974Smav	best = -1;
640219974Smav	bestprio = INT_MAX;
641219974Smav	for (i = 0; i < N; i++) {
642219974Smav		sd = &vol->v_subdisks[(no + i) % vol->v_disks_count];
643219974Smav		offset = off;
644219974Smav		if (no + i >= vol->v_disks_count)
645219974Smav			offset += vol->v_strip_size;
646219974Smav
647219974Smav		prio = G_RAID_SUBDISK_LOAD(sd);
648219974Smav		if ((mask & (1 << sd->sd_pos)) != 0)
649219974Smav			continue;
650219974Smav		switch (sd->sd_state) {
651219974Smav		case G_RAID_SUBDISK_S_ACTIVE:
652219974Smav			break;
653219974Smav		case G_RAID_SUBDISK_S_RESYNC:
654219974Smav			if (offset + off < sd->sd_rebuild_pos)
655219974Smav				break;
656219974Smav			/* FALLTHROUGH */
657219974Smav		case G_RAID_SUBDISK_S_STALE:
658219974Smav			prio += i << 24;
659219974Smav			break;
660219974Smav		case G_RAID_SUBDISK_S_REBUILD:
661219974Smav			if (offset + off < sd->sd_rebuild_pos)
662219974Smav				break;
663219974Smav			/* FALLTHROUGH */
664219974Smav		default:
665219974Smav			continue;
666219974Smav		}
667219974Smav		prio += min(sd->sd_recovery, 255) << 16;
668219974Smav		/* If disk head is precisely in position - highly prefer it. */
669219974Smav		if (G_RAID_SUBDISK_POS(sd) == offset)
670219974Smav			prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE;
671219974Smav		else
672219974Smav		/* If disk head is close to position - prefer it. */
673219974Smav		if (ABS(G_RAID_SUBDISK_POS(sd) - offset) <
674219974Smav		    G_RAID_SUBDISK_TRACK_SIZE)
675219974Smav			prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE;
676219974Smav		if (prio < bestprio) {
677219974Smav			bestprio = prio;
678219974Smav			best = i;
679219974Smav		}
680219974Smav	}
681219974Smav	return (best);
682219974Smav}
683219974Smav
684219974Smavstatic void
685219974Smavg_raid_tr_iostart_raid1e_read(struct g_raid_tr_object *tr, struct bio *bp)
686219974Smav{
687219974Smav	struct g_raid_volume *vol;
688219974Smav	struct g_raid_subdisk *sd;
689219974Smav	struct bio_queue_head queue;
690219974Smav	struct bio *cbp;
691219974Smav	char *addr;
692219974Smav	off_t offset, start, length, remain;
693219974Smav	u_int no, strip_size;
694219974Smav	int best;
695219974Smav
696219974Smav	vol = tr->tro_volume;
697256610Smav	if ((bp->bio_flags & BIO_UNMAPPED) != 0)
698256610Smav		addr = NULL;
699256610Smav	else
700256610Smav		addr = bp->bio_data;
701219974Smav	strip_size = vol->v_strip_size;
702219974Smav	V2P(vol, bp->bio_offset, &no, &offset, &start);
703219974Smav	remain = bp->bio_length;
704219974Smav	bioq_init(&queue);
705219974Smav	while (remain > 0) {
706219974Smav		length = MIN(strip_size - start, remain);
707219974Smav		best = g_raid_tr_raid1e_select_read_disk(vol,
708219974Smav		    no, offset, length, 0);
709219974Smav		KASSERT(best >= 0, ("No readable disk in volume %s!",
710219974Smav		    vol->v_name));
711219974Smav		no += best;
712219974Smav		if (no >= vol->v_disks_count) {
713219974Smav			no -= vol->v_disks_count;
714219974Smav			offset += strip_size;
715219974Smav		}
716219974Smav		cbp = g_clone_bio(bp);
717219974Smav		if (cbp == NULL)
718219974Smav			goto failure;
719219974Smav		cbp->bio_offset = offset + start;
720219974Smav		cbp->bio_length = length;
721256610Smav		if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
722256610Smav			cbp->bio_ma_offset += (uintptr_t)addr;
723256610Smav			cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
724256610Smav			cbp->bio_ma_offset %= PAGE_SIZE;
725256610Smav			cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
726256610Smav			    cbp->bio_length) / PAGE_SIZE;
727256610Smav		} else
728256610Smav			cbp->bio_data = addr;
729219974Smav		cbp->bio_caller1 = &vol->v_subdisks[no];
730219974Smav		bioq_insert_tail(&queue, cbp);
731219974Smav		no += N - best;
732219974Smav		if (no >= vol->v_disks_count) {
733219974Smav			no -= vol->v_disks_count;
734219974Smav			offset += strip_size;
735219974Smav		}
736219974Smav		remain -= length;
737219974Smav		addr += length;
738219974Smav		start = 0;
739219974Smav	}
740256610Smav	while ((cbp = bioq_takefirst(&queue)) != NULL) {
741219974Smav		sd = cbp->bio_caller1;
742219974Smav		cbp->bio_caller1 = NULL;
743219974Smav		g_raid_subdisk_iostart(sd, cbp);
744219974Smav	}
745219974Smav	return;
746219974Smavfailure:
747256610Smav	while ((cbp = bioq_takefirst(&queue)) != NULL)
748219974Smav		g_destroy_bio(cbp);
749219974Smav	if (bp->bio_error == 0)
750219974Smav		bp->bio_error = ENOMEM;
751219974Smav	g_raid_iodone(bp, bp->bio_error);
752219974Smav}
753219974Smav
754219974Smavstatic void
755219974Smavg_raid_tr_iostart_raid1e_write(struct g_raid_tr_object *tr, struct bio *bp)
756219974Smav{
757219974Smav	struct g_raid_volume *vol;
758219974Smav	struct g_raid_subdisk *sd;
759219974Smav	struct bio_queue_head queue;
760219974Smav	struct bio *cbp;
761219974Smav	char *addr;
762219974Smav	off_t offset, start, length, remain;
763219974Smav	u_int no, strip_size;
764219974Smav	int i;
765219974Smav
766219974Smav	vol = tr->tro_volume;
767256610Smav	if ((bp->bio_flags & BIO_UNMAPPED) != 0)
768256610Smav		addr = NULL;
769256610Smav	else
770256610Smav		addr = bp->bio_data;
771219974Smav	strip_size = vol->v_strip_size;
772219974Smav	V2P(vol, bp->bio_offset, &no, &offset, &start);
773219974Smav	remain = bp->bio_length;
774219974Smav	bioq_init(&queue);
775219974Smav	while (remain > 0) {
776219974Smav		length = MIN(strip_size - start, remain);
777219974Smav		for (i = 0; i < N; i++) {
778219974Smav			sd = &vol->v_subdisks[no];
779219974Smav			switch (sd->sd_state) {
780219974Smav			case G_RAID_SUBDISK_S_ACTIVE:
781219974Smav			case G_RAID_SUBDISK_S_STALE:
782219974Smav			case G_RAID_SUBDISK_S_RESYNC:
783219974Smav				break;
784219974Smav			case G_RAID_SUBDISK_S_REBUILD:
785219974Smav				if (offset + start >= sd->sd_rebuild_pos)
786219974Smav					goto nextdisk;
787219974Smav				break;
788219974Smav			default:
789219974Smav				goto nextdisk;
790219974Smav			}
791219974Smav			cbp = g_clone_bio(bp);
792219974Smav			if (cbp == NULL)
793219974Smav				goto failure;
794219974Smav			cbp->bio_offset = offset + start;
795219974Smav			cbp->bio_length = length;
796256610Smav			if ((bp->bio_flags & BIO_UNMAPPED) != 0 &&
797256610Smav			    bp->bio_cmd != BIO_DELETE) {
798256610Smav				cbp->bio_ma_offset += (uintptr_t)addr;
799256610Smav				cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
800256610Smav				cbp->bio_ma_offset %= PAGE_SIZE;
801256610Smav				cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
802256610Smav				    cbp->bio_length) / PAGE_SIZE;
803256610Smav			} else
804256610Smav				cbp->bio_data = addr;
805219974Smav			cbp->bio_caller1 = sd;
806219974Smav			bioq_insert_tail(&queue, cbp);
807219974Smavnextdisk:
808219974Smav			if (++no >= vol->v_disks_count) {
809219974Smav				no = 0;
810219974Smav				offset += strip_size;
811219974Smav			}
812219974Smav		}
813219974Smav		remain -= length;
814242323Smav		if (bp->bio_cmd != BIO_DELETE)
815242323Smav			addr += length;
816219974Smav		start = 0;
817219974Smav	}
818256610Smav	while ((cbp = bioq_takefirst(&queue)) != NULL) {
819219974Smav		sd = cbp->bio_caller1;
820219974Smav		cbp->bio_caller1 = NULL;
821219974Smav		g_raid_subdisk_iostart(sd, cbp);
822219974Smav	}
823219974Smav	return;
824219974Smavfailure:
825256610Smav	while ((cbp = bioq_takefirst(&queue)) != NULL)
826219974Smav		g_destroy_bio(cbp);
827219974Smav	if (bp->bio_error == 0)
828219974Smav		bp->bio_error = ENOMEM;
829219974Smav	g_raid_iodone(bp, bp->bio_error);
830219974Smav}
831219974Smav
832219974Smavstatic void
833219974Smavg_raid_tr_iostart_raid1e(struct g_raid_tr_object *tr, struct bio *bp)
834219974Smav{
835219974Smav	struct g_raid_volume *vol;
836219974Smav	struct g_raid_tr_raid1e_object *trs;
837219974Smav
838219974Smav	vol = tr->tro_volume;
839219974Smav	trs = (struct g_raid_tr_raid1e_object *)tr;
840219974Smav	if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL &&
841219974Smav	    vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL &&
842219974Smav	    vol->v_state != G_RAID_VOLUME_S_DEGRADED) {
843219974Smav		g_raid_iodone(bp, EIO);
844219974Smav		return;
845219974Smav	}
846219974Smav	/*
847219974Smav	 * If we're rebuilding, squeeze in rebuild activity every so often,
848219974Smav	 * even when the disk is busy.  Be sure to only count real I/O
849219974Smav	 * to the disk.  All 'SPECIAL' I/O is traffic generated to the disk
850219974Smav	 * by this module.
851219974Smav	 */
852219974Smav	if (trs->trso_failed_sd != NULL &&
853219974Smav	    !(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL)) {
854219974Smav		/* Make this new or running now round short. */
855219974Smav		trs->trso_recover_slabs = 0;
856219974Smav		if (--trs->trso_fair_io <= 0) {
857219974Smav			trs->trso_fair_io = g_raid1e_rebuild_fair_io;
858219974Smav			g_raid_tr_raid1e_rebuild_some(tr);
859219974Smav		}
860219974Smav	}
861219974Smav	switch (bp->bio_cmd) {
862219974Smav	case BIO_READ:
863219974Smav		g_raid_tr_iostart_raid1e_read(tr, bp);
864219974Smav		break;
865219974Smav	case BIO_WRITE:
866242323Smav	case BIO_DELETE:
867219974Smav		g_raid_tr_iostart_raid1e_write(tr, bp);
868219974Smav		break;
869219974Smav	case BIO_FLUSH:
870219974Smav		g_raid_tr_flush_common(tr, bp);
871219974Smav		break;
872219974Smav	default:
873219974Smav		KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)",
874219974Smav		    bp->bio_cmd, vol->v_name));
875219974Smav		break;
876219974Smav	}
877219974Smav}
878219974Smav
879219974Smavstatic void
880219974Smavg_raid_tr_iodone_raid1e(struct g_raid_tr_object *tr,
881219974Smav    struct g_raid_subdisk *sd, struct bio *bp)
882219974Smav{
883219974Smav	struct bio *cbp;
884219974Smav	struct g_raid_subdisk *nsd;
885219974Smav	struct g_raid_volume *vol;
886219974Smav	struct bio *pbp;
887219974Smav	struct g_raid_tr_raid1e_object *trs;
888219974Smav	off_t virtual, offset, start;
889219974Smav	uintptr_t mask;
890219974Smav	int error, do_write, copy, disk, best;
891219974Smav
892219974Smav	trs = (struct g_raid_tr_raid1e_object *)tr;
893219974Smav	vol = tr->tro_volume;
894219974Smav	if (bp->bio_cflags & G_RAID_BIO_FLAG_SYNC) {
895219974Smav		if (trs->trso_type == TR_RAID1E_REBUILD) {
896219974Smav			nsd = trs->trso_failed_sd;
897219974Smav			if (bp->bio_cmd == BIO_READ) {
898219974Smav
899219974Smav				/* Immediately abort rebuild, if requested. */
900219974Smav				if (trs->trso_flags & TR_RAID1E_F_ABORT) {
901219974Smav					trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
902219974Smav					g_raid_tr_raid1e_rebuild_abort(tr);
903219974Smav					return;
904219974Smav				}
905219974Smav
906219974Smav				/* On read error, skip and cross fingers. */
907219974Smav				if (bp->bio_error != 0) {
908219974Smav					G_RAID_LOGREQ(0, bp,
909219974Smav					    "Read error during rebuild (%d), "
910219974Smav					    "possible data loss!",
911219974Smav					    bp->bio_error);
912219974Smav					goto rebuild_round_done;
913219974Smav				}
914219974Smav
915219974Smav				/*
916219974Smav				 * The read operation finished, queue the
917219974Smav				 * write and get out.
918219974Smav				 */
919219974Smav				G_RAID_LOGREQ(3, bp, "Rebuild read done: %d",
920219974Smav				    bp->bio_error);
921219974Smav				bp->bio_cmd = BIO_WRITE;
922219974Smav				bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
923219974Smav				bp->bio_offset = nsd->sd_rebuild_pos;
924219974Smav				G_RAID_LOGREQ(3, bp, "Queueing rebuild write.");
925219974Smav				g_raid_subdisk_iostart(nsd, bp);
926219974Smav			} else {
927219974Smav				/*
928219974Smav				 * The write operation just finished.  Do
929219974Smav				 * another.  We keep cloning the master bio
930219974Smav				 * since it has the right buffers allocated to
931219974Smav				 * it.
932219974Smav				 */
933219974Smav				G_RAID_LOGREQ(3, bp, "Rebuild write done: %d",
934219974Smav				    bp->bio_error);
935219974Smav				if (bp->bio_error != 0 ||
936219974Smav				    trs->trso_flags & TR_RAID1E_F_ABORT) {
937219974Smav					if ((trs->trso_flags &
938219974Smav					    TR_RAID1E_F_ABORT) == 0) {
939219974Smav						g_raid_tr_raid1e_fail_disk(sd->sd_softc,
940219974Smav						    nsd, nsd->sd_disk);
941219974Smav					}
942219974Smav					trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
943219974Smav					g_raid_tr_raid1e_rebuild_abort(tr);
944219974Smav					return;
945219974Smav				}
946219974Smavrebuild_round_done:
947219974Smav				trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
948219974Smav				g_raid_unlock_range(tr->tro_volume,
949219974Smav				    trs->trso_lock_pos, trs->trso_lock_len);
950219974Smav				nsd->sd_rebuild_pos += bp->bio_length;
951219974Smav				if (nsd->sd_rebuild_pos >= nsd->sd_size) {
952219974Smav					g_raid_tr_raid1e_rebuild_finish(tr);
953219974Smav					return;
954219974Smav				}
955219974Smav
956219974Smav				/* Abort rebuild if we are stopping */
957219974Smav				if (trs->trso_stopping) {
958219974Smav					trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
959219974Smav					g_raid_tr_raid1e_rebuild_abort(tr);
960219974Smav					return;
961219974Smav				}
962219974Smav
963219974Smav				if (--trs->trso_meta_update <= 0) {
964219974Smav					g_raid_write_metadata(vol->v_softc,
965219974Smav					    vol, nsd, nsd->sd_disk);
966219974Smav					trs->trso_meta_update =
967219974Smav					    g_raid1e_rebuild_meta_update;
968219974Smav					/* Compensate short rebuild I/Os. */
969219974Smav					if ((vol->v_disks_count % N) != 0 &&
970219974Smav					    vol->v_strip_size <
971219974Smav					     g_raid1e_rebuild_slab) {
972219974Smav						trs->trso_meta_update *=
973219974Smav						    g_raid1e_rebuild_slab;
974219974Smav						trs->trso_meta_update /=
975219974Smav						    vol->v_strip_size;
976219974Smav					}
977219974Smav				}
978219974Smav				trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
979219974Smav				if (--trs->trso_recover_slabs <= 0)
980219974Smav					return;
981219974Smav				/* Run next rebuild iteration. */
982219974Smav				g_raid_tr_raid1e_rebuild_some(tr);
983219974Smav			}
984219974Smav		} else if (trs->trso_type == TR_RAID1E_RESYNC) {
985219974Smav			/*
986219974Smav			 * read good sd, read bad sd in parallel.  when both
987219974Smav			 * done, compare the buffers.  write good to the bad
988219974Smav			 * if different.  do the next bit of work.
989219974Smav			 */
990219974Smav			panic("Somehow, we think we're doing a resync");
991219974Smav		}
992219974Smav		return;
993219974Smav	}
994219974Smav	pbp = bp->bio_parent;
995219974Smav	pbp->bio_inbed++;
996219974Smav	mask = (intptr_t)bp->bio_caller2;
997219974Smav	if (bp->bio_cmd == BIO_READ && bp->bio_error != 0) {
998219974Smav		/*
999219974Smav		 * Read failed on first drive.  Retry the read error on
1000219974Smav		 * another disk drive, if available, before erroring out the
1001219974Smav		 * read.
1002219974Smav		 */
1003219974Smav		sd->sd_disk->d_read_errs++;
1004219974Smav		G_RAID_LOGREQ(0, bp,
1005219974Smav		    "Read error (%d), %d read errors total",
1006219974Smav		    bp->bio_error, sd->sd_disk->d_read_errs);
1007219974Smav
1008219974Smav		/*
1009219974Smav		 * If there are too many read errors, we move to degraded.
1010219974Smav		 * XXX Do we want to FAIL the drive (eg, make the user redo
1011219974Smav		 * everything to get it back in sync), or just degrade the
1012219974Smav		 * drive, which kicks off a resync?
1013219974Smav		 */
1014219974Smav		do_write = 0;
1015219974Smav		if (sd->sd_disk->d_read_errs > g_raid_read_err_thresh)
1016219974Smav			g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
1017219974Smav		else if (mask == 0)
1018219974Smav			do_write = 1;
1019219974Smav
1020219974Smav		/* Restore what we were doing. */
1021219974Smav		P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
1022219974Smav		V2P(vol, virtual, &disk, &offset, &start);
1023219974Smav
1024219974Smav		/* Find the other disk, and try to do the I/O to it. */
1025219974Smav		mask |= 1 << copy;
1026219974Smav		best = g_raid_tr_raid1e_select_read_disk(vol,
1027219974Smav		    disk, offset, start, mask);
1028219974Smav		if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
1029219974Smav			disk += best;
1030219974Smav			if (disk >= vol->v_disks_count) {
1031219974Smav				disk -= vol->v_disks_count;
1032219974Smav				offset += vol->v_strip_size;
1033219974Smav			}
1034219974Smav			cbp->bio_offset = offset + start;
1035219974Smav			cbp->bio_length = bp->bio_length;
1036219974Smav			cbp->bio_data = bp->bio_data;
1037256610Smav			cbp->bio_ma = bp->bio_ma;
1038256610Smav			cbp->bio_ma_offset = bp->bio_ma_offset;
1039256610Smav			cbp->bio_ma_n = bp->bio_ma_n;
1040219974Smav			g_destroy_bio(bp);
1041219974Smav			nsd = &vol->v_subdisks[disk];
1042219974Smav			G_RAID_LOGREQ(2, cbp, "Retrying read from %d",
1043219974Smav			    nsd->sd_pos);
1044219974Smav			if (do_write)
1045219974Smav				mask |= 1 << 31;
1046258780Seadler			if ((mask & (1U << 31)) != 0)
1047219974Smav				sd->sd_recovery++;
1048219974Smav			cbp->bio_caller2 = (void *)mask;
1049219974Smav			if (do_write) {
1050219974Smav				cbp->bio_caller1 = nsd;
1051219974Smav				/* Lock callback starts I/O */
1052219974Smav				g_raid_lock_range(sd->sd_volume,
1053219974Smav				    virtual, cbp->bio_length, pbp, cbp);
1054219974Smav			} else {
1055219974Smav				g_raid_subdisk_iostart(nsd, cbp);
1056219974Smav			}
1057219974Smav			return;
1058219974Smav		}
1059219974Smav		/*
1060219974Smav		 * We can't retry.  Return the original error by falling
1061219974Smav		 * through.  This will happen when there's only one good disk.
1062219974Smav		 * We don't need to fail the raid, since its actual state is
1063219974Smav		 * based on the state of the subdisks.
1064219974Smav		 */
1065219974Smav		G_RAID_LOGREQ(2, bp, "Couldn't retry read, failing it");
1066219974Smav	}
1067219974Smav	if (bp->bio_cmd == BIO_READ &&
1068219974Smav	    bp->bio_error == 0 &&
1069258780Seadler	    (mask & (1U << 31)) != 0) {
1070219974Smav		G_RAID_LOGREQ(3, bp, "Recovered data from other drive");
1071219974Smav
1072219974Smav		/* Restore what we were doing. */
1073219974Smav		P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
1074219974Smav		V2P(vol, virtual, &disk, &offset, &start);
1075219974Smav
1076219974Smav		/* Find best disk to write. */
1077219974Smav		best = g_raid_tr_raid1e_select_read_disk(vol,
1078219974Smav		    disk, offset, start, ~mask);
1079219974Smav		if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
1080219974Smav			disk += best;
1081219974Smav			if (disk >= vol->v_disks_count) {
1082219974Smav				disk -= vol->v_disks_count;
1083219974Smav				offset += vol->v_strip_size;
1084219974Smav			}
1085219974Smav			cbp->bio_offset = offset + start;
1086219974Smav			cbp->bio_cmd = BIO_WRITE;
1087219974Smav			cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP;
1088219974Smav			cbp->bio_caller2 = (void *)mask;
1089219974Smav			g_destroy_bio(bp);
1090219974Smav			G_RAID_LOGREQ(2, cbp,
1091219974Smav			    "Attempting bad sector remap on failing drive.");
1092219974Smav			g_raid_subdisk_iostart(&vol->v_subdisks[disk], cbp);
1093219974Smav			return;
1094219974Smav		}
1095219974Smav	}
1096258780Seadler	if ((mask & (1U << 31)) != 0) {
1097219974Smav		/*
1098219974Smav		 * We're done with a recovery, mark the range as unlocked.
1099298808Spfg		 * For any write errors, we aggressively fail the disk since
1100219974Smav		 * there was both a READ and a WRITE error at this location.
1101219974Smav		 * Both types of errors generally indicates the drive is on
1102219974Smav		 * the verge of total failure anyway.  Better to stop trusting
1103219974Smav		 * it now.  However, we need to reset error to 0 in that case
1104219974Smav		 * because we're not failing the original I/O which succeeded.
1105219974Smav		 */
1106219974Smav
1107219974Smav		/* Restore what we were doing. */
1108219974Smav		P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
1109219974Smav		V2P(vol, virtual, &disk, &offset, &start);
1110219974Smav
1111219974Smav		for (copy = 0; copy < N; copy++) {
1112219974Smav			if ((mask & (1 << copy) ) != 0)
1113219974Smav				vol->v_subdisks[(disk + copy) %
1114219974Smav				    vol->v_disks_count].sd_recovery--;
1115219974Smav		}
1116219974Smav
1117219974Smav		if (bp->bio_cmd == BIO_WRITE && bp->bio_error) {
1118219974Smav			G_RAID_LOGREQ(0, bp, "Remap write failed: "
1119219974Smav			    "failing subdisk.");
1120219974Smav			g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
1121219974Smav			bp->bio_error = 0;
1122219974Smav		}
1123219974Smav		G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error);
1124219974Smav		g_raid_unlock_range(sd->sd_volume, virtual, bp->bio_length);
1125219974Smav	}
1126242328Smav	if (pbp->bio_cmd != BIO_READ) {
1127235270Smav		if (pbp->bio_inbed == 1 || pbp->bio_error != 0)
1128235270Smav			pbp->bio_error = bp->bio_error;
1129242328Smav		if (pbp->bio_cmd == BIO_WRITE && bp->bio_error != 0) {
1130235270Smav			G_RAID_LOGREQ(0, bp, "Write failed: failing subdisk.");
1131235270Smav			g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
1132235270Smav		}
1133235270Smav		error = pbp->bio_error;
1134235270Smav	} else
1135235270Smav		error = bp->bio_error;
1136219974Smav	g_destroy_bio(bp);
1137219974Smav	if (pbp->bio_children == pbp->bio_inbed) {
1138219974Smav		pbp->bio_completed = pbp->bio_length;
1139219974Smav		g_raid_iodone(pbp, error);
1140219974Smav	}
1141219974Smav}
1142219974Smav
1143219974Smavstatic int
1144219974Smavg_raid_tr_kerneldump_raid1e(struct g_raid_tr_object *tr,
1145219974Smav    void *virtual, vm_offset_t physical, off_t boffset, size_t blength)
1146219974Smav{
1147219974Smav	struct g_raid_volume *vol;
1148219974Smav	struct g_raid_subdisk *sd;
1149219974Smav	struct bio_queue_head queue;
1150219974Smav	char *addr;
1151219974Smav	off_t offset, start, length, remain;
1152219974Smav	u_int no, strip_size;
1153219974Smav	int i, error;
1154219974Smav
1155219974Smav	vol = tr->tro_volume;
1156219974Smav	addr = virtual;
1157219974Smav	strip_size = vol->v_strip_size;
1158219974Smav	V2P(vol, boffset, &no, &offset, &start);
1159219974Smav	remain = blength;
1160219974Smav	bioq_init(&queue);
1161219974Smav	while (remain > 0) {
1162219974Smav		length = MIN(strip_size - start, remain);
1163219974Smav		for (i = 0; i < N; i++) {
1164219974Smav			sd = &vol->v_subdisks[no];
1165219974Smav			switch (sd->sd_state) {
1166219974Smav			case G_RAID_SUBDISK_S_ACTIVE:
1167219974Smav			case G_RAID_SUBDISK_S_STALE:
1168219974Smav			case G_RAID_SUBDISK_S_RESYNC:
1169219974Smav				break;
1170219974Smav			case G_RAID_SUBDISK_S_REBUILD:
1171219974Smav				if (offset + start >= sd->sd_rebuild_pos)
1172219974Smav					goto nextdisk;
1173219974Smav				break;
1174219974Smav			default:
1175219974Smav				goto nextdisk;
1176219974Smav			}
1177219974Smav			error = g_raid_subdisk_kerneldump(sd,
1178219974Smav			    addr, 0, offset + start, length);
1179219974Smav			if (error != 0)
1180219974Smav				return (error);
1181219974Smavnextdisk:
1182219974Smav			if (++no >= vol->v_disks_count) {
1183219974Smav				no = 0;
1184219974Smav				offset += strip_size;
1185219974Smav			}
1186219974Smav		}
1187219974Smav		remain -= length;
1188219974Smav		addr += length;
1189219974Smav		start = 0;
1190219974Smav	}
1191219974Smav	return (0);
1192219974Smav}
1193219974Smav
1194219974Smavstatic int
1195219974Smavg_raid_tr_locked_raid1e(struct g_raid_tr_object *tr, void *argp)
1196219974Smav{
1197219974Smav	struct bio *bp;
1198219974Smav	struct g_raid_subdisk *sd;
1199219974Smav
1200219974Smav	bp = (struct bio *)argp;
1201219974Smav	sd = (struct g_raid_subdisk *)bp->bio_caller1;
1202219974Smav	g_raid_subdisk_iostart(sd, bp);
1203219974Smav
1204219974Smav	return (0);
1205219974Smav}
1206219974Smav
1207219974Smavstatic int
1208219974Smavg_raid_tr_idle_raid1e(struct g_raid_tr_object *tr)
1209219974Smav{
1210219974Smav	struct g_raid_tr_raid1e_object *trs;
1211219974Smav	struct g_raid_volume *vol;
1212219974Smav
1213219974Smav	vol = tr->tro_volume;
1214219974Smav	trs = (struct g_raid_tr_raid1e_object *)tr;
1215219974Smav	trs->trso_fair_io = g_raid1e_rebuild_fair_io;
1216219974Smav	trs->trso_recover_slabs = g_raid1e_rebuild_cluster_idle;
1217219974Smav	/* Compensate short rebuild I/Os. */
1218219974Smav	if ((vol->v_disks_count % N) != 0 &&
1219219974Smav	    vol->v_strip_size < g_raid1e_rebuild_slab) {
1220219974Smav		trs->trso_recover_slabs *= g_raid1e_rebuild_slab;
1221219974Smav		trs->trso_recover_slabs /= vol->v_strip_size;
1222219974Smav	}
1223219974Smav	if (trs->trso_type == TR_RAID1E_REBUILD)
1224219974Smav		g_raid_tr_raid1e_rebuild_some(tr);
1225219974Smav	return (0);
1226219974Smav}
1227219974Smav
1228219974Smavstatic int
1229219974Smavg_raid_tr_free_raid1e(struct g_raid_tr_object *tr)
1230219974Smav{
1231219974Smav	struct g_raid_tr_raid1e_object *trs;
1232219974Smav
1233219974Smav	trs = (struct g_raid_tr_raid1e_object *)tr;
1234219974Smav
1235219974Smav	if (trs->trso_buffer != NULL) {
1236219974Smav		free(trs->trso_buffer, M_TR_RAID1E);
1237219974Smav		trs->trso_buffer = NULL;
1238219974Smav	}
1239219974Smav	return (0);
1240219974Smav}
1241219974Smav
/*
 * NOTE(review): presumably declares this transformation class to the
 * g_raid framework under the identifier raid1e and the label "RAID1E" --
 * confirm against the definition of G_RAID_TR_DECLARE.
 */
1242240465SmavG_RAID_TR_DECLARE(raid1e, "RAID1E");
1243