tr_raid1e.c revision 256610
1219974Smav/*-
2219974Smav * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
3219974Smav * All rights reserved.
4219974Smav *
5219974Smav * Redistribution and use in source and binary forms, with or without
6219974Smav * modification, are permitted provided that the following conditions
7219974Smav * are met:
8219974Smav * 1. Redistributions of source code must retain the above copyright
9219974Smav *    notice, this list of conditions and the following disclaimer.
10219974Smav * 2. Redistributions in binary form must reproduce the above copyright
11219974Smav *    notice, this list of conditions and the following disclaimer in the
12219974Smav *    documentation and/or other materials provided with the distribution.
13219974Smav *
14219974Smav * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
15219974Smav * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16219974Smav * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17219974Smav * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
18219974Smav * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19219974Smav * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20219974Smav * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21219974Smav * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22219974Smav * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23219974Smav * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24219974Smav * SUCH DAMAGE.
25219974Smav */
26219974Smav
27219974Smav#include <sys/cdefs.h>
28219974Smav__FBSDID("$FreeBSD: head/sys/geom/raid/tr_raid1e.c 256610 2013-10-16 09:33:23Z mav $");
29219974Smav
30219974Smav#include <sys/param.h>
31219974Smav#include <sys/bio.h>
32219974Smav#include <sys/endian.h>
33219974Smav#include <sys/kernel.h>
34219974Smav#include <sys/kobj.h>
35219974Smav#include <sys/limits.h>
36219974Smav#include <sys/lock.h>
37219974Smav#include <sys/malloc.h>
38219974Smav#include <sys/mutex.h>
39219974Smav#include <sys/sysctl.h>
40219974Smav#include <sys/systm.h>
41219974Smav#include <geom/geom.h>
42219974Smav#include "geom/raid/g_raid.h"
43219974Smav#include "g_raid_tr_if.h"
44219974Smav
45219974Smav#define N	2
46219974Smav
47240465SmavSYSCTL_DECL(_kern_geom_raid_raid1e);
48219974Smav
/*
 * Rebuild pacing knobs: each is a loader tunable with a matching
 * run-time sysctl under kern.geom.raid.raid1e.
 */
#define RAID1E_REBUILD_SLAB	(1 << 20) /* One transaction in a rebuild */
static int g_raid1e_rebuild_slab = RAID1E_REBUILD_SLAB;
TUNABLE_INT("kern.geom.raid.raid1e.rebuild_slab_size",
    &g_raid1e_rebuild_slab);
SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_slab_size, CTLFLAG_RW,
    &g_raid1e_rebuild_slab, 0,
    "Amount of the disk to rebuild each read/write cycle of the rebuild.");

#define RAID1E_REBUILD_FAIR_IO 20 /* use 1/x of the available I/O */
static int g_raid1e_rebuild_fair_io = RAID1E_REBUILD_FAIR_IO;
TUNABLE_INT("kern.geom.raid.raid1e.rebuild_fair_io",
    &g_raid1e_rebuild_fair_io);
SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_fair_io, CTLFLAG_RW,
    &g_raid1e_rebuild_fair_io, 0,
    "Fraction of the I/O bandwidth to use when disk busy for rebuild.");

#define RAID1E_REBUILD_CLUSTER_IDLE 100
static int g_raid1e_rebuild_cluster_idle = RAID1E_REBUILD_CLUSTER_IDLE;
TUNABLE_INT("kern.geom.raid.raid1e.rebuild_cluster_idle",
    &g_raid1e_rebuild_cluster_idle);
SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_cluster_idle, CTLFLAG_RW,
    &g_raid1e_rebuild_cluster_idle, 0,
    "Number of slabs to do each time we trigger a rebuild cycle");

/* How many slabs between metadata updates (~1 GB at the default slab). */
#define RAID1E_REBUILD_META_UPDATE 1024 /* update meta data every 1GB or so */
static int g_raid1e_rebuild_meta_update = RAID1E_REBUILD_META_UPDATE;
TUNABLE_INT("kern.geom.raid.raid1e.rebuild_meta_update",
    &g_raid1e_rebuild_meta_update);
SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_meta_update, CTLFLAG_RW,
    &g_raid1e_rebuild_meta_update, 0,
    "When to update the meta data.");
80219974Smav
81219974Smavstatic MALLOC_DEFINE(M_TR_RAID1E, "tr_raid1e_data", "GEOM_RAID RAID1E data");
82219974Smav
83219974Smav#define TR_RAID1E_NONE 0
84219974Smav#define TR_RAID1E_REBUILD 1
85219974Smav#define TR_RAID1E_RESYNC 2
86219974Smav
87219974Smav#define TR_RAID1E_F_DOING_SOME	0x1
88219974Smav#define TR_RAID1E_F_LOCKED	0x2
89219974Smav#define TR_RAID1E_F_ABORT	0x4
90219974Smav
/*
 * Per-volume RAID1E transformation state.  Tracks start/stop progress
 * and the context of an in-progress rebuild/resync, which proceeds one
 * locked slab at a time using the embedded trso_bio.
 */
struct g_raid_tr_raid1e_object {
	struct g_raid_tr_object	 trso_base;	/* Generic TR object; must be first (casts rely on it). */
	int			 trso_starting;	/* Volume still starting; state held at STARTING. */
	int			 trso_stopping;	/* Stop requested; state goes STOPPED when idle. */
	int			 trso_type;	/* TR_RAID1E_{NONE,REBUILD,RESYNC}. */
	int			 trso_recover_slabs; /* slabs before rest */
	int			 trso_fair_io;
	int			 trso_meta_update; /* Slabs left before the next metadata write. */
	int			 trso_flags;	/* TR_RAID1E_F_* flags. */
	struct g_raid_subdisk	*trso_failed_sd; /* like per volume */
	void			*trso_buffer;	 /* Buffer space */
	off_t			 trso_lock_pos; /* Locked range start. */
	off_t			 trso_lock_len; /* Locked range length. */
	struct bio		 trso_bio;	/* Preallocated bio for rebuild I/O. */
};
106219974Smav
107219974Smavstatic g_raid_tr_taste_t g_raid_tr_taste_raid1e;
108219974Smavstatic g_raid_tr_event_t g_raid_tr_event_raid1e;
109219974Smavstatic g_raid_tr_start_t g_raid_tr_start_raid1e;
110219974Smavstatic g_raid_tr_stop_t g_raid_tr_stop_raid1e;
111219974Smavstatic g_raid_tr_iostart_t g_raid_tr_iostart_raid1e;
112219974Smavstatic g_raid_tr_iodone_t g_raid_tr_iodone_raid1e;
113219974Smavstatic g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid1e;
114219974Smavstatic g_raid_tr_locked_t g_raid_tr_locked_raid1e;
115219974Smavstatic g_raid_tr_idle_t g_raid_tr_idle_raid1e;
116219974Smavstatic g_raid_tr_free_t g_raid_tr_free_raid1e;
117219974Smav
/* kobj method table wiring this transformation into the g_raid TR interface. */
static kobj_method_t g_raid_tr_raid1e_methods[] = {
	KOBJMETHOD(g_raid_tr_taste,	g_raid_tr_taste_raid1e),
	KOBJMETHOD(g_raid_tr_event,	g_raid_tr_event_raid1e),
	KOBJMETHOD(g_raid_tr_start,	g_raid_tr_start_raid1e),
	KOBJMETHOD(g_raid_tr_stop,	g_raid_tr_stop_raid1e),
	KOBJMETHOD(g_raid_tr_iostart,	g_raid_tr_iostart_raid1e),
	KOBJMETHOD(g_raid_tr_iodone,	g_raid_tr_iodone_raid1e),
	KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid1e),
	KOBJMETHOD(g_raid_tr_locked,	g_raid_tr_locked_raid1e),
	KOBJMETHOD(g_raid_tr_idle,	g_raid_tr_idle_raid1e),
	KOBJMETHOD(g_raid_tr_free,	g_raid_tr_free_raid1e),
	{ 0, 0 }	/* Terminator. */
};
131219974Smav
/* Class descriptor registered with the g_raid core. */
static struct g_raid_tr_class g_raid_tr_raid1e_class = {
	"RAID1E",
	g_raid_tr_raid1e_methods,
	sizeof(struct g_raid_tr_raid1e_object),
	.trc_enable = 1,
	.trc_priority = 200,
	.trc_accept_unmapped = 1	/* Handles BIO_UNMAPPED requests. */
};
140219974Smav
141219974Smavstatic void g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr);
142219974Smavstatic void g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
143219974Smav    struct g_raid_subdisk *sd);
144219974Smavstatic int g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
145219974Smav    int no, off_t off, off_t len, u_int mask);
146219974Smav
147219974Smavstatic inline void
148219974SmavV2P(struct g_raid_volume *vol, off_t virt,
149219974Smav    int *disk, off_t *offset, off_t *start)
150219974Smav{
151219974Smav	off_t nstrip;
152219974Smav	u_int strip_size;
153219974Smav
154219974Smav	strip_size = vol->v_strip_size;
155219974Smav	/* Strip number. */
156219974Smav	nstrip = virt / strip_size;
157219974Smav	/* Start position in strip. */
158219974Smav	*start = virt % strip_size;
159219974Smav	/* Disk number. */
160219974Smav	*disk = (nstrip * N) % vol->v_disks_count;
161219974Smav	/* Strip start position in disk. */
162219974Smav	*offset = ((nstrip * N) / vol->v_disks_count) * strip_size;
163219974Smav}
164219974Smav
165219974Smavstatic inline void
166219974SmavP2V(struct g_raid_volume *vol, int disk, off_t offset,
167219974Smav    off_t *virt, int *copy)
168219974Smav{
169219974Smav	off_t nstrip, start;
170219974Smav	u_int strip_size;
171219974Smav
172219974Smav	strip_size = vol->v_strip_size;
173219974Smav	/* Start position in strip. */
174219974Smav	start = offset % strip_size;
175219974Smav	/* Physical strip number. */
176219974Smav	nstrip = (offset / strip_size) * vol->v_disks_count + disk;
177219974Smav	/* Number of physical strip (copy) inside virtual strip. */
178219974Smav	*copy = nstrip % N;
179219974Smav	/* Offset in virtual space. */
180219974Smav	*virt = (nstrip / N) * strip_size + start;
181219974Smav}
182219974Smav
183219974Smavstatic int
184219974Smavg_raid_tr_taste_raid1e(struct g_raid_tr_object *tr, struct g_raid_volume *vol)
185219974Smav{
186219974Smav	struct g_raid_tr_raid1e_object *trs;
187219974Smav
188219974Smav	trs = (struct g_raid_tr_raid1e_object *)tr;
189219974Smav	if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID1E ||
190234603Smav	    tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1EA)
191219974Smav		return (G_RAID_TR_TASTE_FAIL);
192219974Smav	trs->trso_starting = 1;
193219974Smav	return (G_RAID_TR_TASTE_SUCCEED);
194219974Smav}
195219974Smav
/*
 * Compute the volume state for an even disk count: subdisks form
 * independent mirror groups of N (indices i*N .. i*N+N-1), and the
 * volume state is the worst state over all groups.  May promote the
 * best non-ACTIVE member of a group to ACTIVE as a side effect.
 */
static int
g_raid_tr_update_state_raid1e_even(struct g_raid_volume *vol)
{
	struct g_raid_softc *sc;
	struct g_raid_subdisk *sd, *bestsd, *worstsd;
	int i, j, state, sstate;

	sc = vol->v_softc;
	state = G_RAID_VOLUME_S_OPTIMAL;
	for (i = 0; i < vol->v_disks_count / N; i++) {
		/* Find the most up-to-date subdisk within the group. */
		bestsd = &vol->v_subdisks[i * N];
		for (j = 1; j < N; j++) {
			sd = &vol->v_subdisks[i * N + j];
			if (sd->sd_state > bestsd->sd_state)
				bestsd = sd;
			else if (sd->sd_state == bestsd->sd_state &&
			    (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
			     sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
			    sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
				/* Same state: prefer the further rebuild. */
				bestsd = sd;
		}
		if (bestsd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED &&
		    bestsd->sd_state != G_RAID_SUBDISK_S_ACTIVE) {
			/* We found reasonable candidate. */
			G_RAID_DEBUG1(1, sc,
			    "Promote subdisk %s:%d from %s to ACTIVE.",
			    vol->v_name, bestsd->sd_pos,
			    g_raid_subdisk_state2str(bestsd->sd_state));
			g_raid_change_subdisk_state(bestsd,
			    G_RAID_SUBDISK_S_ACTIVE);
			g_raid_write_metadata(sc,
			    vol, bestsd, bestsd->sd_disk);
		}
		/* Find the least up-to-date subdisk within the group. */
		worstsd = &vol->v_subdisks[i * N];
		for (j = 1; j < N; j++) {
			sd = &vol->v_subdisks[i * N + j];
			if (sd->sd_state < worstsd->sd_state)
				worstsd = sd;
		}
		/* Derive this group's contribution to the volume state. */
		if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
			sstate = G_RAID_VOLUME_S_OPTIMAL;
		else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
			sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
		else if (bestsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
			sstate = G_RAID_VOLUME_S_DEGRADED;
		else
			sstate = G_RAID_VOLUME_S_BROKEN;
		/* The volume is only as healthy as its worst group. */
		if (sstate < state)
			state = sstate;
	}
	return (state);
}
248219974Smav
/*
 * Compute the volume state for an odd disk count: the copy layout
 * shifts every strip, so every window of N consecutive subdisks
 * (wrapping around) must be examined.  UNINITIALIZED subdisks are
 * promoted to STALE first, since no fixed pairing exists to justify
 * promoting them straight to ACTIVE.
 */
static int
g_raid_tr_update_state_raid1e_odd(struct g_raid_volume *vol)
{
	struct g_raid_softc *sc;
	struct g_raid_subdisk *sd, *bestsd, *worstsd;
	int i, j, state, sstate;

	sc = vol->v_softc;
	/* Fast path: everything ACTIVE means OPTIMAL. */
	if (g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) ==
	    vol->v_disks_count)
		return (G_RAID_VOLUME_S_OPTIMAL);
	for (i = 0; i < vol->v_disks_count; i++) {
		sd = &vol->v_subdisks[i];
		if (sd->sd_state == G_RAID_SUBDISK_S_UNINITIALIZED) {
			/* We found reasonable candidate. */
			G_RAID_DEBUG1(1, sc,
			    "Promote subdisk %s:%d from %s to STALE.",
			    vol->v_name, sd->sd_pos,
			    g_raid_subdisk_state2str(sd->sd_state));
			g_raid_change_subdisk_state(sd,
			    G_RAID_SUBDISK_S_STALE);
			g_raid_write_metadata(sc, vol, sd, sd->sd_disk);
		}
	}
	state = G_RAID_VOLUME_S_OPTIMAL;
	/* Check each window of N consecutive subdisks (wrapping). */
	for (i = 0; i < vol->v_disks_count; i++) {
		bestsd = &vol->v_subdisks[i];
		worstsd = &vol->v_subdisks[i];
		for (j = 1; j < N; j++) {
			sd = &vol->v_subdisks[(i + j) % vol->v_disks_count];
			if (sd->sd_state > bestsd->sd_state)
				bestsd = sd;
			else if (sd->sd_state == bestsd->sd_state &&
			    (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
			     sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
			    sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
				/* Same state: prefer the further rebuild. */
				bestsd = sd;
			if (sd->sd_state < worstsd->sd_state)
				worstsd = sd;
		}
		/* Derive this window's contribution to the volume state. */
		if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
			sstate = G_RAID_VOLUME_S_OPTIMAL;
		else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
			sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
		else if (bestsd->sd_state >= G_RAID_SUBDISK_S_STALE)
			sstate = G_RAID_VOLUME_S_DEGRADED;
		else
			sstate = G_RAID_VOLUME_S_BROKEN;
		if (sstate < state)
			state = sstate;
	}
	return (state);
}
302219974Smav
/*
 * Recompute and apply the volume state.  Sends an UP/DOWN event and
 * persists metadata on a state change; once the volume is neither
 * starting nor stopping, also considers kicking off a rebuild.
 * 'sd' is the subdisk whose event triggered the update (may be NULL).
 */
static int
g_raid_tr_update_state_raid1e(struct g_raid_volume *vol,
    struct g_raid_subdisk *sd)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_softc *sc;
	u_int s;

	sc = vol->v_softc;
	trs = (struct g_raid_tr_raid1e_object *)vol->v_tr;
	if (trs->trso_stopping &&
	    (trs->trso_flags & TR_RAID1E_F_DOING_SOME) == 0)
		/* Stop requested and no rebuild I/O in flight. */
		s = G_RAID_VOLUME_S_STOPPED;
	else if (trs->trso_starting)
		s = G_RAID_VOLUME_S_STARTING;
	else {
		/* Even and odd disk counts use different layouts. */
		if ((vol->v_disks_count % N) == 0)
			s = g_raid_tr_update_state_raid1e_even(vol);
		else
			s = g_raid_tr_update_state_raid1e_odd(vol);
	}
	if (s != vol->v_state) {
		g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ?
		    G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN,
		    G_RAID_EVENT_VOLUME);
		g_raid_change_volume_state(vol, s);
		/* Don't touch metadata during transient start/stop. */
		if (!trs->trso_starting && !trs->trso_stopping)
			g_raid_write_metadata(sc, vol, NULL, NULL);
	}
	if (!trs->trso_starting && !trs->trso_stopping)
		g_raid_tr_raid1e_maybe_rebuild(vol->v_tr, sd);
	return (0);
}
336219974Smav
/*
 * Fail the disk backing subdisk 'sd', unless doing so would take away
 * the last copy of usable data (see the comment below).
 */
static void
g_raid_tr_raid1e_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd,
    struct g_raid_disk *disk)
{
	struct g_raid_volume *vol;

	vol = sd->sd_volume;
	/*
	 * We don't fail the last disk in the pack, since it still has decent
	 * data on it and that's better than failing the disk if it is the root
	 * file system.
	 *
	 * XXX should this be controlled via a tunable?  It makes sense for
	 * the volume that has / on it.  I can't think of a case where we'd
	 * want the volume to go away on this kind of event.
	 */
	/*
	 * If some other subdisk is already lost (usable count below the
	 * disk count) and this one still holds data, keep it alive.
	 */
	if ((g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) +
	     g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC) +
	     g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
	     g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED) <
	     vol->v_disks_count) &&
	    (sd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED))
		return;
	g_raid_fail_disk(sc, sd, disk);
}
362219974Smav
/*
 * Common teardown after a rebuild finishes or aborts: persist the
 * final subdisk state, free the copy buffer, reset the rebuild
 * context, then recompute the volume state.
 */
static void
g_raid_tr_raid1e_rebuild_done(struct g_raid_tr_raid1e_object *trs)
{
	struct g_raid_volume *vol;
	struct g_raid_subdisk *sd;

	vol = trs->trso_base.tro_volume;
	sd = trs->trso_failed_sd;
	g_raid_write_metadata(vol->v_softc, vol, sd, sd->sd_disk);
	free(trs->trso_buffer, M_TR_RAID1E);
	trs->trso_buffer = NULL;
	trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
	trs->trso_type = TR_RAID1E_NONE;
	trs->trso_recover_slabs = 0;
	trs->trso_failed_sd = NULL;
	g_raid_tr_update_state_raid1e(vol, NULL);
}
380219974Smav
/*
 * Successful completion of a rebuild: promote the rebuilt subdisk to
 * ACTIVE, clear its rebuild position and run the common teardown.
 */
static void
g_raid_tr_raid1e_rebuild_finish(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_subdisk *sd;

	trs = (struct g_raid_tr_raid1e_object *)tr;
	sd = trs->trso_failed_sd;
	G_RAID_DEBUG1(0, tr->tro_volume->v_softc,
	    "Subdisk %s:%d-%s rebuild completed.",
	    sd->sd_volume->v_name, sd->sd_pos,
	    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
	g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE);
	sd->sd_rebuild_pos = 0;
	g_raid_tr_raid1e_rebuild_done(trs);
}
397219974Smav
/*
 * Abort an in-progress rebuild.  If rebuild I/O is in flight the abort
 * is deferred by setting TR_RAID1E_F_ABORT (the iodone path finishes
 * it); otherwise any held range lock is dropped and the common
 * teardown runs immediately.
 */
static void
g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_subdisk *sd;
	struct g_raid_volume *vol;

	vol = tr->tro_volume;
	trs = (struct g_raid_tr_raid1e_object *)tr;
	sd = trs->trso_failed_sd;
	if (trs->trso_flags & TR_RAID1E_F_DOING_SOME) {
		/* I/O outstanding: request a deferred abort. */
		G_RAID_DEBUG1(1, vol->v_softc,
		    "Subdisk %s:%d-%s rebuild is aborting.",
		    sd->sd_volume->v_name, sd->sd_pos,
		    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
		trs->trso_flags |= TR_RAID1E_F_ABORT;
	} else {
		/* Idle: abort right now. */
		G_RAID_DEBUG1(0, vol->v_softc,
		    "Subdisk %s:%d-%s rebuild aborted.",
		    sd->sd_volume->v_name, sd->sd_pos,
		    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
		trs->trso_flags &= ~TR_RAID1E_F_ABORT;
		if (trs->trso_flags & TR_RAID1E_F_LOCKED) {
			/* Release the locked virtual range. */
			trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
			g_raid_unlock_range(tr->tro_volume,
			    trs->trso_lock_pos, trs->trso_lock_len);
		}
		g_raid_tr_raid1e_rebuild_done(trs);
	}
}
428219974Smav
/*
 * Issue the next rebuild transaction: skip over ranges where the
 * failed subdisk already holds the best copy, then lock the affected
 * virtual range and queue a read (up to one slab) from the best other
 * copy.  The write half and loop continuation are driven from the
 * locked/iodone callbacks.
 */
static void
g_raid_tr_raid1e_rebuild_some(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_softc *sc;
	struct g_raid_volume *vol;
	struct g_raid_subdisk *sd;
	struct bio *bp;
	off_t len, virtual, vend, offset, start;
	int disk, copy, best;

	trs = (struct g_raid_tr_raid1e_object *)tr;
	/* Only one rebuild transaction may be in flight at a time. */
	if (trs->trso_flags & TR_RAID1E_F_DOING_SOME)
		return;
	vol = tr->tro_volume;
	sc = vol->v_softc;
	sd = trs->trso_failed_sd;

	while (1) {
		if (sd->sd_rebuild_pos >= sd->sd_size) {
			/* Whole subdisk copied -- rebuild is complete. */
			g_raid_tr_raid1e_rebuild_finish(tr);
			return;
		}
		/* Get virtual offset from physical rebuild position. */
		P2V(vol, sd->sd_pos, sd->sd_rebuild_pos, &virtual, &copy);
		/* Get physical offset back to get first stripe position. */
		V2P(vol, virtual, &disk, &offset, &start);
		/* Calculate contignous data length. */
		len = MIN(g_raid1e_rebuild_slab,
		    sd->sd_size - sd->sd_rebuild_pos);
		if ((vol->v_disks_count % N) != 0)
			/* Odd layouts shift per strip; stay inside one. */
			len = MIN(len, vol->v_strip_size - start);
		/* Find disk with most accurate data. */
		best = g_raid_tr_raid1e_select_read_disk(vol, disk,
		    offset + start, len, 0);
		if (best < 0) {
			/* There is no any valid disk. */
			g_raid_tr_raid1e_rebuild_abort(tr);
			return;
		} else if (best != copy) {
			/* Some other disk has better data. */
			break;
		}
		/* We have the most accurate data. Skip the range. */
		G_RAID_DEBUG1(3, sc, "Skipping rebuild for range %ju - %ju",
		    sd->sd_rebuild_pos, sd->sd_rebuild_pos + len);
		sd->sd_rebuild_pos += len;
	}

	/* Build the read request on the embedded, preallocated bio. */
	bp = &trs->trso_bio;
	memset(bp, 0, sizeof(*bp));
	bp->bio_offset = offset + start +
	    ((disk + best >= vol->v_disks_count) ? vol->v_strip_size : 0);
	bp->bio_length = len;
	bp->bio_data = trs->trso_buffer;
	bp->bio_cmd = BIO_READ;
	bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
	bp->bio_caller1 = &vol->v_subdisks[(disk + best) % vol->v_disks_count];
	G_RAID_LOGREQ(3, bp, "Queueing rebuild read");
	/*
	 * If we are crossing stripe boundary, correct affected virtual
	 * range we should lock.
	 */
	if (start + len > vol->v_strip_size) {
		P2V(vol, sd->sd_pos, sd->sd_rebuild_pos + len, &vend, &copy);
		len = vend - virtual;
	}
	trs->trso_flags |= TR_RAID1E_F_DOING_SOME;
	trs->trso_flags |= TR_RAID1E_F_LOCKED;
	trs->trso_lock_pos = virtual;
	trs->trso_lock_len = len;
	/* Lock callback starts I/O */
	g_raid_lock_range(sd->sd_volume, virtual, len, NULL, bp);
}
503219974Smav
/*
 * Begin a rebuild.  Chooses the target subdisk by descending priority
 * (RESYNC, REBUILD, STALE, UNINITIALIZED, NEW), converting STALE to
 * RESYNC and UNINITIALIZED/NEW to REBUILD as needed, then allocates
 * the slab buffer and issues the first transaction.
 */
static void
g_raid_tr_raid1e_rebuild_start(struct g_raid_tr_object *tr)
{
	struct g_raid_volume *vol;
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_subdisk *sd;

	vol = tr->tro_volume;
	trs = (struct g_raid_tr_raid1e_object *)tr;
	if (trs->trso_failed_sd) {
		/* A rebuild is already in progress; don't start another. */
		G_RAID_DEBUG1(1, vol->v_softc,
		    "Already rebuild in start rebuild. pos %jd\n",
		    (intmax_t)trs->trso_failed_sd->sd_rebuild_pos);
		return;
	}
	/* Prefer a subdisk already mid-resync/rebuild (resume it). */
	sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_RESYNC);
	if (sd == NULL)
		sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_REBUILD);
	if (sd == NULL) {
		/* STALE data can be refreshed in place via RESYNC. */
		sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_STALE);
		if (sd != NULL) {
			sd->sd_rebuild_pos = 0;
			g_raid_change_subdisk_state(sd,
			    G_RAID_SUBDISK_S_RESYNC);
			g_raid_write_metadata(vol->v_softc, vol, sd, NULL);
		} else {
			/* Otherwise a fresh disk needs a full REBUILD. */
			sd = g_raid_get_subdisk(vol,
			    G_RAID_SUBDISK_S_UNINITIALIZED);
			if (sd == NULL)
				sd = g_raid_get_subdisk(vol,
				    G_RAID_SUBDISK_S_NEW);
			if (sd != NULL) {
				sd->sd_rebuild_pos = 0;
				g_raid_change_subdisk_state(sd,
				    G_RAID_SUBDISK_S_REBUILD);
				g_raid_write_metadata(vol->v_softc,
				    vol, sd, NULL);
			}
		}
	}
	if (sd == NULL) {
		G_RAID_DEBUG1(1, vol->v_softc,
		    "No failed disk to rebuild.  night night.");
		return;
	}
	trs->trso_failed_sd = sd;
	G_RAID_DEBUG1(0, vol->v_softc,
	    "Subdisk %s:%d-%s rebuild start at %jd.",
	    sd->sd_volume->v_name, sd->sd_pos,
	    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]",
	    trs->trso_failed_sd->sd_rebuild_pos);
	trs->trso_type = TR_RAID1E_REBUILD;
	/* One slab-sized bounce buffer carries each read/write cycle. */
	trs->trso_buffer = malloc(g_raid1e_rebuild_slab, M_TR_RAID1E, M_WAITOK);
	trs->trso_meta_update = g_raid1e_rebuild_meta_update;
	g_raid_tr_raid1e_rebuild_some(tr);
}
560219974Smav
/*
 * Decide whether a rebuild should start or an in-progress one should
 * be aborted, based on the current volume/subdisk states.  Called
 * after every state update once the volume is running.
 */
static void
g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
    struct g_raid_subdisk *sd)
{
	struct g_raid_volume *vol;
	struct g_raid_tr_raid1e_object *trs;
	int nr;

	vol = tr->tro_volume;
	trs = (struct g_raid_tr_raid1e_object *)tr;
	if (trs->trso_stopping)
		return;
	/* Count subdisks already being rebuilt/resynced. */
	nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_REBUILD) +
	    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC);
	switch(trs->trso_type) {
	case TR_RAID1E_NONE:
		/* No rebuild running: start one if there is work to do. */
		if (vol->v_state < G_RAID_VOLUME_S_DEGRADED)
			return;
		if (nr == 0) {
			nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_NEW) +
			    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
			    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED);
			if (nr == 0)
				return;
		}
		g_raid_tr_raid1e_rebuild_start(tr);
		break;
	case TR_RAID1E_REBUILD:
		/*
		 * Abort if the volume dropped below DEGRADED, the target
		 * vanished, or the triggering subdisk is the target itself.
		 */
		if (vol->v_state < G_RAID_VOLUME_S_DEGRADED || nr == 0 ||
		    trs->trso_failed_sd == sd)
			g_raid_tr_raid1e_rebuild_abort(tr);
		break;
	case TR_RAID1E_RESYNC:
		break;
	}
}
597219974Smav
/*
 * Subdisk event handler: any event may change the volume state, so
 * simply recompute it.
 */
static int
g_raid_tr_event_raid1e(struct g_raid_tr_object *tr,
    struct g_raid_subdisk *sd, u_int event)
{

	g_raid_tr_update_state_raid1e(tr->tro_volume, sd);
	return (0);
}
606219974Smav
607219974Smavstatic int
608219974Smavg_raid_tr_start_raid1e(struct g_raid_tr_object *tr)
609219974Smav{
610219974Smav	struct g_raid_tr_raid1e_object *trs;
611219974Smav	struct g_raid_volume *vol;
612219974Smav
613219974Smav	trs = (struct g_raid_tr_raid1e_object *)tr;
614219974Smav	vol = tr->tro_volume;
615219974Smav	trs->trso_starting = 0;
616219974Smav	g_raid_tr_update_state_raid1e(vol, NULL);
617219974Smav	return (0);
618219974Smav}
619219974Smav
620219974Smavstatic int
621219974Smavg_raid_tr_stop_raid1e(struct g_raid_tr_object *tr)
622219974Smav{
623219974Smav	struct g_raid_tr_raid1e_object *trs;
624219974Smav	struct g_raid_volume *vol;
625219974Smav
626219974Smav	trs = (struct g_raid_tr_raid1e_object *)tr;
627219974Smav	vol = tr->tro_volume;
628219974Smav	trs->trso_starting = 0;
629219974Smav	trs->trso_stopping = 1;
630219974Smav	g_raid_tr_update_state_raid1e(vol, NULL);
631219974Smav	return (0);
632219974Smav}
633219974Smav
634219974Smav/*
635219974Smav * Select the disk to read from.  Take into account: subdisk state, running
636219974Smav * error recovery, average disk load, head position and possible cache hits.
637219974Smav */
#define ABS(x)		(((x) >= 0) ? (x) : (-(x)))
/*
 * Returns the copy index (0..N-1) of the best source for the range, or
 * -1 if no copy is readable.  'no' is the subdisk holding copy 0 and
 * 'mask' excludes subdisks by position.
 */
static int
g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
    int no, off_t off, off_t len, u_int mask)
{
	struct g_raid_subdisk *sd;
	off_t offset;
	int i, best, prio, bestprio;

	best = -1;
	bestprio = INT_MAX;
	/* Scan the N copies of the strip starting at subdisk 'no'. */
	for (i = 0; i < N; i++) {
		sd = &vol->v_subdisks[(no + i) % vol->v_disks_count];
		offset = off;
		/* Copies that wrap past the last disk sit one strip lower. */
		if (no + i >= vol->v_disks_count)
			offset += vol->v_strip_size;

		prio = G_RAID_SUBDISK_LOAD(sd);
		if ((mask & (1 << sd->sd_pos)) != 0)
			continue;	/* Excluded by the caller. */
		switch (sd->sd_state) {
		case G_RAID_SUBDISK_S_ACTIVE:
			break;
		case G_RAID_SUBDISK_S_RESYNC:
			/*
			 * NOTE(review): "offset + off" adds the base offset
			 * twice ('offset' already starts at 'off'); verify
			 * against the intended rebuild-position comparison.
			 */
			if (offset + off < sd->sd_rebuild_pos)
				break;
			/* FALLTHROUGH */
		case G_RAID_SUBDISK_S_STALE:
			/* Usable, but strongly deprioritized. */
			prio += i << 24;
			break;
		case G_RAID_SUBDISK_S_REBUILD:
			if (offset + off < sd->sd_rebuild_pos)
				break;
			/* FALLTHROUGH */
		default:
			continue;	/* Not readable at this offset. */
		}
		/* Penalize disks busy with error recovery. */
		prio += min(sd->sd_recovery, 255) << 16;
		/* If disk head is precisely in position - highly prefer it. */
		if (G_RAID_SUBDISK_POS(sd) == offset)
			prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE;
		else
		/* If disk head is close to position - prefer it. */
		if (ABS(G_RAID_SUBDISK_POS(sd) - offset) <
		    G_RAID_SUBDISK_TRACK_SIZE)
			prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE;
		if (prio < bestprio) {
			bestprio = prio;
			best = i;
		}
	}
	return (best);
}
691219974Smav
/*
 * Handle a BIO_READ: split the request on strip boundaries and route
 * each piece to the best copy per select_read_disk().  All clones are
 * queued before dispatch so an allocation failure can unwind cleanly.
 */
static void
g_raid_tr_iostart_raid1e_read(struct g_raid_tr_object *tr, struct bio *bp)
{
	struct g_raid_volume *vol;
	struct g_raid_subdisk *sd;
	struct bio_queue_head queue;
	struct bio *cbp;
	char *addr;
	off_t offset, start, length, remain;
	u_int no, strip_size;
	int best;

	vol = tr->tro_volume;
	/*
	 * For unmapped bios 'addr' is never dereferenced: it only
	 * accumulates the running byte offset into the page list.
	 */
	if ((bp->bio_flags & BIO_UNMAPPED) != 0)
		addr = NULL;
	else
		addr = bp->bio_data;
	strip_size = vol->v_strip_size;
	/* NOTE(review): 'no' is u_int but V2P() takes int * -- verify. */
	V2P(vol, bp->bio_offset, &no, &offset, &start);
	remain = bp->bio_length;
	bioq_init(&queue);
	while (remain > 0) {
		/* One clone per strip-bounded piece. */
		length = MIN(strip_size - start, remain);
		best = g_raid_tr_raid1e_select_read_disk(vol,
		    no, offset, length, 0);
		KASSERT(best >= 0, ("No readable disk in volume %s!",
		    vol->v_name));
		/* Advance to the chosen copy; wrap moves one strip down. */
		no += best;
		if (no >= vol->v_disks_count) {
			no -= vol->v_disks_count;
			offset += strip_size;
		}
		cbp = g_clone_bio(bp);
		if (cbp == NULL)
			goto failure;
		cbp->bio_offset = offset + start;
		cbp->bio_length = length;
		if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
			/* Translate the running offset into ma[] + offset. */
			cbp->bio_ma_offset += (uintptr_t)addr;
			cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
			cbp->bio_ma_offset %= PAGE_SIZE;
			cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
			    cbp->bio_length) / PAGE_SIZE;
		} else
			cbp->bio_data = addr;
		cbp->bio_caller1 = &vol->v_subdisks[no];
		bioq_insert_tail(&queue, cbp);
		/* Step past the remaining copies to the next virtual strip. */
		no += N - best;
		if (no >= vol->v_disks_count) {
			no -= vol->v_disks_count;
			offset += strip_size;
		}
		remain -= length;
		addr += length;
		start = 0;
	}
	/* All clones allocated; dispatch them to their subdisks. */
	while ((cbp = bioq_takefirst(&queue)) != NULL) {
		sd = cbp->bio_caller1;
		cbp->bio_caller1 = NULL;
		g_raid_subdisk_iostart(sd, cbp);
	}
	return;
failure:
	/* Clone allocation failed: destroy what was queued, fail the bio. */
	while ((cbp = bioq_takefirst(&queue)) != NULL)
		g_destroy_bio(cbp);
	if (bp->bio_error == 0)
		bp->bio_error = ENOMEM;
	g_raid_iodone(bp, bp->bio_error);
}
761219974Smav
/*
 * Start a volume WRITE or DELETE request.
 *
 * The virtual request is split at strip boundaries and each piece is
 * fanned out to every one of the strip's N copies whose subdisk can
 * accept it: ACTIVE, STALE and RESYNC disks always get the write;
 * a REBUILD disk only gets it below its current rebuild position (data
 * above it will be copied by the rebuild itself); all other states are
 * skipped.  As in the read path, all clones are queued first so a
 * g_clone_bio() failure can unwind with ENOMEM.
 *
 * For BIO_DELETE the payload pointer is meaningless, so the unmapped
 * page-list fixup is skipped and 'addr' is not advanced.
 */
762219974Smavstatic void
763219974Smavg_raid_tr_iostart_raid1e_write(struct g_raid_tr_object *tr, struct bio *bp)
764219974Smav{
765219974Smav	struct g_raid_volume *vol;
766219974Smav	struct g_raid_subdisk *sd;
767219974Smav	struct bio_queue_head queue;
768219974Smav	struct bio *cbp;
769219974Smav	char *addr;
770219974Smav	off_t offset, start, length, remain;
771219974Smav	u_int no, strip_size;
772219974Smav	int i;
773219974Smav
774219974Smav	vol = tr->tro_volume;
775256610Smav	if ((bp->bio_flags & BIO_UNMAPPED) != 0)
776256610Smav		addr = NULL;
777256610Smav	else
778256610Smav		addr = bp->bio_data;
779219974Smav	strip_size = vol->v_strip_size;
	/* Translate the virtual offset to (first copy disk, disk offset, intra-strip start). */
780219974Smav	V2P(vol, bp->bio_offset, &no, &offset, &start);
781219974Smav	remain = bp->bio_length;
782219974Smav	bioq_init(&queue);
783219974Smav	while (remain > 0) {
784219974Smav		length = MIN(strip_size - start, remain);
		/* Visit each of the N copies of this strip. */
785219974Smav		for (i = 0; i < N; i++) {
786219974Smav			sd = &vol->v_subdisks[no];
787219974Smav			switch (sd->sd_state) {
788219974Smav			case G_RAID_SUBDISK_S_ACTIVE:
789219974Smav			case G_RAID_SUBDISK_S_STALE:
790219974Smav			case G_RAID_SUBDISK_S_RESYNC:
791219974Smav				break;
792219974Smav			case G_RAID_SUBDISK_S_REBUILD:
				/* Not yet rebuilt here; rebuild will copy it later. */
793219974Smav				if (offset + start >= sd->sd_rebuild_pos)
794219974Smav					goto nextdisk;
795219974Smav				break;
796219974Smav			default:
797219974Smav				goto nextdisk;
798219974Smav			}
799219974Smav			cbp = g_clone_bio(bp);
800219974Smav			if (cbp == NULL)
801219974Smav				goto failure;
802219974Smav			cbp->bio_offset = offset + start;
803219974Smav			cbp->bio_length = length;
804256610Smav			if ((bp->bio_flags & BIO_UNMAPPED) != 0 &&
805256610Smav			    bp->bio_cmd != BIO_DELETE) {
				/* Re-point the clone's page list at this piece of the parent buffer. */
806256610Smav				cbp->bio_ma_offset += (uintptr_t)addr;
807256610Smav				cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
808256610Smav				cbp->bio_ma_offset %= PAGE_SIZE;
809256610Smav				cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
810256610Smav				    cbp->bio_length) / PAGE_SIZE;
811256610Smav			} else
812256610Smav				cbp->bio_data = addr;
813219974Smav			cbp->bio_caller1 = sd;
814219974Smav			bioq_insert_tail(&queue, cbp);
815219974Smavnextdisk:
816219974Smav			if (++no >= vol->v_disks_count) {
817219974Smav				no = 0;
818219974Smav				offset += strip_size;
819219974Smav			}
820219974Smav		}
821219974Smav		remain -= length;
822242323Smav		if (bp->bio_cmd != BIO_DELETE)
823242323Smav			addr += length;
824219974Smav		start = 0;
825219974Smav	}
	/* Everything cloned successfully; now fire off the pieces. */
826256610Smav	while ((cbp = bioq_takefirst(&queue)) != NULL) {
827219974Smav		sd = cbp->bio_caller1;
828219974Smav		cbp->bio_caller1 = NULL;
829219974Smav		g_raid_subdisk_iostart(sd, cbp);
830219974Smav	}
831219974Smav	return;
832219974Smavfailure:
	/* Clone allocation failed: destroy what was queued, fail the parent. */
833256610Smav	while ((cbp = bioq_takefirst(&queue)) != NULL)
834219974Smav		g_destroy_bio(cbp);
835219974Smav	if (bp->bio_error == 0)
836219974Smav		bp->bio_error = ENOMEM;
837219974Smav	g_raid_iodone(bp, bp->bio_error);
838219974Smav}
839219974Smav
840219974Smavstatic void
841219974Smavg_raid_tr_iostart_raid1e(struct g_raid_tr_object *tr, struct bio *bp)
842219974Smav{
843219974Smav	struct g_raid_volume *vol;
844219974Smav	struct g_raid_tr_raid1e_object *trs;
845219974Smav
846219974Smav	vol = tr->tro_volume;
847219974Smav	trs = (struct g_raid_tr_raid1e_object *)tr;
848219974Smav	if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL &&
849219974Smav	    vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL &&
850219974Smav	    vol->v_state != G_RAID_VOLUME_S_DEGRADED) {
851219974Smav		g_raid_iodone(bp, EIO);
852219974Smav		return;
853219974Smav	}
854219974Smav	/*
855219974Smav	 * If we're rebuilding, squeeze in rebuild activity every so often,
856219974Smav	 * even when the disk is busy.  Be sure to only count real I/O
857219974Smav	 * to the disk.  All 'SPECIAL' I/O is traffic generated to the disk
858219974Smav	 * by this module.
859219974Smav	 */
860219974Smav	if (trs->trso_failed_sd != NULL &&
861219974Smav	    !(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL)) {
862219974Smav		/* Make this new or running now round short. */
863219974Smav		trs->trso_recover_slabs = 0;
864219974Smav		if (--trs->trso_fair_io <= 0) {
865219974Smav			trs->trso_fair_io = g_raid1e_rebuild_fair_io;
866219974Smav			g_raid_tr_raid1e_rebuild_some(tr);
867219974Smav		}
868219974Smav	}
869219974Smav	switch (bp->bio_cmd) {
870219974Smav	case BIO_READ:
871219974Smav		g_raid_tr_iostart_raid1e_read(tr, bp);
872219974Smav		break;
873219974Smav	case BIO_WRITE:
874242323Smav	case BIO_DELETE:
875219974Smav		g_raid_tr_iostart_raid1e_write(tr, bp);
876219974Smav		break;
877219974Smav	case BIO_FLUSH:
878219974Smav		g_raid_tr_flush_common(tr, bp);
879219974Smav		break;
880219974Smav	default:
881219974Smav		KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)",
882219974Smav		    bp->bio_cmd, vol->v_name));
883219974Smav		break;
884219974Smav	}
885219974Smav}
886219974Smav
/*
 * Completion handler for all RAID1E I/O.
 *
 * Two classes of bios arrive here:
 *
 * 1) SYNC bios (G_RAID_BIO_FLAG_SYNC) generated by the rebuild engine.
 *    A finished rebuild READ is turned around in place into a WRITE to
 *    the failed subdisk's rebuild position; a finished rebuild WRITE
 *    advances sd_rebuild_pos, unlocks the range, periodically writes
 *    metadata, and either finishes/aborts the rebuild or schedules the
 *    next iteration.
 *
 * 2) Regular clones of user requests.  bio_caller2 carries a bitmask of
 *    copies already tried for this piece; bit 31 flags that a recovery
 *    (read-retry plus write-back of the recovered data) is in progress
 *    and the affected range is locked.  Failed reads are retried on
 *    another copy; recovered data may be written back to the failing
 *    copy as a bad-sector remap attempt.
 *
 * NOTE(review): bit 31 is formed with the signed expression '1 << 31'
 * (implementation-defined/UB per ISO C); it is used consistently for
 * both set and test so it works on the supported compilers, but an
 * unsigned constant would be cleaner — confirm before changing.
 */
887219974Smavstatic void
888219974Smavg_raid_tr_iodone_raid1e(struct g_raid_tr_object *tr,
889219974Smav    struct g_raid_subdisk *sd, struct bio *bp)
890219974Smav{
891219974Smav	struct bio *cbp;
892219974Smav	struct g_raid_subdisk *nsd;
893219974Smav	struct g_raid_volume *vol;
894219974Smav	struct bio *pbp;
895219974Smav	struct g_raid_tr_raid1e_object *trs;
896219974Smav	off_t virtual, offset, start;
897219974Smav	uintptr_t mask;
898219974Smav	int error, do_write, copy, disk, best;
899219974Smav
900219974Smav	trs = (struct g_raid_tr_raid1e_object *)tr;
901219974Smav	vol = tr->tro_volume;
	/* Module-generated rebuild/resync I/O completion. */
902219974Smav	if (bp->bio_cflags & G_RAID_BIO_FLAG_SYNC) {
903219974Smav		if (trs->trso_type == TR_RAID1E_REBUILD) {
904219974Smav			nsd = trs->trso_failed_sd;
905219974Smav			if (bp->bio_cmd == BIO_READ) {
906219974Smav
907219974Smav				/* Immediately abort rebuild, if requested. */
908219974Smav				if (trs->trso_flags & TR_RAID1E_F_ABORT) {
909219974Smav					trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
910219974Smav					g_raid_tr_raid1e_rebuild_abort(tr);
911219974Smav					return;
912219974Smav				}
913219974Smav
914219974Smav				/* On read error, skip and cross fingers. */
915219974Smav				if (bp->bio_error != 0) {
916219974Smav					G_RAID_LOGREQ(0, bp,
917219974Smav					    "Read error during rebuild (%d), "
918219974Smav					    "possible data loss!",
919219974Smav					    bp->bio_error);
920219974Smav					goto rebuild_round_done;
921219974Smav				}
922219974Smav
923219974Smav				/*
924219974Smav				 * The read operation finished, queue the
925219974Smav				 * write and get out.
926219974Smav				 */
927219974Smav				G_RAID_LOGREQ(3, bp, "Rebuild read done: %d",
928219974Smav				    bp->bio_error);
				/* Reuse the very same bio: flip it into a write at the rebuild position. */
929219974Smav				bp->bio_cmd = BIO_WRITE;
930219974Smav				bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
931219974Smav				bp->bio_offset = nsd->sd_rebuild_pos;
932219974Smav				G_RAID_LOGREQ(3, bp, "Queueing rebuild write.");
933219974Smav				g_raid_subdisk_iostart(nsd, bp);
934219974Smav			} else {
935219974Smav				/*
936219974Smav				 * The write operation just finished.  Do
937219974Smav				 * another.  We keep cloning the master bio
938219974Smav				 * since it has the right buffers allocated to
939219974Smav				 * it.
940219974Smav				 */
941219974Smav				G_RAID_LOGREQ(3, bp, "Rebuild write done: %d",
942219974Smav				    bp->bio_error);
943219974Smav				if (bp->bio_error != 0 ||
944219974Smav				    trs->trso_flags & TR_RAID1E_F_ABORT) {
945219974Smav					if ((trs->trso_flags &
946219974Smav					    TR_RAID1E_F_ABORT) == 0) {
947219974Smav						g_raid_tr_raid1e_fail_disk(sd->sd_softc,
948219974Smav						    nsd, nsd->sd_disk);
949219974Smav					}
950219974Smav					trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
951219974Smav					g_raid_tr_raid1e_rebuild_abort(tr);
952219974Smav					return;
953219974Smav				}
954219974Smavrebuild_round_done:
955219974Smav				trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
956219974Smav				g_raid_unlock_range(tr->tro_volume,
957219974Smav				    trs->trso_lock_pos, trs->trso_lock_len);
958219974Smav				nsd->sd_rebuild_pos += bp->bio_length;
959219974Smav				if (nsd->sd_rebuild_pos >= nsd->sd_size) {
960219974Smav					g_raid_tr_raid1e_rebuild_finish(tr);
961219974Smav					return;
962219974Smav				}
963219974Smav
964219974Smav				/* Abort rebuild if we are stopping */
965219974Smav				if (trs->trso_stopping) {
966219974Smav					trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
967219974Smav					g_raid_tr_raid1e_rebuild_abort(tr);
968219974Smav					return;
969219974Smav				}
970219974Smav
				/* Persist rebuild progress to on-disk metadata every so often. */
971219974Smav				if (--trs->trso_meta_update <= 0) {
972219974Smav					g_raid_write_metadata(vol->v_softc,
973219974Smav					    vol, nsd, nsd->sd_disk);
974219974Smav					trs->trso_meta_update =
975219974Smav					    g_raid1e_rebuild_meta_update;
976219974Smav					/* Compensate short rebuild I/Os. */
977219974Smav					if ((vol->v_disks_count % N) != 0 &&
978219974Smav					    vol->v_strip_size <
979219974Smav					     g_raid1e_rebuild_slab) {
980219974Smav						trs->trso_meta_update *=
981219974Smav						    g_raid1e_rebuild_slab;
982219974Smav						trs->trso_meta_update /=
983219974Smav						    vol->v_strip_size;
984219974Smav					}
985219974Smav				}
986219974Smav				trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
987219974Smav				if (--trs->trso_recover_slabs <= 0)
988219974Smav					return;
989219974Smav				/* Run next rebuild iteration. */
990219974Smav				g_raid_tr_raid1e_rebuild_some(tr);
991219974Smav			}
992219974Smav		} else if (trs->trso_type == TR_RAID1E_RESYNC) {
993219974Smav			/*
994219974Smav			 * read good sd, read bad sd in parallel.  when both
995219974Smav			 * done, compare the buffers.  write good to the bad
996219974Smav			 * if different.  do the next bit of work.
997219974Smav			 */
998219974Smav			panic("Somehow, we think we're doing a resync");
999219974Smav		}
1000219974Smav		return;
1001219974Smav	}
	/* Regular (user request) clone completion from here on. */
1002219974Smav	pbp = bp->bio_parent;
1003219974Smav	pbp->bio_inbed++;
1004219974Smav	mask = (intptr_t)bp->bio_caller2;
1005219974Smav	if (bp->bio_cmd == BIO_READ && bp->bio_error != 0) {
1006219974Smav		/*
1007219974Smav		 * Read failed on first drive.  Retry the read error on
1008219974Smav		 * another disk drive, if available, before erroring out the
1009219974Smav		 * read.
1010219974Smav		 */
1011219974Smav		sd->sd_disk->d_read_errs++;
1012219974Smav		G_RAID_LOGREQ(0, bp,
1013219974Smav		    "Read error (%d), %d read errors total",
1014219974Smav		    bp->bio_error, sd->sd_disk->d_read_errs);
1015219974Smav
1016219974Smav		/*
1017219974Smav		 * If there are too many read errors, we move to degraded.
1018219974Smav		 * XXX Do we want to FAIL the drive (eg, make the user redo
1019219974Smav		 * everything to get it back in sync), or just degrade the
1020219974Smav		 * drive, which kicks off a resync?
1021219974Smav		 */
1022219974Smav		do_write = 0;
1023219974Smav		if (sd->sd_disk->d_read_errs > g_raid_read_err_thresh)
1024219974Smav			g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
1025219974Smav		else if (mask == 0)
1026219974Smav			do_write = 1;
1027219974Smav
1028219974Smav		/* Restore what we were doing. */
1029219974Smav		P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
1030219974Smav		V2P(vol, virtual, &disk, &offset, &start);
1031219974Smav
1032219974Smav		/* Find the other disk, and try to do the I/O to it. */
1033219974Smav		mask |= 1 << copy;
1034219974Smav		best = g_raid_tr_raid1e_select_read_disk(vol,
1035219974Smav		    disk, offset, start, mask);
1036219974Smav		if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
1037219974Smav			disk += best;
1038219974Smav			if (disk >= vol->v_disks_count) {
1039219974Smav				disk -= vol->v_disks_count;
1040219974Smav				offset += vol->v_strip_size;
1041219974Smav			}
1042219974Smav			cbp->bio_offset = offset + start;
1043219974Smav			cbp->bio_length = bp->bio_length;
1044219974Smav			cbp->bio_data = bp->bio_data;
1045256610Smav			cbp->bio_ma = bp->bio_ma;
1046256610Smav			cbp->bio_ma_offset = bp->bio_ma_offset;
1047256610Smav			cbp->bio_ma_n = bp->bio_ma_n;
1048219974Smav			g_destroy_bio(bp);
1049219974Smav			nsd = &vol->v_subdisks[disk];
1050219974Smav			G_RAID_LOGREQ(2, cbp, "Retrying read from %d",
1051219974Smav			    nsd->sd_pos);
			/* Bit 31: recovery in progress; count it on the failing subdisk. */
1052219974Smav			if (do_write)
1053219974Smav				mask |= 1 << 31;
1054219974Smav			if ((mask & (1 << 31)) != 0)
1055219974Smav				sd->sd_recovery++;
1056219974Smav			cbp->bio_caller2 = (void *)mask;
1057219974Smav			if (do_write) {
1058219974Smav				cbp->bio_caller1 = nsd;
1059219974Smav				/* Lock callback starts I/O */
1060219974Smav				g_raid_lock_range(sd->sd_volume,
1061219974Smav				    virtual, cbp->bio_length, pbp, cbp);
1062219974Smav			} else {
1063219974Smav				g_raid_subdisk_iostart(nsd, cbp);
1064219974Smav			}
1065219974Smav			return;
1066219974Smav		}
1067219974Smav		/*
1068219974Smav		 * We can't retry.  Return the original error by falling
1069219974Smav		 * through.  This will happen when there's only one good disk.
1070219974Smav		 * We don't need to fail the raid, since its actual state is
1071219974Smav		 * based on the state of the subdisks.
1072219974Smav		 */
1073219974Smav		G_RAID_LOGREQ(2, bp, "Couldn't retry read, failing it");
1074219974Smav	}
	/* A retried read succeeded: try writing the data back to the bad copy. */
1075219974Smav	if (bp->bio_cmd == BIO_READ &&
1076219974Smav	    bp->bio_error == 0 &&
1077219974Smav	    (mask & (1 << 31)) != 0) {
1078219974Smav		G_RAID_LOGREQ(3, bp, "Recovered data from other drive");
1079219974Smav
1080219974Smav		/* Restore what we were doing. */
1081219974Smav		P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
1082219974Smav		V2P(vol, virtual, &disk, &offset, &start);
1083219974Smav
1084219974Smav		/* Find best disk to write. */
1085219974Smav		best = g_raid_tr_raid1e_select_read_disk(vol,
1086219974Smav		    disk, offset, start, ~mask);
1087219974Smav		if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
1088219974Smav			disk += best;
1089219974Smav			if (disk >= vol->v_disks_count) {
1090219974Smav				disk -= vol->v_disks_count;
1091219974Smav				offset += vol->v_strip_size;
1092219974Smav			}
1093219974Smav			cbp->bio_offset = offset + start;
1094219974Smav			cbp->bio_cmd = BIO_WRITE;
1095219974Smav			cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP;
1096219974Smav			cbp->bio_caller2 = (void *)mask;
1097219974Smav			g_destroy_bio(bp);
1098219974Smav			G_RAID_LOGREQ(2, cbp,
1099219974Smav			    "Attempting bad sector remap on failing drive.");
1100219974Smav			g_raid_subdisk_iostart(&vol->v_subdisks[disk], cbp);
1101219974Smav			return;
1102219974Smav		}
1103219974Smav	}
1104219974Smav	if ((mask & (1 << 31)) != 0) {
1105219974Smav		/*
1106219974Smav		 * We're done with a recovery, mark the range as unlocked.
1107219974Smav		 * For any write errors, we agressively fail the disk since
1108219974Smav		 * there was both a READ and a WRITE error at this location.
1109219974Smav		 * Both types of errors generally indicates the drive is on
1110219974Smav		 * the verge of total failure anyway.  Better to stop trusting
1111219974Smav		 * it now.  However, we need to reset error to 0 in that case
1112219974Smav		 * because we're not failing the original I/O which succeeded.
1113219974Smav		 */
1114219974Smav
1115219974Smav		/* Restore what we were doing. */
1116219974Smav		P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
1117219974Smav		V2P(vol, virtual, &disk, &offset, &start);
1118219974Smav
		/* Drop the recovery count on every copy touched during this recovery. */
1119219974Smav		for (copy = 0; copy < N; copy++) {
1120219974Smav			if ((mask & (1 << copy) ) != 0)
1121219974Smav				vol->v_subdisks[(disk + copy) %
1122219974Smav				    vol->v_disks_count].sd_recovery--;
1123219974Smav		}
1124219974Smav
1125219974Smav		if (bp->bio_cmd == BIO_WRITE && bp->bio_error) {
1126219974Smav			G_RAID_LOGREQ(0, bp, "Remap write failed: "
1127219974Smav			    "failing subdisk.");
1128219974Smav			g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
1129219974Smav			bp->bio_error = 0;
1130219974Smav		}
1131219974Smav		G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error);
1132219974Smav		g_raid_unlock_range(sd->sd_volume, virtual, bp->bio_length);
1133219974Smav	}
	/* Propagate errors to the parent; writes/deletes fail the subdisk too. */
1134242328Smav	if (pbp->bio_cmd != BIO_READ) {
1135235270Smav		if (pbp->bio_inbed == 1 || pbp->bio_error != 0)
1136235270Smav			pbp->bio_error = bp->bio_error;
1137242328Smav		if (pbp->bio_cmd == BIO_WRITE && bp->bio_error != 0) {
1138235270Smav			G_RAID_LOGREQ(0, bp, "Write failed: failing subdisk.");
1139235270Smav			g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
1140235270Smav		}
1141235270Smav		error = pbp->bio_error;
1142235270Smav	} else
1143235270Smav		error = bp->bio_error;
1144219974Smav	g_destroy_bio(bp);
	/* Complete the parent once every clone has come back. */
1145219974Smav	if (pbp->bio_children == pbp->bio_inbed) {
1146219974Smav		pbp->bio_completed = pbp->bio_length;
1147219974Smav		g_raid_iodone(pbp, error);
1148219974Smav	}
1149219974Smav}
1150219974Smav
/*
 * Kernel crash-dump write path.
 *
 * Mirrors the regular write path's strip fanout, but issues the dump
 * synchronously via g_raid_subdisk_kerneldump() per eligible copy and
 * returns the first error encountered.  The same subdisk-state rules
 * apply: ACTIVE/STALE/RESYNC always written, REBUILD only below its
 * rebuild position, everything else skipped.
 *
 * NOTE(review): the 'physical' argument is accepted but unused here;
 * g_raid_subdisk_kerneldump() is always passed 0 for it.
 */
1151219974Smavstatic int
1152219974Smavg_raid_tr_kerneldump_raid1e(struct g_raid_tr_object *tr,
1153219974Smav    void *virtual, vm_offset_t physical, off_t boffset, size_t blength)
1154219974Smav{
1155219974Smav	struct g_raid_volume *vol;
1156219974Smav	struct g_raid_subdisk *sd;
1157219974Smav	struct bio_queue_head queue;
1158219974Smav	char *addr;
1159219974Smav	off_t offset, start, length, remain;
1160219974Smav	u_int no, strip_size;
1161219974Smav	int i, error;
1162219974Smav
1163219974Smav	vol = tr->tro_volume;
1164219974Smav	addr = virtual;
1165219974Smav	strip_size = vol->v_strip_size;
	/* Translate the virtual offset to (first copy disk, disk offset, intra-strip start). */
1166219974Smav	V2P(vol, boffset, &no, &offset, &start);
1167219974Smav	remain = blength;
1168219974Smav	bioq_init(&queue);
1169219974Smav	while (remain > 0) {
1170219974Smav		length = MIN(strip_size - start, remain);
		/* Write this piece to each of its N eligible copies. */
1171219974Smav		for (i = 0; i < N; i++) {
1172219974Smav			sd = &vol->v_subdisks[no];
1173219974Smav			switch (sd->sd_state) {
1174219974Smav			case G_RAID_SUBDISK_S_ACTIVE:
1175219974Smav			case G_RAID_SUBDISK_S_STALE:
1176219974Smav			case G_RAID_SUBDISK_S_RESYNC:
1177219974Smav				break;
1178219974Smav			case G_RAID_SUBDISK_S_REBUILD:
1179219974Smav				if (offset + start >= sd->sd_rebuild_pos)
1180219974Smav					goto nextdisk;
1181219974Smav				break;
1182219974Smav			default:
1183219974Smav				goto nextdisk;
1184219974Smav			}
1185219974Smav			error = g_raid_subdisk_kerneldump(sd,
1186219974Smav			    addr, 0, offset + start, length);
1187219974Smav			if (error != 0)
1188219974Smav				return (error);
1189219974Smavnextdisk:
1190219974Smav			if (++no >= vol->v_disks_count) {
1191219974Smav				no = 0;
1192219974Smav				offset += strip_size;
1193219974Smav			}
1194219974Smav		}
1195219974Smav		remain -= length;
1196219974Smav		addr += length;
1197219974Smav		start = 0;
1198219974Smav	}
1199219974Smav	return (0);
1200219974Smav}
1201219974Smav
1202219974Smavstatic int
1203219974Smavg_raid_tr_locked_raid1e(struct g_raid_tr_object *tr, void *argp)
1204219974Smav{
1205219974Smav	struct bio *bp;
1206219974Smav	struct g_raid_subdisk *sd;
1207219974Smav
1208219974Smav	bp = (struct bio *)argp;
1209219974Smav	sd = (struct g_raid_subdisk *)bp->bio_caller1;
1210219974Smav	g_raid_subdisk_iostart(sd, bp);
1211219974Smav
1212219974Smav	return (0);
1213219974Smav}
1214219974Smav
1215219974Smavstatic int
1216219974Smavg_raid_tr_idle_raid1e(struct g_raid_tr_object *tr)
1217219974Smav{
1218219974Smav	struct g_raid_tr_raid1e_object *trs;
1219219974Smav	struct g_raid_volume *vol;
1220219974Smav
1221219974Smav	vol = tr->tro_volume;
1222219974Smav	trs = (struct g_raid_tr_raid1e_object *)tr;
1223219974Smav	trs->trso_fair_io = g_raid1e_rebuild_fair_io;
1224219974Smav	trs->trso_recover_slabs = g_raid1e_rebuild_cluster_idle;
1225219974Smav	/* Compensate short rebuild I/Os. */
1226219974Smav	if ((vol->v_disks_count % N) != 0 &&
1227219974Smav	    vol->v_strip_size < g_raid1e_rebuild_slab) {
1228219974Smav		trs->trso_recover_slabs *= g_raid1e_rebuild_slab;
1229219974Smav		trs->trso_recover_slabs /= vol->v_strip_size;
1230219974Smav	}
1231219974Smav	if (trs->trso_type == TR_RAID1E_REBUILD)
1232219974Smav		g_raid_tr_raid1e_rebuild_some(tr);
1233219974Smav	return (0);
1234219974Smav}
1235219974Smav
1236219974Smavstatic int
1237219974Smavg_raid_tr_free_raid1e(struct g_raid_tr_object *tr)
1238219974Smav{
1239219974Smav	struct g_raid_tr_raid1e_object *trs;
1240219974Smav
1241219974Smav	trs = (struct g_raid_tr_raid1e_object *)tr;
1242219974Smav
1243219974Smav	if (trs->trso_buffer != NULL) {
1244219974Smav		free(trs->trso_buffer, M_TR_RAID1E);
1245219974Smav		trs->trso_buffer = NULL;
1246219974Smav	}
1247219974Smav	return (0);
1248219974Smav}
1249219974Smav
/* Register this transformation module with the GEOM RAID core as "RAID1E". */
1250240465SmavG_RAID_TR_DECLARE(raid1e, "RAID1E");
1251