g_raid3.c revision 223921
1218887Sdim/*-
2218887Sdim * Copyright (c) 2004-2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
3218887Sdim * All rights reserved.
4218887Sdim *
5218887Sdim * Redistribution and use in source and binary forms, with or without
6218887Sdim * modification, are permitted provided that the following conditions
7218887Sdim * are met:
8218887Sdim * 1. Redistributions of source code must retain the above copyright
9218887Sdim *    notice, this list of conditions and the following disclaimer.
10218887Sdim * 2. Redistributions in binary form must reproduce the above copyright
11218887Sdim *    notice, this list of conditions and the following disclaimer in the
12218887Sdim *    documentation and/or other materials provided with the distribution.
13218887Sdim *
14218887Sdim * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
15218887Sdim * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16218887Sdim * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17249423Sdim * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
18218887Sdim * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19234353Sdim * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20218887Sdim * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21249423Sdim * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22249423Sdim * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23234353Sdim * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24249423Sdim * SUCH DAMAGE.
25218887Sdim */
26218887Sdim
27218887Sdim#include <sys/cdefs.h>
28218887Sdim__FBSDID("$FreeBSD: head/sys/geom/raid3/g_raid3.c 223921 2011-07-11 05:22:31Z ae $");
29218887Sdim
30218887Sdim#include <sys/param.h>
31218887Sdim#include <sys/systm.h>
32218887Sdim#include <sys/kernel.h>
33218887Sdim#include <sys/module.h>
34218887Sdim#include <sys/limits.h>
35218887Sdim#include <sys/lock.h>
36218887Sdim#include <sys/mutex.h>
37218887Sdim#include <sys/bio.h>
38218887Sdim#include <sys/sbuf.h>
39218887Sdim#include <sys/sysctl.h>
40218887Sdim#include <sys/malloc.h>
41218887Sdim#include <sys/eventhandler.h>
42218887Sdim#include <vm/uma.h>
43218887Sdim#include <geom/geom.h>
44226633Sdim#include <sys/proc.h>
45218887Sdim#include <sys/kthread.h>
46218887Sdim#include <sys/sched.h>
47218887Sdim#include <geom/raid3/g_raid3.h>
48218887Sdim
FEATURE(geom_raid3, "GEOM RAID-3 functionality");

/* Malloc type used for all GEOM_RAID3 allocations done via malloc(9). */
static MALLOC_DEFINE(M_RAID3, "raid3_data", "GEOM_RAID3 Data");

SYSCTL_DECL(_kern_geom);
SYSCTL_NODE(_kern_geom, OID_AUTO, raid3, CTLFLAG_RW, 0, "GEOM_RAID3 stuff");
/* Debug verbosity; higher values enable more G_RAID3_DEBUG() output. */
u_int g_raid3_debug = 0;
TUNABLE_INT("kern.geom.raid3.debug", &g_raid3_debug);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, debug, CTLFLAG_RW, &g_raid3_debug, 0,
    "Debug level");
/* Seconds to wait for all components before starting degraded. */
static u_int g_raid3_timeout = 4;
TUNABLE_INT("kern.geom.raid3.timeout", &g_raid3_timeout);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, timeout, CTLFLAG_RW, &g_raid3_timeout,
    0, "Time to wait on all raid3 components");
/* Seconds of idleness after which components are marked clean. */
static u_int g_raid3_idletime = 5;
TUNABLE_INT("kern.geom.raid3.idletime", &g_raid3_idletime);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, idletime, CTLFLAG_RW,
    &g_raid3_idletime, 0, "Mark components as clean when idling");
/* Non-zero: drop a component from the array when its I/O fails. */
static u_int g_raid3_disconnect_on_failure = 1;
TUNABLE_INT("kern.geom.raid3.disconnect_on_failure",
    &g_raid3_disconnect_on_failure);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, disconnect_on_failure, CTLFLAG_RW,
    &g_raid3_disconnect_on_failure, 0, "Disconnect component on I/O failure.");
/* Number of parallel synchronization requests (boot-time tunable only). */
static u_int g_raid3_syncreqs = 2;
TUNABLE_INT("kern.geom.raid3.sync_requests", &g_raid3_syncreqs);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, sync_requests, CTLFLAG_RDTUN,
    &g_raid3_syncreqs, 0, "Parallel synchronization I/O requests.");
/* Non-zero: allocate data buffers with malloc(9) instead of uma(9) zones. */
static u_int g_raid3_use_malloc = 0;
TUNABLE_INT("kern.geom.raid3.use_malloc", &g_raid3_use_malloc);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, use_malloc, CTLFLAG_RDTUN,
    &g_raid3_use_malloc, 0, "Use malloc(9) instead of uma(9).");

/* Per-device caps on the number of in-use buffers in each uma zone size. */
static u_int g_raid3_n64k = 50;
TUNABLE_INT("kern.geom.raid3.n64k", &g_raid3_n64k);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n64k, CTLFLAG_RD, &g_raid3_n64k, 0,
    "Maximum number of 64kB allocations");
static u_int g_raid3_n16k = 200;
TUNABLE_INT("kern.geom.raid3.n16k", &g_raid3_n16k);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n16k, CTLFLAG_RD, &g_raid3_n16k, 0,
    "Maximum number of 16kB allocations");
static u_int g_raid3_n4k = 1200;
TUNABLE_INT("kern.geom.raid3.n4k", &g_raid3_n4k);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n4k, CTLFLAG_RD, &g_raid3_n4k, 0,
    "Maximum number of 4kB allocations");

SYSCTL_NODE(_kern_geom_raid3, OID_AUTO, stat, CTLFLAG_RW, 0,
    "GEOM_RAID3 statistics");
/* Counts parity errors detected while reading in VERIFY mode. */
static u_int g_raid3_parity_mismatch = 0;
SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, parity_mismatch, CTLFLAG_RD,
    &g_raid3_parity_mismatch, 0, "Number of failures in VERIFY mode");

/* msleep(9) wrapper that logs the sleep/wakeup at debug level 4. */
#define	MSLEEP(ident, mtx, priority, wmesg, timeout)	do {		\
	G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, (ident));	\
	msleep((ident), (mtx), (priority), (wmesg), (timeout));		\
	G_RAID3_DEBUG(4, "%s: Woken up %p.", __func__, (ident));	\
} while (0)
105218887Sdim
/* Event handler tag for the pre-sync (shutdown) hook. */
static eventhandler_tag g_raid3_pre_sync = NULL;

static int g_raid3_destroy_geom(struct gctl_req *req, struct g_class *mp,
    struct g_geom *gp);
static g_taste_t g_raid3_taste;
static void g_raid3_init(struct g_class *mp);
static void g_raid3_fini(struct g_class *mp);

/* GEOM class descriptor registered with the GEOM framework. */
struct g_class g_raid3_class = {
	.name = G_RAID3_CLASS_NAME,
	.version = G_VERSION,
	.ctlreq = g_raid3_config,
	.taste = g_raid3_taste,
	.destroy_geom = g_raid3_destroy_geom,
	.init = g_raid3_init,
	.fini = g_raid3_fini
};


/* Forward declarations for functions defined later in this file. */
static void g_raid3_destroy_provider(struct g_raid3_softc *sc);
static int g_raid3_update_disk(struct g_raid3_disk *disk, u_int state);
static void g_raid3_update_device(struct g_raid3_softc *sc, boolean_t force);
static void g_raid3_dumpconf(struct sbuf *sb, const char *indent,
    struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp);
static void g_raid3_sync_stop(struct g_raid3_softc *sc, int type);
static int g_raid3_register_request(struct bio *pbp);
static void g_raid3_sync_release(struct g_raid3_softc *sc);
133218887Sdim
134218887Sdim
135218887Sdimstatic const char *
136218887Sdimg_raid3_disk_state2str(int state)
137218887Sdim{
138218887Sdim
139218887Sdim	switch (state) {
140218887Sdim	case G_RAID3_DISK_STATE_NODISK:
141218887Sdim		return ("NODISK");
142218887Sdim	case G_RAID3_DISK_STATE_NONE:
143218887Sdim		return ("NONE");
144218887Sdim	case G_RAID3_DISK_STATE_NEW:
145218887Sdim		return ("NEW");
146218887Sdim	case G_RAID3_DISK_STATE_ACTIVE:
147218887Sdim		return ("ACTIVE");
148218887Sdim	case G_RAID3_DISK_STATE_STALE:
149218887Sdim		return ("STALE");
150218887Sdim	case G_RAID3_DISK_STATE_SYNCHRONIZING:
151218887Sdim		return ("SYNCHRONIZING");
152218887Sdim	case G_RAID3_DISK_STATE_DISCONNECTED:
153218887Sdim		return ("DISCONNECTED");
154218887Sdim	default:
155218887Sdim		return ("INVALID");
156218887Sdim	}
157218887Sdim}
158218887Sdim
159218887Sdimstatic const char *
160218887Sdimg_raid3_device_state2str(int state)
161218887Sdim{
162218887Sdim
163218887Sdim	switch (state) {
164218887Sdim	case G_RAID3_DEVICE_STATE_STARTING:
165218887Sdim		return ("STARTING");
166218887Sdim	case G_RAID3_DEVICE_STATE_DEGRADED:
167218887Sdim		return ("DEGRADED");
168218887Sdim	case G_RAID3_DEVICE_STATE_COMPLETE:
169218887Sdim		return ("COMPLETE");
170218887Sdim	default:
171218887Sdim		return ("INVALID");
172218887Sdim	}
173218887Sdim}
174218887Sdim
175218887Sdimconst char *
176218887Sdimg_raid3_get_diskname(struct g_raid3_disk *disk)
177218887Sdim{
178218887Sdim
179218887Sdim	if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL)
180218887Sdim		return ("[unknown]");
181218887Sdim	return (disk->d_name);
182218887Sdim}
183239462Sdim
184226633Sdimstatic void *
185218887Sdimg_raid3_alloc(struct g_raid3_softc *sc, size_t size, int flags)
186218887Sdim{
187218887Sdim	void *ptr;
188218887Sdim	enum g_raid3_zones zone;
189218887Sdim
190218887Sdim	if (g_raid3_use_malloc ||
191218887Sdim	    (zone = g_raid3_zone(size)) == G_RAID3_NUM_ZONES)
192218887Sdim		ptr = malloc(size, M_RAID3, flags);
193218887Sdim	else {
194218887Sdim		ptr = uma_zalloc_arg(sc->sc_zones[zone].sz_zone,
195218887Sdim		   &sc->sc_zones[zone], flags);
196218887Sdim		sc->sc_zones[zone].sz_requested++;
197218887Sdim		if (ptr == NULL)
198249423Sdim			sc->sc_zones[zone].sz_failed++;
199249423Sdim	}
200249423Sdim	return (ptr);
201249423Sdim}
202218887Sdim
203218887Sdimstatic void
204218887Sdimg_raid3_free(struct g_raid3_softc *sc, void *ptr, size_t size)
205218887Sdim{
206218887Sdim	enum g_raid3_zones zone;
207218887Sdim
208218887Sdim	if (g_raid3_use_malloc ||
209218887Sdim	    (zone = g_raid3_zone(size)) == G_RAID3_NUM_ZONES)
210218887Sdim		free(ptr, M_RAID3);
211218887Sdim	else {
212218887Sdim		uma_zfree_arg(sc->sc_zones[zone].sz_zone,
213218887Sdim		    ptr, &sc->sc_zones[zone]);
214218887Sdim	}
215218887Sdim}
216218887Sdim
217218887Sdimstatic int
218218887Sdimg_raid3_uma_ctor(void *mem, int size, void *arg, int flags)
219218887Sdim{
220218887Sdim	struct g_raid3_zone *sz = arg;
221218887Sdim
222218887Sdim	if (sz->sz_max > 0 && sz->sz_inuse == sz->sz_max)
223218887Sdim		return (ENOMEM);
224218887Sdim	sz->sz_inuse++;
225218887Sdim	return (0);
226218887Sdim}
227218887Sdim
/* uma zone destructor: balance the in-use count kept by the constructor. */
static void
g_raid3_uma_dtor(void *mem, int size, void *arg)
{
	struct g_raid3_zone *sz = arg;

	sz->sz_inuse--;
}
235234353Sdim
236234353Sdim#define	g_raid3_xor(src, dst, size)					\
237234353Sdim	_g_raid3_xor((uint64_t *)(src),					\
238234353Sdim	    (uint64_t *)(dst), (size_t)size)
239234353Sdimstatic void
240218887Sdim_g_raid3_xor(uint64_t *src, uint64_t *dst, size_t size)
241249423Sdim{
242218887Sdim
243218887Sdim	KASSERT((size % 128) == 0, ("Invalid size: %zu.", size));
244218887Sdim	for (; size > 0; size -= 128) {
245218887Sdim		*dst++ ^= (*src++);
246218887Sdim		*dst++ ^= (*src++);
247218887Sdim		*dst++ ^= (*src++);
248218887Sdim		*dst++ ^= (*src++);
249218887Sdim		*dst++ ^= (*src++);
250218887Sdim		*dst++ ^= (*src++);
251218887Sdim		*dst++ ^= (*src++);
252218887Sdim		*dst++ ^= (*src++);
253218887Sdim		*dst++ ^= (*src++);
254218887Sdim		*dst++ ^= (*src++);
255218887Sdim		*dst++ ^= (*src++);
256218887Sdim		*dst++ ^= (*src++);
257218887Sdim		*dst++ ^= (*src++);
258218887Sdim		*dst++ ^= (*src++);
259218887Sdim		*dst++ ^= (*src++);
260218887Sdim		*dst++ ^= (*src++);
261218887Sdim	}
262218887Sdim}
263218887Sdim
264218887Sdimstatic int
265218887Sdimg_raid3_is_zero(struct bio *bp)
266218887Sdim{
267218887Sdim	static const uint64_t zeros[] = {
268218887Sdim	    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
269218887Sdim	};
270234353Sdim	u_char *addr;
271234353Sdim	ssize_t size;
272234353Sdim
273234353Sdim	size = bp->bio_length;
274234353Sdim	addr = (u_char *)bp->bio_data;
275234353Sdim	for (; size > 0; size -= sizeof(zeros), addr += sizeof(zeros)) {
276234353Sdim		if (bcmp(addr, zeros, sizeof(zeros)) != 0)
277234353Sdim			return (0);
278218887Sdim	}
279226633Sdim	return (1);
280249423Sdim}
281218887Sdim
282218887Sdim/*
283218887Sdim * --- Events handling functions ---
284249423Sdim * Events in geom_raid3 are used to maintain disks and device status
285218887Sdim * from one thread to simplify locking.
286218887Sdim */
/* Release an event structure allocated in g_raid3_event_send(). */
static void
g_raid3_event_free(struct g_raid3_event *ep)
{

	free(ep, M_RAID3);
}
293218887Sdim
/*
 * Queue a state-change event for the worker thread.  `arg' is the softc
 * when G_RAID3_EVENT_DEVICE is set in `flags', otherwise a disk.  With
 * G_RAID3_EVENT_DONTWAIT the call returns immediately (event is freed by
 * the worker); otherwise the caller must hold sc_lock exclusively and the
 * function sleeps until the worker marks the event done, returning its
 * error status.
 */
int
g_raid3_event_send(void *arg, int state, int flags)
{
	struct g_raid3_softc *sc;
	struct g_raid3_disk *disk;
	struct g_raid3_event *ep;
	int error;

	ep = malloc(sizeof(*ep), M_RAID3, M_WAITOK);
	G_RAID3_DEBUG(4, "%s: Sending event %p.", __func__, ep);
	if ((flags & G_RAID3_EVENT_DEVICE) != 0) {
		disk = NULL;
		sc = arg;
	} else {
		disk = arg;
		sc = disk->d_softc;
	}
	ep->e_disk = disk;
	ep->e_state = state;
	ep->e_flags = flags;
	ep->e_error = 0;
	mtx_lock(&sc->sc_events_mtx);
	TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next);
	mtx_unlock(&sc->sc_events_mtx);
	G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc);
	/* Hold the queue mutex so the worker cannot miss the wakeups. */
	mtx_lock(&sc->sc_queue_mtx);
	wakeup(sc);
	wakeup(&sc->sc_queue);
	mtx_unlock(&sc->sc_queue_mtx);
	if ((flags & G_RAID3_EVENT_DONTWAIT) != 0)
		return (0);
	sx_assert(&sc->sc_lock, SX_XLOCKED);
	G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, ep);
	/* Drop sc_lock while sleeping so the worker can make progress. */
	sx_xunlock(&sc->sc_lock);
	while ((ep->e_flags & G_RAID3_EVENT_DONE) == 0) {
		mtx_lock(&sc->sc_events_mtx);
		/* PDROP releases the mutex on wakeup; re-check DONE flag. */
		MSLEEP(ep, &sc->sc_events_mtx, PRIBIO | PDROP, "r3:event",
		    hz * 5);
	}
	error = ep->e_error;
	g_raid3_event_free(ep);
	sx_xlock(&sc->sc_lock);
	return (error);
}
338218887Sdim
/*
 * Return the first pending event without removing it from the queue,
 * or NULL when the queue is empty.
 */
static struct g_raid3_event *
g_raid3_event_get(struct g_raid3_softc *sc)
{
	struct g_raid3_event *ep;

	mtx_lock(&sc->sc_events_mtx);
	ep = TAILQ_FIRST(&sc->sc_events);
	mtx_unlock(&sc->sc_events_mtx);
	return (ep);
}
349218887Sdim
/* Unlink the given event from the pending-events queue. */
static void
g_raid3_event_remove(struct g_raid3_softc *sc, struct g_raid3_event *ep)
{

	mtx_lock(&sc->sc_events_mtx);
	TAILQ_REMOVE(&sc->sc_events, ep, e_next);
	mtx_unlock(&sc->sc_events_mtx);
}
358218887Sdim
359218887Sdimstatic void
360218887Sdimg_raid3_event_cancel(struct g_raid3_disk *disk)
361243830Sdim{
362218887Sdim	struct g_raid3_softc *sc;
363218887Sdim	struct g_raid3_event *ep, *tmpep;
364218887Sdim
365218887Sdim	sc = disk->d_softc;
366218887Sdim	sx_assert(&sc->sc_lock, SX_XLOCKED);
367218887Sdim
368218887Sdim	mtx_lock(&sc->sc_events_mtx);
369218887Sdim	TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) {
370218887Sdim		if ((ep->e_flags & G_RAID3_EVENT_DEVICE) != 0)
371218887Sdim			continue;
372218887Sdim		if (ep->e_disk != disk)
373234353Sdim			continue;
374218887Sdim		TAILQ_REMOVE(&sc->sc_events, ep, e_next);
375218887Sdim		if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0)
376218887Sdim			g_raid3_event_free(ep);
377218887Sdim		else {
378218887Sdim			ep->e_error = ECANCELED;
379218887Sdim			wakeup(ep);
380218887Sdim		}
381218887Sdim	}
382218887Sdim	mtx_unlock(&sc->sc_events_mtx);
383218887Sdim}
384218887Sdim
385218887Sdim/*
386218887Sdim * Return the number of disks in the given state.
387218887Sdim * If state is equal to -1, count all connected disks.
388218887Sdim */
389218887Sdimu_int
390218887Sdimg_raid3_ndisks(struct g_raid3_softc *sc, int state)
391218887Sdim{
392218887Sdim	struct g_raid3_disk *disk;
393218887Sdim	u_int n, ndisks;
394218887Sdim
395218887Sdim	sx_assert(&sc->sc_lock, SX_LOCKED);
396218887Sdim
397218887Sdim	for (n = ndisks = 0; n < sc->sc_ndisks; n++) {
398218887Sdim		disk = &sc->sc_disks[n];
399218887Sdim		if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
400218887Sdim			continue;
401218887Sdim		if (state == -1 || disk->d_state == state)
402218887Sdim			ndisks++;
403218887Sdim	}
404218887Sdim	return (ndisks);
405218887Sdim}
406218887Sdim
407218887Sdimstatic u_int
408218887Sdimg_raid3_nrequests(struct g_raid3_softc *sc, struct g_consumer *cp)
409249423Sdim{
410249423Sdim	struct bio *bp;
411249423Sdim	u_int nreqs = 0;
412249423Sdim
413249423Sdim	mtx_lock(&sc->sc_queue_mtx);
414249423Sdim	TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) {
415218887Sdim		if (bp->bio_from == cp)
416218887Sdim			nreqs++;
417218887Sdim	}
418249423Sdim	mtx_unlock(&sc->sc_queue_mtx);
419218887Sdim	return (nreqs);
420218887Sdim}
421218887Sdim
422234353Sdimstatic int
423234353Sdimg_raid3_is_busy(struct g_raid3_softc *sc, struct g_consumer *cp)
424234353Sdim{
425234353Sdim
426234353Sdim	if (cp->index > 0) {
427234353Sdim		G_RAID3_DEBUG(2,
428234353Sdim		    "I/O requests for %s exist, can't destroy it now.",
429234353Sdim		    cp->provider->name);
430234353Sdim		return (1);
431234353Sdim	}
432234353Sdim	if (g_raid3_nrequests(sc, cp) > 0) {
433234353Sdim		G_RAID3_DEBUG(2,
434234353Sdim		    "I/O requests for %s in queue, can't destroy it now.",
435234353Sdim		    cp->provider->name);
436218887Sdim		return (1);
437218887Sdim	}
438218887Sdim	return (0);
439218887Sdim}
440218887Sdim
/*
 * Deferred consumer destruction; scheduled via g_post_event() from
 * g_raid3_kill_consumer() after the retaste event has been sent.
 */
static void
g_raid3_destroy_consumer(void *arg, int flags __unused)
{
	struct g_consumer *cp;

	g_topology_assert();

	cp = arg;
	G_RAID3_DEBUG(1, "Consumer %s destroyed.", cp->provider->name);
	g_detach(cp);
	g_destroy_consumer(cp);
}
453218887Sdim
/*
 * Close and destroy the consumer unless it is still busy.  If closing
 * releases our last write reference (which triggers a retaste of the
 * provider inside g_access()), the actual detach/destroy is deferred to
 * an event so the retaste is not delivered back to us.
 */
static void
g_raid3_kill_consumer(struct g_raid3_softc *sc, struct g_consumer *cp)
{
	struct g_provider *pp;
	int retaste_wait;

	g_topology_assert();

	cp->private = NULL;
	if (g_raid3_is_busy(sc, cp))
		return;
	G_RAID3_DEBUG(2, "Consumer %s destroyed.", cp->provider->name);
	pp = cp->provider;
	retaste_wait = 0;
	/* Dropping our last write ref will retaste, unless geom is dying. */
	if (cp->acw == 1) {
		if ((pp->geom->flags & G_GEOM_WITHER) == 0)
			retaste_wait = 1;
	}
	G_RAID3_DEBUG(2, "Access %s r%dw%de%d = %d", pp->name, -cp->acr,
	    -cp->acw, -cp->ace, 0);
	if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0)
		g_access(cp, -cp->acr, -cp->acw, -cp->ace);
	if (retaste_wait) {
		/*
		 * After the retaste event was sent (inside g_access()), we
		 * can send an event to detach and destroy the consumer.
		 * A class which has a consumer connected to the given
		 * provider will not receive a retaste event for it.
		 * This is how retaste events are ignored when closing
		 * consumers opened for write: detach and destroy the
		 * consumer only after the retaste event is sent.
		 */
		g_post_event(g_raid3_destroy_consumer, cp, M_WAITOK, NULL);
		return;
	}
	G_RAID3_DEBUG(1, "Consumer %s destroyed.", pp->name);
	g_detach(cp);
	g_destroy_consumer(cp);
}
493226633Sdim
494226633Sdimstatic int
495218887Sdimg_raid3_connect_disk(struct g_raid3_disk *disk, struct g_provider *pp)
496218887Sdim{
497226633Sdim	struct g_consumer *cp;
498226633Sdim	int error;
499218887Sdim
500218887Sdim	g_topology_assert_not();
501226633Sdim	KASSERT(disk->d_consumer == NULL,
502234353Sdim	    ("Disk already connected (device %s).", disk->d_softc->sc_name));
503218887Sdim
504218887Sdim	g_topology_lock();
505234353Sdim	cp = g_new_consumer(disk->d_softc->sc_geom);
506234353Sdim	error = g_attach(cp, pp);
507234353Sdim	if (error != 0) {
508234353Sdim		g_destroy_consumer(cp);
509226633Sdim		g_topology_unlock();
510218887Sdim		return (error);
511218887Sdim	}
512218887Sdim	error = g_access(cp, 1, 1, 1);
513226633Sdim		g_topology_unlock();
514226633Sdim	if (error != 0) {
515218887Sdim		g_detach(cp);
516218887Sdim		g_destroy_consumer(cp);
517218887Sdim		G_RAID3_DEBUG(0, "Cannot open consumer %s (error=%d).",
518218887Sdim		    pp->name, error);
519218887Sdim		return (error);
520218887Sdim	}
521226633Sdim	disk->d_consumer = cp;
522218887Sdim	disk->d_consumer->private = disk;
523218887Sdim	disk->d_consumer->index = 0;
524218887Sdim	G_RAID3_DEBUG(2, "Disk %s connected.", g_raid3_get_diskname(disk));
525226633Sdim	return (0);
526218887Sdim}
527218887Sdim
528218887Sdimstatic void
529234353Sdimg_raid3_disconnect_consumer(struct g_raid3_softc *sc, struct g_consumer *cp)
530234353Sdim{
531234353Sdim
532234353Sdim	g_topology_assert();
533234353Sdim
534234353Sdim	if (cp == NULL)
535234353Sdim		return;
536234353Sdim	if (cp->provider != NULL)
537234353Sdim		g_raid3_kill_consumer(sc, cp);
538234353Sdim	else
539234353Sdim		g_destroy_consumer(cp);
540234353Sdim}
541239462Sdim
/*
 * Initialize disk. This means allocate memory, create consumer, attach it
 * to the provider and open access (r1w1e1) to it.
 * On failure NULL is returned and, if errorp is non-NULL, the errno is
 * stored through it; on success *errorp is set to 0.
 */
static struct g_raid3_disk *
g_raid3_init_disk(struct g_raid3_softc *sc, struct g_provider *pp,
    struct g_raid3_metadata *md, int *errorp)
{
	struct g_raid3_disk *disk;
	int error;

	/* The disk slot is selected by the component number from metadata. */
	disk = &sc->sc_disks[md->md_no];
	error = g_raid3_connect_disk(disk, pp);
	if (error != 0) {
		if (errorp != NULL)
			*errorp = error;
		return (NULL);
	}
	disk->d_state = G_RAID3_DISK_STATE_NONE;
	disk->d_flags = md->md_dflags;
	/* A non-empty provider name in metadata means it was hardcoded. */
	if (md->md_provider[0] != '\0')
		disk->d_flags |= G_RAID3_DISK_FLAG_HARDCODED;
	disk->d_sync.ds_consumer = NULL;
	disk->d_sync.ds_offset = md->md_sync_offset;
	disk->d_sync.ds_offset_done = md->md_sync_offset;
	disk->d_genid = md->md_genid;
	disk->d_sync.ds_syncid = md->md_syncid;
	if (errorp != NULL)
		*errorp = 0;
	return (disk);
}
573249423Sdim
/*
 * Tear down a disk: cancel its pending events, stop synchronization if
 * this disk was being synchronized, disconnect its consumer and mark the
 * slot as NODISK.  A slot already in NODISK state is a no-op.
 */
static void
g_raid3_destroy_disk(struct g_raid3_disk *disk)
{
	struct g_raid3_softc *sc;

	g_topology_assert_not();
	sc = disk->d_softc;
	sx_assert(&sc->sc_lock, SX_XLOCKED);

	if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
		return;
	g_raid3_event_cancel(disk);
	switch (disk->d_state) {
	case G_RAID3_DISK_STATE_SYNCHRONIZING:
		if (sc->sc_syncdisk != NULL)
			g_raid3_sync_stop(sc, 1);
		/* FALLTHROUGH */
	case G_RAID3_DISK_STATE_NEW:
	case G_RAID3_DISK_STATE_STALE:
	case G_RAID3_DISK_STATE_ACTIVE:
		g_topology_lock();
		g_raid3_disconnect_consumer(sc, disk->d_consumer);
		g_topology_unlock();
		disk->d_consumer = NULL;
		break;
	default:
		KASSERT(0 == 1, ("Wrong disk state (%s, %s).",
		    g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
	}
	disk->d_state = G_RAID3_DISK_STATE_NODISK;
}
606218887Sdim
/*
 * Destroy the whole device: provider, disks (flushing metadata with the
 * DIRTY flag cleared), pending events (canceled/woken), the callout, the
 * synchronization geom and the device geom itself, the uma zones, and
 * finally the softc locks.  Called with sc_lock exclusively held; the
 * lock is released and destroyed here.
 */
static void
g_raid3_destroy_device(struct g_raid3_softc *sc)
{
	struct g_raid3_event *ep;
	struct g_raid3_disk *disk;
	struct g_geom *gp;
	struct g_consumer *cp;
	u_int n;

	g_topology_assert_not();
	sx_assert(&sc->sc_lock, SX_XLOCKED);

	gp = sc->sc_geom;
	if (sc->sc_provider != NULL)
		g_raid3_destroy_provider(sc);
	for (n = 0; n < sc->sc_ndisks; n++) {
		disk = &sc->sc_disks[n];
		if (disk->d_state != G_RAID3_DISK_STATE_NODISK) {
			/* Components are consistent at this point. */
			disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
			g_raid3_update_metadata(disk);
			g_raid3_destroy_disk(disk);
		}
	}
	/* Drain the event queue, failing waiters with ECANCELED. */
	while ((ep = g_raid3_event_get(sc)) != NULL) {
		g_raid3_event_remove(sc, ep);
		if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0)
			g_raid3_event_free(ep);
		else {
			ep->e_error = ECANCELED;
			ep->e_flags |= G_RAID3_EVENT_DONE;
			G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, ep);
			mtx_lock(&sc->sc_events_mtx);
			wakeup(ep);
			mtx_unlock(&sc->sc_events_mtx);
		}
	}
	callout_drain(&sc->sc_callout);
	cp = LIST_FIRST(&sc->sc_sync.ds_geom->consumer);
	g_topology_lock();
	if (cp != NULL)
		g_raid3_disconnect_consumer(sc, cp);
	g_wither_geom(sc->sc_sync.ds_geom, ENXIO);
	G_RAID3_DEBUG(0, "Device %s destroyed.", gp->name);
	g_wither_geom(gp, ENXIO);
	g_topology_unlock();
	if (!g_raid3_use_malloc) {
		uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_64K].sz_zone);
		uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_16K].sz_zone);
		uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_4K].sz_zone);
	}
	mtx_destroy(&sc->sc_queue_mtx);
	mtx_destroy(&sc->sc_events_mtx);
	sx_xunlock(&sc->sc_lock);
	sx_destroy(&sc->sc_lock);
}
662218887Sdim
/*
 * GEOM orphan method: the underlying provider went away.  Request a
 * syncid bump and queue a DISCONNECTED event for the affected disk.
 * A consumer with no private pointer was already being torn down.
 */
static void
g_raid3_orphan(struct g_consumer *cp)
{
	struct g_raid3_disk *disk;

	g_topology_assert();

	disk = cp->private;
	if (disk == NULL)
		return;
	disk->d_softc->sc_bump_id = G_RAID3_BUMP_SYNCID;
	g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED,
	    G_RAID3_EVENT_DONTWAIT);
}
677218887Sdim
/*
 * Write the given metadata (or an all-zero sector when md is NULL) to the
 * last sector of the disk's provider.  On write failure the disk is
 * flagged BROKEN (first failure logged loudly, later ones quietly) and,
 * when disconnect-on-failure is enabled and the array is COMPLETE, a
 * genid bump plus a DISCONNECTED event are requested.
 */
static int
g_raid3_write_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md)
{
	struct g_raid3_softc *sc;
	struct g_consumer *cp;
	off_t offset, length;
	u_char *sector;
	int error = 0;

	g_topology_assert_not();
	sc = disk->d_softc;
	sx_assert(&sc->sc_lock, SX_LOCKED);

	cp = disk->d_consumer;
	KASSERT(cp != NULL, ("NULL consumer (%s).", sc->sc_name));
	KASSERT(cp->provider != NULL, ("NULL provider (%s).", sc->sc_name));
	KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
	    ("Consumer %s closed? (r%dw%de%d).", cp->provider->name, cp->acr,
	    cp->acw, cp->ace));
	/* Metadata lives in the provider's last sector. */
	length = cp->provider->sectorsize;
	offset = cp->provider->mediasize - length;
	sector = malloc((size_t)length, M_RAID3, M_WAITOK | M_ZERO);
	if (md != NULL)
		raid3_metadata_encode(md, sector);
	error = g_write_data(cp, offset, sector, length);
	free(sector, M_RAID3);
	if (error != 0) {
		if ((disk->d_flags & G_RAID3_DISK_FLAG_BROKEN) == 0) {
			G_RAID3_DEBUG(0, "Cannot write metadata on %s "
			    "(device=%s, error=%d).",
			    g_raid3_get_diskname(disk), sc->sc_name, error);
			disk->d_flags |= G_RAID3_DISK_FLAG_BROKEN;
		} else {
			G_RAID3_DEBUG(1, "Cannot write metadata on %s "
			    "(device=%s, error=%d).",
			    g_raid3_get_diskname(disk), sc->sc_name, error);
		}
		if (g_raid3_disconnect_on_failure &&
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
			sc->sc_bump_id |= G_RAID3_BUMP_GENID;
			g_raid3_event_send(disk,
			    G_RAID3_DISK_STATE_DISCONNECTED,
			    G_RAID3_EVENT_DONTWAIT);
		}
	}
	return (error);
}
725234353Sdim
726234353Sdimint
727234353Sdimg_raid3_clear_metadata(struct g_raid3_disk *disk)
728234353Sdim{
729234353Sdim	int error;
730234353Sdim
731234353Sdim	g_topology_assert_not();
732234353Sdim	sx_assert(&disk->d_softc->sc_lock, SX_LOCKED);
733234353Sdim
734234353Sdim	error = g_raid3_write_metadata(disk, NULL);
735234353Sdim	if (error == 0) {
736234353Sdim		G_RAID3_DEBUG(2, "Metadata on %s cleared.",
737234353Sdim		    g_raid3_get_diskname(disk));
738234353Sdim	} else {
739234353Sdim		G_RAID3_DEBUG(0,
740234353Sdim		    "Cannot clear metadata on disk %s (error=%d).",
741234353Sdim		    g_raid3_get_diskname(disk), error);
742234353Sdim	}
743234353Sdim	return (error);
744234353Sdim}
745234353Sdim
746218887Sdimvoid
747218887Sdimg_raid3_fill_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md)
748218887Sdim{
749239462Sdim	struct g_raid3_softc *sc;
750239462Sdim	struct g_provider *pp;
751239462Sdim
752239462Sdim	sc = disk->d_softc;
753239462Sdim	strlcpy(md->md_magic, G_RAID3_MAGIC, sizeof(md->md_magic));
754239462Sdim	md->md_version = G_RAID3_VERSION;
755239462Sdim	strlcpy(md->md_name, sc->sc_name, sizeof(md->md_name));
756218887Sdim	md->md_id = sc->sc_id;
757218887Sdim	md->md_all = sc->sc_ndisks;
758218887Sdim	md->md_genid = sc->sc_genid;
759218887Sdim	md->md_mediasize = sc->sc_mediasize;
760218887Sdim	md->md_sectorsize = sc->sc_sectorsize;
761218887Sdim	md->md_mflags = (sc->sc_flags & G_RAID3_DEVICE_FLAG_MASK);
762218887Sdim	md->md_no = disk->d_no;
763218887Sdim	md->md_syncid = disk->d_sync.ds_syncid;
764218887Sdim	md->md_dflags = (disk->d_flags & G_RAID3_DISK_FLAG_MASK);
765218887Sdim	if (disk->d_state != G_RAID3_DISK_STATE_SYNCHRONIZING)
766218887Sdim		md->md_sync_offset = 0;
767243830Sdim	else {
768243830Sdim		md->md_sync_offset =
769234353Sdim		    disk->d_sync.ds_offset_done / (sc->sc_ndisks - 1);
770243830Sdim	}
771243830Sdim	if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL)
772218887Sdim		pp = disk->d_consumer->provider;
773218887Sdim	else
774218887Sdim		pp = NULL;
775234353Sdim	if ((disk->d_flags & G_RAID3_DISK_FLAG_HARDCODED) != 0 && pp != NULL)
776234353Sdim		strlcpy(md->md_provider, pp->name, sizeof(md->md_provider));
777234353Sdim	else
778218887Sdim		bzero(md->md_provider, sizeof(md->md_provider));
779218887Sdim	if (pp != NULL)
780218887Sdim		md->md_provsize = pp->mediasize;
781218887Sdim	else
782218887Sdim		md->md_provsize = 0;
783218887Sdim}
784218887Sdim
785218887Sdimvoid
786218887Sdimg_raid3_update_metadata(struct g_raid3_disk *disk)
787218887Sdim{
788218887Sdim	struct g_raid3_softc *sc;
789218887Sdim	struct g_raid3_metadata md;
790218887Sdim	int error;
791218887Sdim
792218887Sdim	g_topology_assert_not();
793218887Sdim	sc = disk->d_softc;
794218887Sdim	sx_assert(&sc->sc_lock, SX_LOCKED);
795218887Sdim
796218887Sdim	g_raid3_fill_metadata(disk, &md);
797218887Sdim	error = g_raid3_write_metadata(disk, &md);
798234353Sdim	if (error == 0) {
799234353Sdim		G_RAID3_DEBUG(2, "Metadata on %s updated.",
800234353Sdim		    g_raid3_get_diskname(disk));
801234353Sdim	} else {
802218887Sdim		G_RAID3_DEBUG(0,
803218887Sdim		    "Cannot update metadata on disk %s (error=%d).",
804234353Sdim		    g_raid3_get_diskname(disk), error);
805234353Sdim	}
806234353Sdim}
807234353Sdim
808234353Sdimstatic void
809234353Sdimg_raid3_bump_syncid(struct g_raid3_softc *sc)
810234353Sdim{
811234353Sdim	struct g_raid3_disk *disk;
812234353Sdim	u_int n;
813234353Sdim
814234353Sdim	g_topology_assert_not();
815234353Sdim	sx_assert(&sc->sc_lock, SX_XLOCKED);
816218887Sdim	KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) > 0,
817218887Sdim	    ("%s called with no active disks (device=%s).", __func__,
818218887Sdim	    sc->sc_name));
819218887Sdim
820218887Sdim	sc->sc_syncid++;
821218887Sdim	G_RAID3_DEBUG(1, "Device %s: syncid bumped to %u.", sc->sc_name,
822226633Sdim	    sc->sc_syncid);
823218887Sdim	for (n = 0; n < sc->sc_ndisks; n++) {
824218887Sdim		disk = &sc->sc_disks[n];
825218887Sdim		if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
826218887Sdim		    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
827218887Sdim			disk->d_sync.ds_syncid = sc->sc_syncid;
828218887Sdim			g_raid3_update_metadata(disk);
829218887Sdim		}
830218887Sdim	}
831218887Sdim}
832218887Sdim
833218887Sdimstatic void
834218887Sdimg_raid3_bump_genid(struct g_raid3_softc *sc)
835218887Sdim{
836218887Sdim	struct g_raid3_disk *disk;
837218887Sdim	u_int n;
838218887Sdim
839218887Sdim	g_topology_assert_not();
840218887Sdim	sx_assert(&sc->sc_lock, SX_XLOCKED);
841226633Sdim	KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) > 0,
842218887Sdim	    ("%s called with no active disks (device=%s).", __func__,
843218887Sdim	    sc->sc_name));
844218887Sdim
845218887Sdim	sc->sc_genid++;
846218887Sdim	G_RAID3_DEBUG(1, "Device %s: genid bumped to %u.", sc->sc_name,
847218887Sdim	    sc->sc_genid);
848226633Sdim	for (n = 0; n < sc->sc_ndisks; n++) {
849218887Sdim		disk = &sc->sc_disks[n];
850218887Sdim		if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
851218887Sdim		    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
852218887Sdim			disk->d_genid = sc->sc_genid;
853218887Sdim			g_raid3_update_metadata(disk);
854218887Sdim		}
855218887Sdim	}
856218887Sdim}
857218887Sdim
858218887Sdimstatic int
859218887Sdimg_raid3_idle(struct g_raid3_softc *sc, int acw)
860218887Sdim{
861218887Sdim	struct g_raid3_disk *disk;
862243830Sdim	u_int i;
863218887Sdim	int timeout;
864218887Sdim
865218887Sdim	g_topology_assert_not();
866218887Sdim	sx_assert(&sc->sc_lock, SX_XLOCKED);
867218887Sdim
868234353Sdim	if (sc->sc_provider == NULL)
869218887Sdim		return (0);
870218887Sdim	if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) != 0)
871218887Sdim		return (0);
872218887Sdim	if (sc->sc_idle)
873218887Sdim		return (0);
874218887Sdim	if (sc->sc_writes > 0)
875218887Sdim		return (0);
876218887Sdim	if (acw > 0 || (acw == -1 && sc->sc_provider->acw > 0)) {
877218887Sdim		timeout = g_raid3_idletime - (time_uptime - sc->sc_last_write);
878239462Sdim		if (timeout > 0)
879239462Sdim			return (timeout);
880239462Sdim	}
881239462Sdim	sc->sc_idle = 1;
882218887Sdim	for (i = 0; i < sc->sc_ndisks; i++) {
883226633Sdim		disk = &sc->sc_disks[i];
884218887Sdim		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
885218887Sdim			continue;
886218887Sdim		G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.",
887218887Sdim		    g_raid3_get_diskname(disk), sc->sc_name);
888218887Sdim		disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
889226633Sdim		g_raid3_update_metadata(disk);
890218887Sdim	}
891218887Sdim	return (0);
892218887Sdim}
893218887Sdim
894218887Sdimstatic void
895218887Sdimg_raid3_unidle(struct g_raid3_softc *sc)
896218887Sdim{
897218887Sdim	struct g_raid3_disk *disk;
898218887Sdim	u_int i;
899218887Sdim
900218887Sdim	g_topology_assert_not();
901218887Sdim	sx_assert(&sc->sc_lock, SX_XLOCKED);
902249423Sdim
903249423Sdim	if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) != 0)
904249423Sdim		return;
905249423Sdim	sc->sc_idle = 0;
906249423Sdim	sc->sc_last_write = time_uptime;
907249423Sdim	for (i = 0; i < sc->sc_ndisks; i++) {
908249423Sdim		disk = &sc->sc_disks[i];
909249423Sdim		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
910249423Sdim			continue;
911249423Sdim		G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.",
912249423Sdim		    g_raid3_get_diskname(disk), sc->sc_name);
913249423Sdim		disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
914249423Sdim		g_raid3_update_metadata(disk);
915249423Sdim	}
916249423Sdim}
917249423Sdim
/*
 * Treat bio_driver1 field in parent bio as list head and field bio_caller1
 * in child bio as pointer to the next element on the list.
 */
/* Head of the singly-linked list of child bios hanging off parent (pbp). */
#define	G_RAID3_HEAD_BIO(pbp)	(pbp)->bio_driver1

/* Link from child (cbp) to the next child on its parent's list. */
#define	G_RAID3_NEXT_BIO(cbp)	(cbp)->bio_caller1

/* Iterate over all children of (pbp); (bp) is the loop cursor. */
#define	G_RAID3_FOREACH_BIO(pbp, bp)					\
	for ((bp) = G_RAID3_HEAD_BIO(pbp); (bp) != NULL;		\
	    (bp) = G_RAID3_NEXT_BIO(bp))

/*
 * As above, but (tmpbp) caches the next pointer first, so the current
 * element (bp) may be unlinked or destroyed inside the loop body.
 */
#define	G_RAID3_FOREACH_SAFE_BIO(pbp, bp, tmpbp)			\
	for ((bp) = G_RAID3_HEAD_BIO(pbp);				\
	    (bp) != NULL && ((tmpbp) = G_RAID3_NEXT_BIO(bp), 1);	\
	    (bp) = (tmpbp))
934249423Sdim
/*
 * Initialize an empty child-bio list on the parent bio.
 */
static void
g_raid3_init_bio(struct bio *pbp)
{

	G_RAID3_HEAD_BIO(pbp) = NULL;
}
941249423Sdim
942239462Sdimstatic void
943239462Sdimg_raid3_remove_bio(struct bio *cbp)
944239462Sdim{
945249423Sdim	struct bio *pbp, *bp;
946218887Sdim
947218887Sdim	pbp = cbp->bio_parent;
948218887Sdim	if (G_RAID3_HEAD_BIO(pbp) == cbp)
949218887Sdim		G_RAID3_HEAD_BIO(pbp) = G_RAID3_NEXT_BIO(cbp);
950218887Sdim	else {
951218887Sdim		G_RAID3_FOREACH_BIO(pbp, bp) {
952218887Sdim			if (G_RAID3_NEXT_BIO(bp) == cbp) {
953218887Sdim				G_RAID3_NEXT_BIO(bp) = G_RAID3_NEXT_BIO(cbp);
954218887Sdim				break;
955218887Sdim			}
956218887Sdim		}
957218887Sdim	}
958218887Sdim	G_RAID3_NEXT_BIO(cbp) = NULL;
959226633Sdim}
960218887Sdim
961218887Sdimstatic void
962218887Sdimg_raid3_replace_bio(struct bio *sbp, struct bio *dbp)
963218887Sdim{
964218887Sdim	struct bio *pbp, *bp;
965218887Sdim
966218887Sdim	g_raid3_remove_bio(sbp);
967218887Sdim	pbp = dbp->bio_parent;
968218887Sdim	G_RAID3_NEXT_BIO(sbp) = G_RAID3_NEXT_BIO(dbp);
969218887Sdim	if (G_RAID3_HEAD_BIO(pbp) == dbp)
970218887Sdim		G_RAID3_HEAD_BIO(pbp) = sbp;
971218887Sdim	else {
972218887Sdim		G_RAID3_FOREACH_BIO(pbp, bp) {
973218887Sdim			if (G_RAID3_NEXT_BIO(bp) == dbp) {
974218887Sdim				G_RAID3_NEXT_BIO(bp) = sbp;
975218887Sdim				break;
976218887Sdim			}
977218887Sdim		}
978218887Sdim	}
979218887Sdim	G_RAID3_NEXT_BIO(dbp) = NULL;
980218887Sdim}
981218887Sdim
/*
 * Free a child bio: release its data buffer, unlink it from the parent's
 * child list, decrement the parent's child count and destroy the bio.
 */
static void
g_raid3_destroy_bio(struct g_raid3_softc *sc, struct bio *cbp)
{
	struct bio *bp, *pbp;
	size_t size;

	pbp = cbp->bio_parent;
	pbp->bio_children--;
	KASSERT(cbp->bio_data != NULL, ("NULL bio_data"));
	/* Each child carries 1/(ndisks-1) of the parent's payload. */
	size = pbp->bio_length / (sc->sc_ndisks - 1);
	g_raid3_free(sc, cbp->bio_data, size);
	if (G_RAID3_HEAD_BIO(pbp) == cbp) {
		G_RAID3_HEAD_BIO(pbp) = G_RAID3_NEXT_BIO(cbp);
		G_RAID3_NEXT_BIO(cbp) = NULL;
		g_destroy_bio(cbp);
	} else {
		/* Find the predecessor of cbp, if it is still linked. */
		G_RAID3_FOREACH_BIO(pbp, bp) {
			if (G_RAID3_NEXT_BIO(bp) == cbp)
				break;
		}
		if (bp != NULL) {
			KASSERT(G_RAID3_NEXT_BIO(bp) != NULL,
			    ("NULL bp->bio_driver1"));
			G_RAID3_NEXT_BIO(bp) = G_RAID3_NEXT_BIO(cbp);
			G_RAID3_NEXT_BIO(cbp) = NULL;
		}
		/* Destroyed even when not found on the list. */
		g_destroy_bio(cbp);
	}
}
1011218887Sdim
/*
 * Clone the parent bio into a new child, allocate the child's data
 * buffer (1/(ndisks-1) of the parent length) and append the child to
 * the tail of the parent's child list.  Returns NULL on allocation
 * failure; regular requests may sleep for memory, others must not.
 */
static struct bio *
g_raid3_clone_bio(struct g_raid3_softc *sc, struct bio *pbp)
{
	struct bio *bp, *cbp;
	size_t size;
	int memflag;

	cbp = g_clone_bio(pbp);
	if (cbp == NULL)
		return (NULL);
	size = pbp->bio_length / (sc->sc_ndisks - 1);
	if ((pbp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR) != 0)
		memflag = M_WAITOK;
	else
		memflag = M_NOWAIT;
	cbp->bio_data = g_raid3_alloc(sc, size, memflag);
	if (cbp->bio_data == NULL) {
		/* Undo the child accounting done by g_clone_bio(). */
		pbp->bio_children--;
		g_destroy_bio(cbp);
		return (NULL);
	}
	G_RAID3_NEXT_BIO(cbp) = NULL;
	if (G_RAID3_HEAD_BIO(pbp) == NULL)
		G_RAID3_HEAD_BIO(pbp) = cbp;
	else {
		/* Walk to the last child and link the new one after it. */
		G_RAID3_FOREACH_BIO(pbp, bp) {
			if (G_RAID3_NEXT_BIO(bp) == NULL) {
				G_RAID3_NEXT_BIO(bp) = cbp;
				break;
			}
		}
	}
	return (cbp);
}
1046218887Sdim
/*
 * Split the parent's data across the child bios: copy the interleaved
 * atoms into each data child, compute the parity child by XOR-ing the
 * data children together, then issue every remaining child to its
 * component's consumer.
 */
static void
g_raid3_scatter(struct bio *pbp)
{
	struct g_raid3_softc *sc;
	struct g_raid3_disk *disk;
	struct bio *bp, *cbp, *tmpbp;
	off_t atom, cadd, padd, left;
	int first;

	sc = pbp->bio_to->geom->softc;
	bp = NULL;
	if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_NOPARITY) == 0) {
		/*
		 * Find bio for which we should calculate data.
		 */
		G_RAID3_FOREACH_BIO(pbp, cbp) {
			if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) {
				bp = cbp;
				break;
			}
		}
		KASSERT(bp != NULL, ("NULL parity bio."));
	}
	/* Bytes each component contributes per device sector. */
	atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
	cadd = padd = 0;
	/* De-interleave: one atom per data child for every device sector. */
	for (left = pbp->bio_length; left > 0; left -= sc->sc_sectorsize) {
		G_RAID3_FOREACH_BIO(pbp, cbp) {
			if (cbp == bp)
				continue;
			bcopy(pbp->bio_data + padd, cbp->bio_data + cadd, atom);
			padd += atom;
		}
		cadd += atom;
	}
	if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_NOPARITY) == 0) {
		/*
		 * Calculate parity.
		 */
		first = 1;
		G_RAID3_FOREACH_SAFE_BIO(pbp, cbp, tmpbp) {
			if (cbp == bp)
				continue;
			if (first) {
				bcopy(cbp->bio_data, bp->bio_data,
				    bp->bio_length);
				first = 0;
			} else {
				g_raid3_xor(cbp->bio_data, bp->bio_data,
				    bp->bio_length);
			}
			/* Children without a backing disk are only needed
			 * for the parity computation; drop them now. */
			if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_NODISK) != 0)
				g_raid3_destroy_bio(sc, cbp);
		}
	}
	/* Dispatch all remaining children to their consumers. */
	G_RAID3_FOREACH_SAFE_BIO(pbp, cbp, tmpbp) {
		struct g_consumer *cp;

		disk = cbp->bio_caller2;
		cp = disk->d_consumer;
		cbp->bio_to = cp->provider;
		G_RAID3_LOGREQ(3, cbp, "Sending request.");
		KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
		    cp->acr, cp->acw, cp->ace));
		cp->index++;
		sc->sc_writes++;
		g_io_request(cbp, cp);
	}
}
1116218887Sdim
/*
 * Complete a parent READ once all child requests have finished:
 * check child errors (tolerating at most one failure in COMPLETE mode),
 * recover a failed child by re-issuing it to the parity component,
 * reconstruct or verify data via XOR, interleave the child buffers back
 * into the parent and deliver it.
 */
static void
g_raid3_gather(struct bio *pbp)
{
	struct g_raid3_softc *sc;
	struct g_raid3_disk *disk;
	struct bio *xbp, *fbp, *cbp;
	off_t atom, cadd, padd, left;

	sc = pbp->bio_to->geom->softc;
	/*
	 * Find bio for which we have to calculate data.
	 * While going through this path, check if all requests
	 * succeeded, if not, deny whole request.
	 * If we're in COMPLETE mode, we allow one request to fail,
	 * so if we find one, we're sending it to the parity consumer.
	 * If there are more failed requests, we deny whole request.
	 */
	xbp = fbp = NULL;
	G_RAID3_FOREACH_BIO(pbp, cbp) {
		if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) {
			KASSERT(xbp == NULL, ("More than one parity bio."));
			xbp = cbp;
		}
		if (cbp->bio_error == 0)
			continue;
		/*
		 * Found failed request.
		 */
		if (fbp == NULL) {
			if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_DEGRADED) != 0) {
				/*
				 * We are already in degraded mode, so we can't
				 * accept any failures.
				 */
				if (pbp->bio_error == 0)
					pbp->bio_error = cbp->bio_error;
			} else {
				fbp = cbp;
			}
		} else {
			/*
			 * Next failed request, that's too many.
			 */
			if (pbp->bio_error == 0)
				pbp->bio_error = fbp->bio_error;
		}
		disk = cbp->bio_caller2;
		if (disk == NULL)
			continue;
		/* First failure on this disk is logged loudly. */
		if ((disk->d_flags & G_RAID3_DISK_FLAG_BROKEN) == 0) {
			disk->d_flags |= G_RAID3_DISK_FLAG_BROKEN;
			G_RAID3_LOGREQ(0, cbp, "Request failed (error=%d).",
			    cbp->bio_error);
		} else {
			G_RAID3_LOGREQ(1, cbp, "Request failed (error=%d).",
			    cbp->bio_error);
		}
		if (g_raid3_disconnect_on_failure &&
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
			sc->sc_bump_id |= G_RAID3_BUMP_GENID;
			g_raid3_event_send(disk,
			    G_RAID3_DISK_STATE_DISCONNECTED,
			    G_RAID3_EVENT_DONTWAIT);
		}
	}
	if (pbp->bio_error != 0)
		goto finish;
	if (fbp != NULL && (pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) {
		/* Verification reads have all components; just drop the
		 * failed one and stop verifying this request. */
		pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_VERIFY;
		if (xbp != fbp)
			g_raid3_replace_bio(xbp, fbp);
		g_raid3_destroy_bio(sc, fbp);
	} else if (fbp != NULL) {
		struct g_consumer *cp;

		/*
		 * One request failed, so send the same request to
		 * the parity consumer.
		 */
		disk = pbp->bio_driver2;
		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) {
			pbp->bio_error = fbp->bio_error;
			goto finish;
		}
		pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED;
		pbp->bio_inbed--;
		/* Reset the failed child for reuse against parity. */
		fbp->bio_flags &= ~(BIO_DONE | BIO_ERROR);
		if (disk->d_no == sc->sc_ndisks - 1)
			fbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
		fbp->bio_error = 0;
		fbp->bio_completed = 0;
		fbp->bio_children = 0;
		fbp->bio_inbed = 0;
		cp = disk->d_consumer;
		fbp->bio_caller2 = disk;
		fbp->bio_to = cp->provider;
		G_RAID3_LOGREQ(3, fbp, "Sending request (recover).");
		KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
		    cp->acr, cp->acw, cp->ace));
		cp->index++;
		g_io_request(fbp, cp);
		return;
	}
	if (xbp != NULL) {
		/*
		 * Calculate parity.
		 */
		G_RAID3_FOREACH_BIO(pbp, cbp) {
			if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0)
				continue;
			g_raid3_xor(cbp->bio_data, xbp->bio_data,
			    xbp->bio_length);
		}
		xbp->bio_cflags &= ~G_RAID3_BIO_CFLAG_PARITY;
		if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) {
			/* After XOR-ing everything in, a correct stripe
			 * leaves the parity buffer all zeroes. */
			if (!g_raid3_is_zero(xbp)) {
				g_raid3_parity_mismatch++;
				pbp->bio_error = EIO;
				goto finish;
			}
			g_raid3_destroy_bio(sc, xbp);
		}
	}
	/* Interleave child buffers back into the parent's buffer. */
	atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
	cadd = padd = 0;
	for (left = pbp->bio_length; left > 0; left -= sc->sc_sectorsize) {
		G_RAID3_FOREACH_BIO(pbp, cbp) {
			bcopy(cbp->bio_data + cadd, pbp->bio_data + padd, atom);
			pbp->bio_completed += atom;
			padd += atom;
		}
		cadd += atom;
	}
finish:
	if (pbp->bio_error == 0)
		G_RAID3_LOGREQ(3, pbp, "Request finished.");
	else {
		if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0)
			G_RAID3_LOGREQ(1, pbp, "Verification error.");
		else
			G_RAID3_LOGREQ(0, pbp, "Request failed.");
	}
	pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_MASK;
	while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL)
		g_raid3_destroy_bio(sc, cbp);
	g_io_deliver(pbp, pbp->bio_error);
}
1265218887Sdim
/*
 * bio_done callback for regular child requests: tag the bio as a
 * finished regular request and hand it to the worker thread's queue.
 */
static void
g_raid3_done(struct bio *bp)
{
	struct g_raid3_softc *sc;

	sc = bp->bio_from->geom->softc;
	bp->bio_cflags |= G_RAID3_BIO_CFLAG_REGULAR;
	G_RAID3_LOGREQ(3, bp, "Regular request done (error=%d).", bp->bio_error);
	mtx_lock(&sc->sc_queue_mtx);
	/* Completions go to the head so they are handled before new I/O. */
	bioq_insert_head(&sc->sc_queue, bp);
	mtx_unlock(&sc->sc_queue_mtx);
	wakeup(sc);
	wakeup(&sc->sc_queue);
}
1280218887Sdim
/*
 * Handle completion of one child of a regular request.  Once every
 * child has come back, READs are finished via g_raid3_gather() and
 * WRITE/DELETE requests are error-checked, their children destroyed,
 * and the parent delivered.
 */
static void
g_raid3_regular_request(struct bio *cbp)
{
	struct g_raid3_softc *sc;
	struct g_raid3_disk *disk;
	struct bio *pbp;

	g_topology_assert_not();

	pbp = cbp->bio_parent;
	sc = pbp->bio_to->geom->softc;
	cbp->bio_from->index--;
	if (cbp->bio_cmd == BIO_WRITE)
		sc->sc_writes--;
	disk = cbp->bio_from->private;
	if (disk == NULL) {
		/* Consumer was orphaned; finish tearing it down. */
		g_topology_lock();
		g_raid3_kill_consumer(sc, cbp->bio_from);
		g_topology_unlock();
	}

	G_RAID3_LOGREQ(3, cbp, "Request finished.");
	pbp->bio_inbed++;
	KASSERT(pbp->bio_inbed <= pbp->bio_children,
	    ("bio_inbed (%u) is bigger than bio_children (%u).", pbp->bio_inbed,
	    pbp->bio_children));
	/* Wait until the last child reports in. */
	if (pbp->bio_inbed != pbp->bio_children)
		return;
	switch (pbp->bio_cmd) {
	case BIO_READ:
		g_raid3_gather(pbp);
		break;
	case BIO_WRITE:
	case BIO_DELETE:
	    {
		int error = 0;

		pbp->bio_completed = pbp->bio_length;
		while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL) {
			if (cbp->bio_error == 0) {
				g_raid3_destroy_bio(sc, cbp);
				continue;
			}

			/* First failure is tolerated; remember it. */
			if (error == 0)
				error = cbp->bio_error;
			else if (pbp->bio_error == 0) {
				/*
				 * Next failed request, that's too many.
				 */
				pbp->bio_error = error;
			}

			disk = cbp->bio_caller2;
			if (disk == NULL) {
				g_raid3_destroy_bio(sc, cbp);
				continue;
			}

			/* First failure on this disk is logged loudly. */
			if ((disk->d_flags & G_RAID3_DISK_FLAG_BROKEN) == 0) {
				disk->d_flags |= G_RAID3_DISK_FLAG_BROKEN;
				G_RAID3_LOGREQ(0, cbp,
				    "Request failed (error=%d).",
				    cbp->bio_error);
			} else {
				G_RAID3_LOGREQ(1, cbp,
				    "Request failed (error=%d).",
				    cbp->bio_error);
			}
			if (g_raid3_disconnect_on_failure &&
			    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
				sc->sc_bump_id |= G_RAID3_BUMP_GENID;
				g_raid3_event_send(disk,
				    G_RAID3_DISK_STATE_DISCONNECTED,
				    G_RAID3_EVENT_DONTWAIT);
			}
			g_raid3_destroy_bio(sc, cbp);
		}
		if (pbp->bio_error == 0)
			G_RAID3_LOGREQ(3, pbp, "Request finished.");
		else
			G_RAID3_LOGREQ(0, pbp, "Request failed.");
		pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_DEGRADED;
		pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_NOPARITY;
		bioq_remove(&sc->sc_inflight, pbp);
		/* Release delayed sync requests if possible. */
		g_raid3_sync_release(sc);
		g_io_deliver(pbp, pbp->bio_error);
		break;
	    }
	}
}
1373239462Sdim
/*
 * bio_done callback for synchronization requests: tag the bio as a
 * finished sync request and hand it to the worker thread's queue.
 */
static void
g_raid3_sync_done(struct bio *bp)
{
	struct g_raid3_softc *sc;

	G_RAID3_LOGREQ(3, bp, "Synchronization request delivered.");
	sc = bp->bio_from->geom->softc;
	bp->bio_cflags |= G_RAID3_BIO_CFLAG_SYNC;
	mtx_lock(&sc->sc_queue_mtx);
	/* Completions go to the head so they are handled before new I/O. */
	bioq_insert_head(&sc->sc_queue, bp);
	mtx_unlock(&sc->sc_queue_mtx);
	wakeup(sc);
	wakeup(&sc->sc_queue);
}
1388249423Sdim
/*
 * Fan a BIO_FLUSH out to every active component.  Clones are created
 * for all targets first; if any clone fails, the partial set is
 * destroyed and the request is delivered with ENOMEM.  Only after all
 * clones exist are they dispatched, so a flush is all-or-nothing.
 */
static void
g_raid3_flush(struct g_raid3_softc *sc, struct bio *bp)
{
	struct bio_queue_head queue;
	struct g_raid3_disk *disk;
	struct g_consumer *cp;
	struct bio *cbp;
	u_int i;

	bioq_init(&queue);
	for (i = 0; i < sc->sc_ndisks; i++) {
		disk = &sc->sc_disks[i];
		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
			continue;
		cbp = g_clone_bio(bp);
		if (cbp == NULL) {
			/* Roll back the clones made so far. */
			for (cbp = bioq_first(&queue); cbp != NULL;
			    cbp = bioq_first(&queue)) {
				bioq_remove(&queue, cbp);
				g_destroy_bio(cbp);
			}
			if (bp->bio_error == 0)
				bp->bio_error = ENOMEM;
			g_io_deliver(bp, bp->bio_error);
			return;
		}
		bioq_insert_tail(&queue, cbp);
		cbp->bio_done = g_std_done;
		/* Stash the target disk until the dispatch phase. */
		cbp->bio_caller1 = disk;
		cbp->bio_to = disk->d_consumer->provider;
	}
	for (cbp = bioq_first(&queue); cbp != NULL; cbp = bioq_first(&queue)) {
		bioq_remove(&queue, cbp);
		G_RAID3_LOGREQ(3, cbp, "Sending request.");
		disk = cbp->bio_caller1;
		cbp->bio_caller1 = NULL;
		cp = disk->d_consumer;
		KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
		    cp->acr, cp->acw, cp->ace));
		g_io_request(cbp, disk->d_consumer);
	}
}
1432
1433static void
1434g_raid3_start(struct bio *bp)
1435{
1436	struct g_raid3_softc *sc;
1437
1438	sc = bp->bio_to->geom->softc;
1439	/*
1440	 * If sc == NULL or there are no valid disks, provider's error
1441	 * should be set and g_raid3_start() should not be called at all.
1442	 */
1443	KASSERT(sc != NULL && (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
1444	    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE),
1445	    ("Provider's error should be set (error=%d)(device=%s).",
1446	    bp->bio_to->error, bp->bio_to->name));
1447	G_RAID3_LOGREQ(3, bp, "Request received.");
1448
1449	switch (bp->bio_cmd) {
1450	case BIO_READ:
1451	case BIO_WRITE:
1452	case BIO_DELETE:
1453		break;
1454	case BIO_FLUSH:
1455		g_raid3_flush(sc, bp);
1456		return;
1457	case BIO_GETATTR:
1458	default:
1459		g_io_deliver(bp, EOPNOTSUPP);
1460		return;
1461	}
1462	mtx_lock(&sc->sc_queue_mtx);
1463	bioq_insert_tail(&sc->sc_queue, bp);
1464	mtx_unlock(&sc->sc_queue_mtx);
1465	G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc);
1466	wakeup(sc);
1467}
1468
1469/*
1470 * Return TRUE if the given request is colliding with a in-progress
1471 * synchronization request.
1472 */
1473static int
1474g_raid3_sync_collision(struct g_raid3_softc *sc, struct bio *bp)
1475{
1476	struct g_raid3_disk *disk;
1477	struct bio *sbp;
1478	off_t rstart, rend, sstart, send;
1479	int i;
1480
1481	disk = sc->sc_syncdisk;
1482	if (disk == NULL)
1483		return (0);
1484	rstart = bp->bio_offset;
1485	rend = bp->bio_offset + bp->bio_length;
1486	for (i = 0; i < g_raid3_syncreqs; i++) {
1487		sbp = disk->d_sync.ds_bios[i];
1488		if (sbp == NULL)
1489			continue;
1490		sstart = sbp->bio_offset;
1491		send = sbp->bio_length;
1492		if (sbp->bio_cmd == BIO_WRITE) {
1493			sstart *= sc->sc_ndisks - 1;
1494			send *= sc->sc_ndisks - 1;
1495		}
1496		send += sstart;
1497		if (rend > sstart && rstart < send)
1498			return (1);
1499	}
1500	return (0);
1501}
1502
1503/*
1504 * Return TRUE if the given sync request is colliding with a in-progress regular
1505 * request.
1506 */
1507static int
1508g_raid3_regular_collision(struct g_raid3_softc *sc, struct bio *sbp)
1509{
1510	off_t rstart, rend, sstart, send;
1511	struct bio *bp;
1512
1513	if (sc->sc_syncdisk == NULL)
1514		return (0);
1515	sstart = sbp->bio_offset;
1516	send = sstart + sbp->bio_length;
1517	TAILQ_FOREACH(bp, &sc->sc_inflight.queue, bio_queue) {
1518		rstart = bp->bio_offset;
1519		rend = bp->bio_offset + bp->bio_length;
1520		if (rend > sstart && rstart < send)
1521			return (1);
1522	}
1523	return (0);
1524}
1525
1526/*
1527 * Puts request onto delayed queue.
1528 */
1529static void
1530g_raid3_regular_delay(struct g_raid3_softc *sc, struct bio *bp)
1531{
1532
1533	G_RAID3_LOGREQ(2, bp, "Delaying request.");
1534	bioq_insert_head(&sc->sc_regular_delayed, bp);
1535}
1536
1537/*
1538 * Puts synchronization request onto delayed queue.
1539 */
1540static void
1541g_raid3_sync_delay(struct g_raid3_softc *sc, struct bio *bp)
1542{
1543
1544	G_RAID3_LOGREQ(2, bp, "Delaying synchronization request.");
1545	bioq_insert_tail(&sc->sc_sync_delayed, bp);
1546}
1547
1548/*
1549 * Releases delayed regular requests which don't collide anymore with sync
1550 * requests.
1551 */
1552static void
1553g_raid3_regular_release(struct g_raid3_softc *sc)
1554{
1555	struct bio *bp, *bp2;
1556
1557	TAILQ_FOREACH_SAFE(bp, &sc->sc_regular_delayed.queue, bio_queue, bp2) {
1558		if (g_raid3_sync_collision(sc, bp))
1559			continue;
1560		bioq_remove(&sc->sc_regular_delayed, bp);
1561		G_RAID3_LOGREQ(2, bp, "Releasing delayed request (%p).", bp);
1562		mtx_lock(&sc->sc_queue_mtx);
1563		bioq_insert_head(&sc->sc_queue, bp);
1564#if 0
1565		/*
1566		 * wakeup() is not needed, because this function is called from
1567		 * the worker thread.
1568		 */
1569		wakeup(&sc->sc_queue);
1570#endif
1571		mtx_unlock(&sc->sc_queue_mtx);
1572	}
1573}
1574
1575/*
1576 * Releases delayed sync requests which don't collide anymore with regular
1577 * requests.
1578 */
1579static void
1580g_raid3_sync_release(struct g_raid3_softc *sc)
1581{
1582	struct bio *bp, *bp2;
1583
1584	TAILQ_FOREACH_SAFE(bp, &sc->sc_sync_delayed.queue, bio_queue, bp2) {
1585		if (g_raid3_regular_collision(sc, bp))
1586			continue;
1587		bioq_remove(&sc->sc_sync_delayed, bp);
1588		G_RAID3_LOGREQ(2, bp,
1589		    "Releasing delayed synchronization request.");
1590		g_io_request(bp, bp->bio_from);
1591	}
1592}
1593
1594/*
1595 * Handle synchronization requests.
1596 * Every synchronization request is two-steps process: first, READ request is
1597 * send to active provider and then WRITE request (with read data) to the provider
1598 * beeing synchronized. When WRITE is finished, new synchronization request is
1599 * send.
1600 */
1601static void
1602g_raid3_sync_request(struct bio *bp)
1603{
1604	struct g_raid3_softc *sc;
1605	struct g_raid3_disk *disk;
1606
1607	bp->bio_from->index--;
1608	sc = bp->bio_from->geom->softc;
1609	disk = bp->bio_from->private;
1610	if (disk == NULL) {
1611		sx_xunlock(&sc->sc_lock); /* Avoid recursion on sc_lock. */
1612		g_topology_lock();
1613		g_raid3_kill_consumer(sc, bp->bio_from);
1614		g_topology_unlock();
1615		free(bp->bio_data, M_RAID3);
1616		g_destroy_bio(bp);
1617		sx_xlock(&sc->sc_lock);
1618		return;
1619	}
1620
1621	/*
1622	 * Synchronization request.
1623	 */
1624	switch (bp->bio_cmd) {
1625	case BIO_READ:
1626	    {
1627		struct g_consumer *cp;
1628		u_char *dst, *src;
1629		off_t left;
1630		u_int atom;
1631
1632		if (bp->bio_error != 0) {
1633			G_RAID3_LOGREQ(0, bp,
1634			    "Synchronization request failed (error=%d).",
1635			    bp->bio_error);
1636			g_destroy_bio(bp);
1637			return;
1638		}
1639		G_RAID3_LOGREQ(3, bp, "Synchronization request finished.");
1640		atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
1641		dst = src = bp->bio_data;
1642		if (disk->d_no == sc->sc_ndisks - 1) {
1643			u_int n;
1644
1645			/* Parity component. */
1646			for (left = bp->bio_length; left > 0;
1647			    left -= sc->sc_sectorsize) {
1648				bcopy(src, dst, atom);
1649				src += atom;
1650				for (n = 1; n < sc->sc_ndisks - 1; n++) {
1651					g_raid3_xor(src, dst, atom);
1652					src += atom;
1653				}
1654				dst += atom;
1655			}
1656		} else {
1657			/* Regular component. */
1658			src += atom * disk->d_no;
1659			for (left = bp->bio_length; left > 0;
1660			    left -= sc->sc_sectorsize) {
1661				bcopy(src, dst, atom);
1662				src += sc->sc_sectorsize;
1663				dst += atom;
1664			}
1665		}
1666		bp->bio_driver1 = bp->bio_driver2 = NULL;
1667		bp->bio_pflags = 0;
1668		bp->bio_offset /= sc->sc_ndisks - 1;
1669		bp->bio_length /= sc->sc_ndisks - 1;
1670		bp->bio_cmd = BIO_WRITE;
1671		bp->bio_cflags = 0;
1672		bp->bio_children = bp->bio_inbed = 0;
1673		cp = disk->d_consumer;
1674		KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
1675		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
1676		    cp->acr, cp->acw, cp->ace));
1677		cp->index++;
1678		g_io_request(bp, cp);
1679		return;
1680	    }
1681	case BIO_WRITE:
1682	    {
1683		struct g_raid3_disk_sync *sync;
1684		off_t boffset, moffset;
1685		void *data;
1686		int i;
1687
1688		if (bp->bio_error != 0) {
1689			G_RAID3_LOGREQ(0, bp,
1690			    "Synchronization request failed (error=%d).",
1691			    bp->bio_error);
1692			g_destroy_bio(bp);
1693			sc->sc_bump_id |= G_RAID3_BUMP_GENID;
1694			g_raid3_event_send(disk,
1695			    G_RAID3_DISK_STATE_DISCONNECTED,
1696			    G_RAID3_EVENT_DONTWAIT);
1697			return;
1698		}
1699		G_RAID3_LOGREQ(3, bp, "Synchronization request finished.");
1700		sync = &disk->d_sync;
1701		if (sync->ds_offset == sc->sc_mediasize / (sc->sc_ndisks - 1) ||
1702		    sync->ds_consumer == NULL ||
1703		    (sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
1704			/* Don't send more synchronization requests. */
1705			sync->ds_inflight--;
1706			if (sync->ds_bios != NULL) {
1707				i = (int)(uintptr_t)bp->bio_caller1;
1708				sync->ds_bios[i] = NULL;
1709			}
1710			free(bp->bio_data, M_RAID3);
1711			g_destroy_bio(bp);
1712			if (sync->ds_inflight > 0)
1713				return;
1714			if (sync->ds_consumer == NULL ||
1715			    (sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
1716				return;
1717			}
1718			/*
1719			 * Disk up-to-date, activate it.
1720			 */
1721			g_raid3_event_send(disk, G_RAID3_DISK_STATE_ACTIVE,
1722			    G_RAID3_EVENT_DONTWAIT);
1723			return;
1724		}
1725
1726		/* Send next synchronization request. */
1727		data = bp->bio_data;
1728		bzero(bp, sizeof(*bp));
1729		bp->bio_cmd = BIO_READ;
1730		bp->bio_offset = sync->ds_offset * (sc->sc_ndisks - 1);
1731		bp->bio_length = MIN(MAXPHYS, sc->sc_mediasize - bp->bio_offset);
1732		sync->ds_offset += bp->bio_length / (sc->sc_ndisks - 1);
1733		bp->bio_done = g_raid3_sync_done;
1734		bp->bio_data = data;
1735		bp->bio_from = sync->ds_consumer;
1736		bp->bio_to = sc->sc_provider;
1737		G_RAID3_LOGREQ(3, bp, "Sending synchronization request.");
1738		sync->ds_consumer->index++;
1739		/*
1740		 * Delay the request if it is colliding with a regular request.
1741		 */
1742		if (g_raid3_regular_collision(sc, bp))
1743			g_raid3_sync_delay(sc, bp);
1744		else
1745			g_io_request(bp, sync->ds_consumer);
1746
1747		/* Release delayed requests if possible. */
1748		g_raid3_regular_release(sc);
1749
1750		/* Find the smallest offset. */
1751		moffset = sc->sc_mediasize;
1752		for (i = 0; i < g_raid3_syncreqs; i++) {
1753			bp = sync->ds_bios[i];
1754			boffset = bp->bio_offset;
1755			if (bp->bio_cmd == BIO_WRITE)
1756				boffset *= sc->sc_ndisks - 1;
1757			if (boffset < moffset)
1758				moffset = boffset;
1759		}
1760		if (sync->ds_offset_done + (MAXPHYS * 100) < moffset) {
1761			/* Update offset_done on every 100 blocks. */
1762			sync->ds_offset_done = moffset;
1763			g_raid3_update_metadata(disk);
1764		}
1765		return;
1766	    }
1767	default:
1768		KASSERT(1 == 0, ("Invalid command here: %u (device=%s)",
1769		    bp->bio_cmd, sc->sc_name));
1770		break;
1771	}
1772}
1773
/*
 * Split a regular request received on the raid3/<name> provider into
 * per-component requests and dispatch them.  READs are sent directly to
 * the component consumers; WRITEs/DELETEs are queued on sc_inflight and
 * handed to g_raid3_scatter().
 *
 * Returns 0 when the request was dispatched or delivered with an error,
 * ENOMEM when cloning the component bios failed and the caller (the
 * worker thread) should retry the request later.
 */
static int
g_raid3_register_request(struct bio *pbp)
{
	struct g_raid3_softc *sc;
	struct g_raid3_disk *disk;
	struct g_consumer *cp;
	struct bio *cbp, *tmpbp;
	off_t offset, length;
	u_int n, ndisks;
	int round_robin, verify;

	ndisks = 0;
	sc = pbp->bio_to->geom->softc;
	/*
	 * A request that must be synchronized with the rebuild (REGSYNC)
	 * is pointless once there is no synchronizing disk any more.
	 */
	if ((pbp->bio_cflags & G_RAID3_BIO_CFLAG_REGSYNC) != 0 &&
	    sc->sc_syncdisk == NULL) {
		g_io_deliver(pbp, EIO);
		return (0);
	}
	g_raid3_init_bio(pbp);
	/* Each data component carries 1/(ndisks-1) of the I/O range. */
	length = pbp->bio_length / (sc->sc_ndisks - 1);
	offset = pbp->bio_offset / (sc->sc_ndisks - 1);
	round_robin = verify = 0;
	switch (pbp->bio_cmd) {
	case BIO_READ:
		/*
		 * In VERIFY mode (COMPLETE state only) read all components,
		 * parity included, so the result can be checked.
		 */
		if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_VERIFY) != 0 &&
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
			pbp->bio_pflags |= G_RAID3_BIO_PFLAG_VERIFY;
			verify = 1;
			ndisks = sc->sc_ndisks;
		} else {
			verify = 0;
			ndisks = sc->sc_ndisks - 1;
		}
		if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0 &&
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
			round_robin = 1;
		} else {
			round_robin = 0;
		}
		KASSERT(!round_robin || !verify,
		    ("ROUND-ROBIN and VERIFY are mutually exclusive."));
		/* Default: the parity component handles a failed read. */
		pbp->bio_driver2 = &sc->sc_disks[sc->sc_ndisks - 1];
		break;
	case BIO_WRITE:
	case BIO_DELETE:
		/*
		 * Delay the request if it is colliding with a synchronization
		 * request.
		 */
		if (g_raid3_sync_collision(sc, pbp)) {
			g_raid3_regular_delay(sc, pbp);
			return (0);
		}

		if (sc->sc_idle)
			g_raid3_unidle(sc);
		else
			sc->sc_last_write = time_uptime;

		ndisks = sc->sc_ndisks;
		break;
	}
	for (n = 0; n < ndisks; n++) {
		disk = &sc->sc_disks[n];
		cbp = g_raid3_clone_bio(sc, pbp);
		if (cbp == NULL) {
			while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL)
				g_raid3_destroy_bio(sc, cbp);
			/*
			 * To prevent deadlock, we must run back up
			 * with the ENOMEM for failed requests of any
			 * of our consumers.  Our own sync requests
			 * can stick around, as they are finite.
			 */
			if ((pbp->bio_cflags &
			    G_RAID3_BIO_CFLAG_REGULAR) != 0) {
				g_io_deliver(pbp, ENOMEM);
				return (0);
			}
			return (ENOMEM);
		}
		cbp->bio_offset = offset;
		cbp->bio_length = length;
		cbp->bio_done = g_raid3_done;
		switch (pbp->bio_cmd) {
		case BIO_READ:
			if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) {
				/*
				 * Replace invalid component with the parity
				 * component.
				 */
				disk = &sc->sc_disks[sc->sc_ndisks - 1];
				cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
				pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED;
			} else if (round_robin &&
			    disk->d_no == sc->sc_round_robin) {
				/*
				 * In round-robin mode skip one data component
				 * and use parity component when reading.
				 */
				pbp->bio_driver2 = disk;
				disk = &sc->sc_disks[sc->sc_ndisks - 1];
				cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
				sc->sc_round_robin++;
				round_robin = 0;
			} else if (verify && disk->d_no == sc->sc_ndisks - 1) {
				cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
			}
			break;
		case BIO_WRITE:
		case BIO_DELETE:
			if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
			    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
				if (n == ndisks - 1) {
					/*
					 * Active parity component, mark it as such.
					 */
					cbp->bio_cflags |=
					    G_RAID3_BIO_CFLAG_PARITY;
				}
			} else {
				pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED;
				if (n == ndisks - 1) {
					/*
					 * Parity component is not connected,
					 * so destroy its request.
					 */
					pbp->bio_pflags |=
					    G_RAID3_BIO_PFLAG_NOPARITY;
					g_raid3_destroy_bio(sc, cbp);
					cbp = NULL;
				} else {
					cbp->bio_cflags |=
					    G_RAID3_BIO_CFLAG_NODISK;
					disk = NULL;
				}
			}
			break;
		}
		if (cbp != NULL)
			cbp->bio_caller2 = disk;
	}
	switch (pbp->bio_cmd) {
	case BIO_READ:
		if (round_robin) {
			/*
			 * If we are in round-robin mode and 'round_robin' is
			 * still 1, it means, that we skipped parity component
			 * for this read and must reset sc_round_robin field.
			 */
			sc->sc_round_robin = 0;
		}
		G_RAID3_FOREACH_SAFE_BIO(pbp, cbp, tmpbp) {
			disk = cbp->bio_caller2;
			cp = disk->d_consumer;
			cbp->bio_to = cp->provider;
			G_RAID3_LOGREQ(3, cbp, "Sending request.");
			KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
			    ("Consumer %s not opened (r%dw%de%d).",
			    cp->provider->name, cp->acr, cp->acw, cp->ace));
			cp->index++;
			g_io_request(cbp, cp);
		}
		break;
	case BIO_WRITE:
	case BIO_DELETE:
		/*
		 * Put request onto inflight queue, so we can check if new
		 * synchronization requests don't collide with it.
		 */
		bioq_insert_tail(&sc->sc_inflight, pbp);

		/*
		 * Bump syncid on first write.
		 */
		if ((sc->sc_bump_id & G_RAID3_BUMP_SYNCID) != 0) {
			sc->sc_bump_id &= ~G_RAID3_BUMP_SYNCID;
			g_raid3_bump_syncid(sc);
		}
		g_raid3_scatter(pbp);
		break;
	}
	return (0);
}
1958
1959static int
1960g_raid3_can_destroy(struct g_raid3_softc *sc)
1961{
1962	struct g_geom *gp;
1963	struct g_consumer *cp;
1964
1965	g_topology_assert();
1966	gp = sc->sc_geom;
1967	if (gp->softc == NULL)
1968		return (1);
1969	LIST_FOREACH(cp, &gp->consumer, consumer) {
1970		if (g_raid3_is_busy(sc, cp))
1971			return (0);
1972	}
1973	gp = sc->sc_sync.ds_geom;
1974	LIST_FOREACH(cp, &gp->consumer, consumer) {
1975		if (g_raid3_is_busy(sc, cp))
1976			return (0);
1977	}
1978	G_RAID3_DEBUG(2, "No I/O requests for %s, it can be destroyed.",
1979	    sc->sc_name);
1980	return (1);
1981}
1982
/*
 * Try to tear the device down after the DESTROY flag was set.  Returns 1
 * when destruction proceeded (caller — the worker thread — must exit),
 * or 0 when some consumer is still busy and the attempt must be retried.
 */
static int
g_raid3_try_destroy(struct g_raid3_softc *sc)
{

	g_topology_assert_not();
	sx_assert(&sc->sc_lock, SX_XLOCKED);

	if (sc->sc_rootmount != NULL) {
		/* Release the root-mount hold so boot is not blocked. */
		G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", __LINE__,
		    sc->sc_rootmount);
		root_mount_rel(sc->sc_rootmount);
		sc->sc_rootmount = NULL;
	}

	g_topology_lock();
	if (!g_raid3_can_destroy(sc)) {
		/* Still busy; bail out and let the worker retry later. */
		g_topology_unlock();
		return (0);
	}
	/* Detach softc from both geoms so new requests cannot find it. */
	sc->sc_geom->softc = NULL;
	sc->sc_sync.ds_geom->softc = NULL;
	if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_WAIT) != 0) {
		/* Someone waits for destruction; let them free the softc. */
		g_topology_unlock();
		G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__,
		    &sc->sc_worker);
		/* Unlock sc_lock here, as it can be destroyed after wakeup. */
		sx_xunlock(&sc->sc_lock);
		wakeup(&sc->sc_worker);
		sc->sc_worker = NULL;
	} else {
		g_topology_unlock();
		g_raid3_destroy_device(sc);
		free(sc->sc_disks, M_RAID3);
		free(sc, M_RAID3);
	}
	return (1);
}
2020
/*
 * Worker thread.
 *
 * Main loop of the per-device kernel thread: events (disk/device state
 * changes) are always processed before I/O requests, then regular and
 * synchronization bios are taken from sc_queue and dispatched.  The
 * thread also retries device destruction and exits once it succeeds.
 */
static void
g_raid3_worker(void *arg)
{
	struct g_raid3_softc *sc;
	struct g_raid3_event *ep;
	struct bio *bp;
	int timeout;

	sc = arg;
	thread_lock(curthread);
	sched_prio(curthread, PRIBIO);
	thread_unlock(curthread);

	sx_xlock(&sc->sc_lock);
	for (;;) {
		G_RAID3_DEBUG(5, "%s: Let's see...", __func__);
		/*
		 * First take a look at events.
		 * This is important to handle events before any I/O requests.
		 */
		ep = g_raid3_event_get(sc);
		if (ep != NULL) {
			g_raid3_event_remove(sc, ep);
			if ((ep->e_flags & G_RAID3_EVENT_DEVICE) != 0) {
				/* Update only device status. */
				G_RAID3_DEBUG(3,
				    "Running event for device %s.",
				    sc->sc_name);
				ep->e_error = 0;
				g_raid3_update_device(sc, 1);
			} else {
				/* Update disk status. */
				G_RAID3_DEBUG(3, "Running event for disk %s.",
				     g_raid3_get_diskname(ep->e_disk));
				ep->e_error = g_raid3_update_disk(ep->e_disk,
				    ep->e_state);
				if (ep->e_error == 0)
					g_raid3_update_device(sc, 0);
			}
			if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0) {
				KASSERT(ep->e_error == 0,
				    ("Error cannot be handled."));
				g_raid3_event_free(ep);
			} else {
				/* A thread waits for this event's result. */
				ep->e_flags |= G_RAID3_EVENT_DONE;
				G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__,
				    ep);
				mtx_lock(&sc->sc_events_mtx);
				wakeup(ep);
				mtx_unlock(&sc->sc_events_mtx);
			}
			if ((sc->sc_flags &
			    G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
				if (g_raid3_try_destroy(sc)) {
					curthread->td_pflags &= ~TDP_GEOM;
					G_RAID3_DEBUG(1, "Thread exiting.");
					kproc_exit(0);
				}
			}
			G_RAID3_DEBUG(5, "%s: I'm here 1.", __func__);
			continue;
		}
		/*
		 * Check if we can mark array as CLEAN and if we can't take
		 * how much seconds should we wait.
		 */
		timeout = g_raid3_idle(sc, -1);
		/*
		 * Now I/O requests.
		 */
		/* Get first request from the queue. */
		mtx_lock(&sc->sc_queue_mtx);
		bp = bioq_first(&sc->sc_queue);
		if (bp == NULL) {
			if ((sc->sc_flags &
			    G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
				mtx_unlock(&sc->sc_queue_mtx);
				if (g_raid3_try_destroy(sc)) {
					curthread->td_pflags &= ~TDP_GEOM;
					G_RAID3_DEBUG(1, "Thread exiting.");
					kproc_exit(0);
				}
				mtx_lock(&sc->sc_queue_mtx);
			}
			sx_xunlock(&sc->sc_lock);
			/*
			 * XXX: We can miss an event here, because an event
			 *      can be added without sx-device-lock and without
			 *      mtx-queue-lock. Maybe I should just stop using
			 *      dedicated mutex for events synchronization and
			 *      stick with the queue lock?
			 *      The event will hang here until next I/O request
			 *      or next event is received.
			 */
			MSLEEP(sc, &sc->sc_queue_mtx, PRIBIO | PDROP, "r3:w1",
			    timeout * hz);
			sx_xlock(&sc->sc_lock);
			G_RAID3_DEBUG(5, "%s: I'm here 4.", __func__);
			continue;
		}
process:
		/* Entered with sc_queue_mtx held and 'bp' on the queue. */
		bioq_remove(&sc->sc_queue, bp);
		mtx_unlock(&sc->sc_queue_mtx);

		if (bp->bio_from->geom == sc->sc_sync.ds_geom &&
		    (bp->bio_cflags & G_RAID3_BIO_CFLAG_SYNC) != 0) {
			g_raid3_sync_request(bp);	/* READ */
		} else if (bp->bio_to != sc->sc_provider) {
			if ((bp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR) != 0)
				g_raid3_regular_request(bp);
			else if ((bp->bio_cflags & G_RAID3_BIO_CFLAG_SYNC) != 0)
				g_raid3_sync_request(bp);	/* WRITE */
			else {
				KASSERT(0,
				    ("Invalid request cflags=0x%hhx to=%s.",
				    bp->bio_cflags, bp->bio_to->name));
			}
		} else if (g_raid3_register_request(bp) != 0) {
			/* ENOMEM: put the request back and look for work. */
			mtx_lock(&sc->sc_queue_mtx);
			bioq_insert_head(&sc->sc_queue, bp);
			/*
			 * We are short in memory, let see if there are finished
			 * request we can free.
			 */
			TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) {
				if (bp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR)
					goto process;
			}
			/*
			 * No finished regular request, so at least keep
			 * synchronization running.
			 */
			TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) {
				if (bp->bio_cflags & G_RAID3_BIO_CFLAG_SYNC)
					goto process;
			}
			sx_xunlock(&sc->sc_lock);
			MSLEEP(&sc->sc_queue, &sc->sc_queue_mtx, PRIBIO | PDROP,
			    "r3:lowmem", hz / 10);
			sx_xlock(&sc->sc_lock);
		}
		G_RAID3_DEBUG(5, "%s: I'm here 9.", __func__);
	}
}
2168
2169static void
2170g_raid3_update_idle(struct g_raid3_softc *sc, struct g_raid3_disk *disk)
2171{
2172
2173	sx_assert(&sc->sc_lock, SX_LOCKED);
2174	if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) != 0)
2175		return;
2176	if (!sc->sc_idle && (disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) == 0) {
2177		G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.",
2178		    g_raid3_get_diskname(disk), sc->sc_name);
2179		disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
2180	} else if (sc->sc_idle &&
2181	    (disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) != 0) {
2182		G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.",
2183		    g_raid3_get_diskname(disk), sc->sc_name);
2184		disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
2185	}
2186}
2187
/*
 * Start synchronization of the (single) SYNCHRONIZING component: open a
 * special consumer attached to our own provider, allocate the pool of
 * synchronization bios and fire off the initial read requests.  Called
 * with the device in DEGRADED state; a no-op when no component needs
 * synchronization.
 */
static void
g_raid3_sync_start(struct g_raid3_softc *sc)
{
	struct g_raid3_disk *disk;
	struct g_consumer *cp;
	struct bio *bp;
	int error;
	u_int n;

	g_topology_assert_not();
	sx_assert(&sc->sc_lock, SX_XLOCKED);

	KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED,
	    ("Device not in DEGRADED state (%s, %u).", sc->sc_name,
	    sc->sc_state));
	KASSERT(sc->sc_syncdisk == NULL, ("Syncdisk is not NULL (%s, %u).",
	    sc->sc_name, sc->sc_state));
	/* Find the first component marked for synchronization, if any. */
	disk = NULL;
	for (n = 0; n < sc->sc_ndisks; n++) {
		if (sc->sc_disks[n].d_state != G_RAID3_DISK_STATE_SYNCHRONIZING)
			continue;
		disk = &sc->sc_disks[n];
		break;
	}
	if (disk == NULL)
		return;

	/* Drop sc_lock before taking the topology lock (lock order). */
	sx_xunlock(&sc->sc_lock);
	g_topology_lock();
	cp = g_new_consumer(sc->sc_sync.ds_geom);
	error = g_attach(cp, sc->sc_provider);
	KASSERT(error == 0,
	    ("Cannot attach to %s (error=%d).", sc->sc_name, error));
	error = g_access(cp, 1, 0, 0);
	KASSERT(error == 0, ("Cannot open %s (error=%d).", sc->sc_name, error));
	g_topology_unlock();
	sx_xlock(&sc->sc_lock);

	G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s.", sc->sc_name,
	    g_raid3_get_diskname(disk));
	if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) == 0)
		disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
	KASSERT(disk->d_sync.ds_consumer == NULL,
	    ("Sync consumer already exists (device=%s, disk=%s).",
	    sc->sc_name, g_raid3_get_diskname(disk)));

	disk->d_sync.ds_consumer = cp;
	disk->d_sync.ds_consumer->private = disk;
	disk->d_sync.ds_consumer->index = 0;
	sc->sc_syncdisk = disk;

	/*
	 * Allocate memory for synchronization bios and initialize them.
	 */
	disk->d_sync.ds_bios = malloc(sizeof(struct bio *) * g_raid3_syncreqs,
	    M_RAID3, M_WAITOK);
	for (n = 0; n < g_raid3_syncreqs; n++) {
		bp = g_alloc_bio();
		disk->d_sync.ds_bios[n] = bp;
		bp->bio_parent = NULL;
		bp->bio_cmd = BIO_READ;
		bp->bio_data = malloc(MAXPHYS, M_RAID3, M_WAITOK);
		bp->bio_cflags = 0;
		/* ds_offset is per-component; device offset is scaled up. */
		bp->bio_offset = disk->d_sync.ds_offset * (sc->sc_ndisks - 1);
		bp->bio_length = MIN(MAXPHYS, sc->sc_mediasize - bp->bio_offset);
		disk->d_sync.ds_offset += bp->bio_length / (sc->sc_ndisks - 1);
		bp->bio_done = g_raid3_sync_done;
		bp->bio_from = disk->d_sync.ds_consumer;
		bp->bio_to = sc->sc_provider;
		/* Remember the slot index for cleanup in sync_request(). */
		bp->bio_caller1 = (void *)(uintptr_t)n;
	}

	/* Set the number of in-flight synchronization requests. */
	disk->d_sync.ds_inflight = g_raid3_syncreqs;

	/*
	 * Fire off first synchronization requests.
	 */
	for (n = 0; n < g_raid3_syncreqs; n++) {
		bp = disk->d_sync.ds_bios[n];
		G_RAID3_LOGREQ(3, bp, "Sending synchronization request.");
		disk->d_sync.ds_consumer->index++;
		/*
		 * Delay the request if it is colliding with a regular request.
		 */
		if (g_raid3_regular_collision(sc, bp))
			g_raid3_sync_delay(sc, bp);
		else
			g_io_request(bp, disk->d_sync.ds_consumer);
	}
}
2279
/*
 * Stop synchronization process.
 * type: 0 - synchronization finished
 *       1 - synchronization stopped
 *
 * Frees the synchronization bio array and kills the sync consumer.
 * A no-op when the consumer was already torn down.
 */
static void
g_raid3_sync_stop(struct g_raid3_softc *sc, int type)
{
	struct g_raid3_disk *disk;
	struct g_consumer *cp;

	g_topology_assert_not();
	sx_assert(&sc->sc_lock, SX_LOCKED);

	KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED,
	    ("Device not in DEGRADED state (%s, %u).", sc->sc_name,
	    sc->sc_state));
	disk = sc->sc_syncdisk;
	sc->sc_syncdisk = NULL;
	KASSERT(disk != NULL, ("No disk was synchronized (%s).", sc->sc_name));
	KASSERT(disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
	    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
	    g_raid3_disk_state2str(disk->d_state)));
	if (disk->d_sync.ds_consumer == NULL)
		return;

	if (type == 0) {
		G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s finished.",
		    sc->sc_name, g_raid3_get_diskname(disk));
	} else /* if (type == 1) */ {
		G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s stopped.",
		    sc->sc_name, g_raid3_get_diskname(disk));
	}
	free(disk->d_sync.ds_bios, M_RAID3);
	disk->d_sync.ds_bios = NULL;
	cp = disk->d_sync.ds_consumer;
	disk->d_sync.ds_consumer = NULL;
	disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
	sx_xunlock(&sc->sc_lock); /* Avoid recursion on sc_lock. */
	g_topology_lock();
	g_raid3_kill_consumer(sc, cp);
	g_topology_unlock();
	sx_xlock(&sc->sc_lock);
}
2324
2325static void
2326g_raid3_launch_provider(struct g_raid3_softc *sc)
2327{
2328	struct g_provider *pp;
2329	struct g_raid3_disk *disk;
2330	int n;
2331
2332	sx_assert(&sc->sc_lock, SX_LOCKED);
2333
2334	g_topology_lock();
2335	pp = g_new_providerf(sc->sc_geom, "raid3/%s", sc->sc_name);
2336	pp->mediasize = sc->sc_mediasize;
2337	pp->sectorsize = sc->sc_sectorsize;
2338	pp->stripesize = 0;
2339	pp->stripeoffset = 0;
2340	for (n = 0; n < sc->sc_ndisks; n++) {
2341		disk = &sc->sc_disks[n];
2342		if (disk->d_consumer && disk->d_consumer->provider &&
2343		    disk->d_consumer->provider->stripesize > pp->stripesize) {
2344			pp->stripesize = disk->d_consumer->provider->stripesize;
2345			pp->stripeoffset = disk->d_consumer->provider->stripeoffset;
2346		}
2347	}
2348	pp->stripesize *= sc->sc_ndisks - 1;
2349	pp->stripeoffset *= sc->sc_ndisks - 1;
2350	sc->sc_provider = pp;
2351	g_error_provider(pp, 0);
2352	g_topology_unlock();
2353	G_RAID3_DEBUG(0, "Device %s launched (%u/%u).", pp->name,
2354	    g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE), sc->sc_ndisks);
2355
2356	if (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED)
2357		g_raid3_sync_start(sc);
2358}
2359
/*
 * Withdraw the raid3/<name> provider: fail every queued request with
 * ENXIO, wither and orphan the provider, and stop any synchronization
 * still in progress.
 */
static void
g_raid3_destroy_provider(struct g_raid3_softc *sc)
{
	struct bio *bp;

	g_topology_assert_not();
	KASSERT(sc->sc_provider != NULL, ("NULL provider (device=%s).",
	    sc->sc_name));

	g_topology_lock();
	g_error_provider(sc->sc_provider, ENXIO);
	mtx_lock(&sc->sc_queue_mtx);
	/* Drain the request queue; nothing can complete any more. */
	while ((bp = bioq_first(&sc->sc_queue)) != NULL) {
		bioq_remove(&sc->sc_queue, bp);
		g_io_deliver(bp, ENXIO);
	}
	mtx_unlock(&sc->sc_queue_mtx);
	G_RAID3_DEBUG(0, "Device %s: provider %s destroyed.", sc->sc_name,
	    sc->sc_provider->name);
	sc->sc_provider->flags |= G_PF_WITHER;
	g_orphan_provider(sc->sc_provider, ENXIO);
	g_topology_unlock();
	sc->sc_provider = NULL;
	if (sc->sc_syncdisk != NULL)
		g_raid3_sync_stop(sc, 1);
}
2386
2387static void
2388g_raid3_go(void *arg)
2389{
2390	struct g_raid3_softc *sc;
2391
2392	sc = arg;
2393	G_RAID3_DEBUG(0, "Force device %s start due to timeout.", sc->sc_name);
2394	g_raid3_event_send(sc, 0,
2395	    G_RAID3_EVENT_DONTWAIT | G_RAID3_EVENT_DEVICE);
2396}
2397
/*
 * Decide the target state for a newly arrived component by comparing its
 * on-disk syncid with the device syncid: ACTIVE when up-to-date,
 * SYNCHRONIZING or STALE when behind (depending on autosync settings),
 * and NONE when the component is fresher than the running device — in
 * that case the disk is destroyed here and must not be touched again.
 */
static u_int
g_raid3_determine_state(struct g_raid3_disk *disk)
{
	struct g_raid3_softc *sc;
	u_int state;

	sc = disk->d_softc;
	if (sc->sc_syncid == disk->d_sync.ds_syncid) {
		if ((disk->d_flags &
		    G_RAID3_DISK_FLAG_SYNCHRONIZING) == 0) {
			/* Disk does not need synchronization. */
			state = G_RAID3_DISK_STATE_ACTIVE;
		} else {
			if ((sc->sc_flags &
			     G_RAID3_DEVICE_FLAG_NOAUTOSYNC) == 0 ||
			    (disk->d_flags &
			     G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) {
				/*
				 * We can start synchronization from
				 * the stored offset.
				 */
				state = G_RAID3_DISK_STATE_SYNCHRONIZING;
			} else {
				state = G_RAID3_DISK_STATE_STALE;
			}
		}
	} else if (disk->d_sync.ds_syncid < sc->sc_syncid) {
		/*
		 * Reset all synchronization data for this disk,
		 * because if it even was synchronized, it was
		 * synchronized to disks with different syncid.
		 */
		disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING;
		disk->d_sync.ds_offset = 0;
		disk->d_sync.ds_offset_done = 0;
		disk->d_sync.ds_syncid = sc->sc_syncid;
		if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) == 0 ||
		    (disk->d_flags & G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) {
			state = G_RAID3_DISK_STATE_SYNCHRONIZING;
		} else {
			state = G_RAID3_DISK_STATE_STALE;
		}
	} else /* if (sc->sc_syncid < disk->d_sync.ds_syncid) */ {
		/*
		 * Not good, NOT GOOD!
		 * It means that device was started on stale disks
		 * and more fresh disk just arrive.
		 * If there were writes, device is broken, sorry.
		 * I think the best choice here is don't touch
		 * this disk and inform the user loudly.
		 */
		G_RAID3_DEBUG(0, "Device %s was started before the freshest "
		    "disk (%s) arrives!! It will not be connected to the "
		    "running device.", sc->sc_name,
		    g_raid3_get_diskname(disk));
		g_raid3_destroy_disk(disk);
		state = G_RAID3_DISK_STATE_NONE;
		/* Return immediately, because disk was destroyed. */
		return (state);
	}
	G_RAID3_DEBUG(3, "State for %s disk: %s.",
	    g_raid3_get_diskname(disk), g_raid3_disk_state2str(state));
	return (state);
}
2462
/*
 * Update device state.
 *
 * Runs the device state machine: in STARTING, pick the freshest set of
 * components (by genid/syncid) and transition to DEGRADED or COMPLETE;
 * in DEGRADED/COMPLETE, bump genid if pending, launch or destroy the
 * provider as the number of ACTIVE components changes.  'force' is
 * non-zero when invoked from the start-up timeout.
 */
static void
g_raid3_update_device(struct g_raid3_softc *sc, boolean_t force)
{
	struct g_raid3_disk *disk;
	u_int state;

	sx_assert(&sc->sc_lock, SX_XLOCKED);

	switch (sc->sc_state) {
	case G_RAID3_DEVICE_STATE_STARTING:
	    {
		u_int n, ndirty, ndisks, genid, syncid;

		KASSERT(sc->sc_provider == NULL,
		    ("Non-NULL provider in STARTING state (%s).", sc->sc_name));
		/*
		 * Are we ready? We are, if all disks are connected or
		 * one disk is missing and 'force' is true.
		 */
		if (g_raid3_ndisks(sc, -1) + force == sc->sc_ndisks) {
			if (!force)
				callout_drain(&sc->sc_callout);
		} else {
			if (force) {
				/*
				 * Timeout expired, so destroy device.
				 */
				sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
				G_RAID3_DEBUG(1, "root_mount_rel[%u] %p",
				    __LINE__, sc->sc_rootmount);
				root_mount_rel(sc->sc_rootmount);
				sc->sc_rootmount = NULL;
			}
			return;
		}

		/*
		 * Find the biggest genid.
		 */
		genid = 0;
		for (n = 0; n < sc->sc_ndisks; n++) {
			disk = &sc->sc_disks[n];
			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
				continue;
			if (disk->d_genid > genid)
				genid = disk->d_genid;
		}
		sc->sc_genid = genid;
		/*
		 * Remove all disks without the biggest genid.
		 */
		for (n = 0; n < sc->sc_ndisks; n++) {
			disk = &sc->sc_disks[n];
			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
				continue;
			if (disk->d_genid < genid) {
				G_RAID3_DEBUG(0,
				    "Component %s (device %s) broken, skipping.",
				    g_raid3_get_diskname(disk), sc->sc_name);
				g_raid3_destroy_disk(disk);
			}
		}

		/*
		 * There must be at least 'sc->sc_ndisks - 1' components
		 * with the same syncid and without SYNCHRONIZING flag.
		 */

		/*
		 * Find the biggest syncid, number of valid components and
		 * number of dirty components.
		 */
		ndirty = ndisks = syncid = 0;
		for (n = 0; n < sc->sc_ndisks; n++) {
			disk = &sc->sc_disks[n];
			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
				continue;
			if ((disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) != 0)
				ndirty++;
			if (disk->d_sync.ds_syncid > syncid) {
				/* Fresher syncid found; restart the count. */
				syncid = disk->d_sync.ds_syncid;
				ndisks = 0;
			} else if (disk->d_sync.ds_syncid < syncid) {
				continue;
			}
			if ((disk->d_flags &
			    G_RAID3_DISK_FLAG_SYNCHRONIZING) != 0) {
				continue;
			}
			ndisks++;
		}
		/*
		 * Do we have enough valid components?
		 */
		if (ndisks + 1 < sc->sc_ndisks) {
			G_RAID3_DEBUG(0,
			    "Device %s is broken, too few valid components.",
			    sc->sc_name);
			sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
			return;
		}
		/*
		 * If there is one DIRTY component and all disks are present,
		 * mark it for synchronization. If there is more than one DIRTY
		 * component, mark parity component for synchronization.
		 */
		if (ndisks == sc->sc_ndisks && ndirty == 1) {
			for (n = 0; n < sc->sc_ndisks; n++) {
				disk = &sc->sc_disks[n];
				if ((disk->d_flags &
				    G_RAID3_DISK_FLAG_DIRTY) == 0) {
					continue;
				}
				disk->d_flags |=
				    G_RAID3_DISK_FLAG_SYNCHRONIZING;
			}
		} else if (ndisks == sc->sc_ndisks && ndirty > 1) {
			disk = &sc->sc_disks[sc->sc_ndisks - 1];
			disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING;
		}

		sc->sc_syncid = syncid;
		if (force) {
			/* Remember to bump syncid on first write. */
			sc->sc_bump_id |= G_RAID3_BUMP_SYNCID;
		}
		if (ndisks == sc->sc_ndisks)
			state = G_RAID3_DEVICE_STATE_COMPLETE;
		else /* if (ndisks == sc->sc_ndisks - 1) */
			state = G_RAID3_DEVICE_STATE_DEGRADED;
		G_RAID3_DEBUG(1, "Device %s state changed from %s to %s.",
		    sc->sc_name, g_raid3_device_state2str(sc->sc_state),
		    g_raid3_device_state2str(state));
		sc->sc_state = state;
		for (n = 0; n < sc->sc_ndisks; n++) {
			disk = &sc->sc_disks[n];
			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
				continue;
			state = g_raid3_determine_state(disk);
			g_raid3_event_send(disk, state, G_RAID3_EVENT_DONTWAIT);
			if (state == G_RAID3_DISK_STATE_STALE)
				sc->sc_bump_id |= G_RAID3_BUMP_SYNCID;
		}
		break;
	    }
	case G_RAID3_DEVICE_STATE_DEGRADED:
		/*
		 * Genid need to be bumped immediately, so do it here.
		 */
		if ((sc->sc_bump_id & G_RAID3_BUMP_GENID) != 0) {
			sc->sc_bump_id &= ~G_RAID3_BUMP_GENID;
			g_raid3_bump_genid(sc);
		}

		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NEW) > 0)
			return;
		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) <
		    sc->sc_ndisks - 1) {
			/* Too few ACTIVE components, device cannot run. */
			if (sc->sc_provider != NULL)
				g_raid3_destroy_provider(sc);
			sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
			return;
		}
		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) ==
		    sc->sc_ndisks) {
			state = G_RAID3_DEVICE_STATE_COMPLETE;
			G_RAID3_DEBUG(1,
			    "Device %s state changed from %s to %s.",
			    sc->sc_name, g_raid3_device_state2str(sc->sc_state),
			    g_raid3_device_state2str(state));
			sc->sc_state = state;
		}
		if (sc->sc_provider == NULL)
			g_raid3_launch_provider(sc);
		if (sc->sc_rootmount != NULL) {
			G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", __LINE__,
			    sc->sc_rootmount);
			root_mount_rel(sc->sc_rootmount);
			sc->sc_rootmount = NULL;
		}
		break;
	case G_RAID3_DEVICE_STATE_COMPLETE:
		/*
		 * Genid need to be bumped immediately, so do it here.
		 */
		if ((sc->sc_bump_id & G_RAID3_BUMP_GENID) != 0) {
			sc->sc_bump_id &= ~G_RAID3_BUMP_GENID;
			g_raid3_bump_genid(sc);
		}

		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NEW) > 0)
			return;
		KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) >=
		    sc->sc_ndisks - 1,
		    ("Too few ACTIVE components in COMPLETE state (device %s).",
		    sc->sc_name));
		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) ==
		    sc->sc_ndisks - 1) {
			state = G_RAID3_DEVICE_STATE_DEGRADED;
			G_RAID3_DEBUG(1,
			    "Device %s state changed from %s to %s.",
			    sc->sc_name, g_raid3_device_state2str(sc->sc_state),
			    g_raid3_device_state2str(state));
			sc->sc_state = state;
		}
		if (sc->sc_provider == NULL)
			g_raid3_launch_provider(sc);
		if (sc->sc_rootmount != NULL) {
			G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", __LINE__,
			    sc->sc_rootmount);
			root_mount_rel(sc->sc_rootmount);
			sc->sc_rootmount = NULL;
		}
		break;
	default:
		KASSERT(1 == 0, ("Wrong device state (%s, %s).", sc->sc_name,
		    g_raid3_device_state2str(sc->sc_state)));
		break;
	}
}
2686
/*
 * Update disk state and device state if needed.
 */
/*
 * Log a disk state transition; expects 'disk', 'state' and 'sc' to be
 * in scope at the expansion site.
 */
#define	DISK_STATE_CHANGED()	G_RAID3_DEBUG(1,			\
	"Disk %s state changed from %s to %s (device %s).",		\
	g_raid3_get_diskname(disk),					\
	g_raid3_disk_state2str(disk->d_state),				\
	g_raid3_disk_state2str(state), sc->sc_name)
2695static int
2696g_raid3_update_disk(struct g_raid3_disk *disk, u_int state)
2697{
2698	struct g_raid3_softc *sc;
2699
2700	sc = disk->d_softc;
2701	sx_assert(&sc->sc_lock, SX_XLOCKED);
2702
2703again:
2704	G_RAID3_DEBUG(3, "Changing disk %s state from %s to %s.",
2705	    g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state),
2706	    g_raid3_disk_state2str(state));
2707	switch (state) {
2708	case G_RAID3_DISK_STATE_NEW:
2709		/*
2710		 * Possible scenarios:
2711		 * 1. New disk arrive.
2712		 */
2713		/* Previous state should be NONE. */
2714		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NONE,
2715		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
2716		    g_raid3_disk_state2str(disk->d_state)));
2717		DISK_STATE_CHANGED();
2718
2719		disk->d_state = state;
2720		G_RAID3_DEBUG(1, "Device %s: provider %s detected.",
2721		    sc->sc_name, g_raid3_get_diskname(disk));
2722		if (sc->sc_state == G_RAID3_DEVICE_STATE_STARTING)
2723			break;
2724		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
2725		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
2726		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
2727		    g_raid3_device_state2str(sc->sc_state),
2728		    g_raid3_get_diskname(disk),
2729		    g_raid3_disk_state2str(disk->d_state)));
2730		state = g_raid3_determine_state(disk);
2731		if (state != G_RAID3_DISK_STATE_NONE)
2732			goto again;
2733		break;
2734	case G_RAID3_DISK_STATE_ACTIVE:
2735		/*
2736		 * Possible scenarios:
2737		 * 1. New disk does not need synchronization.
2738		 * 2. Synchronization process finished successfully.
2739		 */
2740		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
2741		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
2742		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
2743		    g_raid3_device_state2str(sc->sc_state),
2744		    g_raid3_get_diskname(disk),
2745		    g_raid3_disk_state2str(disk->d_state)));
2746		/* Previous state should be NEW or SYNCHRONIZING. */
2747		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW ||
2748		    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
2749		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
2750		    g_raid3_disk_state2str(disk->d_state)));
2751		DISK_STATE_CHANGED();
2752
2753		if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
2754			disk->d_flags &= ~G_RAID3_DISK_FLAG_SYNCHRONIZING;
2755			disk->d_flags &= ~G_RAID3_DISK_FLAG_FORCE_SYNC;
2756			g_raid3_sync_stop(sc, 0);
2757		}
2758		disk->d_state = state;
2759		disk->d_sync.ds_offset = 0;
2760		disk->d_sync.ds_offset_done = 0;
2761		g_raid3_update_idle(sc, disk);
2762		g_raid3_update_metadata(disk);
2763		G_RAID3_DEBUG(1, "Device %s: provider %s activated.",
2764		    sc->sc_name, g_raid3_get_diskname(disk));
2765		break;
2766	case G_RAID3_DISK_STATE_STALE:
2767		/*
2768		 * Possible scenarios:
2769		 * 1. Stale disk was connected.
2770		 */
2771		/* Previous state should be NEW. */
2772		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
2773		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
2774		    g_raid3_disk_state2str(disk->d_state)));
2775		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
2776		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
2777		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
2778		    g_raid3_device_state2str(sc->sc_state),
2779		    g_raid3_get_diskname(disk),
2780		    g_raid3_disk_state2str(disk->d_state)));
2781		/*
2782		 * STALE state is only possible if device is marked
2783		 * NOAUTOSYNC.
2784		 */
2785		KASSERT((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) != 0,
2786		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
2787		    g_raid3_device_state2str(sc->sc_state),
2788		    g_raid3_get_diskname(disk),
2789		    g_raid3_disk_state2str(disk->d_state)));
2790		DISK_STATE_CHANGED();
2791
2792		disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
2793		disk->d_state = state;
2794		g_raid3_update_metadata(disk);
2795		G_RAID3_DEBUG(0, "Device %s: provider %s is stale.",
2796		    sc->sc_name, g_raid3_get_diskname(disk));
2797		break;
2798	case G_RAID3_DISK_STATE_SYNCHRONIZING:
2799		/*
2800		 * Possible scenarios:
2801		 * 1. Disk which needs synchronization was connected.
2802		 */
2803		/* Previous state should be NEW. */
2804		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
2805		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
2806		    g_raid3_disk_state2str(disk->d_state)));
2807		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
2808		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
2809		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
2810		    g_raid3_device_state2str(sc->sc_state),
2811		    g_raid3_get_diskname(disk),
2812		    g_raid3_disk_state2str(disk->d_state)));
2813		DISK_STATE_CHANGED();
2814
2815		if (disk->d_state == G_RAID3_DISK_STATE_NEW)
2816			disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
2817		disk->d_state = state;
2818		if (sc->sc_provider != NULL) {
2819			g_raid3_sync_start(sc);
2820			g_raid3_update_metadata(disk);
2821		}
2822		break;
2823	case G_RAID3_DISK_STATE_DISCONNECTED:
2824		/*
2825		 * Possible scenarios:
2826		 * 1. Device wasn't running yet, but disk disappear.
2827		 * 2. Disk was active and disapppear.
2828		 * 3. Disk disappear during synchronization process.
2829		 */
2830		if (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
2831		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
2832			/*
2833			 * Previous state should be ACTIVE, STALE or
2834			 * SYNCHRONIZING.
2835			 */
2836			KASSERT(disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
2837			    disk->d_state == G_RAID3_DISK_STATE_STALE ||
2838			    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
2839			    ("Wrong disk state (%s, %s).",
2840			    g_raid3_get_diskname(disk),
2841			    g_raid3_disk_state2str(disk->d_state)));
2842		} else if (sc->sc_state == G_RAID3_DEVICE_STATE_STARTING) {
2843			/* Previous state should be NEW. */
2844			KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
2845			    ("Wrong disk state (%s, %s).",
2846			    g_raid3_get_diskname(disk),
2847			    g_raid3_disk_state2str(disk->d_state)));
2848			/*
2849			 * Reset bumping syncid if disk disappeared in STARTING
2850			 * state.
2851			 */
2852			if ((sc->sc_bump_id & G_RAID3_BUMP_SYNCID) != 0)
2853				sc->sc_bump_id &= ~G_RAID3_BUMP_SYNCID;
2854#ifdef	INVARIANTS
2855		} else {
2856			KASSERT(1 == 0, ("Wrong device state (%s, %s, %s, %s).",
2857			    sc->sc_name,
2858			    g_raid3_device_state2str(sc->sc_state),
2859			    g_raid3_get_diskname(disk),
2860			    g_raid3_disk_state2str(disk->d_state)));
2861#endif
2862		}
2863		DISK_STATE_CHANGED();
2864		G_RAID3_DEBUG(0, "Device %s: provider %s disconnected.",
2865		    sc->sc_name, g_raid3_get_diskname(disk));
2866
2867		g_raid3_destroy_disk(disk);
2868		break;
2869	default:
2870		KASSERT(1 == 0, ("Unknown state (%u).", state));
2871		break;
2872	}
2873	return (0);
2874}
2875#undef	DISK_STATE_CHANGED
2876
/*
 * Read and decode the on-disk RAID3 metadata of consumer 'cp' into 'md'.
 * The metadata lives in the provider's last sector.  Called with the
 * topology lock held; the lock is dropped around the actual I/O.
 * Returns 0 on success or an errno value (EINVAL for bad/unsupported
 * metadata, or the I/O / decode error).
 */
int
g_raid3_read_metadata(struct g_consumer *cp, struct g_raid3_metadata *md)
{
	struct g_provider *pp;
	u_char *buf;
	int error;

	g_topology_assert();

	/* Open the consumer read-only just for the metadata read. */
	error = g_access(cp, 1, 0, 0);
	if (error != 0)
		return (error);
	pp = cp->provider;
	g_topology_unlock();
	/* Metadata are stored on last sector. */
	buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize,
	    &error);
	g_topology_lock();
	g_access(cp, -1, 0, 0);
	if (buf == NULL) {
		G_RAID3_DEBUG(1, "Cannot read metadata from %s (error=%d).",
		    cp->provider->name, error);
		return (error);
	}

	/* Decode metadata. */
	error = raid3_metadata_decode(buf, md);
	g_free(buf);
	/*
	 * Magic and version are checked before the decode (MD5) error so
	 * that non-raid3 providers are rejected quietly with EINVAL and a
	 * too-new version gets its own diagnostic.
	 */
	if (strcmp(md->md_magic, G_RAID3_MAGIC) != 0)
		return (EINVAL);
	if (md->md_version > G_RAID3_VERSION) {
		G_RAID3_DEBUG(0,
		    "Kernel module is too old to handle metadata from %s.",
		    cp->provider->name);
		return (EINVAL);
	}
	if (error != 0) {
		G_RAID3_DEBUG(1, "MD5 metadata hash mismatch for provider %s.",
		    cp->provider->name);
		return (error);
	}
	/* The device sector size must fit into a single physio buffer. */
	if (md->md_sectorsize > MAXPHYS) {
		G_RAID3_DEBUG(0, "The blocksize is too big.");
		return (EINVAL);
	}

	return (0);
}
2925
/*
 * Validate metadata 'md', read from provider 'pp', against the already
 * configured device 'sc' before the provider is accepted as a component.
 * Returns 0 if the component is acceptable, EEXIST if its slot is taken,
 * or EINVAL for any inconsistency.
 */
static int
g_raid3_check_metadata(struct g_raid3_softc *sc, struct g_provider *pp,
    struct g_raid3_metadata *md)
{

	/* Component number must be within the configured array. */
	if (md->md_no >= sc->sc_ndisks) {
		G_RAID3_DEBUG(1, "Invalid disk %s number (no=%u), skipping.",
		    pp->name, md->md_no);
		return (EINVAL);
	}
	if (sc->sc_disks[md->md_no].d_state != G_RAID3_DISK_STATE_NODISK) {
		G_RAID3_DEBUG(1, "Disk %s (no=%u) already exists, skipping.",
		    pp->name, md->md_no);
		return (EEXIST);
	}
	if (md->md_all != sc->sc_ndisks) {
		G_RAID3_DEBUG(1,
		    "Invalid '%s' field on disk %s (device %s), skipping.",
		    "md_all", pp->name, sc->sc_name);
		return (EINVAL);
	}
	if ((md->md_mediasize % md->md_sectorsize) != 0) {
		G_RAID3_DEBUG(1, "Invalid metadata (mediasize %% sectorsize != "
		    "0) on disk %s (device %s), skipping.", pp->name,
		    sc->sc_name);
		return (EINVAL);
	}
	if (md->md_mediasize != sc->sc_mediasize) {
		G_RAID3_DEBUG(1,
		    "Invalid '%s' field on disk %s (device %s), skipping.",
		    "md_mediasize", pp->name, sc->sc_name);
		return (EINVAL);
	}
	/* Device media must divide evenly over the ndisks-1 data disks. */
	if ((md->md_mediasize % (sc->sc_ndisks - 1)) != 0) {
		G_RAID3_DEBUG(1,
		    "Invalid '%s' field on disk %s (device %s), skipping.",
		    "md_mediasize", pp->name, sc->sc_name);
		return (EINVAL);
	}
	/* Each component must be big enough to hold its 1/(ndisks-1) share. */
	if ((sc->sc_mediasize / (sc->sc_ndisks - 1)) > pp->mediasize) {
		G_RAID3_DEBUG(1,
		    "Invalid size of disk %s (device %s), skipping.", pp->name,
		    sc->sc_name);
		return (EINVAL);
	}
	/*
	 * Device sector size must span at least ndisks-1 provider sectors,
	 * since every device sector is striped over the data disks.
	 */
	if ((md->md_sectorsize / pp->sectorsize) < sc->sc_ndisks - 1) {
		G_RAID3_DEBUG(1,
		    "Invalid '%s' field on disk %s (device %s), skipping.",
		    "md_sectorsize", pp->name, sc->sc_name);
		return (EINVAL);
	}
	if (md->md_sectorsize != sc->sc_sectorsize) {
		G_RAID3_DEBUG(1,
		    "Invalid '%s' field on disk %s (device %s), skipping.",
		    "md_sectorsize", pp->name, sc->sc_name);
		return (EINVAL);
	}
	if ((sc->sc_sectorsize % pp->sectorsize) != 0) {
		G_RAID3_DEBUG(1,
		    "Invalid sector size of disk %s (device %s), skipping.",
		    pp->name, sc->sc_name);
		return (EINVAL);
	}
	/* Reject metadata carrying flags this kernel does not know. */
	if ((md->md_mflags & ~G_RAID3_DEVICE_FLAG_MASK) != 0) {
		G_RAID3_DEBUG(1,
		    "Invalid device flags on disk %s (device %s), skipping.",
		    pp->name, sc->sc_name);
		return (EINVAL);
	}
	if ((md->md_mflags & G_RAID3_DEVICE_FLAG_VERIFY) != 0 &&
	    (md->md_mflags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0) {
		/*
		 * VERIFY and ROUND-ROBIN options are mutally exclusive.
		 */
		G_RAID3_DEBUG(1, "Both VERIFY and ROUND-ROBIN flags exist on "
		    "disk %s (device %s), skipping.", pp->name, sc->sc_name);
		return (EINVAL);
	}
	if ((md->md_dflags & ~G_RAID3_DISK_FLAG_MASK) != 0) {
		G_RAID3_DEBUG(1,
		    "Invalid disk flags on disk %s (device %s), skipping.",
		    pp->name, sc->sc_name);
		return (EINVAL);
	}
	return (0);
}
3012
/*
 * Attach provider 'pp', whose metadata 'md' was already read, to device
 * 'sc' as a new component.  Validates the metadata, creates the disk and
 * pushes a NEW-state event through the worker (waiting for it), then
 * rewrites old-format metadata in the new format if needed.
 * Called without the topology lock; returns 0 or an errno value.
 */
int
g_raid3_add_disk(struct g_raid3_softc *sc, struct g_provider *pp,
    struct g_raid3_metadata *md)
{
	struct g_raid3_disk *disk;
	int error;

	g_topology_assert_not();
	G_RAID3_DEBUG(2, "Adding disk %s.", pp->name);

	error = g_raid3_check_metadata(sc, pp, md);
	if (error != 0)
		return (error);
	/*
	 * Once the device left STARTING state, a component with an older
	 * generation id is a leftover from a previous failure - refuse it.
	 */
	if (sc->sc_state != G_RAID3_DEVICE_STATE_STARTING &&
	    md->md_genid < sc->sc_genid) {
		G_RAID3_DEBUG(0, "Component %s (device %s) broken, skipping.",
		    pp->name, sc->sc_name);
		return (EINVAL);
	}
	disk = g_raid3_init_disk(sc, pp, md, &error);
	if (disk == NULL)
		return (error);
	error = g_raid3_event_send(disk, G_RAID3_DISK_STATE_NEW,
	    G_RAID3_EVENT_WAIT);
	if (error != 0)
		return (error);
	if (md->md_version < G_RAID3_VERSION) {
		G_RAID3_DEBUG(0, "Upgrading metadata on %s (v%d->v%d).",
		    pp->name, md->md_version, G_RAID3_VERSION);
		g_raid3_update_metadata(disk);
	}
	return (0);
}
3046
/*
 * GEOM event handler that performs a destroy which was postponed until
 * the device's last close (scheduled from g_raid3_access()).  Runs with
 * the topology lock held; drops it around the destroy.
 */
static void
g_raid3_destroy_delayed(void *arg, int flag)
{
	struct g_raid3_softc *sc;
	int error;

	if (flag == EV_CANCEL) {
		G_RAID3_DEBUG(1, "Destroying canceled.");
		return;
	}
	sc = arg;
	g_topology_unlock();
	sx_xlock(&sc->sc_lock);
	KASSERT((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) == 0,
	    ("DESTROY flag set on %s.", sc->sc_name));
	KASSERT((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROYING) != 0,
	    ("DESTROYING flag not set on %s.", sc->sc_name));
	G_RAID3_DEBUG(0, "Destroying %s (delayed).", sc->sc_name);
	error = g_raid3_destroy(sc, G_RAID3_DESTROY_SOFT);
	/*
	 * On success g_raid3_destroy() disposes of sc together with its
	 * lock, so sc_lock is released here only on failure.
	 */
	if (error != 0) {
		G_RAID3_DEBUG(0, "Cannot destroy %s.", sc->sc_name);
		sx_xunlock(&sc->sc_lock);
	}
	g_topology_lock();
}
3072
/*
 * GEOM access method for the raid3 provider.  'acr'/'acw'/'ace' are
 * deltas; the dcr/dcw/dce locals are the resulting absolute counts.
 * Refuses new opens (ENXIO) on a device being destroyed or missing too
 * many components, marks the device idle on last-writer close and
 * schedules the delayed destroy on the final close of a DESTROYING
 * device.  Called with the topology lock held; it is dropped while
 * sc_lock is taken.
 */
static int
g_raid3_access(struct g_provider *pp, int acr, int acw, int ace)
{
	struct g_raid3_softc *sc;
	int dcr, dcw, dce, error = 0;

	g_topology_assert();
	G_RAID3_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name, acr,
	    acw, ace);

	sc = pp->geom->softc;
	/* Allow pure closes on an already torn down geom. */
	if (sc == NULL && acr <= 0 && acw <= 0 && ace <= 0)
		return (0);
	KASSERT(sc != NULL, ("NULL softc (provider=%s).", pp->name));

	dcr = pp->acr + acr;
	dcw = pp->acw + acw;
	dce = pp->ace + ace;

	g_topology_unlock();
	sx_xlock(&sc->sc_lock);
	if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0 ||
	    g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) < sc->sc_ndisks - 1) {
		if (acr > 0 || acw > 0 || ace > 0)
			error = ENXIO;
		goto end;
	}
	/* Last writer went away - mark the components clean. */
	if (dcw == 0 && !sc->sc_idle)
		g_raid3_idle(sc, dcw);
	if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROYING) != 0) {
		if (acr > 0 || acw > 0 || ace > 0) {
			error = ENXIO;
			goto end;
		}
		if (dcr == 0 && dcw == 0 && dce == 0) {
			/* Last close: finish the destroy from an event. */
			g_post_event(g_raid3_destroy_delayed, sc, M_WAITOK,
			    sc, NULL);
		}
	}
end:
	sx_xunlock(&sc->sc_lock);
	g_topology_lock();
	return (error);
}
3117
/*
 * Create a new raid3 device from metadata 'md': allocate the softc and
 * per-disk array, set up the action and synchronization geoms, the UMA
 * zones used for parity buffers (unless malloc(9) is preferred), start
 * the worker thread and arm the start-up timeout.  Called with the
 * topology lock held.  Returns the action geom, or NULL on failure.
 */
static struct g_geom *
g_raid3_create(struct g_class *mp, const struct g_raid3_metadata *md)
{
	struct g_raid3_softc *sc;
	struct g_geom *gp;
	int error, timeout;
	u_int n;

	g_topology_assert();
	G_RAID3_DEBUG(1, "Creating device %s (id=%u).", md->md_name, md->md_id);

	/* One disk is minimum. */
	if (md->md_all < 1)
		return (NULL);
	/*
	 * Action geom.
	 */
	gp = g_new_geomf(mp, "%s", md->md_name);
	sc = malloc(sizeof(*sc), M_RAID3, M_WAITOK | M_ZERO);
	sc->sc_disks = malloc(sizeof(struct g_raid3_disk) * md->md_all, M_RAID3,
	    M_WAITOK | M_ZERO);
	gp->start = g_raid3_start;
	gp->orphan = g_raid3_orphan;
	gp->access = g_raid3_access;
	gp->dumpconf = g_raid3_dumpconf;

	sc->sc_id = md->md_id;
	sc->sc_mediasize = md->md_mediasize;
	sc->sc_sectorsize = md->md_sectorsize;
	sc->sc_ndisks = md->md_all;
	sc->sc_round_robin = 0;
	sc->sc_flags = md->md_mflags;
	sc->sc_bump_id = 0;
	/* Freshly created device is considered idle (no dirty writes yet). */
	sc->sc_idle = 1;
	sc->sc_last_write = time_uptime;
	sc->sc_writes = 0;
	/* All component slots start empty; disks are added by taste/ctl. */
	for (n = 0; n < sc->sc_ndisks; n++) {
		sc->sc_disks[n].d_softc = sc;
		sc->sc_disks[n].d_no = n;
		sc->sc_disks[n].d_state = G_RAID3_DISK_STATE_NODISK;
	}
	sx_init(&sc->sc_lock, "graid3:lock");
	bioq_init(&sc->sc_queue);
	mtx_init(&sc->sc_queue_mtx, "graid3:queue", NULL, MTX_DEF);
	bioq_init(&sc->sc_regular_delayed);
	bioq_init(&sc->sc_inflight);
	bioq_init(&sc->sc_sync_delayed);
	TAILQ_INIT(&sc->sc_events);
	mtx_init(&sc->sc_events_mtx, "graid3:events", NULL, MTX_DEF);
	callout_init(&sc->sc_callout, CALLOUT_MPSAFE);
	sc->sc_state = G_RAID3_DEVICE_STATE_STARTING;
	gp->softc = sc;
	sc->sc_geom = gp;
	sc->sc_provider = NULL;
	/*
	 * Synchronization geom.
	 */
	gp = g_new_geomf(mp, "%s.sync", md->md_name);
	gp->softc = sc;
	gp->orphan = g_raid3_orphan;
	sc->sc_sync.ds_geom = gp;

	/*
	 * Pre-create the 64k/16k/4k UMA zones used for request buffers
	 * unless plain malloc(9) allocation was requested via the
	 * g_raid3_use_malloc tunable.
	 */
	if (!g_raid3_use_malloc) {
		sc->sc_zones[G_RAID3_ZONE_64K].sz_zone = uma_zcreate("gr3:64k",
		    65536, g_raid3_uma_ctor, g_raid3_uma_dtor, NULL, NULL,
		    UMA_ALIGN_PTR, 0);
		sc->sc_zones[G_RAID3_ZONE_64K].sz_inuse = 0;
		sc->sc_zones[G_RAID3_ZONE_64K].sz_max = g_raid3_n64k;
		sc->sc_zones[G_RAID3_ZONE_64K].sz_requested =
		    sc->sc_zones[G_RAID3_ZONE_64K].sz_failed = 0;
		sc->sc_zones[G_RAID3_ZONE_16K].sz_zone = uma_zcreate("gr3:16k",
		    16384, g_raid3_uma_ctor, g_raid3_uma_dtor, NULL, NULL,
		    UMA_ALIGN_PTR, 0);
		sc->sc_zones[G_RAID3_ZONE_16K].sz_inuse = 0;
		sc->sc_zones[G_RAID3_ZONE_16K].sz_max = g_raid3_n16k;
		sc->sc_zones[G_RAID3_ZONE_16K].sz_requested =
		    sc->sc_zones[G_RAID3_ZONE_16K].sz_failed = 0;
		sc->sc_zones[G_RAID3_ZONE_4K].sz_zone = uma_zcreate("gr3:4k",
		    4096, g_raid3_uma_ctor, g_raid3_uma_dtor, NULL, NULL,
		    UMA_ALIGN_PTR, 0);
		sc->sc_zones[G_RAID3_ZONE_4K].sz_inuse = 0;
		sc->sc_zones[G_RAID3_ZONE_4K].sz_max = g_raid3_n4k;
		sc->sc_zones[G_RAID3_ZONE_4K].sz_requested =
		    sc->sc_zones[G_RAID3_ZONE_4K].sz_failed = 0;
	}

	error = kproc_create(g_raid3_worker, sc, &sc->sc_worker, 0, 0,
	    "g_raid3 %s", md->md_name);
	if (error != 0) {
		/* Undo everything built above, in reverse order. */
		G_RAID3_DEBUG(1, "Cannot create kernel thread for %s.",
		    sc->sc_name);
		if (!g_raid3_use_malloc) {
			uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_64K].sz_zone);
			uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_16K].sz_zone);
			uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_4K].sz_zone);
		}
		g_destroy_geom(sc->sc_sync.ds_geom);
		mtx_destroy(&sc->sc_events_mtx);
		mtx_destroy(&sc->sc_queue_mtx);
		sx_destroy(&sc->sc_lock);
		g_destroy_geom(sc->sc_geom);
		free(sc->sc_disks, M_RAID3);
		free(sc, M_RAID3);
		return (NULL);
	}

	G_RAID3_DEBUG(1, "Device %s created (%u components, id=%u).",
	    sc->sc_name, sc->sc_ndisks, sc->sc_id);

	/* Hold root mount until the device is running or times out. */
	sc->sc_rootmount = root_mount_hold("GRAID3");
	G_RAID3_DEBUG(1, "root_mount_hold %p", sc->sc_rootmount);

	/*
	 * Run timeout.
	 */
	timeout = atomic_load_acq_int(&g_raid3_timeout);
	callout_reset(&sc->sc_callout, timeout * hz, g_raid3_go, sc);
	return (sc->sc_geom);
}
3237
/*
 * Destroy device 'sc'.  'how' selects the policy when the provider is
 * still open:
 *   G_RAID3_DESTROY_SOFT    - refuse with EBUSY;
 *   G_RAID3_DESTROY_DELAYED - stop synchronization, mark DESTROYING and
 *                             return EBUSY (destroy finishes on last
 *                             close via g_raid3_destroy_delayed());
 *   G_RAID3_DESTROY_HARD    - destroy anyway.
 * Must be called with sc_lock held exclusively and without the topology
 * lock.  On success (returns 0) sc - including sc_lock - is freed and
 * must not be touched by the caller.
 */
int
g_raid3_destroy(struct g_raid3_softc *sc, int how)
{
	struct g_provider *pp;

	g_topology_assert_not();
	if (sc == NULL)
		return (ENXIO);
	sx_assert(&sc->sc_lock, SX_XLOCKED);

	pp = sc->sc_provider;
	if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
		switch (how) {
		case G_RAID3_DESTROY_SOFT:
			G_RAID3_DEBUG(1,
			    "Device %s is still open (r%dw%de%d).", pp->name,
			    pp->acr, pp->acw, pp->ace);
			return (EBUSY);
		case G_RAID3_DESTROY_DELAYED:
			G_RAID3_DEBUG(1,
			    "Device %s will be destroyed on last close.",
			    pp->name);
			if (sc->sc_syncdisk != NULL)
				g_raid3_sync_stop(sc, 1);
			sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROYING;
			return (EBUSY);
		case G_RAID3_DESTROY_HARD:
			G_RAID3_DEBUG(1, "Device %s is still open, so it "
			    "can't be definitely removed.", pp->name);
			break;
		}
	}

	g_topology_lock();
	/* Somebody else already cleared softc: destroy is in progress. */
	if (sc->sc_geom->softc == NULL) {
		g_topology_unlock();
		return (0);
	}
	sc->sc_geom->softc = NULL;
	sc->sc_sync.ds_geom->softc = NULL;
	g_topology_unlock();

	/*
	 * Tell the worker thread to finish; it performs the actual
	 * device teardown.  Drop sc_lock so the worker can take it.
	 */
	sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
	sc->sc_flags |= G_RAID3_DEVICE_FLAG_WAIT;
	G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc);
	sx_xunlock(&sc->sc_lock);
	mtx_lock(&sc->sc_queue_mtx);
	wakeup(sc);
	wakeup(&sc->sc_queue);
	mtx_unlock(&sc->sc_queue_mtx);
	G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, &sc->sc_worker);
	/* Poll until the worker thread has exited. */
	while (sc->sc_worker != NULL)
		tsleep(&sc->sc_worker, PRIBIO, "r3:destroy", hz / 5);
	G_RAID3_DEBUG(4, "%s: Woken up %p.", __func__, &sc->sc_worker);
	sx_xlock(&sc->sc_lock);
	g_raid3_destroy_device(sc);
	free(sc->sc_disks, M_RAID3);
	free(sc, M_RAID3);
	return (0);
}
3298
/*
 * Orphan method for the transient taste geom.  The taste consumer is
 * attached and detached within g_raid3_taste() while the topology lock
 * is held, so this must never fire.
 */
static void
g_raid3_taste_orphan(struct g_consumer *cp)
{

	KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
	    cp->provider->name));
}
3306
/*
 * GEOM taste method: probe provider 'pp' for raid3 metadata and, when
 * found, attach it to its device - creating the device first if this is
 * its first component seen.  Called with the topology lock held.
 * Returns the device geom on success or NULL when the provider is not
 * ours / could not be used.
 */
static struct g_geom *
g_raid3_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
{
	struct g_raid3_metadata md;
	struct g_raid3_softc *sc;
	struct g_consumer *cp;
	struct g_geom *gp;
	int error;

	g_topology_assert();
	g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
	G_RAID3_DEBUG(2, "Tasting %s.", pp->name);

	/* Throw-away geom/consumer used only to read the metadata. */
	gp = g_new_geomf(mp, "raid3:taste");
	/* This orphan function should be never called. */
	gp->orphan = g_raid3_taste_orphan;
	cp = g_new_consumer(gp);
	g_attach(cp, pp);
	error = g_raid3_read_metadata(cp, &md);
	g_detach(cp);
	g_destroy_consumer(cp);
	g_destroy_geom(gp);
	if (error != 0)
		return (NULL);
	gp = NULL;

	/* Honour a hardcoded provider name in the metadata, if any. */
	if (md.md_provider[0] != '\0' &&
	    !g_compare_names(md.md_provider, pp->name))
		return (NULL);
	if (md.md_provsize != 0 && md.md_provsize != pp->mediasize)
		return (NULL);
	if (g_raid3_debug >= 2)
		raid3_metadata_dump(&md);

	/*
	 * Let's check if device already exists.
	 */
	sc = NULL;
	LIST_FOREACH(gp, &mp->geom, geom) {
		sc = gp->softc;
		if (sc == NULL)
			continue;
		if (sc->sc_sync.ds_geom == gp)
			continue;
		if (strcmp(md.md_name, sc->sc_name) != 0)
			continue;
		/* Same name but different id: a distinct, stale array. */
		if (md.md_id != sc->sc_id) {
			G_RAID3_DEBUG(0, "Device %s already configured.",
			    sc->sc_name);
			return (NULL);
		}
		break;
	}
	if (gp == NULL) {
		gp = g_raid3_create(mp, &md);
		if (gp == NULL) {
			G_RAID3_DEBUG(0, "Cannot create device %s.",
			    md.md_name);
			return (NULL);
		}
		sc = gp->softc;
	}
	G_RAID3_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name);
	g_topology_unlock();
	sx_xlock(&sc->sc_lock);
	error = g_raid3_add_disk(sc, pp, &md);
	if (error != 0) {
		G_RAID3_DEBUG(0, "Cannot add disk %s to %s (error=%d).",
		    pp->name, gp->name, error);
		/*
		 * If the add failed and the device has no components at
		 * all, it was just created by us - tear it down again.
		 * g_raid3_destroy() consumes sc and its lock on success.
		 */
		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NODISK) ==
		    sc->sc_ndisks) {
			g_cancel_event(sc);
			g_raid3_destroy(sc, G_RAID3_DESTROY_HARD);
			g_topology_lock();
			return (NULL);
		}
		gp = NULL;
	}
	sx_xunlock(&sc->sc_lock);
	g_topology_lock();
	return (gp);
}
3389
3390static int
3391g_raid3_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused,
3392    struct g_geom *gp)
3393{
3394	struct g_raid3_softc *sc;
3395	int error;
3396
3397	g_topology_unlock();
3398	sc = gp->softc;
3399	sx_xlock(&sc->sc_lock);
3400	g_cancel_event(sc);
3401	error = g_raid3_destroy(gp->softc, G_RAID3_DESTROY_SOFT);
3402	if (error != 0)
3403		sx_xunlock(&sc->sc_lock);
3404	g_topology_lock();
3405	return (error);
3406}
3407
/*
 * GEOM dumpconf method: emit the XML fragment for this device, one of
 * its consumers (a component disk) or its provider into 'sb'.  Called
 * with the topology lock held; it is dropped while sc_lock is taken for
 * a consistent snapshot of the state being printed.
 */
static void
g_raid3_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
    struct g_consumer *cp, struct g_provider *pp)
{
	struct g_raid3_softc *sc;

	g_topology_assert();

	sc = gp->softc;
	if (sc == NULL)
		return;
	/* Skip synchronization geom. */
	if (gp == sc->sc_sync.ds_geom)
		return;
	if (pp != NULL) {
		/* Nothing here. */
	} else if (cp != NULL) {
		/* Per-component (consumer) section. */
		struct g_raid3_disk *disk;

		disk = cp->private;
		if (disk == NULL)
			return;
		g_topology_unlock();
		sx_xlock(&sc->sc_lock);
		sbuf_printf(sb, "%s<Type>", indent);
		/* The highest-numbered component holds the parity. */
		if (disk->d_no == sc->sc_ndisks - 1)
			sbuf_printf(sb, "PARITY");
		else
			sbuf_printf(sb, "DATA");
		sbuf_printf(sb, "</Type>\n");
		sbuf_printf(sb, "%s<Number>%u</Number>\n", indent,
		    (u_int)disk->d_no);
		if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
			sbuf_printf(sb, "%s<Synchronized>", indent);
			if (disk->d_sync.ds_offset == 0)
				sbuf_printf(sb, "0%%");
			else {
				/* Percentage of the per-disk share done. */
				sbuf_printf(sb, "%u%%",
				    (u_int)((disk->d_sync.ds_offset * 100) /
				    (sc->sc_mediasize / (sc->sc_ndisks - 1))));
			}
			sbuf_printf(sb, "</Synchronized>\n");
		}
		sbuf_printf(sb, "%s<SyncID>%u</SyncID>\n", indent,
		    disk->d_sync.ds_syncid);
		sbuf_printf(sb, "%s<GenID>%u</GenID>\n", indent, disk->d_genid);
		sbuf_printf(sb, "%s<Flags>", indent);
		if (disk->d_flags == 0)
			sbuf_printf(sb, "NONE");
		else {
			int first = 1;

#define	ADD_FLAG(flag, name)	do {					\
	if ((disk->d_flags & (flag)) != 0) {				\
		if (!first)						\
			sbuf_printf(sb, ", ");				\
		else							\
			first = 0;					\
		sbuf_printf(sb, name);					\
	}								\
} while (0)
			ADD_FLAG(G_RAID3_DISK_FLAG_DIRTY, "DIRTY");
			ADD_FLAG(G_RAID3_DISK_FLAG_HARDCODED, "HARDCODED");
			ADD_FLAG(G_RAID3_DISK_FLAG_SYNCHRONIZING,
			    "SYNCHRONIZING");
			ADD_FLAG(G_RAID3_DISK_FLAG_FORCE_SYNC, "FORCE_SYNC");
			ADD_FLAG(G_RAID3_DISK_FLAG_BROKEN, "BROKEN");
#undef	ADD_FLAG
		}
		sbuf_printf(sb, "</Flags>\n");
		sbuf_printf(sb, "%s<State>%s</State>\n", indent,
		    g_raid3_disk_state2str(disk->d_state));
		sx_xunlock(&sc->sc_lock);
		g_topology_lock();
	} else {
		/* Whole-device (geom) section. */
		g_topology_unlock();
		sx_xlock(&sc->sc_lock);
		if (!g_raid3_use_malloc) {
			/* UMA zone usage statistics (diagnostics). */
			sbuf_printf(sb,
			    "%s<Zone4kRequested>%u</Zone4kRequested>\n", indent,
			    sc->sc_zones[G_RAID3_ZONE_4K].sz_requested);
			sbuf_printf(sb,
			    "%s<Zone4kFailed>%u</Zone4kFailed>\n", indent,
			    sc->sc_zones[G_RAID3_ZONE_4K].sz_failed);
			sbuf_printf(sb,
			    "%s<Zone16kRequested>%u</Zone16kRequested>\n", indent,
			    sc->sc_zones[G_RAID3_ZONE_16K].sz_requested);
			sbuf_printf(sb,
			    "%s<Zone16kFailed>%u</Zone16kFailed>\n", indent,
			    sc->sc_zones[G_RAID3_ZONE_16K].sz_failed);
			sbuf_printf(sb,
			    "%s<Zone64kRequested>%u</Zone64kRequested>\n", indent,
			    sc->sc_zones[G_RAID3_ZONE_64K].sz_requested);
			sbuf_printf(sb,
			    "%s<Zone64kFailed>%u</Zone64kFailed>\n", indent,
			    sc->sc_zones[G_RAID3_ZONE_64K].sz_failed);
		}
		sbuf_printf(sb, "%s<ID>%u</ID>\n", indent, (u_int)sc->sc_id);
		sbuf_printf(sb, "%s<SyncID>%u</SyncID>\n", indent, sc->sc_syncid);
		sbuf_printf(sb, "%s<GenID>%u</GenID>\n", indent, sc->sc_genid);
		sbuf_printf(sb, "%s<Flags>", indent);
		if (sc->sc_flags == 0)
			sbuf_printf(sb, "NONE");
		else {
			int first = 1;

#define	ADD_FLAG(flag, name)	do {					\
	if ((sc->sc_flags & (flag)) != 0) {				\
		if (!first)						\
			sbuf_printf(sb, ", ");				\
		else							\
			first = 0;					\
		sbuf_printf(sb, name);					\
	}								\
} while (0)
			ADD_FLAG(G_RAID3_DEVICE_FLAG_NOFAILSYNC, "NOFAILSYNC");
			ADD_FLAG(G_RAID3_DEVICE_FLAG_NOAUTOSYNC, "NOAUTOSYNC");
			ADD_FLAG(G_RAID3_DEVICE_FLAG_ROUND_ROBIN,
			    "ROUND-ROBIN");
			ADD_FLAG(G_RAID3_DEVICE_FLAG_VERIFY, "VERIFY");
#undef	ADD_FLAG
		}
		sbuf_printf(sb, "</Flags>\n");
		sbuf_printf(sb, "%s<Components>%u</Components>\n", indent,
		    sc->sc_ndisks);
		sbuf_printf(sb, "%s<State>%s</State>\n", indent,
		    g_raid3_device_state2str(sc->sc_state));
		sx_xunlock(&sc->sc_lock);
		g_topology_lock();
	}
}
3539
/*
 * shutdown_pre_sync event handler: request a delayed destroy of every
 * raid3 device so arrays are closed down cleanly before the final
 * filesystem sync at shutdown.
 */
static void
g_raid3_shutdown_pre_sync(void *arg, int howto)
{
	struct g_class *mp;
	struct g_geom *gp, *gp2;
	struct g_raid3_softc *sc;
	int error;

	mp = arg;
	DROP_GIANT();
	g_topology_lock();
	/* _SAFE: destroying a device removes its geoms from the list. */
	LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) {
		if ((sc = gp->softc) == NULL)
			continue;
		/* Skip synchronization geom. */
		if (gp == sc->sc_sync.ds_geom)
			continue;
		g_topology_unlock();
		sx_xlock(&sc->sc_lock);
		g_cancel_event(sc);
		error = g_raid3_destroy(sc, G_RAID3_DESTROY_DELAYED);
		/* On success sc (and its lock) is gone; unlock otherwise. */
		if (error != 0)
			sx_xunlock(&sc->sc_lock);
		g_topology_lock();
	}
	g_topology_unlock();
	PICKUP_GIANT();
}
3568
/*
 * GEOM class init method: hook the shutdown_pre_sync event so devices
 * can be shut down cleanly (see g_raid3_shutdown_pre_sync()).
 */
static void
g_raid3_init(struct g_class *mp)
{

	g_raid3_pre_sync = EVENTHANDLER_REGISTER(shutdown_pre_sync,
	    g_raid3_shutdown_pre_sync, mp, SHUTDOWN_PRI_FIRST);
	if (g_raid3_pre_sync == NULL)
		G_RAID3_DEBUG(0, "Warning! Cannot register shutdown event.");
}
3578
/*
 * GEOM class fini method: undo the event registration made in
 * g_raid3_init() (if it succeeded).
 */
static void
g_raid3_fini(struct g_class *mp)
{

	if (g_raid3_pre_sync != NULL)
		EVENTHANDLER_DEREGISTER(shutdown_pre_sync, g_raid3_pre_sync);
}
3586
3587DECLARE_GEOM_CLASS(g_raid3_class, g_raid3);
3588