g_raid3.c revision 139379
1133819Stjr/*-
2133819Stjr * Copyright (c) 2004 Pawel Jakub Dawidek <pjd@FreeBSD.org>
3133819Stjr * All rights reserved.
4133819Stjr *
5133819Stjr * Redistribution and use in source and binary forms, with or without
6163761Snetchild * modification, are permitted provided that the following conditions
7133819Stjr * are met:
8133819Stjr * 1. Redistributions of source code must retain the above copyright
9133819Stjr *    notice, this list of conditions and the following disclaimer.
10133819Stjr * 2. Redistributions in binary form must reproduce the above copyright
11133819Stjr *    notice, this list of conditions and the following disclaimer in the
12133819Stjr *    documentation and/or other materials provided with the distribution.
13133819Stjr *
14164199Sru * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
15133819Stjr * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16133819Stjr * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17161330Sjhb * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
18161330Sjhb * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19133819Stjr * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20133819Stjr * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21133819Stjr * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22133819Stjr * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23133819Stjr * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24133819Stjr * SUCH DAMAGE.
25133819Stjr */
26133819Stjr
27133819Stjr#include <sys/cdefs.h>
28133819Stjr__FBSDID("$FreeBSD: head/sys/geom/raid3/g_raid3.c 139379 2004-12-28 21:52:45Z pjd $");
29133819Stjr
30133819Stjr#include <sys/param.h>
31133819Stjr#include <sys/systm.h>
32133819Stjr#include <sys/kernel.h>
33133819Stjr#include <sys/module.h>
34143198Ssobomax#include <sys/limits.h>
35133819Stjr#include <sys/lock.h>
36133819Stjr#include <sys/mutex.h>
37133819Stjr#include <sys/bio.h>
38133819Stjr#include <sys/sysctl.h>
39133819Stjr#include <sys/malloc.h>
40133819Stjr#include <sys/eventhandler.h>
41133819Stjr#include <vm/uma.h>
42133819Stjr#include <machine/atomic.h>
43133819Stjr#include <geom/geom.h>
44133819Stjr#include <sys/proc.h>
45133819Stjr#include <sys/kthread.h>
46133819Stjr#include <geom/raid3/g_raid3.h>
47133819Stjr
48133819Stjr
static MALLOC_DEFINE(M_RAID3, "raid3 data", "GEOM_RAID3 Data");

/*
 * Run-time knobs under kern.geom.raid3.  The TUNABLE_INT entries allow
 * the same values to be seeded from loader.conf at boot.
 */
SYSCTL_DECL(_kern_geom);
SYSCTL_NODE(_kern_geom, OID_AUTO, raid3, CTLFLAG_RW, 0, "GEOM_RAID3 stuff");
u_int g_raid3_debug = 0;
TUNABLE_INT("kern.geom.raid3.debug", &g_raid3_debug);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, debug, CTLFLAG_RW, &g_raid3_debug, 0,
    "Debug level");
static u_int g_raid3_timeout = 4;
TUNABLE_INT("kern.geom.raid3.timeout", &g_raid3_timeout);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, timeout, CTLFLAG_RW, &g_raid3_timeout,
    0, "Time to wait on all raid3 components");
static u_int g_raid3_idletime = 5;
TUNABLE_INT("kern.geom.raid3.idletime", &g_raid3_idletime);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, idletime, CTLFLAG_RW,
    &g_raid3_idletime, 0, "Mark components as clean when idling");
static u_int g_raid3_reqs_per_sync = 5;
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, reqs_per_sync, CTLFLAG_RW,
    &g_raid3_reqs_per_sync, 0,
    "Number of regular I/O requests per synchronization request");
static u_int g_raid3_syncs_per_sec = 100;
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, syncs_per_sec, CTLFLAG_RW,
    &g_raid3_syncs_per_sec, 0,
    "Number of synchronizations requests per second");

/*
 * Upper bounds for the per-device buffer zones (64kB/16kB/4kB); read-only
 * via sysctl, settable only as boot-time tunables.
 */
static u_int g_raid3_n64k = 50;
TUNABLE_INT("kern.geom.raid3.n64k", &g_raid3_n64k);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n64k, CTLFLAG_RD, &g_raid3_n64k, 0,
    "Maximum number of 64kB allocations");
static u_int g_raid3_n16k = 200;
TUNABLE_INT("kern.geom.raid3.n16k", &g_raid3_n16k);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n16k, CTLFLAG_RD, &g_raid3_n16k, 0,
    "Maximum number of 16kB allocations");
static u_int g_raid3_n4k = 1200;
TUNABLE_INT("kern.geom.raid3.n4k", &g_raid3_n4k);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n4k, CTLFLAG_RD, &g_raid3_n4k, 0,
    "Maximum number of 4kB allocations");

/* Read-only statistics counters under kern.geom.raid3.stat. */
SYSCTL_NODE(_kern_geom_raid3, OID_AUTO, stat, CTLFLAG_RW, 0,
    "GEOM_RAID3 statistics");
static u_int g_raid3_parity_mismatch = 0;
SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, parity_mismatch, CTLFLAG_RD,
    &g_raid3_parity_mismatch, 0, "Number of failures in VERIFY mode");
static u_int g_raid3_64k_requested = 0;
SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 64k_requested, CTLFLAG_RD,
    &g_raid3_64k_requested, 0, "Number of requested 64kB allocations");
static u_int g_raid3_64k_failed = 0;
SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 64k_failed, CTLFLAG_RD,
    &g_raid3_64k_failed, 0, "Number of failed 64kB allocations");
static u_int g_raid3_16k_requested = 0;
SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 16k_requested, CTLFLAG_RD,
    &g_raid3_16k_requested, 0, "Number of requested 16kB allocations");
static u_int g_raid3_16k_failed = 0;
SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 16k_failed, CTLFLAG_RD,
    &g_raid3_16k_failed, 0, "Number of failed 16kB allocations");
static u_int g_raid3_4k_requested = 0;
SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 4k_requested, CTLFLAG_RD,
    &g_raid3_4k_requested, 0, "Number of requested 4kB allocations");
static u_int g_raid3_4k_failed = 0;
SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 4k_failed, CTLFLAG_RD,
    &g_raid3_4k_failed, 0, "Number of failed 4kB allocations");

/* msleep() wrapper that logs the sleep and the wakeup at debug level 4. */
#define	MSLEEP(ident, mtx, priority, wmesg, timeout)	do {		\
	G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, (ident));	\
	msleep((ident), (mtx), (priority), (wmesg), (timeout));		\
	G_RAID3_DEBUG(4, "%s: Woken up %p.", __func__, (ident));	\
} while (0)

static eventhandler_tag g_raid3_ehtag = NULL;

static int g_raid3_destroy_geom(struct gctl_req *req, struct g_class *mp,
    struct g_geom *gp);
static g_taste_t g_raid3_taste;
static void g_raid3_init(struct g_class *mp);
static void g_raid3_fini(struct g_class *mp);

/* GEOM class descriptor registered with the GEOM framework. */
struct g_class g_raid3_class = {
	.name = G_RAID3_CLASS_NAME,
	.version = G_VERSION,
	.ctlreq = g_raid3_config,
	.taste = g_raid3_taste,
	.destroy_geom = g_raid3_destroy_geom,
	.init = g_raid3_init,
	.fini = g_raid3_fini
};

/* Forward declarations for routines referenced before their definitions. */
static void g_raid3_destroy_provider(struct g_raid3_softc *sc);
static int g_raid3_update_disk(struct g_raid3_disk *disk, u_int state);
static void g_raid3_update_device(struct g_raid3_softc *sc, boolean_t force);
static void g_raid3_dumpconf(struct sbuf *sb, const char *indent,
    struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp);
static void g_raid3_sync_stop(struct g_raid3_softc *sc, int type);
142133819Stjr
143133819Stjr
144133819Stjrstatic const char *
145133819Stjrg_raid3_disk_state2str(int state)
146133819Stjr{
147133819Stjr
148133819Stjr	switch (state) {
149133819Stjr	case G_RAID3_DISK_STATE_NODISK:
150133819Stjr		return ("NODISK");
151133819Stjr	case G_RAID3_DISK_STATE_NONE:
152133819Stjr		return ("NONE");
153133819Stjr	case G_RAID3_DISK_STATE_NEW:
154133819Stjr		return ("NEW");
155133819Stjr	case G_RAID3_DISK_STATE_ACTIVE:
156133819Stjr		return ("ACTIVE");
157133819Stjr	case G_RAID3_DISK_STATE_STALE:
158133819Stjr		return ("STALE");
159133819Stjr	case G_RAID3_DISK_STATE_SYNCHRONIZING:
160133819Stjr		return ("SYNCHRONIZING");
161133819Stjr	case G_RAID3_DISK_STATE_DISCONNECTED:
162133819Stjr		return ("DISCONNECTED");
163133819Stjr	default:
164133819Stjr		return ("INVALID");
165133819Stjr	}
166133819Stjr}
167133819Stjr
168133819Stjrstatic const char *
169133819Stjrg_raid3_device_state2str(int state)
170133819Stjr{
171133819Stjr
172133819Stjr	switch (state) {
173133819Stjr	case G_RAID3_DEVICE_STATE_STARTING:
174133819Stjr		return ("STARTING");
175133819Stjr	case G_RAID3_DEVICE_STATE_DEGRADED:
176133819Stjr		return ("DEGRADED");
177133819Stjr	case G_RAID3_DEVICE_STATE_COMPLETE:
178133819Stjr		return ("COMPLETE");
179133819Stjr	default:
180133819Stjr		return ("INVALID");
181133819Stjr	}
182133819Stjr}
183133819Stjr
184133819Stjrconst char *
185133819Stjrg_raid3_get_diskname(struct g_raid3_disk *disk)
186133819Stjr{
187133819Stjr
188133819Stjr	if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL)
189133819Stjr		return ("[unknown]");
190133819Stjr	return (disk->d_name);
191133819Stjr}
192133819Stjr
193133819Stjr#define	g_raid3_xor(src1, src2, dst, size)				\
194133819Stjr	_g_raid3_xor((uint64_t *)(src1), (uint64_t *)(src2),		\
195133819Stjr	    (uint64_t *)(dst), (size_t)size)
196133819Stjrstatic void
197133819Stjr_g_raid3_xor(uint64_t *src1, uint64_t *src2, uint64_t *dst, size_t size)
198133819Stjr{
199161309Snetchild
200161309Snetchild	KASSERT((size % 128) == 0, ("Invalid size: %zu.", size));
201161309Snetchild	for (; size > 0; size -= 128) {
202133819Stjr		*dst++ = (*src1++) ^ (*src2++);
203133819Stjr		*dst++ = (*src1++) ^ (*src2++);
204133819Stjr		*dst++ = (*src1++) ^ (*src2++);
205133819Stjr		*dst++ = (*src1++) ^ (*src2++);
206133819Stjr		*dst++ = (*src1++) ^ (*src2++);
207133819Stjr		*dst++ = (*src1++) ^ (*src2++);
208133819Stjr		*dst++ = (*src1++) ^ (*src2++);
209133819Stjr		*dst++ = (*src1++) ^ (*src2++);
210133819Stjr		*dst++ = (*src1++) ^ (*src2++);
211133819Stjr		*dst++ = (*src1++) ^ (*src2++);
212133819Stjr		*dst++ = (*src1++) ^ (*src2++);
213133819Stjr		*dst++ = (*src1++) ^ (*src2++);
214133819Stjr		*dst++ = (*src1++) ^ (*src2++);
215133819Stjr		*dst++ = (*src1++) ^ (*src2++);
216133819Stjr		*dst++ = (*src1++) ^ (*src2++);
217133819Stjr		*dst++ = (*src1++) ^ (*src2++);
218133819Stjr	}
219133819Stjr}
220133819Stjr
/*
 * Return 1 if the whole bio payload is zero-filled, 0 otherwise.
 * Comparison is done in sizeof(zeros) == 128-byte chunks; assumes
 * bio_length is a multiple of the chunk size -- TODO confirm against
 * callers (sector-sized I/O would satisfy this).
 */
static int
g_raid3_is_zero(struct bio *bp)
{
	/* 16 * 8 = 128 bytes of zeros used as the comparison pattern. */
	static const uint64_t zeros[] = {
	    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
	};
	u_char *addr;
	ssize_t size;

	size = bp->bio_length;
	addr = (u_char *)bp->bio_data;
	for (; size > 0; size -= sizeof(zeros), addr += sizeof(zeros)) {
		if (bcmp(addr, zeros, sizeof(zeros)) != 0)
			return (0);
	}
	return (1);
}
238133819Stjr
239133819Stjr/*
240133819Stjr * --- Events handling functions ---
241133819Stjr * Events in geom_raid3 are used to maintain disks and device status
242133819Stjr * from one thread to simplify locking.
243133819Stjr */
/*
 * Release the memory of a processed (or cancelled) event.
 */
static void
g_raid3_event_free(struct g_raid3_event *ep)
{

	free(ep, M_RAID3);
}
250133819Stjr
/*
 * Queue a state-change event for the worker thread.  'arg' is the softc
 * when G_RAID3_EVENT_DEVICE is set in 'flags', otherwise a disk.  With
 * G_RAID3_EVENT_DONTWAIT the call returns immediately (the worker frees
 * the event); otherwise the topology lock is dropped and the caller
 * sleeps until the event is marked G_RAID3_EVENT_DONE, then returns the
 * event's error status.
 */
int
g_raid3_event_send(void *arg, int state, int flags)
{
	struct g_raid3_softc *sc;
	struct g_raid3_disk *disk;
	struct g_raid3_event *ep;
	int error;

	ep = malloc(sizeof(*ep), M_RAID3, M_WAITOK);
	G_RAID3_DEBUG(4, "%s: Sending event %p.", __func__, ep);
	if ((flags & G_RAID3_EVENT_DEVICE) != 0) {
		disk = NULL;
		sc = arg;
	} else {
		disk = arg;
		sc = disk->d_softc;
	}
	ep->e_disk = disk;
	ep->e_state = state;
	ep->e_flags = flags;
	ep->e_error = 0;
	mtx_lock(&sc->sc_events_mtx);
	TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next);
	mtx_unlock(&sc->sc_events_mtx);
	G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc);
	/* Wake the worker thread so it notices the new event. */
	mtx_lock(&sc->sc_queue_mtx);
	wakeup(sc);
	wakeup(&sc->sc_queue);
	mtx_unlock(&sc->sc_queue_mtx);
	if ((flags & G_RAID3_EVENT_DONTWAIT) != 0)
		return (0);
	g_topology_assert();
	G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, ep);
	g_topology_unlock();
	while ((ep->e_flags & G_RAID3_EVENT_DONE) == 0) {
		mtx_lock(&sc->sc_events_mtx);
		/* PDROP: msleep releases sc_events_mtx; 5s timeout rechecks. */
		MSLEEP(ep, &sc->sc_events_mtx, PRIBIO | PDROP, "r3:event",
		    hz * 5);
	}
	/* Don't even try to use 'sc' here, because it could be already dead. */
	g_topology_lock();
	error = ep->e_error;
	g_raid3_event_free(ep);
	return (error);
}
296156843Snetchild
/*
 * Peek at the first queued event without removing it; NULL if the
 * queue is empty.
 */
static struct g_raid3_event *
g_raid3_event_get(struct g_raid3_softc *sc)
{
	struct g_raid3_event *ep;

	mtx_lock(&sc->sc_events_mtx);
	ep = TAILQ_FIRST(&sc->sc_events);
	mtx_unlock(&sc->sc_events_mtx);
	return (ep);
}
307133819Stjr
/*
 * Unlink a previously fetched event from the queue.
 */
static void
g_raid3_event_remove(struct g_raid3_softc *sc, struct g_raid3_event *ep)
{

	mtx_lock(&sc->sc_events_mtx);
	TAILQ_REMOVE(&sc->sc_events, ep, e_next);
	mtx_unlock(&sc->sc_events_mtx);
}
316133819Stjr
317133819Stjrstatic void
318133819Stjrg_raid3_event_cancel(struct g_raid3_disk *disk)
319133819Stjr{
320133819Stjr	struct g_raid3_softc *sc;
321133819Stjr	struct g_raid3_event *ep, *tmpep;
322133819Stjr
323133819Stjr	g_topology_assert();
324133819Stjr
325133819Stjr	sc = disk->d_softc;
326133819Stjr	mtx_lock(&sc->sc_events_mtx);
327133819Stjr	TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) {
328133819Stjr		if ((ep->e_flags & G_RAID3_EVENT_DEVICE) != 0)
329133819Stjr			continue;
330133819Stjr		if (ep->e_disk != disk)
331133819Stjr			continue;
332133819Stjr		TAILQ_REMOVE(&sc->sc_events, ep, e_next);
333133819Stjr		if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0)
334133819Stjr			g_raid3_event_free(ep);
335133819Stjr		else {
336133819Stjr			ep->e_error = ECANCELED;
337133819Stjr			wakeup(ep);
338133819Stjr		}
339133819Stjr	}
340133819Stjr	mtx_unlock(&sc->sc_events_mtx);
341133819Stjr}
342133819Stjr
343133819Stjr/*
344133819Stjr * Return the number of disks in the given state.
345133819Stjr * If state is equal to -1, count all connected disks.
346133819Stjr */
347133819Stjru_int
348133819Stjrg_raid3_ndisks(struct g_raid3_softc *sc, int state)
349133819Stjr{
350133819Stjr	struct g_raid3_disk *disk;
351133819Stjr	u_int n, ndisks;
352133819Stjr
353133819Stjr	for (n = ndisks = 0; n < sc->sc_ndisks; n++) {
354133819Stjr		disk = &sc->sc_disks[n];
355133819Stjr		if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
356133819Stjr			continue;
357133819Stjr		if (state == -1 || disk->d_state == state)
358133819Stjr			ndisks++;
359133819Stjr	}
360133819Stjr	return (ndisks);
361133819Stjr}
362133819Stjr
363133819Stjrstatic u_int
364133819Stjrg_raid3_nrequests(struct g_raid3_softc *sc, struct g_consumer *cp)
365133819Stjr{
366133819Stjr	struct bio *bp;
367133819Stjr	u_int nreqs = 0;
368133819Stjr
369133819Stjr	mtx_lock(&sc->sc_queue_mtx);
370133819Stjr	TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) {
371133819Stjr		if (bp->bio_from == cp)
372133819Stjr			nreqs++;
373161309Snetchild	}
374161309Snetchild	mtx_unlock(&sc->sc_queue_mtx);
375161309Snetchild	return (nreqs);
376133819Stjr}
377133819Stjr
378133819Stjrstatic int
379133819Stjrg_raid3_is_busy(struct g_raid3_softc *sc, struct g_consumer *cp)
380133819Stjr{
381133819Stjr
382133819Stjr	if (cp->index > 0) {
383133819Stjr		G_RAID3_DEBUG(2,
384133819Stjr		    "I/O requests for %s exist, can't destroy it now.",
385133819Stjr		    cp->provider->name);
386133819Stjr		return (1);
387133819Stjr	}
388133819Stjr	if (g_raid3_nrequests(sc, cp) > 0) {
389133819Stjr		G_RAID3_DEBUG(2,
390133819Stjr		    "I/O requests for %s in queue, can't destroy it now.",
391133819Stjr		    cp->provider->name);
392133819Stjr		return (1);
393133819Stjr	}
394133819Stjr	return (0);
395133819Stjr}
396133819Stjr
/*
 * Deferred-event callback: detach and destroy a consumer (scheduled
 * from g_raid3_kill_consumer() after a retaste event).
 */
static void
g_raid3_destroy_consumer(void *arg, int flags __unused)
{
	struct g_consumer *cp;

	cp = arg;
	G_RAID3_DEBUG(1, "Consumer %s destroyed.", cp->provider->name);
	g_detach(cp);
	g_destroy_consumer(cp);
}
407133819Stjr
/*
 * Close and destroy a consumer, unless it is still busy
 * (g_raid3_is_busy()), in which case it is left for later cleanup.
 * If dropping our write access will trigger a retaste of the provider,
 * the detach/destroy is deferred via g_post_event() -- see the comment
 * below for why.
 */
static void
g_raid3_kill_consumer(struct g_raid3_softc *sc, struct g_consumer *cp)
{
	struct g_provider *pp;
	int retaste_wait;

	g_topology_assert();

	cp->private = NULL;
	if (g_raid3_is_busy(sc, cp))
		return;
	G_RAID3_DEBUG(2, "Consumer %s destroyed.", cp->provider->name);
	pp = cp->provider;
	retaste_wait = 0;
	/* Closing the last write access triggers a retaste of pp. */
	if (cp->acw == 1) {
		if ((pp->geom->flags & G_GEOM_WITHER) == 0)
			retaste_wait = 1;
	}
	G_RAID3_DEBUG(2, "Access %s r%dw%de%d = %d", pp->name, -cp->acr,
	    -cp->acw, -cp->ace, 0);
	if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0)
		g_access(cp, -cp->acr, -cp->acw, -cp->ace);
	if (retaste_wait) {
		/*
		 * After retaste event was send (inside g_access()), we can send
		 * event to detach and destroy consumer.
		 * A class, which has consumer to the given provider connected
		 * will not receive retaste event for the provider.
		 * This is the way how I ignore retaste events when I close
		 * consumers opened for write: I detach and destroy consumer
		 * after retaste event is sent.
		 */
		g_post_event(g_raid3_destroy_consumer, cp, M_WAITOK, NULL);
		return;
	}
	G_RAID3_DEBUG(1, "Consumer %s destroyed.", pp->name);
	g_detach(cp);
	g_destroy_consumer(cp);
}
447133819Stjr
/*
 * Create a consumer for 'disk', attach it to provider 'pp' and open it
 * r1w1e1.  Returns 0 on success or the g_attach()/g_access() error.
 * NOTE(review): on failure the (possibly attached) consumer is left in
 * place for the caller to clean up -- g_raid3_init_disk()'s fail path
 * calls g_raid3_disconnect_consumer() for this.
 */
static int
g_raid3_connect_disk(struct g_raid3_disk *disk, struct g_provider *pp)
{
	int error;

	g_topology_assert();
	KASSERT(disk->d_consumer == NULL,
	    ("Disk already connected (device %s).", disk->d_softc->sc_name));

	disk->d_consumer = g_new_consumer(disk->d_softc->sc_geom);
	disk->d_consumer->private = disk;
	disk->d_consumer->index = 0;	/* No I/O in flight yet. */
	error = g_attach(disk->d_consumer, pp);
	if (error != 0)
		return (error);
	error = g_access(disk->d_consumer, 1, 1, 1);
	if (error != 0) {
		G_RAID3_DEBUG(0, "Cannot open consumer %s (error=%d).",
		    pp->name, error);
		return (error);
	}
	G_RAID3_DEBUG(2, "Disk %s connected.", g_raid3_get_diskname(disk));
	return (0);
}
472133819Stjr
473133819Stjrstatic void
474133819Stjrg_raid3_disconnect_consumer(struct g_raid3_softc *sc, struct g_consumer *cp)
475133819Stjr{
476133819Stjr
477133819Stjr	g_topology_assert();
478133819Stjr
479133819Stjr	if (cp == NULL)
480133819Stjr		return;
481133819Stjr	if (cp->provider != NULL)
482133819Stjr		g_raid3_kill_consumer(sc, cp);
483133819Stjr	else
484133819Stjr		g_destroy_consumer(cp);
485133819Stjr}
486133819Stjr
487133819Stjr/*
488133819Stjr * Initialize disk. This means allocate memory, create consumer, attach it
489133819Stjr * to the provider and open access (r1w1e1) to it.
490133819Stjr */
/*
 * Set up the disk slot md->md_no from metadata read off provider 'pp':
 * connect a consumer and seed the disk fields from 'md'.  On failure
 * the consumer is torn down, *errorp (if given) receives the error and
 * NULL is returned.
 */
static struct g_raid3_disk *
g_raid3_init_disk(struct g_raid3_softc *sc, struct g_provider *pp,
    struct g_raid3_metadata *md, int *errorp)
{
	struct g_raid3_disk *disk;
	int error;

	disk = &sc->sc_disks[md->md_no];
	error = g_raid3_connect_disk(disk, pp);
	if (error != 0)
		goto fail;
	disk->d_state = G_RAID3_DISK_STATE_NONE;
	disk->d_flags = md->md_dflags;
	/* A non-empty provider name means this component is hardcoded. */
	if (md->md_provider[0] != '\0')
		disk->d_flags |= G_RAID3_DISK_FLAG_HARDCODED;
	disk->d_sync.ds_consumer = NULL;
	disk->d_sync.ds_offset = md->md_sync_offset;
	disk->d_sync.ds_offset_done = md->md_sync_offset;
	disk->d_sync.ds_resync = -1;
	disk->d_genid = md->md_genid;
	disk->d_sync.ds_syncid = md->md_syncid;
	if (errorp != NULL)
		*errorp = 0;
	return (disk);
fail:
	if (errorp != NULL)
		*errorp = error;
	/* 'disk' is always non-NULL here; the check is defensive. */
	if (disk != NULL)
		g_raid3_disconnect_consumer(sc, disk->d_consumer);
	return (NULL);
}
522133819Stjr
/*
 * Disconnect a disk and reset its slot to NODISK.  Pending events for
 * the disk are cancelled; if the disk was synchronizing, the sync is
 * stopped first.
 */
static void
g_raid3_destroy_disk(struct g_raid3_disk *disk)
{
	struct g_raid3_softc *sc;

	g_topology_assert();

	if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
		return;
	g_raid3_event_cancel(disk);
	sc = disk->d_softc;
	switch (disk->d_state) {
	case G_RAID3_DISK_STATE_SYNCHRONIZING:
		if (sc->sc_syncdisk != NULL)
			g_raid3_sync_stop(sc, 1);
		/* FALLTHROUGH */
	case G_RAID3_DISK_STATE_NEW:
	case G_RAID3_DISK_STATE_STALE:
	case G_RAID3_DISK_STATE_ACTIVE:
		g_raid3_disconnect_consumer(sc, disk->d_consumer);
		disk->d_consumer = NULL;
		break;
	default:
		/* Any other state at this point is a programming error. */
		KASSERT(0 == 1, ("Wrong disk state (%s, %s).",
		    g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
	}
	disk->d_state = G_RAID3_DISK_STATE_NODISK;
}
552133819Stjr
/*
 * Tear down the whole device: destroy the provider and every disk,
 * drain pending events (waking sleepers with ECANCELED), release the
 * sync geom, UMA zones and mutexes, then wither the geom itself.
 */
static void
g_raid3_destroy_device(struct g_raid3_softc *sc)
{
	struct g_raid3_event *ep;
	struct g_raid3_disk *disk;
	struct g_geom *gp;
	struct g_consumer *cp;
	u_int n;

	g_topology_assert();

	gp = sc->sc_geom;
	if (sc->sc_provider != NULL)
		g_raid3_destroy_provider(sc);
	for (n = 0; n < sc->sc_ndisks; n++) {
		disk = &sc->sc_disks[n];
		if (disk->d_state != G_RAID3_DISK_STATE_NODISK) {
			/* Leave the component marked clean on disk. */
			disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
			g_raid3_update_metadata(disk);
			g_raid3_destroy_disk(disk);
		}
	}
	/* Flush remaining events; complete waited-on ones with ECANCELED. */
	while ((ep = g_raid3_event_get(sc)) != NULL) {
		g_raid3_event_remove(sc, ep);
		if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0)
			g_raid3_event_free(ep);
		else {
			ep->e_error = ECANCELED;
			ep->e_flags |= G_RAID3_EVENT_DONE;
			G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, ep);
			mtx_lock(&sc->sc_events_mtx);
			wakeup(ep);
			mtx_unlock(&sc->sc_events_mtx);
		}
	}
	callout_drain(&sc->sc_callout);
	gp->softc = NULL;
	/* The synchronization geom has at most one consumer. */
	cp = LIST_FIRST(&sc->sc_sync.ds_geom->consumer);
	if (cp != NULL)
		g_raid3_disconnect_consumer(sc, cp);
	sc->sc_sync.ds_geom->softc = NULL;
	g_wither_geom(sc->sc_sync.ds_geom, ENXIO);
	uma_zdestroy(sc->sc_zone_64k);
	uma_zdestroy(sc->sc_zone_16k);
	uma_zdestroy(sc->sc_zone_4k);
	mtx_destroy(&sc->sc_queue_mtx);
	mtx_destroy(&sc->sc_events_mtx);
	G_RAID3_DEBUG(0, "Device %s destroyed.", gp->name);
	g_wither_geom(gp, ENXIO);
}
603133819Stjr
604133819Stjrstatic void
605133819Stjrg_raid3_orphan(struct g_consumer *cp)
606133819Stjr{
607133819Stjr	struct g_raid3_disk *disk;
608133819Stjr
609133819Stjr	g_topology_assert();
610133819Stjr
611133819Stjr	disk = cp->private;
612133819Stjr	if (disk == NULL)
613133819Stjr		return;
614133819Stjr	disk->d_softc->sc_bump_id = G_RAID3_BUMP_SYNCID_OFW;
615133819Stjr	g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED,
616133819Stjr	    G_RAID3_EVENT_DONTWAIT);
617133819Stjr}
618133819Stjr
619133819Stjrstatic void
620133819Stjrg_raid3_spoiled(struct g_consumer *cp)
621133819Stjr{
622133819Stjr	struct g_raid3_disk *disk;
623133819Stjr
624133819Stjr	g_topology_assert();
625133819Stjr
626133819Stjr	disk = cp->private;
627133819Stjr	if (disk == NULL)
628133819Stjr		return;
629133819Stjr	disk->d_softc->sc_bump_id = G_RAID3_BUMP_SYNCID_IMM;
630133819Stjr	g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED,
631133819Stjr	    G_RAID3_EVENT_DONTWAIT);
632133819Stjr}
633133819Stjr
/*
 * Write metadata to the disk's last sector; md == NULL writes a
 * zero-filled sector (clearing the metadata).  On write failure the
 * disk is scheduled for disconnection and an immediate genid bump is
 * requested.  Returns the g_write_data() error status.
 */
static int
g_raid3_write_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md)
{
	struct g_raid3_softc *sc;
	struct g_consumer *cp;
	off_t offset, length;
	u_char *sector;
	int error = 0;

	g_topology_assert();

	sc = disk->d_softc;
	cp = disk->d_consumer;
	KASSERT(cp != NULL, ("NULL consumer (%s).", sc->sc_name));
	KASSERT(cp->provider != NULL, ("NULL provider (%s).", sc->sc_name));
	KASSERT(cp->acr == 1 && cp->acw == 1 && cp->ace == 1,
	    ("Consumer %s closed? (r%dw%de%d).", cp->provider->name, cp->acr,
	    cp->acw, cp->ace));
	/* Metadata lives in the very last sector of the component. */
	length = cp->provider->sectorsize;
	offset = cp->provider->mediasize - length;
	sector = malloc((size_t)length, M_RAID3, M_WAITOK | M_ZERO);
	if (md != NULL)
		raid3_metadata_encode(md, sector);
	/* g_write_data() may sleep; drop the topology lock around it. */
	g_topology_unlock();
	error = g_write_data(cp, offset, sector, length);
	g_topology_lock();
	free(sector, M_RAID3);
	if (error != 0) {
		disk->d_softc->sc_bump_id = G_RAID3_BUMP_GENID_IMM;
		g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED,
		    G_RAID3_EVENT_DONTWAIT);
	}
	return (error);
}
668133819Stjr
669133819Stjrint
670133819Stjrg_raid3_clear_metadata(struct g_raid3_disk *disk)
671133819Stjr{
672133819Stjr	int error;
673133819Stjr
674133819Stjr	g_topology_assert();
675133819Stjr	error = g_raid3_write_metadata(disk, NULL);
676133819Stjr	if (error == 0) {
677133819Stjr		G_RAID3_DEBUG(2, "Metadata on %s cleared.",
678133819Stjr		    g_raid3_get_diskname(disk));
679133819Stjr	} else {
680161309Snetchild		G_RAID3_DEBUG(0,
681161309Snetchild		    "Cannot clear metadata on disk %s (error=%d).",
682161309Snetchild		    g_raid3_get_diskname(disk), error);
683133819Stjr	}
684133819Stjr	return (error);
685133819Stjr}
686133819Stjr
/*
 * Populate 'md' with the current metadata for 'disk': device-wide
 * fields are taken from the softc, per-disk fields from the disk
 * itself.
 */
void
g_raid3_fill_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md)
{
	struct g_raid3_softc *sc;

	sc = disk->d_softc;
	strlcpy(md->md_magic, G_RAID3_MAGIC, sizeof(md->md_magic));
	md->md_version = G_RAID3_VERSION;
	strlcpy(md->md_name, sc->sc_name, sizeof(md->md_name));
	md->md_id = sc->sc_id;
	md->md_all = sc->sc_ndisks;
	md->md_genid = sc->sc_genid;
	md->md_mediasize = sc->sc_mediasize;
	md->md_sectorsize = sc->sc_sectorsize;
	md->md_mflags = (sc->sc_flags & G_RAID3_DEVICE_FLAG_MASK);
	md->md_no = disk->d_no;
	md->md_syncid = disk->d_sync.ds_syncid;
	md->md_dflags = (disk->d_flags & G_RAID3_DISK_FLAG_MASK);
	/* Only a synchronizing disk has a meaningful sync offset. */
	if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING)
		md->md_sync_offset = disk->d_sync.ds_offset_done;
	else
		md->md_sync_offset = 0;
	/* Store the provider name only for hardcoded components. */
	if ((disk->d_flags & G_RAID3_DISK_FLAG_HARDCODED) != 0 &&
	    disk->d_consumer != NULL && disk->d_consumer->provider != NULL) {
		strlcpy(md->md_provider, disk->d_consumer->provider->name,
		    sizeof(md->md_provider));
	} else {
		bzero(md->md_provider, sizeof(md->md_provider));
	}
}
717133819Stjr
718133819Stjrvoid
719161309Snetchildg_raid3_update_metadata(struct g_raid3_disk *disk)
720161309Snetchild{
721161309Snetchild	struct g_raid3_metadata md;
722161309Snetchild	int error;
723161309Snetchild
724161309Snetchild	g_topology_assert();
725161309Snetchild	g_raid3_fill_metadata(disk, &md);
726161309Snetchild	error = g_raid3_write_metadata(disk, &md);
727161309Snetchild	if (error == 0) {
728161309Snetchild		G_RAID3_DEBUG(2, "Metadata on %s updated.",
729161309Snetchild		    g_raid3_get_diskname(disk));
730161309Snetchild	} else {
731133819Stjr		G_RAID3_DEBUG(0,
732133819Stjr		    "Cannot update metadata on disk %s (error=%d).",
733133819Stjr		    g_raid3_get_diskname(disk), error);
734161309Snetchild	}
735161309Snetchild}
736161309Snetchild
/*
 * Increment the device syncid and flush it to every disk that is
 * ACTIVE or SYNCHRONIZING.  Must only be called with at least one
 * active disk.
 */
static void
g_raid3_bump_syncid(struct g_raid3_softc *sc)
{
	struct g_raid3_disk *disk;
	u_int n;

	g_topology_assert();
	KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) > 0,
	    ("%s called with no active disks (device=%s).", __func__,
	    sc->sc_name));

	sc->sc_syncid++;
	G_RAID3_DEBUG(1, "Device %s: syncid bumped to %u.", sc->sc_name,
	    sc->sc_syncid);
	for (n = 0; n < sc->sc_ndisks; n++) {
		disk = &sc->sc_disks[n];
		if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
		    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
			disk->d_sync.ds_syncid = sc->sc_syncid;
			g_raid3_update_metadata(disk);
		}
	}
}
760159801Snetchild
761159801Snetchildstatic void
762159801Snetchildg_raid3_bump_genid(struct g_raid3_softc *sc)
763159801Snetchild{
764159801Snetchild	struct g_raid3_disk *disk;
765159801Snetchild	u_int n;
766159801Snetchild
767159801Snetchild	g_topology_assert();
768159801Snetchild	KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) > 0,
769159801Snetchild	    ("%s called with no active disks (device=%s).", __func__,
770159801Snetchild	    sc->sc_name));
771161309Snetchild
772161309Snetchild	sc->sc_genid++;
773159801Snetchild	G_RAID3_DEBUG(1, "Device %s: genid bumped to %u.", sc->sc_name,
774159801Snetchild	    sc->sc_genid);
775161309Snetchild	for (n = 0; n < sc->sc_ndisks; n++) {
776161309Snetchild		disk = &sc->sc_disks[n];
777159801Snetchild		if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
778159801Snetchild		    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
779161309Snetchild			disk->d_genid = sc->sc_genid;
780161309Snetchild			g_raid3_update_metadata(disk);
781159801Snetchild		}
782159801Snetchild	}
783161309Snetchild}
784161309Snetchild
785161309Snetchildstatic void
786161309Snetchildg_raid3_idle(struct g_raid3_softc *sc)
787159801Snetchild{
788159801Snetchild	struct g_raid3_disk *disk;
789161666Snetchild	u_int i;
790161666Snetchild
791159801Snetchild	if (sc->sc_provider == NULL || sc->sc_provider->acw == 0)
792159801Snetchild		return;
793159801Snetchild	sc->sc_idle = 1;
794159801Snetchild	g_topology_lock();
795159801Snetchild	for (i = 0; i < sc->sc_ndisks; i++) {
796161309Snetchild		disk = &sc->sc_disks[i];
797161309Snetchild		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
798161309Snetchild			continue;
799159801Snetchild		G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.",
800159801Snetchild		    g_raid3_get_diskname(disk), sc->sc_name);
801159801Snetchild		disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
802159801Snetchild		g_raid3_update_metadata(disk);
803159801Snetchild	}
804159801Snetchild	g_topology_unlock();
805159801Snetchild}
806159801Snetchild
807159801Snetchildstatic void
808159801Snetchildg_raid3_unidle(struct g_raid3_softc *sc)
809159801Snetchild{
810159801Snetchild	struct g_raid3_disk *disk;
811159801Snetchild	u_int i;
812159801Snetchild
813159801Snetchild	sc->sc_idle = 0;
814159801Snetchild	g_topology_lock();
815159801Snetchild	for (i = 0; i < sc->sc_ndisks; i++) {
816159801Snetchild		disk = &sc->sc_disks[i];
817159801Snetchild		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
818159801Snetchild			continue;
819159801Snetchild		G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.",
820159801Snetchild		    g_raid3_get_diskname(disk), sc->sc_name);
821159801Snetchild		disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
822159801Snetchild		g_raid3_update_metadata(disk);
823159801Snetchild	}
824159801Snetchild	g_topology_unlock();
825159801Snetchild}
826159801Snetchild
827159801Snetchild/*
828159801Snetchild * Return 1 if we should check if RAID3 device is idling.
829159801Snetchild */
830159801Snetchildstatic int
831159801Snetchildg_raid3_check_idle(struct g_raid3_softc *sc)
832159801Snetchild{
833159801Snetchild	struct g_raid3_disk *disk;
834159801Snetchild	u_int i;
835159801Snetchild
836159801Snetchild	if (sc->sc_idle)
837159801Snetchild		return (0);
838159801Snetchild	if (sc->sc_provider != NULL && sc->sc_provider->acw == 0)
839159801Snetchild		return (0);
840159801Snetchild	/*
841159801Snetchild	 * Check if there are no in-flight requests.
842159801Snetchild	 */
843159801Snetchild	for (i = 0; i < sc->sc_ndisks; i++) {
844159801Snetchild		disk = &sc->sc_disks[i];
845159801Snetchild		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
846159801Snetchild			continue;
847159801Snetchild		if (disk->d_consumer->index > 0)
848159801Snetchild			return (0);
849159801Snetchild	}
850159801Snetchild	return (1);
851159801Snetchild}
852159801Snetchild
/*
 * Treat bio_driver1 field in parent bio as list head and field bio_caller1
 * in child bio as pointer to the next element on the list.
 */
#define	G_RAID3_HEAD_BIO(pbp)	(pbp)->bio_driver1

#define	G_RAID3_NEXT_BIO(cbp)	(cbp)->bio_caller1

/* Iterate over all cloned bios hanging off parent bio 'pbp'. */
#define	G_RAID3_FOREACH_BIO(pbp, bp)					\
	for ((bp) = G_RAID3_HEAD_BIO(pbp); (bp) != NULL;		\
	    (bp) = G_RAID3_NEXT_BIO(bp))

/* Deletion-safe variant: the current element may be destroyed in the body. */
#define	G_RAID3_FOREACH_SAFE_BIO(pbp, bp, tmpbp)			\
	for ((bp) = G_RAID3_HEAD_BIO(pbp);				\
	    (bp) != NULL && ((tmpbp) = G_RAID3_NEXT_BIO(bp), 1);	\
	    (bp) = (tmpbp))
869159801Snetchild
/*
 * Initialize the parent bio's list of cloned bios to empty.
 */
static void
g_raid3_init_bio(struct bio *pbp)
{

	G_RAID3_HEAD_BIO(pbp) = NULL;
}
876159801Snetchild
877159801Snetchildstatic void
878159801Snetchildg_raid3_remove_bio(struct bio *cbp)
879159801Snetchild{
880159801Snetchild	struct bio *pbp, *bp;
881159801Snetchild
882159801Snetchild	pbp = cbp->bio_parent;
883159801Snetchild	if (G_RAID3_HEAD_BIO(pbp) == cbp)
884159801Snetchild		G_RAID3_HEAD_BIO(pbp) = G_RAID3_NEXT_BIO(cbp);
885159801Snetchild	else {
886159801Snetchild		G_RAID3_FOREACH_BIO(pbp, bp) {
887159801Snetchild			if (G_RAID3_NEXT_BIO(bp) == cbp) {
888159801Snetchild				G_RAID3_NEXT_BIO(bp) = G_RAID3_NEXT_BIO(cbp);
889159801Snetchild				break;
890159801Snetchild			}
891159801Snetchild		}
892159801Snetchild	}
893159801Snetchild	G_RAID3_NEXT_BIO(cbp) = NULL;
894159801Snetchild}
895159801Snetchild
/*
 * Put cloned bio 'sbp' in the list position currently occupied by 'dbp':
 * sbp is first removed from its own position, inherits dbp's successor,
 * and dbp's predecessor (or the list head) is relinked to point at sbp.
 * dbp itself is unlinked but not freed.
 */
static void
g_raid3_replace_bio(struct bio *sbp, struct bio *dbp)
{
	struct bio *pbp, *bp;

	g_raid3_remove_bio(sbp);
	pbp = dbp->bio_parent;
	G_RAID3_NEXT_BIO(sbp) = G_RAID3_NEXT_BIO(dbp);
	if (G_RAID3_HEAD_BIO(pbp) == dbp)
		G_RAID3_HEAD_BIO(pbp) = sbp;
	else {
		G_RAID3_FOREACH_BIO(pbp, bp) {
			if (G_RAID3_NEXT_BIO(bp) == dbp) {
				G_RAID3_NEXT_BIO(bp) = sbp;
				break;
			}
		}
	}
	G_RAID3_NEXT_BIO(dbp) = NULL;
}
916133819Stjr
917133819Stjrstatic void
918133819Stjrg_raid3_destroy_bio(struct g_raid3_softc *sc, struct bio *cbp)
919133819Stjr{
920133819Stjr	struct bio *bp, *pbp;
921133819Stjr	size_t size;
922133819Stjr
923133819Stjr	pbp = cbp->bio_parent;
924133819Stjr	pbp->bio_children--;
925133819Stjr	KASSERT(cbp->bio_data != NULL, ("NULL bio_data"));
926133819Stjr	size = pbp->bio_length / (sc->sc_ndisks - 1);
927156919Snetchild	if (size > 16384)
928133819Stjr		uma_zfree(sc->sc_zone_64k, cbp->bio_data);
929133819Stjr	else if (size > 4096)
930133819Stjr		uma_zfree(sc->sc_zone_16k, cbp->bio_data);
931133819Stjr	else
932133819Stjr		uma_zfree(sc->sc_zone_4k, cbp->bio_data);
933133819Stjr	if (G_RAID3_HEAD_BIO(pbp) == cbp) {
934133819Stjr		G_RAID3_HEAD_BIO(pbp) = G_RAID3_NEXT_BIO(cbp);
935133819Stjr		G_RAID3_NEXT_BIO(cbp) = NULL;
936133819Stjr		g_destroy_bio(cbp);
937133819Stjr	} else {
938133819Stjr		G_RAID3_FOREACH_BIO(pbp, bp) {
939133819Stjr			if (G_RAID3_NEXT_BIO(bp) == cbp)
940133819Stjr				break;
941133819Stjr		}
942133819Stjr		if (bp != NULL) {
943133819Stjr			KASSERT(G_RAID3_NEXT_BIO(bp) != NULL,
944133819Stjr			    ("NULL bp->bio_driver1"));
945133819Stjr			G_RAID3_NEXT_BIO(bp) = G_RAID3_NEXT_BIO(cbp);
946133819Stjr			G_RAID3_NEXT_BIO(cbp) = NULL;
947133819Stjr		}
948133819Stjr		g_destroy_bio(cbp);
949133819Stjr	}
950133819Stjr}
951133819Stjr
952133819Stjrstatic struct bio *
953133819Stjrg_raid3_clone_bio(struct g_raid3_softc *sc, struct bio *pbp)
954133819Stjr{
955133819Stjr	struct bio *bp, *cbp;
956133819Stjr	size_t size;
957133819Stjr
958161309Snetchild	cbp = g_clone_bio(pbp);
959133819Stjr	if (cbp == NULL)
960133819Stjr		return (NULL);
961133819Stjr	size = pbp->bio_length / (sc->sc_ndisks - 1);
962133819Stjr	if (size > 16384) {
963133819Stjr		cbp->bio_data = uma_zalloc(sc->sc_zone_64k, M_NOWAIT);
964133819Stjr		g_raid3_64k_requested++;
965133819Stjr	} else if (size > 4096) {
966156843Snetchild		cbp->bio_data = uma_zalloc(sc->sc_zone_16k, M_NOWAIT);
967133819Stjr		g_raid3_16k_requested++;
968133819Stjr	} else {
969133819Stjr		cbp->bio_data = uma_zalloc(sc->sc_zone_4k, M_NOWAIT);
970133819Stjr		g_raid3_4k_requested++;
971133819Stjr	}
972133819Stjr	if (cbp->bio_data == NULL) {
973133819Stjr		if (size > 16384)
974133819Stjr			g_raid3_64k_failed++;
975133819Stjr		if (size > 4096)
976156919Snetchild			g_raid3_16k_failed++;
977133819Stjr		else
978133819Stjr			g_raid3_4k_failed++;
979133819Stjr		pbp->bio_children--;
980133819Stjr		g_destroy_bio(cbp);
981133819Stjr		return (NULL);
982156843Snetchild	}
983147142Ssobomax	G_RAID3_NEXT_BIO(cbp) = NULL;
984133819Stjr	if (G_RAID3_HEAD_BIO(pbp) == NULL)
985133819Stjr		G_RAID3_HEAD_BIO(pbp) = cbp;
986133819Stjr	else {
987133819Stjr		G_RAID3_FOREACH_BIO(pbp, bp) {
988133819Stjr			if (G_RAID3_NEXT_BIO(bp) == NULL) {
989133819Stjr				G_RAID3_NEXT_BIO(bp) = cbp;
990133819Stjr				break;
991133819Stjr			}
992133819Stjr		}
993133819Stjr	}
994133819Stjr	return (cbp);
995133819Stjr}
996133819Stjr
/*
 * Distribute a write request among the components: copy the parent's
 * data into the per-disk cloned bios, compute the parity component
 * (unless the parent is marked NOPARITY) and send every remaining
 * clone to its consumer.
 */
static void
g_raid3_scatter(struct bio *pbp)
{
	struct g_raid3_softc *sc;
	struct g_raid3_disk *disk;
	struct bio *bp, *cbp;
	off_t atom, cadd, padd, left;

	sc = pbp->bio_to->geom->softc;
	bp = NULL;
	if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_NOPARITY) == 0) {
		/*
		 * Find bio for which we should calculate data.
		 */
		G_RAID3_FOREACH_BIO(pbp, cbp) {
			if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) {
				bp = cbp;
				break;
			}
		}
		KASSERT(bp != NULL, ("NULL parity bio."));
	}
	/* A sector is split into (ndisks - 1) equal chunks ("atoms"). */
	atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
	cadd = padd = 0;
	/* Scatter the parent's data over the data components, sector by sector. */
	for (left = pbp->bio_length; left > 0; left -= sc->sc_sectorsize) {
		G_RAID3_FOREACH_BIO(pbp, cbp) {
			if (cbp == bp)
				continue;
			bcopy(pbp->bio_data + padd, cbp->bio_data + cadd, atom);
			padd += atom;
		}
		cadd += atom;
	}
	if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_NOPARITY) == 0) {
		struct bio *tmpbp;

		/*
		 * Calculate parity: XOR of all data components.  Clones
		 * flagged NODISK have no backing disk; they are destroyed
		 * as soon as their data has been folded into the parity.
		 */
		bzero(bp->bio_data, bp->bio_length);
		G_RAID3_FOREACH_SAFE_BIO(pbp, cbp, tmpbp) {
			if (cbp == bp)
				continue;
			g_raid3_xor(cbp->bio_data, bp->bio_data, bp->bio_data,
			    bp->bio_length);
			if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_NODISK) != 0)
				g_raid3_destroy_bio(sc, cbp);
		}
	}
	/* Dispatch the remaining clones to their consumers. */
	G_RAID3_FOREACH_BIO(pbp, cbp) {
		struct g_consumer *cp;

		disk = cbp->bio_caller2;
		cp = disk->d_consumer;
		cbp->bio_to = cp->provider;
		G_RAID3_LOGREQ(3, cbp, "Sending request.");
		KASSERT(cp->acr == 1 && cp->acw == 1 && cp->ace == 1,
		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
		    cp->acr, cp->acw, cp->ace));
		cp->index++;
		g_io_request(cbp, cp);
	}
}
1060133819Stjr
1061133819Stjrstatic void
1062133819Stjrg_raid3_gather(struct bio *pbp)
1063133819Stjr{
1064133819Stjr	struct g_raid3_softc *sc;
1065133819Stjr	struct g_raid3_disk *disk;
1066133819Stjr	struct bio *xbp, *fbp, *cbp;
1067133819Stjr	off_t atom, cadd, padd, left;
1068133819Stjr
1069133819Stjr	sc = pbp->bio_to->geom->softc;
1070133819Stjr	/*
1071133819Stjr	 * Find bio for which we have to calculate data.
1072133819Stjr	 * While going through this path, check if all requests
1073161309Snetchild	 * succeeded, if not, deny whole request.
1074133819Stjr	 * If we're in COMPLETE mode, we allow one request to fail,
1075133819Stjr	 * so if we find one, we're sending it to the parity consumer.
1076133819Stjr	 * If there are more failed requests, we deny whole request.
1077133819Stjr	 */
1078133819Stjr	xbp = fbp = NULL;
1079133819Stjr	G_RAID3_FOREACH_BIO(pbp, cbp) {
1080133819Stjr		if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) {
1081133819Stjr			KASSERT(xbp == NULL, ("More than one parity bio."));
1082133819Stjr			xbp = cbp;
1083133819Stjr		}
1084133819Stjr		if (cbp->bio_error == 0)
1085133819Stjr			continue;
1086161309Snetchild		/*
1087161309Snetchild		 * Found failed request.
1088133819Stjr		 */
1089161309Snetchild		G_RAID3_LOGREQ(0, cbp, "Request failed.");
1090159801Snetchild		disk = cbp->bio_caller2;
1091159801Snetchild		if (disk != NULL) {
1092159801Snetchild			/*
1093159801Snetchild			 * Actually this is pointless to bump genid,
1094159801Snetchild			 * because whole device is fucked up.
1095159801Snetchild			 */
1096159801Snetchild			sc->sc_bump_id |= G_RAID3_BUMP_GENID_IMM;
1097159801Snetchild			g_raid3_event_send(disk,
1098159801Snetchild			    G_RAID3_DISK_STATE_DISCONNECTED,
1099159801Snetchild			    G_RAID3_EVENT_DONTWAIT);
1100159801Snetchild		}
1101159801Snetchild		if (fbp == NULL) {
1102159801Snetchild			if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_DEGRADED) != 0) {
1103159801Snetchild				/*
1104159801Snetchild				 * We are already in degraded mode, so we can't
1105159801Snetchild				 * accept any failures.
1106159801Snetchild				 */
1107159801Snetchild				if (pbp->bio_error == 0)
1108159801Snetchild					pbp->bio_error = fbp->bio_error;
1109159801Snetchild			} else {
1110159801Snetchild				fbp = cbp;
1111159801Snetchild			}
1112159801Snetchild		} else {
1113159801Snetchild			/*
1114159801Snetchild			 * Next failed request, that's too many.
1115159801Snetchild			 */
1116159801Snetchild			if (pbp->bio_error == 0)
1117159801Snetchild				pbp->bio_error = fbp->bio_error;
1118159801Snetchild		}
1119159801Snetchild	}
1120159801Snetchild	if (pbp->bio_error != 0)
1121159801Snetchild		goto finish;
1122159801Snetchild	if (fbp != NULL && (pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) {
1123159801Snetchild		pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_VERIFY;
1124159801Snetchild		if (xbp != fbp)
1125159801Snetchild			g_raid3_replace_bio(xbp, fbp);
1126159801Snetchild		g_raid3_destroy_bio(sc, fbp);
1127159801Snetchild	} else if (fbp != NULL) {
1128159801Snetchild		struct g_consumer *cp;
1129159801Snetchild
1130159801Snetchild		/*
1131159801Snetchild		 * One request failed, so send the same request to
1132159801Snetchild		 * the parity consumer.
1133159801Snetchild		 */
1134159801Snetchild		disk = pbp->bio_driver2;
1135159801Snetchild		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) {
1136159801Snetchild			pbp->bio_error = fbp->bio_error;
1137159801Snetchild			goto finish;
1138159801Snetchild		}
1139159801Snetchild		pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED;
1140159801Snetchild		pbp->bio_inbed--;
1141159801Snetchild		fbp->bio_flags &= ~(BIO_DONE | BIO_ERROR);
1142159801Snetchild		if (disk->d_no == sc->sc_ndisks - 1)
1143159801Snetchild			fbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
1144159801Snetchild		fbp->bio_error = 0;
1145159801Snetchild		fbp->bio_completed = 0;
1146133819Stjr		fbp->bio_children = 0;
1147133819Stjr		fbp->bio_inbed = 0;
1148133819Stjr		cp = disk->d_consumer;
1149143198Ssobomax		fbp->bio_caller2 = disk;
1150133819Stjr		fbp->bio_to = cp->provider;
1151133819Stjr		G_RAID3_LOGREQ(3, fbp, "Sending request (recover).");
1152133819Stjr		KASSERT(cp->acr == 1 && cp->acw == 1 && cp->ace == 1,
1153133819Stjr		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
1154133819Stjr		    cp->acr, cp->acw, cp->ace));
1155133819Stjr		cp->index++;
1156143198Ssobomax		g_io_request(fbp, cp);
1157133819Stjr		return;
1158133819Stjr	}
1159133819Stjr	if (xbp != NULL) {
1160161330Sjhb		/*
1161161330Sjhb		 * Calculate parity.
1162161330Sjhb		 */
1163161330Sjhb		G_RAID3_FOREACH_BIO(pbp, cbp) {
1164161330Sjhb			if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0)
1165161330Sjhb				continue;
1166161330Sjhb			g_raid3_xor(cbp->bio_data, xbp->bio_data, xbp->bio_data,
1167161330Sjhb			    xbp->bio_length);
1168161330Sjhb		}
1169161330Sjhb		xbp->bio_cflags &= ~G_RAID3_BIO_CFLAG_PARITY;
1170161330Sjhb		if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) {
1171161330Sjhb			if (!g_raid3_is_zero(xbp)) {
1172161330Sjhb				g_raid3_parity_mismatch++;
1173161330Sjhb				pbp->bio_error = EIO;
1174161330Sjhb				goto finish;
1175161330Sjhb			}
1176161330Sjhb			g_raid3_destroy_bio(sc, xbp);
1177161330Sjhb		}
1178161330Sjhb	}
1179161330Sjhb	atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
1180161330Sjhb	cadd = padd = 0;
1181161330Sjhb	for (left = pbp->bio_length; left > 0; left -= sc->sc_sectorsize) {
1182161330Sjhb		G_RAID3_FOREACH_BIO(pbp, cbp) {
1183161330Sjhb			bcopy(cbp->bio_data + cadd, pbp->bio_data + padd, atom);
1184161330Sjhb			pbp->bio_completed += atom;
1185161330Sjhb			padd += atom;
1186161330Sjhb		}
1187161330Sjhb		cadd += atom;
1188161330Sjhb	}
1189161330Sjhbfinish:
1190161330Sjhb	if (pbp->bio_error == 0)
1191161330Sjhb		G_RAID3_LOGREQ(3, pbp, "Request finished.");
1192161330Sjhb	else {
1193161330Sjhb		if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0)
1194161330Sjhb			G_RAID3_LOGREQ(1, pbp, "Verification error.");
1195161330Sjhb		else
1196161330Sjhb			G_RAID3_LOGREQ(0, pbp, "Request failed.");
1197161330Sjhb	}
1198161330Sjhb	pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_MASK;
1199161330Sjhb	g_io_deliver(pbp, pbp->bio_error);
1200161330Sjhb	while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL)
1201161330Sjhb		g_raid3_destroy_bio(sc, cbp);
1202161330Sjhb}
1203161330Sjhb
/*
 * Completion callback for regular child requests: mark the bio as a
 * finished regular request and queue it for the worker thread.
 */
static void
g_raid3_done(struct bio *bp)
{
	struct g_raid3_softc *sc;

	sc = bp->bio_from->geom->softc;
	bp->bio_cflags |= G_RAID3_BIO_CFLAG_REGULAR;
	G_RAID3_LOGREQ(3, bp, "Regular request done (error=%d).", bp->bio_error);
	mtx_lock(&sc->sc_queue_mtx);
	bioq_insert_head(&sc->sc_queue, bp);
	wakeup(sc);
	wakeup(&sc->sc_queue);
	mtx_unlock(&sc->sc_queue_mtx);
}
1218161330Sjhb
/*
 * Process one completed regular child request.  Once the last child of
 * the parent bio has arrived, finish the parent: gather the data for
 * reads, or collect per-child errors and deliver the result for writes
 * and deletes.
 */
static void
g_raid3_regular_request(struct bio *cbp)
{
	struct g_raid3_softc *sc;
	struct g_raid3_disk *disk;
	struct bio *pbp;

	g_topology_assert_not();

	cbp->bio_from->index--;
	pbp = cbp->bio_parent;
	sc = pbp->bio_to->geom->softc;
	disk = cbp->bio_from->private;
	if (disk == NULL) {
		/* The consumer has no disk attached any more; drop it. */
		g_topology_lock();
		g_raid3_kill_consumer(sc, cbp->bio_from);
		g_topology_unlock();
	}

	G_RAID3_LOGREQ(3, cbp, "Request finished.");
	pbp->bio_inbed++;
	KASSERT(pbp->bio_inbed <= pbp->bio_children,
	    ("bio_inbed (%u) is bigger than bio_children (%u).", pbp->bio_inbed,
	    pbp->bio_children));
	/* Wait until all children have come back. */
	if (pbp->bio_inbed != pbp->bio_children)
		return;
	switch (pbp->bio_cmd) {
	case BIO_READ:
		g_raid3_gather(pbp);
		break;
	case BIO_WRITE:
	case BIO_DELETE:
	    {
		int error = 0;

		pbp->bio_completed = pbp->bio_length;
		while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL) {
			if (cbp->bio_error != 0) {
				/* Disconnect the component that failed. */
				disk = cbp->bio_caller2;
				if (disk != NULL) {
					sc->sc_bump_id |=
					    G_RAID3_BUMP_GENID_IMM;
					g_raid3_event_send(disk,
					    G_RAID3_DISK_STATE_DISCONNECTED,
					    G_RAID3_EVENT_DONTWAIT);
				}
				/*
				 * A single failure is tolerated (remembered
				 * in 'error'); only a second failure marks
				 * the parent bio as failed.
				 */
				if (error == 0)
					error = cbp->bio_error;
				else if (pbp->bio_error == 0) {
					/*
					 * Next failed request, that's too many.
					 */
					pbp->bio_error = error;
				}
			}
			g_raid3_destroy_bio(sc, cbp);
		}
		if (pbp->bio_error == 0)
			G_RAID3_LOGREQ(3, pbp, "Request finished.");
		else
			G_RAID3_LOGREQ(0, pbp, "Request failed.");
		pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_DEGRADED;
		pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_NOPARITY;
		g_io_deliver(pbp, pbp->bio_error);
		break;
	    }
	}
}
1287161330Sjhb
/*
 * Completion callback for synchronization requests: mark the bio as a
 * sync request and queue it for the worker thread.
 */
static void
g_raid3_sync_done(struct bio *bp)
{
	struct g_raid3_softc *sc;

	G_RAID3_LOGREQ(3, bp, "Synchronization request delivered.");
	sc = bp->bio_from->geom->softc;
	bp->bio_cflags |= G_RAID3_BIO_CFLAG_SYNC;
	mtx_lock(&sc->sc_queue_mtx);
	bioq_insert_head(&sc->sc_queue, bp);
	wakeup(sc);
	wakeup(&sc->sc_queue);
	mtx_unlock(&sc->sc_queue_mtx);
}
1302161330Sjhb
1303161330Sjhbstatic void
1304161330Sjhbg_raid3_start(struct bio *bp)
1305161330Sjhb{
1306161330Sjhb	struct g_raid3_softc *sc;
1307161330Sjhb
1308161330Sjhb	sc = bp->bio_to->geom->softc;
1309161330Sjhb	/*
1310161330Sjhb	 * If sc == NULL or there are no valid disks, provider's error
1311161330Sjhb	 * should be set and g_raid3_start() should not be called at all.
1312161330Sjhb	 */
1313161330Sjhb	KASSERT(sc != NULL && (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
1314161330Sjhb	    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE),
1315161330Sjhb	    ("Provider's error should be set (error=%d)(device=%s).",
1316161330Sjhb	    bp->bio_to->error, bp->bio_to->name));
1317161330Sjhb	G_RAID3_LOGREQ(3, bp, "Request received.");
1318161330Sjhb
1319161330Sjhb	switch (bp->bio_cmd) {
1320161330Sjhb	case BIO_READ:
1321161330Sjhb	case BIO_WRITE:
1322161330Sjhb	case BIO_DELETE:
1323161330Sjhb		break;
1324161330Sjhb	case BIO_GETATTR:
1325161330Sjhb	default:
1326161330Sjhb		g_io_deliver(bp, EOPNOTSUPP);
1327161330Sjhb		return;
1328161330Sjhb	}
1329161330Sjhb	mtx_lock(&sc->sc_queue_mtx);
1330161330Sjhb	bioq_insert_tail(&sc->sc_queue, bp);
1331161330Sjhb	G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc);
1332161330Sjhb	wakeup(sc);
1333161330Sjhb	mtx_unlock(&sc->sc_queue_mtx);
1334161330Sjhb}
1335161330Sjhb
/*
 * Send one synchronization request: read the next chunk from the RAID3
 * provider itself, to be condensed and written to the synchronizing
 * component once it completes (see g_raid3_sync_request()).
 */
static void
g_raid3_sync_one(struct g_raid3_softc *sc)
{
	struct g_raid3_disk *disk;
	struct bio *bp;

	KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED,
	    ("Wrong device state (%s, %s).", sc->sc_name,
	    g_raid3_device_state2str(sc->sc_state)));
	disk = sc->sc_syncdisk;
	KASSERT(disk != NULL, ("No sync disk (%s).", sc->sc_name));
	KASSERT(disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
	    ("Disk %s is not marked for synchronization.",
	    g_raid3_get_diskname(disk)));

	bp = g_new_bio();
	if (bp == NULL)
		return;
	bp->bio_parent = NULL;
	bp->bio_cmd = BIO_READ;
	/* ds_offset is a per-component offset; scale to provider space. */
	bp->bio_offset = disk->d_sync.ds_offset * (sc->sc_ndisks - 1);
	bp->bio_length = MIN(MAXPHYS, sc->sc_mediasize - bp->bio_offset);
	/* NOTE(review): bio_cflags is set again below; this clear is redundant. */
	bp->bio_cflags = 0;
	bp->bio_done = g_raid3_sync_done;
	bp->bio_data = disk->d_sync.ds_data;
	if (bp->bio_data == NULL) {
		g_destroy_bio(bp);
		return;
	}
	bp->bio_cflags = G_RAID3_BIO_CFLAG_REGSYNC;
	disk->d_sync.ds_offset += bp->bio_length / (sc->sc_ndisks - 1);
	bp->bio_to = sc->sc_provider;
	G_RAID3_LOGREQ(3, bp, "Sending synchronization request.");
	disk->d_sync.ds_consumer->index++;
	g_io_request(bp, disk->d_sync.ds_consumer);
}
1375161330Sjhb
/*
 * Handle a completed request on the synchronization consumer.
 *
 * BIO_READ: data read from the RAID3 provider is condensed in place
 * into this component's share (a plain copy of its chunks for a data
 * component, the XOR of all data chunks for the parity component) and
 * the same bio is re-issued as a BIO_WRITE to the synchronizing disk.
 *
 * BIO_WRITE: this range is now synchronized; record progress, activate
 * the disk when the whole media has been covered, and periodically
 * flush progress to the on-disk metadata.
 */
static void
g_raid3_sync_request(struct bio *bp)
{
	struct g_raid3_softc *sc;
	struct g_raid3_disk *disk;

	bp->bio_from->index--;
	sc = bp->bio_from->geom->softc;
	disk = bp->bio_from->private;
	if (disk == NULL) {
		/* The consumer has no disk attached any more; drop it. */
		g_topology_lock();
		g_raid3_kill_consumer(sc, bp->bio_from);
		g_topology_unlock();
		g_destroy_bio(bp);
		return;
	}

	/*
	 * Synchronization request.
	 */
	switch (bp->bio_cmd) {
	case BIO_READ:
	    {
		struct g_consumer *cp;
		u_char *dst, *src;
		off_t left;
		u_int atom;

		if (bp->bio_error != 0) {
			G_RAID3_LOGREQ(0, bp,
			    "Synchronization request failed (error=%d).",
			    bp->bio_error);
			g_destroy_bio(bp);
			return;
		}
		G_RAID3_LOGREQ(3, bp, "Synchronization request finished.");
		/* A sector is split into (ndisks - 1) equal chunks. */
		atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
		dst = src = bp->bio_data;
		if (disk->d_no == sc->sc_ndisks - 1) {
			u_int n;

			/* Parity component: XOR all data chunks per sector. */
			for (left = bp->bio_length; left > 0;
			    left -= sc->sc_sectorsize) {
				bcopy(src, dst, atom);
				src += atom;
				for (n = 1; n < sc->sc_ndisks - 1; n++) {
					g_raid3_xor(src, dst, dst, atom);
					src += atom;
				}
				dst += atom;
			}
		} else {
			/* Regular component: pick out this disk's chunk. */
			src += atom * disk->d_no;
			for (left = bp->bio_length; left > 0;
			    left -= sc->sc_sectorsize) {
				bcopy(src, dst, atom);
				src += sc->sc_sectorsize;
				dst += atom;
			}
		}
		/* Rescale offset/length from provider to component space. */
		bp->bio_offset /= sc->sc_ndisks - 1;
		bp->bio_length /= sc->sc_ndisks - 1;
		bp->bio_cmd = BIO_WRITE;
		bp->bio_cflags = 0;
		bp->bio_children = bp->bio_inbed = 0;
		cp = disk->d_consumer;
		KASSERT(cp->acr == 1 && cp->acw == 1 && cp->ace == 1,
		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
		    cp->acr, cp->acw, cp->ace));
		cp->index++;
		g_io_request(bp, cp);
		return;
	    }
	case BIO_WRITE:
	    {
		struct g_raid3_disk_sync *sync;

		if (bp->bio_error != 0) {
			G_RAID3_LOGREQ(0, bp,
			    "Synchronization request failed (error=%d).",
			    bp->bio_error);
			g_destroy_bio(bp);
			sc->sc_bump_id |= G_RAID3_BUMP_GENID_IMM;
			g_raid3_event_send(disk,
			    G_RAID3_DISK_STATE_DISCONNECTED,
			    G_RAID3_EVENT_DONTWAIT);
			return;
		}
		G_RAID3_LOGREQ(3, bp, "Synchronization request finished.");
		sync = &disk->d_sync;
		sync->ds_offset_done = bp->bio_offset + bp->bio_length;
		g_destroy_bio(bp);
		if (sync->ds_resync != -1)
			return;
		if (sync->ds_offset_done ==
		    sc->sc_mediasize / (sc->sc_ndisks - 1)) {
			/*
			 * Disk up-to-date, activate it.
			 */
			g_raid3_event_send(disk, G_RAID3_DISK_STATE_ACTIVE,
			    G_RAID3_EVENT_DONTWAIT);
			return;
		} else if (sync->ds_offset_done % (MAXPHYS * 100) == 0) {
			/*
			 * Update offset_done on every 100 blocks.
			 * XXX: This should be configurable.
			 */
			g_topology_lock();
			g_raid3_update_metadata(disk);
			g_topology_unlock();
		}
		return;
	    }
	default:
		KASSERT(1 == 0, ("Invalid command here: %u (device=%s)",
		    bp->bio_cmd, sc->sc_name));
		break;
	}
}
1497
/*
 * Split a regular request into per-component cloned bios and dispatch
 * them.  Reads go to the data components, with the parity component
 * standing in for broken components, for one component skipped per
 * round-robin pass, or (in VERIFY mode) read as well to check parity.
 * Writes and deletes go to every component.  Returns 0 on success or
 * ENOMEM when a clone could not be allocated.
 */
static int
g_raid3_register_request(struct bio *pbp)
{
	struct g_raid3_softc *sc;
	struct g_raid3_disk *disk;
	struct g_consumer *cp;
	struct bio *cbp;
	off_t offset, length;
	u_int n, ndisks;
	int round_robin, verify;

	ndisks = 0;
	sc = pbp->bio_to->geom->softc;
	/* A resync-tagged request without a sync disk cannot be served. */
	if ((pbp->bio_cflags & G_RAID3_BIO_CFLAG_REGSYNC) != 0 &&
	    sc->sc_syncdisk == NULL) {
		g_io_deliver(pbp, EIO);
		return (0);
	}
	g_raid3_init_bio(pbp);
	/* Per-component share of the request. */
	length = pbp->bio_length / (sc->sc_ndisks - 1);
	offset = pbp->bio_offset / (sc->sc_ndisks - 1);
	round_robin = verify = 0;
	switch (pbp->bio_cmd) {
	case BIO_READ:
		if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_VERIFY) != 0 &&
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
			/* Read all components (incl. parity) to verify. */
			pbp->bio_pflags |= G_RAID3_BIO_PFLAG_VERIFY;
			verify = 1;
			ndisks = sc->sc_ndisks;
		} else {
			verify = 0;
			ndisks = sc->sc_ndisks - 1;
		}
		if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0 &&
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
			round_robin = 1;
		} else {
			round_robin = 0;
		}
		KASSERT(!round_robin || !verify,
		    ("ROUND-ROBIN and VERIFY are mutually exclusive."));
		/* Default recovery disk is the parity component. */
		pbp->bio_driver2 = &sc->sc_disks[sc->sc_ndisks - 1];
		break;
	case BIO_WRITE:
	case BIO_DELETE:
	    {
		struct g_raid3_disk_sync *sync;

		if (sc->sc_idle)
			g_raid3_unidle(sc);

		ndisks = sc->sc_ndisks;

		/*
		 * If the write lands in the not-yet-synchronized area,
		 * ask the synchronizer to redo that region.
		 */
		if (sc->sc_syncdisk == NULL)
			break;
		sync = &sc->sc_syncdisk->d_sync;
		if (offset >= sync->ds_offset)
			break;
		if (offset + length <= sync->ds_offset_done)
			break;
		if (offset >= sync->ds_resync && sync->ds_resync != -1)
			break;
		sync->ds_resync = offset - (offset % MAXPHYS);
		break;
	    }
	}
	for (n = 0; n < ndisks; n++) {
		disk = &sc->sc_disks[n];
		cbp = g_raid3_clone_bio(sc, pbp);
		if (cbp == NULL) {
			/* Out of memory: undo all clones made so far. */
			while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL)
				g_raid3_destroy_bio(sc, cbp);
			return (ENOMEM);
		}
		cbp->bio_offset = offset;
		cbp->bio_length = length;
		cbp->bio_done = g_raid3_done;
		switch (pbp->bio_cmd) {
		case BIO_READ:
			if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) {
				/*
				 * Replace invalid component with the parity
				 * component.
				 */
				disk = &sc->sc_disks[sc->sc_ndisks - 1];
				cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
				pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED;
			} else if (round_robin &&
			    disk->d_no == sc->sc_round_robin) {
				/*
				 * In round-robin mode skip one data component
				 * and use parity component when reading.
				 */
				pbp->bio_driver2 = disk;
				disk = &sc->sc_disks[sc->sc_ndisks - 1];
				cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
				sc->sc_round_robin++;
				round_robin = 0;
			} else if (verify && disk->d_no == sc->sc_ndisks - 1) {
				cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
			}
			break;
		case BIO_WRITE:
		case BIO_DELETE:
			if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
			    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
				if (n == ndisks - 1) {
					/*
					 * Active parity component, mark it as such.
					 */
					cbp->bio_cflags |=
					    G_RAID3_BIO_CFLAG_PARITY;
				}
			} else {
				pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED;
				if (n == ndisks - 1) {
					/*
					 * Parity component is not connected,
					 * so destroy its request.
					 */
					pbp->bio_pflags |=
					    G_RAID3_BIO_PFLAG_NOPARITY;
					g_raid3_destroy_bio(sc, cbp);
					cbp = NULL;
				} else {
					cbp->bio_cflags |=
					    G_RAID3_BIO_CFLAG_NODISK;
					disk = NULL;
				}
			}
			break;
		}
		if (cbp != NULL)
			cbp->bio_caller2 = disk;
	}
	switch (pbp->bio_cmd) {
	case BIO_READ:
		if (round_robin) {
			/*
			 * If we are in round-robin mode and 'round_robin' is
			 * still 1, it means, that we skipped parity component
			 * for this read and must reset sc_round_robin field.
			 */
			sc->sc_round_robin = 0;
		}
		G_RAID3_FOREACH_BIO(pbp, cbp) {
			disk = cbp->bio_caller2;
			cp = disk->d_consumer;
			cbp->bio_to = cp->provider;
			G_RAID3_LOGREQ(3, cbp, "Sending request.");
			KASSERT(cp->acr == 1 && cp->acw == 1 && cp->ace == 1,
			    ("Consumer %s not opened (r%dw%de%d).",
			    cp->provider->name, cp->acr, cp->acw, cp->ace));
			cp->index++;
			g_io_request(cbp, cp);
		}
		break;
	case BIO_WRITE:
	case BIO_DELETE:
		/*
		 * Bump syncid on first write.
		 */
		if ((sc->sc_bump_id & G_RAID3_BUMP_SYNCID_OFW) != 0) {
			sc->sc_bump_id &= ~G_RAID3_BUMP_SYNCID;
			g_topology_lock();
			g_raid3_bump_syncid(sc);
			g_topology_unlock();
		}
		g_raid3_scatter(pbp);
		break;
	}
	return (0);
}
1671
1672static int
1673g_raid3_can_destroy(struct g_raid3_softc *sc)
1674{
1675	struct g_geom *gp;
1676	struct g_consumer *cp;
1677
1678	g_topology_assert();
1679	gp = sc->sc_geom;
1680	LIST_FOREACH(cp, &gp->consumer, consumer) {
1681		if (g_raid3_is_busy(sc, cp))
1682			return (0);
1683	}
1684	gp = sc->sc_sync.ds_geom;
1685	LIST_FOREACH(cp, &gp->consumer, consumer) {
1686		if (g_raid3_is_busy(sc, cp))
1687			return (0);
1688	}
1689	G_RAID3_DEBUG(2, "No I/O requests for %s, it can be destroyed.",
1690	    sc->sc_name);
1691	return (1);
1692}
1693
/*
 * Attempt to destroy the device from the worker thread.
 *
 * Returns 1 when the caller must stop using 'sc' (it was either freed
 * here, or ownership was handed back to the thread waiting on
 * &sc->sc_worker); returns 0 when the device is still busy and the
 * worker should keep running.
 */
static int
g_raid3_try_destroy(struct g_raid3_softc *sc)
{

	g_topology_lock();
	if (!g_raid3_can_destroy(sc)) {
		/* Outstanding I/O; try again later. */
		g_topology_unlock();
		return (0);
	}
	if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_WAIT) != 0) {
		g_topology_unlock();
		G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__,
		    &sc->sc_worker);
		/*
		 * Another thread is sleeping on &sc->sc_worker waiting for
		 * us to finish; wake it and let it perform the actual
		 * destruction.
		 */
		wakeup(&sc->sc_worker);
		sc->sc_worker = NULL;
	} else {
		/* Tear everything down ourselves and free the softc. */
		g_raid3_destroy_device(sc);
		g_topology_unlock();
		free(sc->sc_disks, M_RAID3);
		free(sc, M_RAID3);
	}
	return (1);
}
1717
1718/*
1719 * Worker thread.
1720 */
1721static void
1722g_raid3_worker(void *arg)
1723{
1724	struct g_raid3_softc *sc;
1725	struct g_raid3_disk *disk;
1726	struct g_raid3_disk_sync *sync;
1727	struct g_raid3_event *ep;
1728	struct bio *bp;
1729	u_int nreqs;
1730
1731	sc = arg;
1732	curthread->td_base_pri = PRIBIO;
1733
1734	nreqs = 0;
1735	for (;;) {
1736		G_RAID3_DEBUG(5, "%s: Let's see...", __func__);
1737		/*
1738		 * First take a look at events.
1739		 * This is important to handle events before any I/O requests.
1740		 */
1741		ep = g_raid3_event_get(sc);
1742		if (ep != NULL && g_topology_try_lock()) {
1743			g_raid3_event_remove(sc, ep);
1744			if ((ep->e_flags & G_RAID3_EVENT_DEVICE) != 0) {
1745				/* Update only device status. */
1746				G_RAID3_DEBUG(3,
1747				    "Running event for device %s.",
1748				    sc->sc_name);
1749				ep->e_error = 0;
1750				g_raid3_update_device(sc, 1);
1751			} else {
1752				/* Update disk status. */
1753				G_RAID3_DEBUG(3, "Running event for disk %s.",
1754				     g_raid3_get_diskname(ep->e_disk));
1755				ep->e_error = g_raid3_update_disk(ep->e_disk,
1756				    ep->e_state);
1757				if (ep->e_error == 0)
1758					g_raid3_update_device(sc, 0);
1759			}
1760			g_topology_unlock();
1761			if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0) {
1762				KASSERT(ep->e_error == 0,
1763				    ("Error cannot be handled."));
1764				g_raid3_event_free(ep);
1765			} else {
1766				ep->e_flags |= G_RAID3_EVENT_DONE;
1767				G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__,
1768				    ep);
1769				mtx_lock(&sc->sc_events_mtx);
1770				wakeup(ep);
1771				mtx_unlock(&sc->sc_events_mtx);
1772			}
1773			if ((sc->sc_flags &
1774			    G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
1775				if (g_raid3_try_destroy(sc))
1776					kthread_exit(0);
1777			}
1778			G_RAID3_DEBUG(5, "%s: I'm here 1.", __func__);
1779			continue;
1780		}
1781		/*
1782		 * Now I/O requests.
1783		 */
1784		/* Get first request from the queue. */
1785		mtx_lock(&sc->sc_queue_mtx);
1786		bp = bioq_first(&sc->sc_queue);
1787		if (bp == NULL) {
1788			if (ep != NULL) {
1789				/*
1790				 * No I/O requests and topology lock was
1791				 * already held? Try again.
1792				 */
1793				mtx_unlock(&sc->sc_queue_mtx);
1794				tsleep(ep, PRIBIO, "r3:top1", hz / 5);
1795				continue;
1796			}
1797			if ((sc->sc_flags &
1798			    G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
1799				mtx_unlock(&sc->sc_queue_mtx);
1800				if (g_raid3_try_destroy(sc))
1801					kthread_exit(0);
1802				mtx_lock(&sc->sc_queue_mtx);
1803			}
1804		}
1805		if (sc->sc_syncdisk != NULL &&
1806		    (bp == NULL || nreqs > g_raid3_reqs_per_sync)) {
1807			mtx_unlock(&sc->sc_queue_mtx);
1808			/*
1809			 * It is time for synchronization...
1810			 */
1811			nreqs = 0;
1812			disk = sc->sc_syncdisk;
1813			sync = &disk->d_sync;
1814			if (sync->ds_offset <
1815			    sc->sc_mediasize / (sc->sc_ndisks - 1) &&
1816			    sync->ds_offset == sync->ds_offset_done) {
1817				if (sync->ds_resync != -1) {
1818					sync->ds_offset = sync->ds_resync;
1819					sync->ds_offset_done = sync->ds_resync;
1820					sync->ds_resync = -1;
1821				}
1822				g_raid3_sync_one(sc);
1823			}
1824			G_RAID3_DEBUG(5, "%s: I'm here 2.", __func__);
1825			goto sleep;
1826		}
1827		if (bp == NULL) {
1828			if (g_raid3_check_idle(sc)) {
1829				u_int idletime;
1830
1831				idletime = g_raid3_idletime;
1832				if (idletime == 0)
1833					idletime = 1;
1834				idletime *= hz;
1835				if (msleep(sc, &sc->sc_queue_mtx, PRIBIO | PDROP,
1836				    "r3:w1", idletime) == EWOULDBLOCK) {
1837					G_RAID3_DEBUG(5, "%s: I'm here 3.",
1838					    __func__);
1839					/*
1840					 * No I/O requests in 'idletime'
1841					 * seconds, so mark components as clean.
1842					 */
1843					g_raid3_idle(sc);
1844				}
1845				G_RAID3_DEBUG(5, "%s: I'm here 4.", __func__);
1846			} else {
1847				MSLEEP(sc, &sc->sc_queue_mtx, PRIBIO | PDROP,
1848				    "r3:w2", 0);
1849				G_RAID3_DEBUG(5, "%s: I'm here 5.", __func__);
1850			}
1851			continue;
1852		}
1853		nreqs++;
1854		bioq_remove(&sc->sc_queue, bp);
1855		mtx_unlock(&sc->sc_queue_mtx);
1856
1857		if ((bp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR) != 0) {
1858			g_raid3_regular_request(bp);
1859		} else if ((bp->bio_cflags & G_RAID3_BIO_CFLAG_SYNC) != 0) {
1860			u_int timeout, sps;
1861
1862			g_raid3_sync_request(bp);
1863sleep:
1864			sps = atomic_load_acq_int(&g_raid3_syncs_per_sec);
1865			if (sps == 0) {
1866				G_RAID3_DEBUG(5, "%s: I'm here 6.", __func__);
1867				continue;
1868			}
1869			if (ep != NULL) {
1870				/*
1871				 * We have some pending events, don't sleep now.
1872				 */
1873				G_RAID3_DEBUG(5, "%s: I'm here 7.", __func__);
1874				tsleep(ep, PRIBIO, "r3:top2", hz / 5);
1875				continue;
1876			}
1877			mtx_lock(&sc->sc_queue_mtx);
1878			if (bioq_first(&sc->sc_queue) != NULL) {
1879				mtx_unlock(&sc->sc_queue_mtx);
1880				G_RAID3_DEBUG(5, "%s: I'm here 8.", __func__);
1881				continue;
1882			}
1883			timeout = hz / sps;
1884			if (timeout == 0)
1885				timeout = 1;
1886			MSLEEP(sc, &sc->sc_queue_mtx, PRIBIO | PDROP, "r3:w2",
1887			    timeout);
1888		} else {
1889			if (g_raid3_register_request(bp) != 0) {
1890				mtx_lock(&sc->sc_queue_mtx);
1891				bioq_insert_tail(&sc->sc_queue, bp);
1892				MSLEEP(&sc->sc_queue, &sc->sc_queue_mtx,
1893				    PRIBIO | PDROP, "r3:lowmem", hz / 10);
1894			}
1895		}
1896		G_RAID3_DEBUG(5, "%s: I'm here 9.", __func__);
1897	}
1898}
1899
1900/*
1901 * Open disk's consumer if needed.
1902 */
1903static void
1904g_raid3_update_access(struct g_raid3_disk *disk)
1905{
1906	struct g_provider *pp;
1907
1908	g_topology_assert();
1909
1910	pp = disk->d_softc->sc_provider;
1911	if (pp == NULL)
1912		return;
1913	if (pp->acw > 0) {
1914		if ((disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) == 0) {
1915			G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.",
1916			    g_raid3_get_diskname(disk), disk->d_softc->sc_name);
1917			disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
1918		}
1919	} else if (pp->acw == 0) {
1920		if ((disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) != 0) {
1921			G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.",
1922			    g_raid3_get_diskname(disk), disk->d_softc->sc_name);
1923			disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
1924		}
1925	}
1926}
1927
/*
 * Start synchronization of the first component found in SYNCHRONIZING
 * state.  Creates a dedicated consumer on the sync geom, attaches and
 * opens it for reading, allocates the transfer buffer and records the
 * disk in sc->sc_syncdisk so the worker thread starts issuing sync I/O.
 * The device must be DEGRADED and no synchronization may be in progress.
 */
static void
g_raid3_sync_start(struct g_raid3_softc *sc)
{
	struct g_raid3_disk *disk;
	int error;
	u_int n;

	g_topology_assert();

	KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED,
	    ("Device not in DEGRADED state (%s, %u).", sc->sc_name,
	    sc->sc_state));
	KASSERT(sc->sc_syncdisk == NULL, ("Syncdisk is not NULL (%s, %u).",
	    sc->sc_name, sc->sc_state));
	/* Find the first component that needs rebuilding. */
	disk = NULL;
	for (n = 0; n < sc->sc_ndisks; n++) {
		if (sc->sc_disks[n].d_state != G_RAID3_DISK_STATE_SYNCHRONIZING)
			continue;
		disk = &sc->sc_disks[n];
		break;
	}
	if (disk == NULL)
		return;

	G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s.", sc->sc_name,
	    g_raid3_get_diskname(disk));
	disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
	KASSERT(disk->d_sync.ds_consumer == NULL,
	    ("Sync consumer already exists (device=%s, disk=%s).",
	    sc->sc_name, g_raid3_get_diskname(disk)));
	/*
	 * The sync consumer reads from our own provider (the assembled
	 * array), so valid data can be reconstructed onto this component.
	 */
	disk->d_sync.ds_consumer = g_new_consumer(sc->sc_sync.ds_geom);
	disk->d_sync.ds_consumer->private = disk;
	disk->d_sync.ds_consumer->index = 0;
	error = g_attach(disk->d_sync.ds_consumer, disk->d_softc->sc_provider);
	KASSERT(error == 0, ("Cannot attach to %s (error=%d).",
	    disk->d_softc->sc_name, error));
	error = g_access(disk->d_sync.ds_consumer, 1, 0, 0);
	KASSERT(error == 0, ("Cannot open %s (error=%d).",
	    disk->d_softc->sc_name, error));
	/* Transfer buffer; M_WAITOK, so this cannot fail. */
	disk->d_sync.ds_data = malloc(MAXPHYS, M_RAID3, M_WAITOK);
	sc->sc_syncdisk = disk;
}
1970
1971/*
1972 * Stop synchronization process.
1973 * type: 0 - synchronization finished
1974 *       1 - synchronization stopped
1975 */
static void
g_raid3_sync_stop(struct g_raid3_softc *sc, int type)
{
	struct g_raid3_disk *disk;

	g_topology_assert();
	KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED,
	    ("Device not in DEGRADED state (%s, %u).", sc->sc_name,
	    sc->sc_state));
	disk = sc->sc_syncdisk;
	sc->sc_syncdisk = NULL;
	KASSERT(disk != NULL, ("No disk was synchronized (%s).", sc->sc_name));
	KASSERT(disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
	    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
	    g_raid3_disk_state2str(disk->d_state)));
	/* Nothing to tear down if synchronization never actually started. */
	if (disk->d_sync.ds_consumer == NULL)
		return;

	if (type == 0) {
		G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s finished.",
		    disk->d_softc->sc_name, g_raid3_get_diskname(disk));
	} else /* if (type == 1) */ {
		G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s stopped.",
		    disk->d_softc->sc_name, g_raid3_get_diskname(disk));
	}
	/* Release the sync consumer and its transfer buffer. */
	g_raid3_kill_consumer(disk->d_softc, disk->d_sync.ds_consumer);
	free(disk->d_sync.ds_data, M_RAID3);
	disk->d_sync.ds_consumer = NULL;
	disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
}
2006
2007static void
2008g_raid3_launch_provider(struct g_raid3_softc *sc)
2009{
2010	struct g_provider *pp;
2011
2012	g_topology_assert();
2013
2014	pp = g_new_providerf(sc->sc_geom, "raid3/%s", sc->sc_name);
2015	pp->mediasize = sc->sc_mediasize;
2016	pp->sectorsize = sc->sc_sectorsize;
2017	sc->sc_provider = pp;
2018	g_error_provider(pp, 0);
2019	G_RAID3_DEBUG(0, "Device %s: provider %s launched.", sc->sc_name,
2020	    pp->name);
2021	if (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED)
2022		g_raid3_sync_start(sc);
2023}
2024
/*
 * Withdraw the device's provider: fail all queued I/O with ENXIO,
 * orphan the provider and stop any synchronization in progress.
 */
static void
g_raid3_destroy_provider(struct g_raid3_softc *sc)
{
	struct bio *bp;

	g_topology_assert();
	KASSERT(sc->sc_provider != NULL, ("NULL provider (device=%s).",
	    sc->sc_name));

	/* Make new requests fail immediately... */
	g_error_provider(sc->sc_provider, ENXIO);
	/* ...and drain everything already queued. */
	mtx_lock(&sc->sc_queue_mtx);
	while ((bp = bioq_first(&sc->sc_queue)) != NULL) {
		bioq_remove(&sc->sc_queue, bp);
		g_io_deliver(bp, ENXIO);
	}
	mtx_unlock(&sc->sc_queue_mtx);
	G_RAID3_DEBUG(0, "Device %s: provider %s destroyed.", sc->sc_name,
	    sc->sc_provider->name);
	sc->sc_provider->flags |= G_PF_WITHER;
	g_orphan_provider(sc->sc_provider, ENXIO);
	sc->sc_provider = NULL;
	/* Synchronization reads from our provider, so it must stop too. */
	if (sc->sc_syncdisk != NULL)
		g_raid3_sync_stop(sc, 1);
}
2049
2050static void
2051g_raid3_go(void *arg)
2052{
2053	struct g_raid3_softc *sc;
2054
2055	sc = arg;
2056	G_RAID3_DEBUG(0, "Force device %s start due to timeout.", sc->sc_name);
2057	g_raid3_event_send(sc, 0,
2058	    G_RAID3_EVENT_DONTWAIT | G_RAID3_EVENT_DEVICE);
2059}
2060
2061static u_int
2062g_raid3_determine_state(struct g_raid3_disk *disk)
2063{
2064	struct g_raid3_softc *sc;
2065	u_int state;
2066
2067	sc = disk->d_softc;
2068	if (sc->sc_syncid == disk->d_sync.ds_syncid) {
2069		if ((disk->d_flags &
2070		    G_RAID3_DISK_FLAG_SYNCHRONIZING) == 0) {
2071			/* Disk does not need synchronization. */
2072			state = G_RAID3_DISK_STATE_ACTIVE;
2073		} else {
2074			if ((sc->sc_flags &
2075			     G_RAID3_DEVICE_FLAG_NOAUTOSYNC) == 0  ||
2076			    (disk->d_flags &
2077			     G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) {
2078				/*
2079				 * We can start synchronization from
2080				 * the stored offset.
2081				 */
2082				state = G_RAID3_DISK_STATE_SYNCHRONIZING;
2083			} else {
2084				state = G_RAID3_DISK_STATE_STALE;
2085			}
2086		}
2087	} else if (disk->d_sync.ds_syncid < sc->sc_syncid) {
2088		/*
2089		 * Reset all synchronization data for this disk,
2090		 * because if it even was synchronized, it was
2091		 * synchronized to disks with different syncid.
2092		 */
2093		disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING;
2094		disk->d_sync.ds_offset = 0;
2095		disk->d_sync.ds_offset_done = 0;
2096		disk->d_sync.ds_syncid = sc->sc_syncid;
2097		if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) == 0 ||
2098		    (disk->d_flags & G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) {
2099			state = G_RAID3_DISK_STATE_SYNCHRONIZING;
2100		} else {
2101			state = G_RAID3_DISK_STATE_STALE;
2102		}
2103	} else /* if (sc->sc_syncid < disk->d_sync.ds_syncid) */ {
2104		/*
2105		 * Not good, NOT GOOD!
2106		 * It means that device was started on stale disks
2107		 * and more fresh disk just arrive.
2108		 * If there were writes, device is fucked up, sorry.
2109		 * I think the best choice here is don't touch
2110		 * this disk and inform the user laudly.
2111		 */
2112		G_RAID3_DEBUG(0, "Device %s was started before the freshest "
2113		    "disk (%s) arrives!! It will not be connected to the "
2114		    "running device.", sc->sc_name,
2115		    g_raid3_get_diskname(disk));
2116		g_raid3_destroy_disk(disk);
2117		state = G_RAID3_DISK_STATE_NONE;
2118		/* Return immediately, because disk was destroyed. */
2119		return (state);
2120	}
2121	G_RAID3_DEBUG(3, "State for %s disk: %s.",
2122	    g_raid3_get_diskname(disk), g_raid3_disk_state2str(state));
2123	return (state);
2124}
2125
2126/*
2127 * Update device state.
2128 */
2129static void
2130g_raid3_update_device(struct g_raid3_softc *sc, boolean_t force)
2131{
2132	struct g_raid3_disk *disk;
2133	u_int state;
2134
2135	g_topology_assert();
2136
2137	switch (sc->sc_state) {
2138	case G_RAID3_DEVICE_STATE_STARTING:
2139	    {
2140		u_int n, ndirty, ndisks, genid, syncid;
2141
2142		KASSERT(sc->sc_provider == NULL,
2143		    ("Non-NULL provider in STARTING state (%s).", sc->sc_name));
2144		/*
2145		 * Are we ready? We are, if all disks are connected or
2146		 * one disk is missing and 'force' is true.
2147		 */
2148		if (g_raid3_ndisks(sc, -1) + force == sc->sc_ndisks) {
2149			if (!force)
2150				callout_drain(&sc->sc_callout);
2151		} else {
2152			if (force) {
2153				/*
2154				 * Timeout expired, so destroy device.
2155				 */
2156				sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
2157			}
2158			return;
2159		}
2160
2161		/*
2162		 * Find the biggest genid.
2163		 */
2164		genid = 0;
2165		for (n = 0; n < sc->sc_ndisks; n++) {
2166			disk = &sc->sc_disks[n];
2167			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
2168				continue;
2169			if (disk->d_genid > genid)
2170				genid = disk->d_genid;
2171		}
2172		sc->sc_genid = genid;
2173		/*
2174		 * Remove all disks without the biggest genid.
2175		 */
2176		for (n = 0; n < sc->sc_ndisks; n++) {
2177			disk = &sc->sc_disks[n];
2178			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
2179				continue;
2180			if (disk->d_genid < genid) {
2181				G_RAID3_DEBUG(0,
2182				    "Component %s (device %s) broken, skipping.",
2183				    g_raid3_get_diskname(disk), sc->sc_name);
2184				g_raid3_destroy_disk(disk);
2185			}
2186		}
2187
2188		/*
2189		 * There must be at least 'sc->sc_ndisks - 1' components
2190		 * with the same syncid and without SYNCHRONIZING flag.
2191		 */
2192
2193		/*
2194		 * Find the biggest syncid, number of valid components and
2195		 * number of dirty components.
2196		 */
2197		ndirty = ndisks = syncid = 0;
2198		for (n = 0; n < sc->sc_ndisks; n++) {
2199			disk = &sc->sc_disks[n];
2200			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
2201				continue;
2202			if ((disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) != 0)
2203				ndirty++;
2204			if (disk->d_sync.ds_syncid > syncid) {
2205				syncid = disk->d_sync.ds_syncid;
2206				ndisks = 0;
2207			} else if (disk->d_sync.ds_syncid < syncid) {
2208				continue;
2209			}
2210			if ((disk->d_flags &
2211			    G_RAID3_DISK_FLAG_SYNCHRONIZING) != 0) {
2212				continue;
2213			}
2214			ndisks++;
2215		}
2216		/*
2217		 * Do we have enough valid components?
2218		 */
2219		if (ndisks + 1 < sc->sc_ndisks) {
2220			G_RAID3_DEBUG(0,
2221			    "Device %s is broken, too few valid components.",
2222			    sc->sc_name);
2223			sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
2224			return;
2225		}
2226		/*
2227		 * If there is one DIRTY component and all disks are present,
2228		 * mark it for synchronization. If there is more than one DIRTY
2229		 * component, mark parity component for synchronization.
2230		 */
2231		if (ndisks == sc->sc_ndisks && ndirty == 1) {
2232			for (n = 0; n < sc->sc_ndisks; n++) {
2233				disk = &sc->sc_disks[n];
2234				if ((disk->d_flags &
2235				    G_RAID3_DISK_FLAG_DIRTY) == 0) {
2236					continue;
2237				}
2238				disk->d_flags |=
2239				    G_RAID3_DISK_FLAG_SYNCHRONIZING;
2240			}
2241		} else if (ndisks == sc->sc_ndisks && ndirty > 1) {
2242			disk = &sc->sc_disks[sc->sc_ndisks - 1];
2243			disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING;
2244		}
2245
2246		sc->sc_syncid = syncid;
2247		if (force) {
2248			/* Remember to bump syncid on first write. */
2249			sc->sc_bump_id |= G_RAID3_BUMP_SYNCID_OFW;
2250		}
2251		if (ndisks == sc->sc_ndisks)
2252			state = G_RAID3_DEVICE_STATE_COMPLETE;
2253		else /* if (ndisks == sc->sc_ndisks - 1) */
2254			state = G_RAID3_DEVICE_STATE_DEGRADED;
2255		G_RAID3_DEBUG(1, "Device %s state changed from %s to %s.",
2256		    sc->sc_name, g_raid3_device_state2str(sc->sc_state),
2257		    g_raid3_device_state2str(state));
2258		sc->sc_state = state;
2259		for (n = 0; n < sc->sc_ndisks; n++) {
2260			disk = &sc->sc_disks[n];
2261			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
2262				continue;
2263			state = g_raid3_determine_state(disk);
2264			g_raid3_event_send(disk, state, G_RAID3_EVENT_DONTWAIT);
2265			if (state == G_RAID3_DISK_STATE_STALE)
2266				sc->sc_bump_id |= G_RAID3_BUMP_SYNCID_OFW;
2267		}
2268		break;
2269	    }
2270	case G_RAID3_DEVICE_STATE_DEGRADED:
2271		/*
2272		 * Bump syncid and/or genid here, if we need to do it
2273		 * immediately.
2274		 */
2275		if ((sc->sc_bump_id & G_RAID3_BUMP_SYNCID_IMM) != 0) {
2276			sc->sc_bump_id &= ~G_RAID3_BUMP_SYNCID;
2277			g_raid3_bump_syncid(sc);
2278		}
2279		if ((sc->sc_bump_id & G_RAID3_BUMP_GENID_IMM) != 0) {
2280			sc->sc_bump_id &= ~G_RAID3_BUMP_GENID;
2281			g_raid3_bump_genid(sc);
2282		}
2283
2284		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NEW) > 0)
2285			return;
2286		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) <
2287		    sc->sc_ndisks - 1) {
2288			if (sc->sc_provider != NULL)
2289				g_raid3_destroy_provider(sc);
2290			sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
2291			return;
2292		}
2293		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) ==
2294		    sc->sc_ndisks) {
2295			state = G_RAID3_DEVICE_STATE_COMPLETE;
2296			G_RAID3_DEBUG(1,
2297			    "Device %s state changed from %s to %s.",
2298			    sc->sc_name, g_raid3_device_state2str(sc->sc_state),
2299			    g_raid3_device_state2str(state));
2300			sc->sc_state = state;
2301		}
2302		if (sc->sc_provider == NULL)
2303			g_raid3_launch_provider(sc);
2304		break;
2305	case G_RAID3_DEVICE_STATE_COMPLETE:
2306		/*
2307		 * Bump syncid and/or genid here, if we need to do it
2308		 * immediately.
2309		 */
2310		if ((sc->sc_bump_id & G_RAID3_BUMP_SYNCID_IMM) != 0) {
2311			sc->sc_bump_id &= ~G_RAID3_BUMP_SYNCID;
2312			g_raid3_bump_syncid(sc);
2313		}
2314		if ((sc->sc_bump_id & G_RAID3_BUMP_GENID_IMM) != 0) {
2315			sc->sc_bump_id &= ~G_RAID3_BUMP_GENID;
2316			g_raid3_bump_genid(sc);
2317		}
2318
2319		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NEW) > 0)
2320			return;
2321		KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) >=
2322		    sc->sc_ndisks - 1,
2323		    ("Too few ACTIVE components in COMPLETE state (device %s).",
2324		    sc->sc_name));
2325		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) ==
2326		    sc->sc_ndisks - 1) {
2327			state = G_RAID3_DEVICE_STATE_DEGRADED;
2328			G_RAID3_DEBUG(1,
2329			    "Device %s state changed from %s to %s.",
2330			    sc->sc_name, g_raid3_device_state2str(sc->sc_state),
2331			    g_raid3_device_state2str(state));
2332			sc->sc_state = state;
2333		}
2334		if (sc->sc_provider == NULL)
2335			g_raid3_launch_provider(sc);
2336		break;
2337	default:
2338		KASSERT(1 == 0, ("Wrong device state (%s, %s).", sc->sc_name,
2339		    g_raid3_device_state2str(sc->sc_state)));
2340		break;
2341	}
2342}
2343
2344/*
2345 * Update disk state and device state if needed.
2346 */
2347#define	DISK_STATE_CHANGED()	G_RAID3_DEBUG(1,			\
2348	"Disk %s state changed from %s to %s (device %s).",		\
2349	g_raid3_get_diskname(disk),					\
2350	g_raid3_disk_state2str(disk->d_state),				\
2351	g_raid3_disk_state2str(state), sc->sc_name)
2352static int
2353g_raid3_update_disk(struct g_raid3_disk *disk, u_int state)
2354{
2355	struct g_raid3_softc *sc;
2356
2357	g_topology_assert();
2358
2359	sc = disk->d_softc;
2360again:
2361	G_RAID3_DEBUG(3, "Changing disk %s state from %s to %s.",
2362	    g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state),
2363	    g_raid3_disk_state2str(state));
2364	switch (state) {
2365	case G_RAID3_DISK_STATE_NEW:
2366		/*
2367		 * Possible scenarios:
2368		 * 1. New disk arrive.
2369		 */
2370		/* Previous state should be NONE. */
2371		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NONE,
2372		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
2373		    g_raid3_disk_state2str(disk->d_state)));
2374		DISK_STATE_CHANGED();
2375
2376		disk->d_state = state;
2377		G_RAID3_DEBUG(0, "Device %s: provider %s detected.",
2378		    sc->sc_name, g_raid3_get_diskname(disk));
2379		if (sc->sc_state == G_RAID3_DEVICE_STATE_STARTING)
2380			break;
2381		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
2382		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
2383		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
2384		    g_raid3_device_state2str(sc->sc_state),
2385		    g_raid3_get_diskname(disk),
2386		    g_raid3_disk_state2str(disk->d_state)));
2387		state = g_raid3_determine_state(disk);
2388		if (state != G_RAID3_DISK_STATE_NONE)
2389			goto again;
2390		break;
2391	case G_RAID3_DISK_STATE_ACTIVE:
2392		/*
2393		 * Possible scenarios:
2394		 * 1. New disk does not need synchronization.
2395		 * 2. Synchronization process finished successfully.
2396		 */
2397		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
2398		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
2399		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
2400		    g_raid3_device_state2str(sc->sc_state),
2401		    g_raid3_get_diskname(disk),
2402		    g_raid3_disk_state2str(disk->d_state)));
2403		/* Previous state should be NEW or SYNCHRONIZING. */
2404		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW ||
2405		    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
2406		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
2407		    g_raid3_disk_state2str(disk->d_state)));
2408		DISK_STATE_CHANGED();
2409
2410		if (disk->d_state == G_RAID3_DISK_STATE_NEW)
2411			disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
2412		else if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
2413			disk->d_flags &= ~G_RAID3_DISK_FLAG_SYNCHRONIZING;
2414			disk->d_flags &= ~G_RAID3_DISK_FLAG_FORCE_SYNC;
2415			g_raid3_sync_stop(sc, 0);
2416		}
2417		disk->d_state = state;
2418		disk->d_sync.ds_offset = 0;
2419		disk->d_sync.ds_offset_done = 0;
2420		g_raid3_update_access(disk);
2421		g_raid3_update_metadata(disk);
2422		G_RAID3_DEBUG(0, "Device %s: provider %s activated.",
2423		    sc->sc_name, g_raid3_get_diskname(disk));
2424		break;
2425	case G_RAID3_DISK_STATE_STALE:
2426		/*
2427		 * Possible scenarios:
2428		 * 1. Stale disk was connected.
2429		 */
2430		/* Previous state should be NEW. */
2431		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
2432		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
2433		    g_raid3_disk_state2str(disk->d_state)));
2434		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
2435		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
2436		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
2437		    g_raid3_device_state2str(sc->sc_state),
2438		    g_raid3_get_diskname(disk),
2439		    g_raid3_disk_state2str(disk->d_state)));
2440		/*
2441		 * STALE state is only possible if device is marked
2442		 * NOAUTOSYNC.
2443		 */
2444		KASSERT((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) != 0,
2445		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
2446		    g_raid3_device_state2str(sc->sc_state),
2447		    g_raid3_get_diskname(disk),
2448		    g_raid3_disk_state2str(disk->d_state)));
2449		DISK_STATE_CHANGED();
2450
2451		disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
2452		disk->d_state = state;
2453		g_raid3_update_metadata(disk);
2454		G_RAID3_DEBUG(0, "Device %s: provider %s is stale.",
2455		    sc->sc_name, g_raid3_get_diskname(disk));
2456		break;
2457	case G_RAID3_DISK_STATE_SYNCHRONIZING:
2458		/*
2459		 * Possible scenarios:
2460		 * 1. Disk which needs synchronization was connected.
2461		 */
2462		/* Previous state should be NEW. */
2463		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
2464		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
2465		    g_raid3_disk_state2str(disk->d_state)));
2466		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
2467		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
2468		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
2469		    g_raid3_device_state2str(sc->sc_state),
2470		    g_raid3_get_diskname(disk),
2471		    g_raid3_disk_state2str(disk->d_state)));
2472		DISK_STATE_CHANGED();
2473
2474		if (disk->d_state == G_RAID3_DISK_STATE_NEW)
2475			disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
2476		disk->d_state = state;
2477		if (sc->sc_provider != NULL) {
2478			g_raid3_sync_start(sc);
2479			g_raid3_update_metadata(disk);
2480		}
2481		break;
2482	case G_RAID3_DISK_STATE_DISCONNECTED:
2483		/*
2484		 * Possible scenarios:
2485		 * 1. Device wasn't running yet, but disk disappear.
2486		 * 2. Disk was active and disapppear.
2487		 * 3. Disk disappear during synchronization process.
2488		 */
2489		if (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
2490		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
2491			/*
2492			 * Previous state should be ACTIVE, STALE or
2493			 * SYNCHRONIZING.
2494			 */
2495			KASSERT(disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
2496			    disk->d_state == G_RAID3_DISK_STATE_STALE ||
2497			    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
2498			    ("Wrong disk state (%s, %s).",
2499			    g_raid3_get_diskname(disk),
2500			    g_raid3_disk_state2str(disk->d_state)));
2501		} else if (sc->sc_state == G_RAID3_DEVICE_STATE_STARTING) {
2502			/* Previous state should be NEW. */
2503			KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
2504			    ("Wrong disk state (%s, %s).",
2505			    g_raid3_get_diskname(disk),
2506			    g_raid3_disk_state2str(disk->d_state)));
2507			/*
2508			 * Reset bumping syncid if disk disappeared in STARTING
2509			 * state.
2510			 */
2511			if ((sc->sc_bump_id & G_RAID3_BUMP_SYNCID_OFW) != 0)
2512				sc->sc_bump_id &= ~G_RAID3_BUMP_SYNCID;
2513#ifdef	INVARIANTS
2514		} else {
2515			KASSERT(1 == 0, ("Wrong device state (%s, %s, %s, %s).",
2516			    sc->sc_name,
2517			    g_raid3_device_state2str(sc->sc_state),
2518			    g_raid3_get_diskname(disk),
2519			    g_raid3_disk_state2str(disk->d_state)));
2520#endif
2521		}
2522		DISK_STATE_CHANGED();
2523		G_RAID3_DEBUG(0, "Device %s: provider %s disconnected.",
2524		    sc->sc_name, g_raid3_get_diskname(disk));
2525
2526		g_raid3_destroy_disk(disk);
2527		break;
2528	default:
2529		KASSERT(1 == 0, ("Unknown state (%u).", state));
2530		break;
2531	}
2532	return (0);
2533}
2534#undef	DISK_STATE_CHANGED
2535
/*
 * Read and decode RAID3 on-disk metadata from the last sector of the
 * provider behind 'cp' into 'md'.  Returns 0 on success or an errno
 * value (EINVAL for bad magic/too-new version, an I/O or MD5 error
 * otherwise).  Called with the topology lock held; drops and reacquires
 * it around the actual read.
 */
static int
g_raid3_read_metadata(struct g_consumer *cp, struct g_raid3_metadata *md)
{
	struct g_provider *pp;
	u_char *buf;
	int error;

	g_topology_assert();

	/* Open the consumer for reading for the duration of the read. */
	error = g_access(cp, 1, 0, 0);
	if (error != 0)
		return (error);
	pp = cp->provider;
	g_topology_unlock();
	/* Metadata are stored on last sector. */
	buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize,
	    &error);
	g_topology_lock();
	g_access(cp, -1, 0, 0);
	if (error != 0) {
		G_RAID3_DEBUG(1, "Cannot read metadata from %s (error=%d).",
		    cp->provider->name, error);
		if (buf != NULL)
			g_free(buf);
		return (error);
	}

	/* Decode metadata. */
	error = raid3_metadata_decode(buf, md);
	g_free(buf);
	/*
	 * Validation order is deliberate: reject non-RAID3 providers
	 * (bad magic) and too-new metadata before reporting a decode
	 * (MD5 hash) failure.
	 */
	if (strcmp(md->md_magic, G_RAID3_MAGIC) != 0)
		return (EINVAL);
	if (md->md_version > G_RAID3_VERSION) {
		G_RAID3_DEBUG(0,
		    "Kernel module is too old to handle metadata from %s.",
		    cp->provider->name);
		return (EINVAL);
	}
	if (error != 0) {
		G_RAID3_DEBUG(1, "MD5 metadata hash mismatch for provider %s.",
		    cp->provider->name);
		return (error);
	}

	return (0);
}
2582
2583static int
2584g_raid3_check_metadata(struct g_raid3_softc *sc, struct g_provider *pp,
2585    struct g_raid3_metadata *md)
2586{
2587
2588	if (md->md_no >= sc->sc_ndisks) {
2589		G_RAID3_DEBUG(1, "Invalid disk %s number (no=%u), skipping.",
2590		    pp->name, md->md_no);
2591		return (EINVAL);
2592	}
2593	if (sc->sc_disks[md->md_no].d_state != G_RAID3_DISK_STATE_NODISK) {
2594		G_RAID3_DEBUG(1, "Disk %s (no=%u) already exists, skipping.",
2595		    pp->name, md->md_no);
2596		return (EEXIST);
2597	}
2598	if (md->md_all != sc->sc_ndisks) {
2599		G_RAID3_DEBUG(1,
2600		    "Invalid '%s' field on disk %s (device %s), skipping.",
2601		    "md_all", pp->name, sc->sc_name);
2602		return (EINVAL);
2603	}
2604	if (md->md_mediasize != sc->sc_mediasize) {
2605		G_RAID3_DEBUG(1,
2606		    "Invalid '%s' field on disk %s (device %s), skipping.",
2607		    "md_mediasize", pp->name, sc->sc_name);
2608		return (EINVAL);
2609	}
2610	if ((md->md_mediasize % (sc->sc_ndisks - 1)) != 0) {
2611		G_RAID3_DEBUG(1,
2612		    "Invalid '%s' field on disk %s (device %s), skipping.",
2613		    "md_mediasize", pp->name, sc->sc_name);
2614		return (EINVAL);
2615	}
2616	if ((sc->sc_mediasize / (sc->sc_ndisks - 1)) > pp->mediasize) {
2617		G_RAID3_DEBUG(1,
2618		    "Invalid size of disk %s (device %s), skipping.", pp->name,
2619		    sc->sc_name);
2620		return (EINVAL);
2621	}
2622	if ((md->md_sectorsize / pp->sectorsize) < sc->sc_ndisks - 1) {
2623		G_RAID3_DEBUG(1,
2624		    "Invalid '%s' field on disk %s (device %s), skipping.",
2625		    "md_sectorsize", pp->name, sc->sc_name);
2626		return (EINVAL);
2627	}
2628	if (md->md_sectorsize != sc->sc_sectorsize) {
2629		G_RAID3_DEBUG(1,
2630		    "Invalid '%s' field on disk %s (device %s), skipping.",
2631		    "md_sectorsize", pp->name, sc->sc_name);
2632		return (EINVAL);
2633	}
2634	if ((sc->sc_sectorsize % pp->sectorsize) != 0) {
2635		G_RAID3_DEBUG(1,
2636		    "Invalid sector size of disk %s (device %s), skipping.",
2637		    pp->name, sc->sc_name);
2638		return (EINVAL);
2639	}
2640	if ((md->md_mflags & ~G_RAID3_DEVICE_FLAG_MASK) != 0) {
2641		G_RAID3_DEBUG(1,
2642		    "Invalid device flags on disk %s (device %s), skipping.",
2643		    pp->name, sc->sc_name);
2644		return (EINVAL);
2645	}
2646	if ((md->md_mflags & G_RAID3_DEVICE_FLAG_VERIFY) != 0 &&
2647	    (md->md_mflags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0) {
2648		/*
2649		 * VERIFY and ROUND-ROBIN options are mutally exclusive.
2650		 */
2651		G_RAID3_DEBUG(1, "Both VERIFY and ROUND-ROBIN flags exist on "
2652		    "disk %s (device %s), skipping.", pp->name, sc->sc_name);
2653		return (EINVAL);
2654	}
2655	if ((md->md_dflags & ~G_RAID3_DISK_FLAG_MASK) != 0) {
2656		G_RAID3_DEBUG(1,
2657		    "Invalid disk flags on disk %s (device %s), skipping.",
2658		    pp->name, sc->sc_name);
2659		return (EINVAL);
2660	}
2661	return (0);
2662}
2663
/*
 * Attach provider 'pp', whose metadata 'md' has already been read, as a
 * component of device 'sc'.  Returns 0 on success or an errno value.
 * Called with the topology lock held.
 */
static int
g_raid3_add_disk(struct g_raid3_softc *sc, struct g_provider *pp,
    struct g_raid3_metadata *md)
{
	struct g_raid3_disk *disk;
	int error;

	g_topology_assert();
	G_RAID3_DEBUG(2, "Adding disk %s.", pp->name);

	error = g_raid3_check_metadata(sc, pp, md);
	if (error != 0)
		return (error);
	/*
	 * Once the device has left the STARTING state, a component carrying
	 * an older generation ID is considered broken and is not attached.
	 */
	if (sc->sc_state != G_RAID3_DEVICE_STATE_STARTING &&
	    md->md_genid < sc->sc_genid) {
		G_RAID3_DEBUG(0, "Component %s (device %s) broken, skipping.",
		    pp->name, sc->sc_name);
		return (EINVAL);
	}
	disk = g_raid3_init_disk(sc, pp, md, &error);
	if (disk == NULL)
		return (error);
	/* Announce the new component and wait until the event is processed. */
	error = g_raid3_event_send(disk, G_RAID3_DISK_STATE_NEW,
	    G_RAID3_EVENT_WAIT);
	if (error != 0)
		return (error);
	/* Rewrite metadata stored in an older on-disk format. */
	if (md->md_version < G_RAID3_VERSION) {
		G_RAID3_DEBUG(0, "Upgrading metadata on %s (v%d->v%d).",
		    pp->name, md->md_version, G_RAID3_VERSION);
		g_raid3_update_metadata(disk);
	}
	return (0);
}
2697
2698static int
2699g_raid3_access(struct g_provider *pp, int acr, int acw, int ace)
2700{
2701	struct g_raid3_softc *sc;
2702	struct g_raid3_disk *disk;
2703	int dcr, dcw, dce;
2704	u_int n;
2705
2706	g_topology_assert();
2707	G_RAID3_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name, acr,
2708	    acw, ace);
2709
2710	dcr = pp->acr + acr;
2711	dcw = pp->acw + acw;
2712	dce = pp->ace + ace;
2713
2714	sc = pp->geom->softc;
2715	if (sc == NULL ||
2716	    g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) < sc->sc_ndisks - 1 ||
2717	    (sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
2718		if (acr <= 0 && acw <= 0 && ace <= 0)
2719			return (0);
2720		else
2721			return (ENXIO);
2722	}
2723	for (n = 0; n < sc->sc_ndisks; n++) {
2724		disk = &sc->sc_disks[n];
2725		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
2726			continue;
2727		/*
2728		 * Mark disk as dirty on open and unmark on close.
2729		 */
2730		if (pp->acw == 0 && dcw > 0) {
2731			G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.",
2732			    g_raid3_get_diskname(disk), sc->sc_name);
2733			disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
2734			g_raid3_update_metadata(disk);
2735		} else if (pp->acw > 0 && dcw == 0) {
2736			G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.",
2737			    g_raid3_get_diskname(disk), sc->sc_name);
2738			disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
2739			g_raid3_update_metadata(disk);
2740		}
2741	}
2742	return (0);
2743}
2744
2745static struct g_geom *
2746g_raid3_create(struct g_class *mp, const struct g_raid3_metadata *md)
2747{
2748	struct g_raid3_softc *sc;
2749	struct g_geom *gp;
2750	int error, timeout;
2751	u_int n;
2752
2753	g_topology_assert();
2754	G_RAID3_DEBUG(1, "Creating device %s (id=%u).", md->md_name, md->md_id);
2755
2756	/* One disk is minimum. */
2757	if (md->md_all < 1)
2758		return (NULL);
2759	/*
2760	 * Action geom.
2761	 */
2762	gp = g_new_geomf(mp, "%s", md->md_name);
2763	sc = malloc(sizeof(*sc), M_RAID3, M_WAITOK | M_ZERO);
2764	sc->sc_disks = malloc(sizeof(struct g_raid3_disk) * md->md_all, M_RAID3,
2765	    M_WAITOK | M_ZERO);
2766	gp->start = g_raid3_start;
2767	gp->spoiled = g_raid3_spoiled;
2768	gp->orphan = g_raid3_orphan;
2769	gp->access = g_raid3_access;
2770	gp->dumpconf = g_raid3_dumpconf;
2771
2772	sc->sc_id = md->md_id;
2773	sc->sc_mediasize = md->md_mediasize;
2774	sc->sc_sectorsize = md->md_sectorsize;
2775	sc->sc_ndisks = md->md_all;
2776	sc->sc_round_robin = 0;
2777	sc->sc_flags = md->md_mflags;
2778	sc->sc_bump_id = 0;
2779	sc->sc_idle = 0;
2780	for (n = 0; n < sc->sc_ndisks; n++) {
2781		sc->sc_disks[n].d_softc = sc;
2782		sc->sc_disks[n].d_no = n;
2783		sc->sc_disks[n].d_state = G_RAID3_DISK_STATE_NODISK;
2784	}
2785	bioq_init(&sc->sc_queue);
2786	mtx_init(&sc->sc_queue_mtx, "graid3:queue", NULL, MTX_DEF);
2787	TAILQ_INIT(&sc->sc_events);
2788	mtx_init(&sc->sc_events_mtx, "graid3:events", NULL, MTX_DEF);
2789	callout_init(&sc->sc_callout, CALLOUT_MPSAFE);
2790	sc->sc_state = G_RAID3_DEVICE_STATE_STARTING;
2791	gp->softc = sc;
2792	sc->sc_geom = gp;
2793	sc->sc_provider = NULL;
2794	/*
2795	 * Synchronization geom.
2796	 */
2797	gp = g_new_geomf(mp, "%s.sync", md->md_name);
2798	gp->softc = sc;
2799	gp->orphan = g_raid3_orphan;
2800	sc->sc_sync.ds_geom = gp;
2801	sc->sc_zone_64k = uma_zcreate("gr3:64k", 65536, NULL, NULL, NULL, NULL,
2802	    UMA_ALIGN_PTR, 0);
2803	uma_zone_set_max(sc->sc_zone_64k, g_raid3_n64k);
2804	sc->sc_zone_16k = uma_zcreate("gr3:16k", 16384, NULL, NULL, NULL, NULL,
2805	    UMA_ALIGN_PTR, 0);
2806	uma_zone_set_max(sc->sc_zone_64k, g_raid3_n16k);
2807	sc->sc_zone_4k = uma_zcreate("gr3:4k", 4096, NULL, NULL, NULL, NULL,
2808	    UMA_ALIGN_PTR, 0);
2809	uma_zone_set_max(sc->sc_zone_4k, g_raid3_n4k);
2810	error = kthread_create(g_raid3_worker, sc, &sc->sc_worker, 0, 0,
2811	    "g_raid3 %s", md->md_name);
2812	if (error != 0) {
2813		G_RAID3_DEBUG(1, "Cannot create kernel thread for %s.",
2814		    sc->sc_name);
2815		uma_zdestroy(sc->sc_zone_64k);
2816		uma_zdestroy(sc->sc_zone_16k);
2817		uma_zdestroy(sc->sc_zone_4k);
2818		g_destroy_geom(sc->sc_sync.ds_geom);
2819		mtx_destroy(&sc->sc_events_mtx);
2820		mtx_destroy(&sc->sc_queue_mtx);
2821		g_destroy_geom(sc->sc_geom);
2822		free(sc->sc_disks, M_RAID3);
2823		free(sc, M_RAID3);
2824		return (NULL);
2825	}
2826
2827	G_RAID3_DEBUG(0, "Device %s created (id=%u).", sc->sc_name, sc->sc_id);
2828
2829	/*
2830	 * Run timeout.
2831	 */
2832	timeout = atomic_load_acq_int(&g_raid3_timeout);
2833	callout_reset(&sc->sc_callout, timeout * hz, g_raid3_go, sc);
2834	return (sc->sc_geom);
2835}
2836
/*
 * Destroy device 'sc'.  Without 'force' an open provider makes this fail
 * with EBUSY; with 'force' the device is torn down regardless.  Called
 * with the topology lock held; the lock is dropped while waiting for the
 * worker thread to finish.  Returns 0 or an errno value.
 */
int
g_raid3_destroy(struct g_raid3_softc *sc, boolean_t force)
{
	struct g_provider *pp;

	g_topology_assert();

	if (sc == NULL)
		return (ENXIO);
	pp = sc->sc_provider;
	if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
		if (force) {
			G_RAID3_DEBUG(1, "Device %s is still open, so it "
			    "can't be definitely removed.", pp->name);
		} else {
			G_RAID3_DEBUG(1,
			    "Device %s is still open (r%dw%de%d).", pp->name,
			    pp->acr, pp->acw, pp->ace);
			return (EBUSY);
		}
	}

	/* Ask the worker thread to destroy the device and wait for us. */
	sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
	sc->sc_flags |= G_RAID3_DEVICE_FLAG_WAIT;
	g_topology_unlock();
	G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc);
	mtx_lock(&sc->sc_queue_mtx);
	wakeup(sc);
	wakeup(&sc->sc_queue);
	mtx_unlock(&sc->sc_queue_mtx);
	G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, &sc->sc_worker);
	/*
	 * Poll until sc_worker is cleared (presumably by the exiting worker
	 * thread — not visible here); the timeout guards against lost wakeups.
	 */
	while (sc->sc_worker != NULL)
		tsleep(&sc->sc_worker, PRIBIO, "r3:destroy", hz / 5);
	G_RAID3_DEBUG(4, "%s: Woken up %p.", __func__, &sc->sc_worker);
	g_topology_lock();
	g_raid3_destroy_device(sc);
	free(sc->sc_disks, M_RAID3);
	free(sc, M_RAID3);
	return (0);
}
2877
/*
 * Orphan method installed on the throw-away geom used during tasting.
 * It is never expected to run; reaching it indicates a bug, so panic.
 */
static void
g_raid3_taste_orphan(struct g_consumer *cp)
{

	KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
	    cp->provider->name));
}
2885
/*
 * Taste method: probe provider 'pp' for RAID3 metadata and, when valid,
 * attach it as a component of an existing device or create a new one.
 * Returns the device geom, or NULL if the provider is not ours or the
 * attach failed.
 */
static struct g_geom *
g_raid3_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
{
	struct g_raid3_metadata md;
	struct g_raid3_softc *sc;
	struct g_consumer *cp;
	struct g_geom *gp;
	int error;

	g_topology_assert();
	g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
	G_RAID3_DEBUG(2, "Tasting %s.", pp->name);

	/* Use a temporary geom/consumer pair just to read the metadata. */
	gp = g_new_geomf(mp, "raid3:taste");
	/* This orphan function should be never called. */
	gp->orphan = g_raid3_taste_orphan;
	cp = g_new_consumer(gp);
	g_attach(cp, pp);
	error = g_raid3_read_metadata(cp, &md);
	g_detach(cp);
	g_destroy_consumer(cp);
	g_destroy_geom(gp);
	if (error != 0)
		return (NULL);
	gp = NULL;

	/* A hardcoded provider name in the metadata must match exactly. */
	if (md.md_provider[0] != '\0' && strcmp(md.md_provider, pp->name) != 0)
		return (NULL);
	if (g_raid3_debug >= 2)
		raid3_metadata_dump(&md);

	/*
	 * Let's check if device already exists.
	 */
	sc = NULL;
	LIST_FOREACH(gp, &mp->geom, geom) {
		sc = gp->softc;
		if (sc == NULL)
			continue;
		/* Skip synchronization geoms. */
		if (sc->sc_sync.ds_geom == gp)
			continue;
		if (strcmp(md.md_name, sc->sc_name) != 0)
			continue;
		/* Same name but different id: a conflicting configuration. */
		if (md.md_id != sc->sc_id) {
			G_RAID3_DEBUG(0, "Device %s already configured.",
			    sc->sc_name);
			return (NULL);
		}
		break;
	}
	/* gp == NULL here means the loop found no matching device. */
	if (gp == NULL) {
		gp = g_raid3_create(mp, &md);
		if (gp == NULL) {
			G_RAID3_DEBUG(0, "Cannot create device %s.",
			    md.md_name);
			return (NULL);
		}
		sc = gp->softc;
	}
	G_RAID3_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name);
	error = g_raid3_add_disk(sc, pp, &md);
	if (error != 0) {
		G_RAID3_DEBUG(0, "Cannot add disk %s to %s (error=%d).",
		    pp->name, gp->name, error);
		/* Tear the device down again if it has no components at all. */
		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NODISK) ==
		    sc->sc_ndisks) {
			g_raid3_destroy(sc, 1);
		}
		return (NULL);
	}
	return (gp);
}
2958
/*
 * GEOM class destroy_geom method: destroy the device without forcing,
 * so an open device is refused with EBUSY.
 */
static int
g_raid3_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused,
    struct g_geom *gp)
{

	return (g_raid3_destroy(gp->softc, 0));
}
2966
/*
 * dumpconf method: emit XML configuration for "gconf"/sysctl consumers.
 * Per-consumer output describes a component; per-geom output describes
 * the whole device; nothing extra is emitted for providers.
 */
static void
g_raid3_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
    struct g_consumer *cp, struct g_provider *pp)
{
	struct g_raid3_softc *sc;

	g_topology_assert();

	sc = gp->softc;
	if (sc == NULL)
		return;
	/* Skip synchronization geom. */
	if (gp == sc->sc_sync.ds_geom)
		return;
	if (pp != NULL) {
		/* Nothing here. */
	} else if (cp != NULL) {
		struct g_raid3_disk *disk;

		disk = cp->private;
		if (disk == NULL)
			return;
		/* The highest-numbered component is the parity disk. */
		sbuf_printf(sb, "%s<Type>", indent);
		if (disk->d_no == sc->sc_ndisks - 1)
			sbuf_printf(sb, "PARITY");
		else
			sbuf_printf(sb, "DATA");
		sbuf_printf(sb, "</Type>\n");
		sbuf_printf(sb, "%s<Number>%u</Number>\n", indent,
		    (u_int)disk->d_no);
		if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
			sbuf_printf(sb, "%s<Synchronized>", indent);
			if (disk->d_sync.ds_offset_done == 0)
				sbuf_printf(sb, "0%%");
			else {
				/* Progress relative to the per-component size. */
				sbuf_printf(sb, "%u%%",
				    (u_int)((disk->d_sync.ds_offset_done * 100) /
				    (sc->sc_mediasize / (sc->sc_ndisks - 1))));
			}
			sbuf_printf(sb, "</Synchronized>\n");
		}
		sbuf_printf(sb, "%s<SyncID>%u</SyncID>\n", indent,
		    disk->d_sync.ds_syncid);
		sbuf_printf(sb, "%s<GenID>%u</GenID>\n", indent, disk->d_genid);
		sbuf_printf(sb, "%s<Flags>", indent);
		if (disk->d_flags == 0)
			sbuf_printf(sb, "NONE");
		else {
			int first = 1;

/* Emit flag 'name' when set, comma-separating all but the first. */
#define	ADD_FLAG(flag, name)	do {					\
	if ((disk->d_flags & (flag)) != 0) {				\
		if (!first)						\
			sbuf_printf(sb, ", ");				\
		else							\
			first = 0;					\
		sbuf_printf(sb, name);					\
	}								\
} while (0)
			ADD_FLAG(G_RAID3_DISK_FLAG_DIRTY, "DIRTY");
			ADD_FLAG(G_RAID3_DISK_FLAG_HARDCODED, "HARDCODED");
			ADD_FLAG(G_RAID3_DISK_FLAG_SYNCHRONIZING,
			    "SYNCHRONIZING");
			ADD_FLAG(G_RAID3_DISK_FLAG_FORCE_SYNC, "FORCE_SYNC");
#undef	ADD_FLAG
		}
		sbuf_printf(sb, "</Flags>\n");
		sbuf_printf(sb, "%s<State>%s</State>\n", indent,
		    g_raid3_disk_state2str(disk->d_state));
	} else {
		/* Device-wide (geom-level) information. */
		sbuf_printf(sb, "%s<ID>%u</ID>\n", indent, (u_int)sc->sc_id);
		sbuf_printf(sb, "%s<SyncID>%u</SyncID>\n", indent, sc->sc_syncid);
		sbuf_printf(sb, "%s<GenID>%u</GenID>\n", indent, sc->sc_genid);
		sbuf_printf(sb, "%s<Flags>", indent);
		if (sc->sc_flags == 0)
			sbuf_printf(sb, "NONE");
		else {
			int first = 1;

/* Same comma-separated emission pattern, for device flags. */
#define	ADD_FLAG(flag, name)	do {					\
	if ((sc->sc_flags & (flag)) != 0) {				\
		if (!first)						\
			sbuf_printf(sb, ", ");				\
		else							\
			first = 0;					\
		sbuf_printf(sb, name);					\
	}								\
} while (0)
			ADD_FLAG(G_RAID3_DEVICE_FLAG_NOAUTOSYNC, "NOAUTOSYNC");
			ADD_FLAG(G_RAID3_DEVICE_FLAG_ROUND_ROBIN,
			    "ROUND-ROBIN");
			ADD_FLAG(G_RAID3_DEVICE_FLAG_VERIFY, "VERIFY");
#undef	ADD_FLAG
		}
		sbuf_printf(sb, "</Flags>\n");
		sbuf_printf(sb, "%s<Components>%u</Components>\n", indent,
		    sc->sc_ndisks);
		sbuf_printf(sb, "%s<State>%s</State>\n", indent,
		    g_raid3_device_state2str(sc->sc_state));
	}
}
3068
/*
 * shutdown_post_sync event handler: forcibly destroy every RAID3 device
 * of the class before the system goes down.
 */
static void
g_raid3_shutdown(void *arg, int howto)
{
	struct g_class *mp;
	struct g_geom *gp, *gp2;

	mp = arg;
	DROP_GIANT();
	g_topology_lock();
	/* SAFE variant: g_raid3_destroy() removes entries while we iterate. */
	LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) {
		if (gp->softc == NULL)
			continue;
		g_raid3_destroy(gp->softc, 1);
	}
	g_topology_unlock();
	PICKUP_GIANT();
#if 0
	tsleep(&gp, PRIBIO, "r3:shutdown", hz * 20);
#endif
}
3089
/*
 * Class init method: register the shutdown handler so devices are torn
 * down cleanly at system shutdown.  Failure is non-fatal, only logged.
 */
static void
g_raid3_init(struct g_class *mp)
{

	g_raid3_ehtag = EVENTHANDLER_REGISTER(shutdown_post_sync,
	    g_raid3_shutdown, mp, SHUTDOWN_PRI_FIRST);
	if (g_raid3_ehtag == NULL)
		G_RAID3_DEBUG(0, "Warning! Cannot register shutdown event.");
}
3099
3100static void
3101g_raid3_fini(struct g_class *mp)
3102{
3103
3104	if (g_raid3_ehtag == NULL)
3105		return;
3106	EVENTHANDLER_DEREGISTER(shutdown_post_sync, g_raid3_ehtag);
3107}
3108
/* Register the class with GEOM and generate the module glue. */
DECLARE_GEOM_CLASS(g_raid3_class, g_raid3);
3110