g_raid3.c revision 144144
1133808Spjd/*-
2141994Spjd * Copyright (c) 2004-2005 Pawel Jakub Dawidek <pjd@FreeBSD.org>
3133808Spjd * All rights reserved.
4133808Spjd *
5133808Spjd * Redistribution and use in source and binary forms, with or without
6133808Spjd * modification, are permitted provided that the following conditions
7133808Spjd * are met:
8133808Spjd * 1. Redistributions of source code must retain the above copyright
9133808Spjd *    notice, this list of conditions and the following disclaimer.
10133808Spjd * 2. Redistributions in binary form must reproduce the above copyright
11133808Spjd *    notice, this list of conditions and the following disclaimer in the
12133808Spjd *    documentation and/or other materials provided with the distribution.
13133808Spjd *
14133808Spjd * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
15133808Spjd * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16133808Spjd * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17133808Spjd * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
18133808Spjd * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19133808Spjd * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20133808Spjd * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21133808Spjd * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22133808Spjd * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23133808Spjd * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24133808Spjd * SUCH DAMAGE.
25133808Spjd */
26133808Spjd
27133808Spjd#include <sys/cdefs.h>
28133808Spjd__FBSDID("$FreeBSD: head/sys/geom/raid3/g_raid3.c 144144 2005-03-26 17:24:19Z pjd $");
29133808Spjd
30133808Spjd#include <sys/param.h>
31133808Spjd#include <sys/systm.h>
32133808Spjd#include <sys/kernel.h>
33133808Spjd#include <sys/module.h>
34133808Spjd#include <sys/limits.h>
35133808Spjd#include <sys/lock.h>
36133808Spjd#include <sys/mutex.h>
37133808Spjd#include <sys/bio.h>
38133808Spjd#include <sys/sysctl.h>
39133808Spjd#include <sys/malloc.h>
40137257Spjd#include <sys/eventhandler.h>
41133808Spjd#include <vm/uma.h>
42133808Spjd#include <geom/geom.h>
43133808Spjd#include <sys/proc.h>
44133808Spjd#include <sys/kthread.h>
45139451Sjhb#include <sys/sched.h>
46133808Spjd#include <geom/raid3/g_raid3.h>
47133808Spjd
48133808Spjd
/* Malloc type under which all GEOM_RAID3 allocations are accounted. */
static MALLOC_DEFINE(M_RAID3, "raid3 data", "GEOM_RAID3 Data");

SYSCTL_DECL(_kern_geom);
SYSCTL_NODE(_kern_geom, OID_AUTO, raid3, CTLFLAG_RW, 0, "GEOM_RAID3 stuff");
/* Debug verbosity for G_RAID3_DEBUG(); 0 disables debug output. */
u_int g_raid3_debug = 0;
TUNABLE_INT("kern.geom.raid3.debug", &g_raid3_debug);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, debug, CTLFLAG_RW, &g_raid3_debug, 0,
    "Debug level");
/* Seconds to wait for all components to appear before forcing a start. */
static u_int g_raid3_timeout = 4;
TUNABLE_INT("kern.geom.raid3.timeout", &g_raid3_timeout);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, timeout, CTLFLAG_RW, &g_raid3_timeout,
    0, "Time to wait on all raid3 components");
/* Seconds of inactivity after which components are marked clean. */
static u_int g_raid3_idletime = 5;
TUNABLE_INT("kern.geom.raid3.idletime", &g_raid3_idletime);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, idletime, CTLFLAG_RW,
    &g_raid3_idletime, 0, "Mark components as clean when idling");
/* Ratio of regular I/O to synchronization I/O while rebuilding. */
static u_int g_raid3_reqs_per_sync = 5;
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, reqs_per_sync, CTLFLAG_RW,
    &g_raid3_reqs_per_sync, 0,
    "Number of regular I/O requests per synchronization request");
static u_int g_raid3_syncs_per_sec = 1000;
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, syncs_per_sec, CTLFLAG_RW,
    &g_raid3_syncs_per_sec, 0,
    "Number of synchronizations requests per second");

/*
 * Limits for the per-device UMA zones used for I/O buffers
 * (64kB/16kB/4kB allocations).  Read-only once the module is loaded.
 */
static u_int g_raid3_n64k = 50;
TUNABLE_INT("kern.geom.raid3.n64k", &g_raid3_n64k);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n64k, CTLFLAG_RD, &g_raid3_n64k, 0,
    "Maximum number of 64kB allocations");
static u_int g_raid3_n16k = 200;
TUNABLE_INT("kern.geom.raid3.n16k", &g_raid3_n16k);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n16k, CTLFLAG_RD, &g_raid3_n16k, 0,
    "Maximum number of 16kB allocations");
static u_int g_raid3_n4k = 1200;
TUNABLE_INT("kern.geom.raid3.n4k", &g_raid3_n4k);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n4k, CTLFLAG_RD, &g_raid3_n4k, 0,
    "Maximum number of 4kB allocations");

/* Read-only statistics exported under kern.geom.raid3.stat. */
SYSCTL_NODE(_kern_geom_raid3, OID_AUTO, stat, CTLFLAG_RW, 0,
    "GEOM_RAID3 statistics");
static u_int g_raid3_parity_mismatch = 0;
SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, parity_mismatch, CTLFLAG_RD,
    &g_raid3_parity_mismatch, 0, "Number of failures in VERIFY mode");
static u_int g_raid3_64k_requested = 0;
SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 64k_requested, CTLFLAG_RD,
    &g_raid3_64k_requested, 0, "Number of requested 64kB allocations");
static u_int g_raid3_64k_failed = 0;
SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 64k_failed, CTLFLAG_RD,
    &g_raid3_64k_failed, 0, "Number of failed 64kB allocations");
static u_int g_raid3_16k_requested = 0;
SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 16k_requested, CTLFLAG_RD,
    &g_raid3_16k_requested, 0, "Number of requested 16kB allocations");
static u_int g_raid3_16k_failed = 0;
SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 16k_failed, CTLFLAG_RD,
    &g_raid3_16k_failed, 0, "Number of failed 16kB allocations");
static u_int g_raid3_4k_requested = 0;
SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 4k_requested, CTLFLAG_RD,
    &g_raid3_4k_requested, 0, "Number of requested 4kB allocations");
static u_int g_raid3_4k_failed = 0;
SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 4k_failed, CTLFLAG_RD,
    &g_raid3_4k_failed, 0, "Number of failed 4kB allocations");
110133808Spjd
/*
 * msleep(9) wrapper that logs the sleep and wake-up at debug level 4.
 * Multi-statement macro, hence the do/while(0) wrapper.
 */
#define	MSLEEP(ident, mtx, priority, wmesg, timeout)	do {		\
	G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, (ident));	\
	msleep((ident), (mtx), (priority), (wmesg), (timeout));		\
	G_RAID3_DEBUG(4, "%s: Woken up %p.", __func__, (ident));	\
} while (0)
116133808Spjd
/* Shutdown event handler tag; registered in init, deregistered in fini. */
static eventhandler_tag g_raid3_ehtag = NULL;

static int g_raid3_destroy_geom(struct gctl_req *req, struct g_class *mp,
    struct g_geom *gp);
static g_taste_t g_raid3_taste;
static void g_raid3_init(struct g_class *mp);
static void g_raid3_fini(struct g_class *mp);

/* GEOM class definition: method table for the RAID3 transformation. */
struct g_class g_raid3_class = {
	.name = G_RAID3_CLASS_NAME,
	.version = G_VERSION,
	.ctlreq = g_raid3_config,
	.taste = g_raid3_taste,
	.destroy_geom = g_raid3_destroy_geom,
	.init = g_raid3_init,
	.fini = g_raid3_fini
};


static void g_raid3_destroy_provider(struct g_raid3_softc *sc);
static int g_raid3_update_disk(struct g_raid3_disk *disk, u_int state);
static void g_raid3_update_device(struct g_raid3_softc *sc, boolean_t force);
static void g_raid3_dumpconf(struct sbuf *sb, const char *indent,
    struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp);
static void g_raid3_sync_stop(struct g_raid3_softc *sc, int type);
143133808Spjd
144133808Spjdstatic const char *
145133808Spjdg_raid3_disk_state2str(int state)
146133808Spjd{
147133808Spjd
148133808Spjd	switch (state) {
149133808Spjd	case G_RAID3_DISK_STATE_NODISK:
150133808Spjd		return ("NODISK");
151133808Spjd	case G_RAID3_DISK_STATE_NONE:
152133808Spjd		return ("NONE");
153133808Spjd	case G_RAID3_DISK_STATE_NEW:
154133808Spjd		return ("NEW");
155133808Spjd	case G_RAID3_DISK_STATE_ACTIVE:
156133808Spjd		return ("ACTIVE");
157133808Spjd	case G_RAID3_DISK_STATE_STALE:
158133808Spjd		return ("STALE");
159133808Spjd	case G_RAID3_DISK_STATE_SYNCHRONIZING:
160133808Spjd		return ("SYNCHRONIZING");
161133808Spjd	case G_RAID3_DISK_STATE_DISCONNECTED:
162133808Spjd		return ("DISCONNECTED");
163133808Spjd	default:
164133808Spjd		return ("INVALID");
165133808Spjd	}
166133808Spjd}
167133808Spjd
168133808Spjdstatic const char *
169133808Spjdg_raid3_device_state2str(int state)
170133808Spjd{
171133808Spjd
172133808Spjd	switch (state) {
173133808Spjd	case G_RAID3_DEVICE_STATE_STARTING:
174133808Spjd		return ("STARTING");
175133808Spjd	case G_RAID3_DEVICE_STATE_DEGRADED:
176133808Spjd		return ("DEGRADED");
177133808Spjd	case G_RAID3_DEVICE_STATE_COMPLETE:
178133808Spjd		return ("COMPLETE");
179133808Spjd	default:
180133808Spjd		return ("INVALID");
181133808Spjd	}
182133808Spjd}
183133808Spjd
184133808Spjdconst char *
185133808Spjdg_raid3_get_diskname(struct g_raid3_disk *disk)
186133808Spjd{
187133808Spjd
188133808Spjd	if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL)
189133808Spjd		return ("[unknown]");
190133808Spjd	return (disk->d_name);
191133808Spjd}
192133808Spjd
193133808Spjd#define	g_raid3_xor(src1, src2, dst, size)				\
194133808Spjd	_g_raid3_xor((uint64_t *)(src1), (uint64_t *)(src2),		\
195133808Spjd	    (uint64_t *)(dst), (size_t)size)
196133808Spjdstatic void
197133808Spjd_g_raid3_xor(uint64_t *src1, uint64_t *src2, uint64_t *dst, size_t size)
198133808Spjd{
199133808Spjd
200133808Spjd	KASSERT((size % 128) == 0, ("Invalid size: %zu.", size));
201133808Spjd	for (; size > 0; size -= 128) {
202133808Spjd		*dst++ = (*src1++) ^ (*src2++);
203133808Spjd		*dst++ = (*src1++) ^ (*src2++);
204133808Spjd		*dst++ = (*src1++) ^ (*src2++);
205133808Spjd		*dst++ = (*src1++) ^ (*src2++);
206133808Spjd		*dst++ = (*src1++) ^ (*src2++);
207133808Spjd		*dst++ = (*src1++) ^ (*src2++);
208133808Spjd		*dst++ = (*src1++) ^ (*src2++);
209133808Spjd		*dst++ = (*src1++) ^ (*src2++);
210133808Spjd		*dst++ = (*src1++) ^ (*src2++);
211133808Spjd		*dst++ = (*src1++) ^ (*src2++);
212133808Spjd		*dst++ = (*src1++) ^ (*src2++);
213133808Spjd		*dst++ = (*src1++) ^ (*src2++);
214133808Spjd		*dst++ = (*src1++) ^ (*src2++);
215133808Spjd		*dst++ = (*src1++) ^ (*src2++);
216133808Spjd		*dst++ = (*src1++) ^ (*src2++);
217133808Spjd		*dst++ = (*src1++) ^ (*src2++);
218133808Spjd	}
219133808Spjd}
220133808Spjd
/*
 * Return 1 if the bio's data buffer contains only zero bytes, 0 otherwise.
 * Compares in 128-byte (16 x uint64_t) chunks against a static zero block.
 * NOTE(review): assumes bio_length is a multiple of 128 bytes; a shorter
 * tail would make the final bcmp() read past the buffer — confirm callers
 * only pass sector-aligned lengths.
 */
static int
g_raid3_is_zero(struct bio *bp)
{
	static const uint64_t zeros[] = {
	    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
	};
	u_char *addr;
	ssize_t size;

	size = bp->bio_length;
	addr = (u_char *)bp->bio_data;
	for (; size > 0; size -= sizeof(zeros), addr += sizeof(zeros)) {
		if (bcmp(addr, zeros, sizeof(zeros)) != 0)
			return (0);
	}
	return (1);
}
238134168Spjd
/*
 * --- Events handling functions ---
 * Events in geom_raid3 are used to maintain disks and device status
 * from one thread to simplify locking.
 */

/* Release an event allocated by g_raid3_event_send(). */
static void
g_raid3_event_free(struct g_raid3_event *ep)
{

	free(ep, M_RAID3);
}
250133808Spjd
/*
 * Queue a state-change event for the worker thread.
 *
 * 'arg' is the softc when G_RAID3_EVENT_DEVICE is set in 'flags',
 * otherwise it is the disk the event applies to.  With
 * G_RAID3_EVENT_DONTWAIT the function returns immediately (0);
 * otherwise it drops the topology lock and sleeps until the worker
 * marks the event G_RAID3_EVENT_DONE, then returns the worker's
 * e_error (e.g. ECANCELED if the event was cancelled).
 */
int
g_raid3_event_send(void *arg, int state, int flags)
{
	struct g_raid3_softc *sc;
	struct g_raid3_disk *disk;
	struct g_raid3_event *ep;
	int error;

	ep = malloc(sizeof(*ep), M_RAID3, M_WAITOK);
	G_RAID3_DEBUG(4, "%s: Sending event %p.", __func__, ep);
	if ((flags & G_RAID3_EVENT_DEVICE) != 0) {
		disk = NULL;
		sc = arg;
	} else {
		disk = arg;
		sc = disk->d_softc;
	}
	ep->e_disk = disk;
	ep->e_state = state;
	ep->e_flags = flags;
	ep->e_error = 0;
	mtx_lock(&sc->sc_events_mtx);
	TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next);
	mtx_unlock(&sc->sc_events_mtx);
	G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc);
	/* Wake the worker thread; it sleeps on either channel. */
	mtx_lock(&sc->sc_queue_mtx);
	wakeup(sc);
	wakeup(&sc->sc_queue);
	mtx_unlock(&sc->sc_queue_mtx);
	if ((flags & G_RAID3_EVENT_DONTWAIT) != 0)
		return (0);
	g_topology_assert();
	G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, ep);
	g_topology_unlock();
	/*
	 * Re-check DONE in a loop with a 5 s timeout to cope with a missed
	 * wakeup; PDROP releases sc_events_mtx while sleeping.
	 */
	while ((ep->e_flags & G_RAID3_EVENT_DONE) == 0) {
		mtx_lock(&sc->sc_events_mtx);
		MSLEEP(ep, &sc->sc_events_mtx, PRIBIO | PDROP, "r3:event",
		    hz * 5);
	}
	/* Don't even try to use 'sc' here, because it could be already dead. */
	g_topology_lock();
	error = ep->e_error;
	g_raid3_event_free(ep);
	return (error);
}
296133808Spjd
/*
 * Peek at the head of the event queue without removing it; returns NULL
 * when the queue is empty.  The caller removes the event separately via
 * g_raid3_event_remove().
 */
static struct g_raid3_event *
g_raid3_event_get(struct g_raid3_softc *sc)
{
	struct g_raid3_event *ep;

	mtx_lock(&sc->sc_events_mtx);
	ep = TAILQ_FIRST(&sc->sc_events);
	mtx_unlock(&sc->sc_events_mtx);
	return (ep);
}
307133808Spjd
/* Unlink an event (previously returned by g_raid3_event_get()) from the
 * queue.  The caller is responsible for freeing or completing it. */
static void
g_raid3_event_remove(struct g_raid3_softc *sc, struct g_raid3_event *ep)
{

	mtx_lock(&sc->sc_events_mtx);
	TAILQ_REMOVE(&sc->sc_events, ep, e_next);
	mtx_unlock(&sc->sc_events_mtx);
}
316139144Spjd
/*
 * Cancel all pending events that target the given disk.  Fire-and-forget
 * (DONTWAIT) events are freed outright; waited-on events get e_error set
 * to ECANCELED and their sender is woken so it can free them itself.
 * Device-wide events are left untouched.
 */
static void
g_raid3_event_cancel(struct g_raid3_disk *disk)
{
	struct g_raid3_softc *sc;
	struct g_raid3_event *ep, *tmpep;

	g_topology_assert();

	sc = disk->d_softc;
	mtx_lock(&sc->sc_events_mtx);
	TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) {
		if ((ep->e_flags & G_RAID3_EVENT_DEVICE) != 0)
			continue;
		if (ep->e_disk != disk)
			continue;
		TAILQ_REMOVE(&sc->sc_events, ep, e_next);
		if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0)
			g_raid3_event_free(ep);
		else {
			ep->e_error = ECANCELED;
			wakeup(ep);
		}
	}
	mtx_unlock(&sc->sc_events_mtx);
}
342133808Spjd
343133808Spjd/*
344133808Spjd * Return the number of disks in the given state.
345133808Spjd * If state is equal to -1, count all connected disks.
346133808Spjd */
347133808Spjdu_int
348133808Spjdg_raid3_ndisks(struct g_raid3_softc *sc, int state)
349133808Spjd{
350133808Spjd	struct g_raid3_disk *disk;
351133839Sobrien	u_int n, ndisks;
352133808Spjd
353133839Sobrien	for (n = ndisks = 0; n < sc->sc_ndisks; n++) {
354133808Spjd		disk = &sc->sc_disks[n];
355133808Spjd		if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
356133808Spjd			continue;
357133808Spjd		if (state == -1 || disk->d_state == state)
358133808Spjd			ndisks++;
359133808Spjd	}
360133808Spjd	return (ndisks);
361133808Spjd}
362133808Spjd
363133808Spjdstatic u_int
364133808Spjdg_raid3_nrequests(struct g_raid3_softc *sc, struct g_consumer *cp)
365133808Spjd{
366133808Spjd	struct bio *bp;
367133808Spjd	u_int nreqs = 0;
368133808Spjd
369133808Spjd	mtx_lock(&sc->sc_queue_mtx);
370133808Spjd	TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) {
371133808Spjd		if (bp->bio_from == cp)
372133808Spjd			nreqs++;
373133808Spjd	}
374133808Spjd	mtx_unlock(&sc->sc_queue_mtx);
375133808Spjd	return (nreqs);
376133808Spjd}
377133808Spjd
378133808Spjdstatic int
379133808Spjdg_raid3_is_busy(struct g_raid3_softc *sc, struct g_consumer *cp)
380133808Spjd{
381133808Spjd
382137256Spjd	if (cp->index > 0) {
383133808Spjd		G_RAID3_DEBUG(2,
384133808Spjd		    "I/O requests for %s exist, can't destroy it now.",
385133808Spjd		    cp->provider->name);
386133808Spjd		return (1);
387133808Spjd	}
388133808Spjd	if (g_raid3_nrequests(sc, cp) > 0) {
389133808Spjd		G_RAID3_DEBUG(2,
390133808Spjd		    "I/O requests for %s in queue, can't destroy it now.",
391133808Spjd		    cp->provider->name);
392133808Spjd		return (1);
393133808Spjd	}
394133808Spjd	return (0);
395133808Spjd}
396133808Spjd
/*
 * g_post_event(9) callback: detach and destroy a consumer after the
 * retaste event has been delivered (see g_raid3_kill_consumer()).
 */
static void
g_raid3_destroy_consumer(void *arg, int flags __unused)
{
	struct g_consumer *cp;

	cp = arg;
	G_RAID3_DEBUG(1, "Consumer %s destroyed.", cp->provider->name);
	g_detach(cp);
	g_destroy_consumer(cp);
}
407139144Spjd
/*
 * Close and destroy a consumer, unless it is still busy — in that case
 * only clear its private pointer and leave the destruction for later.
 * When closing a write-open consumer triggers a retaste of the provider,
 * destruction is deferred via g_post_event() so it happens after the
 * retaste event has been delivered.
 */
static void
g_raid3_kill_consumer(struct g_raid3_softc *sc, struct g_consumer *cp)
{
	struct g_provider *pp;
	int retaste_wait;

	g_topology_assert();

	cp->private = NULL;
	if (g_raid3_is_busy(sc, cp))
		return;
	G_RAID3_DEBUG(2, "Consumer %s destroyed.", cp->provider->name);
	pp = cp->provider;
	retaste_wait = 0;
	/* Dropping the last write count retastes pp unless it is withering. */
	if (cp->acw == 1) {
		if ((pp->geom->flags & G_GEOM_WITHER) == 0)
			retaste_wait = 1;
	}
	G_RAID3_DEBUG(2, "Access %s r%dw%de%d = %d", pp->name, -cp->acr,
	    -cp->acw, -cp->ace, 0);
	if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0)
		g_access(cp, -cp->acr, -cp->acw, -cp->ace);
	if (retaste_wait) {
		/*
		 * After retaste event was send (inside g_access()), we can send
		 * event to detach and destroy consumer.
		 * A class, which has consumer to the given provider connected
		 * will not receive retaste event for the provider.
		 * This is the way how I ignore retaste events when I close
		 * consumers opened for write: I detach and destroy consumer
		 * after retaste event is sent.
		 */
		g_post_event(g_raid3_destroy_consumer, cp, M_WAITOK, NULL);
		return;
	}
	G_RAID3_DEBUG(1, "Consumer %s destroyed.", pp->name);
	g_detach(cp);
	g_destroy_consumer(cp);
}
447133808Spjd
/*
 * Create a consumer on the device geom, attach it to the given provider
 * and open it r1w1e1 on behalf of the disk.  Returns 0 on success or an
 * errno; on failure no consumer is left behind.
 */
static int
g_raid3_connect_disk(struct g_raid3_disk *disk, struct g_provider *pp)
{
	struct g_consumer *cp;
	int error;

	g_topology_assert();
	KASSERT(disk->d_consumer == NULL,
	    ("Disk already connected (device %s).", disk->d_softc->sc_name));

	cp = g_new_consumer(disk->d_softc->sc_geom);
	error = g_attach(cp, pp);
	if (error != 0) {
		g_destroy_consumer(cp);
		return (error);
	}
	error = g_access(cp, 1, 1, 1);
	if (error != 0) {
		/* Undo the attach before destroying the consumer. */
		g_detach(cp);
		g_destroy_consumer(cp);
		G_RAID3_DEBUG(0, "Cannot open consumer %s (error=%d).",
		    pp->name, error);
		return (error);
	}
	disk->d_consumer = cp;
	disk->d_consumer->private = disk;
	/* index counts in-flight requests; see g_raid3_is_busy(). */
	disk->d_consumer->index = 0;
	G_RAID3_DEBUG(2, "Disk %s connected.", g_raid3_get_diskname(disk));
	return (0);
}
478133808Spjd
479133808Spjdstatic void
480133808Spjdg_raid3_disconnect_consumer(struct g_raid3_softc *sc, struct g_consumer *cp)
481133808Spjd{
482133808Spjd
483133808Spjd	g_topology_assert();
484133808Spjd
485133808Spjd	if (cp == NULL)
486133808Spjd		return;
487139144Spjd	if (cp->provider != NULL)
488133808Spjd		g_raid3_kill_consumer(sc, cp);
489139144Spjd	else
490133808Spjd		g_destroy_consumer(cp);
491133808Spjd}
492133808Spjd
/*
 * Initialize disk. This means allocate memory, create consumer, attach it
 * to the provider and open access (r1w1e1) to it.
 * On failure NULL is returned and, if errorp is non-NULL, *errorp holds
 * the errno; on success *errorp is set to 0.
 */
static struct g_raid3_disk *
g_raid3_init_disk(struct g_raid3_softc *sc, struct g_provider *pp,
    struct g_raid3_metadata *md, int *errorp)
{
	struct g_raid3_disk *disk;
	int error;

	/* The disk slot is fixed by the component number in the metadata. */
	disk = &sc->sc_disks[md->md_no];
	error = g_raid3_connect_disk(disk, pp);
	if (error != 0) {
		if (errorp != NULL)
			*errorp = error;
		return (NULL);
	}
	disk->d_state = G_RAID3_DISK_STATE_NONE;
	disk->d_flags = md->md_dflags;
	/* A non-empty provider name in metadata means it was hardcoded. */
	if (md->md_provider[0] != '\0')
		disk->d_flags |= G_RAID3_DISK_FLAG_HARDCODED;
	disk->d_sync.ds_consumer = NULL;
	disk->d_sync.ds_offset = md->md_sync_offset;
	disk->d_sync.ds_offset_done = md->md_sync_offset;
	disk->d_sync.ds_resync = -1;
	disk->d_genid = md->md_genid;
	disk->d_sync.ds_syncid = md->md_syncid;
	if (errorp != NULL)
		*errorp = 0;
	return (disk);
}
525133808Spjd
/*
 * Disconnect a disk and return its slot to the NODISK state.  Pending
 * events for the disk are cancelled first; a disk that is currently the
 * synchronization source has its sync stopped before disconnecting.
 */
static void
g_raid3_destroy_disk(struct g_raid3_disk *disk)
{
	struct g_raid3_softc *sc;

	g_topology_assert();

	if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
		return;
	g_raid3_event_cancel(disk);
	sc = disk->d_softc;
	switch (disk->d_state) {
	case G_RAID3_DISK_STATE_SYNCHRONIZING:
		if (sc->sc_syncdisk != NULL)
			g_raid3_sync_stop(sc, 1);
		/* FALLTHROUGH */
	case G_RAID3_DISK_STATE_NEW:
	case G_RAID3_DISK_STATE_STALE:
	case G_RAID3_DISK_STATE_ACTIVE:
		g_raid3_disconnect_consumer(sc, disk->d_consumer);
		disk->d_consumer = NULL;
		break;
	default:
		KASSERT(0 == 1, ("Wrong disk state (%s, %s).",
		    g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
	}
	disk->d_state = G_RAID3_DISK_STATE_NODISK;
}
555133808Spjd
/*
 * Tear down the whole RAID3 device: destroy the provider, mark every
 * component clean and disconnect it, cancel or complete all pending
 * events, and wither both the device geom and the synchronization geom.
 */
static void
g_raid3_destroy_device(struct g_raid3_softc *sc)
{
	struct g_raid3_event *ep;
	struct g_raid3_disk *disk;
	struct g_geom *gp;
	struct g_consumer *cp;
	u_int n;

	g_topology_assert();

	gp = sc->sc_geom;
	if (sc->sc_provider != NULL)
		g_raid3_destroy_provider(sc);
	for (n = 0; n < sc->sc_ndisks; n++) {
		disk = &sc->sc_disks[n];
		if (disk->d_state != G_RAID3_DISK_STATE_NODISK) {
			/* Clear DIRTY so the on-disk metadata says clean. */
			disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
			g_raid3_update_metadata(disk);
			g_raid3_destroy_disk(disk);
		}
	}
	/* Drain the event queue; waiters get ECANCELED and are woken. */
	while ((ep = g_raid3_event_get(sc)) != NULL) {
		g_raid3_event_remove(sc, ep);
		if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0)
			g_raid3_event_free(ep);
		else {
			ep->e_error = ECANCELED;
			ep->e_flags |= G_RAID3_EVENT_DONE;
			G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, ep);
			mtx_lock(&sc->sc_events_mtx);
			wakeup(ep);
			mtx_unlock(&sc->sc_events_mtx);
		}
	}
	callout_drain(&sc->sc_callout);
	gp->softc = NULL;
	cp = LIST_FIRST(&sc->sc_sync.ds_geom->consumer);
	if (cp != NULL)
		g_raid3_disconnect_consumer(sc, cp);
	sc->sc_sync.ds_geom->softc = NULL;
	g_wither_geom(sc->sc_sync.ds_geom, ENXIO);
	uma_zdestroy(sc->sc_zone_64k);
	uma_zdestroy(sc->sc_zone_16k);
	uma_zdestroy(sc->sc_zone_4k);
	mtx_destroy(&sc->sc_queue_mtx);
	mtx_destroy(&sc->sc_events_mtx);
	G_RAID3_DEBUG(0, "Device %s destroyed.", gp->name);
	g_wither_geom(gp, ENXIO);
}
606133808Spjd
/*
 * GEOM orphan method: the underlying provider went away.  Request a
 * syncid bump and queue a DISCONNECTED event for the affected disk
 * (fire-and-forget; we are called with the topology lock held).
 */
static void
g_raid3_orphan(struct g_consumer *cp)
{
	struct g_raid3_disk *disk;

	g_topology_assert();

	disk = cp->private;
	/* private is cleared in g_raid3_kill_consumer() for dying consumers. */
	if (disk == NULL)
		return;
	disk->d_softc->sc_bump_id = G_RAID3_BUMP_SYNCID;
	g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED,
	    G_RAID3_EVENT_DONTWAIT);
}
621133808Spjd
/*
 * Write the metadata sector (the last sector of the component) to the
 * disk's consumer.  A NULL 'md' writes a zeroed sector, clearing the
 * metadata.  On write failure the genid is scheduled for a bump and a
 * DISCONNECTED event is queued for the disk.
 */
static int
g_raid3_write_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md)
{
	struct g_raid3_softc *sc;
	struct g_consumer *cp;
	off_t offset, length;
	u_char *sector;
	int error = 0;

	g_topology_assert();

	sc = disk->d_softc;
	cp = disk->d_consumer;
	KASSERT(cp != NULL, ("NULL consumer (%s).", sc->sc_name));
	KASSERT(cp->provider != NULL, ("NULL provider (%s).", sc->sc_name));
	KASSERT(cp->acr == 1 && cp->acw == 1 && cp->ace == 1,
	    ("Consumer %s closed? (r%dw%de%d).", cp->provider->name, cp->acr,
	    cp->acw, cp->ace));
	/* Metadata lives in the component's last sector. */
	length = cp->provider->sectorsize;
	offset = cp->provider->mediasize - length;
	sector = malloc((size_t)length, M_RAID3, M_WAITOK | M_ZERO);
	if (md != NULL)
		raid3_metadata_encode(md, sector);
	/* Drop the topology lock around the actual I/O. */
	g_topology_unlock();
	error = g_write_data(cp, offset, sector, length);
	g_topology_lock();
	free(sector, M_RAID3);
	if (error != 0) {
		disk->d_softc->sc_bump_id = G_RAID3_BUMP_GENID;
		g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED,
		    G_RAID3_EVENT_DONTWAIT);
	}
	return (error);
}
656133808Spjd
657133808Spjdint
658133808Spjdg_raid3_clear_metadata(struct g_raid3_disk *disk)
659133808Spjd{
660133808Spjd	int error;
661133808Spjd
662133808Spjd	g_topology_assert();
663133808Spjd	error = g_raid3_write_metadata(disk, NULL);
664133808Spjd	if (error == 0) {
665133808Spjd		G_RAID3_DEBUG(2, "Metadata on %s cleared.",
666133808Spjd		    g_raid3_get_diskname(disk));
667133808Spjd	} else {
668133808Spjd		G_RAID3_DEBUG(0,
669133808Spjd		    "Cannot clear metadata on disk %s (error=%d).",
670133808Spjd		    g_raid3_get_diskname(disk), error);
671133808Spjd	}
672133808Spjd	return (error);
673133808Spjd}
674133808Spjd
/*
 * Populate 'md' with the current on-disk metadata for 'disk', taken from
 * the device softc and the disk's own state.  The provider name is only
 * recorded when the disk is flagged HARDCODED and still attached.
 */
void
g_raid3_fill_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md)
{
	struct g_raid3_softc *sc;
	struct g_provider *pp;

	sc = disk->d_softc;
	strlcpy(md->md_magic, G_RAID3_MAGIC, sizeof(md->md_magic));
	md->md_version = G_RAID3_VERSION;
	strlcpy(md->md_name, sc->sc_name, sizeof(md->md_name));
	md->md_id = sc->sc_id;
	md->md_all = sc->sc_ndisks;
	md->md_genid = sc->sc_genid;
	md->md_mediasize = sc->sc_mediasize;
	md->md_sectorsize = sc->sc_sectorsize;
	md->md_mflags = (sc->sc_flags & G_RAID3_DEVICE_FLAG_MASK);
	md->md_no = disk->d_no;
	md->md_syncid = disk->d_sync.ds_syncid;
	md->md_dflags = (disk->d_flags & G_RAID3_DISK_FLAG_MASK);
	/* Only a synchronizing disk records how far the sync has gotten. */
	if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING)
		md->md_sync_offset = disk->d_sync.ds_offset_done;
	else
		md->md_sync_offset = 0;
	if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL)
		pp = disk->d_consumer->provider;
	else
		pp = NULL;
	if ((disk->d_flags & G_RAID3_DISK_FLAG_HARDCODED) != 0 && pp != NULL)
		strlcpy(md->md_provider, pp->name, sizeof(md->md_provider));
	else
		bzero(md->md_provider, sizeof(md->md_provider));
	if (pp != NULL)
		md->md_provsize = pp->mediasize;
	else
		md->md_provsize = 0;
}
711133808Spjd
712133808Spjdvoid
713133808Spjdg_raid3_update_metadata(struct g_raid3_disk *disk)
714133808Spjd{
715133808Spjd	struct g_raid3_metadata md;
716133808Spjd	int error;
717133808Spjd
718133808Spjd	g_topology_assert();
719133808Spjd	g_raid3_fill_metadata(disk, &md);
720133808Spjd	error = g_raid3_write_metadata(disk, &md);
721133808Spjd	if (error == 0) {
722133808Spjd		G_RAID3_DEBUG(2, "Metadata on %s updated.",
723133808Spjd		    g_raid3_get_diskname(disk));
724133808Spjd	} else {
725133808Spjd		G_RAID3_DEBUG(0,
726133808Spjd		    "Cannot update metadata on disk %s (error=%d).",
727133808Spjd		    g_raid3_get_diskname(disk), error);
728133808Spjd	}
729133808Spjd}
730133808Spjd
/*
 * Increment the device syncid and propagate it to the metadata of every
 * ACTIVE or SYNCHRONIZING disk.  Requires at least one active disk.
 */
static void
g_raid3_bump_syncid(struct g_raid3_softc *sc)
{
	struct g_raid3_disk *disk;
	u_int n;

	g_topology_assert();
	KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) > 0,
	    ("%s called with no active disks (device=%s).", __func__,
	    sc->sc_name));

	sc->sc_syncid++;
	G_RAID3_DEBUG(1, "Device %s: syncid bumped to %u.", sc->sc_name,
	    sc->sc_syncid);
	for (n = 0; n < sc->sc_ndisks; n++) {
		disk = &sc->sc_disks[n];
		if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
		    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
			disk->d_sync.ds_syncid = sc->sc_syncid;
			g_raid3_update_metadata(disk);
		}
	}
}
754133808Spjd
/*
 * Increment the device genid and propagate it to the metadata of every
 * ACTIVE or SYNCHRONIZING disk; mirrors g_raid3_bump_syncid().
 */
static void
g_raid3_bump_genid(struct g_raid3_softc *sc)
{
	struct g_raid3_disk *disk;
	u_int n;

	g_topology_assert();
	KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) > 0,
	    ("%s called with no active disks (device=%s).", __func__,
	    sc->sc_name));

	sc->sc_genid++;
	G_RAID3_DEBUG(1, "Device %s: genid bumped to %u.", sc->sc_name,
	    sc->sc_genid);
	for (n = 0; n < sc->sc_ndisks; n++) {
		disk = &sc->sc_disks[n];
		if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
		    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
			disk->d_genid = sc->sc_genid;
			g_raid3_update_metadata(disk);
		}
	}
}
778139295Spjd
779139295Spjdstatic void
780137258Spjdg_raid3_idle(struct g_raid3_softc *sc)
781137258Spjd{
782137258Spjd	struct g_raid3_disk *disk;
783137258Spjd	u_int i;
784137258Spjd
785137258Spjd	if (sc->sc_provider == NULL || sc->sc_provider->acw == 0)
786137258Spjd		return;
787137258Spjd	sc->sc_idle = 1;
788137258Spjd	g_topology_lock();
789137258Spjd	for (i = 0; i < sc->sc_ndisks; i++) {
790137258Spjd		disk = &sc->sc_disks[i];
791137258Spjd		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
792137258Spjd			continue;
793137258Spjd		G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.",
794137258Spjd		    g_raid3_get_diskname(disk), sc->sc_name);
795137258Spjd		disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
796137258Spjd		g_raid3_update_metadata(disk);
797137258Spjd	}
798137258Spjd	g_topology_unlock();
799137258Spjd}
800137258Spjd
801137258Spjdstatic void
802137258Spjdg_raid3_unidle(struct g_raid3_softc *sc)
803137258Spjd{
804137258Spjd	struct g_raid3_disk *disk;
805137258Spjd	u_int i;
806137258Spjd
807137258Spjd	sc->sc_idle = 0;
808137258Spjd	g_topology_lock();
809137258Spjd	for (i = 0; i < sc->sc_ndisks; i++) {
810137258Spjd		disk = &sc->sc_disks[i];
811137258Spjd		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
812137258Spjd			continue;
813137258Spjd		G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.",
814137258Spjd		    g_raid3_get_diskname(disk), sc->sc_name);
815137258Spjd		disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
816137258Spjd		g_raid3_update_metadata(disk);
817137258Spjd	}
818137258Spjd	g_topology_unlock();
819137258Spjd}
820137258Spjd
821137259Spjd/*
822137259Spjd * Return 1 if we should check if RAID3 device is idling.
823137259Spjd */
824137259Spjdstatic int
825137259Spjdg_raid3_check_idle(struct g_raid3_softc *sc)
826137259Spjd{
827137259Spjd	struct g_raid3_disk *disk;
828137259Spjd	u_int i;
829137259Spjd
830137259Spjd	if (sc->sc_idle)
831137259Spjd		return (0);
832137259Spjd	if (sc->sc_provider != NULL && sc->sc_provider->acw == 0)
833137259Spjd		return (0);
834137259Spjd	/*
835137259Spjd	 * Check if there are no in-flight requests.
836137259Spjd	 */
837137259Spjd	for (i = 0; i < sc->sc_ndisks; i++) {
838137259Spjd		disk = &sc->sc_disks[i];
839137259Spjd		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
840137259Spjd			continue;
841137259Spjd		if (disk->d_consumer->index > 0)
842137259Spjd			return (0);
843137259Spjd	}
844137259Spjd	return (1);
845137259Spjd}
846137259Spjd
847133808Spjd/*
848133808Spjd * Treat bio_driver1 field in parent bio as list head and field bio_caller1
849133808Spjd * in child bio as pointer to the next element on the list.
850133808Spjd */
851133808Spjd#define	G_RAID3_HEAD_BIO(pbp)	(pbp)->bio_driver1
852133808Spjd
853133808Spjd#define	G_RAID3_NEXT_BIO(cbp)	(cbp)->bio_caller1
854133808Spjd
855133808Spjd#define	G_RAID3_FOREACH_BIO(pbp, bp)					\
856133808Spjd	for ((bp) = G_RAID3_HEAD_BIO(pbp); (bp) != NULL;		\
857133808Spjd	    (bp) = G_RAID3_NEXT_BIO(bp))
858133808Spjd
859133808Spjd#define	G_RAID3_FOREACH_SAFE_BIO(pbp, bp, tmpbp)			\
860133808Spjd	for ((bp) = G_RAID3_HEAD_BIO(pbp);				\
861133808Spjd	    (bp) != NULL && ((tmpbp) = G_RAID3_NEXT_BIO(bp), 1);	\
862133808Spjd	    (bp) = (tmpbp))
863133808Spjd
/*
 * Initialize the parent bio's list of cloned child bios (see the
 * G_RAID3_HEAD_BIO/G_RAID3_NEXT_BIO macros above) to be empty.
 */
static void
g_raid3_init_bio(struct bio *pbp)
{

	G_RAID3_HEAD_BIO(pbp) = NULL;
}
870133808Spjd
871133808Spjdstatic void
872134168Spjdg_raid3_remove_bio(struct bio *cbp)
873134168Spjd{
874134168Spjd	struct bio *pbp, *bp;
875134168Spjd
876134168Spjd	pbp = cbp->bio_parent;
877134168Spjd	if (G_RAID3_HEAD_BIO(pbp) == cbp)
878134168Spjd		G_RAID3_HEAD_BIO(pbp) = G_RAID3_NEXT_BIO(cbp);
879134168Spjd	else {
880134168Spjd		G_RAID3_FOREACH_BIO(pbp, bp) {
881134168Spjd			if (G_RAID3_NEXT_BIO(bp) == cbp) {
882134168Spjd				G_RAID3_NEXT_BIO(bp) = G_RAID3_NEXT_BIO(cbp);
883134168Spjd				break;
884134168Spjd			}
885134168Spjd		}
886134168Spjd	}
887134168Spjd	G_RAID3_NEXT_BIO(cbp) = NULL;
888134168Spjd}
889134168Spjd
890134168Spjdstatic void
891134168Spjdg_raid3_replace_bio(struct bio *sbp, struct bio *dbp)
892134168Spjd{
893134168Spjd	struct bio *pbp, *bp;
894134168Spjd
895134168Spjd	g_raid3_remove_bio(sbp);
896134168Spjd	pbp = dbp->bio_parent;
897134168Spjd	G_RAID3_NEXT_BIO(sbp) = G_RAID3_NEXT_BIO(dbp);
898134168Spjd	if (G_RAID3_HEAD_BIO(pbp) == dbp)
899134168Spjd		G_RAID3_HEAD_BIO(pbp) = sbp;
900134168Spjd	else {
901134168Spjd		G_RAID3_FOREACH_BIO(pbp, bp) {
902134168Spjd			if (G_RAID3_NEXT_BIO(bp) == dbp) {
903134168Spjd				G_RAID3_NEXT_BIO(bp) = sbp;
904134168Spjd				break;
905134168Spjd			}
906134168Spjd		}
907134168Spjd	}
908134168Spjd	G_RAID3_NEXT_BIO(dbp) = NULL;
909134168Spjd}
910134168Spjd
911134168Spjdstatic void
912133808Spjdg_raid3_destroy_bio(struct g_raid3_softc *sc, struct bio *cbp)
913133808Spjd{
914133808Spjd	struct bio *bp, *pbp;
915133808Spjd	size_t size;
916133808Spjd
917133808Spjd	pbp = cbp->bio_parent;
918133808Spjd	pbp->bio_children--;
919133808Spjd	KASSERT(cbp->bio_data != NULL, ("NULL bio_data"));
920133808Spjd	size = pbp->bio_length / (sc->sc_ndisks - 1);
921133808Spjd	if (size > 16384)
922133808Spjd		uma_zfree(sc->sc_zone_64k, cbp->bio_data);
923133808Spjd	else if (size > 4096)
924133808Spjd		uma_zfree(sc->sc_zone_16k, cbp->bio_data);
925133808Spjd	else
926133808Spjd		uma_zfree(sc->sc_zone_4k, cbp->bio_data);
927133808Spjd	if (G_RAID3_HEAD_BIO(pbp) == cbp) {
928133808Spjd		G_RAID3_HEAD_BIO(pbp) = G_RAID3_NEXT_BIO(cbp);
929133808Spjd		G_RAID3_NEXT_BIO(cbp) = NULL;
930133808Spjd		g_destroy_bio(cbp);
931133808Spjd	} else {
932133808Spjd		G_RAID3_FOREACH_BIO(pbp, bp) {
933133808Spjd			if (G_RAID3_NEXT_BIO(bp) == cbp)
934133808Spjd				break;
935133808Spjd		}
936134168Spjd		if (bp != NULL) {
937134168Spjd			KASSERT(G_RAID3_NEXT_BIO(bp) != NULL,
938134168Spjd			    ("NULL bp->bio_driver1"));
939134168Spjd			G_RAID3_NEXT_BIO(bp) = G_RAID3_NEXT_BIO(cbp);
940134168Spjd			G_RAID3_NEXT_BIO(cbp) = NULL;
941134168Spjd		}
942133808Spjd		g_destroy_bio(cbp);
943133808Spjd	}
944133808Spjd}
945133808Spjd
946133808Spjdstatic struct bio *
947133808Spjdg_raid3_clone_bio(struct g_raid3_softc *sc, struct bio *pbp)
948133808Spjd{
949133808Spjd	struct bio *bp, *cbp;
950133808Spjd	size_t size;
951133808Spjd
952133808Spjd	cbp = g_clone_bio(pbp);
953133808Spjd	if (cbp == NULL)
954133808Spjd		return (NULL);
955133808Spjd	size = pbp->bio_length / (sc->sc_ndisks - 1);
956133808Spjd	if (size > 16384) {
957133808Spjd		cbp->bio_data = uma_zalloc(sc->sc_zone_64k, M_NOWAIT);
958133808Spjd		g_raid3_64k_requested++;
959133808Spjd	} else if (size > 4096) {
960133808Spjd		cbp->bio_data = uma_zalloc(sc->sc_zone_16k, M_NOWAIT);
961133808Spjd		g_raid3_16k_requested++;
962133808Spjd	} else {
963133808Spjd		cbp->bio_data = uma_zalloc(sc->sc_zone_4k, M_NOWAIT);
964133808Spjd		g_raid3_4k_requested++;
965133808Spjd	}
966133808Spjd	if (cbp->bio_data == NULL) {
967133808Spjd		if (size > 16384)
968133808Spjd			g_raid3_64k_failed++;
969133808Spjd		if (size > 4096)
970133808Spjd			g_raid3_16k_failed++;
971133808Spjd		else
972133808Spjd			g_raid3_4k_failed++;
973133808Spjd		pbp->bio_children--;
974133808Spjd		g_destroy_bio(cbp);
975133808Spjd		return (NULL);
976133808Spjd	}
977133808Spjd	G_RAID3_NEXT_BIO(cbp) = NULL;
978133808Spjd	if (G_RAID3_HEAD_BIO(pbp) == NULL)
979133808Spjd		G_RAID3_HEAD_BIO(pbp) = cbp;
980133808Spjd	else {
981133808Spjd		G_RAID3_FOREACH_BIO(pbp, bp) {
982133808Spjd			if (G_RAID3_NEXT_BIO(bp) == NULL) {
983133808Spjd				G_RAID3_NEXT_BIO(bp) = cbp;
984133808Spjd				break;
985133808Spjd			}
986133808Spjd		}
987133808Spjd	}
988133808Spjd	return (cbp);
989133808Spjd}
990133808Spjd
/*
 * Split a parent WRITE request across the components: copy the data
 * atom by atom into each child buffer, compute the parity component by
 * XOR-ing the data components (unless NOPARITY is set) and send every
 * child request to its consumer.
 */
static void
g_raid3_scatter(struct bio *pbp)
{
	struct g_raid3_softc *sc;
	struct g_raid3_disk *disk;
	struct bio *bp, *cbp;
	off_t atom, cadd, padd, left;

	sc = pbp->bio_to->geom->softc;
	bp = NULL;
	if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_NOPARITY) == 0) {
		/*
		 * Find bio for which we should calculate data.
		 */
		G_RAID3_FOREACH_BIO(pbp, cbp) {
			if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) {
				bp = cbp;
				break;
			}
		}
		KASSERT(bp != NULL, ("NULL parity bio."));
	}
	/* An "atom" is one component's share of a single sector. */
	atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
	cadd = padd = 0;
	/* Stripe the parent's data across the data components. */
	for (left = pbp->bio_length; left > 0; left -= sc->sc_sectorsize) {
		G_RAID3_FOREACH_BIO(pbp, cbp) {
			if (cbp == bp)
				continue;
			bcopy(pbp->bio_data + padd, cbp->bio_data + cadd, atom);
			padd += atom;
		}
		cadd += atom;
	}
	if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_NOPARITY) == 0) {
		struct bio *tmpbp;

		/*
		 * Calculate parity.
		 */
		bzero(bp->bio_data, bp->bio_length);
		/*
		 * SAFE iteration: children flagged NODISK (their disk is
		 * absent) are destroyed as soon as they have been folded
		 * into the parity.
		 */
		G_RAID3_FOREACH_SAFE_BIO(pbp, cbp, tmpbp) {
			if (cbp == bp)
				continue;
			g_raid3_xor(cbp->bio_data, bp->bio_data, bp->bio_data,
			    bp->bio_length);
			if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_NODISK) != 0)
				g_raid3_destroy_bio(sc, cbp);
		}
	}
	/* Dispatch the remaining child requests to their consumers. */
	G_RAID3_FOREACH_BIO(pbp, cbp) {
		struct g_consumer *cp;

		disk = cbp->bio_caller2;
		cp = disk->d_consumer;
		cbp->bio_to = cp->provider;
		G_RAID3_LOGREQ(3, cbp, "Sending request.");
		KASSERT(cp->acr == 1 && cp->acw == 1 && cp->ace == 1,
		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
		    cp->acr, cp->acw, cp->ace));
		cp->index++;
		g_io_request(cbp, cp);
	}
}
1054133808Spjd
1055133808Spjdstatic void
1056133808Spjdg_raid3_gather(struct bio *pbp)
1057133808Spjd{
1058133808Spjd	struct g_raid3_softc *sc;
1059133808Spjd	struct g_raid3_disk *disk;
1060134124Spjd	struct bio *xbp, *fbp, *cbp;
1061133808Spjd	off_t atom, cadd, padd, left;
1062133808Spjd
1063133808Spjd	sc = pbp->bio_to->geom->softc;
1064134124Spjd	/*
1065134124Spjd	 * Find bio for which we have to calculate data.
1066134124Spjd	 * While going through this path, check if all requests
1067134124Spjd	 * succeeded, if not, deny whole request.
1068134124Spjd	 * If we're in COMPLETE mode, we allow one request to fail,
1069134124Spjd	 * so if we find one, we're sending it to the parity consumer.
1070134124Spjd	 * If there are more failed requests, we deny whole request.
1071134124Spjd	 */
1072134124Spjd	xbp = fbp = NULL;
1073134124Spjd	G_RAID3_FOREACH_BIO(pbp, cbp) {
1074134124Spjd		if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) {
1075134124Spjd			KASSERT(xbp == NULL, ("More than one parity bio."));
1076134124Spjd			xbp = cbp;
1077134124Spjd		}
1078134124Spjd		if (cbp->bio_error == 0)
1079134124Spjd			continue;
1080133808Spjd		/*
1081134124Spjd		 * Found failed request.
1082133808Spjd		 */
1083134124Spjd		G_RAID3_LOGREQ(0, cbp, "Request failed.");
1084134124Spjd		disk = cbp->bio_caller2;
1085134124Spjd		if (disk != NULL) {
1086133808Spjd			/*
1087139295Spjd			 * Actually this is pointless to bump genid,
1088134124Spjd			 * because whole device is fucked up.
1089133808Spjd			 */
1090139671Spjd			sc->sc_bump_id |= G_RAID3_BUMP_GENID;
1091134124Spjd			g_raid3_event_send(disk,
1092134124Spjd			    G_RAID3_DISK_STATE_DISCONNECTED,
1093134124Spjd			    G_RAID3_EVENT_DONTWAIT);
1094134124Spjd		}
1095134124Spjd		if (fbp == NULL) {
1096134124Spjd			if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_DEGRADED) != 0) {
1097133808Spjd				/*
1098134124Spjd				 * We are already in degraded mode, so we can't
1099134124Spjd				 * accept any failures.
1100133808Spjd				 */
1101134124Spjd				if (pbp->bio_error == 0)
1102134124Spjd					pbp->bio_error = fbp->bio_error;
1103134124Spjd			} else {
1104134124Spjd				fbp = cbp;
1105133808Spjd			}
1106134124Spjd		} else {
1107133808Spjd			/*
1108134124Spjd			 * Next failed request, that's too many.
1109133808Spjd			 */
1110134124Spjd			if (pbp->bio_error == 0)
1111134124Spjd				pbp->bio_error = fbp->bio_error;
1112134124Spjd		}
1113134124Spjd	}
1114134124Spjd	if (pbp->bio_error != 0)
1115134124Spjd		goto finish;
1116134168Spjd	if (fbp != NULL && (pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) {
1117134168Spjd		pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_VERIFY;
1118134168Spjd		if (xbp != fbp)
1119134168Spjd			g_raid3_replace_bio(xbp, fbp);
1120134168Spjd		g_raid3_destroy_bio(sc, fbp);
1121134168Spjd	} else if (fbp != NULL) {
1122134124Spjd		struct g_consumer *cp;
1123134124Spjd
1124134124Spjd		/*
1125134124Spjd		 * One request failed, so send the same request to
1126134124Spjd		 * the parity consumer.
1127134124Spjd		 */
1128134124Spjd		disk = pbp->bio_driver2;
1129134124Spjd		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) {
1130134124Spjd			pbp->bio_error = fbp->bio_error;
1131133808Spjd			goto finish;
1132133808Spjd		}
1133134124Spjd		pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED;
1134134124Spjd		pbp->bio_inbed--;
1135134124Spjd		fbp->bio_flags &= ~(BIO_DONE | BIO_ERROR);
1136134124Spjd		if (disk->d_no == sc->sc_ndisks - 1)
1137134124Spjd			fbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
1138134124Spjd		fbp->bio_error = 0;
1139134124Spjd		fbp->bio_completed = 0;
1140134124Spjd		fbp->bio_children = 0;
1141134124Spjd		fbp->bio_inbed = 0;
1142134124Spjd		cp = disk->d_consumer;
1143134124Spjd		fbp->bio_caller2 = disk;
1144134124Spjd		fbp->bio_to = cp->provider;
1145134124Spjd		G_RAID3_LOGREQ(3, fbp, "Sending request (recover).");
1146139144Spjd		KASSERT(cp->acr == 1 && cp->acw == 1 && cp->ace == 1,
1147134124Spjd		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
1148134124Spjd		    cp->acr, cp->acw, cp->ace));
1149137256Spjd		cp->index++;
1150134124Spjd		g_io_request(fbp, cp);
1151134124Spjd		return;
1152134124Spjd	}
1153134124Spjd	if (xbp != NULL) {
1154133808Spjd		/*
1155133808Spjd		 * Calculate parity.
1156133808Spjd		 */
1157133808Spjd		G_RAID3_FOREACH_BIO(pbp, cbp) {
1158133808Spjd			if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0)
1159133808Spjd				continue;
1160134124Spjd			g_raid3_xor(cbp->bio_data, xbp->bio_data, xbp->bio_data,
1161134124Spjd			    xbp->bio_length);
1162133808Spjd		}
1163134124Spjd		xbp->bio_cflags &= ~G_RAID3_BIO_CFLAG_PARITY;
1164134168Spjd		if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) {
1165134168Spjd			if (!g_raid3_is_zero(xbp)) {
1166134168Spjd				g_raid3_parity_mismatch++;
1167134168Spjd				pbp->bio_error = EIO;
1168134168Spjd				goto finish;
1169134168Spjd			}
1170134168Spjd			g_raid3_destroy_bio(sc, xbp);
1171134168Spjd		}
1172133808Spjd	}
1173133808Spjd	atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
1174133808Spjd	cadd = padd = 0;
1175133808Spjd	for (left = pbp->bio_length; left > 0; left -= sc->sc_sectorsize) {
1176133808Spjd		G_RAID3_FOREACH_BIO(pbp, cbp) {
1177133808Spjd			bcopy(cbp->bio_data + cadd, pbp->bio_data + padd, atom);
1178133808Spjd			pbp->bio_completed += atom;
1179133808Spjd			padd += atom;
1180133808Spjd		}
1181133808Spjd		cadd += atom;
1182133808Spjd	}
1183133808Spjdfinish:
1184133808Spjd	if (pbp->bio_error == 0)
1185133808Spjd		G_RAID3_LOGREQ(3, pbp, "Request finished.");
1186134303Spjd	else {
1187134303Spjd		if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0)
1188134303Spjd			G_RAID3_LOGREQ(1, pbp, "Verification error.");
1189134303Spjd		else
1190134303Spjd			G_RAID3_LOGREQ(0, pbp, "Request failed.");
1191134303Spjd	}
1192134168Spjd	pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_MASK;
1193133808Spjd	g_io_deliver(pbp, pbp->bio_error);
1194133808Spjd	while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL)
1195133808Spjd		g_raid3_destroy_bio(sc, cbp);
1196133808Spjd}
1197133808Spjd
/*
 * Completion callback for regular (read/write/delete) child requests.
 * Marks the bio as a finished regular request and hands it to the
 * worker thread through the device queue.
 */
static void
g_raid3_done(struct bio *bp)
{
	struct g_raid3_softc *sc;

	sc = bp->bio_from->geom->softc;
	bp->bio_cflags |= G_RAID3_BIO_CFLAG_REGULAR;
	G_RAID3_LOGREQ(3, bp, "Regular request done (error=%d).", bp->bio_error);
	mtx_lock(&sc->sc_queue_mtx);
	/* Completions go to the head so they are serviced before new I/O. */
	bioq_insert_head(&sc->sc_queue, bp);
	wakeup(sc);
	wakeup(&sc->sc_queue);
	mtx_unlock(&sc->sc_queue_mtx);
}
1212133808Spjd
/*
 * Process one finished regular child request.  When the last child of
 * a parent bio arrives, either reconstruct/deliver the READ (via
 * g_raid3_gather()) or account errors and deliver the WRITE/DELETE.
 */
static void
g_raid3_regular_request(struct bio *cbp)
{
	struct g_raid3_softc *sc;
	struct g_raid3_disk *disk;
	struct bio *pbp;

	g_topology_assert_not();

	/* One less in-flight request on this consumer. */
	cbp->bio_from->index--;
	pbp = cbp->bio_parent;
	sc = pbp->bio_to->geom->softc;
	disk = cbp->bio_from->private;
	if (disk == NULL) {
		/* Consumer's disk is gone - release the consumer. */
		g_topology_lock();
		g_raid3_kill_consumer(sc, cbp->bio_from);
		g_topology_unlock();
	}

	G_RAID3_LOGREQ(3, cbp, "Request finished.");
	pbp->bio_inbed++;
	KASSERT(pbp->bio_inbed <= pbp->bio_children,
	    ("bio_inbed (%u) is bigger than bio_children (%u).", pbp->bio_inbed,
	    pbp->bio_children));
	/* Wait until every child of this parent has completed. */
	if (pbp->bio_inbed != pbp->bio_children)
		return;
	switch (pbp->bio_cmd) {
	case BIO_READ:
		g_raid3_gather(pbp);
		break;
	case BIO_WRITE:
	case BIO_DELETE:
	    {
		int error = 0;

		pbp->bio_completed = pbp->bio_length;
		while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL) {
			if (cbp->bio_error != 0) {
				/* Disconnect the disk that failed the write. */
				disk = cbp->bio_caller2;
				if (disk != NULL) {
					sc->sc_bump_id |= G_RAID3_BUMP_GENID;
					g_raid3_event_send(disk,
					    G_RAID3_DISK_STATE_DISCONNECTED,
					    G_RAID3_EVENT_DONTWAIT);
				}
				/*
				 * One write may fail (we fall to degraded
				 * mode); a second failure fails the parent.
				 */
				if (error == 0)
					error = cbp->bio_error;
				else if (pbp->bio_error == 0) {
					/*
					 * Next failed request, that's too many.
					 */
					pbp->bio_error = error;
				}
			}
			g_raid3_destroy_bio(sc, cbp);
		}
		if (pbp->bio_error == 0)
			G_RAID3_LOGREQ(3, pbp, "Request finished.");
		else
			G_RAID3_LOGREQ(0, pbp, "Request failed.");
		pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_DEGRADED;
		pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_NOPARITY;
		g_io_deliver(pbp, pbp->bio_error);
		break;
	    }
	}
}
1280133808Spjd
/*
 * Completion callback for synchronization requests.  Marks the bio as
 * a sync request and hands it to the worker thread through the device
 * queue.
 */
static void
g_raid3_sync_done(struct bio *bp)
{
	struct g_raid3_softc *sc;

	G_RAID3_LOGREQ(3, bp, "Synchronization request delivered.");
	sc = bp->bio_from->geom->softc;
	bp->bio_cflags |= G_RAID3_BIO_CFLAG_SYNC;
	mtx_lock(&sc->sc_queue_mtx);
	/* Completions go to the head so they are serviced before new I/O. */
	bioq_insert_head(&sc->sc_queue, bp);
	wakeup(sc);
	wakeup(&sc->sc_queue);
	mtx_unlock(&sc->sc_queue_mtx);
}
1295133808Spjd
1296133808Spjdstatic void
1297133808Spjdg_raid3_start(struct bio *bp)
1298133808Spjd{
1299133808Spjd	struct g_raid3_softc *sc;
1300133808Spjd
1301133808Spjd	sc = bp->bio_to->geom->softc;
1302133808Spjd	/*
1303133808Spjd	 * If sc == NULL or there are no valid disks, provider's error
1304133808Spjd	 * should be set and g_raid3_start() should not be called at all.
1305133808Spjd	 */
1306133808Spjd	KASSERT(sc != NULL && (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
1307133808Spjd	    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE),
1308133808Spjd	    ("Provider's error should be set (error=%d)(device=%s).",
1309133808Spjd	    bp->bio_to->error, bp->bio_to->name));
1310133808Spjd	G_RAID3_LOGREQ(3, bp, "Request received.");
1311133808Spjd
1312133808Spjd	switch (bp->bio_cmd) {
1313133808Spjd	case BIO_READ:
1314133808Spjd	case BIO_WRITE:
1315133808Spjd	case BIO_DELETE:
1316133808Spjd		break;
1317133808Spjd	case BIO_GETATTR:
1318133808Spjd	default:
1319133808Spjd		g_io_deliver(bp, EOPNOTSUPP);
1320133808Spjd		return;
1321133808Spjd	}
1322133808Spjd	mtx_lock(&sc->sc_queue_mtx);
1323133808Spjd	bioq_insert_tail(&sc->sc_queue, bp);
1324133808Spjd	G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc);
1325133808Spjd	wakeup(sc);
1326133808Spjd	mtx_unlock(&sc->sc_queue_mtx);
1327133808Spjd}
1328133808Spjd
/*
 * Send one synchronization request: a READ of the next chunk of the
 * array's address space, issued on behalf of the synchronizing disk.
 * The completed READ is later turned into a WRITE to that disk by
 * g_raid3_sync_request().
 */
static void
g_raid3_sync_one(struct g_raid3_softc *sc)
{
	struct g_raid3_disk *disk;
	struct bio *bp;

	KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED,
	    ("Wrong device state (%s, %s).", sc->sc_name,
	    g_raid3_device_state2str(sc->sc_state)));
	disk = sc->sc_syncdisk;
	KASSERT(disk != NULL, ("No sync disk (%s).", sc->sc_name));
	KASSERT(disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
	    ("Disk %s is not marked for synchronization.",
	    g_raid3_get_diskname(disk)));

	bp = g_new_bio();
	if (bp == NULL)
		return;
	bp->bio_parent = NULL;
	bp->bio_cmd = BIO_READ;
	/* ds_offset is a per-component offset; scale it to device space. */
	bp->bio_offset = disk->d_sync.ds_offset * (sc->sc_ndisks - 1);
	bp->bio_length = MIN(MAXPHYS, sc->sc_mediasize - bp->bio_offset);
	bp->bio_cflags = 0;
	bp->bio_done = g_raid3_sync_done;
	bp->bio_data = disk->d_sync.ds_data;
	if (bp->bio_data == NULL) {
		g_destroy_bio(bp);
		return;
	}
	bp->bio_cflags = G_RAID3_BIO_CFLAG_REGSYNC;
	disk->d_sync.ds_offset += bp->bio_length / (sc->sc_ndisks - 1);
	bp->bio_to = sc->sc_provider;
	G_RAID3_LOGREQ(3, bp, "Sending synchronization request.");
	disk->d_sync.ds_consumer->index++;
	g_io_request(bp, disk->d_sync.ds_consumer);
}
1368133808Spjd
/*
 * Handle a completed synchronization bio.  A sync request is a READ of
 * the array followed by a WRITE of the extracted component data to the
 * synchronizing disk; this function performs the READ->WRITE turn and
 * the post-WRITE progress accounting.
 */
static void
g_raid3_sync_request(struct bio *bp)
{
	struct g_raid3_softc *sc;
	struct g_raid3_disk *disk;

	/* One less in-flight request on this consumer. */
	bp->bio_from->index--;
	sc = bp->bio_from->geom->softc;
	disk = bp->bio_from->private;
	if (disk == NULL) {
		/* Disk is gone - release the consumer and the request. */
		g_topology_lock();
		g_raid3_kill_consumer(sc, bp->bio_from);
		g_topology_unlock();
		g_destroy_bio(bp);
		return;
	}

	/*
	 * Synchronization request.
	 */
	switch (bp->bio_cmd) {
	case BIO_READ:
	    {
		struct g_consumer *cp;
		u_char *dst, *src;
		off_t left;
		u_int atom;

		if (bp->bio_error != 0) {
			G_RAID3_LOGREQ(0, bp,
			    "Synchronization request failed (error=%d).",
			    bp->bio_error);
			g_destroy_bio(bp);
			return;
		}
		G_RAID3_LOGREQ(3, bp, "Synchronization request finished.");
		/* An "atom" is one component's share of a single sector. */
		atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
		dst = src = bp->bio_data;
		if (disk->d_no == sc->sc_ndisks - 1) {
			u_int n;

			/* Parity component: XOR all data atoms in place. */
			for (left = bp->bio_length; left > 0;
			    left -= sc->sc_sectorsize) {
				bcopy(src, dst, atom);
				src += atom;
				for (n = 1; n < sc->sc_ndisks - 1; n++) {
					g_raid3_xor(src, dst, dst, atom);
					src += atom;
				}
				dst += atom;
			}
		} else {
			/* Regular component: extract this disk's atoms. */
			src += atom * disk->d_no;
			for (left = bp->bio_length; left > 0;
			    left -= sc->sc_sectorsize) {
				bcopy(src, dst, atom);
				src += sc->sc_sectorsize;
				dst += atom;
			}
		}
		/* Rescale device offsets to component offsets and reissue
		 * the same bio as a WRITE to the synchronizing disk. */
		bp->bio_offset /= sc->sc_ndisks - 1;
		bp->bio_length /= sc->sc_ndisks - 1;
		bp->bio_cmd = BIO_WRITE;
		bp->bio_cflags = 0;
		bp->bio_children = bp->bio_inbed = 0;
		cp = disk->d_consumer;
		KASSERT(cp->acr == 1 && cp->acw == 1 && cp->ace == 1,
		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
		    cp->acr, cp->acw, cp->ace));
		cp->index++;
		g_io_request(bp, cp);
		return;
	    }
	case BIO_WRITE:
	    {
		struct g_raid3_disk_sync *sync;

		if (bp->bio_error != 0) {
			G_RAID3_LOGREQ(0, bp,
			    "Synchronization request failed (error=%d).",
			    bp->bio_error);
			g_destroy_bio(bp);
			sc->sc_bump_id |= G_RAID3_BUMP_GENID;
			g_raid3_event_send(disk,
			    G_RAID3_DISK_STATE_DISCONNECTED,
			    G_RAID3_EVENT_DONTWAIT);
			return;
		}
		G_RAID3_LOGREQ(3, bp, "Synchronization request finished.");
		sync = &disk->d_sync;
		sync->ds_offset_done = bp->bio_offset + bp->bio_length;
		g_destroy_bio(bp);
		/* A pending resync window defers progress handling. */
		if (sync->ds_resync != -1)
			return;
		if (sync->ds_offset_done ==
		    sc->sc_mediasize / (sc->sc_ndisks - 1)) {
			/*
			 * Disk up-to-date, activate it.
			 */
			g_raid3_event_send(disk, G_RAID3_DISK_STATE_ACTIVE,
			    G_RAID3_EVENT_DONTWAIT);
			return;
		} else if (sync->ds_offset_done % (MAXPHYS * 100) == 0) {
			/*
			 * Update offset_done on every 100 blocks.
			 * XXX: This should be configurable.
			 */
			g_topology_lock();
			g_raid3_update_metadata(disk);
			g_topology_unlock();
		}
		return;
	    }
	default:
		KASSERT(1 == 0, ("Invalid command here: %u (device=%s)",
		    bp->bio_cmd, sc->sc_name));
		break;
	}
}
1490133808Spjd
1491133808Spjdstatic int
1492133808Spjdg_raid3_register_request(struct bio *pbp)
1493133808Spjd{
1494133808Spjd	struct g_raid3_softc *sc;
1495133808Spjd	struct g_raid3_disk *disk;
1496133808Spjd	struct g_consumer *cp;
1497133808Spjd	struct bio *cbp;
1498133808Spjd	off_t offset, length;
1499133839Sobrien	u_int n, ndisks;
1500134168Spjd	int round_robin, verify;
1501133808Spjd
1502133839Sobrien	ndisks = 0;
1503133808Spjd	sc = pbp->bio_to->geom->softc;
1504133808Spjd	if ((pbp->bio_cflags & G_RAID3_BIO_CFLAG_REGSYNC) != 0 &&
1505133808Spjd	    sc->sc_syncdisk == NULL) {
1506133808Spjd		g_io_deliver(pbp, EIO);
1507133808Spjd		return (0);
1508133808Spjd	}
1509133808Spjd	g_raid3_init_bio(pbp);
1510133808Spjd	length = pbp->bio_length / (sc->sc_ndisks - 1);
1511133808Spjd	offset = pbp->bio_offset / (sc->sc_ndisks - 1);
1512134168Spjd	round_robin = verify = 0;
1513133808Spjd	switch (pbp->bio_cmd) {
1514133808Spjd	case BIO_READ:
1515134168Spjd		if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_VERIFY) != 0 &&
1516134168Spjd		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
1517134168Spjd			pbp->bio_pflags |= G_RAID3_BIO_PFLAG_VERIFY;
1518134168Spjd			verify = 1;
1519134168Spjd			ndisks = sc->sc_ndisks;
1520134168Spjd		} else {
1521134168Spjd			verify = 0;
1522134168Spjd			ndisks = sc->sc_ndisks - 1;
1523134168Spjd		}
1524134168Spjd		if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0 &&
1525134168Spjd		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
1526134168Spjd			round_robin = 1;
1527134168Spjd		} else {
1528134168Spjd			round_robin = 0;
1529134168Spjd		}
1530134168Spjd		KASSERT(!round_robin || !verify,
1531134168Spjd		    ("ROUND-ROBIN and VERIFY are mutually exclusive."));
1532134124Spjd		pbp->bio_driver2 = &sc->sc_disks[sc->sc_ndisks - 1];
1533133808Spjd		break;
1534133808Spjd	case BIO_WRITE:
1535133808Spjd	case BIO_DELETE:
1536135863Spjd	    {
1537135863Spjd		struct g_raid3_disk_sync *sync;
1538135863Spjd
1539137258Spjd		if (sc->sc_idle)
1540137258Spjd			g_raid3_unidle(sc);
1541137258Spjd
1542133808Spjd		ndisks = sc->sc_ndisks;
1543135863Spjd
1544135863Spjd		if (sc->sc_syncdisk == NULL)
1545135863Spjd			break;
1546135863Spjd		sync = &sc->sc_syncdisk->d_sync;
1547135863Spjd		if (offset >= sync->ds_offset)
1548135863Spjd			break;
1549135863Spjd		if (offset + length <= sync->ds_offset_done)
1550135863Spjd			break;
1551135863Spjd		if (offset >= sync->ds_resync && sync->ds_resync != -1)
1552135863Spjd			break;
1553135872Spjd		sync->ds_resync = offset - (offset % MAXPHYS);
1554133808Spjd		break;
1555135863Spjd	    }
1556133808Spjd	}
1557133808Spjd	for (n = 0; n < ndisks; n++) {
1558133808Spjd		disk = &sc->sc_disks[n];
1559133808Spjd		cbp = g_raid3_clone_bio(sc, pbp);
1560133808Spjd		if (cbp == NULL) {
1561133808Spjd			while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL)
1562133808Spjd				g_raid3_destroy_bio(sc, cbp);
1563133808Spjd			return (ENOMEM);
1564133808Spjd		}
1565133808Spjd		cbp->bio_offset = offset;
1566133808Spjd		cbp->bio_length = length;
1567133808Spjd		cbp->bio_done = g_raid3_done;
1568133808Spjd		switch (pbp->bio_cmd) {
1569133808Spjd		case BIO_READ:
1570133808Spjd			if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) {
1571133808Spjd				/*
1572133808Spjd				 * Replace invalid component with the parity
1573133808Spjd				 * component.
1574133808Spjd				 */
1575133808Spjd				disk = &sc->sc_disks[sc->sc_ndisks - 1];
1576133808Spjd				cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
1577133808Spjd				pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED;
1578134124Spjd			} else if (round_robin &&
1579134124Spjd			    disk->d_no == sc->sc_round_robin) {
1580134124Spjd				/*
1581134124Spjd				 * In round-robin mode skip one data component
1582134124Spjd				 * and use parity component when reading.
1583134124Spjd				 */
1584134124Spjd				pbp->bio_driver2 = disk;
1585134124Spjd				disk = &sc->sc_disks[sc->sc_ndisks - 1];
1586134124Spjd				cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
1587134124Spjd				sc->sc_round_robin++;
1588134124Spjd				round_robin = 0;
1589134168Spjd			} else if (verify && disk->d_no == sc->sc_ndisks - 1) {
1590134168Spjd				cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
1591133808Spjd			}
1592133808Spjd			break;
1593133808Spjd		case BIO_WRITE:
1594133808Spjd		case BIO_DELETE:
1595133808Spjd			if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
1596133808Spjd			    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
1597133808Spjd				if (n == ndisks - 1) {
1598133808Spjd					/*
1599133808Spjd					 * Active parity component, mark it as such.
1600133808Spjd					 */
1601133808Spjd					cbp->bio_cflags |=
1602133808Spjd					    G_RAID3_BIO_CFLAG_PARITY;
1603133808Spjd				}
1604133808Spjd			} else {
1605133808Spjd				pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED;
1606133808Spjd				if (n == ndisks - 1) {
1607133808Spjd					/*
1608133808Spjd					 * Parity component is not connected,
1609133808Spjd					 * so destroy its request.
1610133808Spjd					 */
1611133808Spjd					pbp->bio_pflags |=
1612133808Spjd					    G_RAID3_BIO_PFLAG_NOPARITY;
1613133808Spjd					g_raid3_destroy_bio(sc, cbp);
1614133808Spjd					cbp = NULL;
1615133808Spjd				} else {
1616133808Spjd					cbp->bio_cflags |=
1617133808Spjd					    G_RAID3_BIO_CFLAG_NODISK;
1618133808Spjd					disk = NULL;
1619133808Spjd				}
1620133808Spjd			}
1621133808Spjd			break;
1622133808Spjd		}
1623133808Spjd		if (cbp != NULL)
1624133808Spjd			cbp->bio_caller2 = disk;
1625133808Spjd	}
1626133808Spjd	switch (pbp->bio_cmd) {
1627133808Spjd	case BIO_READ:
1628134124Spjd		if (round_robin) {
1629134124Spjd			/*
1630134124Spjd			 * If we are in round-robin mode and 'round_robin' is
1631134124Spjd			 * still 1, it means, that we skipped parity component
1632134124Spjd			 * for this read and must reset sc_round_robin field.
1633134124Spjd			 */
1634134124Spjd			sc->sc_round_robin = 0;
1635134124Spjd		}
1636133808Spjd		G_RAID3_FOREACH_BIO(pbp, cbp) {
1637133808Spjd			disk = cbp->bio_caller2;
1638133808Spjd			cp = disk->d_consumer;
1639133808Spjd			cbp->bio_to = cp->provider;
1640133808Spjd			G_RAID3_LOGREQ(3, cbp, "Sending request.");
1641139144Spjd			KASSERT(cp->acr == 1 && cp->acw == 1 && cp->ace == 1,
1642133808Spjd			    ("Consumer %s not opened (r%dw%de%d).",
1643133808Spjd			    cp->provider->name, cp->acr, cp->acw, cp->ace));
1644137256Spjd			cp->index++;
1645133808Spjd			g_io_request(cbp, cp);
1646133808Spjd		}
1647133808Spjd		break;
1648133808Spjd	case BIO_WRITE:
1649133808Spjd	case BIO_DELETE:
1650133808Spjd		/*
1651133808Spjd		 * Bump syncid on first write.
1652133808Spjd		 */
1653139671Spjd		if ((sc->sc_bump_id & G_RAID3_BUMP_SYNCID) != 0) {
1654139295Spjd			sc->sc_bump_id &= ~G_RAID3_BUMP_SYNCID;
1655133808Spjd			g_topology_lock();
1656139144Spjd			g_raid3_bump_syncid(sc);
1657133808Spjd			g_topology_unlock();
1658133808Spjd		}
1659133808Spjd		g_raid3_scatter(pbp);
1660133808Spjd		break;
1661133808Spjd	}
1662133808Spjd	return (0);
1663133808Spjd}
1664133808Spjd
1665133808Spjdstatic int
1666133808Spjdg_raid3_can_destroy(struct g_raid3_softc *sc)
1667133808Spjd{
1668133808Spjd	struct g_geom *gp;
1669133808Spjd	struct g_consumer *cp;
1670133808Spjd
1671133808Spjd	g_topology_assert();
1672133808Spjd	gp = sc->sc_geom;
1673133808Spjd	LIST_FOREACH(cp, &gp->consumer, consumer) {
1674133808Spjd		if (g_raid3_is_busy(sc, cp))
1675133808Spjd			return (0);
1676133808Spjd	}
1677133808Spjd	gp = sc->sc_sync.ds_geom;
1678133808Spjd	LIST_FOREACH(cp, &gp->consumer, consumer) {
1679133808Spjd		if (g_raid3_is_busy(sc, cp))
1680133808Spjd			return (0);
1681133808Spjd	}
1682133808Spjd	G_RAID3_DEBUG(2, "No I/O requests for %s, it can be destroyed.",
1683133808Spjd	    sc->sc_name);
1684133808Spjd	return (1);
1685133808Spjd}
1686133808Spjd
/*
 * Attempt to destroy the device once it is marked for destruction.
 * Returns 0 when consumers are still busy (caller should retry later);
 * returns 1 when destruction was carried out — in the non-WAIT branch
 * the softc is freed and must not be touched by the caller afterwards.
 */
static int
g_raid3_try_destroy(struct g_raid3_softc *sc)
{

	g_topology_lock();
	if (!g_raid3_can_destroy(sc)) {
		g_topology_unlock();
		return (0);
	}
	if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_WAIT) != 0) {
		g_topology_unlock();
		G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__,
		    &sc->sc_worker);
		/*
		 * NOTE(review): with WAIT set, another thread apparently
		 * sleeps on &sc->sc_worker and owns the final teardown —
		 * confirm against the destroy path elsewhere in this file.
		 */
		wakeup(&sc->sc_worker);
		sc->sc_worker = NULL;
	} else {
		/* No waiter: tear everything down and free the softc here. */
		g_raid3_destroy_device(sc);
		g_topology_unlock();
		free(sc->sc_disks, M_RAID3);
		free(sc, M_RAID3);
	}
	return (1);
}
1710133808Spjd
1711133808Spjd/*
1712133808Spjd * Worker thread.
1713133808Spjd */
/*
 * Per-device worker thread: drains the event queue (which needs the GEOM
 * topology lock), dispatches regular and synchronization bios from
 * sc_queue, paces the rebuild via g_raid3_syncs_per_sec, and marks
 * components clean after an idle period.
 */
static void
g_raid3_worker(void *arg)
{
	struct g_raid3_softc *sc;
	struct g_raid3_disk *disk;
	struct g_raid3_disk_sync *sync;
	struct g_raid3_event *ep;
	struct bio *bp;
	u_int nreqs;

	sc = arg;
	/* Run the worker at I/O priority. */
	mtx_lock_spin(&sched_lock);
	sched_prio(curthread, PRIBIO);
	mtx_unlock_spin(&sched_lock);

	nreqs = 0;
	for (;;) {
		G_RAID3_DEBUG(5, "%s: Let's see...", __func__);
		/*
		 * First take a look at events.
		 * This is important to handle events before any I/O requests.
		 *
		 * Events require the topology lock; if the try-lock fails the
		 * event stays queued and is retried after a short sleep below.
		 */
		ep = g_raid3_event_get(sc);
		if (ep != NULL && g_topology_try_lock()) {
			g_raid3_event_remove(sc, ep);
			if ((ep->e_flags & G_RAID3_EVENT_DEVICE) != 0) {
				/* Update only device status. */
				G_RAID3_DEBUG(3,
				    "Running event for device %s.",
				    sc->sc_name);
				ep->e_error = 0;
				g_raid3_update_device(sc, 1);
			} else {
				/* Update disk status. */
				G_RAID3_DEBUG(3, "Running event for disk %s.",
				     g_raid3_get_diskname(ep->e_disk));
				ep->e_error = g_raid3_update_disk(ep->e_disk,
				    ep->e_state);
				if (ep->e_error == 0)
					g_raid3_update_device(sc, 0);
			}
			g_topology_unlock();
			if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0) {
				KASSERT(ep->e_error == 0,
				    ("Error cannot be handled."));
				g_raid3_event_free(ep);
			} else {
				/* Someone sleeps on this event; wake them. */
				ep->e_flags |= G_RAID3_EVENT_DONE;
				G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__,
				    ep);
				mtx_lock(&sc->sc_events_mtx);
				wakeup(ep);
				mtx_unlock(&sc->sc_events_mtx);
			}
			if ((sc->sc_flags &
			    G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
				if (g_raid3_try_destroy(sc))
					kthread_exit(0);
			}
			G_RAID3_DEBUG(5, "%s: I'm here 1.", __func__);
			continue;
		}
		/*
		 * Now I/O requests.
		 */
		/* Get first request from the queue. */
		mtx_lock(&sc->sc_queue_mtx);
		bp = bioq_first(&sc->sc_queue);
		if (bp == NULL) {
			if (ep != NULL) {
				/*
				 * No I/O requests and topology lock was
				 * already held? Try again.
				 */
				mtx_unlock(&sc->sc_queue_mtx);
				tsleep(ep, PRIBIO, "r3:top1", hz / 5);
				continue;
			}
			if ((sc->sc_flags &
			    G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
				mtx_unlock(&sc->sc_queue_mtx);
				if (g_raid3_try_destroy(sc))
					kthread_exit(0);
				mtx_lock(&sc->sc_queue_mtx);
			}
		}
		/*
		 * Interleave synchronization: run it when the queue is empty
		 * or after every g_raid3_reqs_per_sync regular requests.
		 */
		if (sc->sc_syncdisk != NULL &&
		    (bp == NULL || nreqs > g_raid3_reqs_per_sync)) {
			mtx_unlock(&sc->sc_queue_mtx);
			/*
			 * It is time for synchronization...
			 */
			nreqs = 0;
			disk = sc->sc_syncdisk;
			sync = &disk->d_sync;
			if (sync->ds_offset <
			    sc->sc_mediasize / (sc->sc_ndisks - 1) &&
			    sync->ds_offset == sync->ds_offset_done) {
				/* Honor a requested restart point first. */
				if (sync->ds_resync != -1) {
					sync->ds_offset = sync->ds_resync;
					sync->ds_offset_done = sync->ds_resync;
					sync->ds_resync = -1;
				}
				g_raid3_sync_one(sc);
			}
			G_RAID3_DEBUG(5, "%s: I'm here 2.", __func__);
			goto sleep;
		}
		if (bp == NULL) {
			if (g_raid3_check_idle(sc)) {
				u_int idletime;

				idletime = g_raid3_idletime;
				if (idletime == 0)
					idletime = 1;
				idletime *= hz;
				if (msleep(sc, &sc->sc_queue_mtx, PRIBIO | PDROP,
				    "r3:w1", idletime) == EWOULDBLOCK) {
					G_RAID3_DEBUG(5, "%s: I'm here 3.",
					    __func__);
					/*
					 * No I/O requests in 'idletime'
					 * seconds, so mark components as clean.
					 */
					g_raid3_idle(sc);
				}
				G_RAID3_DEBUG(5, "%s: I'm here 4.", __func__);
			} else {
				/* Already idle; sleep until work arrives. */
				MSLEEP(sc, &sc->sc_queue_mtx, PRIBIO | PDROP,
				    "r3:w2", 0);
				G_RAID3_DEBUG(5, "%s: I'm here 5.", __func__);
			}
			continue;
		}
		nreqs++;
		bioq_remove(&sc->sc_queue, bp);
		mtx_unlock(&sc->sc_queue_mtx);

		if ((bp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR) != 0) {
			g_raid3_regular_request(bp);
		} else if ((bp->bio_cflags & G_RAID3_BIO_CFLAG_SYNC) != 0) {
			u_int timeout, sps;

			g_raid3_sync_request(bp);
sleep:
			/* Throttle rebuild to g_raid3_syncs_per_sec. */
			sps = atomic_load_acq_int(&g_raid3_syncs_per_sec);
			if (sps == 0) {
				G_RAID3_DEBUG(5, "%s: I'm here 6.", __func__);
				continue;
			}
			if (ep != NULL) {
				/*
				 * We have some pending events, don't sleep now.
				 */
				G_RAID3_DEBUG(5, "%s: I'm here 7.", __func__);
				tsleep(ep, PRIBIO, "r3:top2", hz / 5);
				continue;
			}
			mtx_lock(&sc->sc_queue_mtx);
			if (bioq_first(&sc->sc_queue) != NULL) {
				mtx_unlock(&sc->sc_queue_mtx);
				G_RAID3_DEBUG(5, "%s: I'm here 8.", __func__);
				continue;
			}
			timeout = hz / sps;
			if (timeout == 0)
				timeout = 1;
			MSLEEP(sc, &sc->sc_queue_mtx, PRIBIO | PDROP, "r3:w2",
			    timeout);
		} else {
			/*
			 * New request from above; on ENOMEM requeue it and
			 * back off briefly to let memory pressure ease.
			 */
			if (g_raid3_register_request(bp) != 0) {
				mtx_lock(&sc->sc_queue_mtx);
				bioq_insert_tail(&sc->sc_queue, bp);
				MSLEEP(&sc->sc_queue, &sc->sc_queue_mtx,
				    PRIBIO | PDROP, "r3:lowmem", hz / 10);
			}
		}
		G_RAID3_DEBUG(5, "%s: I'm here 9.", __func__);
	}
}
1894133808Spjd
/*
 * Mark the disk dirty or clean according to the provider's current
 * write access count.
 */
1898133808Spjdstatic void
1899133808Spjdg_raid3_update_access(struct g_raid3_disk *disk)
1900133808Spjd{
1901133808Spjd	struct g_provider *pp;
1902133808Spjd
1903133808Spjd	g_topology_assert();
1904133808Spjd
1905133808Spjd	pp = disk->d_softc->sc_provider;
1906139144Spjd	if (pp == NULL)
1907133808Spjd		return;
1908139144Spjd	if (pp->acw > 0) {
1909139144Spjd		if ((disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) == 0) {
1910139144Spjd			G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.",
1911139144Spjd			    g_raid3_get_diskname(disk), disk->d_softc->sc_name);
1912139144Spjd			disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
1913139144Spjd		}
1914139144Spjd	} else if (pp->acw == 0) {
1915139144Spjd		if ((disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) != 0) {
1916139144Spjd			G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.",
1917139144Spjd			    g_raid3_get_diskname(disk), disk->d_softc->sc_name);
1918139144Spjd			disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
1919139144Spjd		}
1920133808Spjd	}
1921133808Spjd}
1922133808Spjd
/*
 * Begin rebuilding: find the component in SYNCHRONIZING state, create a
 * consumer on the synchronization geom, attach it read-only to the
 * device's own provider and allocate the transfer buffer.  The device
 * must be DEGRADED and have no synchronization already in progress.
 */
static void
g_raid3_sync_start(struct g_raid3_softc *sc)
{
	struct g_raid3_disk *disk;
	int error;
	u_int n;

	g_topology_assert();

	KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED,
	    ("Device not in DEGRADED state (%s, %u).", sc->sc_name,
	    sc->sc_state));
	KASSERT(sc->sc_syncdisk == NULL, ("Syncdisk is not NULL (%s, %u).",
	    sc->sc_name, sc->sc_state));
	/* Locate the first (and only expected) SYNCHRONIZING component. */
	disk = NULL;
	for (n = 0; n < sc->sc_ndisks; n++) {
		if (sc->sc_disks[n].d_state != G_RAID3_DISK_STATE_SYNCHRONIZING)
			continue;
		disk = &sc->sc_disks[n];
		break;
	}
	if (disk == NULL)
		return;

	G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s.", sc->sc_name,
	    g_raid3_get_diskname(disk));
	disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
	KASSERT(disk->d_sync.ds_consumer == NULL,
	    ("Sync consumer already exists (device=%s, disk=%s).",
	    sc->sc_name, g_raid3_get_diskname(disk)));
	disk->d_sync.ds_consumer = g_new_consumer(sc->sc_sync.ds_geom);
	disk->d_sync.ds_consumer->private = disk;
	disk->d_sync.ds_consumer->index = 0;
	/* Read rebuild data from the device's own (degraded) provider. */
	error = g_attach(disk->d_sync.ds_consumer, disk->d_softc->sc_provider);
	KASSERT(error == 0, ("Cannot attach to %s (error=%d).",
	    disk->d_softc->sc_name, error));
	error = g_access(disk->d_sync.ds_consumer, 1, 0, 0);
	KASSERT(error == 0, ("Cannot open %s (error=%d).",
	    disk->d_softc->sc_name, error));
	/* One MAXPHYS-sized bounce buffer serves the whole rebuild. */
	disk->d_sync.ds_data = malloc(MAXPHYS, M_RAID3, M_WAITOK);
	sc->sc_syncdisk = disk;
}
1965133808Spjd
1966133808Spjd/*
1967133808Spjd * Stop synchronization process.
1968133808Spjd * type: 0 - synchronization finished
1969133808Spjd *       1 - synchronization stopped
1970133808Spjd */
static void
g_raid3_sync_stop(struct g_raid3_softc *sc, int type)
{
	struct g_raid3_disk *disk;

	g_topology_assert();
	KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED,
	    ("Device not in DEGRADED state (%s, %u).", sc->sc_name,
	    sc->sc_state));
	/* Detach the sync disk from the softc before tearing it down. */
	disk = sc->sc_syncdisk;
	sc->sc_syncdisk = NULL;
	KASSERT(disk != NULL, ("No disk was synchronized (%s).", sc->sc_name));
	KASSERT(disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
	    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
	    g_raid3_disk_state2str(disk->d_state)));
	/* Nothing to release if the rebuild never got its consumer. */
	if (disk->d_sync.ds_consumer == NULL)
		return;

	if (type == 0) {
		G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s finished.",
		    disk->d_softc->sc_name, g_raid3_get_diskname(disk));
	} else /* if (type == 1) */ {
		G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s stopped.",
		    disk->d_softc->sc_name, g_raid3_get_diskname(disk));
	}
	/* Release the sync consumer and the rebuild bounce buffer. */
	g_raid3_kill_consumer(disk->d_softc, disk->d_sync.ds_consumer);
	free(disk->d_sync.ds_data, M_RAID3);
	disk->d_sync.ds_consumer = NULL;
	disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
}
2001133808Spjd
2002133808Spjdstatic void
2003133808Spjdg_raid3_launch_provider(struct g_raid3_softc *sc)
2004133808Spjd{
2005133808Spjd	struct g_provider *pp;
2006133808Spjd
2007133808Spjd	g_topology_assert();
2008133808Spjd
2009133808Spjd	pp = g_new_providerf(sc->sc_geom, "raid3/%s", sc->sc_name);
2010133808Spjd	pp->mediasize = sc->sc_mediasize;
2011133808Spjd	pp->sectorsize = sc->sc_sectorsize;
2012133808Spjd	sc->sc_provider = pp;
2013133808Spjd	g_error_provider(pp, 0);
2014133808Spjd	G_RAID3_DEBUG(0, "Device %s: provider %s launched.", sc->sc_name,
2015133808Spjd	    pp->name);
2016133808Spjd	if (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED)
2017133808Spjd		g_raid3_sync_start(sc);
2018133808Spjd}
2019133808Spjd
/*
 * Withdraw the device's provider: fail new and queued I/O with ENXIO,
 * wither and orphan the provider, and stop any rebuild in progress.
 */
static void
g_raid3_destroy_provider(struct g_raid3_softc *sc)
{
	struct bio *bp;

	g_topology_assert();
	KASSERT(sc->sc_provider != NULL, ("NULL provider (device=%s).",
	    sc->sc_name));

	g_error_provider(sc->sc_provider, ENXIO);
	/* Fail everything still sitting in the worker's queue. */
	mtx_lock(&sc->sc_queue_mtx);
	while ((bp = bioq_first(&sc->sc_queue)) != NULL) {
		bioq_remove(&sc->sc_queue, bp);
		g_io_deliver(bp, ENXIO);
	}
	mtx_unlock(&sc->sc_queue_mtx);
	G_RAID3_DEBUG(0, "Device %s: provider %s destroyed.", sc->sc_name,
	    sc->sc_provider->name);
	sc->sc_provider->flags |= G_PF_WITHER;
	g_orphan_provider(sc->sc_provider, ENXIO);
	sc->sc_provider = NULL;
	/* The rebuild reads from this provider; it cannot continue. */
	if (sc->sc_syncdisk != NULL)
		g_raid3_sync_stop(sc, 1);
}
2044133808Spjd
2045133808Spjdstatic void
2046133808Spjdg_raid3_go(void *arg)
2047133808Spjd{
2048133808Spjd	struct g_raid3_softc *sc;
2049133808Spjd
2050133808Spjd	sc = arg;
2051133808Spjd	G_RAID3_DEBUG(0, "Force device %s start due to timeout.", sc->sc_name);
2052133808Spjd	g_raid3_event_send(sc, 0,
2053133808Spjd	    G_RAID3_EVENT_DONTWAIT | G_RAID3_EVENT_DEVICE);
2054133808Spjd}
2055133808Spjd
/*
 * Decide the target state for a newly arrived disk by comparing its
 * syncid against the device's.  Returns ACTIVE, SYNCHRONIZING, STALE,
 * or NONE — in the NONE case the disk has already been destroyed.
 */
static u_int
g_raid3_determine_state(struct g_raid3_disk *disk)
{
	struct g_raid3_softc *sc;
	u_int state;

	sc = disk->d_softc;
	if (sc->sc_syncid == disk->d_sync.ds_syncid) {
		if ((disk->d_flags &
		    G_RAID3_DISK_FLAG_SYNCHRONIZING) == 0) {
			/* Disk does not need synchronization. */
			state = G_RAID3_DISK_STATE_ACTIVE;
		} else {
			if ((sc->sc_flags &
			     G_RAID3_DEVICE_FLAG_NOAUTOSYNC) == 0  ||
			    (disk->d_flags &
			     G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) {
				/*
				 * We can start synchronization from
				 * the stored offset.
				 */
				state = G_RAID3_DISK_STATE_SYNCHRONIZING;
			} else {
				state = G_RAID3_DISK_STATE_STALE;
			}
		}
	} else if (disk->d_sync.ds_syncid < sc->sc_syncid) {
		/*
		 * Reset all synchronization data for this disk,
		 * because if it even was synchronized, it was
		 * synchronized to disks with a different syncid.
		 */
		disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING;
		disk->d_sync.ds_offset = 0;
		disk->d_sync.ds_offset_done = 0;
		disk->d_sync.ds_syncid = sc->sc_syncid;
		if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) == 0 ||
		    (disk->d_flags & G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) {
			state = G_RAID3_DISK_STATE_SYNCHRONIZING;
		} else {
			state = G_RAID3_DISK_STATE_STALE;
		}
	} else /* if (sc->sc_syncid < disk->d_sync.ds_syncid) */ {
		/*
		 * Not good, NOT GOOD!
		 * It means that the device was started on stale disks
		 * and a fresher disk has just arrived.
		 * If there were writes, the device is now inconsistent.
		 * The best choice here is not to touch this disk and
		 * to inform the user loudly.
		 */
		G_RAID3_DEBUG(0, "Device %s was started before the freshest "
		    "disk (%s) arrives!! It will not be connected to the "
		    "running device.", sc->sc_name,
		    g_raid3_get_diskname(disk));
		g_raid3_destroy_disk(disk);
		state = G_RAID3_DISK_STATE_NONE;
		/* Return immediately, because disk was destroyed. */
		return (state);
	}
	G_RAID3_DEBUG(3, "State for %s disk: %s.",
	    g_raid3_get_diskname(disk), g_raid3_disk_state2str(state));
	return (state);
}
2120133808Spjd
2121133808Spjd/*
2122133808Spjd * Update device state.
2123133808Spjd */
/*
 * Drive the device state machine.  In STARTING, elect the component set
 * (biggest genid/syncid), decide COMPLETE vs. DEGRADED and dispatch a
 * state event per disk.  In DEGRADED/COMPLETE, bump genid when flagged,
 * transition between the two states as disks come and go, and (re)launch
 * the provider.  'force' is non-zero when called from a device event
 * (e.g. the startup timeout), allowing a start with one disk missing.
 * Must be called with the topology lock held.
 */
static void
g_raid3_update_device(struct g_raid3_softc *sc, boolean_t force)
{
	struct g_raid3_disk *disk;
	u_int state;

	g_topology_assert();

	switch (sc->sc_state) {
	case G_RAID3_DEVICE_STATE_STARTING:
	    {
		u_int n, ndirty, ndisks, genid, syncid;

		KASSERT(sc->sc_provider == NULL,
		    ("Non-NULL provider in STARTING state (%s).", sc->sc_name));
		/*
		 * Are we ready? We are, if all disks are connected or
		 * one disk is missing and 'force' is true.
		 */
		if (g_raid3_ndisks(sc, -1) + force == sc->sc_ndisks) {
			if (!force)
				callout_drain(&sc->sc_callout);
		} else {
			if (force) {
				/*
				 * Timeout expired, so destroy device.
				 */
				sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
			}
			return;
		}

		/*
		 * Find the biggest genid.
		 */
		genid = 0;
		for (n = 0; n < sc->sc_ndisks; n++) {
			disk = &sc->sc_disks[n];
			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
				continue;
			if (disk->d_genid > genid)
				genid = disk->d_genid;
		}
		sc->sc_genid = genid;
		/*
		 * Remove all disks without the biggest genid.
		 */
		for (n = 0; n < sc->sc_ndisks; n++) {
			disk = &sc->sc_disks[n];
			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
				continue;
			if (disk->d_genid < genid) {
				G_RAID3_DEBUG(0,
				    "Component %s (device %s) broken, skipping.",
				    g_raid3_get_diskname(disk), sc->sc_name);
				g_raid3_destroy_disk(disk);
			}
		}

		/*
		 * There must be at least 'sc->sc_ndisks - 1' components
		 * with the same syncid and without SYNCHRONIZING flag.
		 */

		/*
		 * Find the biggest syncid, number of valid components and
		 * number of dirty components.
		 */
		ndirty = ndisks = syncid = 0;
		for (n = 0; n < sc->sc_ndisks; n++) {
			disk = &sc->sc_disks[n];
			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
				continue;
			if ((disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) != 0)
				ndirty++;
			if (disk->d_sync.ds_syncid > syncid) {
				/* Higher syncid found: restart the count. */
				syncid = disk->d_sync.ds_syncid;
				ndisks = 0;
			} else if (disk->d_sync.ds_syncid < syncid) {
				continue;
			}
			if ((disk->d_flags &
			    G_RAID3_DISK_FLAG_SYNCHRONIZING) != 0) {
				continue;
			}
			ndisks++;
		}
		/*
		 * Do we have enough valid components?
		 */
		if (ndisks + 1 < sc->sc_ndisks) {
			G_RAID3_DEBUG(0,
			    "Device %s is broken, too few valid components.",
			    sc->sc_name);
			sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
			return;
		}
		/*
		 * If there is one DIRTY component and all disks are present,
		 * mark it for synchronization. If there is more than one DIRTY
		 * component, mark parity component for synchronization.
		 */
		if (ndisks == sc->sc_ndisks && ndirty == 1) {
			for (n = 0; n < sc->sc_ndisks; n++) {
				disk = &sc->sc_disks[n];
				if ((disk->d_flags &
				    G_RAID3_DISK_FLAG_DIRTY) == 0) {
					continue;
				}
				disk->d_flags |=
				    G_RAID3_DISK_FLAG_SYNCHRONIZING;
			}
		} else if (ndisks == sc->sc_ndisks && ndirty > 1) {
			/* Parity lives on the last component. */
			disk = &sc->sc_disks[sc->sc_ndisks - 1];
			disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING;
		}

		sc->sc_syncid = syncid;
		if (force) {
			/* Remember to bump syncid on first write. */
			sc->sc_bump_id |= G_RAID3_BUMP_SYNCID;
		}
		if (ndisks == sc->sc_ndisks)
			state = G_RAID3_DEVICE_STATE_COMPLETE;
		else /* if (ndisks == sc->sc_ndisks - 1) */
			state = G_RAID3_DEVICE_STATE_DEGRADED;
		G_RAID3_DEBUG(1, "Device %s state changed from %s to %s.",
		    sc->sc_name, g_raid3_device_state2str(sc->sc_state),
		    g_raid3_device_state2str(state));
		sc->sc_state = state;
		/* Hand every remaining disk its per-disk state event. */
		for (n = 0; n < sc->sc_ndisks; n++) {
			disk = &sc->sc_disks[n];
			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
				continue;
			state = g_raid3_determine_state(disk);
			g_raid3_event_send(disk, state, G_RAID3_EVENT_DONTWAIT);
			if (state == G_RAID3_DISK_STATE_STALE)
				sc->sc_bump_id |= G_RAID3_BUMP_SYNCID;
		}
		break;
	    }
	case G_RAID3_DEVICE_STATE_DEGRADED:
		/*
		 * Genid needs to be bumped immediately, so do it here.
		 */
		if ((sc->sc_bump_id & G_RAID3_BUMP_GENID) != 0) {
			sc->sc_bump_id &= ~G_RAID3_BUMP_GENID;
			g_raid3_bump_genid(sc);
		}

		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NEW) > 0)
			return;
		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) <
		    sc->sc_ndisks - 1) {
			/* Too few active components: give the device up. */
			if (sc->sc_provider != NULL)
				g_raid3_destroy_provider(sc);
			sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
			return;
		}
		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) ==
		    sc->sc_ndisks) {
			state = G_RAID3_DEVICE_STATE_COMPLETE;
			G_RAID3_DEBUG(1,
			    "Device %s state changed from %s to %s.",
			    sc->sc_name, g_raid3_device_state2str(sc->sc_state),
			    g_raid3_device_state2str(state));
			sc->sc_state = state;
		}
		if (sc->sc_provider == NULL)
			g_raid3_launch_provider(sc);
		break;
	case G_RAID3_DEVICE_STATE_COMPLETE:
		/*
		 * Genid needs to be bumped immediately, so do it here.
		 */
		if ((sc->sc_bump_id & G_RAID3_BUMP_GENID) != 0) {
			sc->sc_bump_id &= ~G_RAID3_BUMP_GENID;
			g_raid3_bump_genid(sc);
		}

		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NEW) > 0)
			return;
		KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) >=
		    sc->sc_ndisks - 1,
		    ("Too few ACTIVE components in COMPLETE state (device %s).",
		    sc->sc_name));
		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) ==
		    sc->sc_ndisks - 1) {
			state = G_RAID3_DEVICE_STATE_DEGRADED;
			G_RAID3_DEBUG(1,
			    "Device %s state changed from %s to %s.",
			    sc->sc_name, g_raid3_device_state2str(sc->sc_state),
			    g_raid3_device_state2str(state));
			sc->sc_state = state;
		}
		if (sc->sc_provider == NULL)
			g_raid3_launch_provider(sc);
		break;
	default:
		KASSERT(1 == 0, ("Wrong device state (%s, %s).", sc->sc_name,
		    g_raid3_device_state2str(sc->sc_state)));
		break;
	}
}
2328133808Spjd
2329133808Spjd/*
2330133808Spjd * Update disk state and device state if needed.
2331133808Spjd */
/*
 * Log a disk state transition at debug level 1.  Expects 'disk', 'state'
 * and 'sc' to be in scope at the expansion site.
 */
#define	DISK_STATE_CHANGED()	G_RAID3_DEBUG(1,			\
	"Disk %s state changed from %s to %s (device %s).",		\
	g_raid3_get_diskname(disk),					\
	g_raid3_disk_state2str(disk->d_state),				\
	g_raid3_disk_state2str(state), sc->sc_name)
2337133808Spjdstatic int
2338139144Spjdg_raid3_update_disk(struct g_raid3_disk *disk, u_int state)
2339133808Spjd{
2340133808Spjd	struct g_raid3_softc *sc;
2341133808Spjd
2342133808Spjd	g_topology_assert();
2343133808Spjd
2344133808Spjd	sc = disk->d_softc;
2345133808Spjdagain:
2346133808Spjd	G_RAID3_DEBUG(3, "Changing disk %s state from %s to %s.",
2347133808Spjd	    g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state),
2348133808Spjd	    g_raid3_disk_state2str(state));
2349133808Spjd	switch (state) {
2350133808Spjd	case G_RAID3_DISK_STATE_NEW:
2351133808Spjd		/*
2352133808Spjd		 * Possible scenarios:
2353133808Spjd		 * 1. New disk arrive.
2354133808Spjd		 */
2355133808Spjd		/* Previous state should be NONE. */
2356133808Spjd		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NONE,
2357133808Spjd		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
2358133808Spjd		    g_raid3_disk_state2str(disk->d_state)));
2359133808Spjd		DISK_STATE_CHANGED();
2360133808Spjd
2361133808Spjd		disk->d_state = state;
2362133808Spjd		G_RAID3_DEBUG(0, "Device %s: provider %s detected.",
2363133808Spjd		    sc->sc_name, g_raid3_get_diskname(disk));
2364133808Spjd		if (sc->sc_state == G_RAID3_DEVICE_STATE_STARTING)
2365133808Spjd			break;
2366133808Spjd		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
2367133808Spjd		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
2368133808Spjd		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
2369133808Spjd		    g_raid3_device_state2str(sc->sc_state),
2370133808Spjd		    g_raid3_get_diskname(disk),
2371133808Spjd		    g_raid3_disk_state2str(disk->d_state)));
2372133808Spjd		state = g_raid3_determine_state(disk);
2373133808Spjd		if (state != G_RAID3_DISK_STATE_NONE)
2374133808Spjd			goto again;
2375133808Spjd		break;
2376133808Spjd	case G_RAID3_DISK_STATE_ACTIVE:
2377133808Spjd		/*
2378133808Spjd		 * Possible scenarios:
2379133808Spjd		 * 1. New disk does not need synchronization.
2380133808Spjd		 * 2. Synchronization process finished successfully.
2381133808Spjd		 */
2382133808Spjd		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
2383133808Spjd		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
2384133808Spjd		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
2385133808Spjd		    g_raid3_device_state2str(sc->sc_state),
2386133808Spjd		    g_raid3_get_diskname(disk),
2387133808Spjd		    g_raid3_disk_state2str(disk->d_state)));
2388133808Spjd		/* Previous state should be NEW or SYNCHRONIZING. */
2389133808Spjd		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW ||
2390133808Spjd		    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
2391133808Spjd		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
2392133808Spjd		    g_raid3_disk_state2str(disk->d_state)));
2393133808Spjd		DISK_STATE_CHANGED();
2394133808Spjd
2395133808Spjd		if (disk->d_state == G_RAID3_DISK_STATE_NEW)
2396133808Spjd			disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
2397133808Spjd		else if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
2398133808Spjd			disk->d_flags &= ~G_RAID3_DISK_FLAG_SYNCHRONIZING;
2399133808Spjd			disk->d_flags &= ~G_RAID3_DISK_FLAG_FORCE_SYNC;
2400133808Spjd			g_raid3_sync_stop(sc, 0);
2401133808Spjd		}
2402133808Spjd		disk->d_state = state;
2403133808Spjd		disk->d_sync.ds_offset = 0;
2404133808Spjd		disk->d_sync.ds_offset_done = 0;
2405133808Spjd		g_raid3_update_access(disk);
2406133808Spjd		g_raid3_update_metadata(disk);
2407133808Spjd		G_RAID3_DEBUG(0, "Device %s: provider %s activated.",
2408133808Spjd		    sc->sc_name, g_raid3_get_diskname(disk));
2409133808Spjd		break;
2410133808Spjd	case G_RAID3_DISK_STATE_STALE:
2411133808Spjd		/*
2412133808Spjd		 * Possible scenarios:
2413133808Spjd		 * 1. Stale disk was connected.
2414133808Spjd		 */
2415133808Spjd		/* Previous state should be NEW. */
2416133808Spjd		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
2417133808Spjd		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
2418133808Spjd		    g_raid3_disk_state2str(disk->d_state)));
2419133808Spjd		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
2420133808Spjd		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
2421133808Spjd		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
2422133808Spjd		    g_raid3_device_state2str(sc->sc_state),
2423133808Spjd		    g_raid3_get_diskname(disk),
2424133808Spjd		    g_raid3_disk_state2str(disk->d_state)));
2425133808Spjd		/*
2426133808Spjd		 * STALE state is only possible if device is marked
2427133808Spjd		 * NOAUTOSYNC.
2428133808Spjd		 */
2429133808Spjd		KASSERT((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) != 0,
2430133808Spjd		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
2431133808Spjd		    g_raid3_device_state2str(sc->sc_state),
2432133808Spjd		    g_raid3_get_diskname(disk),
2433133808Spjd		    g_raid3_disk_state2str(disk->d_state)));
2434133808Spjd		DISK_STATE_CHANGED();
2435133808Spjd
2436133808Spjd		disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
2437133808Spjd		disk->d_state = state;
2438133808Spjd		g_raid3_update_metadata(disk);
2439133808Spjd		G_RAID3_DEBUG(0, "Device %s: provider %s is stale.",
2440133808Spjd		    sc->sc_name, g_raid3_get_diskname(disk));
2441133808Spjd		break;
2442133808Spjd	case G_RAID3_DISK_STATE_SYNCHRONIZING:
2443133808Spjd		/*
2444133808Spjd		 * Possible scenarios:
2445133808Spjd		 * 1. Disk which needs synchronization was connected.
2446133808Spjd		 */
2447133808Spjd		/* Previous state should be NEW. */
2448133808Spjd		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
2449133808Spjd		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
2450133808Spjd		    g_raid3_disk_state2str(disk->d_state)));
2451133808Spjd		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
2452133808Spjd		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
2453133808Spjd		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
2454133808Spjd		    g_raid3_device_state2str(sc->sc_state),
2455133808Spjd		    g_raid3_get_diskname(disk),
2456133808Spjd		    g_raid3_disk_state2str(disk->d_state)));
2457133808Spjd		DISK_STATE_CHANGED();
2458133808Spjd
2459133808Spjd		if (disk->d_state == G_RAID3_DISK_STATE_NEW)
2460133808Spjd			disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
2461133808Spjd		disk->d_state = state;
2462133808Spjd		if (sc->sc_provider != NULL) {
2463133808Spjd			g_raid3_sync_start(sc);
2464133808Spjd			g_raid3_update_metadata(disk);
2465133808Spjd		}
2466133808Spjd		break;
2467133808Spjd	case G_RAID3_DISK_STATE_DISCONNECTED:
2468133808Spjd		/*
2469133808Spjd		 * Possible scenarios:
2470133808Spjd		 * 1. Device wasn't running yet, but disk disappear.
2471133808Spjd		 * 2. Disk was active and disapppear.
2472133808Spjd		 * 3. Disk disappear during synchronization process.
2473133808Spjd		 */
2474133808Spjd		if (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
2475133808Spjd		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
2476133808Spjd			/*
2477133808Spjd			 * Previous state should be ACTIVE, STALE or
2478133808Spjd			 * SYNCHRONIZING.
2479133808Spjd			 */
2480133808Spjd			KASSERT(disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
2481133808Spjd			    disk->d_state == G_RAID3_DISK_STATE_STALE ||
2482133808Spjd			    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
2483133808Spjd			    ("Wrong disk state (%s, %s).",
2484133808Spjd			    g_raid3_get_diskname(disk),
2485133808Spjd			    g_raid3_disk_state2str(disk->d_state)));
2486133808Spjd		} else if (sc->sc_state == G_RAID3_DEVICE_STATE_STARTING) {
2487133808Spjd			/* Previous state should be NEW. */
2488133808Spjd			KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
2489133808Spjd			    ("Wrong disk state (%s, %s).",
2490133808Spjd			    g_raid3_get_diskname(disk),
2491133808Spjd			    g_raid3_disk_state2str(disk->d_state)));
2492133808Spjd			/*
2493133808Spjd			 * Reset bumping syncid if disk disappeared in STARTING
2494133808Spjd			 * state.
2495133808Spjd			 */
2496139671Spjd			if ((sc->sc_bump_id & G_RAID3_BUMP_SYNCID) != 0)
2497139295Spjd				sc->sc_bump_id &= ~G_RAID3_BUMP_SYNCID;
2498133808Spjd#ifdef	INVARIANTS
2499133808Spjd		} else {
2500133808Spjd			KASSERT(1 == 0, ("Wrong device state (%s, %s, %s, %s).",
2501133808Spjd			    sc->sc_name,
2502133808Spjd			    g_raid3_device_state2str(sc->sc_state),
2503133808Spjd			    g_raid3_get_diskname(disk),
2504133808Spjd			    g_raid3_disk_state2str(disk->d_state)));
2505133808Spjd#endif
2506133808Spjd		}
2507133808Spjd		DISK_STATE_CHANGED();
2508133808Spjd		G_RAID3_DEBUG(0, "Device %s: provider %s disconnected.",
2509133808Spjd		    sc->sc_name, g_raid3_get_diskname(disk));
2510133808Spjd
2511133808Spjd		g_raid3_destroy_disk(disk);
2512133808Spjd		break;
2513133808Spjd	default:
2514133808Spjd		KASSERT(1 == 0, ("Unknown state (%u).", state));
2515133808Spjd		break;
2516133808Spjd	}
2517133808Spjd	return (0);
2518133808Spjd}
2519133808Spjd#undef	DISK_STATE_CHANGED
2520133808Spjd
/*
 * Read and decode RAID3 metadata from the last sector of the provider
 * attached to 'cp' into 'md'.
 *
 * Called with the topology lock held; the lock is dropped around the
 * actual I/O and re-acquired afterwards.  Opens the consumer for read
 * for the duration of the call.  Returns 0 on success, EINVAL on bad
 * magic or a too-new metadata version, or an error from the access,
 * read or decode steps.
 */
int
g_raid3_read_metadata(struct g_consumer *cp, struct g_raid3_metadata *md)
{
	struct g_provider *pp;
	u_char *buf;
	int error;

	g_topology_assert();

	error = g_access(cp, 1, 0, 0);
	if (error != 0)
		return (error);
	pp = cp->provider;
	/* I/O must not be done with the topology lock held. */
	g_topology_unlock();
	/* Metadata are stored on last sector. */
	buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize,
	    &error);
	g_topology_lock();
	/* Drop the read reference; its failure is of no consequence here. */
	g_access(cp, -1, 0, 0);
	if (error != 0) {
		G_RAID3_DEBUG(1, "Cannot read metadata from %s (error=%d).",
		    cp->provider->name, error);
		if (buf != NULL)
			g_free(buf);
		return (error);
	}

	/* Decode metadata. */
	error = raid3_metadata_decode(buf, md);
	g_free(buf);
	/* Magic and version are checked even when the MD5 hash failed. */
	if (strcmp(md->md_magic, G_RAID3_MAGIC) != 0)
		return (EINVAL);
	if (md->md_version > G_RAID3_VERSION) {
		G_RAID3_DEBUG(0,
		    "Kernel module is too old to handle metadata from %s.",
		    cp->provider->name);
		return (EINVAL);
	}
	if (error != 0) {
		G_RAID3_DEBUG(1, "MD5 metadata hash mismatch for provider %s.",
		    cp->provider->name);
		return (error);
	}

	return (0);
}
2567133808Spjd
/*
 * Validate metadata 'md' read from provider 'pp' against the already
 * configured device 'sc'.
 *
 * Checks the disk number, that the slot is free, and that the component
 * agrees with the device geometry (disk count, media size, sector size)
 * and carries no unknown or mutually exclusive flags.  Returns 0 when
 * the component may be added, EEXIST when the slot is taken, EINVAL for
 * any mismatch.  Each rejection logs its reason at debug level 1.
 */
static int
g_raid3_check_metadata(struct g_raid3_softc *sc, struct g_provider *pp,
    struct g_raid3_metadata *md)
{

	if (md->md_no >= sc->sc_ndisks) {
		G_RAID3_DEBUG(1, "Invalid disk %s number (no=%u), skipping.",
		    pp->name, md->md_no);
		return (EINVAL);
	}
	if (sc->sc_disks[md->md_no].d_state != G_RAID3_DISK_STATE_NODISK) {
		G_RAID3_DEBUG(1, "Disk %s (no=%u) already exists, skipping.",
		    pp->name, md->md_no);
		return (EEXIST);
	}
	if (md->md_all != sc->sc_ndisks) {
		G_RAID3_DEBUG(1,
		    "Invalid '%s' field on disk %s (device %s), skipping.",
		    "md_all", pp->name, sc->sc_name);
		return (EINVAL);
	}
	if (md->md_mediasize != sc->sc_mediasize) {
		G_RAID3_DEBUG(1,
		    "Invalid '%s' field on disk %s (device %s), skipping.",
		    "md_mediasize", pp->name, sc->sc_name);
		return (EINVAL);
	}
	/* Device media size must split evenly across the data disks. */
	if ((md->md_mediasize % (sc->sc_ndisks - 1)) != 0) {
		G_RAID3_DEBUG(1,
		    "Invalid '%s' field on disk %s (device %s), skipping.",
		    "md_mediasize", pp->name, sc->sc_name);
		return (EINVAL);
	}
	/* The component must be big enough to hold its share. */
	if ((sc->sc_mediasize / (sc->sc_ndisks - 1)) > pp->mediasize) {
		G_RAID3_DEBUG(1,
		    "Invalid size of disk %s (device %s), skipping.", pp->name,
		    sc->sc_name);
		return (EINVAL);
	}
	if ((md->md_sectorsize / pp->sectorsize) < sc->sc_ndisks - 1) {
		G_RAID3_DEBUG(1,
		    "Invalid '%s' field on disk %s (device %s), skipping.",
		    "md_sectorsize", pp->name, sc->sc_name);
		return (EINVAL);
	}
	if (md->md_sectorsize != sc->sc_sectorsize) {
		G_RAID3_DEBUG(1,
		    "Invalid '%s' field on disk %s (device %s), skipping.",
		    "md_sectorsize", pp->name, sc->sc_name);
		return (EINVAL);
	}
	if ((sc->sc_sectorsize % pp->sectorsize) != 0) {
		G_RAID3_DEBUG(1,
		    "Invalid sector size of disk %s (device %s), skipping.",
		    pp->name, sc->sc_name);
		return (EINVAL);
	}
	if ((md->md_mflags & ~G_RAID3_DEVICE_FLAG_MASK) != 0) {
		G_RAID3_DEBUG(1,
		    "Invalid device flags on disk %s (device %s), skipping.",
		    pp->name, sc->sc_name);
		return (EINVAL);
	}
	if ((md->md_mflags & G_RAID3_DEVICE_FLAG_VERIFY) != 0 &&
	    (md->md_mflags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0) {
		/*
		 * VERIFY and ROUND-ROBIN options are mutually exclusive.
		 */
		G_RAID3_DEBUG(1, "Both VERIFY and ROUND-ROBIN flags exist on "
		    "disk %s (device %s), skipping.", pp->name, sc->sc_name);
		return (EINVAL);
	}
	if ((md->md_dflags & ~G_RAID3_DISK_FLAG_MASK) != 0) {
		G_RAID3_DEBUG(1,
		    "Invalid disk flags on disk %s (device %s), skipping.",
		    pp->name, sc->sc_name);
		return (EINVAL);
	}
	return (0);
}
2648133808Spjd
/*
 * Add the component described by 'md' (read from provider 'pp') to
 * device 'sc'.
 *
 * Validates the metadata, rejects components whose generation id is
 * older than the device's (they are broken), initializes the disk and
 * posts a NEW-state event, waiting for it to be processed.  Components
 * carrying an older metadata version are upgraded on-disk.  Returns 0
 * on success or an errno.  Called with the topology lock held.
 */
int
g_raid3_add_disk(struct g_raid3_softc *sc, struct g_provider *pp,
    struct g_raid3_metadata *md)
{
	struct g_raid3_disk *disk;
	int error;

	g_topology_assert();
	G_RAID3_DEBUG(2, "Adding disk %s.", pp->name);

	error = g_raid3_check_metadata(sc, pp, md);
	if (error != 0)
		return (error);
	/* A stale generation id marks a broken component. */
	if (sc->sc_state != G_RAID3_DEVICE_STATE_STARTING &&
	    md->md_genid < sc->sc_genid) {
		G_RAID3_DEBUG(0, "Component %s (device %s) broken, skipping.",
		    pp->name, sc->sc_name);
		return (EINVAL);
	}
	disk = g_raid3_init_disk(sc, pp, md, &error);
	if (disk == NULL)
		return (error);
	error = g_raid3_event_send(disk, G_RAID3_DISK_STATE_NEW,
	    G_RAID3_EVENT_WAIT);
	if (error != 0)
		return (error);
	if (md->md_version < G_RAID3_VERSION) {
		G_RAID3_DEBUG(0, "Upgrading metadata on %s (v%d->v%d).",
		    pp->name, md->md_version, G_RAID3_VERSION);
		g_raid3_update_metadata(disk);
	}
	return (0);
}
2682133808Spjd
2683133808Spjdstatic int
2684133808Spjdg_raid3_access(struct g_provider *pp, int acr, int acw, int ace)
2685133808Spjd{
2686133808Spjd	struct g_raid3_softc *sc;
2687133808Spjd	struct g_raid3_disk *disk;
2688139144Spjd	int dcr, dcw, dce;
2689133808Spjd	u_int n;
2690133808Spjd
2691133808Spjd	g_topology_assert();
2692133808Spjd	G_RAID3_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name, acr,
2693133808Spjd	    acw, ace);
2694133808Spjd
2695133808Spjd	dcr = pp->acr + acr;
2696133808Spjd	dcw = pp->acw + acw;
2697133808Spjd	dce = pp->ace + ace;
2698133808Spjd
2699133808Spjd	sc = pp->geom->softc;
2700133808Spjd	if (sc == NULL ||
2701137412Spjd	    g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) < sc->sc_ndisks - 1 ||
2702137412Spjd	    (sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
2703133808Spjd		if (acr <= 0 && acw <= 0 && ace <= 0)
2704133808Spjd			return (0);
2705133808Spjd		else
2706133808Spjd			return (ENXIO);
2707133808Spjd	}
2708133808Spjd	for (n = 0; n < sc->sc_ndisks; n++) {
2709133808Spjd		disk = &sc->sc_disks[n];
2710133808Spjd		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
2711133808Spjd			continue;
2712139144Spjd		/*
2713139144Spjd		 * Mark disk as dirty on open and unmark on close.
2714139144Spjd		 */
2715139144Spjd		if (pp->acw == 0 && dcw > 0) {
2716139144Spjd			G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.",
2717139144Spjd			    g_raid3_get_diskname(disk), sc->sc_name);
2718139144Spjd			disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
2719139144Spjd			g_raid3_update_metadata(disk);
2720139144Spjd		} else if (pp->acw > 0 && dcw == 0) {
2721139144Spjd			G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.",
2722139144Spjd			    g_raid3_get_diskname(disk), sc->sc_name);
2723139144Spjd			disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
2724139144Spjd			g_raid3_update_metadata(disk);
2725133808Spjd		}
2726133808Spjd	}
2727139144Spjd	return (0);
2728133808Spjd}
2729133808Spjd
2730133808Spjdstatic struct g_geom *
2731133808Spjdg_raid3_create(struct g_class *mp, const struct g_raid3_metadata *md)
2732133808Spjd{
2733133808Spjd	struct g_raid3_softc *sc;
2734133808Spjd	struct g_geom *gp;
2735133808Spjd	int error, timeout;
2736133808Spjd	u_int n;
2737133808Spjd
2738133808Spjd	g_topology_assert();
2739133808Spjd	G_RAID3_DEBUG(1, "Creating device %s (id=%u).", md->md_name, md->md_id);
2740133808Spjd
2741133808Spjd	/* One disk is minimum. */
2742133808Spjd	if (md->md_all < 1)
2743133808Spjd		return (NULL);
2744133808Spjd	/*
2745133808Spjd	 * Action geom.
2746133808Spjd	 */
2747133808Spjd	gp = g_new_geomf(mp, "%s", md->md_name);
2748133808Spjd	sc = malloc(sizeof(*sc), M_RAID3, M_WAITOK | M_ZERO);
2749133808Spjd	sc->sc_disks = malloc(sizeof(struct g_raid3_disk) * md->md_all, M_RAID3,
2750133808Spjd	    M_WAITOK | M_ZERO);
2751133808Spjd	gp->start = g_raid3_start;
2752133808Spjd	gp->orphan = g_raid3_orphan;
2753133808Spjd	gp->access = g_raid3_access;
2754133808Spjd	gp->dumpconf = g_raid3_dumpconf;
2755133808Spjd
2756133808Spjd	sc->sc_id = md->md_id;
2757133808Spjd	sc->sc_mediasize = md->md_mediasize;
2758133808Spjd	sc->sc_sectorsize = md->md_sectorsize;
2759133808Spjd	sc->sc_ndisks = md->md_all;
2760134124Spjd	sc->sc_round_robin = 0;
2761133808Spjd	sc->sc_flags = md->md_mflags;
2762139295Spjd	sc->sc_bump_id = 0;
2763137258Spjd	sc->sc_idle = 0;
2764138374Spjd	for (n = 0; n < sc->sc_ndisks; n++) {
2765138374Spjd		sc->sc_disks[n].d_softc = sc;
2766138374Spjd		sc->sc_disks[n].d_no = n;
2767133808Spjd		sc->sc_disks[n].d_state = G_RAID3_DISK_STATE_NODISK;
2768138374Spjd	}
2769133808Spjd	bioq_init(&sc->sc_queue);
2770133808Spjd	mtx_init(&sc->sc_queue_mtx, "graid3:queue", NULL, MTX_DEF);
2771133808Spjd	TAILQ_INIT(&sc->sc_events);
2772133808Spjd	mtx_init(&sc->sc_events_mtx, "graid3:events", NULL, MTX_DEF);
2773133808Spjd	callout_init(&sc->sc_callout, CALLOUT_MPSAFE);
2774133808Spjd	sc->sc_state = G_RAID3_DEVICE_STATE_STARTING;
2775133808Spjd	gp->softc = sc;
2776133808Spjd	sc->sc_geom = gp;
2777133808Spjd	sc->sc_provider = NULL;
2778133808Spjd	/*
2779133808Spjd	 * Synchronization geom.
2780133808Spjd	 */
2781133808Spjd	gp = g_new_geomf(mp, "%s.sync", md->md_name);
2782133808Spjd	gp->softc = sc;
2783133808Spjd	gp->orphan = g_raid3_orphan;
2784133808Spjd	sc->sc_sync.ds_geom = gp;
2785133808Spjd	sc->sc_zone_64k = uma_zcreate("gr3:64k", 65536, NULL, NULL, NULL, NULL,
2786133808Spjd	    UMA_ALIGN_PTR, 0);
2787133808Spjd	uma_zone_set_max(sc->sc_zone_64k, g_raid3_n64k);
2788133808Spjd	sc->sc_zone_16k = uma_zcreate("gr3:16k", 16384, NULL, NULL, NULL, NULL,
2789133808Spjd	    UMA_ALIGN_PTR, 0);
2790133808Spjd	uma_zone_set_max(sc->sc_zone_64k, g_raid3_n16k);
2791133808Spjd	sc->sc_zone_4k = uma_zcreate("gr3:4k", 4096, NULL, NULL, NULL, NULL,
2792133808Spjd	    UMA_ALIGN_PTR, 0);
2793133808Spjd	uma_zone_set_max(sc->sc_zone_4k, g_raid3_n4k);
2794133808Spjd	error = kthread_create(g_raid3_worker, sc, &sc->sc_worker, 0, 0,
2795133808Spjd	    "g_raid3 %s", md->md_name);
2796133808Spjd	if (error != 0) {
2797133808Spjd		G_RAID3_DEBUG(1, "Cannot create kernel thread for %s.",
2798133808Spjd		    sc->sc_name);
2799133808Spjd		uma_zdestroy(sc->sc_zone_64k);
2800133808Spjd		uma_zdestroy(sc->sc_zone_16k);
2801133808Spjd		uma_zdestroy(sc->sc_zone_4k);
2802133808Spjd		g_destroy_geom(sc->sc_sync.ds_geom);
2803133808Spjd		mtx_destroy(&sc->sc_events_mtx);
2804133808Spjd		mtx_destroy(&sc->sc_queue_mtx);
2805133808Spjd		g_destroy_geom(sc->sc_geom);
2806133808Spjd		free(sc->sc_disks, M_RAID3);
2807133808Spjd		free(sc, M_RAID3);
2808133808Spjd		return (NULL);
2809133808Spjd	}
2810133808Spjd
2811133808Spjd	G_RAID3_DEBUG(0, "Device %s created (id=%u).", sc->sc_name, sc->sc_id);
2812133808Spjd
2813133808Spjd	/*
2814133808Spjd	 * Run timeout.
2815133808Spjd	 */
2816133808Spjd	timeout = atomic_load_acq_int(&g_raid3_timeout);
2817133808Spjd	callout_reset(&sc->sc_callout, timeout * hz, g_raid3_go, sc);
2818133808Spjd	return (sc->sc_geom);
2819133808Spjd}
2820133808Spjd
/*
 * Destroy device 'sc'.
 *
 * If the provider is still open the call fails with EBUSY unless
 * 'force' is set, in which case destruction proceeds anyway (with a
 * warning).  Sets the DESTROY and WAIT flags, wakes the worker thread
 * and waits for it to exit, then releases all device resources.
 * Called with the topology lock held; the lock is dropped while
 * waiting for the worker.  Returns 0, ENXIO or EBUSY.
 */
int
g_raid3_destroy(struct g_raid3_softc *sc, boolean_t force)
{
	struct g_provider *pp;

	g_topology_assert();

	if (sc == NULL)
		return (ENXIO);
	pp = sc->sc_provider;
	if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
		if (force) {
			G_RAID3_DEBUG(1, "Device %s is still open, so it "
			    "can't be definitely removed.", pp->name);
		} else {
			G_RAID3_DEBUG(1,
			    "Device %s is still open (r%dw%de%d).", pp->name,
			    pp->acr, pp->acw, pp->ace);
			return (EBUSY);
		}
	}

	sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
	sc->sc_flags |= G_RAID3_DEVICE_FLAG_WAIT;
	g_topology_unlock();
	G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc);
	/* Wake the worker thread so it notices the DESTROY flag. */
	mtx_lock(&sc->sc_queue_mtx);
	wakeup(sc);
	wakeup(&sc->sc_queue);
	mtx_unlock(&sc->sc_queue_mtx);
	G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, &sc->sc_worker);
	/* Poll until the worker thread has exited and cleared sc_worker. */
	while (sc->sc_worker != NULL)
		tsleep(&sc->sc_worker, PRIBIO, "r3:destroy", hz / 5);
	G_RAID3_DEBUG(4, "%s: Woken up %p.", __func__, &sc->sc_worker);
	g_topology_lock();
	g_raid3_destroy_device(sc);
	free(sc->sc_disks, M_RAID3);
	free(sc, M_RAID3);
	return (0);
}
2861133808Spjd
/*
 * Orphan method for the short-lived consumer used while tasting.
 * Tasting happens entirely under the topology lock, so the consumer
 * can never be orphaned; reaching this function is a bug.
 */
static void
g_raid3_taste_orphan(struct g_consumer *cp)
{

	KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
	    cp->provider->name));
}
2869133808Spjd
/*
 * GEOM taste method: examine provider 'pp' for RAID3 metadata and, if
 * found, attach it as a component.
 *
 * A throw-away geom/consumer pair is used to read the metadata.  If
 * the metadata names this provider (hardcoded name) or records a
 * provider size, both must match.  The component is then added to an
 * existing device with the same name and id, or a new device is
 * created for it.  Returns the device's geom on success, NULL
 * otherwise.  Called with the topology lock held.
 */
static struct g_geom *
g_raid3_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
{
	struct g_raid3_metadata md;
	struct g_raid3_softc *sc;
	struct g_consumer *cp;
	struct g_geom *gp;
	int error;

	g_topology_assert();
	g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
	G_RAID3_DEBUG(2, "Tasting %s.", pp->name);

	gp = g_new_geomf(mp, "raid3:taste");
	/* This orphan function should be never called. */
	gp->orphan = g_raid3_taste_orphan;
	cp = g_new_consumer(gp);
	g_attach(cp, pp);
	error = g_raid3_read_metadata(cp, &md);
	g_detach(cp);
	g_destroy_consumer(cp);
	g_destroy_geom(gp);
	if (error != 0)
		return (NULL);
	gp = NULL;

	/* Hardcoded provider name must match this provider. */
	if (md.md_provider[0] != '\0' && strcmp(md.md_provider, pp->name) != 0)
		return (NULL);
	/* Recorded provider size (if any) must match as well. */
	if (md.md_provsize != 0 && md.md_provsize != pp->mediasize)
		return (NULL);
	if (g_raid3_debug >= 2)
		raid3_metadata_dump(&md);

	/*
	 * Let's check if device already exists.
	 */
	sc = NULL;
	LIST_FOREACH(gp, &mp->geom, geom) {
		sc = gp->softc;
		if (sc == NULL)
			continue;
		/* Skip synchronization geoms. */
		if (sc->sc_sync.ds_geom == gp)
			continue;
		if (strcmp(md.md_name, sc->sc_name) != 0)
			continue;
		if (md.md_id != sc->sc_id) {
			G_RAID3_DEBUG(0, "Device %s already configured.",
			    sc->sc_name);
			return (NULL);
		}
		break;
	}
	if (gp == NULL) {
		gp = g_raid3_create(mp, &md);
		if (gp == NULL) {
			G_RAID3_DEBUG(0, "Cannot create device %s.",
			    md.md_name);
			return (NULL);
		}
		sc = gp->softc;
	}
	G_RAID3_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name);
	error = g_raid3_add_disk(sc, pp, &md);
	if (error != 0) {
		G_RAID3_DEBUG(0, "Cannot add disk %s to %s (error=%d).",
		    pp->name, gp->name, error);
		/* Destroy a freshly created device that got no disks at all. */
		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NODISK) ==
		    sc->sc_ndisks) {
			g_raid3_destroy(sc, 1);
		}
		return (NULL);
	}
	return (gp);
}
2944133808Spjd
2945133808Spjdstatic int
2946133808Spjdg_raid3_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused,
2947133808Spjd    struct g_geom *gp)
2948133808Spjd{
2949133808Spjd
2950133808Spjd	return (g_raid3_destroy(gp->softc, 0));
2951133808Spjd}
2952133808Spjd
/*
 * GEOM dumpconf method: emit the device's XML fragment for the GEOM
 * configuration tree (consumed by e.g. graid3(8)/sysctl kern.geom).
 *
 * With 'cp' set it describes one component (type, number, sync
 * progress, syncid/genid, flags, state); with neither 'pp' nor 'cp'
 * set it describes the device itself.  The synchronization geom is
 * skipped entirely.  Called with the topology lock held.
 */
static void
g_raid3_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
    struct g_consumer *cp, struct g_provider *pp)
{
	struct g_raid3_softc *sc;

	g_topology_assert();

	sc = gp->softc;
	if (sc == NULL)
		return;
	/* Skip synchronization geom. */
	if (gp == sc->sc_sync.ds_geom)
		return;
	if (pp != NULL) {
		/* Nothing here. */
	} else if (cp != NULL) {
		struct g_raid3_disk *disk;

		disk = cp->private;
		if (disk == NULL)
			return;
		/* The highest-numbered disk holds parity; the rest hold data. */
		sbuf_printf(sb, "%s<Type>", indent);
		if (disk->d_no == sc->sc_ndisks - 1)
			sbuf_printf(sb, "PARITY");
		else
			sbuf_printf(sb, "DATA");
		sbuf_printf(sb, "</Type>\n");
		sbuf_printf(sb, "%s<Number>%u</Number>\n", indent,
		    (u_int)disk->d_no);
		if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
			/* Report sync progress as a percentage of the share. */
			sbuf_printf(sb, "%s<Synchronized>", indent);
			if (disk->d_sync.ds_offset_done == 0)
				sbuf_printf(sb, "0%%");
			else {
				sbuf_printf(sb, "%u%%",
				    (u_int)((disk->d_sync.ds_offset_done * 100) /
				    (sc->sc_mediasize / (sc->sc_ndisks - 1))));
			}
			sbuf_printf(sb, "</Synchronized>\n");
		}
		sbuf_printf(sb, "%s<SyncID>%u</SyncID>\n", indent,
		    disk->d_sync.ds_syncid);
		sbuf_printf(sb, "%s<GenID>%u</GenID>\n", indent, disk->d_genid);
		sbuf_printf(sb, "%s<Flags>", indent);
		if (disk->d_flags == 0)
			sbuf_printf(sb, "NONE");
		else {
			int first = 1;

/* Emit 'name' for each disk flag that is set, comma-separated. */
#define	ADD_FLAG(flag, name)	do {					\
	if ((disk->d_flags & (flag)) != 0) {				\
		if (!first)						\
			sbuf_printf(sb, ", ");				\
		else							\
			first = 0;					\
		sbuf_printf(sb, name);					\
	}								\
} while (0)
			ADD_FLAG(G_RAID3_DISK_FLAG_DIRTY, "DIRTY");
			ADD_FLAG(G_RAID3_DISK_FLAG_HARDCODED, "HARDCODED");
			ADD_FLAG(G_RAID3_DISK_FLAG_SYNCHRONIZING,
			    "SYNCHRONIZING");
			ADD_FLAG(G_RAID3_DISK_FLAG_FORCE_SYNC, "FORCE_SYNC");
#undef	ADD_FLAG
		}
		sbuf_printf(sb, "</Flags>\n");
		sbuf_printf(sb, "%s<State>%s</State>\n", indent,
		    g_raid3_disk_state2str(disk->d_state));
	} else {
		sbuf_printf(sb, "%s<ID>%u</ID>\n", indent, (u_int)sc->sc_id);
		sbuf_printf(sb, "%s<SyncID>%u</SyncID>\n", indent, sc->sc_syncid);
		sbuf_printf(sb, "%s<GenID>%u</GenID>\n", indent, sc->sc_genid);
		sbuf_printf(sb, "%s<Flags>", indent);
		if (sc->sc_flags == 0)
			sbuf_printf(sb, "NONE");
		else {
			int first = 1;

/* Emit 'name' for each device flag that is set, comma-separated. */
#define	ADD_FLAG(flag, name)	do {					\
	if ((sc->sc_flags & (flag)) != 0) {				\
		if (!first)						\
			sbuf_printf(sb, ", ");				\
		else							\
			first = 0;					\
		sbuf_printf(sb, name);					\
	}								\
} while (0)
			ADD_FLAG(G_RAID3_DEVICE_FLAG_NOAUTOSYNC, "NOAUTOSYNC");
			ADD_FLAG(G_RAID3_DEVICE_FLAG_ROUND_ROBIN,
			    "ROUND-ROBIN");
			ADD_FLAG(G_RAID3_DEVICE_FLAG_VERIFY, "VERIFY");
#undef	ADD_FLAG
		}
		sbuf_printf(sb, "</Flags>\n");
		sbuf_printf(sb, "%s<Components>%u</Components>\n", indent,
		    sc->sc_ndisks);
		sbuf_printf(sb, "%s<State>%s</State>\n", indent,
		    g_raid3_device_state2str(sc->sc_state));
	}
}
3054133808Spjd
/*
 * System shutdown event handler (registered in g_raid3_init): forcibly
 * destroy every RAID3 device of class 'arg' so components are marked
 * clean before the machine goes down.  The second geom of each device
 * (the synchronization geom, whose softc is NULL only for destroyed
 * entries) is handled via the softc NULL check.
 */
static void
g_raid3_shutdown(void *arg, int howto)
{
	struct g_class *mp;
	struct g_geom *gp, *gp2;

	mp = arg;
	DROP_GIANT();
	g_topology_lock();
	/* SAFE variant: g_raid3_destroy() removes entries while we iterate. */
	LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) {
		if (gp->softc == NULL)
			continue;
		g_raid3_destroy(gp->softc, 1);
	}
	g_topology_unlock();
	PICKUP_GIANT();
#if 0
	tsleep(&gp, PRIBIO, "r3:shutdown", hz * 20);
#endif
}
3075137257Spjd
3076137257Spjdstatic void
3077137257Spjdg_raid3_init(struct g_class *mp)
3078137257Spjd{
3079137257Spjd
3080137257Spjd	g_raid3_ehtag = EVENTHANDLER_REGISTER(shutdown_post_sync,
3081137257Spjd	    g_raid3_shutdown, mp, SHUTDOWN_PRI_FIRST);
3082137257Spjd	if (g_raid3_ehtag == NULL)
3083137257Spjd		G_RAID3_DEBUG(0, "Warning! Cannot register shutdown event.");
3084137257Spjd}
3085137257Spjd
3086137257Spjdstatic void
3087137257Spjdg_raid3_fini(struct g_class *mp)
3088137257Spjd{
3089137257Spjd
3090137257Spjd	if (g_raid3_ehtag == NULL)
3091137257Spjd		return;
3092137257Spjd	EVENTHANDLER_DEREGISTER(shutdown_post_sync, g_raid3_ehtag);
3093137257Spjd}
3094137257Spjd
3095133808SpjdDECLARE_GEOM_CLASS(g_raid3_class, g_raid3);
3096