g_raid3.c revision 139451
1/*-
2 * Copyright (c) 2004 Pawel Jakub Dawidek <pjd@FreeBSD.org>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27#include <sys/cdefs.h>
28__FBSDID("$FreeBSD: head/sys/geom/raid3/g_raid3.c 139451 2004-12-30 20:29:58Z jhb $");
29
30#include <sys/param.h>
31#include <sys/systm.h>
32#include <sys/kernel.h>
33#include <sys/module.h>
34#include <sys/limits.h>
35#include <sys/lock.h>
36#include <sys/mutex.h>
37#include <sys/bio.h>
38#include <sys/sysctl.h>
39#include <sys/malloc.h>
40#include <sys/eventhandler.h>
41#include <vm/uma.h>
42#include <machine/atomic.h>
43#include <geom/geom.h>
44#include <sys/proc.h>
45#include <sys/kthread.h>
46#include <sys/sched.h>
47#include <geom/raid3/g_raid3.h>
48
49
50static MALLOC_DEFINE(M_RAID3, "raid3 data", "GEOM_RAID3 Data");
51
52SYSCTL_DECL(_kern_geom);
53SYSCTL_NODE(_kern_geom, OID_AUTO, raid3, CTLFLAG_RW, 0, "GEOM_RAID3 stuff");
54u_int g_raid3_debug = 0;
55TUNABLE_INT("kern.geom.raid3.debug", &g_raid3_debug);
56SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, debug, CTLFLAG_RW, &g_raid3_debug, 0,
57    "Debug level");
58static u_int g_raid3_timeout = 4;
59TUNABLE_INT("kern.geom.raid3.timeout", &g_raid3_timeout);
60SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, timeout, CTLFLAG_RW, &g_raid3_timeout,
61    0, "Time to wait on all raid3 components");
62static u_int g_raid3_idletime = 5;
63TUNABLE_INT("kern.geom.raid3.idletime", &g_raid3_idletime);
64SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, idletime, CTLFLAG_RW,
65    &g_raid3_idletime, 0, "Mark components as clean when idling");
66static u_int g_raid3_reqs_per_sync = 5;
67SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, reqs_per_sync, CTLFLAG_RW,
68    &g_raid3_reqs_per_sync, 0,
69    "Number of regular I/O requests per synchronization request");
70static u_int g_raid3_syncs_per_sec = 100;
71SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, syncs_per_sec, CTLFLAG_RW,
72    &g_raid3_syncs_per_sec, 0,
73    "Number of synchronizations requests per second");
74
75static u_int g_raid3_n64k = 50;
76TUNABLE_INT("kern.geom.raid3.n64k", &g_raid3_n64k);
77SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n64k, CTLFLAG_RD, &g_raid3_n64k, 0,
78    "Maximum number of 64kB allocations");
79static u_int g_raid3_n16k = 200;
80TUNABLE_INT("kern.geom.raid3.n16k", &g_raid3_n16k);
81SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n16k, CTLFLAG_RD, &g_raid3_n16k, 0,
82    "Maximum number of 16kB allocations");
83static u_int g_raid3_n4k = 1200;
84TUNABLE_INT("kern.geom.raid3.n4k", &g_raid3_n4k);
85SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n4k, CTLFLAG_RD, &g_raid3_n4k, 0,
86    "Maximum number of 4kB allocations");
87
88SYSCTL_NODE(_kern_geom_raid3, OID_AUTO, stat, CTLFLAG_RW, 0,
89    "GEOM_RAID3 statistics");
90static u_int g_raid3_parity_mismatch = 0;
91SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, parity_mismatch, CTLFLAG_RD,
92    &g_raid3_parity_mismatch, 0, "Number of failures in VERIFY mode");
93static u_int g_raid3_64k_requested = 0;
94SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 64k_requested, CTLFLAG_RD,
95    &g_raid3_64k_requested, 0, "Number of requested 64kB allocations");
96static u_int g_raid3_64k_failed = 0;
97SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 64k_failed, CTLFLAG_RD,
98    &g_raid3_64k_failed, 0, "Number of failed 64kB allocations");
99static u_int g_raid3_16k_requested = 0;
100SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 16k_requested, CTLFLAG_RD,
101    &g_raid3_16k_requested, 0, "Number of requested 16kB allocations");
102static u_int g_raid3_16k_failed = 0;
103SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 16k_failed, CTLFLAG_RD,
104    &g_raid3_16k_failed, 0, "Number of failed 16kB allocations");
105static u_int g_raid3_4k_requested = 0;
106SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 4k_requested, CTLFLAG_RD,
107    &g_raid3_4k_requested, 0, "Number of requested 4kB allocations");
108static u_int g_raid3_4k_failed = 0;
109SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 4k_failed, CTLFLAG_RD,
110    &g_raid3_4k_failed, 0, "Number of failed 4kB allocations");
111
/*
 * msleep() wrapper which logs the wait channel at debug level 4 right
 * before going to sleep and again right after msleep() returns.
 */
#define	MSLEEP(ident, mtx, priority, wmesg, timeout)			\
do {									\
	G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, (ident));	\
	msleep((ident), (mtx), (priority), (wmesg), (timeout));		\
	G_RAID3_DEBUG(4, "%s: Woken up %p.", __func__, (ident));	\
} while (0)
117
118static eventhandler_tag g_raid3_ehtag = NULL;
119
120static int g_raid3_destroy_geom(struct gctl_req *req, struct g_class *mp,
121    struct g_geom *gp);
122static g_taste_t g_raid3_taste;
123static void g_raid3_init(struct g_class *mp);
124static void g_raid3_fini(struct g_class *mp);
125
126struct g_class g_raid3_class = {
127	.name = G_RAID3_CLASS_NAME,
128	.version = G_VERSION,
129	.ctlreq = g_raid3_config,
130	.taste = g_raid3_taste,
131	.destroy_geom = g_raid3_destroy_geom,
132	.init = g_raid3_init,
133	.fini = g_raid3_fini
134};
135
136
137static void g_raid3_destroy_provider(struct g_raid3_softc *sc);
138static int g_raid3_update_disk(struct g_raid3_disk *disk, u_int state);
139static void g_raid3_update_device(struct g_raid3_softc *sc, boolean_t force);
140static void g_raid3_dumpconf(struct sbuf *sb, const char *indent,
141    struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp);
142static void g_raid3_sync_stop(struct g_raid3_softc *sc, int type);
143
144
145static const char *
146g_raid3_disk_state2str(int state)
147{
148
149	switch (state) {
150	case G_RAID3_DISK_STATE_NODISK:
151		return ("NODISK");
152	case G_RAID3_DISK_STATE_NONE:
153		return ("NONE");
154	case G_RAID3_DISK_STATE_NEW:
155		return ("NEW");
156	case G_RAID3_DISK_STATE_ACTIVE:
157		return ("ACTIVE");
158	case G_RAID3_DISK_STATE_STALE:
159		return ("STALE");
160	case G_RAID3_DISK_STATE_SYNCHRONIZING:
161		return ("SYNCHRONIZING");
162	case G_RAID3_DISK_STATE_DISCONNECTED:
163		return ("DISCONNECTED");
164	default:
165		return ("INVALID");
166	}
167}
168
169static const char *
170g_raid3_device_state2str(int state)
171{
172
173	switch (state) {
174	case G_RAID3_DEVICE_STATE_STARTING:
175		return ("STARTING");
176	case G_RAID3_DEVICE_STATE_DEGRADED:
177		return ("DEGRADED");
178	case G_RAID3_DEVICE_STATE_COMPLETE:
179		return ("COMPLETE");
180	default:
181		return ("INVALID");
182	}
183}
184
185const char *
186g_raid3_get_diskname(struct g_raid3_disk *disk)
187{
188
189	if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL)
190		return ("[unknown]");
191	return (disk->d_name);
192}
193
194#define	g_raid3_xor(src1, src2, dst, size)				\
195	_g_raid3_xor((uint64_t *)(src1), (uint64_t *)(src2),		\
196	    (uint64_t *)(dst), (size_t)size)
197static void
198_g_raid3_xor(uint64_t *src1, uint64_t *src2, uint64_t *dst, size_t size)
199{
200
201	KASSERT((size % 128) == 0, ("Invalid size: %zu.", size));
202	for (; size > 0; size -= 128) {
203		*dst++ = (*src1++) ^ (*src2++);
204		*dst++ = (*src1++) ^ (*src2++);
205		*dst++ = (*src1++) ^ (*src2++);
206		*dst++ = (*src1++) ^ (*src2++);
207		*dst++ = (*src1++) ^ (*src2++);
208		*dst++ = (*src1++) ^ (*src2++);
209		*dst++ = (*src1++) ^ (*src2++);
210		*dst++ = (*src1++) ^ (*src2++);
211		*dst++ = (*src1++) ^ (*src2++);
212		*dst++ = (*src1++) ^ (*src2++);
213		*dst++ = (*src1++) ^ (*src2++);
214		*dst++ = (*src1++) ^ (*src2++);
215		*dst++ = (*src1++) ^ (*src2++);
216		*dst++ = (*src1++) ^ (*src2++);
217		*dst++ = (*src1++) ^ (*src2++);
218		*dst++ = (*src1++) ^ (*src2++);
219	}
220}
221
222static int
223g_raid3_is_zero(struct bio *bp)
224{
225	static const uint64_t zeros[] = {
226	    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
227	};
228	u_char *addr;
229	ssize_t size;
230
231	size = bp->bio_length;
232	addr = (u_char *)bp->bio_data;
233	for (; size > 0; size -= sizeof(zeros), addr += sizeof(zeros)) {
234		if (bcmp(addr, zeros, sizeof(zeros)) != 0)
235			return (0);
236	}
237	return (1);
238}
239
/*
 * --- Event handling functions ---
 * Events in geom_raid3 are used to maintain disk and device status
 * from a single thread, which simplifies locking.
 */
245static void
246g_raid3_event_free(struct g_raid3_event *ep)
247{
248
249	free(ep, M_RAID3);
250}
251
252int
253g_raid3_event_send(void *arg, int state, int flags)
254{
255	struct g_raid3_softc *sc;
256	struct g_raid3_disk *disk;
257	struct g_raid3_event *ep;
258	int error;
259
260	ep = malloc(sizeof(*ep), M_RAID3, M_WAITOK);
261	G_RAID3_DEBUG(4, "%s: Sending event %p.", __func__, ep);
262	if ((flags & G_RAID3_EVENT_DEVICE) != 0) {
263		disk = NULL;
264		sc = arg;
265	} else {
266		disk = arg;
267		sc = disk->d_softc;
268	}
269	ep->e_disk = disk;
270	ep->e_state = state;
271	ep->e_flags = flags;
272	ep->e_error = 0;
273	mtx_lock(&sc->sc_events_mtx);
274	TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next);
275	mtx_unlock(&sc->sc_events_mtx);
276	G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc);
277	mtx_lock(&sc->sc_queue_mtx);
278	wakeup(sc);
279	wakeup(&sc->sc_queue);
280	mtx_unlock(&sc->sc_queue_mtx);
281	if ((flags & G_RAID3_EVENT_DONTWAIT) != 0)
282		return (0);
283	g_topology_assert();
284	G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, ep);
285	g_topology_unlock();
286	while ((ep->e_flags & G_RAID3_EVENT_DONE) == 0) {
287		mtx_lock(&sc->sc_events_mtx);
288		MSLEEP(ep, &sc->sc_events_mtx, PRIBIO | PDROP, "r3:event",
289		    hz * 5);
290	}
291	/* Don't even try to use 'sc' here, because it could be already dead. */
292	g_topology_lock();
293	error = ep->e_error;
294	g_raid3_event_free(ep);
295	return (error);
296}
297
298static struct g_raid3_event *
299g_raid3_event_get(struct g_raid3_softc *sc)
300{
301	struct g_raid3_event *ep;
302
303	mtx_lock(&sc->sc_events_mtx);
304	ep = TAILQ_FIRST(&sc->sc_events);
305	mtx_unlock(&sc->sc_events_mtx);
306	return (ep);
307}
308
309static void
310g_raid3_event_remove(struct g_raid3_softc *sc, struct g_raid3_event *ep)
311{
312
313	mtx_lock(&sc->sc_events_mtx);
314	TAILQ_REMOVE(&sc->sc_events, ep, e_next);
315	mtx_unlock(&sc->sc_events_mtx);
316}
317
318static void
319g_raid3_event_cancel(struct g_raid3_disk *disk)
320{
321	struct g_raid3_softc *sc;
322	struct g_raid3_event *ep, *tmpep;
323
324	g_topology_assert();
325
326	sc = disk->d_softc;
327	mtx_lock(&sc->sc_events_mtx);
328	TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) {
329		if ((ep->e_flags & G_RAID3_EVENT_DEVICE) != 0)
330			continue;
331		if (ep->e_disk != disk)
332			continue;
333		TAILQ_REMOVE(&sc->sc_events, ep, e_next);
334		if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0)
335			g_raid3_event_free(ep);
336		else {
337			ep->e_error = ECANCELED;
338			wakeup(ep);
339		}
340	}
341	mtx_unlock(&sc->sc_events_mtx);
342}
343
344/*
345 * Return the number of disks in the given state.
346 * If state is equal to -1, count all connected disks.
347 */
348u_int
349g_raid3_ndisks(struct g_raid3_softc *sc, int state)
350{
351	struct g_raid3_disk *disk;
352	u_int n, ndisks;
353
354	for (n = ndisks = 0; n < sc->sc_ndisks; n++) {
355		disk = &sc->sc_disks[n];
356		if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
357			continue;
358		if (state == -1 || disk->d_state == state)
359			ndisks++;
360	}
361	return (ndisks);
362}
363
364static u_int
365g_raid3_nrequests(struct g_raid3_softc *sc, struct g_consumer *cp)
366{
367	struct bio *bp;
368	u_int nreqs = 0;
369
370	mtx_lock(&sc->sc_queue_mtx);
371	TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) {
372		if (bp->bio_from == cp)
373			nreqs++;
374	}
375	mtx_unlock(&sc->sc_queue_mtx);
376	return (nreqs);
377}
378
379static int
380g_raid3_is_busy(struct g_raid3_softc *sc, struct g_consumer *cp)
381{
382
383	if (cp->index > 0) {
384		G_RAID3_DEBUG(2,
385		    "I/O requests for %s exist, can't destroy it now.",
386		    cp->provider->name);
387		return (1);
388	}
389	if (g_raid3_nrequests(sc, cp) > 0) {
390		G_RAID3_DEBUG(2,
391		    "I/O requests for %s in queue, can't destroy it now.",
392		    cp->provider->name);
393		return (1);
394	}
395	return (0);
396}
397
398static void
399g_raid3_destroy_consumer(void *arg, int flags __unused)
400{
401	struct g_consumer *cp;
402
403	cp = arg;
404	G_RAID3_DEBUG(1, "Consumer %s destroyed.", cp->provider->name);
405	g_detach(cp);
406	g_destroy_consumer(cp);
407}
408
409static void
410g_raid3_kill_consumer(struct g_raid3_softc *sc, struct g_consumer *cp)
411{
412	struct g_provider *pp;
413	int retaste_wait;
414
415	g_topology_assert();
416
417	cp->private = NULL;
418	if (g_raid3_is_busy(sc, cp))
419		return;
420	G_RAID3_DEBUG(2, "Consumer %s destroyed.", cp->provider->name);
421	pp = cp->provider;
422	retaste_wait = 0;
423	if (cp->acw == 1) {
424		if ((pp->geom->flags & G_GEOM_WITHER) == 0)
425			retaste_wait = 1;
426	}
427	G_RAID3_DEBUG(2, "Access %s r%dw%de%d = %d", pp->name, -cp->acr,
428	    -cp->acw, -cp->ace, 0);
429	if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0)
430		g_access(cp, -cp->acr, -cp->acw, -cp->ace);
431	if (retaste_wait) {
432		/*
433		 * After retaste event was send (inside g_access()), we can send
434		 * event to detach and destroy consumer.
435		 * A class, which has consumer to the given provider connected
436		 * will not receive retaste event for the provider.
437		 * This is the way how I ignore retaste events when I close
438		 * consumers opened for write: I detach and destroy consumer
439		 * after retaste event is sent.
440		 */
441		g_post_event(g_raid3_destroy_consumer, cp, M_WAITOK, NULL);
442		return;
443	}
444	G_RAID3_DEBUG(1, "Consumer %s destroyed.", pp->name);
445	g_detach(cp);
446	g_destroy_consumer(cp);
447}
448
449static int
450g_raid3_connect_disk(struct g_raid3_disk *disk, struct g_provider *pp)
451{
452	int error;
453
454	g_topology_assert();
455	KASSERT(disk->d_consumer == NULL,
456	    ("Disk already connected (device %s).", disk->d_softc->sc_name));
457
458	disk->d_consumer = g_new_consumer(disk->d_softc->sc_geom);
459	disk->d_consumer->private = disk;
460	disk->d_consumer->index = 0;
461	error = g_attach(disk->d_consumer, pp);
462	if (error != 0)
463		return (error);
464	error = g_access(disk->d_consumer, 1, 1, 1);
465	if (error != 0) {
466		G_RAID3_DEBUG(0, "Cannot open consumer %s (error=%d).",
467		    pp->name, error);
468		return (error);
469	}
470	G_RAID3_DEBUG(2, "Disk %s connected.", g_raid3_get_diskname(disk));
471	return (0);
472}
473
474static void
475g_raid3_disconnect_consumer(struct g_raid3_softc *sc, struct g_consumer *cp)
476{
477
478	g_topology_assert();
479
480	if (cp == NULL)
481		return;
482	if (cp->provider != NULL)
483		g_raid3_kill_consumer(sc, cp);
484	else
485		g_destroy_consumer(cp);
486}
487
488/*
489 * Initialize disk. This means allocate memory, create consumer, attach it
490 * to the provider and open access (r1w1e1) to it.
491 */
492static struct g_raid3_disk *
493g_raid3_init_disk(struct g_raid3_softc *sc, struct g_provider *pp,
494    struct g_raid3_metadata *md, int *errorp)
495{
496	struct g_raid3_disk *disk;
497	int error;
498
499	disk = &sc->sc_disks[md->md_no];
500	error = g_raid3_connect_disk(disk, pp);
501	if (error != 0)
502		goto fail;
503	disk->d_state = G_RAID3_DISK_STATE_NONE;
504	disk->d_flags = md->md_dflags;
505	if (md->md_provider[0] != '\0')
506		disk->d_flags |= G_RAID3_DISK_FLAG_HARDCODED;
507	disk->d_sync.ds_consumer = NULL;
508	disk->d_sync.ds_offset = md->md_sync_offset;
509	disk->d_sync.ds_offset_done = md->md_sync_offset;
510	disk->d_sync.ds_resync = -1;
511	disk->d_genid = md->md_genid;
512	disk->d_sync.ds_syncid = md->md_syncid;
513	if (errorp != NULL)
514		*errorp = 0;
515	return (disk);
516fail:
517	if (errorp != NULL)
518		*errorp = error;
519	if (disk != NULL)
520		g_raid3_disconnect_consumer(sc, disk->d_consumer);
521	return (NULL);
522}
523
524static void
525g_raid3_destroy_disk(struct g_raid3_disk *disk)
526{
527	struct g_raid3_softc *sc;
528
529	g_topology_assert();
530
531	if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
532		return;
533	g_raid3_event_cancel(disk);
534	sc = disk->d_softc;
535	switch (disk->d_state) {
536	case G_RAID3_DISK_STATE_SYNCHRONIZING:
537		if (sc->sc_syncdisk != NULL)
538			g_raid3_sync_stop(sc, 1);
539		/* FALLTHROUGH */
540	case G_RAID3_DISK_STATE_NEW:
541	case G_RAID3_DISK_STATE_STALE:
542	case G_RAID3_DISK_STATE_ACTIVE:
543		g_raid3_disconnect_consumer(sc, disk->d_consumer);
544		disk->d_consumer = NULL;
545		break;
546	default:
547		KASSERT(0 == 1, ("Wrong disk state (%s, %s).",
548		    g_raid3_get_diskname(disk),
549		    g_raid3_disk_state2str(disk->d_state)));
550	}
551	disk->d_state = G_RAID3_DISK_STATE_NODISK;
552}
553
554static void
555g_raid3_destroy_device(struct g_raid3_softc *sc)
556{
557	struct g_raid3_event *ep;
558	struct g_raid3_disk *disk;
559	struct g_geom *gp;
560	struct g_consumer *cp;
561	u_int n;
562
563	g_topology_assert();
564
565	gp = sc->sc_geom;
566	if (sc->sc_provider != NULL)
567		g_raid3_destroy_provider(sc);
568	for (n = 0; n < sc->sc_ndisks; n++) {
569		disk = &sc->sc_disks[n];
570		if (disk->d_state != G_RAID3_DISK_STATE_NODISK) {
571			disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
572			g_raid3_update_metadata(disk);
573			g_raid3_destroy_disk(disk);
574		}
575	}
576	while ((ep = g_raid3_event_get(sc)) != NULL) {
577		g_raid3_event_remove(sc, ep);
578		if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0)
579			g_raid3_event_free(ep);
580		else {
581			ep->e_error = ECANCELED;
582			ep->e_flags |= G_RAID3_EVENT_DONE;
583			G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, ep);
584			mtx_lock(&sc->sc_events_mtx);
585			wakeup(ep);
586			mtx_unlock(&sc->sc_events_mtx);
587		}
588	}
589	callout_drain(&sc->sc_callout);
590	gp->softc = NULL;
591	cp = LIST_FIRST(&sc->sc_sync.ds_geom->consumer);
592	if (cp != NULL)
593		g_raid3_disconnect_consumer(sc, cp);
594	sc->sc_sync.ds_geom->softc = NULL;
595	g_wither_geom(sc->sc_sync.ds_geom, ENXIO);
596	uma_zdestroy(sc->sc_zone_64k);
597	uma_zdestroy(sc->sc_zone_16k);
598	uma_zdestroy(sc->sc_zone_4k);
599	mtx_destroy(&sc->sc_queue_mtx);
600	mtx_destroy(&sc->sc_events_mtx);
601	G_RAID3_DEBUG(0, "Device %s destroyed.", gp->name);
602	g_wither_geom(gp, ENXIO);
603}
604
605static void
606g_raid3_orphan(struct g_consumer *cp)
607{
608	struct g_raid3_disk *disk;
609
610	g_topology_assert();
611
612	disk = cp->private;
613	if (disk == NULL)
614		return;
615	disk->d_softc->sc_bump_id = G_RAID3_BUMP_SYNCID_OFW;
616	g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED,
617	    G_RAID3_EVENT_DONTWAIT);
618}
619
620static void
621g_raid3_spoiled(struct g_consumer *cp)
622{
623	struct g_raid3_disk *disk;
624
625	g_topology_assert();
626
627	disk = cp->private;
628	if (disk == NULL)
629		return;
630	disk->d_softc->sc_bump_id = G_RAID3_BUMP_SYNCID_IMM;
631	g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED,
632	    G_RAID3_EVENT_DONTWAIT);
633}
634
635static int
636g_raid3_write_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md)
637{
638	struct g_raid3_softc *sc;
639	struct g_consumer *cp;
640	off_t offset, length;
641	u_char *sector;
642	int error = 0;
643
644	g_topology_assert();
645
646	sc = disk->d_softc;
647	cp = disk->d_consumer;
648	KASSERT(cp != NULL, ("NULL consumer (%s).", sc->sc_name));
649	KASSERT(cp->provider != NULL, ("NULL provider (%s).", sc->sc_name));
650	KASSERT(cp->acr == 1 && cp->acw == 1 && cp->ace == 1,
651	    ("Consumer %s closed? (r%dw%de%d).", cp->provider->name, cp->acr,
652	    cp->acw, cp->ace));
653	length = cp->provider->sectorsize;
654	offset = cp->provider->mediasize - length;
655	sector = malloc((size_t)length, M_RAID3, M_WAITOK | M_ZERO);
656	if (md != NULL)
657		raid3_metadata_encode(md, sector);
658	g_topology_unlock();
659	error = g_write_data(cp, offset, sector, length);
660	g_topology_lock();
661	free(sector, M_RAID3);
662	if (error != 0) {
663		disk->d_softc->sc_bump_id = G_RAID3_BUMP_GENID_IMM;
664		g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED,
665		    G_RAID3_EVENT_DONTWAIT);
666	}
667	return (error);
668}
669
670int
671g_raid3_clear_metadata(struct g_raid3_disk *disk)
672{
673	int error;
674
675	g_topology_assert();
676	error = g_raid3_write_metadata(disk, NULL);
677	if (error == 0) {
678		G_RAID3_DEBUG(2, "Metadata on %s cleared.",
679		    g_raid3_get_diskname(disk));
680	} else {
681		G_RAID3_DEBUG(0,
682		    "Cannot clear metadata on disk %s (error=%d).",
683		    g_raid3_get_diskname(disk), error);
684	}
685	return (error);
686}
687
688void
689g_raid3_fill_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md)
690{
691	struct g_raid3_softc *sc;
692
693	sc = disk->d_softc;
694	strlcpy(md->md_magic, G_RAID3_MAGIC, sizeof(md->md_magic));
695	md->md_version = G_RAID3_VERSION;
696	strlcpy(md->md_name, sc->sc_name, sizeof(md->md_name));
697	md->md_id = sc->sc_id;
698	md->md_all = sc->sc_ndisks;
699	md->md_genid = sc->sc_genid;
700	md->md_mediasize = sc->sc_mediasize;
701	md->md_sectorsize = sc->sc_sectorsize;
702	md->md_mflags = (sc->sc_flags & G_RAID3_DEVICE_FLAG_MASK);
703	md->md_no = disk->d_no;
704	md->md_syncid = disk->d_sync.ds_syncid;
705	md->md_dflags = (disk->d_flags & G_RAID3_DISK_FLAG_MASK);
706	if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING)
707		md->md_sync_offset = disk->d_sync.ds_offset_done;
708	else
709		md->md_sync_offset = 0;
710	if ((disk->d_flags & G_RAID3_DISK_FLAG_HARDCODED) != 0 &&
711	    disk->d_consumer != NULL && disk->d_consumer->provider != NULL) {
712		strlcpy(md->md_provider, disk->d_consumer->provider->name,
713		    sizeof(md->md_provider));
714	} else {
715		bzero(md->md_provider, sizeof(md->md_provider));
716	}
717}
718
719void
720g_raid3_update_metadata(struct g_raid3_disk *disk)
721{
722	struct g_raid3_metadata md;
723	int error;
724
725	g_topology_assert();
726	g_raid3_fill_metadata(disk, &md);
727	error = g_raid3_write_metadata(disk, &md);
728	if (error == 0) {
729		G_RAID3_DEBUG(2, "Metadata on %s updated.",
730		    g_raid3_get_diskname(disk));
731	} else {
732		G_RAID3_DEBUG(0,
733		    "Cannot update metadata on disk %s (error=%d).",
734		    g_raid3_get_diskname(disk), error);
735	}
736}
737
738static void
739g_raid3_bump_syncid(struct g_raid3_softc *sc)
740{
741	struct g_raid3_disk *disk;
742	u_int n;
743
744	g_topology_assert();
745	KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) > 0,
746	    ("%s called with no active disks (device=%s).", __func__,
747	    sc->sc_name));
748
749	sc->sc_syncid++;
750	G_RAID3_DEBUG(1, "Device %s: syncid bumped to %u.", sc->sc_name,
751	    sc->sc_syncid);
752	for (n = 0; n < sc->sc_ndisks; n++) {
753		disk = &sc->sc_disks[n];
754		if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
755		    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
756			disk->d_sync.ds_syncid = sc->sc_syncid;
757			g_raid3_update_metadata(disk);
758		}
759	}
760}
761
762static void
763g_raid3_bump_genid(struct g_raid3_softc *sc)
764{
765	struct g_raid3_disk *disk;
766	u_int n;
767
768	g_topology_assert();
769	KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) > 0,
770	    ("%s called with no active disks (device=%s).", __func__,
771	    sc->sc_name));
772
773	sc->sc_genid++;
774	G_RAID3_DEBUG(1, "Device %s: genid bumped to %u.", sc->sc_name,
775	    sc->sc_genid);
776	for (n = 0; n < sc->sc_ndisks; n++) {
777		disk = &sc->sc_disks[n];
778		if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
779		    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
780			disk->d_genid = sc->sc_genid;
781			g_raid3_update_metadata(disk);
782		}
783	}
784}
785
786static void
787g_raid3_idle(struct g_raid3_softc *sc)
788{
789	struct g_raid3_disk *disk;
790	u_int i;
791
792	if (sc->sc_provider == NULL || sc->sc_provider->acw == 0)
793		return;
794	sc->sc_idle = 1;
795	g_topology_lock();
796	for (i = 0; i < sc->sc_ndisks; i++) {
797		disk = &sc->sc_disks[i];
798		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
799			continue;
800		G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.",
801		    g_raid3_get_diskname(disk), sc->sc_name);
802		disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
803		g_raid3_update_metadata(disk);
804	}
805	g_topology_unlock();
806}
807
808static void
809g_raid3_unidle(struct g_raid3_softc *sc)
810{
811	struct g_raid3_disk *disk;
812	u_int i;
813
814	sc->sc_idle = 0;
815	g_topology_lock();
816	for (i = 0; i < sc->sc_ndisks; i++) {
817		disk = &sc->sc_disks[i];
818		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
819			continue;
820		G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.",
821		    g_raid3_get_diskname(disk), sc->sc_name);
822		disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
823		g_raid3_update_metadata(disk);
824	}
825	g_topology_unlock();
826}
827
828/*
829 * Return 1 if we should check if RAID3 device is idling.
830 */
831static int
832g_raid3_check_idle(struct g_raid3_softc *sc)
833{
834	struct g_raid3_disk *disk;
835	u_int i;
836
837	if (sc->sc_idle)
838		return (0);
839	if (sc->sc_provider != NULL && sc->sc_provider->acw == 0)
840		return (0);
841	/*
842	 * Check if there are no in-flight requests.
843	 */
844	for (i = 0; i < sc->sc_ndisks; i++) {
845		disk = &sc->sc_disks[i];
846		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
847			continue;
848		if (disk->d_consumer->index > 0)
849			return (0);
850	}
851	return (1);
852}
853
/*
 * Child bios of a request are kept on a singly-linked list: the
 * bio_driver1 field of the parent bio is the list head and the
 * bio_caller1 field of each child bio points to the next element.
 */
#define	G_RAID3_HEAD_BIO(pbp)	(pbp)->bio_driver1
#define	G_RAID3_NEXT_BIO(cbp)	(cbp)->bio_caller1

/* Walk over all child bios of 'pbp'. */
#define	G_RAID3_FOREACH_BIO(pbp, bp)					\
	for ((bp) = G_RAID3_HEAD_BIO(pbp); (bp) != NULL;		\
	    (bp) = G_RAID3_NEXT_BIO(bp))

/*
 * Like G_RAID3_FOREACH_BIO(), but the successor is saved in 'tmpbp'
 * before the loop body runs, so the body may unlink or destroy 'bp'.
 */
#define	G_RAID3_FOREACH_SAFE_BIO(pbp, bp, tmpbp)			\
	for ((bp) = G_RAID3_HEAD_BIO(pbp);				\
	    (bp) != NULL && ((tmpbp) = G_RAID3_NEXT_BIO(bp), 1);	\
	    (bp) = (tmpbp))
870
871static void
872g_raid3_init_bio(struct bio *pbp)
873{
874
875	G_RAID3_HEAD_BIO(pbp) = NULL;
876}
877
878static void
879g_raid3_remove_bio(struct bio *cbp)
880{
881	struct bio *pbp, *bp;
882
883	pbp = cbp->bio_parent;
884	if (G_RAID3_HEAD_BIO(pbp) == cbp)
885		G_RAID3_HEAD_BIO(pbp) = G_RAID3_NEXT_BIO(cbp);
886	else {
887		G_RAID3_FOREACH_BIO(pbp, bp) {
888			if (G_RAID3_NEXT_BIO(bp) == cbp) {
889				G_RAID3_NEXT_BIO(bp) = G_RAID3_NEXT_BIO(cbp);
890				break;
891			}
892		}
893	}
894	G_RAID3_NEXT_BIO(cbp) = NULL;
895}
896
897static void
898g_raid3_replace_bio(struct bio *sbp, struct bio *dbp)
899{
900	struct bio *pbp, *bp;
901
902	g_raid3_remove_bio(sbp);
903	pbp = dbp->bio_parent;
904	G_RAID3_NEXT_BIO(sbp) = G_RAID3_NEXT_BIO(dbp);
905	if (G_RAID3_HEAD_BIO(pbp) == dbp)
906		G_RAID3_HEAD_BIO(pbp) = sbp;
907	else {
908		G_RAID3_FOREACH_BIO(pbp, bp) {
909			if (G_RAID3_NEXT_BIO(bp) == dbp) {
910				G_RAID3_NEXT_BIO(bp) = sbp;
911				break;
912			}
913		}
914	}
915	G_RAID3_NEXT_BIO(dbp) = NULL;
916}
917
918static void
919g_raid3_destroy_bio(struct g_raid3_softc *sc, struct bio *cbp)
920{
921	struct bio *bp, *pbp;
922	size_t size;
923
924	pbp = cbp->bio_parent;
925	pbp->bio_children--;
926	KASSERT(cbp->bio_data != NULL, ("NULL bio_data"));
927	size = pbp->bio_length / (sc->sc_ndisks - 1);
928	if (size > 16384)
929		uma_zfree(sc->sc_zone_64k, cbp->bio_data);
930	else if (size > 4096)
931		uma_zfree(sc->sc_zone_16k, cbp->bio_data);
932	else
933		uma_zfree(sc->sc_zone_4k, cbp->bio_data);
934	if (G_RAID3_HEAD_BIO(pbp) == cbp) {
935		G_RAID3_HEAD_BIO(pbp) = G_RAID3_NEXT_BIO(cbp);
936		G_RAID3_NEXT_BIO(cbp) = NULL;
937		g_destroy_bio(cbp);
938	} else {
939		G_RAID3_FOREACH_BIO(pbp, bp) {
940			if (G_RAID3_NEXT_BIO(bp) == cbp)
941				break;
942		}
943		if (bp != NULL) {
944			KASSERT(G_RAID3_NEXT_BIO(bp) != NULL,
945			    ("NULL bp->bio_driver1"));
946			G_RAID3_NEXT_BIO(bp) = G_RAID3_NEXT_BIO(cbp);
947			G_RAID3_NEXT_BIO(cbp) = NULL;
948		}
949		g_destroy_bio(cbp);
950	}
951}
952
953static struct bio *
954g_raid3_clone_bio(struct g_raid3_softc *sc, struct bio *pbp)
955{
956	struct bio *bp, *cbp;
957	size_t size;
958
959	cbp = g_clone_bio(pbp);
960	if (cbp == NULL)
961		return (NULL);
962	size = pbp->bio_length / (sc->sc_ndisks - 1);
963	if (size > 16384) {
964		cbp->bio_data = uma_zalloc(sc->sc_zone_64k, M_NOWAIT);
965		g_raid3_64k_requested++;
966	} else if (size > 4096) {
967		cbp->bio_data = uma_zalloc(sc->sc_zone_16k, M_NOWAIT);
968		g_raid3_16k_requested++;
969	} else {
970		cbp->bio_data = uma_zalloc(sc->sc_zone_4k, M_NOWAIT);
971		g_raid3_4k_requested++;
972	}
973	if (cbp->bio_data == NULL) {
974		if (size > 16384)
975			g_raid3_64k_failed++;
976		if (size > 4096)
977			g_raid3_16k_failed++;
978		else
979			g_raid3_4k_failed++;
980		pbp->bio_children--;
981		g_destroy_bio(cbp);
982		return (NULL);
983	}
984	G_RAID3_NEXT_BIO(cbp) = NULL;
985	if (G_RAID3_HEAD_BIO(pbp) == NULL)
986		G_RAID3_HEAD_BIO(pbp) = cbp;
987	else {
988		G_RAID3_FOREACH_BIO(pbp, bp) {
989			if (G_RAID3_NEXT_BIO(bp) == NULL) {
990				G_RAID3_NEXT_BIO(bp) = cbp;
991				break;
992			}
993		}
994	}
995	return (cbp);
996}
997
998static void
999g_raid3_scatter(struct bio *pbp)
1000{
1001	struct g_raid3_softc *sc;
1002	struct g_raid3_disk *disk;
1003	struct bio *bp, *cbp;
1004	off_t atom, cadd, padd, left;
1005
1006	sc = pbp->bio_to->geom->softc;
1007	bp = NULL;
1008	if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_NOPARITY) == 0) {
1009		/*
1010		 * Find bio for which we should calculate data.
1011		 */
1012		G_RAID3_FOREACH_BIO(pbp, cbp) {
1013			if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) {
1014				bp = cbp;
1015				break;
1016			}
1017		}
1018		KASSERT(bp != NULL, ("NULL parity bio."));
1019	}
1020	atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
1021	cadd = padd = 0;
1022	for (left = pbp->bio_length; left > 0; left -= sc->sc_sectorsize) {
1023		G_RAID3_FOREACH_BIO(pbp, cbp) {
1024			if (cbp == bp)
1025				continue;
1026			bcopy(pbp->bio_data + padd, cbp->bio_data + cadd, atom);
1027			padd += atom;
1028		}
1029		cadd += atom;
1030	}
1031	if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_NOPARITY) == 0) {
1032		struct bio *tmpbp;
1033
1034		/*
1035		 * Calculate parity.
1036		 */
1037		bzero(bp->bio_data, bp->bio_length);
1038		G_RAID3_FOREACH_SAFE_BIO(pbp, cbp, tmpbp) {
1039			if (cbp == bp)
1040				continue;
1041			g_raid3_xor(cbp->bio_data, bp->bio_data, bp->bio_data,
1042			    bp->bio_length);
1043			if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_NODISK) != 0)
1044				g_raid3_destroy_bio(sc, cbp);
1045		}
1046	}
1047	G_RAID3_FOREACH_BIO(pbp, cbp) {
1048		struct g_consumer *cp;
1049
1050		disk = cbp->bio_caller2;
1051		cp = disk->d_consumer;
1052		cbp->bio_to = cp->provider;
1053		G_RAID3_LOGREQ(3, cbp, "Sending request.");
1054		KASSERT(cp->acr == 1 && cp->acw == 1 && cp->ace == 1,
1055		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
1056		    cp->acr, cp->acw, cp->ace));
1057		cp->index++;
1058		g_io_request(cbp, cp);
1059	}
1060}
1061
1062static void
1063g_raid3_gather(struct bio *pbp)
1064{
1065	struct g_raid3_softc *sc;
1066	struct g_raid3_disk *disk;
1067	struct bio *xbp, *fbp, *cbp;
1068	off_t atom, cadd, padd, left;
1069
1070	sc = pbp->bio_to->geom->softc;
1071	/*
1072	 * Find bio for which we have to calculate data.
1073	 * While going through this path, check if all requests
1074	 * succeeded, if not, deny whole request.
1075	 * If we're in COMPLETE mode, we allow one request to fail,
1076	 * so if we find one, we're sending it to the parity consumer.
1077	 * If there are more failed requests, we deny whole request.
1078	 */
1079	xbp = fbp = NULL;
1080	G_RAID3_FOREACH_BIO(pbp, cbp) {
1081		if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) {
1082			KASSERT(xbp == NULL, ("More than one parity bio."));
1083			xbp = cbp;
1084		}
1085		if (cbp->bio_error == 0)
1086			continue;
1087		/*
1088		 * Found failed request.
1089		 */
1090		G_RAID3_LOGREQ(0, cbp, "Request failed.");
1091		disk = cbp->bio_caller2;
1092		if (disk != NULL) {
1093			/*
1094			 * Actually this is pointless to bump genid,
1095			 * because whole device is fucked up.
1096			 */
1097			sc->sc_bump_id |= G_RAID3_BUMP_GENID_IMM;
1098			g_raid3_event_send(disk,
1099			    G_RAID3_DISK_STATE_DISCONNECTED,
1100			    G_RAID3_EVENT_DONTWAIT);
1101		}
1102		if (fbp == NULL) {
1103			if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_DEGRADED) != 0) {
1104				/*
1105				 * We are already in degraded mode, so we can't
1106				 * accept any failures.
1107				 */
1108				if (pbp->bio_error == 0)
1109					pbp->bio_error = fbp->bio_error;
1110			} else {
1111				fbp = cbp;
1112			}
1113		} else {
1114			/*
1115			 * Next failed request, that's too many.
1116			 */
1117			if (pbp->bio_error == 0)
1118				pbp->bio_error = fbp->bio_error;
1119		}
1120	}
1121	if (pbp->bio_error != 0)
1122		goto finish;
1123	if (fbp != NULL && (pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) {
1124		pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_VERIFY;
1125		if (xbp != fbp)
1126			g_raid3_replace_bio(xbp, fbp);
1127		g_raid3_destroy_bio(sc, fbp);
1128	} else if (fbp != NULL) {
1129		struct g_consumer *cp;
1130
1131		/*
1132		 * One request failed, so send the same request to
1133		 * the parity consumer.
1134		 */
1135		disk = pbp->bio_driver2;
1136		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) {
1137			pbp->bio_error = fbp->bio_error;
1138			goto finish;
1139		}
1140		pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED;
1141		pbp->bio_inbed--;
1142		fbp->bio_flags &= ~(BIO_DONE | BIO_ERROR);
1143		if (disk->d_no == sc->sc_ndisks - 1)
1144			fbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
1145		fbp->bio_error = 0;
1146		fbp->bio_completed = 0;
1147		fbp->bio_children = 0;
1148		fbp->bio_inbed = 0;
1149		cp = disk->d_consumer;
1150		fbp->bio_caller2 = disk;
1151		fbp->bio_to = cp->provider;
1152		G_RAID3_LOGREQ(3, fbp, "Sending request (recover).");
1153		KASSERT(cp->acr == 1 && cp->acw == 1 && cp->ace == 1,
1154		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
1155		    cp->acr, cp->acw, cp->ace));
1156		cp->index++;
1157		g_io_request(fbp, cp);
1158		return;
1159	}
1160	if (xbp != NULL) {
1161		/*
1162		 * Calculate parity.
1163		 */
1164		G_RAID3_FOREACH_BIO(pbp, cbp) {
1165			if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0)
1166				continue;
1167			g_raid3_xor(cbp->bio_data, xbp->bio_data, xbp->bio_data,
1168			    xbp->bio_length);
1169		}
1170		xbp->bio_cflags &= ~G_RAID3_BIO_CFLAG_PARITY;
1171		if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) {
1172			if (!g_raid3_is_zero(xbp)) {
1173				g_raid3_parity_mismatch++;
1174				pbp->bio_error = EIO;
1175				goto finish;
1176			}
1177			g_raid3_destroy_bio(sc, xbp);
1178		}
1179	}
1180	atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
1181	cadd = padd = 0;
1182	for (left = pbp->bio_length; left > 0; left -= sc->sc_sectorsize) {
1183		G_RAID3_FOREACH_BIO(pbp, cbp) {
1184			bcopy(cbp->bio_data + cadd, pbp->bio_data + padd, atom);
1185			pbp->bio_completed += atom;
1186			padd += atom;
1187		}
1188		cadd += atom;
1189	}
1190finish:
1191	if (pbp->bio_error == 0)
1192		G_RAID3_LOGREQ(3, pbp, "Request finished.");
1193	else {
1194		if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0)
1195			G_RAID3_LOGREQ(1, pbp, "Verification error.");
1196		else
1197			G_RAID3_LOGREQ(0, pbp, "Request failed.");
1198	}
1199	pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_MASK;
1200	g_io_deliver(pbp, pbp->bio_error);
1201	while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL)
1202		g_raid3_destroy_bio(sc, cbp);
1203}
1204
1205static void
1206g_raid3_done(struct bio *bp)
1207{
1208	struct g_raid3_softc *sc;
1209
1210	sc = bp->bio_from->geom->softc;
1211	bp->bio_cflags |= G_RAID3_BIO_CFLAG_REGULAR;
1212	G_RAID3_LOGREQ(3, bp, "Regular request done (error=%d).", bp->bio_error);
1213	mtx_lock(&sc->sc_queue_mtx);
1214	bioq_insert_head(&sc->sc_queue, bp);
1215	wakeup(sc);
1216	wakeup(&sc->sc_queue);
1217	mtx_unlock(&sc->sc_queue_mtx);
1218}
1219
1220static void
1221g_raid3_regular_request(struct bio *cbp)
1222{
1223	struct g_raid3_softc *sc;
1224	struct g_raid3_disk *disk;
1225	struct bio *pbp;
1226
1227	g_topology_assert_not();
1228
1229	cbp->bio_from->index--;
1230	pbp = cbp->bio_parent;
1231	sc = pbp->bio_to->geom->softc;
1232	disk = cbp->bio_from->private;
1233	if (disk == NULL) {
1234		g_topology_lock();
1235		g_raid3_kill_consumer(sc, cbp->bio_from);
1236		g_topology_unlock();
1237	}
1238
1239	G_RAID3_LOGREQ(3, cbp, "Request finished.");
1240	pbp->bio_inbed++;
1241	KASSERT(pbp->bio_inbed <= pbp->bio_children,
1242	    ("bio_inbed (%u) is bigger than bio_children (%u).", pbp->bio_inbed,
1243	    pbp->bio_children));
1244	if (pbp->bio_inbed != pbp->bio_children)
1245		return;
1246	switch (pbp->bio_cmd) {
1247	case BIO_READ:
1248		g_raid3_gather(pbp);
1249		break;
1250	case BIO_WRITE:
1251	case BIO_DELETE:
1252	    {
1253		int error = 0;
1254
1255		pbp->bio_completed = pbp->bio_length;
1256		while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL) {
1257			if (cbp->bio_error != 0) {
1258				disk = cbp->bio_caller2;
1259				if (disk != NULL) {
1260					sc->sc_bump_id |=
1261					    G_RAID3_BUMP_GENID_IMM;
1262					g_raid3_event_send(disk,
1263					    G_RAID3_DISK_STATE_DISCONNECTED,
1264					    G_RAID3_EVENT_DONTWAIT);
1265				}
1266				if (error == 0)
1267					error = cbp->bio_error;
1268				else if (pbp->bio_error == 0) {
1269					/*
1270					 * Next failed request, that's too many.
1271					 */
1272					pbp->bio_error = error;
1273				}
1274			}
1275			g_raid3_destroy_bio(sc, cbp);
1276		}
1277		if (pbp->bio_error == 0)
1278			G_RAID3_LOGREQ(3, pbp, "Request finished.");
1279		else
1280			G_RAID3_LOGREQ(0, pbp, "Request failed.");
1281		pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_DEGRADED;
1282		pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_NOPARITY;
1283		g_io_deliver(pbp, pbp->bio_error);
1284		break;
1285	    }
1286	}
1287}
1288
1289static void
1290g_raid3_sync_done(struct bio *bp)
1291{
1292	struct g_raid3_softc *sc;
1293
1294	G_RAID3_LOGREQ(3, bp, "Synchronization request delivered.");
1295	sc = bp->bio_from->geom->softc;
1296	bp->bio_cflags |= G_RAID3_BIO_CFLAG_SYNC;
1297	mtx_lock(&sc->sc_queue_mtx);
1298	bioq_insert_head(&sc->sc_queue, bp);
1299	wakeup(sc);
1300	wakeup(&sc->sc_queue);
1301	mtx_unlock(&sc->sc_queue_mtx);
1302}
1303
1304static void
1305g_raid3_start(struct bio *bp)
1306{
1307	struct g_raid3_softc *sc;
1308
1309	sc = bp->bio_to->geom->softc;
1310	/*
1311	 * If sc == NULL or there are no valid disks, provider's error
1312	 * should be set and g_raid3_start() should not be called at all.
1313	 */
1314	KASSERT(sc != NULL && (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
1315	    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE),
1316	    ("Provider's error should be set (error=%d)(device=%s).",
1317	    bp->bio_to->error, bp->bio_to->name));
1318	G_RAID3_LOGREQ(3, bp, "Request received.");
1319
1320	switch (bp->bio_cmd) {
1321	case BIO_READ:
1322	case BIO_WRITE:
1323	case BIO_DELETE:
1324		break;
1325	case BIO_GETATTR:
1326	default:
1327		g_io_deliver(bp, EOPNOTSUPP);
1328		return;
1329	}
1330	mtx_lock(&sc->sc_queue_mtx);
1331	bioq_insert_tail(&sc->sc_queue, bp);
1332	G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc);
1333	wakeup(sc);
1334	mtx_unlock(&sc->sc_queue_mtx);
1335}
1336
1337/*
1338 * Send one synchronization request.
1339 */
1340static void
1341g_raid3_sync_one(struct g_raid3_softc *sc)
1342{
1343	struct g_raid3_disk *disk;
1344	struct bio *bp;
1345
1346	KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED,
1347	    ("Wrong device state (%s, %s).", sc->sc_name,
1348	    g_raid3_device_state2str(sc->sc_state)));
1349	disk = sc->sc_syncdisk;
1350	KASSERT(disk != NULL, ("No sync disk (%s).", sc->sc_name));
1351	KASSERT(disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
1352	    ("Disk %s is not marked for synchronization.",
1353	    g_raid3_get_diskname(disk)));
1354
1355	bp = g_new_bio();
1356	if (bp == NULL)
1357		return;
1358	bp->bio_parent = NULL;
1359	bp->bio_cmd = BIO_READ;
1360	bp->bio_offset = disk->d_sync.ds_offset * (sc->sc_ndisks - 1);
1361	bp->bio_length = MIN(MAXPHYS, sc->sc_mediasize - bp->bio_offset);
1362	bp->bio_cflags = 0;
1363	bp->bio_done = g_raid3_sync_done;
1364	bp->bio_data = disk->d_sync.ds_data;
1365	if (bp->bio_data == NULL) {
1366		g_destroy_bio(bp);
1367		return;
1368	}
1369	bp->bio_cflags = G_RAID3_BIO_CFLAG_REGSYNC;
1370	disk->d_sync.ds_offset += bp->bio_length / (sc->sc_ndisks - 1);
1371	bp->bio_to = sc->sc_provider;
1372	G_RAID3_LOGREQ(3, bp, "Sending synchronization request.");
1373	disk->d_sync.ds_consumer->index++;
1374	g_io_request(bp, disk->d_sync.ds_consumer);
1375}
1376
1377static void
1378g_raid3_sync_request(struct bio *bp)
1379{
1380	struct g_raid3_softc *sc;
1381	struct g_raid3_disk *disk;
1382
1383	bp->bio_from->index--;
1384	sc = bp->bio_from->geom->softc;
1385	disk = bp->bio_from->private;
1386	if (disk == NULL) {
1387		g_topology_lock();
1388		g_raid3_kill_consumer(sc, bp->bio_from);
1389		g_topology_unlock();
1390		g_destroy_bio(bp);
1391		return;
1392	}
1393
1394	/*
1395	 * Synchronization request.
1396	 */
1397	switch (bp->bio_cmd) {
1398	case BIO_READ:
1399	    {
1400		struct g_consumer *cp;
1401		u_char *dst, *src;
1402		off_t left;
1403		u_int atom;
1404
1405		if (bp->bio_error != 0) {
1406			G_RAID3_LOGREQ(0, bp,
1407			    "Synchronization request failed (error=%d).",
1408			    bp->bio_error);
1409			g_destroy_bio(bp);
1410			return;
1411		}
1412		G_RAID3_LOGREQ(3, bp, "Synchronization request finished.");
1413		atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
1414		dst = src = bp->bio_data;
1415		if (disk->d_no == sc->sc_ndisks - 1) {
1416			u_int n;
1417
1418			/* Parity component. */
1419			for (left = bp->bio_length; left > 0;
1420			    left -= sc->sc_sectorsize) {
1421				bcopy(src, dst, atom);
1422				src += atom;
1423				for (n = 1; n < sc->sc_ndisks - 1; n++) {
1424					g_raid3_xor(src, dst, dst, atom);
1425					src += atom;
1426				}
1427				dst += atom;
1428			}
1429		} else {
1430			/* Regular component. */
1431			src += atom * disk->d_no;
1432			for (left = bp->bio_length; left > 0;
1433			    left -= sc->sc_sectorsize) {
1434				bcopy(src, dst, atom);
1435				src += sc->sc_sectorsize;
1436				dst += atom;
1437			}
1438		}
1439		bp->bio_offset /= sc->sc_ndisks - 1;
1440		bp->bio_length /= sc->sc_ndisks - 1;
1441		bp->bio_cmd = BIO_WRITE;
1442		bp->bio_cflags = 0;
1443		bp->bio_children = bp->bio_inbed = 0;
1444		cp = disk->d_consumer;
1445		KASSERT(cp->acr == 1 && cp->acw == 1 && cp->ace == 1,
1446		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
1447		    cp->acr, cp->acw, cp->ace));
1448		cp->index++;
1449		g_io_request(bp, cp);
1450		return;
1451	    }
1452	case BIO_WRITE:
1453	    {
1454		struct g_raid3_disk_sync *sync;
1455
1456		if (bp->bio_error != 0) {
1457			G_RAID3_LOGREQ(0, bp,
1458			    "Synchronization request failed (error=%d).",
1459			    bp->bio_error);
1460			g_destroy_bio(bp);
1461			sc->sc_bump_id |= G_RAID3_BUMP_GENID_IMM;
1462			g_raid3_event_send(disk,
1463			    G_RAID3_DISK_STATE_DISCONNECTED,
1464			    G_RAID3_EVENT_DONTWAIT);
1465			return;
1466		}
1467		G_RAID3_LOGREQ(3, bp, "Synchronization request finished.");
1468		sync = &disk->d_sync;
1469		sync->ds_offset_done = bp->bio_offset + bp->bio_length;
1470		g_destroy_bio(bp);
1471		if (sync->ds_resync != -1)
1472			return;
1473		if (sync->ds_offset_done ==
1474		    sc->sc_mediasize / (sc->sc_ndisks - 1)) {
1475			/*
1476			 * Disk up-to-date, activate it.
1477			 */
1478			g_raid3_event_send(disk, G_RAID3_DISK_STATE_ACTIVE,
1479			    G_RAID3_EVENT_DONTWAIT);
1480			return;
1481		} else if (sync->ds_offset_done % (MAXPHYS * 100) == 0) {
1482			/*
1483			 * Update offset_done on every 100 blocks.
1484			 * XXX: This should be configurable.
1485			 */
1486			g_topology_lock();
1487			g_raid3_update_metadata(disk);
1488			g_topology_unlock();
1489		}
1490		return;
1491	    }
1492	default:
1493		KASSERT(1 == 0, ("Invalid command here: %u (device=%s)",
1494		    bp->bio_cmd, sc->sc_name));
1495		break;
1496	}
1497}
1498
/*
 * Turn a request addressed to the RAID3 provider into per-component child
 * requests and start them.
 *
 * Returns 0 when the request has been taken care of (children sent, or the
 * request delivered with EIO), and ENOMEM when a child could not be cloned;
 * in the latter case all children created so far are destroyed and the
 * parent is left untouched for the caller to retry.
 */
static int
g_raid3_register_request(struct bio *pbp)
{
	struct g_raid3_softc *sc;
	struct g_raid3_disk *disk;
	struct g_consumer *cp;
	struct bio *cbp;
	off_t offset, length;
	u_int n, ndisks;
	int round_robin, verify;

	ndisks = 0;
	sc = pbp->bio_to->geom->softc;
	/*
	 * A synchronization read makes no sense once there is no disk
	 * being synchronized any more.
	 */
	if ((pbp->bio_cflags & G_RAID3_BIO_CFLAG_REGSYNC) != 0 &&
	    sc->sc_syncdisk == NULL) {
		g_io_deliver(pbp, EIO);
		return (0);
	}
	g_raid3_init_bio(pbp);
	/* Per-component length and offset of the request. */
	length = pbp->bio_length / (sc->sc_ndisks - 1);
	offset = pbp->bio_offset / (sc->sc_ndisks - 1);
	round_robin = verify = 0;
	switch (pbp->bio_cmd) {
	case BIO_READ:
		/*
		 * A verifying read (COMPLETE state only) touches all
		 * components; a plain read only needs sc_ndisks - 1 of them.
		 */
		if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_VERIFY) != 0 &&
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
			pbp->bio_pflags |= G_RAID3_BIO_PFLAG_VERIFY;
			verify = 1;
			ndisks = sc->sc_ndisks;
		} else {
			verify = 0;
			ndisks = sc->sc_ndisks - 1;
		}
		if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0 &&
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
			round_robin = 1;
		} else {
			round_robin = 0;
		}
		KASSERT(!round_robin || !verify,
		    ("ROUND-ROBIN and VERIFY are mutually exclusive."));
		/*
		 * bio_driver2 names the disk g_raid3_gather() re-issues a
		 * failed child to; by default that is the last (parity)
		 * component.
		 */
		pbp->bio_driver2 = &sc->sc_disks[sc->sc_ndisks - 1];
		break;
	case BIO_WRITE:
	case BIO_DELETE:
	    {
		struct g_raid3_disk_sync *sync;

		if (sc->sc_idle)
			g_raid3_unidle(sc);

		ndisks = sc->sc_ndisks;

		/*
		 * If a synchronization is running and this write overlaps
		 * the region currently being synchronized, remember the
		 * (MAXPHYS-aligned) offset from which the synchronization
		 * has to be redone.
		 */
		if (sc->sc_syncdisk == NULL)
			break;
		sync = &sc->sc_syncdisk->d_sync;
		if (offset >= sync->ds_offset)
			break;
		if (offset + length <= sync->ds_offset_done)
			break;
		if (offset >= sync->ds_resync && sync->ds_resync != -1)
			break;
		sync->ds_resync = offset - (offset % MAXPHYS);
		break;
	    }
	}
	/*
	 * Create one child per component and decide which disk it goes to.
	 */
	for (n = 0; n < ndisks; n++) {
		disk = &sc->sc_disks[n];
		cbp = g_raid3_clone_bio(sc, pbp);
		if (cbp == NULL) {
			/* Out of memory: undo and let the caller retry. */
			while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL)
				g_raid3_destroy_bio(sc, cbp);
			return (ENOMEM);
		}
		cbp->bio_offset = offset;
		cbp->bio_length = length;
		cbp->bio_done = g_raid3_done;
		switch (pbp->bio_cmd) {
		case BIO_READ:
			if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) {
				/*
				 * Replace invalid component with the parity
				 * component.
				 */
				disk = &sc->sc_disks[sc->sc_ndisks - 1];
				cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
				pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED;
			} else if (round_robin &&
			    disk->d_no == sc->sc_round_robin) {
				/*
				 * In round-robin mode skip one data component
				 * and use parity component when reading.
				 * The skipped disk becomes the fallback
				 * (bio_driver2) for a failed child.
				 */
				pbp->bio_driver2 = disk;
				disk = &sc->sc_disks[sc->sc_ndisks - 1];
				cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
				sc->sc_round_robin++;
				round_robin = 0;
			} else if (verify && disk->d_no == sc->sc_ndisks - 1) {
				cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
			}
			break;
		case BIO_WRITE:
		case BIO_DELETE:
			if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
			    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
				if (n == ndisks - 1) {
					/*
					 * Active parity component, mark it as such.
					 */
					cbp->bio_cflags |=
					    G_RAID3_BIO_CFLAG_PARITY;
				}
			} else {
				pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED;
				if (n == ndisks - 1) {
					/*
					 * Parity component is not connected,
					 * so destroy its request.
					 */
					pbp->bio_pflags |=
					    G_RAID3_BIO_PFLAG_NOPARITY;
					g_raid3_destroy_bio(sc, cbp);
					cbp = NULL;
				} else {
					/*
					 * Data component is not connected:
					 * keep the child (flagged NODISK,
					 * without a disk) so that its data
					 * still goes into the parity; see
					 * g_raid3_scatter().
					 */
					cbp->bio_cflags |=
					    G_RAID3_BIO_CFLAG_NODISK;
					disk = NULL;
				}
			}
			break;
		}
		/* Remember the target disk for the send and completion paths. */
		if (cbp != NULL)
			cbp->bio_caller2 = disk;
	}
	switch (pbp->bio_cmd) {
	case BIO_READ:
		if (round_robin) {
			/*
			 * If we are in round-robin mode and 'round_robin' is
			 * still 1, it means, that we skipped parity component
			 * for this read and must reset sc_round_robin field.
			 */
			sc->sc_round_robin = 0;
		}
		/* Reads carry no payload to prepare; send children as is. */
		G_RAID3_FOREACH_BIO(pbp, cbp) {
			disk = cbp->bio_caller2;
			cp = disk->d_consumer;
			cbp->bio_to = cp->provider;
			G_RAID3_LOGREQ(3, cbp, "Sending request.");
			KASSERT(cp->acr == 1 && cp->acw == 1 && cp->ace == 1,
			    ("Consumer %s not opened (r%dw%de%d).",
			    cp->provider->name, cp->acr, cp->acw, cp->ace));
			cp->index++;
			g_io_request(cbp, cp);
		}
		break;
	case BIO_WRITE:
	case BIO_DELETE:
		/*
		 * Bump syncid on first write.
		 */
		if ((sc->sc_bump_id & G_RAID3_BUMP_SYNCID_OFW) != 0) {
			sc->sc_bump_id &= ~G_RAID3_BUMP_SYNCID;
			g_topology_lock();
			g_raid3_bump_syncid(sc);
			g_topology_unlock();
		}
		/* Fill the children's buffers, compute parity and send. */
		g_raid3_scatter(pbp);
		break;
	}
	return (0);
}
1672
1673static int
1674g_raid3_can_destroy(struct g_raid3_softc *sc)
1675{
1676	struct g_geom *gp;
1677	struct g_consumer *cp;
1678
1679	g_topology_assert();
1680	gp = sc->sc_geom;
1681	LIST_FOREACH(cp, &gp->consumer, consumer) {
1682		if (g_raid3_is_busy(sc, cp))
1683			return (0);
1684	}
1685	gp = sc->sc_sync.ds_geom;
1686	LIST_FOREACH(cp, &gp->consumer, consumer) {
1687		if (g_raid3_is_busy(sc, cp))
1688			return (0);
1689	}
1690	G_RAID3_DEBUG(2, "No I/O requests for %s, it can be destroyed.",
1691	    sc->sc_name);
1692	return (1);
1693}
1694
1695static int
1696g_raid3_try_destroy(struct g_raid3_softc *sc)
1697{
1698
1699	g_topology_lock();
1700	if (!g_raid3_can_destroy(sc)) {
1701		g_topology_unlock();
1702		return (0);
1703	}
1704	if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_WAIT) != 0) {
1705		g_topology_unlock();
1706		G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__,
1707		    &sc->sc_worker);
1708		wakeup(&sc->sc_worker);
1709		sc->sc_worker = NULL;
1710	} else {
1711		g_raid3_destroy_device(sc);
1712		g_topology_unlock();
1713		free(sc->sc_disks, M_RAID3);
1714		free(sc, M_RAID3);
1715	}
1716	return (1);
1717}
1718
/*
 * Worker thread.
 *
 * One loop iteration handles, in this order of preference: one pending
 * event (if the topology lock can be taken without blocking), one
 * synchronization step (if a disk is being synchronized and either the
 * queue is empty or more than g_raid3_reqs_per_sync requests were handled
 * since the last step), or one bio from the device queue.  The thread exits
 * once the device has been flagged for destruction and
 * g_raid3_try_destroy() succeeds.
 */
static void
g_raid3_worker(void *arg)
{
	struct g_raid3_softc *sc;
	struct g_raid3_disk *disk;
	struct g_raid3_disk_sync *sync;
	struct g_raid3_event *ep;
	struct bio *bp;
	u_int nreqs;

	sc = arg;
	mtx_lock_spin(&sched_lock);
	sched_prio(curthread, PRIBIO);
	mtx_unlock_spin(&sched_lock);

	/* Number of queue requests handled since the last sync step. */
	nreqs = 0;
	for (;;) {
		G_RAID3_DEBUG(5, "%s: Let's see...", __func__);
		/*
		 * First take a look at events.
		 * This is important to handle events before any I/O requests.
		 */
		ep = g_raid3_event_get(sc);
		if (ep != NULL && g_topology_try_lock()) {
			g_raid3_event_remove(sc, ep);
			if ((ep->e_flags & G_RAID3_EVENT_DEVICE) != 0) {
				/* Update only device status. */
				G_RAID3_DEBUG(3,
				    "Running event for device %s.",
				    sc->sc_name);
				ep->e_error = 0;
				g_raid3_update_device(sc, 1);
			} else {
				/* Update disk status. */
				G_RAID3_DEBUG(3, "Running event for disk %s.",
				     g_raid3_get_diskname(ep->e_disk));
				ep->e_error = g_raid3_update_disk(ep->e_disk,
				    ep->e_state);
				if (ep->e_error == 0)
					g_raid3_update_device(sc, 0);
			}
			g_topology_unlock();
			if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0) {
				/* Nobody waits for this event; free it here. */
				KASSERT(ep->e_error == 0,
				    ("Error cannot be handled."));
				g_raid3_event_free(ep);
			} else {
				/* Mark the event done and wake up its sleeper. */
				ep->e_flags |= G_RAID3_EVENT_DONE;
				G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__,
				    ep);
				mtx_lock(&sc->sc_events_mtx);
				wakeup(ep);
				mtx_unlock(&sc->sc_events_mtx);
			}
			if ((sc->sc_flags &
			    G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
				if (g_raid3_try_destroy(sc))
					kthread_exit(0);
			}
			G_RAID3_DEBUG(5, "%s: I'm here 1.", __func__);
			continue;
		}
		/*
		 * Now I/O requests.
		 * Note that 'ep' may still be non-NULL here: an event is
		 * pending, but the topology lock could not be taken.
		 */
		/* Get first request from the queue. */
		mtx_lock(&sc->sc_queue_mtx);
		bp = bioq_first(&sc->sc_queue);
		if (bp == NULL) {
			if (ep != NULL) {
				/*
				 * No I/O requests and topology lock was
				 * already held? Try again.
				 */
				mtx_unlock(&sc->sc_queue_mtx);
				tsleep(ep, PRIBIO, "r3:top1", hz / 5);
				continue;
			}
			if ((sc->sc_flags &
			    G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
				/*
				 * The queue mutex is dropped around the
				 * destroy attempt and re-taken if it fails.
				 */
				mtx_unlock(&sc->sc_queue_mtx);
				if (g_raid3_try_destroy(sc))
					kthread_exit(0);
				mtx_lock(&sc->sc_queue_mtx);
			}
		}
		if (sc->sc_syncdisk != NULL &&
		    (bp == NULL || nreqs > g_raid3_reqs_per_sync)) {
			mtx_unlock(&sc->sc_queue_mtx);
			/*
			 * It is time for synchronization...
			 */
			nreqs = 0;
			disk = sc->sc_syncdisk;
			sync = &disk->d_sync;
			/*
			 * Send the next sync request only if the component
			 * is not fully covered yet and the previous request
			 * has completed (offset == offset_done).
			 */
			if (sync->ds_offset <
			    sc->sc_mediasize / (sc->sc_ndisks - 1) &&
			    sync->ds_offset == sync->ds_offset_done) {
				if (sync->ds_resync != -1) {
					/* Restart from the requested offset. */
					sync->ds_offset = sync->ds_resync;
					sync->ds_offset_done = sync->ds_resync;
					sync->ds_resync = -1;
				}
				g_raid3_sync_one(sc);
			}
			G_RAID3_DEBUG(5, "%s: I'm here 2.", __func__);
			/* Jumps into the SYNC branch below to rate-limit. */
			goto sleep;
		}
		if (bp == NULL) {
			/*
			 * Nothing to do.  Both sleeps below pass PDROP, so
			 * the queue mutex is not held when they return.
			 */
			if (g_raid3_check_idle(sc)) {
				u_int idletime;

				idletime = g_raid3_idletime;
				if (idletime == 0)
					idletime = 1;
				idletime *= hz;
				if (msleep(sc, &sc->sc_queue_mtx, PRIBIO | PDROP,
				    "r3:w1", idletime) == EWOULDBLOCK) {
					G_RAID3_DEBUG(5, "%s: I'm here 3.",
					    __func__);
					/*
					 * No I/O requests in 'idletime'
					 * seconds, so mark components as clean.
					 */
					g_raid3_idle(sc);
				}
				G_RAID3_DEBUG(5, "%s: I'm here 4.", __func__);
			} else {
				MSLEEP(sc, &sc->sc_queue_mtx, PRIBIO | PDROP,
				    "r3:w2", 0);
				G_RAID3_DEBUG(5, "%s: I'm here 5.", __func__);
			}
			continue;
		}
		nreqs++;
		bioq_remove(&sc->sc_queue, bp);
		mtx_unlock(&sc->sc_queue_mtx);

		/*
		 * Dispatch on the bio's origin: completed regular child,
		 * completed synchronization request, or a new request for
		 * the provider.
		 */
		if ((bp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR) != 0) {
			g_raid3_regular_request(bp);
		} else if ((bp->bio_cflags & G_RAID3_BIO_CFLAG_SYNC) != 0) {
			u_int timeout, sps;

			g_raid3_sync_request(bp);
sleep:
			/*
			 * Throttle synchronization to g_raid3_syncs_per_sec
			 * requests per second; 0 disables the throttling.
			 */
			sps = atomic_load_acq_int(&g_raid3_syncs_per_sec);
			if (sps == 0) {
				G_RAID3_DEBUG(5, "%s: I'm here 6.", __func__);
				continue;
			}
			if (ep != NULL) {
				/*
				 * We have some pending events, don't sleep now.
				 */
				G_RAID3_DEBUG(5, "%s: I'm here 7.", __func__);
				tsleep(ep, PRIBIO, "r3:top2", hz / 5);
				continue;
			}
			mtx_lock(&sc->sc_queue_mtx);
			if (bioq_first(&sc->sc_queue) != NULL) {
				/* There is work queued; don't sleep. */
				mtx_unlock(&sc->sc_queue_mtx);
				G_RAID3_DEBUG(5, "%s: I'm here 8.", __func__);
				continue;
			}
			timeout = hz / sps;
			if (timeout == 0)
				timeout = 1;
			MSLEEP(sc, &sc->sc_queue_mtx, PRIBIO | PDROP, "r3:w2",
			    timeout);
		} else {
			if (g_raid3_register_request(bp) != 0) {
				/*
				 * Registration failed (out of memory): put
				 * the request back and wait a moment, or
				 * until a completion wakes up &sc->sc_queue.
				 */
				mtx_lock(&sc->sc_queue_mtx);
				bioq_insert_tail(&sc->sc_queue, bp);
				MSLEEP(&sc->sc_queue, &sc->sc_queue_mtx,
				    PRIBIO | PDROP, "r3:lowmem", hz / 10);
			}
		}
		G_RAID3_DEBUG(5, "%s: I'm here 9.", __func__);
	}
}
1902
1903/*
1904 * Open disk's consumer if needed.
1905 */
1906static void
1907g_raid3_update_access(struct g_raid3_disk *disk)
1908{
1909	struct g_provider *pp;
1910
1911	g_topology_assert();
1912
1913	pp = disk->d_softc->sc_provider;
1914	if (pp == NULL)
1915		return;
1916	if (pp->acw > 0) {
1917		if ((disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) == 0) {
1918			G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.",
1919			    g_raid3_get_diskname(disk), disk->d_softc->sc_name);
1920			disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
1921		}
1922	} else if (pp->acw == 0) {
1923		if ((disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) != 0) {
1924			G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.",
1925			    g_raid3_get_diskname(disk), disk->d_softc->sc_name);
1926			disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
1927		}
1928	}
1929}
1930
1931static void
1932g_raid3_sync_start(struct g_raid3_softc *sc)
1933{
1934	struct g_raid3_disk *disk;
1935	int error;
1936	u_int n;
1937
1938	g_topology_assert();
1939
1940	KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED,
1941	    ("Device not in DEGRADED state (%s, %u).", sc->sc_name,
1942	    sc->sc_state));
1943	KASSERT(sc->sc_syncdisk == NULL, ("Syncdisk is not NULL (%s, %u).",
1944	    sc->sc_name, sc->sc_state));
1945	disk = NULL;
1946	for (n = 0; n < sc->sc_ndisks; n++) {
1947		if (sc->sc_disks[n].d_state != G_RAID3_DISK_STATE_SYNCHRONIZING)
1948			continue;
1949		disk = &sc->sc_disks[n];
1950		break;
1951	}
1952	if (disk == NULL)
1953		return;
1954
1955	G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s.", sc->sc_name,
1956	    g_raid3_get_diskname(disk));
1957	disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
1958	KASSERT(disk->d_sync.ds_consumer == NULL,
1959	    ("Sync consumer already exists (device=%s, disk=%s).",
1960	    sc->sc_name, g_raid3_get_diskname(disk)));
1961	disk->d_sync.ds_consumer = g_new_consumer(sc->sc_sync.ds_geom);
1962	disk->d_sync.ds_consumer->private = disk;
1963	disk->d_sync.ds_consumer->index = 0;
1964	error = g_attach(disk->d_sync.ds_consumer, disk->d_softc->sc_provider);
1965	KASSERT(error == 0, ("Cannot attach to %s (error=%d).",
1966	    disk->d_softc->sc_name, error));
1967	error = g_access(disk->d_sync.ds_consumer, 1, 0, 0);
1968	KASSERT(error == 0, ("Cannot open %s (error=%d).",
1969	    disk->d_softc->sc_name, error));
1970	disk->d_sync.ds_data = malloc(MAXPHYS, M_RAID3, M_WAITOK);
1971	sc->sc_syncdisk = disk;
1972}
1973
1974/*
1975 * Stop synchronization process.
1976 * type: 0 - synchronization finished
1977 *       1 - synchronization stopped
1978 */
1979static void
1980g_raid3_sync_stop(struct g_raid3_softc *sc, int type)
1981{
1982	struct g_raid3_disk *disk;
1983
1984	g_topology_assert();
1985	KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED,
1986	    ("Device not in DEGRADED state (%s, %u).", sc->sc_name,
1987	    sc->sc_state));
1988	disk = sc->sc_syncdisk;
1989	sc->sc_syncdisk = NULL;
1990	KASSERT(disk != NULL, ("No disk was synchronized (%s).", sc->sc_name));
1991	KASSERT(disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
1992	    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
1993	    g_raid3_disk_state2str(disk->d_state)));
1994	if (disk->d_sync.ds_consumer == NULL)
1995		return;
1996
1997	if (type == 0) {
1998		G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s finished.",
1999		    disk->d_softc->sc_name, g_raid3_get_diskname(disk));
2000	} else /* if (type == 1) */ {
2001		G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s stopped.",
2002		    disk->d_softc->sc_name, g_raid3_get_diskname(disk));
2003	}
2004	g_raid3_kill_consumer(disk->d_softc, disk->d_sync.ds_consumer);
2005	free(disk->d_sync.ds_data, M_RAID3);
2006	disk->d_sync.ds_consumer = NULL;
2007	disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
2008}
2009
2010static void
2011g_raid3_launch_provider(struct g_raid3_softc *sc)
2012{
2013	struct g_provider *pp;
2014
2015	g_topology_assert();
2016
2017	pp = g_new_providerf(sc->sc_geom, "raid3/%s", sc->sc_name);
2018	pp->mediasize = sc->sc_mediasize;
2019	pp->sectorsize = sc->sc_sectorsize;
2020	sc->sc_provider = pp;
2021	g_error_provider(pp, 0);
2022	G_RAID3_DEBUG(0, "Device %s: provider %s launched.", sc->sc_name,
2023	    pp->name);
2024	if (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED)
2025		g_raid3_sync_start(sc);
2026}
2027
2028static void
2029g_raid3_destroy_provider(struct g_raid3_softc *sc)
2030{
2031	struct bio *bp;
2032
2033	g_topology_assert();
2034	KASSERT(sc->sc_provider != NULL, ("NULL provider (device=%s).",
2035	    sc->sc_name));
2036
2037	g_error_provider(sc->sc_provider, ENXIO);
2038	mtx_lock(&sc->sc_queue_mtx);
2039	while ((bp = bioq_first(&sc->sc_queue)) != NULL) {
2040		bioq_remove(&sc->sc_queue, bp);
2041		g_io_deliver(bp, ENXIO);
2042	}
2043	mtx_unlock(&sc->sc_queue_mtx);
2044	G_RAID3_DEBUG(0, "Device %s: provider %s destroyed.", sc->sc_name,
2045	    sc->sc_provider->name);
2046	sc->sc_provider->flags |= G_PF_WITHER;
2047	g_orphan_provider(sc->sc_provider, ENXIO);
2048	sc->sc_provider = NULL;
2049	if (sc->sc_syncdisk != NULL)
2050		g_raid3_sync_stop(sc, 1);
2051}
2052
2053static void
2054g_raid3_go(void *arg)
2055{
2056	struct g_raid3_softc *sc;
2057
2058	sc = arg;
2059	G_RAID3_DEBUG(0, "Force device %s start due to timeout.", sc->sc_name);
2060	g_raid3_event_send(sc, 0,
2061	    G_RAID3_EVENT_DONTWAIT | G_RAID3_EVENT_DEVICE);
2062}
2063
2064static u_int
2065g_raid3_determine_state(struct g_raid3_disk *disk)
2066{
2067	struct g_raid3_softc *sc;
2068	u_int state;
2069
2070	sc = disk->d_softc;
2071	if (sc->sc_syncid == disk->d_sync.ds_syncid) {
2072		if ((disk->d_flags &
2073		    G_RAID3_DISK_FLAG_SYNCHRONIZING) == 0) {
2074			/* Disk does not need synchronization. */
2075			state = G_RAID3_DISK_STATE_ACTIVE;
2076		} else {
2077			if ((sc->sc_flags &
2078			     G_RAID3_DEVICE_FLAG_NOAUTOSYNC) == 0  ||
2079			    (disk->d_flags &
2080			     G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) {
2081				/*
2082				 * We can start synchronization from
2083				 * the stored offset.
2084				 */
2085				state = G_RAID3_DISK_STATE_SYNCHRONIZING;
2086			} else {
2087				state = G_RAID3_DISK_STATE_STALE;
2088			}
2089		}
2090	} else if (disk->d_sync.ds_syncid < sc->sc_syncid) {
2091		/*
2092		 * Reset all synchronization data for this disk,
2093		 * because if it even was synchronized, it was
2094		 * synchronized to disks with different syncid.
2095		 */
2096		disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING;
2097		disk->d_sync.ds_offset = 0;
2098		disk->d_sync.ds_offset_done = 0;
2099		disk->d_sync.ds_syncid = sc->sc_syncid;
2100		if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) == 0 ||
2101		    (disk->d_flags & G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) {
2102			state = G_RAID3_DISK_STATE_SYNCHRONIZING;
2103		} else {
2104			state = G_RAID3_DISK_STATE_STALE;
2105		}
2106	} else /* if (sc->sc_syncid < disk->d_sync.ds_syncid) */ {
2107		/*
2108		 * Not good, NOT GOOD!
2109		 * It means that device was started on stale disks
2110		 * and more fresh disk just arrive.
2111		 * If there were writes, device is fucked up, sorry.
2112		 * I think the best choice here is don't touch
2113		 * this disk and inform the user laudly.
2114		 */
2115		G_RAID3_DEBUG(0, "Device %s was started before the freshest "
2116		    "disk (%s) arrives!! It will not be connected to the "
2117		    "running device.", sc->sc_name,
2118		    g_raid3_get_diskname(disk));
2119		g_raid3_destroy_disk(disk);
2120		state = G_RAID3_DISK_STATE_NONE;
2121		/* Return immediately, because disk was destroyed. */
2122		return (state);
2123	}
2124	G_RAID3_DEBUG(3, "State for %s disk: %s.",
2125	    g_raid3_get_diskname(disk), g_raid3_disk_state2str(state));
2126	return (state);
2127}
2128
/*
 * Update device state.
 *
 * Re-evaluates the state of device 'sc' from the states of its components
 * and performs the resulting transition:
 *  - STARTING: once enough components are present, pick the newest genid
 *    and syncid, drop outdated components, move the device to COMPLETE or
 *    DEGRADED and queue a state event for every connected disk.
 *  - DEGRADED/COMPLETE: perform pending immediate syncid/genid bumps,
 *    switch between the two states according to the number of ACTIVE
 *    components and launch the provider if it does not exist yet.
 *
 * 'force' is true when the device should start even though one component
 * is missing (the start timeout expired).  When the device cannot run, the
 * G_RAID3_DEVICE_FLAG_DESTROY flag is set on it instead.
 *
 * Must be called with the topology lock held.
 */
static void
g_raid3_update_device(struct g_raid3_softc *sc, boolean_t force)
{
	struct g_raid3_disk *disk;
	u_int state;

	g_topology_assert();

	switch (sc->sc_state) {
	case G_RAID3_DEVICE_STATE_STARTING:
	    {
		u_int n, ndirty, ndisks, genid, syncid;

		KASSERT(sc->sc_provider == NULL,
		    ("Non-NULL provider in STARTING state (%s).", sc->sc_name));
		/*
		 * Are we ready? We are, if all disks are connected or
		 * one disk is missing and 'force' is true.
		 */
		if (g_raid3_ndisks(sc, -1) + force == sc->sc_ndisks) {
			/* All disks arrived on their own: stop the timeout. */
			if (!force)
				callout_drain(&sc->sc_callout);
		} else {
			if (force) {
				/*
				 * Timeout expired, so destroy device.
				 */
				sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
			}
			return;
		}

		/*
		 * Find the biggest genid.
		 */
		genid = 0;
		for (n = 0; n < sc->sc_ndisks; n++) {
			disk = &sc->sc_disks[n];
			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
				continue;
			if (disk->d_genid > genid)
				genid = disk->d_genid;
		}
		sc->sc_genid = genid;
		/*
		 * Remove all disks without the biggest genid.
		 */
		for (n = 0; n < sc->sc_ndisks; n++) {
			disk = &sc->sc_disks[n];
			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
				continue;
			if (disk->d_genid < genid) {
				G_RAID3_DEBUG(0,
				    "Component %s (device %s) broken, skipping.",
				    g_raid3_get_diskname(disk), sc->sc_name);
				g_raid3_destroy_disk(disk);
			}
		}

		/*
		 * There must be at least 'sc->sc_ndisks - 1' components
		 * with the same syncid and without SYNCHRONIZING flag.
		 */

		/*
		 * Find the biggest syncid, number of valid components and
		 * number of dirty components.
		 */
		ndirty = ndisks = syncid = 0;
		for (n = 0; n < sc->sc_ndisks; n++) {
			disk = &sc->sc_disks[n];
			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
				continue;
			if ((disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) != 0)
				ndirty++;
			if (disk->d_sync.ds_syncid > syncid) {
				/*
				 * A newer syncid was found: components counted
				 * so far are outdated, restart the count.
				 */
				syncid = disk->d_sync.ds_syncid;
				ndisks = 0;
			} else if (disk->d_sync.ds_syncid < syncid) {
				continue;
			}
			/* Components still being synchronized are not valid. */
			if ((disk->d_flags &
			    G_RAID3_DISK_FLAG_SYNCHRONIZING) != 0) {
				continue;
			}
			ndisks++;
		}
		/*
		 * Do we have enough valid components?
		 */
		if (ndisks + 1 < sc->sc_ndisks) {
			G_RAID3_DEBUG(0,
			    "Device %s is broken, too few valid components.",
			    sc->sc_name);
			sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
			return;
		}
		/*
		 * If there is one DIRTY component and all disks are present,
		 * mark it for synchronization. If there is more than one DIRTY
		 * component, mark parity component for synchronization.
		 */
		if (ndisks == sc->sc_ndisks && ndirty == 1) {
			for (n = 0; n < sc->sc_ndisks; n++) {
				disk = &sc->sc_disks[n];
				if ((disk->d_flags &
				    G_RAID3_DISK_FLAG_DIRTY) == 0) {
					continue;
				}
				disk->d_flags |=
				    G_RAID3_DISK_FLAG_SYNCHRONIZING;
			}
		} else if (ndisks == sc->sc_ndisks && ndirty > 1) {
			/* The parity component is the last one in the array. */
			disk = &sc->sc_disks[sc->sc_ndisks - 1];
			disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING;
		}

		sc->sc_syncid = syncid;
		if (force) {
			/* Remember to bump syncid on first write. */
			sc->sc_bump_id |= G_RAID3_BUMP_SYNCID_OFW;
		}
		if (ndisks == sc->sc_ndisks)
			state = G_RAID3_DEVICE_STATE_COMPLETE;
		else /* if (ndisks == sc->sc_ndisks - 1) */
			state = G_RAID3_DEVICE_STATE_DEGRADED;
		G_RAID3_DEBUG(1, "Device %s state changed from %s to %s.",
		    sc->sc_name, g_raid3_device_state2str(sc->sc_state),
		    g_raid3_device_state2str(state));
		sc->sc_state = state;
		/*
		 * Queue a state change for every connected disk.  From here
		 * on 'state' holds a disk state, not a device state.
		 */
		for (n = 0; n < sc->sc_ndisks; n++) {
			disk = &sc->sc_disks[n];
			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
				continue;
			state = g_raid3_determine_state(disk);
			g_raid3_event_send(disk, state, G_RAID3_EVENT_DONTWAIT);
			/* A stale component also requests the first-write bump. */
			if (state == G_RAID3_DISK_STATE_STALE)
				sc->sc_bump_id |= G_RAID3_BUMP_SYNCID_OFW;
		}
		break;
	    }
	case G_RAID3_DEVICE_STATE_DEGRADED:
		/*
		 * Bump syncid and/or genid here, if we need to do it
		 * immediately.
		 */
		if ((sc->sc_bump_id & G_RAID3_BUMP_SYNCID_IMM) != 0) {
			sc->sc_bump_id &= ~G_RAID3_BUMP_SYNCID;
			g_raid3_bump_syncid(sc);
		}
		if ((sc->sc_bump_id & G_RAID3_BUMP_GENID_IMM) != 0) {
			sc->sc_bump_id &= ~G_RAID3_BUMP_GENID;
			g_raid3_bump_genid(sc);
		}

		/* Wait until every NEW disk has been given its real state. */
		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NEW) > 0)
			return;
		/* More than one component is missing: the device cannot run. */
		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) <
		    sc->sc_ndisks - 1) {
			if (sc->sc_provider != NULL)
				g_raid3_destroy_provider(sc);
			sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
			return;
		}
		/* Every component is ACTIVE again: leave DEGRADED state. */
		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) ==
		    sc->sc_ndisks) {
			state = G_RAID3_DEVICE_STATE_COMPLETE;
			G_RAID3_DEBUG(1,
			    "Device %s state changed from %s to %s.",
			    sc->sc_name, g_raid3_device_state2str(sc->sc_state),
			    g_raid3_device_state2str(state));
			sc->sc_state = state;
		}
		if (sc->sc_provider == NULL)
			g_raid3_launch_provider(sc);
		break;
	case G_RAID3_DEVICE_STATE_COMPLETE:
		/*
		 * Bump syncid and/or genid here, if we need to do it
		 * immediately.
		 */
		if ((sc->sc_bump_id & G_RAID3_BUMP_SYNCID_IMM) != 0) {
			sc->sc_bump_id &= ~G_RAID3_BUMP_SYNCID;
			g_raid3_bump_syncid(sc);
		}
		if ((sc->sc_bump_id & G_RAID3_BUMP_GENID_IMM) != 0) {
			sc->sc_bump_id &= ~G_RAID3_BUMP_GENID;
			g_raid3_bump_genid(sc);
		}

		/* Wait until every NEW disk has been given its real state. */
		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NEW) > 0)
			return;
		KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) >=
		    sc->sc_ndisks - 1,
		    ("Too few ACTIVE components in COMPLETE state (device %s).",
		    sc->sc_name));
		/* Exactly one component is not ACTIVE: degrade the device. */
		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) ==
		    sc->sc_ndisks - 1) {
			state = G_RAID3_DEVICE_STATE_DEGRADED;
			G_RAID3_DEBUG(1,
			    "Device %s state changed from %s to %s.",
			    sc->sc_name, g_raid3_device_state2str(sc->sc_state),
			    g_raid3_device_state2str(state));
			sc->sc_state = state;
		}
		if (sc->sc_provider == NULL)
			g_raid3_launch_provider(sc);
		break;
	default:
		KASSERT(1 == 0, ("Wrong device state (%s, %s).", sc->sc_name,
		    g_raid3_device_state2str(sc->sc_state)));
		break;
	}
}
2346
/*
 * Update disk state and device state if needed.
 *
 * Moves 'disk' to the requested 'state', validating (via KASSERT) that the
 * transition is legal for the current disk and device states.  A request
 * for the NEW state on a device that is already running is immediately
 * followed by the state chosen by g_raid3_determine_state().
 *
 * Must be called with the topology lock held.  Always returns 0.
 */
/* Log a disk state transition; uses the local 'disk', 'state' and 'sc'. */
#define	DISK_STATE_CHANGED()	G_RAID3_DEBUG(1,			\
	"Disk %s state changed from %s to %s (device %s).",		\
	g_raid3_get_diskname(disk),					\
	g_raid3_disk_state2str(disk->d_state),				\
	g_raid3_disk_state2str(state), sc->sc_name)
static int
g_raid3_update_disk(struct g_raid3_disk *disk, u_int state)
{
	struct g_raid3_softc *sc;

	g_topology_assert();

	sc = disk->d_softc;
again:
	G_RAID3_DEBUG(3, "Changing disk %s state from %s to %s.",
	    g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state),
	    g_raid3_disk_state2str(state));
	switch (state) {
	case G_RAID3_DISK_STATE_NEW:
		/*
		 * Possible scenarios:
		 * 1. New disk arrives.
		 */
		/* Previous state should be NONE. */
		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NONE,
		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		DISK_STATE_CHANGED();

		disk->d_state = state;
		G_RAID3_DEBUG(0, "Device %s: provider %s detected.",
		    sc->sc_name, g_raid3_get_diskname(disk));
		/*
		 * While the device is still starting, the disk stays NEW;
		 * g_raid3_update_device() decides its state later.
		 */
		if (sc->sc_state == G_RAID3_DEVICE_STATE_STARTING)
			break;
		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
		    g_raid3_device_state2str(sc->sc_state),
		    g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		/*
		 * The device is already running: pick the real state now and
		 * process it by running the switch again.  NONE means the
		 * disk was destroyed, so there is nothing left to do.
		 */
		state = g_raid3_determine_state(disk);
		if (state != G_RAID3_DISK_STATE_NONE)
			goto again;
		break;
	case G_RAID3_DISK_STATE_ACTIVE:
		/*
		 * Possible scenarios:
		 * 1. New disk does not need synchronization.
		 * 2. Synchronization process finished successfully.
		 */
		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
		    g_raid3_device_state2str(sc->sc_state),
		    g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		/* Previous state should be NEW or SYNCHRONIZING. */
		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW ||
		    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		DISK_STATE_CHANGED();

		if (disk->d_state == G_RAID3_DISK_STATE_NEW)
			disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
		else if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
			/* Synchronization is complete: clear flags, stop it. */
			disk->d_flags &= ~G_RAID3_DISK_FLAG_SYNCHRONIZING;
			disk->d_flags &= ~G_RAID3_DISK_FLAG_FORCE_SYNC;
			g_raid3_sync_stop(sc, 0);
		}
		disk->d_state = state;
		disk->d_sync.ds_offset = 0;
		disk->d_sync.ds_offset_done = 0;
		g_raid3_update_access(disk);
		g_raid3_update_metadata(disk);
		G_RAID3_DEBUG(0, "Device %s: provider %s activated.",
		    sc->sc_name, g_raid3_get_diskname(disk));
		break;
	case G_RAID3_DISK_STATE_STALE:
		/*
		 * Possible scenarios:
		 * 1. Stale disk was connected.
		 */
		/* Previous state should be NEW. */
		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
		    g_raid3_device_state2str(sc->sc_state),
		    g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		/*
		 * STALE state is only possible if device is marked
		 * NOAUTOSYNC.
		 */
		KASSERT((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) != 0,
		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
		    g_raid3_device_state2str(sc->sc_state),
		    g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		DISK_STATE_CHANGED();

		disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
		disk->d_state = state;
		g_raid3_update_metadata(disk);
		G_RAID3_DEBUG(0, "Device %s: provider %s is stale.",
		    sc->sc_name, g_raid3_get_diskname(disk));
		break;
	case G_RAID3_DISK_STATE_SYNCHRONIZING:
		/*
		 * Possible scenarios:
		 * 1. Disk which needs synchronization was connected.
		 */
		/* Previous state should be NEW. */
		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
		    g_raid3_device_state2str(sc->sc_state),
		    g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		DISK_STATE_CHANGED();

		if (disk->d_state == G_RAID3_DISK_STATE_NEW)
			disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
		disk->d_state = state;
		/* Synchronization is started only once the provider exists. */
		if (sc->sc_provider != NULL) {
			g_raid3_sync_start(sc);
			g_raid3_update_metadata(disk);
		}
		break;
	case G_RAID3_DISK_STATE_DISCONNECTED:
		/*
		 * Possible scenarios:
		 * 1. Device wasn't running yet, but disk disappeared.
		 * 2. Disk was active and disappeared.
		 * 3. Disk disappeared during synchronization process.
		 */
		if (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
			/*
			 * Previous state should be ACTIVE, STALE or
			 * SYNCHRONIZING.
			 */
			KASSERT(disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
			    disk->d_state == G_RAID3_DISK_STATE_STALE ||
			    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
			    ("Wrong disk state (%s, %s).",
			    g_raid3_get_diskname(disk),
			    g_raid3_disk_state2str(disk->d_state)));
		} else if (sc->sc_state == G_RAID3_DEVICE_STATE_STARTING) {
			/* Previous state should be NEW. */
			KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
			    ("Wrong disk state (%s, %s).",
			    g_raid3_get_diskname(disk),
			    g_raid3_disk_state2str(disk->d_state)));
			/*
			 * Reset bumping syncid if disk disappeared in STARTING
			 * state.
			 */
			if ((sc->sc_bump_id & G_RAID3_BUMP_SYNCID_OFW) != 0)
				sc->sc_bump_id &= ~G_RAID3_BUMP_SYNCID;
#ifdef	INVARIANTS
		} else {
			KASSERT(1 == 0, ("Wrong device state (%s, %s, %s, %s).",
			    sc->sc_name,
			    g_raid3_device_state2str(sc->sc_state),
			    g_raid3_get_diskname(disk),
			    g_raid3_disk_state2str(disk->d_state)));
#endif
		}
		DISK_STATE_CHANGED();
		G_RAID3_DEBUG(0, "Device %s: provider %s disconnected.",
		    sc->sc_name, g_raid3_get_diskname(disk));

		g_raid3_destroy_disk(disk);
		break;
	default:
		KASSERT(1 == 0, ("Unknown state (%u).", state));
		break;
	}
	return (0);
}
#undef	DISK_STATE_CHANGED
2538
2539static int
2540g_raid3_read_metadata(struct g_consumer *cp, struct g_raid3_metadata *md)
2541{
2542	struct g_provider *pp;
2543	u_char *buf;
2544	int error;
2545
2546	g_topology_assert();
2547
2548	error = g_access(cp, 1, 0, 0);
2549	if (error != 0)
2550		return (error);
2551	pp = cp->provider;
2552	g_topology_unlock();
2553	/* Metadata are stored on last sector. */
2554	buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize,
2555	    &error);
2556	g_topology_lock();
2557	g_access(cp, -1, 0, 0);
2558	if (error != 0) {
2559		G_RAID3_DEBUG(1, "Cannot read metadata from %s (error=%d).",
2560		    cp->provider->name, error);
2561		if (buf != NULL)
2562			g_free(buf);
2563		return (error);
2564	}
2565
2566	/* Decode metadata. */
2567	error = raid3_metadata_decode(buf, md);
2568	g_free(buf);
2569	if (strcmp(md->md_magic, G_RAID3_MAGIC) != 0)
2570		return (EINVAL);
2571	if (md->md_version > G_RAID3_VERSION) {
2572		G_RAID3_DEBUG(0,
2573		    "Kernel module is too old to handle metadata from %s.",
2574		    cp->provider->name);
2575		return (EINVAL);
2576	}
2577	if (error != 0) {
2578		G_RAID3_DEBUG(1, "MD5 metadata hash mismatch for provider %s.",
2579		    cp->provider->name);
2580		return (error);
2581	}
2582
2583	return (0);
2584}
2585
2586static int
2587g_raid3_check_metadata(struct g_raid3_softc *sc, struct g_provider *pp,
2588    struct g_raid3_metadata *md)
2589{
2590
2591	if (md->md_no >= sc->sc_ndisks) {
2592		G_RAID3_DEBUG(1, "Invalid disk %s number (no=%u), skipping.",
2593		    pp->name, md->md_no);
2594		return (EINVAL);
2595	}
2596	if (sc->sc_disks[md->md_no].d_state != G_RAID3_DISK_STATE_NODISK) {
2597		G_RAID3_DEBUG(1, "Disk %s (no=%u) already exists, skipping.",
2598		    pp->name, md->md_no);
2599		return (EEXIST);
2600	}
2601	if (md->md_all != sc->sc_ndisks) {
2602		G_RAID3_DEBUG(1,
2603		    "Invalid '%s' field on disk %s (device %s), skipping.",
2604		    "md_all", pp->name, sc->sc_name);
2605		return (EINVAL);
2606	}
2607	if (md->md_mediasize != sc->sc_mediasize) {
2608		G_RAID3_DEBUG(1,
2609		    "Invalid '%s' field on disk %s (device %s), skipping.",
2610		    "md_mediasize", pp->name, sc->sc_name);
2611		return (EINVAL);
2612	}
2613	if ((md->md_mediasize % (sc->sc_ndisks - 1)) != 0) {
2614		G_RAID3_DEBUG(1,
2615		    "Invalid '%s' field on disk %s (device %s), skipping.",
2616		    "md_mediasize", pp->name, sc->sc_name);
2617		return (EINVAL);
2618	}
2619	if ((sc->sc_mediasize / (sc->sc_ndisks - 1)) > pp->mediasize) {
2620		G_RAID3_DEBUG(1,
2621		    "Invalid size of disk %s (device %s), skipping.", pp->name,
2622		    sc->sc_name);
2623		return (EINVAL);
2624	}
2625	if ((md->md_sectorsize / pp->sectorsize) < sc->sc_ndisks - 1) {
2626		G_RAID3_DEBUG(1,
2627		    "Invalid '%s' field on disk %s (device %s), skipping.",
2628		    "md_sectorsize", pp->name, sc->sc_name);
2629		return (EINVAL);
2630	}
2631	if (md->md_sectorsize != sc->sc_sectorsize) {
2632		G_RAID3_DEBUG(1,
2633		    "Invalid '%s' field on disk %s (device %s), skipping.",
2634		    "md_sectorsize", pp->name, sc->sc_name);
2635		return (EINVAL);
2636	}
2637	if ((sc->sc_sectorsize % pp->sectorsize) != 0) {
2638		G_RAID3_DEBUG(1,
2639		    "Invalid sector size of disk %s (device %s), skipping.",
2640		    pp->name, sc->sc_name);
2641		return (EINVAL);
2642	}
2643	if ((md->md_mflags & ~G_RAID3_DEVICE_FLAG_MASK) != 0) {
2644		G_RAID3_DEBUG(1,
2645		    "Invalid device flags on disk %s (device %s), skipping.",
2646		    pp->name, sc->sc_name);
2647		return (EINVAL);
2648	}
2649	if ((md->md_mflags & G_RAID3_DEVICE_FLAG_VERIFY) != 0 &&
2650	    (md->md_mflags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0) {
2651		/*
2652		 * VERIFY and ROUND-ROBIN options are mutally exclusive.
2653		 */
2654		G_RAID3_DEBUG(1, "Both VERIFY and ROUND-ROBIN flags exist on "
2655		    "disk %s (device %s), skipping.", pp->name, sc->sc_name);
2656		return (EINVAL);
2657	}
2658	if ((md->md_dflags & ~G_RAID3_DISK_FLAG_MASK) != 0) {
2659		G_RAID3_DEBUG(1,
2660		    "Invalid disk flags on disk %s (device %s), skipping.",
2661		    pp->name, sc->sc_name);
2662		return (EINVAL);
2663	}
2664	return (0);
2665}
2666
2667static int
2668g_raid3_add_disk(struct g_raid3_softc *sc, struct g_provider *pp,
2669    struct g_raid3_metadata *md)
2670{
2671	struct g_raid3_disk *disk;
2672	int error;
2673
2674	g_topology_assert();
2675	G_RAID3_DEBUG(2, "Adding disk %s.", pp->name);
2676
2677	error = g_raid3_check_metadata(sc, pp, md);
2678	if (error != 0)
2679		return (error);
2680	if (sc->sc_state != G_RAID3_DEVICE_STATE_STARTING &&
2681	    md->md_genid < sc->sc_genid) {
2682		G_RAID3_DEBUG(0, "Component %s (device %s) broken, skipping.",
2683		    pp->name, sc->sc_name);
2684		return (EINVAL);
2685	}
2686	disk = g_raid3_init_disk(sc, pp, md, &error);
2687	if (disk == NULL)
2688		return (error);
2689	error = g_raid3_event_send(disk, G_RAID3_DISK_STATE_NEW,
2690	    G_RAID3_EVENT_WAIT);
2691	if (error != 0)
2692		return (error);
2693	if (md->md_version < G_RAID3_VERSION) {
2694		G_RAID3_DEBUG(0, "Upgrading metadata on %s (v%d->v%d).",
2695		    pp->name, md->md_version, G_RAID3_VERSION);
2696		g_raid3_update_metadata(disk);
2697	}
2698	return (0);
2699}
2700
2701static int
2702g_raid3_access(struct g_provider *pp, int acr, int acw, int ace)
2703{
2704	struct g_raid3_softc *sc;
2705	struct g_raid3_disk *disk;
2706	int dcr, dcw, dce;
2707	u_int n;
2708
2709	g_topology_assert();
2710	G_RAID3_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name, acr,
2711	    acw, ace);
2712
2713	dcr = pp->acr + acr;
2714	dcw = pp->acw + acw;
2715	dce = pp->ace + ace;
2716
2717	sc = pp->geom->softc;
2718	if (sc == NULL ||
2719	    g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) < sc->sc_ndisks - 1 ||
2720	    (sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
2721		if (acr <= 0 && acw <= 0 && ace <= 0)
2722			return (0);
2723		else
2724			return (ENXIO);
2725	}
2726	for (n = 0; n < sc->sc_ndisks; n++) {
2727		disk = &sc->sc_disks[n];
2728		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
2729			continue;
2730		/*
2731		 * Mark disk as dirty on open and unmark on close.
2732		 */
2733		if (pp->acw == 0 && dcw > 0) {
2734			G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.",
2735			    g_raid3_get_diskname(disk), sc->sc_name);
2736			disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
2737			g_raid3_update_metadata(disk);
2738		} else if (pp->acw > 0 && dcw == 0) {
2739			G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.",
2740			    g_raid3_get_diskname(disk), sc->sc_name);
2741			disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
2742			g_raid3_update_metadata(disk);
2743		}
2744	}
2745	return (0);
2746}
2747
2748static struct g_geom *
2749g_raid3_create(struct g_class *mp, const struct g_raid3_metadata *md)
2750{
2751	struct g_raid3_softc *sc;
2752	struct g_geom *gp;
2753	int error, timeout;
2754	u_int n;
2755
2756	g_topology_assert();
2757	G_RAID3_DEBUG(1, "Creating device %s (id=%u).", md->md_name, md->md_id);
2758
2759	/* One disk is minimum. */
2760	if (md->md_all < 1)
2761		return (NULL);
2762	/*
2763	 * Action geom.
2764	 */
2765	gp = g_new_geomf(mp, "%s", md->md_name);
2766	sc = malloc(sizeof(*sc), M_RAID3, M_WAITOK | M_ZERO);
2767	sc->sc_disks = malloc(sizeof(struct g_raid3_disk) * md->md_all, M_RAID3,
2768	    M_WAITOK | M_ZERO);
2769	gp->start = g_raid3_start;
2770	gp->spoiled = g_raid3_spoiled;
2771	gp->orphan = g_raid3_orphan;
2772	gp->access = g_raid3_access;
2773	gp->dumpconf = g_raid3_dumpconf;
2774
2775	sc->sc_id = md->md_id;
2776	sc->sc_mediasize = md->md_mediasize;
2777	sc->sc_sectorsize = md->md_sectorsize;
2778	sc->sc_ndisks = md->md_all;
2779	sc->sc_round_robin = 0;
2780	sc->sc_flags = md->md_mflags;
2781	sc->sc_bump_id = 0;
2782	sc->sc_idle = 0;
2783	for (n = 0; n < sc->sc_ndisks; n++) {
2784		sc->sc_disks[n].d_softc = sc;
2785		sc->sc_disks[n].d_no = n;
2786		sc->sc_disks[n].d_state = G_RAID3_DISK_STATE_NODISK;
2787	}
2788	bioq_init(&sc->sc_queue);
2789	mtx_init(&sc->sc_queue_mtx, "graid3:queue", NULL, MTX_DEF);
2790	TAILQ_INIT(&sc->sc_events);
2791	mtx_init(&sc->sc_events_mtx, "graid3:events", NULL, MTX_DEF);
2792	callout_init(&sc->sc_callout, CALLOUT_MPSAFE);
2793	sc->sc_state = G_RAID3_DEVICE_STATE_STARTING;
2794	gp->softc = sc;
2795	sc->sc_geom = gp;
2796	sc->sc_provider = NULL;
2797	/*
2798	 * Synchronization geom.
2799	 */
2800	gp = g_new_geomf(mp, "%s.sync", md->md_name);
2801	gp->softc = sc;
2802	gp->orphan = g_raid3_orphan;
2803	sc->sc_sync.ds_geom = gp;
2804	sc->sc_zone_64k = uma_zcreate("gr3:64k", 65536, NULL, NULL, NULL, NULL,
2805	    UMA_ALIGN_PTR, 0);
2806	uma_zone_set_max(sc->sc_zone_64k, g_raid3_n64k);
2807	sc->sc_zone_16k = uma_zcreate("gr3:16k", 16384, NULL, NULL, NULL, NULL,
2808	    UMA_ALIGN_PTR, 0);
2809	uma_zone_set_max(sc->sc_zone_64k, g_raid3_n16k);
2810	sc->sc_zone_4k = uma_zcreate("gr3:4k", 4096, NULL, NULL, NULL, NULL,
2811	    UMA_ALIGN_PTR, 0);
2812	uma_zone_set_max(sc->sc_zone_4k, g_raid3_n4k);
2813	error = kthread_create(g_raid3_worker, sc, &sc->sc_worker, 0, 0,
2814	    "g_raid3 %s", md->md_name);
2815	if (error != 0) {
2816		G_RAID3_DEBUG(1, "Cannot create kernel thread for %s.",
2817		    sc->sc_name);
2818		uma_zdestroy(sc->sc_zone_64k);
2819		uma_zdestroy(sc->sc_zone_16k);
2820		uma_zdestroy(sc->sc_zone_4k);
2821		g_destroy_geom(sc->sc_sync.ds_geom);
2822		mtx_destroy(&sc->sc_events_mtx);
2823		mtx_destroy(&sc->sc_queue_mtx);
2824		g_destroy_geom(sc->sc_geom);
2825		free(sc->sc_disks, M_RAID3);
2826		free(sc, M_RAID3);
2827		return (NULL);
2828	}
2829
2830	G_RAID3_DEBUG(0, "Device %s created (id=%u).", sc->sc_name, sc->sc_id);
2831
2832	/*
2833	 * Run timeout.
2834	 */
2835	timeout = atomic_load_acq_int(&g_raid3_timeout);
2836	callout_reset(&sc->sc_callout, timeout * hz, g_raid3_go, sc);
2837	return (sc->sc_geom);
2838}
2839
2840int
2841g_raid3_destroy(struct g_raid3_softc *sc, boolean_t force)
2842{
2843	struct g_provider *pp;
2844
2845	g_topology_assert();
2846
2847	if (sc == NULL)
2848		return (ENXIO);
2849	pp = sc->sc_provider;
2850	if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
2851		if (force) {
2852			G_RAID3_DEBUG(1, "Device %s is still open, so it "
2853			    "can't be definitely removed.", pp->name);
2854		} else {
2855			G_RAID3_DEBUG(1,
2856			    "Device %s is still open (r%dw%de%d).", pp->name,
2857			    pp->acr, pp->acw, pp->ace);
2858			return (EBUSY);
2859		}
2860	}
2861
2862	sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
2863	sc->sc_flags |= G_RAID3_DEVICE_FLAG_WAIT;
2864	g_topology_unlock();
2865	G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc);
2866	mtx_lock(&sc->sc_queue_mtx);
2867	wakeup(sc);
2868	wakeup(&sc->sc_queue);
2869	mtx_unlock(&sc->sc_queue_mtx);
2870	G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, &sc->sc_worker);
2871	while (sc->sc_worker != NULL)
2872		tsleep(&sc->sc_worker, PRIBIO, "r3:destroy", hz / 5);
2873	G_RAID3_DEBUG(4, "%s: Woken up %p.", __func__, &sc->sc_worker);
2874	g_topology_lock();
2875	g_raid3_destroy_device(sc);
2876	free(sc->sc_disks, M_RAID3);
2877	free(sc, M_RAID3);
2878	return (0);
2879}
2880
2881static void
2882g_raid3_taste_orphan(struct g_consumer *cp)
2883{
2884
2885	KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
2886	    cp->provider->name));
2887}
2888
2889static struct g_geom *
2890g_raid3_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
2891{
2892	struct g_raid3_metadata md;
2893	struct g_raid3_softc *sc;
2894	struct g_consumer *cp;
2895	struct g_geom *gp;
2896	int error;
2897
2898	g_topology_assert();
2899	g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
2900	G_RAID3_DEBUG(2, "Tasting %s.", pp->name);
2901
2902	gp = g_new_geomf(mp, "raid3:taste");
2903	/* This orphan function should be never called. */
2904	gp->orphan = g_raid3_taste_orphan;
2905	cp = g_new_consumer(gp);
2906	g_attach(cp, pp);
2907	error = g_raid3_read_metadata(cp, &md);
2908	g_detach(cp);
2909	g_destroy_consumer(cp);
2910	g_destroy_geom(gp);
2911	if (error != 0)
2912		return (NULL);
2913	gp = NULL;
2914
2915	if (md.md_provider[0] != '\0' && strcmp(md.md_provider, pp->name) != 0)
2916		return (NULL);
2917	if (g_raid3_debug >= 2)
2918		raid3_metadata_dump(&md);
2919
2920	/*
2921	 * Let's check if device already exists.
2922	 */
2923	sc = NULL;
2924	LIST_FOREACH(gp, &mp->geom, geom) {
2925		sc = gp->softc;
2926		if (sc == NULL)
2927			continue;
2928		if (sc->sc_sync.ds_geom == gp)
2929			continue;
2930		if (strcmp(md.md_name, sc->sc_name) != 0)
2931			continue;
2932		if (md.md_id != sc->sc_id) {
2933			G_RAID3_DEBUG(0, "Device %s already configured.",
2934			    sc->sc_name);
2935			return (NULL);
2936		}
2937		break;
2938	}
2939	if (gp == NULL) {
2940		gp = g_raid3_create(mp, &md);
2941		if (gp == NULL) {
2942			G_RAID3_DEBUG(0, "Cannot create device %s.",
2943			    md.md_name);
2944			return (NULL);
2945		}
2946		sc = gp->softc;
2947	}
2948	G_RAID3_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name);
2949	error = g_raid3_add_disk(sc, pp, &md);
2950	if (error != 0) {
2951		G_RAID3_DEBUG(0, "Cannot add disk %s to %s (error=%d).",
2952		    pp->name, gp->name, error);
2953		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NODISK) ==
2954		    sc->sc_ndisks) {
2955			g_raid3_destroy(sc, 1);
2956		}
2957		return (NULL);
2958	}
2959	return (gp);
2960}
2961
2962static int
2963g_raid3_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused,
2964    struct g_geom *gp)
2965{
2966
2967	return (g_raid3_destroy(gp->softc, 0));
2968}
2969
2970static void
2971g_raid3_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
2972    struct g_consumer *cp, struct g_provider *pp)
2973{
2974	struct g_raid3_softc *sc;
2975
2976	g_topology_assert();
2977
2978	sc = gp->softc;
2979	if (sc == NULL)
2980		return;
2981	/* Skip synchronization geom. */
2982	if (gp == sc->sc_sync.ds_geom)
2983		return;
2984	if (pp != NULL) {
2985		/* Nothing here. */
2986	} else if (cp != NULL) {
2987		struct g_raid3_disk *disk;
2988
2989		disk = cp->private;
2990		if (disk == NULL)
2991			return;
2992		sbuf_printf(sb, "%s<Type>", indent);
2993		if (disk->d_no == sc->sc_ndisks - 1)
2994			sbuf_printf(sb, "PARITY");
2995		else
2996			sbuf_printf(sb, "DATA");
2997		sbuf_printf(sb, "</Type>\n");
2998		sbuf_printf(sb, "%s<Number>%u</Number>\n", indent,
2999		    (u_int)disk->d_no);
3000		if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
3001			sbuf_printf(sb, "%s<Synchronized>", indent);
3002			if (disk->d_sync.ds_offset_done == 0)
3003				sbuf_printf(sb, "0%%");
3004			else {
3005				sbuf_printf(sb, "%u%%",
3006				    (u_int)((disk->d_sync.ds_offset_done * 100) /
3007				    (sc->sc_mediasize / (sc->sc_ndisks - 1))));
3008			}
3009			sbuf_printf(sb, "</Synchronized>\n");
3010		}
3011		sbuf_printf(sb, "%s<SyncID>%u</SyncID>\n", indent,
3012		    disk->d_sync.ds_syncid);
3013		sbuf_printf(sb, "%s<GenID>%u</GenID>\n", indent, disk->d_genid);
3014		sbuf_printf(sb, "%s<Flags>", indent);
3015		if (disk->d_flags == 0)
3016			sbuf_printf(sb, "NONE");
3017		else {
3018			int first = 1;
3019
3020#define	ADD_FLAG(flag, name)	do {					\
3021	if ((disk->d_flags & (flag)) != 0) {				\
3022		if (!first)						\
3023			sbuf_printf(sb, ", ");				\
3024		else							\
3025			first = 0;					\
3026		sbuf_printf(sb, name);					\
3027	}								\
3028} while (0)
3029			ADD_FLAG(G_RAID3_DISK_FLAG_DIRTY, "DIRTY");
3030			ADD_FLAG(G_RAID3_DISK_FLAG_HARDCODED, "HARDCODED");
3031			ADD_FLAG(G_RAID3_DISK_FLAG_SYNCHRONIZING,
3032			    "SYNCHRONIZING");
3033			ADD_FLAG(G_RAID3_DISK_FLAG_FORCE_SYNC, "FORCE_SYNC");
3034#undef	ADD_FLAG
3035		}
3036		sbuf_printf(sb, "</Flags>\n");
3037		sbuf_printf(sb, "%s<State>%s</State>\n", indent,
3038		    g_raid3_disk_state2str(disk->d_state));
3039	} else {
3040		sbuf_printf(sb, "%s<ID>%u</ID>\n", indent, (u_int)sc->sc_id);
3041		sbuf_printf(sb, "%s<SyncID>%u</SyncID>\n", indent, sc->sc_syncid);
3042		sbuf_printf(sb, "%s<GenID>%u</GenID>\n", indent, sc->sc_genid);
3043		sbuf_printf(sb, "%s<Flags>", indent);
3044		if (sc->sc_flags == 0)
3045			sbuf_printf(sb, "NONE");
3046		else {
3047			int first = 1;
3048
3049#define	ADD_FLAG(flag, name)	do {					\
3050	if ((sc->sc_flags & (flag)) != 0) {				\
3051		if (!first)						\
3052			sbuf_printf(sb, ", ");				\
3053		else							\
3054			first = 0;					\
3055		sbuf_printf(sb, name);					\
3056	}								\
3057} while (0)
3058			ADD_FLAG(G_RAID3_DEVICE_FLAG_NOAUTOSYNC, "NOAUTOSYNC");
3059			ADD_FLAG(G_RAID3_DEVICE_FLAG_ROUND_ROBIN,
3060			    "ROUND-ROBIN");
3061			ADD_FLAG(G_RAID3_DEVICE_FLAG_VERIFY, "VERIFY");
3062#undef	ADD_FLAG
3063		}
3064		sbuf_printf(sb, "</Flags>\n");
3065		sbuf_printf(sb, "%s<Components>%u</Components>\n", indent,
3066		    sc->sc_ndisks);
3067		sbuf_printf(sb, "%s<State>%s</State>\n", indent,
3068		    g_raid3_device_state2str(sc->sc_state));
3069	}
3070}
3071
3072static void
3073g_raid3_shutdown(void *arg, int howto)
3074{
3075	struct g_class *mp;
3076	struct g_geom *gp, *gp2;
3077
3078	mp = arg;
3079	DROP_GIANT();
3080	g_topology_lock();
3081	LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) {
3082		if (gp->softc == NULL)
3083			continue;
3084		g_raid3_destroy(gp->softc, 1);
3085	}
3086	g_topology_unlock();
3087	PICKUP_GIANT();
3088#if 0
3089	tsleep(&gp, PRIBIO, "r3:shutdown", hz * 20);
3090#endif
3091}
3092
3093static void
3094g_raid3_init(struct g_class *mp)
3095{
3096
3097	g_raid3_ehtag = EVENTHANDLER_REGISTER(shutdown_post_sync,
3098	    g_raid3_shutdown, mp, SHUTDOWN_PRI_FIRST);
3099	if (g_raid3_ehtag == NULL)
3100		G_RAID3_DEBUG(0, "Warning! Cannot register shutdown event.");
3101}
3102
3103static void
3104g_raid3_fini(struct g_class *mp)
3105{
3106
3107	if (g_raid3_ehtag == NULL)
3108		return;
3109	EVENTHANDLER_DEREGISTER(shutdown_post_sync, g_raid3_ehtag);
3110}
3111
/*
 * Declare the GEOM class described by g_raid3_class under the name
 * g_raid3.  NOTE(review): g_raid3_class itself is defined outside this
 * part of the file.
 */
DECLARE_GEOM_CLASS(g_raid3_class, g_raid3);
3113