g_raid3.c revision 142727
1/*-
2 * Copyright (c) 2004-2005 Pawel Jakub Dawidek <pjd@FreeBSD.org>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27#include <sys/cdefs.h>
28__FBSDID("$FreeBSD: head/sys/geom/raid3/g_raid3.c 142727 2005-02-27 23:07:47Z pjd $");
29
30#include <sys/param.h>
31#include <sys/systm.h>
32#include <sys/kernel.h>
33#include <sys/module.h>
34#include <sys/limits.h>
35#include <sys/lock.h>
36#include <sys/mutex.h>
37#include <sys/bio.h>
38#include <sys/sysctl.h>
39#include <sys/malloc.h>
40#include <sys/eventhandler.h>
41#include <vm/uma.h>
42#include <geom/geom.h>
43#include <sys/proc.h>
44#include <sys/kthread.h>
45#include <sys/sched.h>
46#include <geom/raid3/g_raid3.h>
47
48
49static MALLOC_DEFINE(M_RAID3, "raid3 data", "GEOM_RAID3 Data");
50
51SYSCTL_DECL(_kern_geom);
52SYSCTL_NODE(_kern_geom, OID_AUTO, raid3, CTLFLAG_RW, 0, "GEOM_RAID3 stuff");
53u_int g_raid3_debug = 0;
54TUNABLE_INT("kern.geom.raid3.debug", &g_raid3_debug);
55SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, debug, CTLFLAG_RW, &g_raid3_debug, 0,
56    "Debug level");
57static u_int g_raid3_timeout = 4;
58TUNABLE_INT("kern.geom.raid3.timeout", &g_raid3_timeout);
59SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, timeout, CTLFLAG_RW, &g_raid3_timeout,
60    0, "Time to wait on all raid3 components");
61static u_int g_raid3_idletime = 5;
62TUNABLE_INT("kern.geom.raid3.idletime", &g_raid3_idletime);
63SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, idletime, CTLFLAG_RW,
64    &g_raid3_idletime, 0, "Mark components as clean when idling");
65static u_int g_raid3_reqs_per_sync = 5;
66SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, reqs_per_sync, CTLFLAG_RW,
67    &g_raid3_reqs_per_sync, 0,
68    "Number of regular I/O requests per synchronization request");
69static u_int g_raid3_syncs_per_sec = 1000;
70SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, syncs_per_sec, CTLFLAG_RW,
71    &g_raid3_syncs_per_sec, 0,
72    "Number of synchronization requests per second");
73
74static u_int g_raid3_n64k = 50;
75TUNABLE_INT("kern.geom.raid3.n64k", &g_raid3_n64k);
76SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n64k, CTLFLAG_RD, &g_raid3_n64k, 0,
77    "Maximum number of 64kB allocations");
78static u_int g_raid3_n16k = 200;
79TUNABLE_INT("kern.geom.raid3.n16k", &g_raid3_n16k);
80SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n16k, CTLFLAG_RD, &g_raid3_n16k, 0,
81    "Maximum number of 16kB allocations");
82static u_int g_raid3_n4k = 1200;
83TUNABLE_INT("kern.geom.raid3.n4k", &g_raid3_n4k);
84SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n4k, CTLFLAG_RD, &g_raid3_n4k, 0,
85    "Maximum number of 4kB allocations");
86
87SYSCTL_NODE(_kern_geom_raid3, OID_AUTO, stat, CTLFLAG_RW, 0,
88    "GEOM_RAID3 statistics");
89static u_int g_raid3_parity_mismatch = 0;
90SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, parity_mismatch, CTLFLAG_RD,
91    &g_raid3_parity_mismatch, 0, "Number of failures in VERIFY mode");
92static u_int g_raid3_64k_requested = 0;
93SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 64k_requested, CTLFLAG_RD,
94    &g_raid3_64k_requested, 0, "Number of requested 64kB allocations");
95static u_int g_raid3_64k_failed = 0;
96SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 64k_failed, CTLFLAG_RD,
97    &g_raid3_64k_failed, 0, "Number of failed 64kB allocations");
98static u_int g_raid3_16k_requested = 0;
99SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 16k_requested, CTLFLAG_RD,
100    &g_raid3_16k_requested, 0, "Number of requested 16kB allocations");
101static u_int g_raid3_16k_failed = 0;
102SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 16k_failed, CTLFLAG_RD,
103    &g_raid3_16k_failed, 0, "Number of failed 16kB allocations");
104static u_int g_raid3_4k_requested = 0;
105SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 4k_requested, CTLFLAG_RD,
106    &g_raid3_4k_requested, 0, "Number of requested 4kB allocations");
107static u_int g_raid3_4k_failed = 0;
108SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 4k_failed, CTLFLAG_RD,
109    &g_raid3_4k_failed, 0, "Number of failed 4kB allocations");
110
111#define	MSLEEP(ident, mtx, priority, wmesg, timeout)	do {		\
112	G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, (ident));	\
113	msleep((ident), (mtx), (priority), (wmesg), (timeout));		\
114	G_RAID3_DEBUG(4, "%s: Woken up %p.", __func__, (ident));	\
115} while (0)
116
117static eventhandler_tag g_raid3_ehtag = NULL;
118
119static int g_raid3_destroy_geom(struct gctl_req *req, struct g_class *mp,
120    struct g_geom *gp);
121static g_taste_t g_raid3_taste;
122static void g_raid3_init(struct g_class *mp);
123static void g_raid3_fini(struct g_class *mp);
124
125struct g_class g_raid3_class = {
126	.name = G_RAID3_CLASS_NAME,
127	.version = G_VERSION,
128	.ctlreq = g_raid3_config,
129	.taste = g_raid3_taste,
130	.destroy_geom = g_raid3_destroy_geom,
131	.init = g_raid3_init,
132	.fini = g_raid3_fini
133};
134
135
136static void g_raid3_destroy_provider(struct g_raid3_softc *sc);
137static int g_raid3_update_disk(struct g_raid3_disk *disk, u_int state);
138static void g_raid3_update_device(struct g_raid3_softc *sc, boolean_t force);
139static void g_raid3_dumpconf(struct sbuf *sb, const char *indent,
140    struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp);
141static void g_raid3_sync_stop(struct g_raid3_softc *sc, int type);
142
143
144static const char *
145g_raid3_disk_state2str(int state)
146{
147
148	switch (state) {
149	case G_RAID3_DISK_STATE_NODISK:
150		return ("NODISK");
151	case G_RAID3_DISK_STATE_NONE:
152		return ("NONE");
153	case G_RAID3_DISK_STATE_NEW:
154		return ("NEW");
155	case G_RAID3_DISK_STATE_ACTIVE:
156		return ("ACTIVE");
157	case G_RAID3_DISK_STATE_STALE:
158		return ("STALE");
159	case G_RAID3_DISK_STATE_SYNCHRONIZING:
160		return ("SYNCHRONIZING");
161	case G_RAID3_DISK_STATE_DISCONNECTED:
162		return ("DISCONNECTED");
163	default:
164		return ("INVALID");
165	}
166}
167
168static const char *
169g_raid3_device_state2str(int state)
170{
171
172	switch (state) {
173	case G_RAID3_DEVICE_STATE_STARTING:
174		return ("STARTING");
175	case G_RAID3_DEVICE_STATE_DEGRADED:
176		return ("DEGRADED");
177	case G_RAID3_DEVICE_STATE_COMPLETE:
178		return ("COMPLETE");
179	default:
180		return ("INVALID");
181	}
182}
183
184const char *
185g_raid3_get_diskname(struct g_raid3_disk *disk)
186{
187
188	if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL)
189		return ("[unknown]");
190	return (disk->d_name);
191}
192
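/*
 * Parity helpers.  RAID3 keeps the byte-wise XOR of all data columns in
 * the parity column, so P = D0 ^ D1 ^ ... ^ D(n-2) and any single missing
 * data column can be rebuilt as the XOR of the parity with the remaining
 * data columns.  The loop below is unrolled to process 16 64-bit words
 * (128 bytes) per iteration, hence the requirement that the size is a
 * multiple of 128.
 */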
193#define	g_raid3_xor(src1, src2, dst, size)				\
194	_g_raid3_xor((uint64_t *)(src1), (uint64_t *)(src2),		\
195	    (uint64_t *)(dst), (size_t)size)
196static void
197_g_raid3_xor(uint64_t *src1, uint64_t *src2, uint64_t *dst, size_t size)
198{
199
200	KASSERT((size % 128) == 0, ("Invalid size: %zu.", size));
201	for (; size > 0; size -= 128) {
202		*dst++ = (*src1++) ^ (*src2++);
203		*dst++ = (*src1++) ^ (*src2++);
204		*dst++ = (*src1++) ^ (*src2++);
205		*dst++ = (*src1++) ^ (*src2++);
206		*dst++ = (*src1++) ^ (*src2++);
207		*dst++ = (*src1++) ^ (*src2++);
208		*dst++ = (*src1++) ^ (*src2++);
209		*dst++ = (*src1++) ^ (*src2++);
210		*dst++ = (*src1++) ^ (*src2++);
211		*dst++ = (*src1++) ^ (*src2++);
212		*dst++ = (*src1++) ^ (*src2++);
213		*dst++ = (*src1++) ^ (*src2++);
214		*dst++ = (*src1++) ^ (*src2++);
215		*dst++ = (*src1++) ^ (*src2++);
216		*dst++ = (*src1++) ^ (*src2++);
217		*dst++ = (*src1++) ^ (*src2++);
218	}
219}
220
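/*
 * Return 1 if the bio's data buffer contains only zeros.  Used by the
 * VERIFY read path: XOR-ing all columns, data and parity alike, must give
 * all zeros, otherwise the parity does not match the data.
 */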
221static int
222g_raid3_is_zero(struct bio *bp)
223{
224	static const uint64_t zeros[] = {
225	    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
226	};
227	u_char *addr;
228	ssize_t size;
229
230	size = bp->bio_length;
231	addr = (u_char *)bp->bio_data;
232	for (; size > 0; size -= sizeof(zeros), addr += sizeof(zeros)) {
233		if (bcmp(addr, zeros, sizeof(zeros)) != 0)
234			return (0);
235	}
236	return (1);
237}
238
239/*
240 * --- Event handling functions ---
241 * Events in geom_raid3 are used to maintain disk and device state
242 * from a single thread, which simplifies locking.
243 */
244static void
245g_raid3_event_free(struct g_raid3_event *ep)
246{
247
248	free(ep, M_RAID3);
249}
250
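/*
 * Queue an event for the worker thread and wake it up.  The argument is
 * either a disk or, when G_RAID3_EVENT_DEVICE is set in 'flags', the softc
 * itself.  Unless G_RAID3_EVENT_DONTWAIT is given, the caller drops the
 * topology lock and sleeps until the worker marks the event as done, and
 * the event's error status is returned.
 */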
251int
252g_raid3_event_send(void *arg, int state, int flags)
253{
254	struct g_raid3_softc *sc;
255	struct g_raid3_disk *disk;
256	struct g_raid3_event *ep;
257	int error;
258
259	ep = malloc(sizeof(*ep), M_RAID3, M_WAITOK);
260	G_RAID3_DEBUG(4, "%s: Sending event %p.", __func__, ep);
261	if ((flags & G_RAID3_EVENT_DEVICE) != 0) {
262		disk = NULL;
263		sc = arg;
264	} else {
265		disk = arg;
266		sc = disk->d_softc;
267	}
268	ep->e_disk = disk;
269	ep->e_state = state;
270	ep->e_flags = flags;
271	ep->e_error = 0;
272	mtx_lock(&sc->sc_events_mtx);
273	TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next);
274	mtx_unlock(&sc->sc_events_mtx);
275	G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc);
276	mtx_lock(&sc->sc_queue_mtx);
277	wakeup(sc);
278	wakeup(&sc->sc_queue);
279	mtx_unlock(&sc->sc_queue_mtx);
280	if ((flags & G_RAID3_EVENT_DONTWAIT) != 0)
281		return (0);
282	g_topology_assert();
283	G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, ep);
284	g_topology_unlock();
285	while ((ep->e_flags & G_RAID3_EVENT_DONE) == 0) {
286		mtx_lock(&sc->sc_events_mtx);
287		MSLEEP(ep, &sc->sc_events_mtx, PRIBIO | PDROP, "r3:event",
288		    hz * 5);
289	}
290	/* Don't even try to use 'sc' here, because it could already be dead. */
291	g_topology_lock();
292	error = ep->e_error;
293	g_raid3_event_free(ep);
294	return (error);
295}
296
297static struct g_raid3_event *
298g_raid3_event_get(struct g_raid3_softc *sc)
299{
300	struct g_raid3_event *ep;
301
302	mtx_lock(&sc->sc_events_mtx);
303	ep = TAILQ_FIRST(&sc->sc_events);
304	mtx_unlock(&sc->sc_events_mtx);
305	return (ep);
306}
307
308static void
309g_raid3_event_remove(struct g_raid3_softc *sc, struct g_raid3_event *ep)
310{
311
312	mtx_lock(&sc->sc_events_mtx);
313	TAILQ_REMOVE(&sc->sc_events, ep, e_next);
314	mtx_unlock(&sc->sc_events_mtx);
315}
316
317static void
318g_raid3_event_cancel(struct g_raid3_disk *disk)
319{
320	struct g_raid3_softc *sc;
321	struct g_raid3_event *ep, *tmpep;
322
323	g_topology_assert();
324
325	sc = disk->d_softc;
326	mtx_lock(&sc->sc_events_mtx);
327	TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) {
328		if ((ep->e_flags & G_RAID3_EVENT_DEVICE) != 0)
329			continue;
330		if (ep->e_disk != disk)
331			continue;
332		TAILQ_REMOVE(&sc->sc_events, ep, e_next);
333		if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0)
334			g_raid3_event_free(ep);
335		else {
336			ep->e_error = ECANCELED;
337			wakeup(ep);
338		}
339	}
340	mtx_unlock(&sc->sc_events_mtx);
341}
342
343/*
344 * Return the number of disks in the given state.
345 * If state is equal to -1, count all connected disks.
346 */
347u_int
348g_raid3_ndisks(struct g_raid3_softc *sc, int state)
349{
350	struct g_raid3_disk *disk;
351	u_int n, ndisks;
352
353	for (n = ndisks = 0; n < sc->sc_ndisks; n++) {
354		disk = &sc->sc_disks[n];
355		if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
356			continue;
357		if (state == -1 || disk->d_state == state)
358			ndisks++;
359	}
360	return (ndisks);
361}
362
363static u_int
364g_raid3_nrequests(struct g_raid3_softc *sc, struct g_consumer *cp)
365{
366	struct bio *bp;
367	u_int nreqs = 0;
368
369	mtx_lock(&sc->sc_queue_mtx);
370	TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) {
371		if (bp->bio_from == cp)
372			nreqs++;
373	}
374	mtx_unlock(&sc->sc_queue_mtx);
375	return (nreqs);
376}
377
378static int
379g_raid3_is_busy(struct g_raid3_softc *sc, struct g_consumer *cp)
380{
381
382	if (cp->index > 0) {
383		G_RAID3_DEBUG(2,
384		    "I/O requests for %s exist, can't destroy it now.",
385		    cp->provider->name);
386		return (1);
387	}
388	if (g_raid3_nrequests(sc, cp) > 0) {
389		G_RAID3_DEBUG(2,
390		    "I/O requests for %s in queue, can't destroy it now.",
391		    cp->provider->name);
392		return (1);
393	}
394	return (0);
395}
396
397static void
398g_raid3_destroy_consumer(void *arg, int flags __unused)
399{
400	struct g_consumer *cp;
401
402	cp = arg;
403	G_RAID3_DEBUG(1, "Consumer %s destroyed.", cp->provider->name);
404	g_detach(cp);
405	g_destroy_consumer(cp);
406}
407
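/*
 * Close and destroy the given consumer.  Closing a consumer that was open
 * for writing triggers a retaste of the provider inside g_access(); by
 * staying attached until after that (the detach/destroy is deferred to
 * g_raid3_destroy_consumer() via an event) this class never sees the
 * retaste of its own component.  If the consumer still has requests in
 * flight, only its private pointer is cleared and the request completion
 * path finishes the job later.
 */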
408static void
409g_raid3_kill_consumer(struct g_raid3_softc *sc, struct g_consumer *cp)
410{
411	struct g_provider *pp;
412	int retaste_wait;
413
414	g_topology_assert();
415
416	cp->private = NULL;
417	if (g_raid3_is_busy(sc, cp))
418		return;
419	G_RAID3_DEBUG(2, "Consumer %s destroyed.", cp->provider->name);
420	pp = cp->provider;
421	retaste_wait = 0;
422	if (cp->acw == 1) {
423		if ((pp->geom->flags & G_GEOM_WITHER) == 0)
424			retaste_wait = 1;
425	}
426	G_RAID3_DEBUG(2, "Access %s r%dw%de%d = %d", pp->name, -cp->acr,
427	    -cp->acw, -cp->ace, 0);
428	if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0)
429		g_access(cp, -cp->acr, -cp->acw, -cp->ace);
430	if (retaste_wait) {
431		/*
432		 * After the retaste event was sent (inside g_access()), we can
433		 * post an event to detach and destroy the consumer.
434		 * A class which still has a consumer attached to the given
435		 * provider will not receive a retaste event for that provider.
436		 * This is how retaste events are ignored when consumers opened
437		 * for write are closed: the consumer is detached and destroyed
438		 * only after the retaste event has been sent.
439		 */
440		g_post_event(g_raid3_destroy_consumer, cp, M_WAITOK, NULL);
441		return;
442	}
443	G_RAID3_DEBUG(1, "Consumer %s destroyed.", pp->name);
444	g_detach(cp);
445	g_destroy_consumer(cp);
446}
447
448static int
449g_raid3_connect_disk(struct g_raid3_disk *disk, struct g_provider *pp)
450{
451	int error;
452
453	g_topology_assert();
454	KASSERT(disk->d_consumer == NULL,
455	    ("Disk already connected (device %s).", disk->d_softc->sc_name));
456
457	disk->d_consumer = g_new_consumer(disk->d_softc->sc_geom);
458	disk->d_consumer->private = disk;
459	disk->d_consumer->index = 0;
460	error = g_attach(disk->d_consumer, pp);
461	if (error != 0)
462		return (error);
463	error = g_access(disk->d_consumer, 1, 1, 1);
464	if (error != 0) {
465		G_RAID3_DEBUG(0, "Cannot open consumer %s (error=%d).",
466		    pp->name, error);
467		return (error);
468	}
469	G_RAID3_DEBUG(2, "Disk %s connected.", g_raid3_get_diskname(disk));
470	return (0);
471}
472
473static void
474g_raid3_disconnect_consumer(struct g_raid3_softc *sc, struct g_consumer *cp)
475{
476
477	g_topology_assert();
478
479	if (cp == NULL)
480		return;
481	if (cp->provider != NULL)
482		g_raid3_kill_consumer(sc, cp);
483	else
484		g_destroy_consumer(cp);
485}
486
487/*
488 * Initialize the disk: create a consumer, attach it to the provider and
489 * open access (r1w1e1) to it.
490 */
491static struct g_raid3_disk *
492g_raid3_init_disk(struct g_raid3_softc *sc, struct g_provider *pp,
493    struct g_raid3_metadata *md, int *errorp)
494{
495	struct g_raid3_disk *disk;
496	int error;
497
498	disk = &sc->sc_disks[md->md_no];
499	error = g_raid3_connect_disk(disk, pp);
500	if (error != 0)
501		goto fail;
502	disk->d_state = G_RAID3_DISK_STATE_NONE;
503	disk->d_flags = md->md_dflags;
504	if (md->md_provider[0] != '\0')
505		disk->d_flags |= G_RAID3_DISK_FLAG_HARDCODED;
506	disk->d_sync.ds_consumer = NULL;
507	disk->d_sync.ds_offset = md->md_sync_offset;
508	disk->d_sync.ds_offset_done = md->md_sync_offset;
509	disk->d_sync.ds_resync = -1;
510	disk->d_genid = md->md_genid;
511	disk->d_sync.ds_syncid = md->md_syncid;
512	if (errorp != NULL)
513		*errorp = 0;
514	return (disk);
515fail:
516	if (errorp != NULL)
517		*errorp = error;
518	if (disk != NULL)
519		g_raid3_disconnect_consumer(sc, disk->d_consumer);
520	return (NULL);
521}
522
523static void
524g_raid3_destroy_disk(struct g_raid3_disk *disk)
525{
526	struct g_raid3_softc *sc;
527
528	g_topology_assert();
529
530	if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
531		return;
532	g_raid3_event_cancel(disk);
533	sc = disk->d_softc;
534	switch (disk->d_state) {
535	case G_RAID3_DISK_STATE_SYNCHRONIZING:
536		if (sc->sc_syncdisk != NULL)
537			g_raid3_sync_stop(sc, 1);
538		/* FALLTHROUGH */
539	case G_RAID3_DISK_STATE_NEW:
540	case G_RAID3_DISK_STATE_STALE:
541	case G_RAID3_DISK_STATE_ACTIVE:
542		g_raid3_disconnect_consumer(sc, disk->d_consumer);
543		disk->d_consumer = NULL;
544		break;
545	default:
546		KASSERT(0 == 1, ("Wrong disk state (%s, %s).",
547		    g_raid3_get_diskname(disk),
548		    g_raid3_disk_state2str(disk->d_state)));
549	}
550	disk->d_state = G_RAID3_DISK_STATE_NODISK;
551}
552
553static void
554g_raid3_destroy_device(struct g_raid3_softc *sc)
555{
556	struct g_raid3_event *ep;
557	struct g_raid3_disk *disk;
558	struct g_geom *gp;
559	struct g_consumer *cp;
560	u_int n;
561
562	g_topology_assert();
563
564	gp = sc->sc_geom;
565	if (sc->sc_provider != NULL)
566		g_raid3_destroy_provider(sc);
567	for (n = 0; n < sc->sc_ndisks; n++) {
568		disk = &sc->sc_disks[n];
569		if (disk->d_state != G_RAID3_DISK_STATE_NODISK) {
570			disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
571			g_raid3_update_metadata(disk);
572			g_raid3_destroy_disk(disk);
573		}
574	}
575	while ((ep = g_raid3_event_get(sc)) != NULL) {
576		g_raid3_event_remove(sc, ep);
577		if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0)
578			g_raid3_event_free(ep);
579		else {
580			ep->e_error = ECANCELED;
581			ep->e_flags |= G_RAID3_EVENT_DONE;
582			G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, ep);
583			mtx_lock(&sc->sc_events_mtx);
584			wakeup(ep);
585			mtx_unlock(&sc->sc_events_mtx);
586		}
587	}
588	callout_drain(&sc->sc_callout);
589	gp->softc = NULL;
590	cp = LIST_FIRST(&sc->sc_sync.ds_geom->consumer);
591	if (cp != NULL)
592		g_raid3_disconnect_consumer(sc, cp);
593	sc->sc_sync.ds_geom->softc = NULL;
594	g_wither_geom(sc->sc_sync.ds_geom, ENXIO);
595	uma_zdestroy(sc->sc_zone_64k);
596	uma_zdestroy(sc->sc_zone_16k);
597	uma_zdestroy(sc->sc_zone_4k);
598	mtx_destroy(&sc->sc_queue_mtx);
599	mtx_destroy(&sc->sc_events_mtx);
600	G_RAID3_DEBUG(0, "Device %s destroyed.", gp->name);
601	g_wither_geom(gp, ENXIO);
602}
603
604static void
605g_raid3_orphan(struct g_consumer *cp)
606{
607	struct g_raid3_disk *disk;
608
609	g_topology_assert();
610
611	disk = cp->private;
612	if (disk == NULL)
613		return;
614	disk->d_softc->sc_bump_id = G_RAID3_BUMP_SYNCID;
615	g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED,
616	    G_RAID3_EVENT_DONTWAIT);
617}
618
619static int
620g_raid3_write_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md)
621{
622	struct g_raid3_softc *sc;
623	struct g_consumer *cp;
624	off_t offset, length;
625	u_char *sector;
626	int error = 0;
627
628	g_topology_assert();
629
630	sc = disk->d_softc;
631	cp = disk->d_consumer;
632	KASSERT(cp != NULL, ("NULL consumer (%s).", sc->sc_name));
633	KASSERT(cp->provider != NULL, ("NULL provider (%s).", sc->sc_name));
634	KASSERT(cp->acr == 1 && cp->acw == 1 && cp->ace == 1,
635	    ("Consumer %s closed? (r%dw%de%d).", cp->provider->name, cp->acr,
636	    cp->acw, cp->ace));
637	length = cp->provider->sectorsize;
638	offset = cp->provider->mediasize - length;
639	sector = malloc((size_t)length, M_RAID3, M_WAITOK | M_ZERO);
640	if (md != NULL)
641		raid3_metadata_encode(md, sector);
642	g_topology_unlock();
643	error = g_write_data(cp, offset, sector, length);
644	g_topology_lock();
645	free(sector, M_RAID3);
646	if (error != 0) {
647		disk->d_softc->sc_bump_id = G_RAID3_BUMP_GENID;
648		g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED,
649		    G_RAID3_EVENT_DONTWAIT);
650	}
651	return (error);
652}
653
654int
655g_raid3_clear_metadata(struct g_raid3_disk *disk)
656{
657	int error;
658
659	g_topology_assert();
660	error = g_raid3_write_metadata(disk, NULL);
661	if (error == 0) {
662		G_RAID3_DEBUG(2, "Metadata on %s cleared.",
663		    g_raid3_get_diskname(disk));
664	} else {
665		G_RAID3_DEBUG(0,
666		    "Cannot clear metadata on disk %s (error=%d).",
667		    g_raid3_get_diskname(disk), error);
668	}
669	return (error);
670}
671
672void
673g_raid3_fill_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md)
674{
675	struct g_raid3_softc *sc;
676	struct g_provider *pp;
677
678	sc = disk->d_softc;
679	strlcpy(md->md_magic, G_RAID3_MAGIC, sizeof(md->md_magic));
680	md->md_version = G_RAID3_VERSION;
681	strlcpy(md->md_name, sc->sc_name, sizeof(md->md_name));
682	md->md_id = sc->sc_id;
683	md->md_all = sc->sc_ndisks;
684	md->md_genid = sc->sc_genid;
685	md->md_mediasize = sc->sc_mediasize;
686	md->md_sectorsize = sc->sc_sectorsize;
687	md->md_mflags = (sc->sc_flags & G_RAID3_DEVICE_FLAG_MASK);
688	md->md_no = disk->d_no;
689	md->md_syncid = disk->d_sync.ds_syncid;
690	md->md_dflags = (disk->d_flags & G_RAID3_DISK_FLAG_MASK);
691	if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING)
692		md->md_sync_offset = disk->d_sync.ds_offset_done;
693	else
694		md->md_sync_offset = 0;
695	if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL)
696		pp = disk->d_consumer->provider;
697	else
698		pp = NULL;
699	if ((disk->d_flags & G_RAID3_DISK_FLAG_HARDCODED) != 0 && pp != NULL)
700		strlcpy(md->md_provider, pp->name, sizeof(md->md_provider));
701	else
702		bzero(md->md_provider, sizeof(md->md_provider));
703	if (pp != NULL)
704		md->md_provsize = pp->mediasize;
705	else
706		md->md_provsize = 0;
707}
708
709void
710g_raid3_update_metadata(struct g_raid3_disk *disk)
711{
712	struct g_raid3_metadata md;
713	int error;
714
715	g_topology_assert();
716	g_raid3_fill_metadata(disk, &md);
717	error = g_raid3_write_metadata(disk, &md);
718	if (error == 0) {
719		G_RAID3_DEBUG(2, "Metadata on %s updated.",
720		    g_raid3_get_diskname(disk));
721	} else {
722		G_RAID3_DEBUG(0,
723		    "Cannot update metadata on disk %s (error=%d).",
724		    g_raid3_get_diskname(disk), error);
725	}
726}
727
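/*
 * Roughly speaking, the syncid is used to detect components that missed
 * regular writes (it is bumped on the first write issued after a component
 * went away, see g_raid3_register_request()), while the genid is bumped
 * after I/O or metadata errors so that a broken component can be told
 * apart from the healthy ones later on.
 */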
728static void
729g_raid3_bump_syncid(struct g_raid3_softc *sc)
730{
731	struct g_raid3_disk *disk;
732	u_int n;
733
734	g_topology_assert();
735	KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) > 0,
736	    ("%s called with no active disks (device=%s).", __func__,
737	    sc->sc_name));
738
739	sc->sc_syncid++;
740	G_RAID3_DEBUG(1, "Device %s: syncid bumped to %u.", sc->sc_name,
741	    sc->sc_syncid);
742	for (n = 0; n < sc->sc_ndisks; n++) {
743		disk = &sc->sc_disks[n];
744		if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
745		    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
746			disk->d_sync.ds_syncid = sc->sc_syncid;
747			g_raid3_update_metadata(disk);
748		}
749	}
750}
751
752static void
753g_raid3_bump_genid(struct g_raid3_softc *sc)
754{
755	struct g_raid3_disk *disk;
756	u_int n;
757
758	g_topology_assert();
759	KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) > 0,
760	    ("%s called with no active disks (device=%s).", __func__,
761	    sc->sc_name));
762
763	sc->sc_genid++;
764	G_RAID3_DEBUG(1, "Device %s: genid bumped to %u.", sc->sc_name,
765	    sc->sc_genid);
766	for (n = 0; n < sc->sc_ndisks; n++) {
767		disk = &sc->sc_disks[n];
768		if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
769		    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
770			disk->d_genid = sc->sc_genid;
771			g_raid3_update_metadata(disk);
772		}
773	}
774}
775
776static void
777g_raid3_idle(struct g_raid3_softc *sc)
778{
779	struct g_raid3_disk *disk;
780	u_int i;
781
782	if (sc->sc_provider == NULL || sc->sc_provider->acw == 0)
783		return;
784	sc->sc_idle = 1;
785	g_topology_lock();
786	for (i = 0; i < sc->sc_ndisks; i++) {
787		disk = &sc->sc_disks[i];
788		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
789			continue;
790		G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.",
791		    g_raid3_get_diskname(disk), sc->sc_name);
792		disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
793		g_raid3_update_metadata(disk);
794	}
795	g_topology_unlock();
796}
797
798static void
799g_raid3_unidle(struct g_raid3_softc *sc)
800{
801	struct g_raid3_disk *disk;
802	u_int i;
803
804	sc->sc_idle = 0;
805	g_topology_lock();
806	for (i = 0; i < sc->sc_ndisks; i++) {
807		disk = &sc->sc_disks[i];
808		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
809			continue;
810		G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.",
811		    g_raid3_get_diskname(disk), sc->sc_name);
812		disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
813		g_raid3_update_metadata(disk);
814	}
815	g_topology_unlock();
816}
817
818/*
819 * Return 1 if we should check whether the RAID3 device is idling.
820 */
821static int
822g_raid3_check_idle(struct g_raid3_softc *sc)
823{
824	struct g_raid3_disk *disk;
825	u_int i;
826
827	if (sc->sc_idle)
828		return (0);
829	if (sc->sc_provider != NULL && sc->sc_provider->acw == 0)
830		return (0);
831	/*
832	 * Check if there are no in-flight requests.
833	 */
834	for (i = 0; i < sc->sc_ndisks; i++) {
835		disk = &sc->sc_disks[i];
836		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
837			continue;
838		if (disk->d_consumer->index > 0)
839			return (0);
840	}
841	return (1);
842}
843
844/*
845 * The bio_driver1 field in the parent bio is used as the list head and
846 * the bio_caller1 field in each child bio points to the next element.
847 */
848#define	G_RAID3_HEAD_BIO(pbp)	(pbp)->bio_driver1
849
850#define	G_RAID3_NEXT_BIO(cbp)	(cbp)->bio_caller1
851
852#define	G_RAID3_FOREACH_BIO(pbp, bp)					\
853	for ((bp) = G_RAID3_HEAD_BIO(pbp); (bp) != NULL;		\
854	    (bp) = G_RAID3_NEXT_BIO(bp))
855
856#define	G_RAID3_FOREACH_SAFE_BIO(pbp, bp, tmpbp)			\
857	for ((bp) = G_RAID3_HEAD_BIO(pbp);				\
858	    (bp) != NULL && ((tmpbp) = G_RAID3_NEXT_BIO(bp), 1);	\
859	    (bp) = (tmpbp))
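/*
 * For example, a parent bio with three cloned children is linked as:
 *
 *	pbp->bio_driver1 -> cbp0
 *	cbp0->bio_caller1 -> cbp1
 *	cbp1->bio_caller1 -> cbp2
 *	cbp2->bio_caller1 -> NULL
 */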
860
861static void
862g_raid3_init_bio(struct bio *pbp)
863{
864
865	G_RAID3_HEAD_BIO(pbp) = NULL;
866}
867
868static void
869g_raid3_remove_bio(struct bio *cbp)
870{
871	struct bio *pbp, *bp;
872
873	pbp = cbp->bio_parent;
874	if (G_RAID3_HEAD_BIO(pbp) == cbp)
875		G_RAID3_HEAD_BIO(pbp) = G_RAID3_NEXT_BIO(cbp);
876	else {
877		G_RAID3_FOREACH_BIO(pbp, bp) {
878			if (G_RAID3_NEXT_BIO(bp) == cbp) {
879				G_RAID3_NEXT_BIO(bp) = G_RAID3_NEXT_BIO(cbp);
880				break;
881			}
882		}
883	}
884	G_RAID3_NEXT_BIO(cbp) = NULL;
885}
886
887static void
888g_raid3_replace_bio(struct bio *sbp, struct bio *dbp)
889{
890	struct bio *pbp, *bp;
891
892	g_raid3_remove_bio(sbp);
893	pbp = dbp->bio_parent;
894	G_RAID3_NEXT_BIO(sbp) = G_RAID3_NEXT_BIO(dbp);
895	if (G_RAID3_HEAD_BIO(pbp) == dbp)
896		G_RAID3_HEAD_BIO(pbp) = sbp;
897	else {
898		G_RAID3_FOREACH_BIO(pbp, bp) {
899			if (G_RAID3_NEXT_BIO(bp) == dbp) {
900				G_RAID3_NEXT_BIO(bp) = sbp;
901				break;
902			}
903		}
904	}
905	G_RAID3_NEXT_BIO(dbp) = NULL;
906}
907
908static void
909g_raid3_destroy_bio(struct g_raid3_softc *sc, struct bio *cbp)
910{
911	struct bio *bp, *pbp;
912	size_t size;
913
914	pbp = cbp->bio_parent;
915	pbp->bio_children--;
916	KASSERT(cbp->bio_data != NULL, ("NULL bio_data"));
917	size = pbp->bio_length / (sc->sc_ndisks - 1);
918	if (size > 16384)
919		uma_zfree(sc->sc_zone_64k, cbp->bio_data);
920	else if (size > 4096)
921		uma_zfree(sc->sc_zone_16k, cbp->bio_data);
922	else
923		uma_zfree(sc->sc_zone_4k, cbp->bio_data);
924	if (G_RAID3_HEAD_BIO(pbp) == cbp) {
925		G_RAID3_HEAD_BIO(pbp) = G_RAID3_NEXT_BIO(cbp);
926		G_RAID3_NEXT_BIO(cbp) = NULL;
927		g_destroy_bio(cbp);
928	} else {
929		G_RAID3_FOREACH_BIO(pbp, bp) {
930			if (G_RAID3_NEXT_BIO(bp) == cbp)
931				break;
932		}
933		if (bp != NULL) {
934			KASSERT(G_RAID3_NEXT_BIO(bp) != NULL,
935			    ("NULL bp->bio_driver1"));
936			G_RAID3_NEXT_BIO(bp) = G_RAID3_NEXT_BIO(cbp);
937			G_RAID3_NEXT_BIO(cbp) = NULL;
938		}
939		g_destroy_bio(cbp);
940	}
941}
942
943static struct bio *
944g_raid3_clone_bio(struct g_raid3_softc *sc, struct bio *pbp)
945{
946	struct bio *bp, *cbp;
947	size_t size;
948
949	cbp = g_clone_bio(pbp);
950	if (cbp == NULL)
951		return (NULL);
952	size = pbp->bio_length / (sc->sc_ndisks - 1);
953	if (size > 16384) {
954		cbp->bio_data = uma_zalloc(sc->sc_zone_64k, M_NOWAIT);
955		g_raid3_64k_requested++;
956	} else if (size > 4096) {
957		cbp->bio_data = uma_zalloc(sc->sc_zone_16k, M_NOWAIT);
958		g_raid3_16k_requested++;
959	} else {
960		cbp->bio_data = uma_zalloc(sc->sc_zone_4k, M_NOWAIT);
961		g_raid3_4k_requested++;
962	}
963	if (cbp->bio_data == NULL) {
964		if (size > 16384)
965			g_raid3_64k_failed++;
966		else if (size > 4096)
967			g_raid3_16k_failed++;
968		else
969			g_raid3_4k_failed++;
970		pbp->bio_children--;
971		g_destroy_bio(cbp);
972		return (NULL);
973	}
974	G_RAID3_NEXT_BIO(cbp) = NULL;
975	if (G_RAID3_HEAD_BIO(pbp) == NULL)
976		G_RAID3_HEAD_BIO(pbp) = cbp;
977	else {
978		G_RAID3_FOREACH_BIO(pbp, bp) {
979			if (G_RAID3_NEXT_BIO(bp) == NULL) {
980				G_RAID3_NEXT_BIO(bp) = cbp;
981				break;
982			}
983		}
984	}
985	return (cbp);
986}
987
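/*
 * Send a regular WRITE/DELETE request down to the components.  Every
 * provider sector of the parent request is split into (sc_ndisks - 1)
 * atoms of sc_sectorsize / (sc_ndisks - 1) bytes, one atom per data
 * component, and the parity component receives the XOR of the data atoms.
 * For example, with three components each parent sector is split in half,
 * one half per data component, while the third (parity) component stores
 * the XOR of the two halves.
 */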
988static void
989g_raid3_scatter(struct bio *pbp)
990{
991	struct g_raid3_softc *sc;
992	struct g_raid3_disk *disk;
993	struct bio *bp, *cbp;
994	off_t atom, cadd, padd, left;
995
996	sc = pbp->bio_to->geom->softc;
997	bp = NULL;
998	if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_NOPARITY) == 0) {
999		/*
1000		 * Find bio for which we should calculate data.
1001		 */
1002		G_RAID3_FOREACH_BIO(pbp, cbp) {
1003			if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) {
1004				bp = cbp;
1005				break;
1006			}
1007		}
1008		KASSERT(bp != NULL, ("NULL parity bio."));
1009	}
1010	atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
1011	cadd = padd = 0;
1012	for (left = pbp->bio_length; left > 0; left -= sc->sc_sectorsize) {
1013		G_RAID3_FOREACH_BIO(pbp, cbp) {
1014			if (cbp == bp)
1015				continue;
1016			bcopy(pbp->bio_data + padd, cbp->bio_data + cadd, atom);
1017			padd += atom;
1018		}
1019		cadd += atom;
1020	}
1021	if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_NOPARITY) == 0) {
1022		struct bio *tmpbp;
1023
1024		/*
1025		 * Calculate parity.
1026		 */
1027		bzero(bp->bio_data, bp->bio_length);
1028		G_RAID3_FOREACH_SAFE_BIO(pbp, cbp, tmpbp) {
1029			if (cbp == bp)
1030				continue;
1031			g_raid3_xor(cbp->bio_data, bp->bio_data, bp->bio_data,
1032			    bp->bio_length);
1033			if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_NODISK) != 0)
1034				g_raid3_destroy_bio(sc, cbp);
1035		}
1036	}
1037	G_RAID3_FOREACH_BIO(pbp, cbp) {
1038		struct g_consumer *cp;
1039
1040		disk = cbp->bio_caller2;
1041		cp = disk->d_consumer;
1042		cbp->bio_to = cp->provider;
1043		G_RAID3_LOGREQ(3, cbp, "Sending request.");
1044		KASSERT(cp->acr == 1 && cp->acw == 1 && cp->ace == 1,
1045		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
1046		    cp->acr, cp->acw, cp->ace));
1047		cp->index++;
1048		g_io_request(cbp, cp);
1049	}
1050}
1051
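/*
 * Complete a regular READ request.  Child bios are checked for errors; in
 * COMPLETE mode a single failed component is handled by sending the same
 * request to the parity component and reconstructing the missing column
 * with XOR.  The atoms from the data columns are then copied back into the
 * parent's buffer using the same layout as g_raid3_scatter().
 */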
1052static void
1053g_raid3_gather(struct bio *pbp)
1054{
1055	struct g_raid3_softc *sc;
1056	struct g_raid3_disk *disk;
1057	struct bio *xbp, *fbp, *cbp;
1058	off_t atom, cadd, padd, left;
1059
1060	sc = pbp->bio_to->geom->softc;
1061	/*
1062	 * Find the bio for which we have to reconstruct data.
1063	 * While walking the list, check whether all requests succeeded;
1064	 * if not, fail the whole request.
1065	 * In COMPLETE mode we allow one request to fail, and if we find
1066	 * one, we resend it to the parity consumer.
1067	 * If more than one request failed, fail the whole request.
1068	 */
1069	xbp = fbp = NULL;
1070	G_RAID3_FOREACH_BIO(pbp, cbp) {
1071		if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) {
1072			KASSERT(xbp == NULL, ("More than one parity bio."));
1073			xbp = cbp;
1074		}
1075		if (cbp->bio_error == 0)
1076			continue;
1077		/*
1078		 * Found failed request.
1079		 */
1080		G_RAID3_LOGREQ(0, cbp, "Request failed.");
1081		disk = cbp->bio_caller2;
1082		if (disk != NULL) {
1083			/*
1084			 * Bumping the genid here is actually pointless,
1085			 * because the whole device is already broken.
1086			 */
1087			sc->sc_bump_id |= G_RAID3_BUMP_GENID;
1088			g_raid3_event_send(disk,
1089			    G_RAID3_DISK_STATE_DISCONNECTED,
1090			    G_RAID3_EVENT_DONTWAIT);
1091		}
1092		if (fbp == NULL) {
1093			if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_DEGRADED) != 0) {
1094				/*
1095				 * We are already in degraded mode, so we can't
1096				 * accept any failures.
1097				 */
1098				if (pbp->bio_error == 0)
1099					pbp->bio_error = cbp->bio_error;
1100			} else {
1101				fbp = cbp;
1102			}
1103		} else {
1104			/*
1105			 * Next failed request, that's too many.
1106			 */
1107			if (pbp->bio_error == 0)
1108				pbp->bio_error = fbp->bio_error;
1109		}
1110	}
1111	if (pbp->bio_error != 0)
1112		goto finish;
1113	if (fbp != NULL && (pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) {
1114		pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_VERIFY;
1115		if (xbp != fbp)
1116			g_raid3_replace_bio(xbp, fbp);
1117		g_raid3_destroy_bio(sc, fbp);
1118	} else if (fbp != NULL) {
1119		struct g_consumer *cp;
1120
1121		/*
1122		 * One request failed, so send the same request to
1123		 * the parity consumer.
1124		 */
1125		disk = pbp->bio_driver2;
1126		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) {
1127			pbp->bio_error = fbp->bio_error;
1128			goto finish;
1129		}
1130		pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED;
1131		pbp->bio_inbed--;
1132		fbp->bio_flags &= ~(BIO_DONE | BIO_ERROR);
1133		if (disk->d_no == sc->sc_ndisks - 1)
1134			fbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
1135		fbp->bio_error = 0;
1136		fbp->bio_completed = 0;
1137		fbp->bio_children = 0;
1138		fbp->bio_inbed = 0;
1139		cp = disk->d_consumer;
1140		fbp->bio_caller2 = disk;
1141		fbp->bio_to = cp->provider;
1142		G_RAID3_LOGREQ(3, fbp, "Sending request (recover).");
1143		KASSERT(cp->acr == 1 && cp->acw == 1 && cp->ace == 1,
1144		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
1145		    cp->acr, cp->acw, cp->ace));
1146		cp->index++;
1147		g_io_request(fbp, cp);
1148		return;
1149	}
1150	if (xbp != NULL) {
1151		/*
1152		 * Calculate parity.
1153		 */
1154		G_RAID3_FOREACH_BIO(pbp, cbp) {
1155			if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0)
1156				continue;
1157			g_raid3_xor(cbp->bio_data, xbp->bio_data, xbp->bio_data,
1158			    xbp->bio_length);
1159		}
1160		xbp->bio_cflags &= ~G_RAID3_BIO_CFLAG_PARITY;
1161		if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) {
1162			if (!g_raid3_is_zero(xbp)) {
1163				g_raid3_parity_mismatch++;
1164				pbp->bio_error = EIO;
1165				goto finish;
1166			}
1167			g_raid3_destroy_bio(sc, xbp);
1168		}
1169	}
1170	atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
1171	cadd = padd = 0;
1172	for (left = pbp->bio_length; left > 0; left -= sc->sc_sectorsize) {
1173		G_RAID3_FOREACH_BIO(pbp, cbp) {
1174			bcopy(cbp->bio_data + cadd, pbp->bio_data + padd, atom);
1175			pbp->bio_completed += atom;
1176			padd += atom;
1177		}
1178		cadd += atom;
1179	}
1180finish:
1181	if (pbp->bio_error == 0)
1182		G_RAID3_LOGREQ(3, pbp, "Request finished.");
1183	else {
1184		if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0)
1185			G_RAID3_LOGREQ(1, pbp, "Verification error.");
1186		else
1187			G_RAID3_LOGREQ(0, pbp, "Request failed.");
1188	}
1189	pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_MASK;
1190	g_io_deliver(pbp, pbp->bio_error);
1191	while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL)
1192		g_raid3_destroy_bio(sc, cbp);
1193}
1194
1195static void
1196g_raid3_done(struct bio *bp)
1197{
1198	struct g_raid3_softc *sc;
1199
1200	sc = bp->bio_from->geom->softc;
1201	bp->bio_cflags |= G_RAID3_BIO_CFLAG_REGULAR;
1202	G_RAID3_LOGREQ(3, bp, "Regular request done (error=%d).", bp->bio_error);
1203	mtx_lock(&sc->sc_queue_mtx);
1204	bioq_insert_head(&sc->sc_queue, bp);
1205	wakeup(sc);
1206	wakeup(&sc->sc_queue);
1207	mtx_unlock(&sc->sc_queue_mtx);
1208}
1209
1210static void
1211g_raid3_regular_request(struct bio *cbp)
1212{
1213	struct g_raid3_softc *sc;
1214	struct g_raid3_disk *disk;
1215	struct bio *pbp;
1216
1217	g_topology_assert_not();
1218
1219	cbp->bio_from->index--;
1220	pbp = cbp->bio_parent;
1221	sc = pbp->bio_to->geom->softc;
1222	disk = cbp->bio_from->private;
1223	if (disk == NULL) {
1224		g_topology_lock();
1225		g_raid3_kill_consumer(sc, cbp->bio_from);
1226		g_topology_unlock();
1227	}
1228
1229	G_RAID3_LOGREQ(3, cbp, "Request finished.");
1230	pbp->bio_inbed++;
1231	KASSERT(pbp->bio_inbed <= pbp->bio_children,
1232	    ("bio_inbed (%u) is bigger than bio_children (%u).", pbp->bio_inbed,
1233	    pbp->bio_children));
1234	if (pbp->bio_inbed != pbp->bio_children)
1235		return;
1236	switch (pbp->bio_cmd) {
1237	case BIO_READ:
1238		g_raid3_gather(pbp);
1239		break;
1240	case BIO_WRITE:
1241	case BIO_DELETE:
1242	    {
1243		int error = 0;
1244
1245		pbp->bio_completed = pbp->bio_length;
1246		while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL) {
1247			if (cbp->bio_error != 0) {
1248				disk = cbp->bio_caller2;
1249				if (disk != NULL) {
1250					sc->sc_bump_id |= G_RAID3_BUMP_GENID;
1251					g_raid3_event_send(disk,
1252					    G_RAID3_DISK_STATE_DISCONNECTED,
1253					    G_RAID3_EVENT_DONTWAIT);
1254				}
1255				if (error == 0)
1256					error = cbp->bio_error;
1257				else if (pbp->bio_error == 0) {
1258					/*
1259					 * Next failed request, that's too many.
1260					 */
1261					pbp->bio_error = error;
1262				}
1263			}
1264			g_raid3_destroy_bio(sc, cbp);
1265		}
1266		if (pbp->bio_error == 0)
1267			G_RAID3_LOGREQ(3, pbp, "Request finished.");
1268		else
1269			G_RAID3_LOGREQ(0, pbp, "Request failed.");
1270		pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_DEGRADED;
1271		pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_NOPARITY;
1272		g_io_deliver(pbp, pbp->bio_error);
1273		break;
1274	    }
1275	}
1276}
1277
1278static void
1279g_raid3_sync_done(struct bio *bp)
1280{
1281	struct g_raid3_softc *sc;
1282
1283	G_RAID3_LOGREQ(3, bp, "Synchronization request delivered.");
1284	sc = bp->bio_from->geom->softc;
1285	bp->bio_cflags |= G_RAID3_BIO_CFLAG_SYNC;
1286	mtx_lock(&sc->sc_queue_mtx);
1287	bioq_insert_head(&sc->sc_queue, bp);
1288	wakeup(sc);
1289	wakeup(&sc->sc_queue);
1290	mtx_unlock(&sc->sc_queue_mtx);
1291}
1292
1293static void
1294g_raid3_start(struct bio *bp)
1295{
1296	struct g_raid3_softc *sc;
1297
1298	sc = bp->bio_to->geom->softc;
1299	/*
1300	 * If sc == NULL or there are no valid disks, provider's error
1301	 * should be set and g_raid3_start() should not be called at all.
1302	 */
1303	KASSERT(sc != NULL && (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
1304	    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE),
1305	    ("Provider's error should be set (error=%d)(device=%s).",
1306	    bp->bio_to->error, bp->bio_to->name));
1307	G_RAID3_LOGREQ(3, bp, "Request received.");
1308
1309	switch (bp->bio_cmd) {
1310	case BIO_READ:
1311	case BIO_WRITE:
1312	case BIO_DELETE:
1313		break;
1314	case BIO_GETATTR:
1315	default:
1316		g_io_deliver(bp, EOPNOTSUPP);
1317		return;
1318	}
1319	mtx_lock(&sc->sc_queue_mtx);
1320	bioq_insert_tail(&sc->sc_queue, bp);
1321	G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc);
1322	wakeup(sc);
1323	mtx_unlock(&sc->sc_queue_mtx);
1324}
1325
1326/*
1327 * Send one synchronization request.
1328 */
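/*
 * The request is a BIO_READ of up to MAXPHYS bytes addressed to the RAID3
 * provider itself (through the synchronization consumer), so the data is
 * reconstructed from the healthy components.  g_raid3_sync_done() hands
 * the completed bio back to the worker, which turns it into a write of the
 * rebuilt column to the synchronizing component.
 */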
1329static void
1330g_raid3_sync_one(struct g_raid3_softc *sc)
1331{
1332	struct g_raid3_disk *disk;
1333	struct bio *bp;
1334
1335	KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED,
1336	    ("Wrong device state (%s, %s).", sc->sc_name,
1337	    g_raid3_device_state2str(sc->sc_state)));
1338	disk = sc->sc_syncdisk;
1339	KASSERT(disk != NULL, ("No sync disk (%s).", sc->sc_name));
1340	KASSERT(disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
1341	    ("Disk %s is not marked for synchronization.",
1342	    g_raid3_get_diskname(disk)));
1343
1344	bp = g_new_bio();
1345	if (bp == NULL)
1346		return;
1347	bp->bio_parent = NULL;
1348	bp->bio_cmd = BIO_READ;
1349	bp->bio_offset = disk->d_sync.ds_offset * (sc->sc_ndisks - 1);
1350	bp->bio_length = MIN(MAXPHYS, sc->sc_mediasize - bp->bio_offset);
1351	bp->bio_cflags = 0;
1352	bp->bio_done = g_raid3_sync_done;
1353	bp->bio_data = disk->d_sync.ds_data;
1354	if (bp->bio_data == NULL) {
1355		g_destroy_bio(bp);
1356		return;
1357	}
1358	bp->bio_cflags = G_RAID3_BIO_CFLAG_REGSYNC;
1359	disk->d_sync.ds_offset += bp->bio_length / (sc->sc_ndisks - 1);
1360	bp->bio_to = sc->sc_provider;
1361	G_RAID3_LOGREQ(3, bp, "Sending synchronization request.");
1362	disk->d_sync.ds_consumer->index++;
1363	g_io_request(bp, disk->d_sync.ds_consumer);
1364}
1365
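/*
 * Handle a completed synchronization bio.  A finished BIO_READ carries
 * full-width data read from the RAID3 provider: for the parity component
 * its column is recomputed by XOR-ing all data columns, for a data
 * component the column is simply copied out, and the bio is then re-issued
 * as a BIO_WRITE to the component being synchronized.  A finished
 * BIO_WRITE advances ds_offset_done and either activates the disk once the
 * whole component is rebuilt or periodically flushes the metadata.
 */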
1366static void
1367g_raid3_sync_request(struct bio *bp)
1368{
1369	struct g_raid3_softc *sc;
1370	struct g_raid3_disk *disk;
1371
1372	bp->bio_from->index--;
1373	sc = bp->bio_from->geom->softc;
1374	disk = bp->bio_from->private;
1375	if (disk == NULL) {
1376		g_topology_lock();
1377		g_raid3_kill_consumer(sc, bp->bio_from);
1378		g_topology_unlock();
1379		g_destroy_bio(bp);
1380		return;
1381	}
1382
1383	/*
1384	 * Synchronization request.
1385	 */
1386	switch (bp->bio_cmd) {
1387	case BIO_READ:
1388	    {
1389		struct g_consumer *cp;
1390		u_char *dst, *src;
1391		off_t left;
1392		u_int atom;
1393
1394		if (bp->bio_error != 0) {
1395			G_RAID3_LOGREQ(0, bp,
1396			    "Synchronization request failed (error=%d).",
1397			    bp->bio_error);
1398			g_destroy_bio(bp);
1399			return;
1400		}
1401		G_RAID3_LOGREQ(3, bp, "Synchronization request finished.");
1402		atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
1403		dst = src = bp->bio_data;
1404		if (disk->d_no == sc->sc_ndisks - 1) {
1405			u_int n;
1406
1407			/* Parity component. */
1408			for (left = bp->bio_length; left > 0;
1409			    left -= sc->sc_sectorsize) {
1410				bcopy(src, dst, atom);
1411				src += atom;
1412				for (n = 1; n < sc->sc_ndisks - 1; n++) {
1413					g_raid3_xor(src, dst, dst, atom);
1414					src += atom;
1415				}
1416				dst += atom;
1417			}
1418		} else {
1419			/* Regular component. */
1420			src += atom * disk->d_no;
1421			for (left = bp->bio_length; left > 0;
1422			    left -= sc->sc_sectorsize) {
1423				bcopy(src, dst, atom);
1424				src += sc->sc_sectorsize;
1425				dst += atom;
1426			}
1427		}
1428		bp->bio_offset /= sc->sc_ndisks - 1;
1429		bp->bio_length /= sc->sc_ndisks - 1;
1430		bp->bio_cmd = BIO_WRITE;
1431		bp->bio_cflags = 0;
1432		bp->bio_children = bp->bio_inbed = 0;
1433		cp = disk->d_consumer;
1434		KASSERT(cp->acr == 1 && cp->acw == 1 && cp->ace == 1,
1435		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
1436		    cp->acr, cp->acw, cp->ace));
1437		cp->index++;
1438		g_io_request(bp, cp);
1439		return;
1440	    }
1441	case BIO_WRITE:
1442	    {
1443		struct g_raid3_disk_sync *sync;
1444
1445		if (bp->bio_error != 0) {
1446			G_RAID3_LOGREQ(0, bp,
1447			    "Synchronization request failed (error=%d).",
1448			    bp->bio_error);
1449			g_destroy_bio(bp);
1450			sc->sc_bump_id |= G_RAID3_BUMP_GENID;
1451			g_raid3_event_send(disk,
1452			    G_RAID3_DISK_STATE_DISCONNECTED,
1453			    G_RAID3_EVENT_DONTWAIT);
1454			return;
1455		}
1456		G_RAID3_LOGREQ(3, bp, "Synchronization request finished.");
1457		sync = &disk->d_sync;
1458		sync->ds_offset_done = bp->bio_offset + bp->bio_length;
1459		g_destroy_bio(bp);
1460		if (sync->ds_resync != -1)
1461			return;
1462		if (sync->ds_offset_done ==
1463		    sc->sc_mediasize / (sc->sc_ndisks - 1)) {
1464			/*
1465			 * Disk up-to-date, activate it.
1466			 */
1467			g_raid3_event_send(disk, G_RAID3_DISK_STATE_ACTIVE,
1468			    G_RAID3_EVENT_DONTWAIT);
1469			return;
1470		} else if (sync->ds_offset_done % (MAXPHYS * 100) == 0) {
1471			/*
1472			 * Update offset_done every 100 MAXPHYS-sized chunks.
1473			 * XXX: This should be configurable.
1474			 */
1475			g_topology_lock();
1476			g_raid3_update_metadata(disk);
1477			g_topology_unlock();
1478		}
1479		return;
1480	    }
1481	default:
1482		KASSERT(1 == 0, ("Invalid command here: %u (device=%s)",
1483		    bp->bio_cmd, sc->sc_name));
1484		break;
1485	}
1486}
1487
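/*
 * Split a regular request from our provider into per-component child bios.
 * Component offsets and lengths are the parent's divided by
 * (sc_ndisks - 1).  For reads the parity component is normally skipped,
 * but it stands in for an inactive component (degraded mode), takes part
 * in round-robin load balancing, or is read as well in VERIFY mode; writes
 * always go to every component.  Returns ENOMEM when cloning fails, so the
 * worker can requeue the request and retry.
 */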
1488static int
1489g_raid3_register_request(struct bio *pbp)
1490{
1491	struct g_raid3_softc *sc;
1492	struct g_raid3_disk *disk;
1493	struct g_consumer *cp;
1494	struct bio *cbp;
1495	off_t offset, length;
1496	u_int n, ndisks;
1497	int round_robin, verify;
1498
1499	ndisks = 0;
1500	sc = pbp->bio_to->geom->softc;
1501	if ((pbp->bio_cflags & G_RAID3_BIO_CFLAG_REGSYNC) != 0 &&
1502	    sc->sc_syncdisk == NULL) {
1503		g_io_deliver(pbp, EIO);
1504		return (0);
1505	}
1506	g_raid3_init_bio(pbp);
1507	length = pbp->bio_length / (sc->sc_ndisks - 1);
1508	offset = pbp->bio_offset / (sc->sc_ndisks - 1);
1509	round_robin = verify = 0;
1510	switch (pbp->bio_cmd) {
1511	case BIO_READ:
1512		if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_VERIFY) != 0 &&
1513		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
1514			pbp->bio_pflags |= G_RAID3_BIO_PFLAG_VERIFY;
1515			verify = 1;
1516			ndisks = sc->sc_ndisks;
1517		} else {
1518			verify = 0;
1519			ndisks = sc->sc_ndisks - 1;
1520		}
1521		if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0 &&
1522		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
1523			round_robin = 1;
1524		} else {
1525			round_robin = 0;
1526		}
1527		KASSERT(!round_robin || !verify,
1528		    ("ROUND-ROBIN and VERIFY are mutually exclusive."));
1529		pbp->bio_driver2 = &sc->sc_disks[sc->sc_ndisks - 1];
1530		break;
1531	case BIO_WRITE:
1532	case BIO_DELETE:
1533	    {
1534		struct g_raid3_disk_sync *sync;
1535
1536		if (sc->sc_idle)
1537			g_raid3_unidle(sc);
1538
1539		ndisks = sc->sc_ndisks;
1540
1541		if (sc->sc_syncdisk == NULL)
1542			break;
1543		sync = &sc->sc_syncdisk->d_sync;
1544		if (offset >= sync->ds_offset)
1545			break;
1546		if (offset + length <= sync->ds_offset_done)
1547			break;
1548		if (offset >= sync->ds_resync && sync->ds_resync != -1)
1549			break;
1550		sync->ds_resync = offset - (offset % MAXPHYS);
1551		break;
1552	    }
1553	}
1554	for (n = 0; n < ndisks; n++) {
1555		disk = &sc->sc_disks[n];
1556		cbp = g_raid3_clone_bio(sc, pbp);
1557		if (cbp == NULL) {
1558			while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL)
1559				g_raid3_destroy_bio(sc, cbp);
1560			return (ENOMEM);
1561		}
1562		cbp->bio_offset = offset;
1563		cbp->bio_length = length;
1564		cbp->bio_done = g_raid3_done;
1565		switch (pbp->bio_cmd) {
1566		case BIO_READ:
1567			if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) {
1568				/*
1569				 * Replace invalid component with the parity
1570				 * component.
1571				 */
1572				disk = &sc->sc_disks[sc->sc_ndisks - 1];
1573				cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
1574				pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED;
1575			} else if (round_robin &&
1576			    disk->d_no == sc->sc_round_robin) {
1577				/*
1578				 * In round-robin mode skip one data component
1579				 * and use parity component when reading.
1580				 */
1581				pbp->bio_driver2 = disk;
1582				disk = &sc->sc_disks[sc->sc_ndisks - 1];
1583				cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
1584				sc->sc_round_robin++;
1585				round_robin = 0;
1586			} else if (verify && disk->d_no == sc->sc_ndisks - 1) {
1587				cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
1588			}
1589			break;
1590		case BIO_WRITE:
1591		case BIO_DELETE:
1592			if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
1593			    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
1594				if (n == ndisks - 1) {
1595					/*
1596					 * Active parity component, mark it as such.
1597					 */
1598					cbp->bio_cflags |=
1599					    G_RAID3_BIO_CFLAG_PARITY;
1600				}
1601			} else {
1602				pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED;
1603				if (n == ndisks - 1) {
1604					/*
1605					 * Parity component is not connected,
1606					 * so destroy its request.
1607					 */
1608					pbp->bio_pflags |=
1609					    G_RAID3_BIO_PFLAG_NOPARITY;
1610					g_raid3_destroy_bio(sc, cbp);
1611					cbp = NULL;
1612				} else {
1613					cbp->bio_cflags |=
1614					    G_RAID3_BIO_CFLAG_NODISK;
1615					disk = NULL;
1616				}
1617			}
1618			break;
1619		}
1620		if (cbp != NULL)
1621			cbp->bio_caller2 = disk;
1622	}
1623	switch (pbp->bio_cmd) {
1624	case BIO_READ:
1625		if (round_robin) {
1626			/*
1627			 * If we are in round-robin mode and 'round_robin' is
1628			 * still 1, it means that we skipped the parity component
1629			 * for this read and must reset the sc_round_robin field.
1630			 */
1631			sc->sc_round_robin = 0;
1632		}
1633		G_RAID3_FOREACH_BIO(pbp, cbp) {
1634			disk = cbp->bio_caller2;
1635			cp = disk->d_consumer;
1636			cbp->bio_to = cp->provider;
1637			G_RAID3_LOGREQ(3, cbp, "Sending request.");
1638			KASSERT(cp->acr == 1 && cp->acw == 1 && cp->ace == 1,
1639			    ("Consumer %s not opened (r%dw%de%d).",
1640			    cp->provider->name, cp->acr, cp->acw, cp->ace));
1641			cp->index++;
1642			g_io_request(cbp, cp);
1643		}
1644		break;
1645	case BIO_WRITE:
1646	case BIO_DELETE:
1647		/*
1648		 * Bump syncid on first write.
1649		 */
1650		if ((sc->sc_bump_id & G_RAID3_BUMP_SYNCID) != 0) {
1651			sc->sc_bump_id &= ~G_RAID3_BUMP_SYNCID;
1652			g_topology_lock();
1653			g_raid3_bump_syncid(sc);
1654			g_topology_unlock();
1655		}
1656		g_raid3_scatter(pbp);
1657		break;
1658	}
1659	return (0);
1660}
1661
1662static int
1663g_raid3_can_destroy(struct g_raid3_softc *sc)
1664{
1665	struct g_geom *gp;
1666	struct g_consumer *cp;
1667
1668	g_topology_assert();
1669	gp = sc->sc_geom;
1670	LIST_FOREACH(cp, &gp->consumer, consumer) {
1671		if (g_raid3_is_busy(sc, cp))
1672			return (0);
1673	}
1674	gp = sc->sc_sync.ds_geom;
1675	LIST_FOREACH(cp, &gp->consumer, consumer) {
1676		if (g_raid3_is_busy(sc, cp))
1677			return (0);
1678	}
1679	G_RAID3_DEBUG(2, "No I/O requests for %s, it can be destroyed.",
1680	    sc->sc_name);
1681	return (1);
1682}
1683
1684static int
1685g_raid3_try_destroy(struct g_raid3_softc *sc)
1686{
1687
1688	g_topology_lock();
1689	if (!g_raid3_can_destroy(sc)) {
1690		g_topology_unlock();
1691		return (0);
1692	}
1693	if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_WAIT) != 0) {
1694		g_topology_unlock();
1695		G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__,
1696		    &sc->sc_worker);
1697		wakeup(&sc->sc_worker);
1698		sc->sc_worker = NULL;
1699	} else {
1700		g_raid3_destroy_device(sc);
1701		g_topology_unlock();
1702		free(sc->sc_disks, M_RAID3);
1703		free(sc, M_RAID3);
1704	}
1705	return (1);
1706}
1707
1708/*
1709 * Worker thread.
1710 */
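/*
 * All events and all I/O requests for a device are handled here, by a
 * single thread.  Events are processed first, then queued bios; after
 * roughly every g_raid3_reqs_per_sync regular requests one synchronization
 * request is issued, and synchronization is further throttled to
 * g_raid3_syncs_per_sec requests per second.  When the queue stays empty
 * for g_raid3_idletime seconds, the active components are marked clean.
 */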
1711static void
1712g_raid3_worker(void *arg)
1713{
1714	struct g_raid3_softc *sc;
1715	struct g_raid3_disk *disk;
1716	struct g_raid3_disk_sync *sync;
1717	struct g_raid3_event *ep;
1718	struct bio *bp;
1719	u_int nreqs;
1720
1721	sc = arg;
1722	mtx_lock_spin(&sched_lock);
1723	sched_prio(curthread, PRIBIO);
1724	mtx_unlock_spin(&sched_lock);
1725
1726	nreqs = 0;
1727	for (;;) {
1728		G_RAID3_DEBUG(5, "%s: Let's see...", __func__);
1729		/*
1730		 * First take a look at events.
1731		 * It is important to handle events before any I/O requests.
1732		 */
1733		ep = g_raid3_event_get(sc);
1734		if (ep != NULL && g_topology_try_lock()) {
1735			g_raid3_event_remove(sc, ep);
1736			if ((ep->e_flags & G_RAID3_EVENT_DEVICE) != 0) {
1737				/* Update only device status. */
1738				G_RAID3_DEBUG(3,
1739				    "Running event for device %s.",
1740				    sc->sc_name);
1741				ep->e_error = 0;
1742				g_raid3_update_device(sc, 1);
1743			} else {
1744				/* Update disk status. */
1745				G_RAID3_DEBUG(3, "Running event for disk %s.",
1746				     g_raid3_get_diskname(ep->e_disk));
1747				ep->e_error = g_raid3_update_disk(ep->e_disk,
1748				    ep->e_state);
1749				if (ep->e_error == 0)
1750					g_raid3_update_device(sc, 0);
1751			}
1752			g_topology_unlock();
1753			if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0) {
1754				KASSERT(ep->e_error == 0,
1755				    ("Error cannot be handled."));
1756				g_raid3_event_free(ep);
1757			} else {
1758				ep->e_flags |= G_RAID3_EVENT_DONE;
1759				G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__,
1760				    ep);
1761				mtx_lock(&sc->sc_events_mtx);
1762				wakeup(ep);
1763				mtx_unlock(&sc->sc_events_mtx);
1764			}
1765			if ((sc->sc_flags &
1766			    G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
1767				if (g_raid3_try_destroy(sc))
1768					kthread_exit(0);
1769			}
1770			G_RAID3_DEBUG(5, "%s: I'm here 1.", __func__);
1771			continue;
1772		}
1773		/*
1774		 * Now I/O requests.
1775		 */
1776		/* Get first request from the queue. */
1777		mtx_lock(&sc->sc_queue_mtx);
1778		bp = bioq_first(&sc->sc_queue);
1779		if (bp == NULL) {
1780			if (ep != NULL) {
1781				/*
1782				 * No I/O requests, and the topology lock needed
1783				 * for the pending event is held by someone else?
1784				 * Try again.
1784				 */
1785				mtx_unlock(&sc->sc_queue_mtx);
1786				tsleep(ep, PRIBIO, "r3:top1", hz / 5);
1787				continue;
1788			}
1789			if ((sc->sc_flags &
1790			    G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
1791				mtx_unlock(&sc->sc_queue_mtx);
1792				if (g_raid3_try_destroy(sc))
1793					kthread_exit(0);
1794				mtx_lock(&sc->sc_queue_mtx);
1795			}
1796		}
1797		if (sc->sc_syncdisk != NULL &&
1798		    (bp == NULL || nreqs > g_raid3_reqs_per_sync)) {
1799			mtx_unlock(&sc->sc_queue_mtx);
1800			/*
1801			 * It is time for synchronization...
1802			 */
1803			nreqs = 0;
1804			disk = sc->sc_syncdisk;
1805			sync = &disk->d_sync;
1806			if (sync->ds_offset <
1807			    sc->sc_mediasize / (sc->sc_ndisks - 1) &&
1808			    sync->ds_offset == sync->ds_offset_done) {
1809				if (sync->ds_resync != -1) {
1810					sync->ds_offset = sync->ds_resync;
1811					sync->ds_offset_done = sync->ds_resync;
1812					sync->ds_resync = -1;
1813				}
1814				g_raid3_sync_one(sc);
1815			}
1816			G_RAID3_DEBUG(5, "%s: I'm here 2.", __func__);
1817			goto sleep;
1818		}
1819		if (bp == NULL) {
1820			if (g_raid3_check_idle(sc)) {
1821				u_int idletime;
1822
1823				idletime = g_raid3_idletime;
1824				if (idletime == 0)
1825					idletime = 1;
1826				idletime *= hz;
1827				if (msleep(sc, &sc->sc_queue_mtx, PRIBIO | PDROP,
1828				    "r3:w1", idletime) == EWOULDBLOCK) {
1829					G_RAID3_DEBUG(5, "%s: I'm here 3.",
1830					    __func__);
1831					/*
1832					 * No I/O requests in 'idletime'
1833					 * seconds, so mark components as clean.
1834					 */
1835					g_raid3_idle(sc);
1836				}
1837				G_RAID3_DEBUG(5, "%s: I'm here 4.", __func__);
1838			} else {
1839				MSLEEP(sc, &sc->sc_queue_mtx, PRIBIO | PDROP,
1840				    "r3:w2", 0);
1841				G_RAID3_DEBUG(5, "%s: I'm here 5.", __func__);
1842			}
1843			continue;
1844		}
1845		nreqs++;
1846		bioq_remove(&sc->sc_queue, bp);
1847		mtx_unlock(&sc->sc_queue_mtx);
1848
1849		if ((bp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR) != 0) {
1850			g_raid3_regular_request(bp);
1851		} else if ((bp->bio_cflags & G_RAID3_BIO_CFLAG_SYNC) != 0) {
1852			u_int timeout, sps;
1853
1854			g_raid3_sync_request(bp);
1855sleep:
1856			sps = atomic_load_acq_int(&g_raid3_syncs_per_sec);
1857			if (sps == 0) {
1858				G_RAID3_DEBUG(5, "%s: I'm here 6.", __func__);
1859				continue;
1860			}
1861			if (ep != NULL) {
1862				/*
1863				 * We have some pending events, don't sleep now.
1864				 */
1865				G_RAID3_DEBUG(5, "%s: I'm here 7.", __func__);
1866				tsleep(ep, PRIBIO, "r3:top2", hz / 5);
1867				continue;
1868			}
1869			mtx_lock(&sc->sc_queue_mtx);
1870			if (bioq_first(&sc->sc_queue) != NULL) {
1871				mtx_unlock(&sc->sc_queue_mtx);
1872				G_RAID3_DEBUG(5, "%s: I'm here 8.", __func__);
1873				continue;
1874			}
1875			timeout = hz / sps;
1876			if (timeout == 0)
1877				timeout = 1;
1878			MSLEEP(sc, &sc->sc_queue_mtx, PRIBIO | PDROP, "r3:w2",
1879			    timeout);
1880		} else {
1881			if (g_raid3_register_request(bp) != 0) {
1882				mtx_lock(&sc->sc_queue_mtx);
1883				bioq_insert_tail(&sc->sc_queue, bp);
1884				MSLEEP(&sc->sc_queue, &sc->sc_queue_mtx,
1885				    PRIBIO | PDROP, "r3:lowmem", hz / 10);
1886			}
1887		}
1888		G_RAID3_DEBUG(5, "%s: I'm here 9.", __func__);
1889	}
1890}
1891
1892/*
1893 * Update the disk's DIRTY flag to match the provider's write access count.
1894 */
1895static void
1896g_raid3_update_access(struct g_raid3_disk *disk)
1897{
1898	struct g_provider *pp;
1899
1900	g_topology_assert();
1901
1902	pp = disk->d_softc->sc_provider;
1903	if (pp == NULL)
1904		return;
1905	if (pp->acw > 0) {
1906		if ((disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) == 0) {
1907			G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.",
1908			    g_raid3_get_diskname(disk), disk->d_softc->sc_name);
1909			disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
1910		}
1911	} else if (pp->acw == 0) {
1912		if ((disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) != 0) {
1913			G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.",
1914			    g_raid3_get_diskname(disk), disk->d_softc->sc_name);
1915			disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
1916		}
1917	}
1918}
1919
1920static void
1921g_raid3_sync_start(struct g_raid3_softc *sc)
1922{
1923	struct g_raid3_disk *disk;
1924	int error;
1925	u_int n;
1926
1927	g_topology_assert();
1928
1929	KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED,
1930	    ("Device not in DEGRADED state (%s, %u).", sc->sc_name,
1931	    sc->sc_state));
1932	KASSERT(sc->sc_syncdisk == NULL, ("Syncdisk is not NULL (%s, %u).",
1933	    sc->sc_name, sc->sc_state));
1934	disk = NULL;
1935	for (n = 0; n < sc->sc_ndisks; n++) {
1936		if (sc->sc_disks[n].d_state != G_RAID3_DISK_STATE_SYNCHRONIZING)
1937			continue;
1938		disk = &sc->sc_disks[n];
1939		break;
1940	}
1941	if (disk == NULL)
1942		return;
1943
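	/*
	 * The synchronization consumer is attached to the device's own
	 * provider and opened read-only; a MAXPHYS-sized buffer is allocated
	 * for the rebuild data.
	 */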
1944	G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s.", sc->sc_name,
1945	    g_raid3_get_diskname(disk));
1946	disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
1947	KASSERT(disk->d_sync.ds_consumer == NULL,
1948	    ("Sync consumer already exists (device=%s, disk=%s).",
1949	    sc->sc_name, g_raid3_get_diskname(disk)));
1950	disk->d_sync.ds_consumer = g_new_consumer(sc->sc_sync.ds_geom);
1951	disk->d_sync.ds_consumer->private = disk;
1952	disk->d_sync.ds_consumer->index = 0;
1953	error = g_attach(disk->d_sync.ds_consumer, disk->d_softc->sc_provider);
1954	KASSERT(error == 0, ("Cannot attach to %s (error=%d).",
1955	    disk->d_softc->sc_name, error));
1956	error = g_access(disk->d_sync.ds_consumer, 1, 0, 0);
1957	KASSERT(error == 0, ("Cannot open %s (error=%d).",
1958	    disk->d_softc->sc_name, error));
1959	disk->d_sync.ds_data = malloc(MAXPHYS, M_RAID3, M_WAITOK);
1960	sc->sc_syncdisk = disk;
1961}
1962
1963/*
1964 * Stop synchronization process.
1965 * type: 0 - synchronization finished
1966 *       1 - synchronization stopped
1967 */
1968static void
1969g_raid3_sync_stop(struct g_raid3_softc *sc, int type)
1970{
1971	struct g_raid3_disk *disk;
1972
1973	g_topology_assert();
1974	KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED,
1975	    ("Device not in DEGRADED state (%s, %u).", sc->sc_name,
1976	    sc->sc_state));
1977	disk = sc->sc_syncdisk;
1978	sc->sc_syncdisk = NULL;
1979	KASSERT(disk != NULL, ("No disk was synchronized (%s).", sc->sc_name));
1980	KASSERT(disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
1981	    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
1982	    g_raid3_disk_state2str(disk->d_state)));
1983	if (disk->d_sync.ds_consumer == NULL)
1984		return;
1985
1986	if (type == 0) {
1987		G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s finished.",
1988		    disk->d_softc->sc_name, g_raid3_get_diskname(disk));
1989	} else /* if (type == 1) */ {
1990		G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s stopped.",
1991		    disk->d_softc->sc_name, g_raid3_get_diskname(disk));
1992	}
1993	g_raid3_kill_consumer(disk->d_softc, disk->d_sync.ds_consumer);
1994	free(disk->d_sync.ds_data, M_RAID3);
1995	disk->d_sync.ds_consumer = NULL;
1996	disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
1997}
1998
1999static void
2000g_raid3_launch_provider(struct g_raid3_softc *sc)
2001{
2002	struct g_provider *pp;
2003
2004	g_topology_assert();
2005
2006	pp = g_new_providerf(sc->sc_geom, "raid3/%s", sc->sc_name);
2007	pp->mediasize = sc->sc_mediasize;
2008	pp->sectorsize = sc->sc_sectorsize;
2009	sc->sc_provider = pp;
2010	g_error_provider(pp, 0);
2011	G_RAID3_DEBUG(0, "Device %s: provider %s launched.", sc->sc_name,
2012	    pp->name);
2013	if (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED)
2014		g_raid3_sync_start(sc);
2015}
2016
2017static void
2018g_raid3_destroy_provider(struct g_raid3_softc *sc)
2019{
2020	struct bio *bp;
2021
2022	g_topology_assert();
2023	KASSERT(sc->sc_provider != NULL, ("NULL provider (device=%s).",
2024	    sc->sc_name));
2025
2026	g_error_provider(sc->sc_provider, ENXIO);
2027	mtx_lock(&sc->sc_queue_mtx);
2028	while ((bp = bioq_first(&sc->sc_queue)) != NULL) {
2029		bioq_remove(&sc->sc_queue, bp);
2030		g_io_deliver(bp, ENXIO);
2031	}
2032	mtx_unlock(&sc->sc_queue_mtx);
2033	G_RAID3_DEBUG(0, "Device %s: provider %s destroyed.", sc->sc_name,
2034	    sc->sc_provider->name);
2035	sc->sc_provider->flags |= G_PF_WITHER;
2036	g_orphan_provider(sc->sc_provider, ENXIO);
2037	sc->sc_provider = NULL;
2038	if (sc->sc_syncdisk != NULL)
2039		g_raid3_sync_stop(sc, 1);
2040}
2041
2042static void
2043g_raid3_go(void *arg)
2044{
2045	struct g_raid3_softc *sc;
2046
2047	sc = arg;
2048	G_RAID3_DEBUG(0, "Force device %s start due to timeout.", sc->sc_name);
2049	g_raid3_event_send(sc, 0,
2050	    G_RAID3_EVENT_DONTWAIT | G_RAID3_EVENT_DEVICE);
2051}
2052
2053static u_int
2054g_raid3_determine_state(struct g_raid3_disk *disk)
2055{
2056	struct g_raid3_softc *sc;
2057	u_int state;
2058
2059	sc = disk->d_softc;
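	/*
	 * Compare the disk's syncid with the device's: an equal syncid means
	 * the disk is up to date (or needs synchronization if flagged so), a
	 * smaller one means the disk is stale, and a larger one means the
	 * running device itself was started on stale components.
	 */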
2060	if (sc->sc_syncid == disk->d_sync.ds_syncid) {
2061		if ((disk->d_flags &
2062		    G_RAID3_DISK_FLAG_SYNCHRONIZING) == 0) {
2063			/* Disk does not need synchronization. */
2064			state = G_RAID3_DISK_STATE_ACTIVE;
2065		} else {
2066			if ((sc->sc_flags &
2067			     G_RAID3_DEVICE_FLAG_NOAUTOSYNC) == 0  ||
2068			    (disk->d_flags &
2069			     G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) {
2070				/*
2071				 * We can start synchronization from
2072				 * the stored offset.
2073				 */
2074				state = G_RAID3_DISK_STATE_SYNCHRONIZING;
2075			} else {
2076				state = G_RAID3_DISK_STATE_STALE;
2077			}
2078		}
2079	} else if (disk->d_sync.ds_syncid < sc->sc_syncid) {
2080		/*
2081		 * Reset all synchronization data for this disk,
2082		 * because even if it was synchronized, it was
2083		 * synchronized against disks with a different syncid.
2084		 */
2085		disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING;
2086		disk->d_sync.ds_offset = 0;
2087		disk->d_sync.ds_offset_done = 0;
2088		disk->d_sync.ds_syncid = sc->sc_syncid;
2089		if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) == 0 ||
2090		    (disk->d_flags & G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) {
2091			state = G_RAID3_DISK_STATE_SYNCHRONIZING;
2092		} else {
2093			state = G_RAID3_DISK_STATE_STALE;
2094		}
2095	} else /* if (sc->sc_syncid < disk->d_sync.ds_syncid) */ {
2096		/*
2097		 * Not good, not good at all.  It means the device was
2098		 * started on stale disks and a fresher disk has just
2099		 * arrived.  If there were writes in the meantime, the
2100		 * array data may be inconsistent.  The best choice here
2101		 * is not to touch this disk and to inform the user
2102		 * loudly.
2103		 */
2104		G_RAID3_DEBUG(0, "Device %s was started before the freshest "
2105		    "disk (%s) arrived! It will not be connected to the "
2106		    "running device.", sc->sc_name,
2107		    g_raid3_get_diskname(disk));
2108		g_raid3_destroy_disk(disk);
2109		state = G_RAID3_DISK_STATE_NONE;
2110		/* Return immediately, because disk was destroyed. */
2111		return (state);
2112	}
2113	G_RAID3_DEBUG(3, "State for %s disk: %s.",
2114	    g_raid3_get_diskname(disk), g_raid3_disk_state2str(state));
2115	return (state);
2116}
2117
2118/*
2119 * Update device state.
2120 */
2121static void
2122g_raid3_update_device(struct g_raid3_softc *sc, boolean_t force)
2123{
2124	struct g_raid3_disk *disk;
2125	u_int state;
2126
2127	g_topology_assert();
2128
2129	switch (sc->sc_state) {
2130	case G_RAID3_DEVICE_STATE_STARTING:
2131	    {
2132		u_int n, ndirty, ndisks, genid, syncid;
2133
2134		KASSERT(sc->sc_provider == NULL,
2135		    ("Non-NULL provider in STARTING state (%s).", sc->sc_name));
2136		/*
2137		 * Are we ready? We are, if all disks are connected or
2138		 * one disk is missing and 'force' is true.
2139		 */
2140		if (g_raid3_ndisks(sc, -1) + force == sc->sc_ndisks) {
2141			if (!force)
2142				callout_drain(&sc->sc_callout);
2143		} else {
2144			if (force) {
2145				/*
2146				 * Timeout expired, so destroy device.
2147				 */
2148				sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
2149			}
2150			return;
2151		}
2152
2153		/*
2154		 * Find the biggest genid.
2155		 */
2156		genid = 0;
2157		for (n = 0; n < sc->sc_ndisks; n++) {
2158			disk = &sc->sc_disks[n];
2159			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
2160				continue;
2161			if (disk->d_genid > genid)
2162				genid = disk->d_genid;
2163		}
2164		sc->sc_genid = genid;
2165		/*
2166		 * Remove all disks without the biggest genid.
2167		 */
2168		for (n = 0; n < sc->sc_ndisks; n++) {
2169			disk = &sc->sc_disks[n];
2170			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
2171				continue;
2172			if (disk->d_genid < genid) {
2173				G_RAID3_DEBUG(0,
2174				    "Component %s (device %s) broken, skipping.",
2175				    g_raid3_get_diskname(disk), sc->sc_name);
2176				g_raid3_destroy_disk(disk);
2177			}
2178		}
2179
2180		/*
2181		 * There must be at least 'sc->sc_ndisks - 1' components
2182		 * with the same syncid and without SYNCHRONIZING flag.
2183		 */
2184
2185		/*
2186		 * Find the biggest syncid, number of valid components and
2187		 * number of dirty components.
2188		 */
2189		ndirty = ndisks = syncid = 0;
2190		for (n = 0; n < sc->sc_ndisks; n++) {
2191			disk = &sc->sc_disks[n];
2192			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
2193				continue;
2194			if ((disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) != 0)
2195				ndirty++;
2196			if (disk->d_sync.ds_syncid > syncid) {
2197				syncid = disk->d_sync.ds_syncid;
2198				ndisks = 0;
2199			} else if (disk->d_sync.ds_syncid < syncid) {
2200				continue;
2201			}
2202			if ((disk->d_flags &
2203			    G_RAID3_DISK_FLAG_SYNCHRONIZING) != 0) {
2204				continue;
2205			}
2206			ndisks++;
2207		}
2208		/*
2209		 * Do we have enough valid components?
2210		 */
2211		if (ndisks + 1 < sc->sc_ndisks) {
2212			G_RAID3_DEBUG(0,
2213			    "Device %s is broken, too few valid components.",
2214			    sc->sc_name);
2215			sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
2216			return;
2217		}
2218		/*
2219		 * If there is one DIRTY component and all disks are present,
2220		 * mark it for synchronization. If there is more than one DIRTY
2221		 * component, mark parity component for synchronization.
2222		 */
2223		if (ndisks == sc->sc_ndisks && ndirty == 1) {
2224			for (n = 0; n < sc->sc_ndisks; n++) {
2225				disk = &sc->sc_disks[n];
2226				if ((disk->d_flags &
2227				    G_RAID3_DISK_FLAG_DIRTY) == 0) {
2228					continue;
2229				}
2230				disk->d_flags |=
2231				    G_RAID3_DISK_FLAG_SYNCHRONIZING;
2232			}
2233		} else if (ndisks == sc->sc_ndisks && ndirty > 1) {
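			/* The parity component is always the last one. */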
2234			disk = &sc->sc_disks[sc->sc_ndisks - 1];
2235			disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING;
2236		}
2237
2238		sc->sc_syncid = syncid;
2239		if (force) {
2240			/* Remember to bump syncid on first write. */
2241			sc->sc_bump_id |= G_RAID3_BUMP_SYNCID;
2242		}
2243		if (ndisks == sc->sc_ndisks)
2244			state = G_RAID3_DEVICE_STATE_COMPLETE;
2245		else /* if (ndisks == sc->sc_ndisks - 1) */
2246			state = G_RAID3_DEVICE_STATE_DEGRADED;
2247		G_RAID3_DEBUG(1, "Device %s state changed from %s to %s.",
2248		    sc->sc_name, g_raid3_device_state2str(sc->sc_state),
2249		    g_raid3_device_state2str(state));
2250		sc->sc_state = state;
2251		for (n = 0; n < sc->sc_ndisks; n++) {
2252			disk = &sc->sc_disks[n];
2253			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
2254				continue;
2255			state = g_raid3_determine_state(disk);
2256			g_raid3_event_send(disk, state, G_RAID3_EVENT_DONTWAIT);
2257			if (state == G_RAID3_DISK_STATE_STALE)
2258				sc->sc_bump_id |= G_RAID3_BUMP_SYNCID;
2259		}
2260		break;
2261	    }
2262	case G_RAID3_DEVICE_STATE_DEGRADED:
2263		/*
2264		 * Genid needs to be bumped immediately, so do it here.
2265		 */
2266		if ((sc->sc_bump_id & G_RAID3_BUMP_GENID) != 0) {
2267			sc->sc_bump_id &= ~G_RAID3_BUMP_GENID;
2268			g_raid3_bump_genid(sc);
2269		}
2270
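		/*
		 * Do not re-evaluate the device state while NEW components
		 * are still being processed.  With too few ACTIVE components
		 * the device is marked for destruction; with all components
		 * ACTIVE it is upgraded to COMPLETE.
		 */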
2271		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NEW) > 0)
2272			return;
2273		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) <
2274		    sc->sc_ndisks - 1) {
2275			if (sc->sc_provider != NULL)
2276				g_raid3_destroy_provider(sc);
2277			sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
2278			return;
2279		}
2280		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) ==
2281		    sc->sc_ndisks) {
2282			state = G_RAID3_DEVICE_STATE_COMPLETE;
2283			G_RAID3_DEBUG(1,
2284			    "Device %s state changed from %s to %s.",
2285			    sc->sc_name, g_raid3_device_state2str(sc->sc_state),
2286			    g_raid3_device_state2str(state));
2287			sc->sc_state = state;
2288		}
2289		if (sc->sc_provider == NULL)
2290			g_raid3_launch_provider(sc);
2291		break;
2292	case G_RAID3_DEVICE_STATE_COMPLETE:
2293		/*
2294		 * Genid needs to be bumped immediately, so do it here.
2295		 */
2296		if ((sc->sc_bump_id & G_RAID3_BUMP_GENID) != 0) {
2297			sc->sc_bump_id &= ~G_RAID3_BUMP_GENID;
2298			g_raid3_bump_genid(sc);
2299		}
2300
2301		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NEW) > 0)
2302			return;
2303		KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) >=
2304		    sc->sc_ndisks - 1,
2305		    ("Too few ACTIVE components in COMPLETE state (device %s).",
2306		    sc->sc_name));
2307		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) ==
2308		    sc->sc_ndisks - 1) {
2309			state = G_RAID3_DEVICE_STATE_DEGRADED;
2310			G_RAID3_DEBUG(1,
2311			    "Device %s state changed from %s to %s.",
2312			    sc->sc_name, g_raid3_device_state2str(sc->sc_state),
2313			    g_raid3_device_state2str(state));
2314			sc->sc_state = state;
2315		}
2316		if (sc->sc_provider == NULL)
2317			g_raid3_launch_provider(sc);
2318		break;
2319	default:
2320		KASSERT(1 == 0, ("Wrong device state (%s, %s).", sc->sc_name,
2321		    g_raid3_device_state2str(sc->sc_state)));
2322		break;
2323	}
2324}
2325
2326/*
2327 * Update disk state and device state if needed.
2328 */
2329#define	DISK_STATE_CHANGED()	G_RAID3_DEBUG(1,			\
2330	"Disk %s state changed from %s to %s (device %s).",		\
2331	g_raid3_get_diskname(disk),					\
2332	g_raid3_disk_state2str(disk->d_state),				\
2333	g_raid3_disk_state2str(state), sc->sc_name)
2334static int
2335g_raid3_update_disk(struct g_raid3_disk *disk, u_int state)
2336{
2337	struct g_raid3_softc *sc;
2338
2339	g_topology_assert();
2340
2341	sc = disk->d_softc;
2342again:
2343	G_RAID3_DEBUG(3, "Changing disk %s state from %s to %s.",
2344	    g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state),
2345	    g_raid3_disk_state2str(state));
2346	switch (state) {
2347	case G_RAID3_DISK_STATE_NEW:
2348		/*
2349		 * Possible scenarios:
2350		 * 1. New disk arrives.
2351		 */
2352		/* Previous state should be NONE. */
2353		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NONE,
2354		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
2355		    g_raid3_disk_state2str(disk->d_state)));
2356		DISK_STATE_CHANGED();
2357
2358		disk->d_state = state;
2359		G_RAID3_DEBUG(0, "Device %s: provider %s detected.",
2360		    sc->sc_name, g_raid3_get_diskname(disk));
2361		if (sc->sc_state == G_RAID3_DEVICE_STATE_STARTING)
2362			break;
2363		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
2364		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
2365		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
2366		    g_raid3_device_state2str(sc->sc_state),
2367		    g_raid3_get_diskname(disk),
2368		    g_raid3_disk_state2str(disk->d_state)));
2369		state = g_raid3_determine_state(disk);
2370		if (state != G_RAID3_DISK_STATE_NONE)
2371			goto again;
2372		break;
2373	case G_RAID3_DISK_STATE_ACTIVE:
2374		/*
2375		 * Possible scenarios:
2376		 * 1. New disk does not need synchronization.
2377		 * 2. Synchronization process finished successfully.
2378		 */
2379		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
2380		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
2381		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
2382		    g_raid3_device_state2str(sc->sc_state),
2383		    g_raid3_get_diskname(disk),
2384		    g_raid3_disk_state2str(disk->d_state)));
2385		/* Previous state should be NEW or SYNCHRONIZING. */
2386		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW ||
2387		    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
2388		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
2389		    g_raid3_disk_state2str(disk->d_state)));
2390		DISK_STATE_CHANGED();
2391
2392		if (disk->d_state == G_RAID3_DISK_STATE_NEW)
2393			disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
2394		else if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
2395			disk->d_flags &= ~G_RAID3_DISK_FLAG_SYNCHRONIZING;
2396			disk->d_flags &= ~G_RAID3_DISK_FLAG_FORCE_SYNC;
2397			g_raid3_sync_stop(sc, 0);
2398		}
2399		disk->d_state = state;
2400		disk->d_sync.ds_offset = 0;
2401		disk->d_sync.ds_offset_done = 0;
2402		g_raid3_update_access(disk);
2403		g_raid3_update_metadata(disk);
2404		G_RAID3_DEBUG(0, "Device %s: provider %s activated.",
2405		    sc->sc_name, g_raid3_get_diskname(disk));
2406		break;
2407	case G_RAID3_DISK_STATE_STALE:
2408		/*
2409		 * Possible scenarios:
2410		 * 1. Stale disk was connected.
2411		 */
2412		/* Previous state should be NEW. */
2413		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
2414		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
2415		    g_raid3_disk_state2str(disk->d_state)));
2416		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
2417		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
2418		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
2419		    g_raid3_device_state2str(sc->sc_state),
2420		    g_raid3_get_diskname(disk),
2421		    g_raid3_disk_state2str(disk->d_state)));
2422		/*
2423		 * STALE state is only possible if device is marked
2424		 * NOAUTOSYNC.
2425		 */
2426		KASSERT((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) != 0,
2427		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
2428		    g_raid3_device_state2str(sc->sc_state),
2429		    g_raid3_get_diskname(disk),
2430		    g_raid3_disk_state2str(disk->d_state)));
2431		DISK_STATE_CHANGED();
2432
2433		disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
2434		disk->d_state = state;
2435		g_raid3_update_metadata(disk);
2436		G_RAID3_DEBUG(0, "Device %s: provider %s is stale.",
2437		    sc->sc_name, g_raid3_get_diskname(disk));
2438		break;
2439	case G_RAID3_DISK_STATE_SYNCHRONIZING:
2440		/*
2441		 * Possible scenarios:
2442		 * 1. Disk which needs synchronization was connected.
2443		 */
2444		/* Previous state should be NEW. */
2445		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
2446		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
2447		    g_raid3_disk_state2str(disk->d_state)));
2448		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
2449		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
2450		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
2451		    g_raid3_device_state2str(sc->sc_state),
2452		    g_raid3_get_diskname(disk),
2453		    g_raid3_disk_state2str(disk->d_state)));
2454		DISK_STATE_CHANGED();
2455
2456		if (disk->d_state == G_RAID3_DISK_STATE_NEW)
2457			disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
2458		disk->d_state = state;
2459		if (sc->sc_provider != NULL) {
2460			g_raid3_sync_start(sc);
2461			g_raid3_update_metadata(disk);
2462		}
2463		break;
2464	case G_RAID3_DISK_STATE_DISCONNECTED:
2465		/*
2466		 * Possible scenarios:
2467		 * 1. Device wasn't running yet, but a disk disappeared.
2468		 * 2. Disk was active and disappeared.
2469		 * 3. Disk disappeared during the synchronization process.
2470		 */
2471		if (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
2472		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
2473			/*
2474			 * Previous state should be ACTIVE, STALE or
2475			 * SYNCHRONIZING.
2476			 */
2477			KASSERT(disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
2478			    disk->d_state == G_RAID3_DISK_STATE_STALE ||
2479			    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
2480			    ("Wrong disk state (%s, %s).",
2481			    g_raid3_get_diskname(disk),
2482			    g_raid3_disk_state2str(disk->d_state)));
2483		} else if (sc->sc_state == G_RAID3_DEVICE_STATE_STARTING) {
2484			/* Previous state should be NEW. */
2485			KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
2486			    ("Wrong disk state (%s, %s).",
2487			    g_raid3_get_diskname(disk),
2488			    g_raid3_disk_state2str(disk->d_state)));
2489			/*
2490			 * Reset bumping syncid if disk disappeared in STARTING
2491			 * state.
2492			 */
2493			if ((sc->sc_bump_id & G_RAID3_BUMP_SYNCID) != 0)
2494				sc->sc_bump_id &= ~G_RAID3_BUMP_SYNCID;
2495#ifdef	INVARIANTS
2496		} else {
2497			KASSERT(1 == 0, ("Wrong device state (%s, %s, %s, %s).",
2498			    sc->sc_name,
2499			    g_raid3_device_state2str(sc->sc_state),
2500			    g_raid3_get_diskname(disk),
2501			    g_raid3_disk_state2str(disk->d_state)));
2502#endif
2503		}
2504		DISK_STATE_CHANGED();
2505		G_RAID3_DEBUG(0, "Device %s: provider %s disconnected.",
2506		    sc->sc_name, g_raid3_get_diskname(disk));
2507
2508		g_raid3_destroy_disk(disk);
2509		break;
2510	default:
2511		KASSERT(1 == 0, ("Unknown state (%u).", state));
2512		break;
2513	}
2514	return (0);
2515}
2516#undef	DISK_STATE_CHANGED
2517
2518int
2519g_raid3_read_metadata(struct g_consumer *cp, struct g_raid3_metadata *md)
2520{
2521	struct g_provider *pp;
2522	u_char *buf;
2523	int error;
2524
2525	g_topology_assert();
2526
2527	error = g_access(cp, 1, 0, 0);
2528	if (error != 0)
2529		return (error);
2530	pp = cp->provider;
2531	g_topology_unlock();
2532	/* Metadata is stored in the last sector. */
2533	buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize,
2534	    &error);
2535	g_topology_lock();
2536	g_access(cp, -1, 0, 0);
2537	if (error != 0) {
2538		G_RAID3_DEBUG(1, "Cannot read metadata from %s (error=%d).",
2539		    cp->provider->name, error);
2540		if (buf != NULL)
2541			g_free(buf);
2542		return (error);
2543	}
2544
2545	/* Decode metadata. */
2546	error = raid3_metadata_decode(buf, md);
2547	g_free(buf);
2548	if (strcmp(md->md_magic, G_RAID3_MAGIC) != 0)
2549		return (EINVAL);
2550	if (md->md_version > G_RAID3_VERSION) {
2551		G_RAID3_DEBUG(0,
2552		    "Kernel module is too old to handle metadata from %s.",
2553		    cp->provider->name);
2554		return (EINVAL);
2555	}
2556	if (error != 0) {
2557		G_RAID3_DEBUG(1, "MD5 metadata hash mismatch for provider %s.",
2558		    cp->provider->name);
2559		return (error);
2560	}
2561
2562	return (0);
2563}
2564
2565static int
2566g_raid3_check_metadata(struct g_raid3_softc *sc, struct g_provider *pp,
2567    struct g_raid3_metadata *md)
2568{
2569
2570	if (md->md_no >= sc->sc_ndisks) {
2571		G_RAID3_DEBUG(1, "Invalid disk %s number (no=%u), skipping.",
2572		    pp->name, md->md_no);
2573		return (EINVAL);
2574	}
2575	if (sc->sc_disks[md->md_no].d_state != G_RAID3_DISK_STATE_NODISK) {
2576		G_RAID3_DEBUG(1, "Disk %s (no=%u) already exists, skipping.",
2577		    pp->name, md->md_no);
2578		return (EEXIST);
2579	}
2580	if (md->md_all != sc->sc_ndisks) {
2581		G_RAID3_DEBUG(1,
2582		    "Invalid '%s' field on disk %s (device %s), skipping.",
2583		    "md_all", pp->name, sc->sc_name);
2584		return (EINVAL);
2585	}
2586	if (md->md_mediasize != sc->sc_mediasize) {
2587		G_RAID3_DEBUG(1,
2588		    "Invalid '%s' field on disk %s (device %s), skipping.",
2589		    "md_mediasize", pp->name, sc->sc_name);
2590		return (EINVAL);
2591	}
2592	if ((md->md_mediasize % (sc->sc_ndisks - 1)) != 0) {
2593		G_RAID3_DEBUG(1,
2594		    "Invalid '%s' field on disk %s (device %s), skipping.",
2595		    "md_mediasize", pp->name, sc->sc_name);
2596		return (EINVAL);
2597	}
2598	if ((sc->sc_mediasize / (sc->sc_ndisks - 1)) > pp->mediasize) {
2599		G_RAID3_DEBUG(1,
2600		    "Invalid size of disk %s (device %s), skipping.", pp->name,
2601		    sc->sc_name);
2602		return (EINVAL);
2603	}
2604	if ((md->md_sectorsize / pp->sectorsize) < sc->sc_ndisks - 1) {
2605		G_RAID3_DEBUG(1,
2606		    "Invalid '%s' field on disk %s (device %s), skipping.",
2607		    "md_sectorsize", pp->name, sc->sc_name);
2608		return (EINVAL);
2609	}
2610	if (md->md_sectorsize != sc->sc_sectorsize) {
2611		G_RAID3_DEBUG(1,
2612		    "Invalid '%s' field on disk %s (device %s), skipping.",
2613		    "md_sectorsize", pp->name, sc->sc_name);
2614		return (EINVAL);
2615	}
2616	if ((sc->sc_sectorsize % pp->sectorsize) != 0) {
2617		G_RAID3_DEBUG(1,
2618		    "Invalid sector size of disk %s (device %s), skipping.",
2619		    pp->name, sc->sc_name);
2620		return (EINVAL);
2621	}
2622	if ((md->md_mflags & ~G_RAID3_DEVICE_FLAG_MASK) != 0) {
2623		G_RAID3_DEBUG(1,
2624		    "Invalid device flags on disk %s (device %s), skipping.",
2625		    pp->name, sc->sc_name);
2626		return (EINVAL);
2627	}
2628	if ((md->md_mflags & G_RAID3_DEVICE_FLAG_VERIFY) != 0 &&
2629	    (md->md_mflags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0) {
2630		/*
2631		 * VERIFY and ROUND-ROBIN options are mutually exclusive.
2632		 */
2633		G_RAID3_DEBUG(1, "Both VERIFY and ROUND-ROBIN flags exist on "
2634		    "disk %s (device %s), skipping.", pp->name, sc->sc_name);
2635		return (EINVAL);
2636	}
2637	if ((md->md_dflags & ~G_RAID3_DISK_FLAG_MASK) != 0) {
2638		G_RAID3_DEBUG(1,
2639		    "Invalid disk flags on disk %s (device %s), skipping.",
2640		    pp->name, sc->sc_name);
2641		return (EINVAL);
2642	}
2643	return (0);
2644}
2645
2646int
2647g_raid3_add_disk(struct g_raid3_softc *sc, struct g_provider *pp,
2648    struct g_raid3_metadata *md)
2649{
2650	struct g_raid3_disk *disk;
2651	int error;
2652
2653	g_topology_assert();
2654	G_RAID3_DEBUG(2, "Adding disk %s.", pp->name);
2655
2656	error = g_raid3_check_metadata(sc, pp, md);
2657	if (error != 0)
2658		return (error);
2659	if (sc->sc_state != G_RAID3_DEVICE_STATE_STARTING &&
2660	    md->md_genid < sc->sc_genid) {
2661		G_RAID3_DEBUG(0, "Component %s (device %s) broken, skipping.",
2662		    pp->name, sc->sc_name);
2663		return (EINVAL);
2664	}
2665	disk = g_raid3_init_disk(sc, pp, md, &error);
2666	if (disk == NULL)
2667		return (error);
2668	error = g_raid3_event_send(disk, G_RAID3_DISK_STATE_NEW,
2669	    G_RAID3_EVENT_WAIT);
2670	if (error != 0)
2671		return (error);
2672	if (md->md_version < G_RAID3_VERSION) {
2673		G_RAID3_DEBUG(0, "Upgrading metadata on %s (v%d->v%d).",
2674		    pp->name, md->md_version, G_RAID3_VERSION);
2675		g_raid3_update_metadata(disk);
2676	}
2677	return (0);
2678}
2679
2680static int
2681g_raid3_access(struct g_provider *pp, int acr, int acw, int ace)
2682{
2683	struct g_raid3_softc *sc;
2684	struct g_raid3_disk *disk;
2685	int dcr, dcw, dce;
2686	u_int n;
2687
2688	g_topology_assert();
2689	G_RAID3_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name, acr,
2690	    acw, ace);
2691
2692	dcr = pp->acr + acr;
2693	dcw = pp->acw + acw;
2694	dce = pp->ace + ace;
2695
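	/*
	 * Refuse to grant new access while the device is not set up, is
	 * missing more than one ACTIVE component, or is being destroyed;
	 * releasing access is always allowed.
	 */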
2696	sc = pp->geom->softc;
2697	if (sc == NULL ||
2698	    g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) < sc->sc_ndisks - 1 ||
2699	    (sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
2700		if (acr <= 0 && acw <= 0 && ace <= 0)
2701			return (0);
2702		else
2703			return (ENXIO);
2704	}
2705	for (n = 0; n < sc->sc_ndisks; n++) {
2706		disk = &sc->sc_disks[n];
2707		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
2708			continue;
2709		/*
2710		 * Mark disk as dirty on open and unmark on close.
2711		 */
2712		if (pp->acw == 0 && dcw > 0) {
2713			G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.",
2714			    g_raid3_get_diskname(disk), sc->sc_name);
2715			disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
2716			g_raid3_update_metadata(disk);
2717		} else if (pp->acw > 0 && dcw == 0) {
2718			G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.",
2719			    g_raid3_get_diskname(disk), sc->sc_name);
2720			disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
2721			g_raid3_update_metadata(disk);
2722		}
2723	}
2724	return (0);
2725}
2726
2727static struct g_geom *
2728g_raid3_create(struct g_class *mp, const struct g_raid3_metadata *md)
2729{
2730	struct g_raid3_softc *sc;
2731	struct g_geom *gp;
2732	int error, timeout;
2733	u_int n;
2734
2735	g_topology_assert();
2736	G_RAID3_DEBUG(1, "Creating device %s (id=%u).", md->md_name, md->md_id);
2737
2738	/* One disk is minimum. */
2739	if (md->md_all < 1)
2740		return (NULL);
2741	/*
2742	 * Action geom.
2743	 */
2744	gp = g_new_geomf(mp, "%s", md->md_name);
2745	sc = malloc(sizeof(*sc), M_RAID3, M_WAITOK | M_ZERO);
2746	sc->sc_disks = malloc(sizeof(struct g_raid3_disk) * md->md_all, M_RAID3,
2747	    M_WAITOK | M_ZERO);
2748	gp->start = g_raid3_start;
2749	gp->orphan = g_raid3_orphan;
2750	gp->access = g_raid3_access;
2751	gp->dumpconf = g_raid3_dumpconf;
2752
2753	sc->sc_id = md->md_id;
2754	sc->sc_mediasize = md->md_mediasize;
2755	sc->sc_sectorsize = md->md_sectorsize;
2756	sc->sc_ndisks = md->md_all;
2757	sc->sc_round_robin = 0;
2758	sc->sc_flags = md->md_mflags;
2759	sc->sc_bump_id = 0;
2760	sc->sc_idle = 0;
2761	for (n = 0; n < sc->sc_ndisks; n++) {
2762		sc->sc_disks[n].d_softc = sc;
2763		sc->sc_disks[n].d_no = n;
2764		sc->sc_disks[n].d_state = G_RAID3_DISK_STATE_NODISK;
2765	}
2766	bioq_init(&sc->sc_queue);
2767	mtx_init(&sc->sc_queue_mtx, "graid3:queue", NULL, MTX_DEF);
2768	TAILQ_INIT(&sc->sc_events);
2769	mtx_init(&sc->sc_events_mtx, "graid3:events", NULL, MTX_DEF);
2770	callout_init(&sc->sc_callout, CALLOUT_MPSAFE);
2771	sc->sc_state = G_RAID3_DEVICE_STATE_STARTING;
2772	gp->softc = sc;
2773	sc->sc_geom = gp;
2774	sc->sc_provider = NULL;
2775	/*
2776	 * Synchronization geom.
2777	 */
2778	gp = g_new_geomf(mp, "%s.sync", md->md_name);
2779	gp->softc = sc;
2780	gp->orphan = g_raid3_orphan;
2781	sc->sc_sync.ds_geom = gp;
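	/*
	 * Pre-create UMA zones for 64kB, 16kB and 4kB data buffers; their
	 * limits come from the g_raid3_n64k, g_raid3_n16k and g_raid3_n4k
	 * tunables.
	 */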
2782	sc->sc_zone_64k = uma_zcreate("gr3:64k", 65536, NULL, NULL, NULL, NULL,
2783	    UMA_ALIGN_PTR, 0);
2784	uma_zone_set_max(sc->sc_zone_64k, g_raid3_n64k);
2785	sc->sc_zone_16k = uma_zcreate("gr3:16k", 16384, NULL, NULL, NULL, NULL,
2786	    UMA_ALIGN_PTR, 0);
2787	uma_zone_set_max(sc->sc_zone_16k, g_raid3_n16k);
2788	sc->sc_zone_4k = uma_zcreate("gr3:4k", 4096, NULL, NULL, NULL, NULL,
2789	    UMA_ALIGN_PTR, 0);
2790	uma_zone_set_max(sc->sc_zone_4k, g_raid3_n4k);
2791	error = kthread_create(g_raid3_worker, sc, &sc->sc_worker, 0, 0,
2792	    "g_raid3 %s", md->md_name);
2793	if (error != 0) {
2794		G_RAID3_DEBUG(1, "Cannot create kernel thread for %s.",
2795		    sc->sc_name);
2796		uma_zdestroy(sc->sc_zone_64k);
2797		uma_zdestroy(sc->sc_zone_16k);
2798		uma_zdestroy(sc->sc_zone_4k);
2799		g_destroy_geom(sc->sc_sync.ds_geom);
2800		mtx_destroy(&sc->sc_events_mtx);
2801		mtx_destroy(&sc->sc_queue_mtx);
2802		g_destroy_geom(sc->sc_geom);
2803		free(sc->sc_disks, M_RAID3);
2804		free(sc, M_RAID3);
2805		return (NULL);
2806	}
2807
2808	G_RAID3_DEBUG(0, "Device %s created (id=%u).", sc->sc_name, sc->sc_id);
2809
2810	/*
2811	 * Run timeout.
2812	 */
2813	timeout = atomic_load_acq_int(&g_raid3_timeout);
2814	callout_reset(&sc->sc_callout, timeout * hz, g_raid3_go, sc);
2815	return (sc->sc_geom);
2816}
2817
2818int
2819g_raid3_destroy(struct g_raid3_softc *sc, boolean_t force)
2820{
2821	struct g_provider *pp;
2822
2823	g_topology_assert();
2824
2825	if (sc == NULL)
2826		return (ENXIO);
2827	pp = sc->sc_provider;
2828	if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
2829		if (force) {
2830			G_RAID3_DEBUG(1, "Device %s is still open, so it "
2831			    "can't be definitely removed.", pp->name);
2832		} else {
2833			G_RAID3_DEBUG(1,
2834			    "Device %s is still open (r%dw%de%d).", pp->name,
2835			    pp->acr, pp->acw, pp->ace);
2836			return (EBUSY);
2837		}
2838	}
2839
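	/*
	 * Mark the device for destruction and wake up the worker thread,
	 * then wait below until the worker has exited before tearing the
	 * geom down.
	 */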
2840	sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
2841	sc->sc_flags |= G_RAID3_DEVICE_FLAG_WAIT;
2842	g_topology_unlock();
2843	G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc);
2844	mtx_lock(&sc->sc_queue_mtx);
2845	wakeup(sc);
2846	wakeup(&sc->sc_queue);
2847	mtx_unlock(&sc->sc_queue_mtx);
2848	G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, &sc->sc_worker);
2849	while (sc->sc_worker != NULL)
2850		tsleep(&sc->sc_worker, PRIBIO, "r3:destroy", hz / 5);
2851	G_RAID3_DEBUG(4, "%s: Woken up %p.", __func__, &sc->sc_worker);
2852	g_topology_lock();
2853	g_raid3_destroy_device(sc);
2854	free(sc->sc_disks, M_RAID3);
2855	free(sc, M_RAID3);
2856	return (0);
2857}
2858
2859static void
2860g_raid3_taste_orphan(struct g_consumer *cp)
2861{
2862
2863	KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
2864	    cp->provider->name));
2865}
2866
2867static struct g_geom *
2868g_raid3_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
2869{
2870	struct g_raid3_metadata md;
2871	struct g_raid3_softc *sc;
2872	struct g_consumer *cp;
2873	struct g_geom *gp;
2874	int error;
2875
2876	g_topology_assert();
2877	g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
2878	G_RAID3_DEBUG(2, "Tasting %s.", pp->name);
2879
2880	gp = g_new_geomf(mp, "raid3:taste");
2881	/* This orphan function should never be called. */
2882	gp->orphan = g_raid3_taste_orphan;
2883	cp = g_new_consumer(gp);
2884	g_attach(cp, pp);
2885	error = g_raid3_read_metadata(cp, &md);
2886	g_detach(cp);
2887	g_destroy_consumer(cp);
2888	g_destroy_geom(gp);
2889	if (error != 0)
2890		return (NULL);
2891	gp = NULL;
2892
2893	if (md.md_provider[0] != '\0' && strcmp(md.md_provider, pp->name) != 0)
2894		return (NULL);
2895	if (md.md_provsize != 0 && md.md_provsize != pp->mediasize)
2896		return (NULL);
2897	if (g_raid3_debug >= 2)
2898		raid3_metadata_dump(&md);
2899
2900	/*
2901	 * Let's check if the device already exists.
2902	 */
2903	sc = NULL;
2904	LIST_FOREACH(gp, &mp->geom, geom) {
2905		sc = gp->softc;
2906		if (sc == NULL)
2907			continue;
2908		if (sc->sc_sync.ds_geom == gp)
2909			continue;
2910		if (strcmp(md.md_name, sc->sc_name) != 0)
2911			continue;
2912		if (md.md_id != sc->sc_id) {
2913			G_RAID3_DEBUG(0, "Device %s already configured.",
2914			    sc->sc_name);
2915			return (NULL);
2916		}
2917		break;
2918	}
2919	if (gp == NULL) {
2920		gp = g_raid3_create(mp, &md);
2921		if (gp == NULL) {
2922			G_RAID3_DEBUG(0, "Cannot create device %s.",
2923			    md.md_name);
2924			return (NULL);
2925		}
2926		sc = gp->softc;
2927	}
2928	G_RAID3_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name);
2929	error = g_raid3_add_disk(sc, pp, &md);
2930	if (error != 0) {
2931		G_RAID3_DEBUG(0, "Cannot add disk %s to %s (error=%d).",
2932		    pp->name, gp->name, error);
2933		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NODISK) ==
2934		    sc->sc_ndisks) {
2935			g_raid3_destroy(sc, 1);
2936		}
2937		return (NULL);
2938	}
2939	return (gp);
2940}
2941
2942static int
2943g_raid3_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused,
2944    struct g_geom *gp)
2945{
2946
2947	return (g_raid3_destroy(gp->softc, 0));
2948}
2949
2950static void
2951g_raid3_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
2952    struct g_consumer *cp, struct g_provider *pp)
2953{
2954	struct g_raid3_softc *sc;
2955
2956	g_topology_assert();
2957
2958	sc = gp->softc;
2959	if (sc == NULL)
2960		return;
2961	/* Skip synchronization geom. */
2962	if (gp == sc->sc_sync.ds_geom)
2963		return;
2964	if (pp != NULL) {
2965		/* Nothing here. */
2966	} else if (cp != NULL) {
2967		struct g_raid3_disk *disk;
2968
2969		disk = cp->private;
2970		if (disk == NULL)
2971			return;
2972		sbuf_printf(sb, "%s<Type>", indent);
2973		if (disk->d_no == sc->sc_ndisks - 1)
2974			sbuf_printf(sb, "PARITY");
2975		else
2976			sbuf_printf(sb, "DATA");
2977		sbuf_printf(sb, "</Type>\n");
2978		sbuf_printf(sb, "%s<Number>%u</Number>\n", indent,
2979		    (u_int)disk->d_no);
2980		if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
2981			sbuf_printf(sb, "%s<Synchronized>", indent);
2982			if (disk->d_sync.ds_offset_done == 0)
2983				sbuf_printf(sb, "0%%");
2984			else {
2985				sbuf_printf(sb, "%u%%",
2986				    (u_int)((disk->d_sync.ds_offset_done * 100) /
2987				    (sc->sc_mediasize / (sc->sc_ndisks - 1))));
2988			}
2989			sbuf_printf(sb, "</Synchronized>\n");
2990		}
2991		sbuf_printf(sb, "%s<SyncID>%u</SyncID>\n", indent,
2992		    disk->d_sync.ds_syncid);
2993		sbuf_printf(sb, "%s<GenID>%u</GenID>\n", indent, disk->d_genid);
2994		sbuf_printf(sb, "%s<Flags>", indent);
2995		if (disk->d_flags == 0)
2996			sbuf_printf(sb, "NONE");
2997		else {
2998			int first = 1;
2999
3000#define	ADD_FLAG(flag, name)	do {					\
3001	if ((disk->d_flags & (flag)) != 0) {				\
3002		if (!first)						\
3003			sbuf_printf(sb, ", ");				\
3004		else							\
3005			first = 0;					\
3006		sbuf_printf(sb, name);					\
3007	}								\
3008} while (0)
3009			ADD_FLAG(G_RAID3_DISK_FLAG_DIRTY, "DIRTY");
3010			ADD_FLAG(G_RAID3_DISK_FLAG_HARDCODED, "HARDCODED");
3011			ADD_FLAG(G_RAID3_DISK_FLAG_SYNCHRONIZING,
3012			    "SYNCHRONIZING");
3013			ADD_FLAG(G_RAID3_DISK_FLAG_FORCE_SYNC, "FORCE_SYNC");
3014#undef	ADD_FLAG
3015		}
3016		sbuf_printf(sb, "</Flags>\n");
3017		sbuf_printf(sb, "%s<State>%s</State>\n", indent,
3018		    g_raid3_disk_state2str(disk->d_state));
3019	} else {
3020		sbuf_printf(sb, "%s<ID>%u</ID>\n", indent, (u_int)sc->sc_id);
3021		sbuf_printf(sb, "%s<SyncID>%u</SyncID>\n", indent, sc->sc_syncid);
3022		sbuf_printf(sb, "%s<GenID>%u</GenID>\n", indent, sc->sc_genid);
3023		sbuf_printf(sb, "%s<Flags>", indent);
3024		if (sc->sc_flags == 0)
3025			sbuf_printf(sb, "NONE");
3026		else {
3027			int first = 1;
3028
3029#define	ADD_FLAG(flag, name)	do {					\
3030	if ((sc->sc_flags & (flag)) != 0) {				\
3031		if (!first)						\
3032			sbuf_printf(sb, ", ");				\
3033		else							\
3034			first = 0;					\
3035		sbuf_printf(sb, name);					\
3036	}								\
3037} while (0)
3038			ADD_FLAG(G_RAID3_DEVICE_FLAG_NOAUTOSYNC, "NOAUTOSYNC");
3039			ADD_FLAG(G_RAID3_DEVICE_FLAG_ROUND_ROBIN,
3040			    "ROUND-ROBIN");
3041			ADD_FLAG(G_RAID3_DEVICE_FLAG_VERIFY, "VERIFY");
3042#undef	ADD_FLAG
3043		}
3044		sbuf_printf(sb, "</Flags>\n");
3045		sbuf_printf(sb, "%s<Components>%u</Components>\n", indent,
3046		    sc->sc_ndisks);
3047		sbuf_printf(sb, "%s<State>%s</State>\n", indent,
3048		    g_raid3_device_state2str(sc->sc_state));
3049	}
3050}
3051
3052static void
3053g_raid3_shutdown(void *arg, int howto)
3054{
3055	struct g_class *mp;
3056	struct g_geom *gp, *gp2;
3057
3058	mp = arg;
3059	DROP_GIANT();
3060	g_topology_lock();
3061	LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) {
3062		if (gp->softc == NULL)
3063			continue;
3064		g_raid3_destroy(gp->softc, 1);
3065	}
3066	g_topology_unlock();
3067	PICKUP_GIANT();
3068#if 0
3069	tsleep(&gp, PRIBIO, "r3:shutdown", hz * 20);
3070#endif
3071}
3072
3073static void
3074g_raid3_init(struct g_class *mp)
3075{
3076
3077	g_raid3_ehtag = EVENTHANDLER_REGISTER(shutdown_post_sync,
3078	    g_raid3_shutdown, mp, SHUTDOWN_PRI_FIRST);
3079	if (g_raid3_ehtag == NULL)
3080		G_RAID3_DEBUG(0, "Warning! Cannot register shutdown event.");
3081}
3082
3083static void
3084g_raid3_fini(struct g_class *mp)
3085{
3086
3087	if (g_raid3_ehtag == NULL)
3088		return;
3089	EVENTHANDLER_DEREGISTER(shutdown_post_sync, g_raid3_ehtag);
3090}
3091
3092DECLARE_GEOM_CLASS(g_raid3_class, g_raid3);
3093