g_raid3.c revision 139671
183098Smp/*-
283098Smp * Copyright (c) 2004 Pawel Jakub Dawidek <pjd@FreeBSD.org>
3231990Smp * All rights reserved.
4231990Smp *
5231990Smp * Redistribution and use in source and binary forms, with or without
6231990Smp * modification, are permitted provided that the following conditions
7231990Smp * are met:
883098Smp * 1. Redistributions of source code must retain the above copyright
9231990Smp *    notice, this list of conditions and the following disclaimer.
1083098Smp * 2. Redistributions in binary form must reproduce the above copyright
1183098Smp *    notice, this list of conditions and the following disclaimer in the
12231990Smp *    documentation and/or other materials provided with the distribution.
1383098Smp *
14231990Smp * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
15231990Smp * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16231990Smp * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17231990Smp * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
18231990Smp * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19231990Smp * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
2083098Smp * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
2183098Smp * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22231990Smp * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
2383098Smp * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
2483098Smp * SUCH DAMAGE.
25231990Smp */
26231990Smp
27231990Smp#include <sys/cdefs.h>
2883098Smp__FBSDID("$FreeBSD: head/sys/geom/raid3/g_raid3.c 139671 2005-01-04 12:15:21Z pjd $");
29231990Smp
30231990Smp#include <sys/param.h>
31231990Smp#include <sys/systm.h>
3283098Smp#include <sys/kernel.h>
3383098Smp#include <sys/module.h>
3483098Smp#include <sys/limits.h>
35231990Smp#include <sys/lock.h>
3683098Smp#include <sys/mutex.h>
3783098Smp#include <sys/bio.h>
3883098Smp#include <sys/sysctl.h>
3983098Smp#include <sys/malloc.h>
4083098Smp#include <sys/eventhandler.h>
41231990Smp#include <vm/uma.h>
4283098Smp#include <geom/geom.h>
4383098Smp#include <sys/proc.h>
4483098Smp#include <sys/kthread.h>
4583098Smp#include <sys/sched.h>
46231990Smp#include <geom/raid3/g_raid3.h>
47231990Smp
4883098Smp
/* Malloc type for all GEOM_RAID3 allocations (events, metadata sectors). */
static MALLOC_DEFINE(M_RAID3, "raid3 data", "GEOM_RAID3 Data");

SYSCTL_DECL(_kern_geom);
SYSCTL_NODE(_kern_geom, OID_AUTO, raid3, CTLFLAG_RW, 0, "GEOM_RAID3 stuff");
/* Tunables (loader.conf) with matching read-write sysctls. */
u_int g_raid3_debug = 0;
TUNABLE_INT("kern.geom.raid3.debug", &g_raid3_debug);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, debug, CTLFLAG_RW, &g_raid3_debug, 0,
    "Debug level");
static u_int g_raid3_timeout = 4;
TUNABLE_INT("kern.geom.raid3.timeout", &g_raid3_timeout);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, timeout, CTLFLAG_RW, &g_raid3_timeout,
    0, "Time to wait on all raid3 components");
static u_int g_raid3_idletime = 5;
TUNABLE_INT("kern.geom.raid3.idletime", &g_raid3_idletime);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, idletime, CTLFLAG_RW,
    &g_raid3_idletime, 0, "Mark components as clean when idling");
static u_int g_raid3_reqs_per_sync = 5;
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, reqs_per_sync, CTLFLAG_RW,
    &g_raid3_reqs_per_sync, 0,
    "Number of regular I/O requests per synchronization request");
static u_int g_raid3_syncs_per_sec = 100;
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, syncs_per_sec, CTLFLAG_RW,
    &g_raid3_syncs_per_sec, 0,
    "Number of synchronizations requests per second");

/*
 * Sizing of the per-device UMA zones used for component bio data buffers
 * (see g_raid3_clone_bio()/g_raid3_destroy_bio()); read-only at runtime,
 * settable as loader tunables.
 */
static u_int g_raid3_n64k = 50;
TUNABLE_INT("kern.geom.raid3.n64k", &g_raid3_n64k);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n64k, CTLFLAG_RD, &g_raid3_n64k, 0,
    "Maximum number of 64kB allocations");
static u_int g_raid3_n16k = 200;
TUNABLE_INT("kern.geom.raid3.n16k", &g_raid3_n16k);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n16k, CTLFLAG_RD, &g_raid3_n16k, 0,
    "Maximum number of 16kB allocations");
static u_int g_raid3_n4k = 1200;
TUNABLE_INT("kern.geom.raid3.n4k", &g_raid3_n4k);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n4k, CTLFLAG_RD, &g_raid3_n4k, 0,
    "Maximum number of 4kB allocations");

/* Statistics counters, exported read-only under kern.geom.raid3.stat. */
SYSCTL_NODE(_kern_geom_raid3, OID_AUTO, stat, CTLFLAG_RW, 0,
    "GEOM_RAID3 statistics");
static u_int g_raid3_parity_mismatch = 0;
SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, parity_mismatch, CTLFLAG_RD,
    &g_raid3_parity_mismatch, 0, "Number of failures in VERIFY mode");
static u_int g_raid3_64k_requested = 0;
SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 64k_requested, CTLFLAG_RD,
    &g_raid3_64k_requested, 0, "Number of requested 64kB allocations");
static u_int g_raid3_64k_failed = 0;
SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 64k_failed, CTLFLAG_RD,
    &g_raid3_64k_failed, 0, "Number of failed 64kB allocations");
static u_int g_raid3_16k_requested = 0;
SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 16k_requested, CTLFLAG_RD,
    &g_raid3_16k_requested, 0, "Number of requested 16kB allocations");
static u_int g_raid3_16k_failed = 0;
SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 16k_failed, CTLFLAG_RD,
    &g_raid3_16k_failed, 0, "Number of failed 16kB allocations");
static u_int g_raid3_4k_requested = 0;
SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 4k_requested, CTLFLAG_RD,
    &g_raid3_4k_requested, 0, "Number of requested 4kB allocations");
static u_int g_raid3_4k_failed = 0;
SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 4k_failed, CTLFLAG_RD,
    &g_raid3_4k_failed, 0, "Number of failed 4kB allocations");
11083098Smp
/*
 * msleep(9) wrapper which logs the sleep and wakeup of the calling thread
 * at debug level 4 so event waiting can be traced.
 */
#define	MSLEEP(ident, mtx, priority, wmesg, timeout)	do {		\
	G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, (ident));	\
	msleep((ident), (mtx), (priority), (wmesg), (timeout));		\
	G_RAID3_DEBUG(4, "%s: Woken up %p.", __func__, (ident));	\
} while (0)
116
/* Shutdown event handler tag; registered in g_raid3_init() (not in view). */
static eventhandler_tag g_raid3_ehtag = NULL;

static int g_raid3_destroy_geom(struct gctl_req *req, struct g_class *mp,
    struct g_geom *gp);
static g_taste_t g_raid3_taste;
static void g_raid3_init(struct g_class *mp);
static void g_raid3_fini(struct g_class *mp);

/* GEOM class descriptor; registered with the GEOM framework. */
struct g_class g_raid3_class = {
	.name = G_RAID3_CLASS_NAME,
	.version = G_VERSION,
	.ctlreq = g_raid3_config,
	.taste = g_raid3_taste,
	.destroy_geom = g_raid3_destroy_geom,
	.init = g_raid3_init,
	.fini = g_raid3_fini
};


/* Forward declarations for functions defined later in this file. */
static void g_raid3_destroy_provider(struct g_raid3_softc *sc);
static int g_raid3_update_disk(struct g_raid3_disk *disk, u_int state);
static void g_raid3_update_device(struct g_raid3_softc *sc, boolean_t force);
static void g_raid3_dumpconf(struct sbuf *sb, const char *indent,
    struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp);
static void g_raid3_sync_stop(struct g_raid3_softc *sc, int type);
143
144static const char *
145g_raid3_disk_state2str(int state)
146{
147
148	switch (state) {
149	case G_RAID3_DISK_STATE_NODISK:
150		return ("NODISK");
151	case G_RAID3_DISK_STATE_NONE:
152		return ("NONE");
153	case G_RAID3_DISK_STATE_NEW:
154		return ("NEW");
155	case G_RAID3_DISK_STATE_ACTIVE:
156		return ("ACTIVE");
157	case G_RAID3_DISK_STATE_STALE:
158		return ("STALE");
159	case G_RAID3_DISK_STATE_SYNCHRONIZING:
160		return ("SYNCHRONIZING");
161	case G_RAID3_DISK_STATE_DISCONNECTED:
162		return ("DISCONNECTED");
163	default:
164		return ("INVALID");
165	}
166}
167
168static const char *
169g_raid3_device_state2str(int state)
170{
171
172	switch (state) {
173	case G_RAID3_DEVICE_STATE_STARTING:
174		return ("STARTING");
175	case G_RAID3_DEVICE_STATE_DEGRADED:
176		return ("DEGRADED");
177	case G_RAID3_DEVICE_STATE_COMPLETE:
178		return ("COMPLETE");
179	default:
180		return ("INVALID");
181	}
182}
183
184const char *
185g_raid3_get_diskname(struct g_raid3_disk *disk)
186{
187
188	if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL)
189		return ("[unknown]");
190	return (disk->d_name);
191}
192
193#define	g_raid3_xor(src1, src2, dst, size)				\
194	_g_raid3_xor((uint64_t *)(src1), (uint64_t *)(src2),		\
195	    (uint64_t *)(dst), (size_t)size)
196static void
197_g_raid3_xor(uint64_t *src1, uint64_t *src2, uint64_t *dst, size_t size)
198{
199
200	KASSERT((size % 128) == 0, ("Invalid size: %zu.", size));
201	for (; size > 0; size -= 128) {
202		*dst++ = (*src1++) ^ (*src2++);
203		*dst++ = (*src1++) ^ (*src2++);
204		*dst++ = (*src1++) ^ (*src2++);
205		*dst++ = (*src1++) ^ (*src2++);
206		*dst++ = (*src1++) ^ (*src2++);
207		*dst++ = (*src1++) ^ (*src2++);
208		*dst++ = (*src1++) ^ (*src2++);
209		*dst++ = (*src1++) ^ (*src2++);
210		*dst++ = (*src1++) ^ (*src2++);
211		*dst++ = (*src1++) ^ (*src2++);
212		*dst++ = (*src1++) ^ (*src2++);
213		*dst++ = (*src1++) ^ (*src2++);
214		*dst++ = (*src1++) ^ (*src2++);
215		*dst++ = (*src1++) ^ (*src2++);
216		*dst++ = (*src1++) ^ (*src2++);
217		*dst++ = (*src1++) ^ (*src2++);
218	}
219}
220
221static int
222g_raid3_is_zero(struct bio *bp)
223{
224	static const uint64_t zeros[] = {
225	    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
226	};
227	u_char *addr;
228	ssize_t size;
229
230	size = bp->bio_length;
231	addr = (u_char *)bp->bio_data;
232	for (; size > 0; size -= sizeof(zeros), addr += sizeof(zeros)) {
233		if (bcmp(addr, zeros, sizeof(zeros)) != 0)
234			return (0);
235	}
236	return (1);
237}
238
239/*
240 * --- Events handling functions ---
241 * Events in geom_raid3 are used to maintain disks and device status
242 * from one thread to simplify locking.
243 */
244static void
245g_raid3_event_free(struct g_raid3_event *ep)
246{
247
248	free(ep, M_RAID3);
249}
250
251int
252g_raid3_event_send(void *arg, int state, int flags)
253{
254	struct g_raid3_softc *sc;
255	struct g_raid3_disk *disk;
256	struct g_raid3_event *ep;
257	int error;
258
259	ep = malloc(sizeof(*ep), M_RAID3, M_WAITOK);
260	G_RAID3_DEBUG(4, "%s: Sending event %p.", __func__, ep);
261	if ((flags & G_RAID3_EVENT_DEVICE) != 0) {
262		disk = NULL;
263		sc = arg;
264	} else {
265		disk = arg;
266		sc = disk->d_softc;
267	}
268	ep->e_disk = disk;
269	ep->e_state = state;
270	ep->e_flags = flags;
271	ep->e_error = 0;
272	mtx_lock(&sc->sc_events_mtx);
273	TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next);
274	mtx_unlock(&sc->sc_events_mtx);
275	G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc);
276	mtx_lock(&sc->sc_queue_mtx);
277	wakeup(sc);
278	wakeup(&sc->sc_queue);
279	mtx_unlock(&sc->sc_queue_mtx);
280	if ((flags & G_RAID3_EVENT_DONTWAIT) != 0)
281		return (0);
282	g_topology_assert();
283	G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, ep);
284	g_topology_unlock();
285	while ((ep->e_flags & G_RAID3_EVENT_DONE) == 0) {
286		mtx_lock(&sc->sc_events_mtx);
287		MSLEEP(ep, &sc->sc_events_mtx, PRIBIO | PDROP, "r3:event",
288		    hz * 5);
289	}
290	/* Don't even try to use 'sc' here, because it could be already dead. */
291	g_topology_lock();
292	error = ep->e_error;
293	g_raid3_event_free(ep);
294	return (error);
295}
296
297static struct g_raid3_event *
298g_raid3_event_get(struct g_raid3_softc *sc)
299{
300	struct g_raid3_event *ep;
301
302	mtx_lock(&sc->sc_events_mtx);
303	ep = TAILQ_FIRST(&sc->sc_events);
304	mtx_unlock(&sc->sc_events_mtx);
305	return (ep);
306}
307
308static void
309g_raid3_event_remove(struct g_raid3_softc *sc, struct g_raid3_event *ep)
310{
311
312	mtx_lock(&sc->sc_events_mtx);
313	TAILQ_REMOVE(&sc->sc_events, ep, e_next);
314	mtx_unlock(&sc->sc_events_mtx);
315}
316
317static void
318g_raid3_event_cancel(struct g_raid3_disk *disk)
319{
320	struct g_raid3_softc *sc;
321	struct g_raid3_event *ep, *tmpep;
322
323	g_topology_assert();
324
325	sc = disk->d_softc;
326	mtx_lock(&sc->sc_events_mtx);
327	TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) {
328		if ((ep->e_flags & G_RAID3_EVENT_DEVICE) != 0)
329			continue;
330		if (ep->e_disk != disk)
331			continue;
332		TAILQ_REMOVE(&sc->sc_events, ep, e_next);
333		if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0)
334			g_raid3_event_free(ep);
335		else {
336			ep->e_error = ECANCELED;
337			wakeup(ep);
338		}
339	}
340	mtx_unlock(&sc->sc_events_mtx);
341}
342
343/*
344 * Return the number of disks in the given state.
345 * If state is equal to -1, count all connected disks.
346 */
347u_int
348g_raid3_ndisks(struct g_raid3_softc *sc, int state)
349{
350	struct g_raid3_disk *disk;
351	u_int n, ndisks;
352
353	for (n = ndisks = 0; n < sc->sc_ndisks; n++) {
354		disk = &sc->sc_disks[n];
355		if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
356			continue;
357		if (state == -1 || disk->d_state == state)
358			ndisks++;
359	}
360	return (ndisks);
361}
362
363static u_int
364g_raid3_nrequests(struct g_raid3_softc *sc, struct g_consumer *cp)
365{
366	struct bio *bp;
367	u_int nreqs = 0;
368
369	mtx_lock(&sc->sc_queue_mtx);
370	TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) {
371		if (bp->bio_from == cp)
372			nreqs++;
373	}
374	mtx_unlock(&sc->sc_queue_mtx);
375	return (nreqs);
376}
377
378static int
379g_raid3_is_busy(struct g_raid3_softc *sc, struct g_consumer *cp)
380{
381
382	if (cp->index > 0) {
383		G_RAID3_DEBUG(2,
384		    "I/O requests for %s exist, can't destroy it now.",
385		    cp->provider->name);
386		return (1);
387	}
388	if (g_raid3_nrequests(sc, cp) > 0) {
389		G_RAID3_DEBUG(2,
390		    "I/O requests for %s in queue, can't destroy it now.",
391		    cp->provider->name);
392		return (1);
393	}
394	return (0);
395}
396
397static void
398g_raid3_destroy_consumer(void *arg, int flags __unused)
399{
400	struct g_consumer *cp;
401
402	cp = arg;
403	G_RAID3_DEBUG(1, "Consumer %s destroyed.", cp->provider->name);
404	g_detach(cp);
405	g_destroy_consumer(cp);
406}
407
408static void
409g_raid3_kill_consumer(struct g_raid3_softc *sc, struct g_consumer *cp)
410{
411	struct g_provider *pp;
412	int retaste_wait;
413
414	g_topology_assert();
415
416	cp->private = NULL;
417	if (g_raid3_is_busy(sc, cp))
418		return;
419	G_RAID3_DEBUG(2, "Consumer %s destroyed.", cp->provider->name);
420	pp = cp->provider;
421	retaste_wait = 0;
422	if (cp->acw == 1) {
423		if ((pp->geom->flags & G_GEOM_WITHER) == 0)
424			retaste_wait = 1;
425	}
426	G_RAID3_DEBUG(2, "Access %s r%dw%de%d = %d", pp->name, -cp->acr,
427	    -cp->acw, -cp->ace, 0);
428	if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0)
429		g_access(cp, -cp->acr, -cp->acw, -cp->ace);
430	if (retaste_wait) {
431		/*
432		 * After retaste event was send (inside g_access()), we can send
433		 * event to detach and destroy consumer.
434		 * A class, which has consumer to the given provider connected
435		 * will not receive retaste event for the provider.
436		 * This is the way how I ignore retaste events when I close
437		 * consumers opened for write: I detach and destroy consumer
438		 * after retaste event is sent.
439		 */
440		g_post_event(g_raid3_destroy_consumer, cp, M_WAITOK, NULL);
441		return;
442	}
443	G_RAID3_DEBUG(1, "Consumer %s destroyed.", pp->name);
444	g_detach(cp);
445	g_destroy_consumer(cp);
446}
447
448static int
449g_raid3_connect_disk(struct g_raid3_disk *disk, struct g_provider *pp)
450{
451	int error;
452
453	g_topology_assert();
454	KASSERT(disk->d_consumer == NULL,
455	    ("Disk already connected (device %s).", disk->d_softc->sc_name));
456
457	disk->d_consumer = g_new_consumer(disk->d_softc->sc_geom);
458	disk->d_consumer->private = disk;
459	disk->d_consumer->index = 0;
460	error = g_attach(disk->d_consumer, pp);
461	if (error != 0)
462		return (error);
463	error = g_access(disk->d_consumer, 1, 1, 1);
464	if (error != 0) {
465		G_RAID3_DEBUG(0, "Cannot open consumer %s (error=%d).",
466		    pp->name, error);
467		return (error);
468	}
469	G_RAID3_DEBUG(2, "Disk %s connected.", g_raid3_get_diskname(disk));
470	return (0);
471}
472
473static void
474g_raid3_disconnect_consumer(struct g_raid3_softc *sc, struct g_consumer *cp)
475{
476
477	g_topology_assert();
478
479	if (cp == NULL)
480		return;
481	if (cp->provider != NULL)
482		g_raid3_kill_consumer(sc, cp);
483	else
484		g_destroy_consumer(cp);
485}
486
487/*
488 * Initialize disk. This means allocate memory, create consumer, attach it
489 * to the provider and open access (r1w1e1) to it.
490 */
491static struct g_raid3_disk *
492g_raid3_init_disk(struct g_raid3_softc *sc, struct g_provider *pp,
493    struct g_raid3_metadata *md, int *errorp)
494{
495	struct g_raid3_disk *disk;
496	int error;
497
498	disk = &sc->sc_disks[md->md_no];
499	error = g_raid3_connect_disk(disk, pp);
500	if (error != 0)
501		goto fail;
502	disk->d_state = G_RAID3_DISK_STATE_NONE;
503	disk->d_flags = md->md_dflags;
504	if (md->md_provider[0] != '\0')
505		disk->d_flags |= G_RAID3_DISK_FLAG_HARDCODED;
506	disk->d_sync.ds_consumer = NULL;
507	disk->d_sync.ds_offset = md->md_sync_offset;
508	disk->d_sync.ds_offset_done = md->md_sync_offset;
509	disk->d_sync.ds_resync = -1;
510	disk->d_genid = md->md_genid;
511	disk->d_sync.ds_syncid = md->md_syncid;
512	if (errorp != NULL)
513		*errorp = 0;
514	return (disk);
515fail:
516	if (errorp != NULL)
517		*errorp = error;
518	if (disk != NULL)
519		g_raid3_disconnect_consumer(sc, disk->d_consumer);
520	return (NULL);
521}
522
523static void
524g_raid3_destroy_disk(struct g_raid3_disk *disk)
525{
526	struct g_raid3_softc *sc;
527
528	g_topology_assert();
529
530	if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
531		return;
532	g_raid3_event_cancel(disk);
533	sc = disk->d_softc;
534	switch (disk->d_state) {
535	case G_RAID3_DISK_STATE_SYNCHRONIZING:
536		if (sc->sc_syncdisk != NULL)
537			g_raid3_sync_stop(sc, 1);
538		/* FALLTHROUGH */
539	case G_RAID3_DISK_STATE_NEW:
540	case G_RAID3_DISK_STATE_STALE:
541	case G_RAID3_DISK_STATE_ACTIVE:
542		g_raid3_disconnect_consumer(sc, disk->d_consumer);
543		disk->d_consumer = NULL;
544		break;
545	default:
546		KASSERT(0 == 1, ("Wrong disk state (%s, %s).",
547		    g_raid3_get_diskname(disk),
548		    g_raid3_disk_state2str(disk->d_state)));
549	}
550	disk->d_state = G_RAID3_DISK_STATE_NODISK;
551}
552
553static void
554g_raid3_destroy_device(struct g_raid3_softc *sc)
555{
556	struct g_raid3_event *ep;
557	struct g_raid3_disk *disk;
558	struct g_geom *gp;
559	struct g_consumer *cp;
560	u_int n;
561
562	g_topology_assert();
563
564	gp = sc->sc_geom;
565	if (sc->sc_provider != NULL)
566		g_raid3_destroy_provider(sc);
567	for (n = 0; n < sc->sc_ndisks; n++) {
568		disk = &sc->sc_disks[n];
569		if (disk->d_state != G_RAID3_DISK_STATE_NODISK) {
570			disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
571			g_raid3_update_metadata(disk);
572			g_raid3_destroy_disk(disk);
573		}
574	}
575	while ((ep = g_raid3_event_get(sc)) != NULL) {
576		g_raid3_event_remove(sc, ep);
577		if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0)
578			g_raid3_event_free(ep);
579		else {
580			ep->e_error = ECANCELED;
581			ep->e_flags |= G_RAID3_EVENT_DONE;
582			G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, ep);
583			mtx_lock(&sc->sc_events_mtx);
584			wakeup(ep);
585			mtx_unlock(&sc->sc_events_mtx);
586		}
587	}
588	callout_drain(&sc->sc_callout);
589	gp->softc = NULL;
590	cp = LIST_FIRST(&sc->sc_sync.ds_geom->consumer);
591	if (cp != NULL)
592		g_raid3_disconnect_consumer(sc, cp);
593	sc->sc_sync.ds_geom->softc = NULL;
594	g_wither_geom(sc->sc_sync.ds_geom, ENXIO);
595	uma_zdestroy(sc->sc_zone_64k);
596	uma_zdestroy(sc->sc_zone_16k);
597	uma_zdestroy(sc->sc_zone_4k);
598	mtx_destroy(&sc->sc_queue_mtx);
599	mtx_destroy(&sc->sc_events_mtx);
600	G_RAID3_DEBUG(0, "Device %s destroyed.", gp->name);
601	g_wither_geom(gp, ENXIO);
602}
603
604static void
605g_raid3_orphan(struct g_consumer *cp)
606{
607	struct g_raid3_disk *disk;
608
609	g_topology_assert();
610
611	disk = cp->private;
612	if (disk == NULL)
613		return;
614	disk->d_softc->sc_bump_id = G_RAID3_BUMP_SYNCID;
615	g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED,
616	    G_RAID3_EVENT_DONTWAIT);
617}
618
619static int
620g_raid3_write_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md)
621{
622	struct g_raid3_softc *sc;
623	struct g_consumer *cp;
624	off_t offset, length;
625	u_char *sector;
626	int error = 0;
627
628	g_topology_assert();
629
630	sc = disk->d_softc;
631	cp = disk->d_consumer;
632	KASSERT(cp != NULL, ("NULL consumer (%s).", sc->sc_name));
633	KASSERT(cp->provider != NULL, ("NULL provider (%s).", sc->sc_name));
634	KASSERT(cp->acr == 1 && cp->acw == 1 && cp->ace == 1,
635	    ("Consumer %s closed? (r%dw%de%d).", cp->provider->name, cp->acr,
636	    cp->acw, cp->ace));
637	length = cp->provider->sectorsize;
638	offset = cp->provider->mediasize - length;
639	sector = malloc((size_t)length, M_RAID3, M_WAITOK | M_ZERO);
640	if (md != NULL)
641		raid3_metadata_encode(md, sector);
642	g_topology_unlock();
643	error = g_write_data(cp, offset, sector, length);
644	g_topology_lock();
645	free(sector, M_RAID3);
646	if (error != 0) {
647		disk->d_softc->sc_bump_id = G_RAID3_BUMP_GENID;
648		g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED,
649		    G_RAID3_EVENT_DONTWAIT);
650	}
651	return (error);
652}
653
654int
655g_raid3_clear_metadata(struct g_raid3_disk *disk)
656{
657	int error;
658
659	g_topology_assert();
660	error = g_raid3_write_metadata(disk, NULL);
661	if (error == 0) {
662		G_RAID3_DEBUG(2, "Metadata on %s cleared.",
663		    g_raid3_get_diskname(disk));
664	} else {
665		G_RAID3_DEBUG(0,
666		    "Cannot clear metadata on disk %s (error=%d).",
667		    g_raid3_get_diskname(disk), error);
668	}
669	return (error);
670}
671
672void
673g_raid3_fill_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md)
674{
675	struct g_raid3_softc *sc;
676
677	sc = disk->d_softc;
678	strlcpy(md->md_magic, G_RAID3_MAGIC, sizeof(md->md_magic));
679	md->md_version = G_RAID3_VERSION;
680	strlcpy(md->md_name, sc->sc_name, sizeof(md->md_name));
681	md->md_id = sc->sc_id;
682	md->md_all = sc->sc_ndisks;
683	md->md_genid = sc->sc_genid;
684	md->md_mediasize = sc->sc_mediasize;
685	md->md_sectorsize = sc->sc_sectorsize;
686	md->md_mflags = (sc->sc_flags & G_RAID3_DEVICE_FLAG_MASK);
687	md->md_no = disk->d_no;
688	md->md_syncid = disk->d_sync.ds_syncid;
689	md->md_dflags = (disk->d_flags & G_RAID3_DISK_FLAG_MASK);
690	if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING)
691		md->md_sync_offset = disk->d_sync.ds_offset_done;
692	else
693		md->md_sync_offset = 0;
694	if ((disk->d_flags & G_RAID3_DISK_FLAG_HARDCODED) != 0 &&
695	    disk->d_consumer != NULL && disk->d_consumer->provider != NULL) {
696		strlcpy(md->md_provider, disk->d_consumer->provider->name,
697		    sizeof(md->md_provider));
698	} else {
699		bzero(md->md_provider, sizeof(md->md_provider));
700	}
701}
702
703void
704g_raid3_update_metadata(struct g_raid3_disk *disk)
705{
706	struct g_raid3_metadata md;
707	int error;
708
709	g_topology_assert();
710	g_raid3_fill_metadata(disk, &md);
711	error = g_raid3_write_metadata(disk, &md);
712	if (error == 0) {
713		G_RAID3_DEBUG(2, "Metadata on %s updated.",
714		    g_raid3_get_diskname(disk));
715	} else {
716		G_RAID3_DEBUG(0,
717		    "Cannot update metadata on disk %s (error=%d).",
718		    g_raid3_get_diskname(disk), error);
719	}
720}
721
722static void
723g_raid3_bump_syncid(struct g_raid3_softc *sc)
724{
725	struct g_raid3_disk *disk;
726	u_int n;
727
728	g_topology_assert();
729	KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) > 0,
730	    ("%s called with no active disks (device=%s).", __func__,
731	    sc->sc_name));
732
733	sc->sc_syncid++;
734	G_RAID3_DEBUG(1, "Device %s: syncid bumped to %u.", sc->sc_name,
735	    sc->sc_syncid);
736	for (n = 0; n < sc->sc_ndisks; n++) {
737		disk = &sc->sc_disks[n];
738		if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
739		    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
740			disk->d_sync.ds_syncid = sc->sc_syncid;
741			g_raid3_update_metadata(disk);
742		}
743	}
744}
745
746static void
747g_raid3_bump_genid(struct g_raid3_softc *sc)
748{
749	struct g_raid3_disk *disk;
750	u_int n;
751
752	g_topology_assert();
753	KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) > 0,
754	    ("%s called with no active disks (device=%s).", __func__,
755	    sc->sc_name));
756
757	sc->sc_genid++;
758	G_RAID3_DEBUG(1, "Device %s: genid bumped to %u.", sc->sc_name,
759	    sc->sc_genid);
760	for (n = 0; n < sc->sc_ndisks; n++) {
761		disk = &sc->sc_disks[n];
762		if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
763		    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
764			disk->d_genid = sc->sc_genid;
765			g_raid3_update_metadata(disk);
766		}
767	}
768}
769
770static void
771g_raid3_idle(struct g_raid3_softc *sc)
772{
773	struct g_raid3_disk *disk;
774	u_int i;
775
776	if (sc->sc_provider == NULL || sc->sc_provider->acw == 0)
777		return;
778	sc->sc_idle = 1;
779	g_topology_lock();
780	for (i = 0; i < sc->sc_ndisks; i++) {
781		disk = &sc->sc_disks[i];
782		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
783			continue;
784		G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.",
785		    g_raid3_get_diskname(disk), sc->sc_name);
786		disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
787		g_raid3_update_metadata(disk);
788	}
789	g_topology_unlock();
790}
791
792static void
793g_raid3_unidle(struct g_raid3_softc *sc)
794{
795	struct g_raid3_disk *disk;
796	u_int i;
797
798	sc->sc_idle = 0;
799	g_topology_lock();
800	for (i = 0; i < sc->sc_ndisks; i++) {
801		disk = &sc->sc_disks[i];
802		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
803			continue;
804		G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.",
805		    g_raid3_get_diskname(disk), sc->sc_name);
806		disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
807		g_raid3_update_metadata(disk);
808	}
809	g_topology_unlock();
810}
811
812/*
813 * Return 1 if we should check if RAID3 device is idling.
814 */
815static int
816g_raid3_check_idle(struct g_raid3_softc *sc)
817{
818	struct g_raid3_disk *disk;
819	u_int i;
820
821	if (sc->sc_idle)
822		return (0);
823	if (sc->sc_provider != NULL && sc->sc_provider->acw == 0)
824		return (0);
825	/*
826	 * Check if there are no in-flight requests.
827	 */
828	for (i = 0; i < sc->sc_ndisks; i++) {
829		disk = &sc->sc_disks[i];
830		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
831			continue;
832		if (disk->d_consumer->index > 0)
833			return (0);
834	}
835	return (1);
836}
837
/*
 * Treat bio_driver1 field in parent bio as list head and field bio_caller1
 * in child bio as pointer to the next element on the list.
 */
/* First child bio of the parent request. */
#define	G_RAID3_HEAD_BIO(pbp)	(pbp)->bio_driver1

/* Next sibling of a child bio on the parent's list. */
#define	G_RAID3_NEXT_BIO(cbp)	(cbp)->bio_caller1

/* Iterate over the parent's child bios. */
#define	G_RAID3_FOREACH_BIO(pbp, bp)					\
	for ((bp) = G_RAID3_HEAD_BIO(pbp); (bp) != NULL;		\
	    (bp) = G_RAID3_NEXT_BIO(bp))

/* As above, but safe against removal of the current element. */
#define	G_RAID3_FOREACH_SAFE_BIO(pbp, bp, tmpbp)			\
	for ((bp) = G_RAID3_HEAD_BIO(pbp);				\
	    (bp) != NULL && ((tmpbp) = G_RAID3_NEXT_BIO(bp), 1);	\
	    (bp) = (tmpbp))
854
855static void
856g_raid3_init_bio(struct bio *pbp)
857{
858
859	G_RAID3_HEAD_BIO(pbp) = NULL;
860}
861
862static void
863g_raid3_remove_bio(struct bio *cbp)
864{
865	struct bio *pbp, *bp;
866
867	pbp = cbp->bio_parent;
868	if (G_RAID3_HEAD_BIO(pbp) == cbp)
869		G_RAID3_HEAD_BIO(pbp) = G_RAID3_NEXT_BIO(cbp);
870	else {
871		G_RAID3_FOREACH_BIO(pbp, bp) {
872			if (G_RAID3_NEXT_BIO(bp) == cbp) {
873				G_RAID3_NEXT_BIO(bp) = G_RAID3_NEXT_BIO(cbp);
874				break;
875			}
876		}
877	}
878	G_RAID3_NEXT_BIO(cbp) = NULL;
879}
880
881static void
882g_raid3_replace_bio(struct bio *sbp, struct bio *dbp)
883{
884	struct bio *pbp, *bp;
885
886	g_raid3_remove_bio(sbp);
887	pbp = dbp->bio_parent;
888	G_RAID3_NEXT_BIO(sbp) = G_RAID3_NEXT_BIO(dbp);
889	if (G_RAID3_HEAD_BIO(pbp) == dbp)
890		G_RAID3_HEAD_BIO(pbp) = sbp;
891	else {
892		G_RAID3_FOREACH_BIO(pbp, bp) {
893			if (G_RAID3_NEXT_BIO(bp) == dbp) {
894				G_RAID3_NEXT_BIO(bp) = sbp;
895				break;
896			}
897		}
898	}
899	G_RAID3_NEXT_BIO(dbp) = NULL;
900}
901
902static void
903g_raid3_destroy_bio(struct g_raid3_softc *sc, struct bio *cbp)
904{
905	struct bio *bp, *pbp;
906	size_t size;
907
908	pbp = cbp->bio_parent;
909	pbp->bio_children--;
910	KASSERT(cbp->bio_data != NULL, ("NULL bio_data"));
911	size = pbp->bio_length / (sc->sc_ndisks - 1);
912	if (size > 16384)
913		uma_zfree(sc->sc_zone_64k, cbp->bio_data);
914	else if (size > 4096)
915		uma_zfree(sc->sc_zone_16k, cbp->bio_data);
916	else
917		uma_zfree(sc->sc_zone_4k, cbp->bio_data);
918	if (G_RAID3_HEAD_BIO(pbp) == cbp) {
919		G_RAID3_HEAD_BIO(pbp) = G_RAID3_NEXT_BIO(cbp);
920		G_RAID3_NEXT_BIO(cbp) = NULL;
921		g_destroy_bio(cbp);
922	} else {
923		G_RAID3_FOREACH_BIO(pbp, bp) {
924			if (G_RAID3_NEXT_BIO(bp) == cbp)
925				break;
926		}
927		if (bp != NULL) {
928			KASSERT(G_RAID3_NEXT_BIO(bp) != NULL,
929			    ("NULL bp->bio_driver1"));
930			G_RAID3_NEXT_BIO(bp) = G_RAID3_NEXT_BIO(cbp);
931			G_RAID3_NEXT_BIO(cbp) = NULL;
932		}
933		g_destroy_bio(cbp);
934	}
935}
936
937static struct bio *
938g_raid3_clone_bio(struct g_raid3_softc *sc, struct bio *pbp)
939{
940	struct bio *bp, *cbp;
941	size_t size;
942
943	cbp = g_clone_bio(pbp);
944	if (cbp == NULL)
945		return (NULL);
946	size = pbp->bio_length / (sc->sc_ndisks - 1);
947	if (size > 16384) {
948		cbp->bio_data = uma_zalloc(sc->sc_zone_64k, M_NOWAIT);
949		g_raid3_64k_requested++;
950	} else if (size > 4096) {
951		cbp->bio_data = uma_zalloc(sc->sc_zone_16k, M_NOWAIT);
952		g_raid3_16k_requested++;
953	} else {
954		cbp->bio_data = uma_zalloc(sc->sc_zone_4k, M_NOWAIT);
955		g_raid3_4k_requested++;
956	}
957	if (cbp->bio_data == NULL) {
958		if (size > 16384)
959			g_raid3_64k_failed++;
960		if (size > 4096)
961			g_raid3_16k_failed++;
962		else
963			g_raid3_4k_failed++;
964		pbp->bio_children--;
965		g_destroy_bio(cbp);
966		return (NULL);
967	}
968	G_RAID3_NEXT_BIO(cbp) = NULL;
969	if (G_RAID3_HEAD_BIO(pbp) == NULL)
970		G_RAID3_HEAD_BIO(pbp) = cbp;
971	else {
972		G_RAID3_FOREACH_BIO(pbp, bp) {
973			if (G_RAID3_NEXT_BIO(bp) == NULL) {
974				G_RAID3_NEXT_BIO(bp) = cbp;
975				break;
976			}
977		}
978	}
979	return (cbp);
980}
981
982static void
983g_raid3_scatter(struct bio *pbp)
984{
985	struct g_raid3_softc *sc;
986	struct g_raid3_disk *disk;
987	struct bio *bp, *cbp;
988	off_t atom, cadd, padd, left;
989
990	sc = pbp->bio_to->geom->softc;
991	bp = NULL;
992	if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_NOPARITY) == 0) {
993		/*
994		 * Find bio for which we should calculate data.
995		 */
996		G_RAID3_FOREACH_BIO(pbp, cbp) {
997			if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) {
998				bp = cbp;
999				break;
1000			}
1001		}
1002		KASSERT(bp != NULL, ("NULL parity bio."));
1003	}
1004	atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
1005	cadd = padd = 0;
1006	for (left = pbp->bio_length; left > 0; left -= sc->sc_sectorsize) {
1007		G_RAID3_FOREACH_BIO(pbp, cbp) {
1008			if (cbp == bp)
1009				continue;
1010			bcopy(pbp->bio_data + padd, cbp->bio_data + cadd, atom);
1011			padd += atom;
1012		}
1013		cadd += atom;
1014	}
1015	if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_NOPARITY) == 0) {
1016		struct bio *tmpbp;
1017
1018		/*
1019		 * Calculate parity.
1020		 */
1021		bzero(bp->bio_data, bp->bio_length);
1022		G_RAID3_FOREACH_SAFE_BIO(pbp, cbp, tmpbp) {
1023			if (cbp == bp)
1024				continue;
1025			g_raid3_xor(cbp->bio_data, bp->bio_data, bp->bio_data,
1026			    bp->bio_length);
1027			if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_NODISK) != 0)
1028				g_raid3_destroy_bio(sc, cbp);
1029		}
1030	}
1031	G_RAID3_FOREACH_BIO(pbp, cbp) {
1032		struct g_consumer *cp;
1033
1034		disk = cbp->bio_caller2;
1035		cp = disk->d_consumer;
1036		cbp->bio_to = cp->provider;
1037		G_RAID3_LOGREQ(3, cbp, "Sending request.");
1038		KASSERT(cp->acr == 1 && cp->acw == 1 && cp->ace == 1,
1039		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
1040		    cp->acr, cp->acw, cp->ace));
1041		cp->index++;
1042		g_io_request(cbp, cp);
1043	}
1044}
1045
1046static void
1047g_raid3_gather(struct bio *pbp)
1048{
1049	struct g_raid3_softc *sc;
1050	struct g_raid3_disk *disk;
1051	struct bio *xbp, *fbp, *cbp;
1052	off_t atom, cadd, padd, left;
1053
1054	sc = pbp->bio_to->geom->softc;
1055	/*
1056	 * Find bio for which we have to calculate data.
1057	 * While going through this path, check if all requests
1058	 * succeeded, if not, deny whole request.
1059	 * If we're in COMPLETE mode, we allow one request to fail,
1060	 * so if we find one, we're sending it to the parity consumer.
1061	 * If there are more failed requests, we deny whole request.
1062	 */
1063	xbp = fbp = NULL;
1064	G_RAID3_FOREACH_BIO(pbp, cbp) {
1065		if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) {
1066			KASSERT(xbp == NULL, ("More than one parity bio."));
1067			xbp = cbp;
1068		}
1069		if (cbp->bio_error == 0)
1070			continue;
1071		/*
1072		 * Found failed request.
1073		 */
1074		G_RAID3_LOGREQ(0, cbp, "Request failed.");
1075		disk = cbp->bio_caller2;
1076		if (disk != NULL) {
1077			/*
1078			 * Actually this is pointless to bump genid,
1079			 * because whole device is fucked up.
1080			 */
1081			sc->sc_bump_id |= G_RAID3_BUMP_GENID;
1082			g_raid3_event_send(disk,
1083			    G_RAID3_DISK_STATE_DISCONNECTED,
1084			    G_RAID3_EVENT_DONTWAIT);
1085		}
1086		if (fbp == NULL) {
1087			if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_DEGRADED) != 0) {
1088				/*
1089				 * We are already in degraded mode, so we can't
1090				 * accept any failures.
1091				 */
1092				if (pbp->bio_error == 0)
1093					pbp->bio_error = fbp->bio_error;
1094			} else {
1095				fbp = cbp;
1096			}
1097		} else {
1098			/*
1099			 * Next failed request, that's too many.
1100			 */
1101			if (pbp->bio_error == 0)
1102				pbp->bio_error = fbp->bio_error;
1103		}
1104	}
1105	if (pbp->bio_error != 0)
1106		goto finish;
1107	if (fbp != NULL && (pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) {
1108		pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_VERIFY;
1109		if (xbp != fbp)
1110			g_raid3_replace_bio(xbp, fbp);
1111		g_raid3_destroy_bio(sc, fbp);
1112	} else if (fbp != NULL) {
1113		struct g_consumer *cp;
1114
1115		/*
1116		 * One request failed, so send the same request to
1117		 * the parity consumer.
1118		 */
1119		disk = pbp->bio_driver2;
1120		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) {
1121			pbp->bio_error = fbp->bio_error;
1122			goto finish;
1123		}
1124		pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED;
1125		pbp->bio_inbed--;
1126		fbp->bio_flags &= ~(BIO_DONE | BIO_ERROR);
1127		if (disk->d_no == sc->sc_ndisks - 1)
1128			fbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
1129		fbp->bio_error = 0;
1130		fbp->bio_completed = 0;
1131		fbp->bio_children = 0;
1132		fbp->bio_inbed = 0;
1133		cp = disk->d_consumer;
1134		fbp->bio_caller2 = disk;
1135		fbp->bio_to = cp->provider;
1136		G_RAID3_LOGREQ(3, fbp, "Sending request (recover).");
1137		KASSERT(cp->acr == 1 && cp->acw == 1 && cp->ace == 1,
1138		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
1139		    cp->acr, cp->acw, cp->ace));
1140		cp->index++;
1141		g_io_request(fbp, cp);
1142		return;
1143	}
1144	if (xbp != NULL) {
1145		/*
1146		 * Calculate parity.
1147		 */
1148		G_RAID3_FOREACH_BIO(pbp, cbp) {
1149			if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0)
1150				continue;
1151			g_raid3_xor(cbp->bio_data, xbp->bio_data, xbp->bio_data,
1152			    xbp->bio_length);
1153		}
1154		xbp->bio_cflags &= ~G_RAID3_BIO_CFLAG_PARITY;
1155		if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) {
1156			if (!g_raid3_is_zero(xbp)) {
1157				g_raid3_parity_mismatch++;
1158				pbp->bio_error = EIO;
1159				goto finish;
1160			}
1161			g_raid3_destroy_bio(sc, xbp);
1162		}
1163	}
1164	atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
1165	cadd = padd = 0;
1166	for (left = pbp->bio_length; left > 0; left -= sc->sc_sectorsize) {
1167		G_RAID3_FOREACH_BIO(pbp, cbp) {
1168			bcopy(cbp->bio_data + cadd, pbp->bio_data + padd, atom);
1169			pbp->bio_completed += atom;
1170			padd += atom;
1171		}
1172		cadd += atom;
1173	}
1174finish:
1175	if (pbp->bio_error == 0)
1176		G_RAID3_LOGREQ(3, pbp, "Request finished.");
1177	else {
1178		if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0)
1179			G_RAID3_LOGREQ(1, pbp, "Verification error.");
1180		else
1181			G_RAID3_LOGREQ(0, pbp, "Request failed.");
1182	}
1183	pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_MASK;
1184	g_io_deliver(pbp, pbp->bio_error);
1185	while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL)
1186		g_raid3_destroy_bio(sc, cbp);
1187}
1188
1189static void
1190g_raid3_done(struct bio *bp)
1191{
1192	struct g_raid3_softc *sc;
1193
1194	sc = bp->bio_from->geom->softc;
1195	bp->bio_cflags |= G_RAID3_BIO_CFLAG_REGULAR;
1196	G_RAID3_LOGREQ(3, bp, "Regular request done (error=%d).", bp->bio_error);
1197	mtx_lock(&sc->sc_queue_mtx);
1198	bioq_insert_head(&sc->sc_queue, bp);
1199	wakeup(sc);
1200	wakeup(&sc->sc_queue);
1201	mtx_unlock(&sc->sc_queue_mtx);
1202}
1203
/*
 * Worker-thread handler for one finished regular child request.
 * Bumps the parent's completion count and, once all children are in,
 * finishes the parent: READs go through g_raid3_gather(), WRITE/DELETE
 * requests are completed here.  For writes one failed component is
 * tolerated (the device goes degraded); a second failure fails the
 * parent request.
 */
static void
g_raid3_regular_request(struct bio *cbp)
{
	struct g_raid3_softc *sc;
	struct g_raid3_disk *disk;
	struct bio *pbp;

	g_topology_assert_not();

	cbp->bio_from->index--;
	pbp = cbp->bio_parent;
	sc = pbp->bio_to->geom->softc;
	disk = cbp->bio_from->private;
	if (disk == NULL) {
		/* Consumer was orphaned while the request was in flight. */
		g_topology_lock();
		g_raid3_kill_consumer(sc, cbp->bio_from);
		g_topology_unlock();
	}

	G_RAID3_LOGREQ(3, cbp, "Request finished.");
	pbp->bio_inbed++;
	KASSERT(pbp->bio_inbed <= pbp->bio_children,
	    ("bio_inbed (%u) is bigger than bio_children (%u).", pbp->bio_inbed,
	    pbp->bio_children));
	/* Wait until every child has completed. */
	if (pbp->bio_inbed != pbp->bio_children)
		return;
	switch (pbp->bio_cmd) {
	case BIO_READ:
		g_raid3_gather(pbp);
		break;
	case BIO_WRITE:
	case BIO_DELETE:
	    {
		int error = 0;

		pbp->bio_completed = pbp->bio_length;
		while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL) {
			if (cbp->bio_error != 0) {
				disk = cbp->bio_caller2;
				if (disk != NULL) {
					/* Drop the failing component. */
					sc->sc_bump_id |= G_RAID3_BUMP_GENID;
					g_raid3_event_send(disk,
					    G_RAID3_DISK_STATE_DISCONNECTED,
					    G_RAID3_EVENT_DONTWAIT);
				}
				/*
				 * Remember the first failure in 'error'; only
				 * a second failure makes the parent fail.
				 */
				if (error == 0)
					error = cbp->bio_error;
				else if (pbp->bio_error == 0) {
					/*
					 * Next failed request, that's too many.
					 */
					pbp->bio_error = error;
				}
			}
			g_raid3_destroy_bio(sc, cbp);
		}
		if (pbp->bio_error == 0)
			G_RAID3_LOGREQ(3, pbp, "Request finished.");
		else
			G_RAID3_LOGREQ(0, pbp, "Request failed.");
		pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_DEGRADED;
		pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_NOPARITY;
		g_io_deliver(pbp, pbp->bio_error);
		break;
	    }
	}
}
1271
1272static void
1273g_raid3_sync_done(struct bio *bp)
1274{
1275	struct g_raid3_softc *sc;
1276
1277	G_RAID3_LOGREQ(3, bp, "Synchronization request delivered.");
1278	sc = bp->bio_from->geom->softc;
1279	bp->bio_cflags |= G_RAID3_BIO_CFLAG_SYNC;
1280	mtx_lock(&sc->sc_queue_mtx);
1281	bioq_insert_head(&sc->sc_queue, bp);
1282	wakeup(sc);
1283	wakeup(&sc->sc_queue);
1284	mtx_unlock(&sc->sc_queue_mtx);
1285}
1286
1287static void
1288g_raid3_start(struct bio *bp)
1289{
1290	struct g_raid3_softc *sc;
1291
1292	sc = bp->bio_to->geom->softc;
1293	/*
1294	 * If sc == NULL or there are no valid disks, provider's error
1295	 * should be set and g_raid3_start() should not be called at all.
1296	 */
1297	KASSERT(sc != NULL && (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
1298	    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE),
1299	    ("Provider's error should be set (error=%d)(device=%s).",
1300	    bp->bio_to->error, bp->bio_to->name));
1301	G_RAID3_LOGREQ(3, bp, "Request received.");
1302
1303	switch (bp->bio_cmd) {
1304	case BIO_READ:
1305	case BIO_WRITE:
1306	case BIO_DELETE:
1307		break;
1308	case BIO_GETATTR:
1309	default:
1310		g_io_deliver(bp, EOPNOTSUPP);
1311		return;
1312	}
1313	mtx_lock(&sc->sc_queue_mtx);
1314	bioq_insert_tail(&sc->sc_queue, bp);
1315	G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc);
1316	wakeup(sc);
1317	mtx_unlock(&sc->sc_queue_mtx);
1318}
1319
1320/*
1321 * Send one synchronization request.
1322 */
1323static void
1324g_raid3_sync_one(struct g_raid3_softc *sc)
1325{
1326	struct g_raid3_disk *disk;
1327	struct bio *bp;
1328
1329	KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED,
1330	    ("Wrong device state (%s, %s).", sc->sc_name,
1331	    g_raid3_device_state2str(sc->sc_state)));
1332	disk = sc->sc_syncdisk;
1333	KASSERT(disk != NULL, ("No sync disk (%s).", sc->sc_name));
1334	KASSERT(disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
1335	    ("Disk %s is not marked for synchronization.",
1336	    g_raid3_get_diskname(disk)));
1337
1338	bp = g_new_bio();
1339	if (bp == NULL)
1340		return;
1341	bp->bio_parent = NULL;
1342	bp->bio_cmd = BIO_READ;
1343	bp->bio_offset = disk->d_sync.ds_offset * (sc->sc_ndisks - 1);
1344	bp->bio_length = MIN(MAXPHYS, sc->sc_mediasize - bp->bio_offset);
1345	bp->bio_cflags = 0;
1346	bp->bio_done = g_raid3_sync_done;
1347	bp->bio_data = disk->d_sync.ds_data;
1348	if (bp->bio_data == NULL) {
1349		g_destroy_bio(bp);
1350		return;
1351	}
1352	bp->bio_cflags = G_RAID3_BIO_CFLAG_REGSYNC;
1353	disk->d_sync.ds_offset += bp->bio_length / (sc->sc_ndisks - 1);
1354	bp->bio_to = sc->sc_provider;
1355	G_RAID3_LOGREQ(3, bp, "Sending synchronization request.");
1356	disk->d_sync.ds_consumer->index++;
1357	g_io_request(bp, disk->d_sync.ds_consumer);
1358}
1359
/*
 * Worker-thread handler for a finished synchronization request.
 * A completed READ of the whole stripe is converted in place into a
 * WRITE of this component's share (data or freshly computed parity)
 * and re-issued to the synchronizing disk's consumer.  A completed
 * WRITE advances ds_offset_done and may activate the disk or update
 * its metadata.
 */
static void
g_raid3_sync_request(struct bio *bp)
{
	struct g_raid3_softc *sc;
	struct g_raid3_disk *disk;

	bp->bio_from->index--;
	sc = bp->bio_from->geom->softc;
	disk = bp->bio_from->private;
	if (disk == NULL) {
		/* Consumer was orphaned while the request was in flight. */
		g_topology_lock();
		g_raid3_kill_consumer(sc, bp->bio_from);
		g_topology_unlock();
		g_destroy_bio(bp);
		return;
	}

	/*
	 * Synchronization request.
	 */
	switch (bp->bio_cmd) {
	case BIO_READ:
	    {
		struct g_consumer *cp;
		u_char *dst, *src;
		off_t left;
		u_int atom;

		if (bp->bio_error != 0) {
			G_RAID3_LOGREQ(0, bp,
			    "Synchronization request failed (error=%d).",
			    bp->bio_error);
			g_destroy_bio(bp);
			return;
		}
		G_RAID3_LOGREQ(3, bp, "Synchronization request finished.");
		atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
		/* Compact this component's share in place (dst <= src always). */
		dst = src = bp->bio_data;
		if (disk->d_no == sc->sc_ndisks - 1) {
			u_int n;

			/* Parity component. */
			for (left = bp->bio_length; left > 0;
			    left -= sc->sc_sectorsize) {
				bcopy(src, dst, atom);
				src += atom;
				/* XOR together all data atoms of this sector. */
				for (n = 1; n < sc->sc_ndisks - 1; n++) {
					g_raid3_xor(src, dst, dst, atom);
					src += atom;
				}
				dst += atom;
			}
		} else {
			/* Regular component. */
			src += atom * disk->d_no;
			for (left = bp->bio_length; left > 0;
			    left -= sc->sc_sectorsize) {
				bcopy(src, dst, atom);
				src += sc->sc_sectorsize;
				dst += atom;
			}
		}
		/* Scale provider offsets down to per-component offsets. */
		bp->bio_offset /= sc->sc_ndisks - 1;
		bp->bio_length /= sc->sc_ndisks - 1;
		/* Reuse the same bio as a WRITE to the synchronizing disk. */
		bp->bio_cmd = BIO_WRITE;
		bp->bio_cflags = 0;
		bp->bio_children = bp->bio_inbed = 0;
		cp = disk->d_consumer;
		KASSERT(cp->acr == 1 && cp->acw == 1 && cp->ace == 1,
		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
		    cp->acr, cp->acw, cp->ace));
		cp->index++;
		g_io_request(bp, cp);
		return;
	    }
	case BIO_WRITE:
	    {
		struct g_raid3_disk_sync *sync;

		if (bp->bio_error != 0) {
			/* Cannot write to the rebuilt disk: disconnect it. */
			G_RAID3_LOGREQ(0, bp,
			    "Synchronization request failed (error=%d).",
			    bp->bio_error);
			g_destroy_bio(bp);
			sc->sc_bump_id |= G_RAID3_BUMP_GENID;
			g_raid3_event_send(disk,
			    G_RAID3_DISK_STATE_DISCONNECTED,
			    G_RAID3_EVENT_DONTWAIT);
			return;
		}
		G_RAID3_LOGREQ(3, bp, "Synchronization request finished.");
		sync = &disk->d_sync;
		sync->ds_offset_done = bp->bio_offset + bp->bio_length;
		g_destroy_bio(bp);
		/* A pending resync request invalidates this progress point. */
		if (sync->ds_resync != -1)
			return;
		if (sync->ds_offset_done ==
		    sc->sc_mediasize / (sc->sc_ndisks - 1)) {
			/*
			 * Disk up-to-date, activate it.
			 */
			g_raid3_event_send(disk, G_RAID3_DISK_STATE_ACTIVE,
			    G_RAID3_EVENT_DONTWAIT);
			return;
		} else if (sync->ds_offset_done % (MAXPHYS * 100) == 0) {
			/*
			 * Update offset_done on every 100 blocks.
			 * XXX: This should be configurable.
			 */
			g_topology_lock();
			g_raid3_update_metadata(disk);
			g_topology_unlock();
		}
		return;
	    }
	default:
		KASSERT(1 == 0, ("Invalid command here: %u (device=%s)",
		    bp->bio_cmd, sc->sc_name));
		break;
	}
}
1481
/*
 * Split a parent request into per-component child bios and dispatch it.
 * READs clone one bio per data component (all components in VERIFY
 * mode); WRITE/DELETE clone one bio for every component including
 * parity.  Returns 0 on success or ENOMEM when a clone could not be
 * allocated, in which case the caller re-queues the request.
 */
static int
g_raid3_register_request(struct bio *pbp)
{
	struct g_raid3_softc *sc;
	struct g_raid3_disk *disk;
	struct g_consumer *cp;
	struct bio *cbp;
	off_t offset, length;
	u_int n, ndisks;
	int round_robin, verify;

	ndisks = 0;
	sc = pbp->bio_to->geom->softc;
	/* A sync-triggered request is pointless once the sync disk is gone. */
	if ((pbp->bio_cflags & G_RAID3_BIO_CFLAG_REGSYNC) != 0 &&
	    sc->sc_syncdisk == NULL) {
		g_io_deliver(pbp, EIO);
		return (0);
	}
	g_raid3_init_bio(pbp);
	/* Per-component length/offset: the stripe spans all data disks. */
	length = pbp->bio_length / (sc->sc_ndisks - 1);
	offset = pbp->bio_offset / (sc->sc_ndisks - 1);
	round_robin = verify = 0;
	switch (pbp->bio_cmd) {
	case BIO_READ:
		if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_VERIFY) != 0 &&
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
			/* Read parity too, to check it against the data. */
			pbp->bio_pflags |= G_RAID3_BIO_PFLAG_VERIFY;
			verify = 1;
			ndisks = sc->sc_ndisks;
		} else {
			verify = 0;
			ndisks = sc->sc_ndisks - 1;
		}
		if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0 &&
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
			round_robin = 1;
		} else {
			round_robin = 0;
		}
		KASSERT(!round_robin || !verify,
		    ("ROUND-ROBIN and VERIFY are mutually exclusive."));
		/* Remember which disk to recover from, if a read fails. */
		pbp->bio_driver2 = &sc->sc_disks[sc->sc_ndisks - 1];
		break;
	case BIO_WRITE:
	case BIO_DELETE:
	    {
		struct g_raid3_disk_sync *sync;

		if (sc->sc_idle)
			g_raid3_unidle(sc);

		ndisks = sc->sc_ndisks;

		/*
		 * If the write overlaps the region already synchronized,
		 * part of it must be re-synchronized; record a resync point.
		 */
		if (sc->sc_syncdisk == NULL)
			break;
		sync = &sc->sc_syncdisk->d_sync;
		if (offset >= sync->ds_offset)
			break;
		if (offset + length <= sync->ds_offset_done)
			break;
		if (offset >= sync->ds_resync && sync->ds_resync != -1)
			break;
		sync->ds_resync = offset - (offset % MAXPHYS);
		break;
	    }
	}
	for (n = 0; n < ndisks; n++) {
		disk = &sc->sc_disks[n];
		cbp = g_raid3_clone_bio(sc, pbp);
		if (cbp == NULL) {
			/* Out of memory: undo all clones made so far. */
			while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL)
				g_raid3_destroy_bio(sc, cbp);
			return (ENOMEM);
		}
		cbp->bio_offset = offset;
		cbp->bio_length = length;
		cbp->bio_done = g_raid3_done;
		switch (pbp->bio_cmd) {
		case BIO_READ:
			if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) {
				/*
				 * Replace invalid component with the parity
				 * component.
				 */
				disk = &sc->sc_disks[sc->sc_ndisks - 1];
				cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
				pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED;
			} else if (round_robin &&
			    disk->d_no == sc->sc_round_robin) {
				/*
				 * In round-robin mode skip one data component
				 * and use parity component when reading.
				 */
				pbp->bio_driver2 = disk;
				disk = &sc->sc_disks[sc->sc_ndisks - 1];
				cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
				sc->sc_round_robin++;
				round_robin = 0;
			} else if (verify && disk->d_no == sc->sc_ndisks - 1) {
				cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
			}
			break;
		case BIO_WRITE:
		case BIO_DELETE:
			if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
			    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
				if (n == ndisks - 1) {
					/*
					 * Active parity component, mark it as such.
					 */
					cbp->bio_cflags |=
					    G_RAID3_BIO_CFLAG_PARITY;
				}
			} else {
				pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED;
				if (n == ndisks - 1) {
					/*
					 * Parity component is not connected,
					 * so destroy its request.
					 */
					pbp->bio_pflags |=
					    G_RAID3_BIO_PFLAG_NOPARITY;
					g_raid3_destroy_bio(sc, cbp);
					cbp = NULL;
				} else {
					cbp->bio_cflags |=
					    G_RAID3_BIO_CFLAG_NODISK;
					disk = NULL;
				}
			}
			break;
		}
		if (cbp != NULL)
			cbp->bio_caller2 = disk;
	}
	switch (pbp->bio_cmd) {
	case BIO_READ:
		if (round_robin) {
			/*
			 * If we are in round-robin mode and 'round_robin' is
			 * still 1, it means, that we skipped parity component
			 * for this read and must reset sc_round_robin field.
			 */
			sc->sc_round_robin = 0;
		}
		G_RAID3_FOREACH_BIO(pbp, cbp) {
			disk = cbp->bio_caller2;
			cp = disk->d_consumer;
			cbp->bio_to = cp->provider;
			G_RAID3_LOGREQ(3, cbp, "Sending request.");
			KASSERT(cp->acr == 1 && cp->acw == 1 && cp->ace == 1,
			    ("Consumer %s not opened (r%dw%de%d).",
			    cp->provider->name, cp->acr, cp->acw, cp->ace));
			cp->index++;
			g_io_request(cbp, cp);
		}
		break;
	case BIO_WRITE:
	case BIO_DELETE:
		/*
		 * Bump syncid on first write.
		 */
		if ((sc->sc_bump_id & G_RAID3_BUMP_SYNCID) != 0) {
			sc->sc_bump_id &= ~G_RAID3_BUMP_SYNCID;
			g_topology_lock();
			g_raid3_bump_syncid(sc);
			g_topology_unlock();
		}
		g_raid3_scatter(pbp);
		break;
	}
	return (0);
}
1655
1656static int
1657g_raid3_can_destroy(struct g_raid3_softc *sc)
1658{
1659	struct g_geom *gp;
1660	struct g_consumer *cp;
1661
1662	g_topology_assert();
1663	gp = sc->sc_geom;
1664	LIST_FOREACH(cp, &gp->consumer, consumer) {
1665		if (g_raid3_is_busy(sc, cp))
1666			return (0);
1667	}
1668	gp = sc->sc_sync.ds_geom;
1669	LIST_FOREACH(cp, &gp->consumer, consumer) {
1670		if (g_raid3_is_busy(sc, cp))
1671			return (0);
1672	}
1673	G_RAID3_DEBUG(2, "No I/O requests for %s, it can be destroyed.",
1674	    sc->sc_name);
1675	return (1);
1676}
1677
/*
 * Attempt to destroy the device.  Returns 1 when the worker thread
 * should exit (either the device was destroyed, or a waiter was woken
 * to finish the job), 0 when the device is still busy.
 */
static int
g_raid3_try_destroy(struct g_raid3_softc *sc)
{

	g_topology_lock();
	if (!g_raid3_can_destroy(sc)) {
		g_topology_unlock();
		return (0);
	}
	if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_WAIT) != 0) {
		/* Someone is waiting in g_raid3_destroy(); let them finish. */
		g_topology_unlock();
		G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__,
		    &sc->sc_worker);
		wakeup(&sc->sc_worker);
		sc->sc_worker = NULL;
	} else {
		/* No waiter: tear everything down ourselves. */
		g_raid3_destroy_device(sc);
		g_topology_unlock();
		free(sc->sc_disks, M_RAID3);
		free(sc, M_RAID3);
	}
	return (1);
}
1701
1702/*
1703 * Worker thread.
1704 */
1705static void
1706g_raid3_worker(void *arg)
1707{
1708	struct g_raid3_softc *sc;
1709	struct g_raid3_disk *disk;
1710	struct g_raid3_disk_sync *sync;
1711	struct g_raid3_event *ep;
1712	struct bio *bp;
1713	u_int nreqs;
1714
1715	sc = arg;
1716	mtx_lock_spin(&sched_lock);
1717	sched_prio(curthread, PRIBIO);
1718	mtx_unlock_spin(&sched_lock);
1719
1720	nreqs = 0;
1721	for (;;) {
1722		G_RAID3_DEBUG(5, "%s: Let's see...", __func__);
1723		/*
1724		 * First take a look at events.
1725		 * This is important to handle events before any I/O requests.
1726		 */
1727		ep = g_raid3_event_get(sc);
1728		if (ep != NULL && g_topology_try_lock()) {
1729			g_raid3_event_remove(sc, ep);
1730			if ((ep->e_flags & G_RAID3_EVENT_DEVICE) != 0) {
1731				/* Update only device status. */
1732				G_RAID3_DEBUG(3,
1733				    "Running event for device %s.",
1734				    sc->sc_name);
1735				ep->e_error = 0;
1736				g_raid3_update_device(sc, 1);
1737			} else {
1738				/* Update disk status. */
1739				G_RAID3_DEBUG(3, "Running event for disk %s.",
1740				     g_raid3_get_diskname(ep->e_disk));
1741				ep->e_error = g_raid3_update_disk(ep->e_disk,
1742				    ep->e_state);
1743				if (ep->e_error == 0)
1744					g_raid3_update_device(sc, 0);
1745			}
1746			g_topology_unlock();
1747			if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0) {
1748				KASSERT(ep->e_error == 0,
1749				    ("Error cannot be handled."));
1750				g_raid3_event_free(ep);
1751			} else {
1752				ep->e_flags |= G_RAID3_EVENT_DONE;
1753				G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__,
1754				    ep);
1755				mtx_lock(&sc->sc_events_mtx);
1756				wakeup(ep);
1757				mtx_unlock(&sc->sc_events_mtx);
1758			}
1759			if ((sc->sc_flags &
1760			    G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
1761				if (g_raid3_try_destroy(sc))
1762					kthread_exit(0);
1763			}
1764			G_RAID3_DEBUG(5, "%s: I'm here 1.", __func__);
1765			continue;
1766		}
1767		/*
1768		 * Now I/O requests.
1769		 */
1770		/* Get first request from the queue. */
1771		mtx_lock(&sc->sc_queue_mtx);
1772		bp = bioq_first(&sc->sc_queue);
1773		if (bp == NULL) {
1774			if (ep != NULL) {
1775				/*
1776				 * No I/O requests and topology lock was
1777				 * already held? Try again.
1778				 */
1779				mtx_unlock(&sc->sc_queue_mtx);
1780				tsleep(ep, PRIBIO, "r3:top1", hz / 5);
1781				continue;
1782			}
1783			if ((sc->sc_flags &
1784			    G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
1785				mtx_unlock(&sc->sc_queue_mtx);
1786				if (g_raid3_try_destroy(sc))
1787					kthread_exit(0);
1788				mtx_lock(&sc->sc_queue_mtx);
1789			}
1790		}
1791		if (sc->sc_syncdisk != NULL &&
1792		    (bp == NULL || nreqs > g_raid3_reqs_per_sync)) {
1793			mtx_unlock(&sc->sc_queue_mtx);
1794			/*
1795			 * It is time for synchronization...
1796			 */
1797			nreqs = 0;
1798			disk = sc->sc_syncdisk;
1799			sync = &disk->d_sync;
1800			if (sync->ds_offset <
1801			    sc->sc_mediasize / (sc->sc_ndisks - 1) &&
1802			    sync->ds_offset == sync->ds_offset_done) {
1803				if (sync->ds_resync != -1) {
1804					sync->ds_offset = sync->ds_resync;
1805					sync->ds_offset_done = sync->ds_resync;
1806					sync->ds_resync = -1;
1807				}
1808				g_raid3_sync_one(sc);
1809			}
1810			G_RAID3_DEBUG(5, "%s: I'm here 2.", __func__);
1811			goto sleep;
1812		}
1813		if (bp == NULL) {
1814			if (g_raid3_check_idle(sc)) {
1815				u_int idletime;
1816
1817				idletime = g_raid3_idletime;
1818				if (idletime == 0)
1819					idletime = 1;
1820				idletime *= hz;
1821				if (msleep(sc, &sc->sc_queue_mtx, PRIBIO | PDROP,
1822				    "r3:w1", idletime) == EWOULDBLOCK) {
1823					G_RAID3_DEBUG(5, "%s: I'm here 3.",
1824					    __func__);
1825					/*
1826					 * No I/O requests in 'idletime'
1827					 * seconds, so mark components as clean.
1828					 */
1829					g_raid3_idle(sc);
1830				}
1831				G_RAID3_DEBUG(5, "%s: I'm here 4.", __func__);
1832			} else {
1833				MSLEEP(sc, &sc->sc_queue_mtx, PRIBIO | PDROP,
1834				    "r3:w2", 0);
1835				G_RAID3_DEBUG(5, "%s: I'm here 5.", __func__);
1836			}
1837			continue;
1838		}
1839		nreqs++;
1840		bioq_remove(&sc->sc_queue, bp);
1841		mtx_unlock(&sc->sc_queue_mtx);
1842
1843		if ((bp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR) != 0) {
1844			g_raid3_regular_request(bp);
1845		} else if ((bp->bio_cflags & G_RAID3_BIO_CFLAG_SYNC) != 0) {
1846			u_int timeout, sps;
1847
1848			g_raid3_sync_request(bp);
1849sleep:
1850			sps = atomic_load_acq_int(&g_raid3_syncs_per_sec);
1851			if (sps == 0) {
1852				G_RAID3_DEBUG(5, "%s: I'm here 6.", __func__);
1853				continue;
1854			}
1855			if (ep != NULL) {
1856				/*
1857				 * We have some pending events, don't sleep now.
1858				 */
1859				G_RAID3_DEBUG(5, "%s: I'm here 7.", __func__);
1860				tsleep(ep, PRIBIO, "r3:top2", hz / 5);
1861				continue;
1862			}
1863			mtx_lock(&sc->sc_queue_mtx);
1864			if (bioq_first(&sc->sc_queue) != NULL) {
1865				mtx_unlock(&sc->sc_queue_mtx);
1866				G_RAID3_DEBUG(5, "%s: I'm here 8.", __func__);
1867				continue;
1868			}
1869			timeout = hz / sps;
1870			if (timeout == 0)
1871				timeout = 1;
1872			MSLEEP(sc, &sc->sc_queue_mtx, PRIBIO | PDROP, "r3:w2",
1873			    timeout);
1874		} else {
1875			if (g_raid3_register_request(bp) != 0) {
1876				mtx_lock(&sc->sc_queue_mtx);
1877				bioq_insert_tail(&sc->sc_queue, bp);
1878				MSLEEP(&sc->sc_queue, &sc->sc_queue_mtx,
1879				    PRIBIO | PDROP, "r3:lowmem", hz / 10);
1880			}
1881		}
1882		G_RAID3_DEBUG(5, "%s: I'm here 9.", __func__);
1883	}
1884}
1885
1886/*
1887 * Open disk's consumer if needed.
1888 */
1889static void
1890g_raid3_update_access(struct g_raid3_disk *disk)
1891{
1892	struct g_provider *pp;
1893
1894	g_topology_assert();
1895
1896	pp = disk->d_softc->sc_provider;
1897	if (pp == NULL)
1898		return;
1899	if (pp->acw > 0) {
1900		if ((disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) == 0) {
1901			G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.",
1902			    g_raid3_get_diskname(disk), disk->d_softc->sc_name);
1903			disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
1904		}
1905	} else if (pp->acw == 0) {
1906		if ((disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) != 0) {
1907			G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.",
1908			    g_raid3_get_diskname(disk), disk->d_softc->sc_name);
1909			disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
1910		}
1911	}
1912}
1913
/*
 * Start synchronization of the first disk found in SYNCHRONIZING state:
 * create and open a dedicated sync consumer attached to our own
 * provider, allocate the staging buffer and record the disk as the
 * device's sync disk.  No-op if no disk needs synchronization.
 */
static void
g_raid3_sync_start(struct g_raid3_softc *sc)
{
	struct g_raid3_disk *disk;
	int error;
	u_int n;

	g_topology_assert();

	KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED,
	    ("Device not in DEGRADED state (%s, %u).", sc->sc_name,
	    sc->sc_state));
	KASSERT(sc->sc_syncdisk == NULL, ("Syncdisk is not NULL (%s, %u).",
	    sc->sc_name, sc->sc_state));
	disk = NULL;
	for (n = 0; n < sc->sc_ndisks; n++) {
		if (sc->sc_disks[n].d_state != G_RAID3_DISK_STATE_SYNCHRONIZING)
			continue;
		disk = &sc->sc_disks[n];
		break;
	}
	if (disk == NULL)
		return;

	G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s.", sc->sc_name,
	    g_raid3_get_diskname(disk));
	disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
	KASSERT(disk->d_sync.ds_consumer == NULL,
	    ("Sync consumer already exists (device=%s, disk=%s).",
	    sc->sc_name, g_raid3_get_diskname(disk)));
	/* The sync consumer reads from our own RAID3 provider. */
	disk->d_sync.ds_consumer = g_new_consumer(sc->sc_sync.ds_geom);
	disk->d_sync.ds_consumer->private = disk;
	disk->d_sync.ds_consumer->index = 0;
	error = g_attach(disk->d_sync.ds_consumer, disk->d_softc->sc_provider);
	KASSERT(error == 0, ("Cannot attach to %s (error=%d).",
	    disk->d_softc->sc_name, error));
	error = g_access(disk->d_sync.ds_consumer, 1, 0, 0);
	KASSERT(error == 0, ("Cannot open %s (error=%d).",
	    disk->d_softc->sc_name, error));
	/* Staging buffer reused by every synchronization read. */
	disk->d_sync.ds_data = malloc(MAXPHYS, M_RAID3, M_WAITOK);
	sc->sc_syncdisk = disk;
}
1956
1957/*
1958 * Stop synchronization process.
1959 * type: 0 - synchronization finished
1960 *       1 - synchronization stopped
1961 */
1962static void
1963g_raid3_sync_stop(struct g_raid3_softc *sc, int type)
1964{
1965	struct g_raid3_disk *disk;
1966
1967	g_topology_assert();
1968	KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED,
1969	    ("Device not in DEGRADED state (%s, %u).", sc->sc_name,
1970	    sc->sc_state));
1971	disk = sc->sc_syncdisk;
1972	sc->sc_syncdisk = NULL;
1973	KASSERT(disk != NULL, ("No disk was synchronized (%s).", sc->sc_name));
1974	KASSERT(disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
1975	    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
1976	    g_raid3_disk_state2str(disk->d_state)));
1977	if (disk->d_sync.ds_consumer == NULL)
1978		return;
1979
1980	if (type == 0) {
1981		G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s finished.",
1982		    disk->d_softc->sc_name, g_raid3_get_diskname(disk));
1983	} else /* if (type == 1) */ {
1984		G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s stopped.",
1985		    disk->d_softc->sc_name, g_raid3_get_diskname(disk));
1986	}
1987	g_raid3_kill_consumer(disk->d_softc, disk->d_sync.ds_consumer);
1988	free(disk->d_sync.ds_data, M_RAID3);
1989	disk->d_sync.ds_consumer = NULL;
1990	disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
1991}
1992
1993static void
1994g_raid3_launch_provider(struct g_raid3_softc *sc)
1995{
1996	struct g_provider *pp;
1997
1998	g_topology_assert();
1999
2000	pp = g_new_providerf(sc->sc_geom, "raid3/%s", sc->sc_name);
2001	pp->mediasize = sc->sc_mediasize;
2002	pp->sectorsize = sc->sc_sectorsize;
2003	sc->sc_provider = pp;
2004	g_error_provider(pp, 0);
2005	G_RAID3_DEBUG(0, "Device %s: provider %s launched.", sc->sc_name,
2006	    pp->name);
2007	if (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED)
2008		g_raid3_sync_start(sc);
2009}
2010
2011static void
2012g_raid3_destroy_provider(struct g_raid3_softc *sc)
2013{
2014	struct bio *bp;
2015
2016	g_topology_assert();
2017	KASSERT(sc->sc_provider != NULL, ("NULL provider (device=%s).",
2018	    sc->sc_name));
2019
2020	g_error_provider(sc->sc_provider, ENXIO);
2021	mtx_lock(&sc->sc_queue_mtx);
2022	while ((bp = bioq_first(&sc->sc_queue)) != NULL) {
2023		bioq_remove(&sc->sc_queue, bp);
2024		g_io_deliver(bp, ENXIO);
2025	}
2026	mtx_unlock(&sc->sc_queue_mtx);
2027	G_RAID3_DEBUG(0, "Device %s: provider %s destroyed.", sc->sc_name,
2028	    sc->sc_provider->name);
2029	sc->sc_provider->flags |= G_PF_WITHER;
2030	g_orphan_provider(sc->sc_provider, ENXIO);
2031	sc->sc_provider = NULL;
2032	if (sc->sc_syncdisk != NULL)
2033		g_raid3_sync_stop(sc, 1);
2034}
2035
2036static void
2037g_raid3_go(void *arg)
2038{
2039	struct g_raid3_softc *sc;
2040
2041	sc = arg;
2042	G_RAID3_DEBUG(0, "Force device %s start due to timeout.", sc->sc_name);
2043	g_raid3_event_send(sc, 0,
2044	    G_RAID3_EVENT_DONTWAIT | G_RAID3_EVENT_DEVICE);
2045}
2046
/*
 * Decide the target state for a newly arrived disk by comparing its
 * stored syncid with the device's.  Returns the chosen
 * G_RAID3_DISK_STATE_* value; as a side effect a disk that is fresher
 * than the running device is destroyed and STATE_NONE is returned.
 */
static u_int
g_raid3_determine_state(struct g_raid3_disk *disk)
{
	struct g_raid3_softc *sc;
	u_int state;

	sc = disk->d_softc;
	if (sc->sc_syncid == disk->d_sync.ds_syncid) {
		if ((disk->d_flags &
		    G_RAID3_DISK_FLAG_SYNCHRONIZING) == 0) {
			/* Disk does not need synchronization. */
			state = G_RAID3_DISK_STATE_ACTIVE;
		} else {
			if ((sc->sc_flags &
			     G_RAID3_DEVICE_FLAG_NOAUTOSYNC) == 0  ||
			    (disk->d_flags &
			     G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) {
				/*
				 * We can start synchronization from
				 * the stored offset.
				 */
				state = G_RAID3_DISK_STATE_SYNCHRONIZING;
			} else {
				state = G_RAID3_DISK_STATE_STALE;
			}
		}
	} else if (disk->d_sync.ds_syncid < sc->sc_syncid) {
		/*
		 * Reset all synchronization data for this disk,
		 * because if it even was synchronized, it was
		 * synchronized to disks with different syncid.
		 */
		disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING;
		disk->d_sync.ds_offset = 0;
		disk->d_sync.ds_offset_done = 0;
		disk->d_sync.ds_syncid = sc->sc_syncid;
		if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) == 0 ||
		    (disk->d_flags & G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) {
			state = G_RAID3_DISK_STATE_SYNCHRONIZING;
		} else {
			state = G_RAID3_DISK_STATE_STALE;
		}
	} else /* if (sc->sc_syncid < disk->d_sync.ds_syncid) */ {
		/*
		 * Not good, NOT GOOD!
		 * It means that device was started on stale disks
		 * and more fresh disk just arrive.
		 * If there were writes, device is fucked up, sorry.
		 * I think the best choice here is don't touch
		 * this disk and inform the user laudly.
		 */
		G_RAID3_DEBUG(0, "Device %s was started before the freshest "
		    "disk (%s) arrives!! It will not be connected to the "
		    "running device.", sc->sc_name,
		    g_raid3_get_diskname(disk));
		g_raid3_destroy_disk(disk);
		state = G_RAID3_DISK_STATE_NONE;
		/* Return immediately, because disk was destroyed. */
		return (state);
	}
	G_RAID3_DEBUG(3, "State for %s disk: %s.",
	    g_raid3_get_diskname(disk), g_raid3_disk_state2str(state));
	return (state);
}
2111
2112/*
2113 * Update device state.
2114 */
2115static void
2116g_raid3_update_device(struct g_raid3_softc *sc, boolean_t force)
2117{
2118	struct g_raid3_disk *disk;
2119	u_int state;
2120
2121	g_topology_assert();
2122
2123	switch (sc->sc_state) {
2124	case G_RAID3_DEVICE_STATE_STARTING:
2125	    {
2126		u_int n, ndirty, ndisks, genid, syncid;
2127
2128		KASSERT(sc->sc_provider == NULL,
2129		    ("Non-NULL provider in STARTING state (%s).", sc->sc_name));
2130		/*
2131		 * Are we ready? We are, if all disks are connected or
2132		 * one disk is missing and 'force' is true.
2133		 */
2134		if (g_raid3_ndisks(sc, -1) + force == sc->sc_ndisks) {
2135			if (!force)
2136				callout_drain(&sc->sc_callout);
2137		} else {
2138			if (force) {
2139				/*
2140				 * Timeout expired, so destroy device.
2141				 */
2142				sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
2143			}
2144			return;
2145		}
2146
2147		/*
2148		 * Find the biggest genid.
2149		 */
2150		genid = 0;
2151		for (n = 0; n < sc->sc_ndisks; n++) {
2152			disk = &sc->sc_disks[n];
2153			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
2154				continue;
2155			if (disk->d_genid > genid)
2156				genid = disk->d_genid;
2157		}
2158		sc->sc_genid = genid;
2159		/*
2160		 * Remove all disks without the biggest genid.
2161		 */
2162		for (n = 0; n < sc->sc_ndisks; n++) {
2163			disk = &sc->sc_disks[n];
2164			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
2165				continue;
2166			if (disk->d_genid < genid) {
2167				G_RAID3_DEBUG(0,
2168				    "Component %s (device %s) broken, skipping.",
2169				    g_raid3_get_diskname(disk), sc->sc_name);
2170				g_raid3_destroy_disk(disk);
2171			}
2172		}
2173
2174		/*
2175		 * There must be at least 'sc->sc_ndisks - 1' components
2176		 * with the same syncid and without SYNCHRONIZING flag.
2177		 */
2178
2179		/*
2180		 * Find the biggest syncid, number of valid components and
2181		 * number of dirty components.
2182		 */
2183		ndirty = ndisks = syncid = 0;
2184		for (n = 0; n < sc->sc_ndisks; n++) {
2185			disk = &sc->sc_disks[n];
2186			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
2187				continue;
2188			if ((disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) != 0)
2189				ndirty++;
2190			if (disk->d_sync.ds_syncid > syncid) {
2191				syncid = disk->d_sync.ds_syncid;
2192				ndisks = 0;
2193			} else if (disk->d_sync.ds_syncid < syncid) {
2194				continue;
2195			}
2196			if ((disk->d_flags &
2197			    G_RAID3_DISK_FLAG_SYNCHRONIZING) != 0) {
2198				continue;
2199			}
2200			ndisks++;
2201		}
2202		/*
2203		 * Do we have enough valid components?
2204		 */
2205		if (ndisks + 1 < sc->sc_ndisks) {
2206			G_RAID3_DEBUG(0,
2207			    "Device %s is broken, too few valid components.",
2208			    sc->sc_name);
2209			sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
2210			return;
2211		}
2212		/*
2213		 * If there is one DIRTY component and all disks are present,
2214		 * mark it for synchronization. If there is more than one DIRTY
2215		 * component, mark parity component for synchronization.
2216		 */
2217		if (ndisks == sc->sc_ndisks && ndirty == 1) {
2218			for (n = 0; n < sc->sc_ndisks; n++) {
2219				disk = &sc->sc_disks[n];
2220				if ((disk->d_flags &
2221				    G_RAID3_DISK_FLAG_DIRTY) == 0) {
2222					continue;
2223				}
2224				disk->d_flags |=
2225				    G_RAID3_DISK_FLAG_SYNCHRONIZING;
2226			}
2227		} else if (ndisks == sc->sc_ndisks && ndirty > 1) {
2228			disk = &sc->sc_disks[sc->sc_ndisks - 1];
2229			disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING;
2230		}
2231
2232		sc->sc_syncid = syncid;
2233		if (force) {
2234			/* Remember to bump syncid on first write. */
2235			sc->sc_bump_id |= G_RAID3_BUMP_SYNCID;
2236		}
2237		if (ndisks == sc->sc_ndisks)
2238			state = G_RAID3_DEVICE_STATE_COMPLETE;
2239		else /* if (ndisks == sc->sc_ndisks - 1) */
2240			state = G_RAID3_DEVICE_STATE_DEGRADED;
2241		G_RAID3_DEBUG(1, "Device %s state changed from %s to %s.",
2242		    sc->sc_name, g_raid3_device_state2str(sc->sc_state),
2243		    g_raid3_device_state2str(state));
2244		sc->sc_state = state;
2245		for (n = 0; n < sc->sc_ndisks; n++) {
2246			disk = &sc->sc_disks[n];
2247			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
2248				continue;
2249			state = g_raid3_determine_state(disk);
2250			g_raid3_event_send(disk, state, G_RAID3_EVENT_DONTWAIT);
2251			if (state == G_RAID3_DISK_STATE_STALE)
2252				sc->sc_bump_id |= G_RAID3_BUMP_SYNCID;
2253		}
2254		break;
2255	    }
2256	case G_RAID3_DEVICE_STATE_DEGRADED:
2257		/*
2258		 * Genid need to be bumped immediately, so do it here.
2259		 */
2260		if ((sc->sc_bump_id & G_RAID3_BUMP_GENID) != 0) {
2261			sc->sc_bump_id &= ~G_RAID3_BUMP_GENID;
2262			g_raid3_bump_genid(sc);
2263		}
2264
2265		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NEW) > 0)
2266			return;
2267		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) <
2268		    sc->sc_ndisks - 1) {
2269			if (sc->sc_provider != NULL)
2270				g_raid3_destroy_provider(sc);
2271			sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
2272			return;
2273		}
2274		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) ==
2275		    sc->sc_ndisks) {
2276			state = G_RAID3_DEVICE_STATE_COMPLETE;
2277			G_RAID3_DEBUG(1,
2278			    "Device %s state changed from %s to %s.",
2279			    sc->sc_name, g_raid3_device_state2str(sc->sc_state),
2280			    g_raid3_device_state2str(state));
2281			sc->sc_state = state;
2282		}
2283		if (sc->sc_provider == NULL)
2284			g_raid3_launch_provider(sc);
2285		break;
2286	case G_RAID3_DEVICE_STATE_COMPLETE:
2287		/*
2288		 * Genid need to be bumped immediately, so do it here.
2289		 */
2290		if ((sc->sc_bump_id & G_RAID3_BUMP_GENID) != 0) {
2291			sc->sc_bump_id &= ~G_RAID3_BUMP_GENID;
2292			g_raid3_bump_genid(sc);
2293		}
2294
2295		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NEW) > 0)
2296			return;
2297		KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) >=
2298		    sc->sc_ndisks - 1,
2299		    ("Too few ACTIVE components in COMPLETE state (device %s).",
2300		    sc->sc_name));
2301		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) ==
2302		    sc->sc_ndisks - 1) {
2303			state = G_RAID3_DEVICE_STATE_DEGRADED;
2304			G_RAID3_DEBUG(1,
2305			    "Device %s state changed from %s to %s.",
2306			    sc->sc_name, g_raid3_device_state2str(sc->sc_state),
2307			    g_raid3_device_state2str(state));
2308			sc->sc_state = state;
2309		}
2310		if (sc->sc_provider == NULL)
2311			g_raid3_launch_provider(sc);
2312		break;
2313	default:
2314		KASSERT(1 == 0, ("Wrong device state (%s, %s).", sc->sc_name,
2315		    g_raid3_device_state2str(sc->sc_state)));
2316		break;
2317	}
2318}
2319
2320/*
2321 * Update disk state and device state if needed.
2322 */
2323#define	DISK_STATE_CHANGED()	G_RAID3_DEBUG(1,			\
2324	"Disk %s state changed from %s to %s (device %s).",		\
2325	g_raid3_get_diskname(disk),					\
2326	g_raid3_disk_state2str(disk->d_state),				\
2327	g_raid3_disk_state2str(state), sc->sc_name)
2328static int
2329g_raid3_update_disk(struct g_raid3_disk *disk, u_int state)
2330{
2331	struct g_raid3_softc *sc;
2332
2333	g_topology_assert();
2334
2335	sc = disk->d_softc;
2336again:
2337	G_RAID3_DEBUG(3, "Changing disk %s state from %s to %s.",
2338	    g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state),
2339	    g_raid3_disk_state2str(state));
2340	switch (state) {
2341	case G_RAID3_DISK_STATE_NEW:
2342		/*
2343		 * Possible scenarios:
2344		 * 1. New disk arrive.
2345		 */
2346		/* Previous state should be NONE. */
2347		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NONE,
2348		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
2349		    g_raid3_disk_state2str(disk->d_state)));
2350		DISK_STATE_CHANGED();
2351
2352		disk->d_state = state;
2353		G_RAID3_DEBUG(0, "Device %s: provider %s detected.",
2354		    sc->sc_name, g_raid3_get_diskname(disk));
2355		if (sc->sc_state == G_RAID3_DEVICE_STATE_STARTING)
2356			break;
2357		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
2358		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
2359		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
2360		    g_raid3_device_state2str(sc->sc_state),
2361		    g_raid3_get_diskname(disk),
2362		    g_raid3_disk_state2str(disk->d_state)));
2363		state = g_raid3_determine_state(disk);
2364		if (state != G_RAID3_DISK_STATE_NONE)
2365			goto again;
2366		break;
2367	case G_RAID3_DISK_STATE_ACTIVE:
2368		/*
2369		 * Possible scenarios:
2370		 * 1. New disk does not need synchronization.
2371		 * 2. Synchronization process finished successfully.
2372		 */
2373		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
2374		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
2375		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
2376		    g_raid3_device_state2str(sc->sc_state),
2377		    g_raid3_get_diskname(disk),
2378		    g_raid3_disk_state2str(disk->d_state)));
2379		/* Previous state should be NEW or SYNCHRONIZING. */
2380		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW ||
2381		    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
2382		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
2383		    g_raid3_disk_state2str(disk->d_state)));
2384		DISK_STATE_CHANGED();
2385
2386		if (disk->d_state == G_RAID3_DISK_STATE_NEW)
2387			disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
2388		else if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
2389			disk->d_flags &= ~G_RAID3_DISK_FLAG_SYNCHRONIZING;
2390			disk->d_flags &= ~G_RAID3_DISK_FLAG_FORCE_SYNC;
2391			g_raid3_sync_stop(sc, 0);
2392		}
2393		disk->d_state = state;
2394		disk->d_sync.ds_offset = 0;
2395		disk->d_sync.ds_offset_done = 0;
2396		g_raid3_update_access(disk);
2397		g_raid3_update_metadata(disk);
2398		G_RAID3_DEBUG(0, "Device %s: provider %s activated.",
2399		    sc->sc_name, g_raid3_get_diskname(disk));
2400		break;
2401	case G_RAID3_DISK_STATE_STALE:
2402		/*
2403		 * Possible scenarios:
2404		 * 1. Stale disk was connected.
2405		 */
2406		/* Previous state should be NEW. */
2407		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
2408		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
2409		    g_raid3_disk_state2str(disk->d_state)));
2410		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
2411		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
2412		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
2413		    g_raid3_device_state2str(sc->sc_state),
2414		    g_raid3_get_diskname(disk),
2415		    g_raid3_disk_state2str(disk->d_state)));
2416		/*
2417		 * STALE state is only possible if device is marked
2418		 * NOAUTOSYNC.
2419		 */
2420		KASSERT((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) != 0,
2421		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
2422		    g_raid3_device_state2str(sc->sc_state),
2423		    g_raid3_get_diskname(disk),
2424		    g_raid3_disk_state2str(disk->d_state)));
2425		DISK_STATE_CHANGED();
2426
2427		disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
2428		disk->d_state = state;
2429		g_raid3_update_metadata(disk);
2430		G_RAID3_DEBUG(0, "Device %s: provider %s is stale.",
2431		    sc->sc_name, g_raid3_get_diskname(disk));
2432		break;
2433	case G_RAID3_DISK_STATE_SYNCHRONIZING:
2434		/*
2435		 * Possible scenarios:
2436		 * 1. Disk which needs synchronization was connected.
2437		 */
2438		/* Previous state should be NEW. */
2439		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
2440		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
2441		    g_raid3_disk_state2str(disk->d_state)));
2442		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
2443		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
2444		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
2445		    g_raid3_device_state2str(sc->sc_state),
2446		    g_raid3_get_diskname(disk),
2447		    g_raid3_disk_state2str(disk->d_state)));
2448		DISK_STATE_CHANGED();
2449
2450		if (disk->d_state == G_RAID3_DISK_STATE_NEW)
2451			disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
2452		disk->d_state = state;
2453		if (sc->sc_provider != NULL) {
2454			g_raid3_sync_start(sc);
2455			g_raid3_update_metadata(disk);
2456		}
2457		break;
2458	case G_RAID3_DISK_STATE_DISCONNECTED:
2459		/*
2460		 * Possible scenarios:
2461		 * 1. Device wasn't running yet, but disk disappear.
2462		 * 2. Disk was active and disapppear.
2463		 * 3. Disk disappear during synchronization process.
2464		 */
2465		if (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
2466		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
2467			/*
2468			 * Previous state should be ACTIVE, STALE or
2469			 * SYNCHRONIZING.
2470			 */
2471			KASSERT(disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
2472			    disk->d_state == G_RAID3_DISK_STATE_STALE ||
2473			    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
2474			    ("Wrong disk state (%s, %s).",
2475			    g_raid3_get_diskname(disk),
2476			    g_raid3_disk_state2str(disk->d_state)));
2477		} else if (sc->sc_state == G_RAID3_DEVICE_STATE_STARTING) {
2478			/* Previous state should be NEW. */
2479			KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
2480			    ("Wrong disk state (%s, %s).",
2481			    g_raid3_get_diskname(disk),
2482			    g_raid3_disk_state2str(disk->d_state)));
2483			/*
2484			 * Reset bumping syncid if disk disappeared in STARTING
2485			 * state.
2486			 */
2487			if ((sc->sc_bump_id & G_RAID3_BUMP_SYNCID) != 0)
2488				sc->sc_bump_id &= ~G_RAID3_BUMP_SYNCID;
2489#ifdef	INVARIANTS
2490		} else {
2491			KASSERT(1 == 0, ("Wrong device state (%s, %s, %s, %s).",
2492			    sc->sc_name,
2493			    g_raid3_device_state2str(sc->sc_state),
2494			    g_raid3_get_diskname(disk),
2495			    g_raid3_disk_state2str(disk->d_state)));
2496#endif
2497		}
2498		DISK_STATE_CHANGED();
2499		G_RAID3_DEBUG(0, "Device %s: provider %s disconnected.",
2500		    sc->sc_name, g_raid3_get_diskname(disk));
2501
2502		g_raid3_destroy_disk(disk);
2503		break;
2504	default:
2505		KASSERT(1 == 0, ("Unknown state (%u).", state));
2506		break;
2507	}
2508	return (0);
2509}
2510#undef	DISK_STATE_CHANGED
2511
/*
 * Read and decode the RAID3 metadata stored in the last sector of the
 * provider attached to consumer 'cp', filling in '*md'.
 *
 * Returns 0 on success; the I/O or access error on read failure; EINVAL
 * for foreign (bad magic) or too-new metadata; otherwise the decode
 * error (hash mismatch).  Called with the topology lock held — the lock
 * is dropped around the actual read.
 */
int
g_raid3_read_metadata(struct g_consumer *cp, struct g_raid3_metadata *md)
{
	struct g_provider *pp;
	u_char *buf;
	int error;

	g_topology_assert();

	/* Need read access for the duration of the read. */
	error = g_access(cp, 1, 0, 0);
	if (error != 0)
		return (error);
	pp = cp->provider;
	g_topology_unlock();
	/* Metadata are stored on last sector. */
	buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize,
	    &error);
	g_topology_lock();
	g_access(cp, -1, 0, 0);
	if (error != 0) {
		G_RAID3_DEBUG(1, "Cannot read metadata from %s (error=%d).",
		    cp->provider->name, error);
		if (buf != NULL)
			g_free(buf);
		return (error);
	}

	/* Decode metadata. */
	error = raid3_metadata_decode(buf, md);
	g_free(buf);
	/*
	 * Magic and version are checked before the decode error,
	 * presumably so non-raid3 providers fail the magic test quietly
	 * and newer-format metadata gets the "too old" message rather
	 * than a hash-mismatch complaint — verify against decoder.
	 */
	if (strcmp(md->md_magic, G_RAID3_MAGIC) != 0)
		return (EINVAL);
	if (md->md_version > G_RAID3_VERSION) {
		G_RAID3_DEBUG(0,
		    "Kernel module is too old to handle metadata from %s.",
		    cp->provider->name);
		return (EINVAL);
	}
	if (error != 0) {
		G_RAID3_DEBUG(1, "MD5 metadata hash mismatch for provider %s.",
		    cp->provider->name);
		return (error);
	}

	return (0);
}
2558
/*
 * Sanity-check metadata read from provider 'pp' against the already
 * configured device 'sc'.  Returns 0 when the component is acceptable,
 * EEXIST when its slot is already occupied, and EINVAL for any geometry
 * or flag mismatch.  Each rejection is logged at debug level 1.
 */
static int
g_raid3_check_metadata(struct g_raid3_softc *sc, struct g_provider *pp,
    struct g_raid3_metadata *md)
{

	/* Component number must fit the configured array. */
	if (md->md_no >= sc->sc_ndisks) {
		G_RAID3_DEBUG(1, "Invalid disk %s number (no=%u), skipping.",
		    pp->name, md->md_no);
		return (EINVAL);
	}
	if (sc->sc_disks[md->md_no].d_state != G_RAID3_DISK_STATE_NODISK) {
		G_RAID3_DEBUG(1, "Disk %s (no=%u) already exists, skipping.",
		    pp->name, md->md_no);
		return (EEXIST);
	}
	if (md->md_all != sc->sc_ndisks) {
		G_RAID3_DEBUG(1,
		    "Invalid '%s' field on disk %s (device %s), skipping.",
		    "md_all", pp->name, sc->sc_name);
		return (EINVAL);
	}
	if (md->md_mediasize != sc->sc_mediasize) {
		G_RAID3_DEBUG(1,
		    "Invalid '%s' field on disk %s (device %s), skipping.",
		    "md_mediasize", pp->name, sc->sc_name);
		return (EINVAL);
	}
	/* Device size must divide evenly among the data components. */
	if ((md->md_mediasize % (sc->sc_ndisks - 1)) != 0) {
		G_RAID3_DEBUG(1,
		    "Invalid '%s' field on disk %s (device %s), skipping.",
		    "md_mediasize", pp->name, sc->sc_name);
		return (EINVAL);
	}
	/* The component must be large enough to hold its data share. */
	if ((sc->sc_mediasize / (sc->sc_ndisks - 1)) > pp->mediasize) {
		G_RAID3_DEBUG(1,
		    "Invalid size of disk %s (device %s), skipping.", pp->name,
		    sc->sc_name);
		return (EINVAL);
	}
	/*
	 * Looks like it requires the device sector size to span at least
	 * one provider sector per data component — TODO confirm the
	 * intended relation between md_sectorsize and pp->sectorsize.
	 */
	if ((md->md_sectorsize / pp->sectorsize) < sc->sc_ndisks - 1) {
		G_RAID3_DEBUG(1,
		    "Invalid '%s' field on disk %s (device %s), skipping.",
		    "md_sectorsize", pp->name, sc->sc_name);
		return (EINVAL);
	}
	if (md->md_sectorsize != sc->sc_sectorsize) {
		G_RAID3_DEBUG(1,
		    "Invalid '%s' field on disk %s (device %s), skipping.",
		    "md_sectorsize", pp->name, sc->sc_name);
		return (EINVAL);
	}
	if ((sc->sc_sectorsize % pp->sectorsize) != 0) {
		G_RAID3_DEBUG(1,
		    "Invalid sector size of disk %s (device %s), skipping.",
		    pp->name, sc->sc_name);
		return (EINVAL);
	}
	if ((md->md_mflags & ~G_RAID3_DEVICE_FLAG_MASK) != 0) {
		G_RAID3_DEBUG(1,
		    "Invalid device flags on disk %s (device %s), skipping.",
		    pp->name, sc->sc_name);
		return (EINVAL);
	}
	if ((md->md_mflags & G_RAID3_DEVICE_FLAG_VERIFY) != 0 &&
	    (md->md_mflags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0) {
		/*
		 * VERIFY and ROUND-ROBIN options are mutally exclusive.
		 */
		G_RAID3_DEBUG(1, "Both VERIFY and ROUND-ROBIN flags exist on "
		    "disk %s (device %s), skipping.", pp->name, sc->sc_name);
		return (EINVAL);
	}
	if ((md->md_dflags & ~G_RAID3_DISK_FLAG_MASK) != 0) {
		G_RAID3_DEBUG(1,
		    "Invalid disk flags on disk %s (device %s), skipping.",
		    pp->name, sc->sc_name);
		return (EINVAL);
	}
	return (0);
}
2639
2640int
2641g_raid3_add_disk(struct g_raid3_softc *sc, struct g_provider *pp,
2642    struct g_raid3_metadata *md)
2643{
2644	struct g_raid3_disk *disk;
2645	int error;
2646
2647	g_topology_assert();
2648	G_RAID3_DEBUG(2, "Adding disk %s.", pp->name);
2649
2650	error = g_raid3_check_metadata(sc, pp, md);
2651	if (error != 0)
2652		return (error);
2653	if (sc->sc_state != G_RAID3_DEVICE_STATE_STARTING &&
2654	    md->md_genid < sc->sc_genid) {
2655		G_RAID3_DEBUG(0, "Component %s (device %s) broken, skipping.",
2656		    pp->name, sc->sc_name);
2657		return (EINVAL);
2658	}
2659	disk = g_raid3_init_disk(sc, pp, md, &error);
2660	if (disk == NULL)
2661		return (error);
2662	error = g_raid3_event_send(disk, G_RAID3_DISK_STATE_NEW,
2663	    G_RAID3_EVENT_WAIT);
2664	if (error != 0)
2665		return (error);
2666	if (md->md_version < G_RAID3_VERSION) {
2667		G_RAID3_DEBUG(0, "Upgrading metadata on %s (v%d->v%d).",
2668		    pp->name, md->md_version, G_RAID3_VERSION);
2669		g_raid3_update_metadata(disk);
2670	}
2671	return (0);
2672}
2673
2674static int
2675g_raid3_access(struct g_provider *pp, int acr, int acw, int ace)
2676{
2677	struct g_raid3_softc *sc;
2678	struct g_raid3_disk *disk;
2679	int dcr, dcw, dce;
2680	u_int n;
2681
2682	g_topology_assert();
2683	G_RAID3_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name, acr,
2684	    acw, ace);
2685
2686	dcr = pp->acr + acr;
2687	dcw = pp->acw + acw;
2688	dce = pp->ace + ace;
2689
2690	sc = pp->geom->softc;
2691	if (sc == NULL ||
2692	    g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) < sc->sc_ndisks - 1 ||
2693	    (sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
2694		if (acr <= 0 && acw <= 0 && ace <= 0)
2695			return (0);
2696		else
2697			return (ENXIO);
2698	}
2699	for (n = 0; n < sc->sc_ndisks; n++) {
2700		disk = &sc->sc_disks[n];
2701		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
2702			continue;
2703		/*
2704		 * Mark disk as dirty on open and unmark on close.
2705		 */
2706		if (pp->acw == 0 && dcw > 0) {
2707			G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.",
2708			    g_raid3_get_diskname(disk), sc->sc_name);
2709			disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
2710			g_raid3_update_metadata(disk);
2711		} else if (pp->acw > 0 && dcw == 0) {
2712			G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.",
2713			    g_raid3_get_diskname(disk), sc->sc_name);
2714			disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
2715			g_raid3_update_metadata(disk);
2716		}
2717	}
2718	return (0);
2719}
2720
2721static struct g_geom *
2722g_raid3_create(struct g_class *mp, const struct g_raid3_metadata *md)
2723{
2724	struct g_raid3_softc *sc;
2725	struct g_geom *gp;
2726	int error, timeout;
2727	u_int n;
2728
2729	g_topology_assert();
2730	G_RAID3_DEBUG(1, "Creating device %s (id=%u).", md->md_name, md->md_id);
2731
2732	/* One disk is minimum. */
2733	if (md->md_all < 1)
2734		return (NULL);
2735	/*
2736	 * Action geom.
2737	 */
2738	gp = g_new_geomf(mp, "%s", md->md_name);
2739	sc = malloc(sizeof(*sc), M_RAID3, M_WAITOK | M_ZERO);
2740	sc->sc_disks = malloc(sizeof(struct g_raid3_disk) * md->md_all, M_RAID3,
2741	    M_WAITOK | M_ZERO);
2742	gp->start = g_raid3_start;
2743	gp->orphan = g_raid3_orphan;
2744	gp->access = g_raid3_access;
2745	gp->dumpconf = g_raid3_dumpconf;
2746
2747	sc->sc_id = md->md_id;
2748	sc->sc_mediasize = md->md_mediasize;
2749	sc->sc_sectorsize = md->md_sectorsize;
2750	sc->sc_ndisks = md->md_all;
2751	sc->sc_round_robin = 0;
2752	sc->sc_flags = md->md_mflags;
2753	sc->sc_bump_id = 0;
2754	sc->sc_idle = 0;
2755	for (n = 0; n < sc->sc_ndisks; n++) {
2756		sc->sc_disks[n].d_softc = sc;
2757		sc->sc_disks[n].d_no = n;
2758		sc->sc_disks[n].d_state = G_RAID3_DISK_STATE_NODISK;
2759	}
2760	bioq_init(&sc->sc_queue);
2761	mtx_init(&sc->sc_queue_mtx, "graid3:queue", NULL, MTX_DEF);
2762	TAILQ_INIT(&sc->sc_events);
2763	mtx_init(&sc->sc_events_mtx, "graid3:events", NULL, MTX_DEF);
2764	callout_init(&sc->sc_callout, CALLOUT_MPSAFE);
2765	sc->sc_state = G_RAID3_DEVICE_STATE_STARTING;
2766	gp->softc = sc;
2767	sc->sc_geom = gp;
2768	sc->sc_provider = NULL;
2769	/*
2770	 * Synchronization geom.
2771	 */
2772	gp = g_new_geomf(mp, "%s.sync", md->md_name);
2773	gp->softc = sc;
2774	gp->orphan = g_raid3_orphan;
2775	sc->sc_sync.ds_geom = gp;
2776	sc->sc_zone_64k = uma_zcreate("gr3:64k", 65536, NULL, NULL, NULL, NULL,
2777	    UMA_ALIGN_PTR, 0);
2778	uma_zone_set_max(sc->sc_zone_64k, g_raid3_n64k);
2779	sc->sc_zone_16k = uma_zcreate("gr3:16k", 16384, NULL, NULL, NULL, NULL,
2780	    UMA_ALIGN_PTR, 0);
2781	uma_zone_set_max(sc->sc_zone_64k, g_raid3_n16k);
2782	sc->sc_zone_4k = uma_zcreate("gr3:4k", 4096, NULL, NULL, NULL, NULL,
2783	    UMA_ALIGN_PTR, 0);
2784	uma_zone_set_max(sc->sc_zone_4k, g_raid3_n4k);
2785	error = kthread_create(g_raid3_worker, sc, &sc->sc_worker, 0, 0,
2786	    "g_raid3 %s", md->md_name);
2787	if (error != 0) {
2788		G_RAID3_DEBUG(1, "Cannot create kernel thread for %s.",
2789		    sc->sc_name);
2790		uma_zdestroy(sc->sc_zone_64k);
2791		uma_zdestroy(sc->sc_zone_16k);
2792		uma_zdestroy(sc->sc_zone_4k);
2793		g_destroy_geom(sc->sc_sync.ds_geom);
2794		mtx_destroy(&sc->sc_events_mtx);
2795		mtx_destroy(&sc->sc_queue_mtx);
2796		g_destroy_geom(sc->sc_geom);
2797		free(sc->sc_disks, M_RAID3);
2798		free(sc, M_RAID3);
2799		return (NULL);
2800	}
2801
2802	G_RAID3_DEBUG(0, "Device %s created (id=%u).", sc->sc_name, sc->sc_id);
2803
2804	/*
2805	 * Run timeout.
2806	 */
2807	timeout = atomic_load_acq_int(&g_raid3_timeout);
2808	callout_reset(&sc->sc_callout, timeout * hz, g_raid3_go, sc);
2809	return (sc->sc_geom);
2810}
2811
/*
 * Tear down a RAID3 device.
 *
 * Refuses with EBUSY when the provider is still open unless 'force' is
 * set, in which case destruction proceeds anyway.  Sets the DESTROY and
 * WAIT flags, wakes the worker thread, and polls (0.2 s naps) until the
 * worker has exited before destroying the device and freeing the softc.
 * Called with the topology lock held; the lock is dropped while waiting.
 */
int
g_raid3_destroy(struct g_raid3_softc *sc, boolean_t force)
{
	struct g_provider *pp;

	g_topology_assert();

	if (sc == NULL)
		return (ENXIO);
	pp = sc->sc_provider;
	if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
		if (force) {
			G_RAID3_DEBUG(1, "Device %s is still open, so it "
			    "can't be definitely removed.", pp->name);
		} else {
			G_RAID3_DEBUG(1,
			    "Device %s is still open (r%dw%de%d).", pp->name,
			    pp->acr, pp->acw, pp->ace);
			return (EBUSY);
		}
	}

	sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
	sc->sc_flags |= G_RAID3_DEVICE_FLAG_WAIT;
	g_topology_unlock();
	G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc);
	/* Wake the worker on both channels it may be sleeping on. */
	mtx_lock(&sc->sc_queue_mtx);
	wakeup(sc);
	wakeup(&sc->sc_queue);
	mtx_unlock(&sc->sc_queue_mtx);
	G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, &sc->sc_worker);
	/* The worker clears sc_worker on exit; poll until it does. */
	while (sc->sc_worker != NULL)
		tsleep(&sc->sc_worker, PRIBIO, "r3:destroy", hz / 5);
	G_RAID3_DEBUG(4, "%s: Woken up %p.", __func__, &sc->sc_worker);
	g_topology_lock();
	g_raid3_destroy_device(sc);
	free(sc->sc_disks, M_RAID3);
	free(sc, M_RAID3);
	return (0);
}
2852
2853static void
2854g_raid3_taste_orphan(struct g_consumer *cp)
2855{
2856
2857	KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
2858	    cp->provider->name));
2859}
2860
/*
 * GEOM taste method: probe provider 'pp' for RAID3 metadata via a
 * temporary geom/consumer, then either attach the component to an
 * existing device with a matching name/id or create a new one.  Returns
 * the device's action geom, or NULL when the provider is not ours, is
 * hardcoded to a different provider name, conflicts with an existing
 * device, or cannot be added.
 */
static struct g_geom *
g_raid3_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
{
	struct g_raid3_metadata md;
	struct g_raid3_softc *sc;
	struct g_consumer *cp;
	struct g_geom *gp;
	int error;

	g_topology_assert();
	g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
	G_RAID3_DEBUG(2, "Tasting %s.", pp->name);

	/* Throwaway geom/consumer pair just for reading the metadata. */
	gp = g_new_geomf(mp, "raid3:taste");
	/* This orphan function should be never called. */
	gp->orphan = g_raid3_taste_orphan;
	cp = g_new_consumer(gp);
	g_attach(cp, pp);
	error = g_raid3_read_metadata(cp, &md);
	g_detach(cp);
	g_destroy_consumer(cp);
	g_destroy_geom(gp);
	if (error != 0)
		return (NULL);
	gp = NULL;

	/* Hardcoded provider name in metadata must match, if present. */
	if (md.md_provider[0] != '\0' && strcmp(md.md_provider, pp->name) != 0)
		return (NULL);
	if (g_raid3_debug >= 2)
		raid3_metadata_dump(&md);

	/*
	 * Let's check if device already exists.
	 * (gp is left pointing at the match when the loop breaks,
	 * and is NULL when it runs to completion.)
	 */
	sc = NULL;
	LIST_FOREACH(gp, &mp->geom, geom) {
		sc = gp->softc;
		if (sc == NULL)
			continue;
		if (sc->sc_sync.ds_geom == gp)
			continue;
		if (strcmp(md.md_name, sc->sc_name) != 0)
			continue;
		if (md.md_id != sc->sc_id) {
			/* Same name but different device id - reject. */
			G_RAID3_DEBUG(0, "Device %s already configured.",
			    sc->sc_name);
			return (NULL);
		}
		break;
	}
	if (gp == NULL) {
		gp = g_raid3_create(mp, &md);
		if (gp == NULL) {
			G_RAID3_DEBUG(0, "Cannot create device %s.",
			    md.md_name);
			return (NULL);
		}
		sc = gp->softc;
	}
	G_RAID3_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name);
	error = g_raid3_add_disk(sc, pp, &md);
	if (error != 0) {
		G_RAID3_DEBUG(0, "Cannot add disk %s to %s (error=%d).",
		    pp->name, gp->name, error);
		/* If the device ended up with no disks at all, drop it. */
		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NODISK) ==
		    sc->sc_ndisks) {
			g_raid3_destroy(sc, 1);
		}
		return (NULL);
	}
	return (gp);
}
2933
2934static int
2935g_raid3_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused,
2936    struct g_geom *gp)
2937{
2938
2939	return (g_raid3_destroy(gp->softc, 0));
2940}
2941
/*
 * GEOM dumpconf method: emit the XML fragment describing either a single
 * component (when 'cp' is given) or the device as a whole (when neither
 * 'pp' nor 'cp' is given) into 'sb'.  The synchronization geom is
 * skipped entirely.
 */
static void
g_raid3_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
    struct g_consumer *cp, struct g_provider *pp)
{
	struct g_raid3_softc *sc;

	g_topology_assert();

	sc = gp->softc;
	if (sc == NULL)
		return;
	/* Skip synchronization geom. */
	if (gp == sc->sc_sync.ds_geom)
		return;
	if (pp != NULL) {
		/* Nothing here. */
	} else if (cp != NULL) {
		struct g_raid3_disk *disk;

		disk = cp->private;
		if (disk == NULL)
			return;
		/* The last component holds parity; the rest hold data. */
		sbuf_printf(sb, "%s<Type>", indent);
		if (disk->d_no == sc->sc_ndisks - 1)
			sbuf_printf(sb, "PARITY");
		else
			sbuf_printf(sb, "DATA");
		sbuf_printf(sb, "</Type>\n");
		sbuf_printf(sb, "%s<Number>%u</Number>\n", indent,
		    (u_int)disk->d_no);
		if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
			/* Progress as percentage of the per-disk data size. */
			sbuf_printf(sb, "%s<Synchronized>", indent);
			if (disk->d_sync.ds_offset_done == 0)
				sbuf_printf(sb, "0%%");
			else {
				sbuf_printf(sb, "%u%%",
				    (u_int)((disk->d_sync.ds_offset_done * 100) /
				    (sc->sc_mediasize / (sc->sc_ndisks - 1))));
			}
			sbuf_printf(sb, "</Synchronized>\n");
		}
		sbuf_printf(sb, "%s<SyncID>%u</SyncID>\n", indent,
		    disk->d_sync.ds_syncid);
		sbuf_printf(sb, "%s<GenID>%u</GenID>\n", indent, disk->d_genid);
		sbuf_printf(sb, "%s<Flags>", indent);
		if (disk->d_flags == 0)
			sbuf_printf(sb, "NONE");
		else {
			int first = 1;

/* Emit 'name' for each set disk flag, comma-separating after the first. */
#define	ADD_FLAG(flag, name)	do {					\
	if ((disk->d_flags & (flag)) != 0) {				\
		if (!first)						\
			sbuf_printf(sb, ", ");				\
		else							\
			first = 0;					\
		sbuf_printf(sb, name);					\
	}								\
} while (0)
			ADD_FLAG(G_RAID3_DISK_FLAG_DIRTY, "DIRTY");
			ADD_FLAG(G_RAID3_DISK_FLAG_HARDCODED, "HARDCODED");
			ADD_FLAG(G_RAID3_DISK_FLAG_SYNCHRONIZING,
			    "SYNCHRONIZING");
			ADD_FLAG(G_RAID3_DISK_FLAG_FORCE_SYNC, "FORCE_SYNC");
#undef	ADD_FLAG
		}
		sbuf_printf(sb, "</Flags>\n");
		sbuf_printf(sb, "%s<State>%s</State>\n", indent,
		    g_raid3_disk_state2str(disk->d_state));
	} else {
		sbuf_printf(sb, "%s<ID>%u</ID>\n", indent, (u_int)sc->sc_id);
		sbuf_printf(sb, "%s<SyncID>%u</SyncID>\n", indent, sc->sc_syncid);
		sbuf_printf(sb, "%s<GenID>%u</GenID>\n", indent, sc->sc_genid);
		sbuf_printf(sb, "%s<Flags>", indent);
		if (sc->sc_flags == 0)
			sbuf_printf(sb, "NONE");
		else {
			int first = 1;

/* Emit 'name' for each set device flag, comma-separating after the first. */
#define	ADD_FLAG(flag, name)	do {					\
	if ((sc->sc_flags & (flag)) != 0) {				\
		if (!first)						\
			sbuf_printf(sb, ", ");				\
		else							\
			first = 0;					\
		sbuf_printf(sb, name);					\
	}								\
} while (0)
			ADD_FLAG(G_RAID3_DEVICE_FLAG_NOAUTOSYNC, "NOAUTOSYNC");
			ADD_FLAG(G_RAID3_DEVICE_FLAG_ROUND_ROBIN,
			    "ROUND-ROBIN");
			ADD_FLAG(G_RAID3_DEVICE_FLAG_VERIFY, "VERIFY");
#undef	ADD_FLAG
		}
		sbuf_printf(sb, "</Flags>\n");
		sbuf_printf(sb, "%s<Components>%u</Components>\n", indent,
		    sc->sc_ndisks);
		sbuf_printf(sb, "%s<State>%s</State>\n", indent,
		    g_raid3_device_state2str(sc->sc_state));
	}
}
3043
3044static void
3045g_raid3_shutdown(void *arg, int howto)
3046{
3047	struct g_class *mp;
3048	struct g_geom *gp, *gp2;
3049
3050	mp = arg;
3051	DROP_GIANT();
3052	g_topology_lock();
3053	LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) {
3054		if (gp->softc == NULL)
3055			continue;
3056		g_raid3_destroy(gp->softc, 1);
3057	}
3058	g_topology_unlock();
3059	PICKUP_GIANT();
3060#if 0
3061	tsleep(&gp, PRIBIO, "r3:shutdown", hz * 20);
3062#endif
3063}
3064
3065static void
3066g_raid3_init(struct g_class *mp)
3067{
3068
3069	g_raid3_ehtag = EVENTHANDLER_REGISTER(shutdown_post_sync,
3070	    g_raid3_shutdown, mp, SHUTDOWN_PRI_FIRST);
3071	if (g_raid3_ehtag == NULL)
3072		G_RAID3_DEBUG(0, "Warning! Cannot register shutdown event.");
3073}
3074
3075static void
3076g_raid3_fini(struct g_class *mp)
3077{
3078
3079	if (g_raid3_ehtag == NULL)
3080		return;
3081	EVENTHANDLER_DEREGISTER(shutdown_post_sync, g_raid3_ehtag);
3082}
3083
/* Register the RAID3 class with the GEOM framework at module load. */
DECLARE_GEOM_CLASS(g_raid3_class, g_raid3);
3085