g_raid.c revision 219974
1/*-
2 * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27#include <sys/cdefs.h>
28__FBSDID("$FreeBSD: head/sys/geom/raid/g_raid.c 219974 2011-03-24 21:31:32Z mav $");
29
30#include <sys/param.h>
31#include <sys/systm.h>
32#include <sys/kernel.h>
33#include <sys/module.h>
34#include <sys/limits.h>
35#include <sys/lock.h>
36#include <sys/mutex.h>
37#include <sys/bio.h>
38#include <sys/sysctl.h>
39#include <sys/malloc.h>
40#include <sys/eventhandler.h>
41#include <vm/uma.h>
42#include <geom/geom.h>
43#include <sys/proc.h>
44#include <sys/kthread.h>
45#include <sys/sched.h>
46#include <geom/raid/g_raid.h>
47#include "g_raid_md_if.h"
48#include "g_raid_tr_if.h"
49
50static MALLOC_DEFINE(M_RAID, "raid_data", "GEOM_RAID Data");
51
52SYSCTL_DECL(_kern_geom);
53SYSCTL_NODE(_kern_geom, OID_AUTO, raid, CTLFLAG_RW, 0, "GEOM_RAID stuff");
54u_int g_raid_aggressive_spare = 0;
55TUNABLE_INT("kern.geom.raid.aggressive_spare", &g_raid_aggressive_spare);
56SYSCTL_UINT(_kern_geom_raid, OID_AUTO, aggressive_spare, CTLFLAG_RW,
57    &g_raid_aggressive_spare, 0, "Use disks without metadata as spare");
58u_int g_raid_debug = 2;
59TUNABLE_INT("kern.geom.raid.debug", &g_raid_debug);
60SYSCTL_UINT(_kern_geom_raid, OID_AUTO, debug, CTLFLAG_RW, &g_raid_debug, 0,
61    "Debug level");
62u_int g_raid_read_err_thresh = 10;
63TUNABLE_INT("kern.geom.raid.read_err_thresh", &g_raid_read_err_thresh);
64SYSCTL_UINT(_kern_geom_raid, OID_AUTO, read_err_thresh, CTLFLAG_RW,
65    &g_raid_read_err_thresh, 0,
66    "Number of read errors equated to disk failure");
67u_int g_raid_start_timeout = 30;
68TUNABLE_INT("kern.geom.raid.start_timeout", &g_raid_start_timeout);
69SYSCTL_UINT(_kern_geom_raid, OID_AUTO, start_timeout, CTLFLAG_RW,
70    &g_raid_start_timeout, 0,
71    "Time to wait for all array components");
72static u_int g_raid_clean_time = 5;
73TUNABLE_INT("kern.geom.raid.clean_time", &g_raid_clean_time);
74SYSCTL_UINT(_kern_geom_raid, OID_AUTO, clean_time, CTLFLAG_RW,
75    &g_raid_clean_time, 0, "Mark volume as clean when idling");
76static u_int g_raid_disconnect_on_failure = 1;
77TUNABLE_INT("kern.geom.raid.disconnect_on_failure",
78    &g_raid_disconnect_on_failure);
79SYSCTL_UINT(_kern_geom_raid, OID_AUTO, disconnect_on_failure, CTLFLAG_RW,
80    &g_raid_disconnect_on_failure, 0, "Disconnect component on I/O failure.");
81static u_int g_raid_name_format = 0;
82TUNABLE_INT("kern.geom.raid.name_format", &g_raid_name_format);
83SYSCTL_UINT(_kern_geom_raid, OID_AUTO, name_format, CTLFLAG_RW,
84    &g_raid_name_format, 0, "Providers name format.");
85static u_int g_raid_idle_threshold = 1000000;
86TUNABLE_INT("kern.geom.raid.idle_threshold", &g_raid_idle_threshold);
87SYSCTL_UINT(_kern_geom_raid, OID_AUTO, idle_threshold, CTLFLAG_RW,
88    &g_raid_idle_threshold, 1000000,
89    "Time in microseconds to consider a volume idle.");
90
91#define	MSLEEP(rv, ident, mtx, priority, wmesg, timeout)	do {	\
92	G_RAID_DEBUG(4, "%s: Sleeping %p.", __func__, (ident));		\
93	rv = msleep((ident), (mtx), (priority), (wmesg), (timeout));	\
94	G_RAID_DEBUG(4, "%s: Woken up %p.", __func__, (ident));		\
95} while (0)
96
97LIST_HEAD(, g_raid_md_class) g_raid_md_classes =
98    LIST_HEAD_INITIALIZER(g_raid_md_classes);
99
100LIST_HEAD(, g_raid_tr_class) g_raid_tr_classes =
101    LIST_HEAD_INITIALIZER(g_raid_tr_classes);
102
103LIST_HEAD(, g_raid_volume) g_raid_volumes =
104    LIST_HEAD_INITIALIZER(g_raid_volumes);
105
106static eventhandler_tag g_raid_pre_sync = NULL;
107static int g_raid_started = 0;
108
109static int g_raid_destroy_geom(struct gctl_req *req, struct g_class *mp,
110    struct g_geom *gp);
111static g_taste_t g_raid_taste;
112static void g_raid_init(struct g_class *mp);
113static void g_raid_fini(struct g_class *mp);
114
115struct g_class g_raid_class = {
116	.name = G_RAID_CLASS_NAME,
117	.version = G_VERSION,
118	.ctlreq = g_raid_ctl,
119	.taste = g_raid_taste,
120	.destroy_geom = g_raid_destroy_geom,
121	.init = g_raid_init,
122	.fini = g_raid_fini
123};
124
125static void g_raid_destroy_provider(struct g_raid_volume *vol);
126static int g_raid_update_disk(struct g_raid_disk *disk, u_int event);
127static int g_raid_update_subdisk(struct g_raid_subdisk *subdisk, u_int event);
128static int g_raid_update_volume(struct g_raid_volume *vol, u_int event);
129static int g_raid_update_node(struct g_raid_softc *sc, u_int event);
130static void g_raid_dumpconf(struct sbuf *sb, const char *indent,
131    struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp);
132static void g_raid_start(struct bio *bp);
133static void g_raid_start_request(struct bio *bp);
134static void g_raid_disk_done(struct bio *bp);
135static void g_raid_poll(struct g_raid_softc *sc);
136
137static const char *
138g_raid_node_event2str(int event)
139{
140
141	switch (event) {
142	case G_RAID_NODE_E_WAKE:
143		return ("WAKE");
144	case G_RAID_NODE_E_START:
145		return ("START");
146	default:
147		return ("INVALID");
148	}
149}
150
151const char *
152g_raid_disk_state2str(int state)
153{
154
155	switch (state) {
156	case G_RAID_DISK_S_NONE:
157		return ("NONE");
158	case G_RAID_DISK_S_OFFLINE:
159		return ("OFFLINE");
160	case G_RAID_DISK_S_FAILED:
161		return ("FAILED");
162	case G_RAID_DISK_S_STALE_FAILED:
163		return ("STALE_FAILED");
164	case G_RAID_DISK_S_SPARE:
165		return ("SPARE");
166	case G_RAID_DISK_S_STALE:
167		return ("STALE");
168	case G_RAID_DISK_S_ACTIVE:
169		return ("ACTIVE");
170	default:
171		return ("INVALID");
172	}
173}
174
175static const char *
176g_raid_disk_event2str(int event)
177{
178
179	switch (event) {
180	case G_RAID_DISK_E_DISCONNECTED:
181		return ("DISCONNECTED");
182	default:
183		return ("INVALID");
184	}
185}
186
187const char *
188g_raid_subdisk_state2str(int state)
189{
190
191	switch (state) {
192	case G_RAID_SUBDISK_S_NONE:
193		return ("NONE");
194	case G_RAID_SUBDISK_S_FAILED:
195		return ("FAILED");
196	case G_RAID_SUBDISK_S_NEW:
197		return ("NEW");
198	case G_RAID_SUBDISK_S_REBUILD:
199		return ("REBUILD");
200	case G_RAID_SUBDISK_S_UNINITIALIZED:
201		return ("UNINITIALIZED");
202	case G_RAID_SUBDISK_S_STALE:
203		return ("STALE");
204	case G_RAID_SUBDISK_S_RESYNC:
205		return ("RESYNC");
206	case G_RAID_SUBDISK_S_ACTIVE:
207		return ("ACTIVE");
208	default:
209		return ("INVALID");
210	}
211}
212
213static const char *
214g_raid_subdisk_event2str(int event)
215{
216
217	switch (event) {
218	case G_RAID_SUBDISK_E_NEW:
219		return ("NEW");
220	case G_RAID_SUBDISK_E_DISCONNECTED:
221		return ("DISCONNECTED");
222	default:
223		return ("INVALID");
224	}
225}
226
227const char *
228g_raid_volume_state2str(int state)
229{
230
231	switch (state) {
232	case G_RAID_VOLUME_S_STARTING:
233		return ("STARTING");
234	case G_RAID_VOLUME_S_BROKEN:
235		return ("BROKEN");
236	case G_RAID_VOLUME_S_DEGRADED:
237		return ("DEGRADED");
238	case G_RAID_VOLUME_S_SUBOPTIMAL:
239		return ("SUBOPTIMAL");
240	case G_RAID_VOLUME_S_OPTIMAL:
241		return ("OPTIMAL");
242	case G_RAID_VOLUME_S_UNSUPPORTED:
243		return ("UNSUPPORTED");
244	case G_RAID_VOLUME_S_STOPPED:
245		return ("STOPPED");
246	default:
247		return ("INVALID");
248	}
249}
250
251static const char *
252g_raid_volume_event2str(int event)
253{
254
255	switch (event) {
256	case G_RAID_VOLUME_E_UP:
257		return ("UP");
258	case G_RAID_VOLUME_E_DOWN:
259		return ("DOWN");
260	case G_RAID_VOLUME_E_START:
261		return ("START");
262	case G_RAID_VOLUME_E_STARTMD:
263		return ("STARTMD");
264	default:
265		return ("INVALID");
266	}
267}
268
269const char *
270g_raid_volume_level2str(int level, int qual)
271{
272
273	switch (level) {
274	case G_RAID_VOLUME_RL_RAID0:
275		return ("RAID0");
276	case G_RAID_VOLUME_RL_RAID1:
277		return ("RAID1");
278	case G_RAID_VOLUME_RL_RAID3:
279		return ("RAID3");
280	case G_RAID_VOLUME_RL_RAID4:
281		return ("RAID4");
282	case G_RAID_VOLUME_RL_RAID5:
283		return ("RAID5");
284	case G_RAID_VOLUME_RL_RAID6:
285		return ("RAID6");
286	case G_RAID_VOLUME_RL_RAID1E:
287		return ("RAID1E");
288	case G_RAID_VOLUME_RL_SINGLE:
289		return ("SINGLE");
290	case G_RAID_VOLUME_RL_CONCAT:
291		return ("CONCAT");
292	case G_RAID_VOLUME_RL_RAID5E:
293		return ("RAID5E");
294	case G_RAID_VOLUME_RL_RAID5EE:
295		return ("RAID5EE");
296	default:
297		return ("UNKNOWN");
298	}
299}
300
301int
302g_raid_volume_str2level(const char *str, int *level, int *qual)
303{
304
305	*level = G_RAID_VOLUME_RL_UNKNOWN;
306	*qual = G_RAID_VOLUME_RLQ_NONE;
307	if (strcasecmp(str, "RAID0") == 0)
308		*level = G_RAID_VOLUME_RL_RAID0;
309	else if (strcasecmp(str, "RAID1") == 0)
310		*level = G_RAID_VOLUME_RL_RAID1;
311	else if (strcasecmp(str, "RAID3") == 0)
312		*level = G_RAID_VOLUME_RL_RAID3;
313	else if (strcasecmp(str, "RAID4") == 0)
314		*level = G_RAID_VOLUME_RL_RAID4;
315	else if (strcasecmp(str, "RAID5") == 0)
316		*level = G_RAID_VOLUME_RL_RAID5;
317	else if (strcasecmp(str, "RAID6") == 0)
318		*level = G_RAID_VOLUME_RL_RAID6;
319	else if (strcasecmp(str, "RAID10") == 0 ||
320		 strcasecmp(str, "RAID1E") == 0)
321		*level = G_RAID_VOLUME_RL_RAID1E;
322	else if (strcasecmp(str, "SINGLE") == 0)
323		*level = G_RAID_VOLUME_RL_SINGLE;
324	else if (strcasecmp(str, "CONCAT") == 0)
325		*level = G_RAID_VOLUME_RL_CONCAT;
326	else if (strcasecmp(str, "RAID5E") == 0)
327		*level = G_RAID_VOLUME_RL_RAID5E;
328	else if (strcasecmp(str, "RAID5EE") == 0)
329		*level = G_RAID_VOLUME_RL_RAID5EE;
330	else
331		return (-1);
332	return (0);
333}
334
335const char *
336g_raid_get_diskname(struct g_raid_disk *disk)
337{
338
339	if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL)
340		return ("[unknown]");
341	return (disk->d_consumer->provider->name);
342}
343
344void
345g_raid_report_disk_state(struct g_raid_disk *disk)
346{
347	struct g_raid_subdisk *sd;
348	int len, state;
349	uint32_t s;
350
351	if (disk->d_consumer == NULL)
352		return;
353	if (disk->d_state == G_RAID_DISK_S_FAILED ||
354	    disk->d_state == G_RAID_DISK_S_STALE_FAILED) {
355		s = G_STATE_FAILED;
356	} else {
357		state = G_RAID_SUBDISK_S_ACTIVE;
358		TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
359			if (sd->sd_state < state)
360				state = sd->sd_state;
361		}
362		if (state == G_RAID_SUBDISK_S_FAILED)
363			s = G_STATE_FAILED;
364		else if (state == G_RAID_SUBDISK_S_NEW ||
365		    state == G_RAID_SUBDISK_S_REBUILD)
366			s = G_STATE_REBUILD;
367		else if (state == G_RAID_SUBDISK_S_STALE ||
368		    state == G_RAID_SUBDISK_S_RESYNC)
369			s = G_STATE_RESYNC;
370		else
371			s = G_STATE_ACTIVE;
372	}
373	len = sizeof(s);
374	g_io_getattr("GEOM::setstate", disk->d_consumer, &len, &s);
375	G_RAID_DEBUG1(2, disk->d_softc, "Disk %s state reported as %d.",
376	    g_raid_get_diskname(disk), s);
377}
378
379void
380g_raid_change_disk_state(struct g_raid_disk *disk, int state)
381{
382
383	G_RAID_DEBUG1(0, disk->d_softc, "Disk %s state changed from %s to %s.",
384	    g_raid_get_diskname(disk),
385	    g_raid_disk_state2str(disk->d_state),
386	    g_raid_disk_state2str(state));
387	disk->d_state = state;
388	g_raid_report_disk_state(disk);
389}
390
391void
392g_raid_change_subdisk_state(struct g_raid_subdisk *sd, int state)
393{
394
395	G_RAID_DEBUG1(0, sd->sd_softc,
396	    "Subdisk %s:%d-%s state changed from %s to %s.",
397	    sd->sd_volume->v_name, sd->sd_pos,
398	    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]",
399	    g_raid_subdisk_state2str(sd->sd_state),
400	    g_raid_subdisk_state2str(state));
401	sd->sd_state = state;
402	if (sd->sd_disk)
403		g_raid_report_disk_state(sd->sd_disk);
404}
405
406void
407g_raid_change_volume_state(struct g_raid_volume *vol, int state)
408{
409
410	G_RAID_DEBUG1(0, vol->v_softc,
411	    "Volume %s state changed from %s to %s.",
412	    vol->v_name,
413	    g_raid_volume_state2str(vol->v_state),
414	    g_raid_volume_state2str(state));
415	vol->v_state = state;
416}
417
418/*
419 * --- Events handling functions ---
420 * Events in geom_raid are used to maintain the status of subdisks and
421 * volumes from a single thread, which simplifies locking.
422 */
423static void
424g_raid_event_free(struct g_raid_event *ep)
425{
426
427	free(ep, M_RAID);
428}
429
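/*
 * Queue an event for the node's worker thread and wake it up.
 * With G_RAID_EVENT_WAIT the caller drops the node lock and sleeps
 * until the worker has processed the event, then returns its error.
 */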
430int
431g_raid_event_send(void *arg, int event, int flags)
432{
433	struct g_raid_softc *sc;
434	struct g_raid_event *ep;
435	int error;
436
437	if ((flags & G_RAID_EVENT_VOLUME) != 0) {
438		sc = ((struct g_raid_volume *)arg)->v_softc;
439	} else if ((flags & G_RAID_EVENT_DISK) != 0) {
440		sc = ((struct g_raid_disk *)arg)->d_softc;
441	} else if ((flags & G_RAID_EVENT_SUBDISK) != 0) {
442		sc = ((struct g_raid_subdisk *)arg)->sd_softc;
443	} else {
444		sc = arg;
445	}
446	ep = malloc(sizeof(*ep), M_RAID,
447	    sx_xlocked(&sc->sc_lock) ? M_WAITOK : M_NOWAIT);
448	if (ep == NULL)
449		return (ENOMEM);
450	ep->e_tgt = arg;
451	ep->e_event = event;
452	ep->e_flags = flags;
453	ep->e_error = 0;
454	G_RAID_DEBUG1(4, sc, "Sending event %p. Waking up %p.", ep, sc);
455	mtx_lock(&sc->sc_queue_mtx);
456	TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next);
457	mtx_unlock(&sc->sc_queue_mtx);
458	wakeup(sc);
459
460	if ((flags & G_RAID_EVENT_WAIT) == 0)
461		return (0);
462
463	sx_assert(&sc->sc_lock, SX_XLOCKED);
464	G_RAID_DEBUG1(4, sc, "Sleeping on %p.", ep);
465	sx_xunlock(&sc->sc_lock);
466	while ((ep->e_flags & G_RAID_EVENT_DONE) == 0) {
467		mtx_lock(&sc->sc_queue_mtx);
468		MSLEEP(error, ep, &sc->sc_queue_mtx, PRIBIO | PDROP, "m:event",
469		    hz * 5);
470	}
471	error = ep->e_error;
472	g_raid_event_free(ep);
473	sx_xlock(&sc->sc_lock);
474	return (error);
475}
476
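/*
 * Remove all pending events addressed to the given target.
 * Events nobody waits for are freed; waiters are woken up with ECANCELED.
 */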
477static void
478g_raid_event_cancel(struct g_raid_softc *sc, void *tgt)
479{
480	struct g_raid_event *ep, *tmpep;
481
482	sx_assert(&sc->sc_lock, SX_XLOCKED);
483
484	mtx_lock(&sc->sc_queue_mtx);
485	TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) {
486		if (ep->e_tgt != tgt)
487			continue;
488		TAILQ_REMOVE(&sc->sc_events, ep, e_next);
489		if ((ep->e_flags & G_RAID_EVENT_WAIT) == 0)
490			g_raid_event_free(ep);
491		else {
492			ep->e_error = ECANCELED;
493			wakeup(ep);
494		}
495	}
496	mtx_unlock(&sc->sc_queue_mtx);
497}
498
499static int
500g_raid_event_check(struct g_raid_softc *sc, void *tgt)
501{
502	struct g_raid_event *ep;
503	int	res = 0;
504
505	sx_assert(&sc->sc_lock, SX_XLOCKED);
506
507	mtx_lock(&sc->sc_queue_mtx);
508	TAILQ_FOREACH(ep, &sc->sc_events, e_next) {
509		if (ep->e_tgt != tgt)
510			continue;
511		res = 1;
512		break;
513	}
514	mtx_unlock(&sc->sc_queue_mtx);
515	return (res);
516}
517
518/*
519 * Return the number of disks in given state.
520 * If state is equal to -1, count all connected disks.
521 */
522u_int
523g_raid_ndisks(struct g_raid_softc *sc, int state)
524{
525	struct g_raid_disk *disk;
526	u_int n;
527
528	sx_assert(&sc->sc_lock, SX_LOCKED);
529
530	n = 0;
531	TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
532		if (disk->d_state == state || state == -1)
533			n++;
534	}
535	return (n);
536}
537
538/*
539 * Return the number of subdisks in given state.
540 * If state is equal to -1, count all connected subdisks.
541 */
542u_int
543g_raid_nsubdisks(struct g_raid_volume *vol, int state)
544{
545	struct g_raid_subdisk *subdisk;
546	struct g_raid_softc *sc;
547	u_int i, n;
548
549	sc = vol->v_softc;
550	sx_assert(&sc->sc_lock, SX_LOCKED);
551
552	n = 0;
553	for (i = 0; i < vol->v_disks_count; i++) {
554		subdisk = &vol->v_subdisks[i];
555		if ((state == -1 &&
556		     subdisk->sd_state != G_RAID_SUBDISK_S_NONE) ||
557		    subdisk->sd_state == state)
558			n++;
559	}
560	return (n);
561}
562
563/*
564 * Return the first subdisk in given state.
565 * If state is equal to -1, return the first connected subdisk.
566 */
567struct g_raid_subdisk *
568g_raid_get_subdisk(struct g_raid_volume *vol, int state)
569{
570	struct g_raid_subdisk *sd;
571	struct g_raid_softc *sc;
572	u_int i;
573
574	sc = vol->v_softc;
575	sx_assert(&sc->sc_lock, SX_LOCKED);
576
577	for (i = 0; i < vol->v_disks_count; i++) {
578		sd = &vol->v_subdisks[i];
579		if ((state == -1 &&
580		     sd->sd_state != G_RAID_SUBDISK_S_NONE) ||
581		    sd->sd_state == state)
582			return (sd);
583	}
584	return (NULL);
585}
586
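/*
 * Open the named provider (an optional /dev/ prefix is stripped) and
 * attach a new consumer of this node's geom to it with r1w1e1 access.
 * Returns NULL if the provider does not exist or cannot be opened.
 */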
587struct g_consumer *
588g_raid_open_consumer(struct g_raid_softc *sc, const char *name)
589{
590	struct g_consumer *cp;
591	struct g_provider *pp;
592
593	g_topology_assert();
594
595	if (strncmp(name, "/dev/", 5) == 0)
596		name += 5;
597	pp = g_provider_by_name(name);
598	if (pp == NULL)
599		return (NULL);
600	cp = g_new_consumer(sc->sc_geom);
601	if (g_attach(cp, pp) != 0) {
602		g_destroy_consumer(cp);
603		return (NULL);
604	}
605	if (g_access(cp, 1, 1, 1) != 0) {
606		g_detach(cp);
607		g_destroy_consumer(cp);
608		return (NULL);
609	}
610	return (cp);
611}
612
613static u_int
614g_raid_nrequests(struct g_raid_softc *sc, struct g_consumer *cp)
615{
616	struct bio *bp;
617	u_int nreqs = 0;
618
619	mtx_lock(&sc->sc_queue_mtx);
620	TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) {
621		if (bp->bio_from == cp)
622			nreqs++;
623	}
624	mtx_unlock(&sc->sc_queue_mtx);
625	return (nreqs);
626}
627
628u_int
629g_raid_nopens(struct g_raid_softc *sc)
630{
631	struct g_raid_volume *vol;
632	u_int opens;
633
634	opens = 0;
635	TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
636		if (vol->v_provider_open != 0)
637			opens++;
638	}
639	return (opens);
640}
641
642static int
643g_raid_consumer_is_busy(struct g_raid_softc *sc, struct g_consumer *cp)
644{
645
646	if (cp->index > 0) {
647		G_RAID_DEBUG1(2, sc,
648		    "I/O requests for %s exist, can't destroy it now.",
649		    cp->provider->name);
650		return (1);
651	}
652	if (g_raid_nrequests(sc, cp) > 0) {
653		G_RAID_DEBUG1(2, sc,
654		    "I/O requests for %s in queue, can't destroy it now.",
655		    cp->provider->name);
656		return (1);
657	}
658	return (0);
659}
660
661static void
662g_raid_destroy_consumer(void *arg, int flags __unused)
663{
664	struct g_consumer *cp;
665
666	g_topology_assert();
667
668	cp = arg;
669	G_RAID_DEBUG(1, "Consumer %s destroyed.", cp->provider->name);
670	g_detach(cp);
671	g_destroy_consumer(cp);
672}
673
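/*
 * Close and destroy a consumer, unless it still has I/O in flight.
 * If dropping write access will trigger a retaste of the provider,
 * the detach/destroy is deferred to a separate GEOM event (see below).
 */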
674void
675g_raid_kill_consumer(struct g_raid_softc *sc, struct g_consumer *cp)
676{
677	struct g_provider *pp;
678	int retaste_wait;
679
680	g_topology_assert_not();
681
682	g_topology_lock();
683	cp->private = NULL;
684	if (g_raid_consumer_is_busy(sc, cp))
685		goto out;
686	pp = cp->provider;
687	retaste_wait = 0;
688	if (cp->acw == 1) {
689		if ((pp->geom->flags & G_GEOM_WITHER) == 0)
690			retaste_wait = 1;
691	}
692	if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0)
693		g_access(cp, -cp->acr, -cp->acw, -cp->ace);
694	if (retaste_wait) {
695		/*
696		 * After the retaste event has been sent (inside g_access()), we
697		 * can send an event to detach and destroy the consumer.
698		 * A class that has a consumer attached to the given provider
699		 * will not receive a retaste event for that provider.
700		 * This is how retaste events are ignored when closing consumers
701		 * that were opened for write: the consumer is detached and
702		 * destroyed after the retaste event is sent.
703		 */
704		g_post_event(g_raid_destroy_consumer, cp, M_WAITOK, NULL);
705		goto out;
706	}
707	G_RAID_DEBUG(1, "Consumer %s destroyed.", pp->name);
708	g_detach(cp);
709	g_destroy_consumer(cp);
710out:
711	g_topology_unlock();
712}
713
714static void
715g_raid_orphan(struct g_consumer *cp)
716{
717	struct g_raid_disk *disk;
718
719	g_topology_assert();
720
721	disk = cp->private;
722	if (disk == NULL)
723		return;
724	g_raid_event_send(disk, G_RAID_DISK_E_DISCONNECTED,
725	    G_RAID_EVENT_DISK);
726}
727
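/*
 * Mark the volume clean and update metadata if it is dirty, has no
 * writes in flight, and the clean timeout has expired.  Returns the
 * number of seconds to wait before the volume may be marked clean, or 0.
 */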
728static int
729g_raid_clean(struct g_raid_volume *vol, int acw)
730{
731	struct g_raid_softc *sc;
732	int timeout;
733
734	sc = vol->v_softc;
735	g_topology_assert_not();
736	sx_assert(&sc->sc_lock, SX_XLOCKED);
737
738//	if ((sc->sc_flags & G_RAID_DEVICE_FLAG_NOFAILSYNC) != 0)
739//		return (0);
740	if (!vol->v_dirty)
741		return (0);
742	if (vol->v_writes > 0)
743		return (0);
744	if (acw > 0 || (acw == -1 &&
745	    vol->v_provider != NULL && vol->v_provider->acw > 0)) {
746		timeout = g_raid_clean_time - (time_uptime - vol->v_last_write);
747		if (timeout > 0)
748			return (timeout);
749	}
750	vol->v_dirty = 0;
751	G_RAID_DEBUG1(1, sc, "Volume %s marked as clean.",
752	    vol->v_name);
753	g_raid_write_metadata(sc, vol, NULL, NULL);
754	return (0);
755}
756
757static void
758g_raid_dirty(struct g_raid_volume *vol)
759{
760	struct g_raid_softc *sc;
761
762	sc = vol->v_softc;
763	g_topology_assert_not();
764	sx_assert(&sc->sc_lock, SX_XLOCKED);
765
766//	if ((sc->sc_flags & G_RAID_DEVICE_FLAG_NOFAILSYNC) != 0)
767//		return;
768	vol->v_dirty = 1;
769	G_RAID_DEBUG1(1, sc, "Volume %s marked as dirty.",
770	    vol->v_name);
771	g_raid_write_metadata(sc, vol, NULL, NULL);
772}
773
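/*
 * Common BIO_FLUSH implementation for transformation modules: clone the
 * flush request to every present subdisk, or fail the original request
 * with ENOMEM if any clone allocation fails.
 */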
774void
775g_raid_tr_flush_common(struct g_raid_tr_object *tr, struct bio *bp)
776{
777	struct g_raid_softc *sc;
778	struct g_raid_volume *vol;
779	struct g_raid_subdisk *sd;
780	struct bio_queue_head queue;
781	struct bio *cbp;
782	int i;
783
784	vol = tr->tro_volume;
785	sc = vol->v_softc;
786
787	/*
788	 * Allocate all bios before sending any request, so we can return
789	 * ENOMEM in a nice and clean way.
790	 */
791	bioq_init(&queue);
792	for (i = 0; i < vol->v_disks_count; i++) {
793		sd = &vol->v_subdisks[i];
794		if (sd->sd_state == G_RAID_SUBDISK_S_NONE ||
795		    sd->sd_state == G_RAID_SUBDISK_S_FAILED)
796			continue;
797		cbp = g_clone_bio(bp);
798		if (cbp == NULL)
799			goto failure;
800		cbp->bio_caller1 = sd;
801		bioq_insert_tail(&queue, cbp);
802	}
803	for (cbp = bioq_first(&queue); cbp != NULL;
804	    cbp = bioq_first(&queue)) {
805		bioq_remove(&queue, cbp);
806		sd = cbp->bio_caller1;
807		cbp->bio_caller1 = NULL;
808		g_raid_subdisk_iostart(sd, cbp);
809	}
810	return;
811failure:
812	for (cbp = bioq_first(&queue); cbp != NULL;
813	    cbp = bioq_first(&queue)) {
814		bioq_remove(&queue, cbp);
815		g_destroy_bio(cbp);
816	}
817	if (bp->bio_error == 0)
818		bp->bio_error = ENOMEM;
819	g_raid_iodone(bp, bp->bio_error);
820}
821
822static void
823g_raid_tr_kerneldump_common_done(struct bio *bp)
824{
825
826	bp->bio_flags |= BIO_DONE;
827}
828
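/*
 * Common kernel dump implementation for transformation modules: issue a
 * synchronous write through g_raid_start() and poll the node until the
 * request completes (no worker thread is available while dumping).
 */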
829int
830g_raid_tr_kerneldump_common(struct g_raid_tr_object *tr,
831    void *virtual, vm_offset_t physical, off_t offset, size_t length)
832{
833	struct g_raid_softc *sc;
834	struct g_raid_volume *vol;
835	struct bio bp;
836
837	vol = tr->tro_volume;
838	sc = vol->v_softc;
839
840	bzero(&bp, sizeof(bp));
841	bp.bio_cmd = BIO_WRITE;
842	bp.bio_done = g_raid_tr_kerneldump_common_done;
843	bp.bio_attribute = NULL;
844	bp.bio_offset = offset;
845	bp.bio_length = length;
846	bp.bio_data = virtual;
847	bp.bio_to = vol->v_provider;
848
849	g_raid_start(&bp);
850	while (!(bp.bio_flags & BIO_DONE)) {
851		G_RAID_DEBUG1(4, sc, "Poll...");
852		g_raid_poll(sc);
853		DELAY(10);
854	}
855
856	return (bp.bio_error != 0 ? EIO : 0);
857}
858
859static int
860g_raid_dump(void *arg,
861    void *virtual, vm_offset_t physical, off_t offset, size_t length)
862{
863	struct g_raid_volume *vol;
864	int error;
865
866	vol = (struct g_raid_volume *)arg;
867	G_RAID_DEBUG1(3, vol->v_softc, "Dumping at off %llu len %llu.",
868	    (long long unsigned)offset, (long long unsigned)length);
869
870	error = G_RAID_TR_KERNELDUMP(vol->v_tr,
871	    virtual, physical, offset, length);
872	return (error);
873}
874
875static void
876g_raid_kerneldump(struct g_raid_softc *sc, struct bio *bp)
877{
878	struct g_kerneldump *gkd;
879	struct g_provider *pp;
880	struct g_raid_volume *vol;
881
882	gkd = (struct g_kerneldump*)bp->bio_data;
883	pp = bp->bio_to;
884	vol = pp->private;
885	g_trace(G_T_TOPOLOGY, "g_raid_kerneldump(%s, %jd, %jd)",
886		pp->name, (intmax_t)gkd->offset, (intmax_t)gkd->length);
887	gkd->di.dumper = g_raid_dump;
888	gkd->di.priv = vol;
889	gkd->di.blocksize = vol->v_sectorsize;
890	gkd->di.maxiosize = DFLTPHYS;
891	gkd->di.mediaoffset = gkd->offset;
892	if ((gkd->offset + gkd->length) > vol->v_mediasize)
893		gkd->length = vol->v_mediasize - gkd->offset;
894	gkd->di.mediasize = gkd->length;
895	g_io_deliver(bp, 0);
896}
897
898static void
899g_raid_start(struct bio *bp)
900{
901	struct g_raid_softc *sc;
902
903	sc = bp->bio_to->geom->softc;
904	/*
905	 * If sc == NULL or there are no valid disks, provider's error
906	 * should be set and g_raid_start() should not be called at all.
907	 */
908//	KASSERT(sc != NULL && sc->sc_state == G_RAID_VOLUME_S_RUNNING,
909//	    ("Provider's error should be set (error=%d)(mirror=%s).",
910//	    bp->bio_to->error, bp->bio_to->name));
911	G_RAID_LOGREQ(3, bp, "Request received.");
912
913	switch (bp->bio_cmd) {
914	case BIO_READ:
915	case BIO_WRITE:
916	case BIO_DELETE:
917	case BIO_FLUSH:
918		break;
919	case BIO_GETATTR:
920		if (!strcmp(bp->bio_attribute, "GEOM::kerneldump"))
921			g_raid_kerneldump(sc, bp);
922		else
923			g_io_deliver(bp, EOPNOTSUPP);
924		return;
925	default:
926		g_io_deliver(bp, EOPNOTSUPP);
927		return;
928	}
929	mtx_lock(&sc->sc_queue_mtx);
930	bioq_disksort(&sc->sc_queue, bp);
931	mtx_unlock(&sc->sc_queue_mtx);
932	if (!dumping) {
933		G_RAID_DEBUG1(4, sc, "Waking up %p.", sc);
934		wakeup(sc);
935	}
936}
937
938static int
939g_raid_bio_overlaps(const struct bio *bp, off_t lstart, off_t len)
940{
941	/*
942	 * 5 cases:
943	 * (1) bp entirely below NO
944	 * (2) bp entirely above NO
945	 * (3) bp start below, but end in range YES
946	 * (4) bp entirely within YES
947	 * (5) bp starts within, ends above YES
948	 *
949	 * lock range 10-19 (offset 10 length 10)
950	 * (1) 1-5: first if kicks it out
951	 * (2) 30-35: second if kicks it out
952	 * (3) 5-15: passes both ifs
953	 * (4) 12-14: passes both ifs
954	 * (5) 19-20: passes both
955	 */
956	off_t lend = lstart + len - 1;
957	off_t bstart = bp->bio_offset;
958	off_t bend = bp->bio_offset + bp->bio_length - 1;
959
960	if (bend < lstart)
961		return (0);
962	if (lend < bstart)
963		return (0);
964	return (1);
965}
966
967static int
968g_raid_is_in_locked_range(struct g_raid_volume *vol, const struct bio *bp)
969{
970	struct g_raid_lock *lp;
971
972	sx_assert(&vol->v_softc->sc_lock, SX_LOCKED);
973
974	LIST_FOREACH(lp, &vol->v_locks, l_next) {
975		if (g_raid_bio_overlaps(bp, lp->l_offset, lp->l_length))
976			return (1);
977	}
978	return (0);
979}
980
981static void
982g_raid_start_request(struct bio *bp)
983{
984	struct g_raid_softc *sc;
985	struct g_raid_volume *vol;
986
987	sc = bp->bio_to->geom->softc;
988	sx_assert(&sc->sc_lock, SX_LOCKED);
989	vol = bp->bio_to->private;
990
991	/*
992	 * Check to see if this item is in a locked range.  If so,
993	 * queue it to our locked queue and return.  We'll requeue
994	 * it when the range is unlocked.  Internal I/O for the
995	 * rebuild/rescan/recovery process is excluded from this
996	 * check so we can actually do the recovery.
997	 */
998	if (!(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL) &&
999	    g_raid_is_in_locked_range(vol, bp)) {
1000		G_RAID_LOGREQ(3, bp, "Defer request.");
1001		bioq_insert_tail(&vol->v_locked, bp);
1002		return;
1003	}
1004
1005	/*
1006	 * If we're actually going to do the write/delete, then
1007	 * update the idle stats for the volume.
1008	 */
1009	if (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_DELETE) {
1010		if (!vol->v_dirty)
1011			g_raid_dirty(vol);
1012		vol->v_writes++;
1013	}
1014
1015	/*
1016	 * Put the request onto the inflight queue, so we can check that new
1017	 * synchronization requests do not collide with it.  Then tell
1018	 * the transformation layer to start the I/O.
1019	 */
1020	bioq_insert_tail(&vol->v_inflight, bp);
1021	G_RAID_LOGREQ(4, bp, "Request started");
1022	G_RAID_TR_IOSTART(vol->v_tr, bp);
1023}
1024
1025static void
1026g_raid_finish_with_locked_ranges(struct g_raid_volume *vol, struct bio *bp)
1027{
1028	off_t off, len;
1029	struct bio *nbp;
1030	struct g_raid_lock *lp;
1031
1032	vol->v_pending_lock = 0;
1033	LIST_FOREACH(lp, &vol->v_locks, l_next) {
1034		if (lp->l_pending) {
1035			off = lp->l_offset;
1036			len = lp->l_length;
1037			lp->l_pending = 0;
1038			TAILQ_FOREACH(nbp, &vol->v_inflight.queue, bio_queue) {
1039				if (g_raid_bio_overlaps(nbp, off, len))
1040					lp->l_pending++;
1041			}
1042			if (lp->l_pending) {
1043				vol->v_pending_lock = 1;
1044				G_RAID_DEBUG1(4, vol->v_softc,
1045				    "Deferred lock(%jd, %jd) has %d pending",
1046				    (intmax_t)off, (intmax_t)(off + len),
1047				    lp->l_pending);
1048				continue;
1049			}
1050			G_RAID_DEBUG1(4, vol->v_softc,
1051			    "Deferred lock of %jd to %jd completed",
1052			    (intmax_t)off, (intmax_t)(off + len));
1053			G_RAID_TR_LOCKED(vol->v_tr, lp->l_callback_arg);
1054		}
1055	}
1056}
1057
1058void
1059g_raid_iodone(struct bio *bp, int error)
1060{
1061	struct g_raid_softc *sc;
1062	struct g_raid_volume *vol;
1063
1064	sc = bp->bio_to->geom->softc;
1065	sx_assert(&sc->sc_lock, SX_LOCKED);
1066	vol = bp->bio_to->private;
1067	G_RAID_LOGREQ(3, bp, "Request done: %d.", error);
1068
1069	/* Update stats if we have done a write/delete. */
1070	if (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_DELETE) {
1071		vol->v_writes--;
1072		vol->v_last_write = time_uptime;
1073	}
1074
1075	bioq_remove(&vol->v_inflight, bp);
1076	if (vol->v_pending_lock && g_raid_is_in_locked_range(vol, bp))
1077		g_raid_finish_with_locked_ranges(vol, bp);
1078	getmicrouptime(&vol->v_last_done);
1079	g_io_deliver(bp, error);
1080}
1081
1082int
1083g_raid_lock_range(struct g_raid_volume *vol, off_t off, off_t len,
1084    struct bio *ignore, void *argp)
1085{
1086	struct g_raid_softc *sc;
1087	struct g_raid_lock *lp;
1088	struct bio *bp;
1089
1090	sc = vol->v_softc;
1091	lp = malloc(sizeof(*lp), M_RAID, M_WAITOK | M_ZERO);
1092	LIST_INSERT_HEAD(&vol->v_locks, lp, l_next);
1093	lp->l_offset = off;
1094	lp->l_length = len;
1095	lp->l_callback_arg = argp;
1096
1097	lp->l_pending = 0;
1098	TAILQ_FOREACH(bp, &vol->v_inflight.queue, bio_queue) {
1099		if (bp != ignore && g_raid_bio_overlaps(bp, off, len))
1100			lp->l_pending++;
1101	}
1102
1103	/*
1104	 * If there are any writes that are pending, we return EBUSY.  All
1105	 * callers will have to wait until all pending writes clear.
1106	 */
1107	if (lp->l_pending > 0) {
1108		vol->v_pending_lock = 1;
1109		G_RAID_DEBUG1(4, sc, "Locking range %jd to %jd deferred %d pend",
1110		    (intmax_t)off, (intmax_t)(off+len), lp->l_pending);
1111		return (EBUSY);
1112	}
1113	G_RAID_DEBUG1(4, sc, "Locking range %jd to %jd",
1114	    (intmax_t)off, (intmax_t)(off+len));
1115	G_RAID_TR_LOCKED(vol->v_tr, lp->l_callback_arg);
1116	return (0);
1117}
1118
1119int
1120g_raid_unlock_range(struct g_raid_volume *vol, off_t off, off_t len)
1121{
1122	struct g_raid_lock *lp;
1123	struct g_raid_softc *sc;
1124	struct bio *bp;
1125
1126	sc = vol->v_softc;
1127	LIST_FOREACH(lp, &vol->v_locks, l_next) {
1128		if (lp->l_offset == off && lp->l_length == len) {
1129			LIST_REMOVE(lp, l_next);
1130			/* XXX
1131			 * Right now we just put them all back on the queue
1132			 * and hope for the best.  This should be safe because any
1133			 * locked ranges will go right back on this list
1134			 * when the worker thread runs.
1135			 * XXX
1136			 */
1137			G_RAID_DEBUG1(4, sc, "Unlocked %jd to %jd",
1138			    (intmax_t)lp->l_offset,
1139			    (intmax_t)(lp->l_offset+lp->l_length));
1140			mtx_lock(&sc->sc_queue_mtx);
1141			while ((bp = bioq_takefirst(&vol->v_locked)) != NULL)
1142				bioq_disksort(&sc->sc_queue, bp);
1143			mtx_unlock(&sc->sc_queue_mtx);
1144			free(lp, M_RAID);
1145			return (0);
1146		}
1147	}
1148	return (EINVAL);
1149}
1150
1151void
1152g_raid_subdisk_iostart(struct g_raid_subdisk *sd, struct bio *bp)
1153{
1154	struct g_consumer *cp;
1155	struct g_raid_disk *disk, *tdisk;
1156
1157	bp->bio_caller1 = sd;
1158
1159	/*
1160	 * Make sure that the disk is present. Generally it is a task of
1161	 * transformation layers to not send requests to absent disks, but
1162	 * it is better to be safe and report the situation than be sorry.
1163	 */
1164	if (sd->sd_disk == NULL) {
1165		G_RAID_LOGREQ(0, bp, "Warning! I/O request to an absent disk!");
1166nodisk:
1167		bp->bio_from = NULL;
1168		bp->bio_to = NULL;
1169		bp->bio_error = ENXIO;
1170		g_raid_disk_done(bp);
1171		return;
1172	}
1173	disk = sd->sd_disk;
1174	if (disk->d_state != G_RAID_DISK_S_ACTIVE &&
1175	    disk->d_state != G_RAID_DISK_S_FAILED) {
1176		G_RAID_LOGREQ(0, bp, "Warning! I/O request to a disk in a "
1177		    "wrong state (%s)!", g_raid_disk_state2str(disk->d_state));
1178		goto nodisk;
1179	}
1180
1181	cp = disk->d_consumer;
1182	bp->bio_from = cp;
1183	bp->bio_to = cp->provider;
1184	cp->index++;
1185
1186	/* Update average disk load. */
1187	TAILQ_FOREACH(tdisk, &sd->sd_softc->sc_disks, d_next) {
1188		if (tdisk->d_consumer == NULL)
1189			tdisk->d_load = 0;
1190		else
1191			tdisk->d_load = (tdisk->d_consumer->index *
1192			    G_RAID_SUBDISK_LOAD_SCALE + tdisk->d_load * 7) / 8;
1193	}
1194
1195	disk->d_last_offset = bp->bio_offset + bp->bio_length;
1196	if (dumping) {
1197		G_RAID_LOGREQ(3, bp, "Sending dumping request.");
1198		if (bp->bio_cmd == BIO_WRITE) {
1199			bp->bio_error = g_raid_subdisk_kerneldump(sd,
1200			    bp->bio_data, 0, bp->bio_offset, bp->bio_length);
1201		} else
1202			bp->bio_error = EOPNOTSUPP;
1203		g_raid_disk_done(bp);
1204	} else {
1205		bp->bio_done = g_raid_disk_done;
1206		bp->bio_offset += sd->sd_offset;
1207		G_RAID_LOGREQ(3, bp, "Sending request.");
1208		g_io_request(bp, cp);
1209	}
1210}
1211
1212int
1213g_raid_subdisk_kerneldump(struct g_raid_subdisk *sd,
1214    void *virtual, vm_offset_t physical, off_t offset, size_t length)
1215{
1216
1217	if (sd->sd_disk == NULL)
1218		return (ENXIO);
1219	if (sd->sd_disk->d_kd.di.dumper == NULL)
1220		return (EOPNOTSUPP);
1221	return (dump_write(&sd->sd_disk->d_kd.di,
1222	    virtual, physical,
1223	    sd->sd_disk->d_kd.di.mediaoffset + sd->sd_offset + offset,
1224	    length));
1225}
1226
1227static void
1228g_raid_disk_done(struct bio *bp)
1229{
1230	struct g_raid_softc *sc;
1231	struct g_raid_subdisk *sd;
1232
1233	sd = bp->bio_caller1;
1234	sc = sd->sd_softc;
1235	mtx_lock(&sc->sc_queue_mtx);
1236	bioq_disksort(&sc->sc_queue, bp);
1237	mtx_unlock(&sc->sc_queue_mtx);
1238	if (!dumping)
1239		wakeup(sc);
1240}
1241
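/*
 * Handle a completed subdisk I/O request: drop the consumer reference,
 * undo the subdisk offset and pass the result to the transformation layer.
 */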
1242static void
1243g_raid_disk_done_request(struct bio *bp)
1244{
1245	struct g_raid_softc *sc;
1246	struct g_raid_disk *disk;
1247	struct g_raid_subdisk *sd;
1248	struct g_raid_volume *vol;
1249
1250	g_topology_assert_not();
1251
1252	G_RAID_LOGREQ(3, bp, "Disk request done: %d.", bp->bio_error);
1253	sd = bp->bio_caller1;
1254	sc = sd->sd_softc;
1255	vol = sd->sd_volume;
1256	if (bp->bio_from != NULL) {
1257		bp->bio_from->index--;
1258		disk = bp->bio_from->private;
1259		if (disk == NULL)
1260			g_raid_kill_consumer(sc, bp->bio_from);
1261	}
1262	bp->bio_offset -= sd->sd_offset;
1263
1264	G_RAID_TR_IODONE(vol->v_tr, sd, bp);
1265}
1266
1267static void
1268g_raid_handle_event(struct g_raid_softc *sc, struct g_raid_event *ep)
1269{
1270
1271	if ((ep->e_flags & G_RAID_EVENT_VOLUME) != 0)
1272		ep->e_error = g_raid_update_volume(ep->e_tgt, ep->e_event);
1273	else if ((ep->e_flags & G_RAID_EVENT_DISK) != 0)
1274		ep->e_error = g_raid_update_disk(ep->e_tgt, ep->e_event);
1275	else if ((ep->e_flags & G_RAID_EVENT_SUBDISK) != 0)
1276		ep->e_error = g_raid_update_subdisk(ep->e_tgt, ep->e_event);
1277	else
1278		ep->e_error = g_raid_update_node(ep->e_tgt, ep->e_event);
1279	if ((ep->e_flags & G_RAID_EVENT_WAIT) == 0) {
1280		KASSERT(ep->e_error == 0,
1281		    ("Error cannot be handled."));
1282		g_raid_event_free(ep);
1283	} else {
1284		ep->e_flags |= G_RAID_EVENT_DONE;
1285		G_RAID_DEBUG1(4, sc, "Waking up %p.", ep);
1286		mtx_lock(&sc->sc_queue_mtx);
1287		wakeup(ep);
1288		mtx_unlock(&sc->sc_queue_mtx);
1289	}
1290}
1291
1292/*
1293 * Worker thread.
1294 */
1295static void
1296g_raid_worker(void *arg)
1297{
1298	struct g_raid_softc *sc;
1299	struct g_raid_event *ep;
1300	struct g_raid_volume *vol;
1301	struct bio *bp;
1302	struct timeval now, t;
1303	int timeout, rv;
1304
1305	sc = arg;
1306	thread_lock(curthread);
1307	sched_prio(curthread, PRIBIO);
1308	thread_unlock(curthread);
1309
1310	sx_xlock(&sc->sc_lock);
1311	for (;;) {
1312		mtx_lock(&sc->sc_queue_mtx);
1313		/*
1314		 * First take a look at events.
1315		 * It is important to handle events before any I/O requests.
1316		 */
1317		bp = NULL;
1318		vol = NULL;
1319		rv = 0;
1320		ep = TAILQ_FIRST(&sc->sc_events);
1321		if (ep != NULL)
1322			TAILQ_REMOVE(&sc->sc_events, ep, e_next);
1323		else if ((bp = bioq_takefirst(&sc->sc_queue)) != NULL)
1324			;
1325		else {
1326			getmicrouptime(&now);
1327			t = now;
1328			TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
1329				if (bioq_first(&vol->v_inflight) == NULL &&
1330				    vol->v_tr &&
1331				    timevalcmp(&vol->v_last_done, &t, < ))
1332					t = vol->v_last_done;
1333			}
1334			timevalsub(&t, &now);
1335			timeout = g_raid_idle_threshold +
1336			    t.tv_sec * 1000000 + t.tv_usec;
1337			if (timeout > 0) {
1338				/*
1339				 * Two steps to avoid overflows at HZ=1000
1340				 * and idle timeouts > 2.1s.  Some rounding
1341				 * errors can occur, but they are < 1 tick,
1342				 * which is deemed to be close enough for
1343				 * this purpose.
1344				 */
1345				int micpertic = 1000000 / hz;
1346				timeout = (timeout + micpertic - 1) / micpertic;
1347				sx_xunlock(&sc->sc_lock);
1348				MSLEEP(rv, sc, &sc->sc_queue_mtx,
1349				    PRIBIO | PDROP, "-", timeout);
1350				sx_xlock(&sc->sc_lock);
1351				goto process;
1352			} else
1353				rv = EWOULDBLOCK;
1354		}
1355		mtx_unlock(&sc->sc_queue_mtx);
1356process:
1357		if (ep != NULL) {
1358			g_raid_handle_event(sc, ep);
1359		} else if (bp != NULL) {
1360			if (bp->bio_to != NULL &&
1361			    bp->bio_to->geom == sc->sc_geom)
1362				g_raid_start_request(bp);
1363			else
1364				g_raid_disk_done_request(bp);
1365		} else if (rv == EWOULDBLOCK) {
1366			TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
1367				if (vol->v_writes == 0 && vol->v_dirty)
1368					g_raid_clean(vol, -1);
1369				if (bioq_first(&vol->v_inflight) == NULL &&
1370				    vol->v_tr) {
1371					t.tv_sec = g_raid_idle_threshold / 1000000;
1372					t.tv_usec = g_raid_idle_threshold % 1000000;
1373					timevaladd(&t, &vol->v_last_done);
1374					getmicrouptime(&now);
1375					if (timevalcmp(&t, &now, <= )) {
1376						G_RAID_TR_IDLE(vol->v_tr);
1377						vol->v_last_done = now;
1378					}
1379				}
1380			}
1381		}
1382		if (sc->sc_stopping == G_RAID_DESTROY_HARD)
1383			g_raid_destroy_node(sc, 1);	/* May not return. */
1384	}
1385}
1386
1387static void
1388g_raid_poll(struct g_raid_softc *sc)
1389{
1390	struct g_raid_event *ep;
1391	struct bio *bp;
1392
1393	sx_xlock(&sc->sc_lock);
1394	mtx_lock(&sc->sc_queue_mtx);
1395	/*
1396	 * First take a look at events.
1397	 * It is important to handle events before any I/O requests.
1398	 */
1399	ep = TAILQ_FIRST(&sc->sc_events);
1400	if (ep != NULL) {
1401		TAILQ_REMOVE(&sc->sc_events, ep, e_next);
1402		mtx_unlock(&sc->sc_queue_mtx);
1403		g_raid_handle_event(sc, ep);
1404		goto out;
1405	}
1406	bp = bioq_takefirst(&sc->sc_queue);
1407	if (bp != NULL) {
1408		mtx_unlock(&sc->sc_queue_mtx);
1409		if (bp->bio_from == NULL ||
1410		    bp->bio_from->geom != sc->sc_geom)
1411			g_raid_start_request(bp);
1412		else
1413			g_raid_disk_done_request(bp);
1414	}
1415out:
1416	sx_xunlock(&sc->sc_lock);
1417}
1418
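/*
 * Create and announce the GEOM provider for a volume.  It is named
 * "raid/<volume name>" when that is enabled and available, otherwise
 * "raid/r<global ID>"; stripe size and offset are derived from the layout.
 */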
1419static void
1420g_raid_launch_provider(struct g_raid_volume *vol)
1421{
1422	struct g_raid_disk *disk;
1423	struct g_raid_softc *sc;
1424	struct g_provider *pp;
1425	char name[G_RAID_MAX_VOLUMENAME];
1426	off_t off;
1427
1428	sc = vol->v_softc;
1429	sx_assert(&sc->sc_lock, SX_LOCKED);
1430
1431	g_topology_lock();
1432	/* Try to name provider with volume name. */
1433	snprintf(name, sizeof(name), "raid/%s", vol->v_name);
1434	if (g_raid_name_format == 0 || vol->v_name[0] == 0 ||
1435	    g_provider_by_name(name) != NULL) {
1436		/* Otherwise use sequential volume number. */
1437		snprintf(name, sizeof(name), "raid/r%d", vol->v_global_id);
1438	}
1439	pp = g_new_providerf(sc->sc_geom, "%s", name);
1440	pp->private = vol;
1441	pp->mediasize = vol->v_mediasize;
1442	pp->sectorsize = vol->v_sectorsize;
1443	pp->stripesize = 0;
1444	pp->stripeoffset = 0;
1445	if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 ||
1446	    vol->v_raid_level == G_RAID_VOLUME_RL_RAID3 ||
1447	    vol->v_raid_level == G_RAID_VOLUME_RL_SINGLE ||
1448	    vol->v_raid_level == G_RAID_VOLUME_RL_CONCAT) {
1449		if ((disk = vol->v_subdisks[0].sd_disk) != NULL &&
1450		    disk->d_consumer != NULL &&
1451		    disk->d_consumer->provider != NULL) {
1452			pp->stripesize = disk->d_consumer->provider->stripesize;
1453			off = disk->d_consumer->provider->stripeoffset;
1454			pp->stripeoffset = off + vol->v_subdisks[0].sd_offset;
1455			if (off > 0)
1456				pp->stripeoffset %= off;
1457		}
1458		if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID3) {
1459			pp->stripesize *= (vol->v_disks_count - 1);
1460			pp->stripeoffset *= (vol->v_disks_count - 1);
1461		}
1462	} else
1463		pp->stripesize = vol->v_strip_size;
1464	vol->v_provider = pp;
1465	g_error_provider(pp, 0);
1466	g_topology_unlock();
1467	G_RAID_DEBUG1(0, sc, "Provider %s for volume %s created.",
1468	    pp->name, vol->v_name);
1469}
1470
1471static void
1472g_raid_destroy_provider(struct g_raid_volume *vol)
1473{
1474	struct g_raid_softc *sc;
1475	struct g_provider *pp;
1476	struct bio *bp, *tmp;
1477
1478	g_topology_assert_not();
1479	sc = vol->v_softc;
1480	pp = vol->v_provider;
1481	KASSERT(pp != NULL, ("NULL provider (volume=%s).", vol->v_name));
1482
1483	g_topology_lock();
1484	g_error_provider(pp, ENXIO);
1485	mtx_lock(&sc->sc_queue_mtx);
1486	TAILQ_FOREACH_SAFE(bp, &sc->sc_queue.queue, bio_queue, tmp) {
1487		if (bp->bio_to != pp)
1488			continue;
1489		bioq_remove(&sc->sc_queue, bp);
1490		g_io_deliver(bp, ENXIO);
1491	}
1492	mtx_unlock(&sc->sc_queue_mtx);
1493	G_RAID_DEBUG1(0, sc, "Provider %s for volume %s destroyed.",
1494	    pp->name, vol->v_name);
1495	g_wither_provider(pp, ENXIO);
1496	g_topology_unlock();
1497	vol->v_provider = NULL;
1498}
1499
1500/*
1501 * Update device state.
1502 */
1503static int
1504g_raid_update_volume(struct g_raid_volume *vol, u_int event)
1505{
1506	struct g_raid_softc *sc;
1507
1508	sc = vol->v_softc;
1509	sx_assert(&sc->sc_lock, SX_XLOCKED);
1510
1511	G_RAID_DEBUG1(2, sc, "Event %s for volume %s.",
1512	    g_raid_volume_event2str(event),
1513	    vol->v_name);
1514	switch (event) {
1515	case G_RAID_VOLUME_E_DOWN:
1516		if (vol->v_provider != NULL)
1517			g_raid_destroy_provider(vol);
1518		break;
1519	case G_RAID_VOLUME_E_UP:
1520		if (vol->v_provider == NULL)
1521			g_raid_launch_provider(vol);
1522		break;
1523	case G_RAID_VOLUME_E_START:
1524		if (vol->v_tr)
1525			G_RAID_TR_START(vol->v_tr);
1526		return (0);
1527	default:
1528		if (sc->sc_md)
1529			G_RAID_MD_VOLUME_EVENT(sc->sc_md, vol, event);
1530		return (0);
1531	}
1532
1533	/* Manage root mount release. */
1534	if (vol->v_starting) {
1535		vol->v_starting = 0;
1536		G_RAID_DEBUG1(1, sc, "root_mount_rel %p", vol->v_rootmount);
1537		root_mount_rel(vol->v_rootmount);
1538		vol->v_rootmount = NULL;
1539	}
1540	if (vol->v_stopping && vol->v_provider_open == 0)
1541		g_raid_destroy_volume(vol);
1542	return (0);
1543}
1544
1545/*
1546 * Update subdisk state.
1547 */
1548static int
1549g_raid_update_subdisk(struct g_raid_subdisk *sd, u_int event)
1550{
1551	struct g_raid_softc *sc;
1552	struct g_raid_volume *vol;
1553
1554	sc = sd->sd_softc;
1555	vol = sd->sd_volume;
1556	sx_assert(&sc->sc_lock, SX_XLOCKED);
1557
1558	G_RAID_DEBUG1(2, sc, "Event %s for subdisk %s:%d-%s.",
1559	    g_raid_subdisk_event2str(event),
1560	    vol->v_name, sd->sd_pos,
1561	    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
1562	if (vol->v_tr)
1563		G_RAID_TR_EVENT(vol->v_tr, sd, event);
1564
1565	return (0);
1566}
1567
1568/*
1569 * Update disk state.
1570 */
1571static int
1572g_raid_update_disk(struct g_raid_disk *disk, u_int event)
1573{
1574	struct g_raid_softc *sc;
1575
1576	sc = disk->d_softc;
1577	sx_assert(&sc->sc_lock, SX_XLOCKED);
1578
1579	G_RAID_DEBUG1(2, sc, "Event %s for disk %s.",
1580	    g_raid_disk_event2str(event),
1581	    g_raid_get_diskname(disk));
1582
1583	if (sc->sc_md)
1584		G_RAID_MD_EVENT(sc->sc_md, disk, event);
1585	return (0);
1586}
1587
1588/*
1589 * Node event.
1590 */
1591static int
1592g_raid_update_node(struct g_raid_softc *sc, u_int event)
1593{
1594	sx_assert(&sc->sc_lock, SX_XLOCKED);
1595
1596	G_RAID_DEBUG1(2, sc, "Event %s for the array.",
1597	    g_raid_node_event2str(event));
1598
1599	if (event == G_RAID_NODE_E_WAKE)
1600		return (0);
1601	if (sc->sc_md)
1602		G_RAID_MD_EVENT(sc->sc_md, NULL, event);
1603	return (0);
1604}
1605
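/*
 * GEOM access method: deny new opens while the node is being destroyed,
 * track per-volume open counts and trigger delayed destruction on last close.
 */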
1606static int
1607g_raid_access(struct g_provider *pp, int acr, int acw, int ace)
1608{
1609	struct g_raid_volume *vol;
1610	struct g_raid_softc *sc;
1611	int dcr, dcw, dce, opens, error = 0;
1612
1613	g_topology_assert();
1614	sc = pp->geom->softc;
1615	vol = pp->private;
1616	KASSERT(sc != NULL, ("NULL softc (provider=%s).", pp->name));
1617	KASSERT(vol != NULL, ("NULL volume (provider=%s).", pp->name));
1618
1619	G_RAID_DEBUG1(2, sc, "Access request for %s: r%dw%de%d.", pp->name,
1620	    acr, acw, ace);
1621
1622	dcr = pp->acr + acr;
1623	dcw = pp->acw + acw;
1624	dce = pp->ace + ace;
1625
1626	g_topology_unlock();
1627	sx_xlock(&sc->sc_lock);
1628	/* Deny new opens while dying. */
1629	if (sc->sc_stopping != 0 && (acr > 0 || acw > 0 || ace > 0)) {
1630		error = ENXIO;
1631		goto out;
1632	}
1633	if (dcw == 0 && vol->v_dirty)
1634		g_raid_clean(vol, dcw);
1635	vol->v_provider_open += acr + acw + ace;
1636	/* Handle delayed node destruction. */
1637	if (sc->sc_stopping == G_RAID_DESTROY_DELAYED &&
1638	    vol->v_provider_open == 0) {
1639		/* Count open volumes. */
1640		opens = g_raid_nopens(sc);
1641		if (opens == 0) {
1642			sc->sc_stopping = G_RAID_DESTROY_HARD;
1643			/* Wake up worker to make it selfdestruct. */
1644			g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0);
1645		}
1646	}
1647	/* Handle open volume destruction. */
1648	if (vol->v_stopping && vol->v_provider_open == 0)
1649		g_raid_destroy_volume(vol);
1650out:
1651	sx_xunlock(&sc->sc_lock);
1652	g_topology_lock();
1653	return (error);
1654}
1655
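/*
 * Allocate a new array (node): create its geom, queues and locks, and
 * start the per-node worker thread.
 */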
1656struct g_raid_softc *
1657g_raid_create_node(struct g_class *mp,
1658    const char *name, struct g_raid_md_object *md)
1659{
1660	struct g_raid_softc *sc;
1661	struct g_geom *gp;
1662	int error;
1663
1664	g_topology_assert();
1665	G_RAID_DEBUG(1, "Creating array %s.", name);
1666
1667	gp = g_new_geomf(mp, "%s", name);
1668	sc = malloc(sizeof(*sc), M_RAID, M_WAITOK | M_ZERO);
1669	gp->start = g_raid_start;
1670	gp->orphan = g_raid_orphan;
1671	gp->access = g_raid_access;
1672	gp->dumpconf = g_raid_dumpconf;
1673
1674	sc->sc_md = md;
1675	sc->sc_geom = gp;
1676	sc->sc_flags = 0;
1677	TAILQ_INIT(&sc->sc_volumes);
1678	TAILQ_INIT(&sc->sc_disks);
1679	sx_init(&sc->sc_lock, "graid:lock");
1680	mtx_init(&sc->sc_queue_mtx, "graid:queue", NULL, MTX_DEF);
1681	TAILQ_INIT(&sc->sc_events);
1682	bioq_init(&sc->sc_queue);
1683	gp->softc = sc;
1684	error = kproc_create(g_raid_worker, sc, &sc->sc_worker, 0, 0,
1685	    "g_raid %s", name);
1686	if (error != 0) {
1687		G_RAID_DEBUG(0, "Cannot create kernel thread for %s.", name);
1688		mtx_destroy(&sc->sc_queue_mtx);
1689		sx_destroy(&sc->sc_lock);
1690		g_destroy_geom(sc->sc_geom);
1691		free(sc, M_RAID);
1692		return (NULL);
1693	}
1694
1695	G_RAID_DEBUG1(0, sc, "Array %s created.", name);
1696	return (sc);
1697}
1698
1699struct g_raid_volume *
1700g_raid_create_volume(struct g_raid_softc *sc, const char *name, int id)
1701{
1702	struct g_raid_volume	*vol, *vol1;
1703	int i;
1704
1705	G_RAID_DEBUG1(1, sc, "Creating volume %s.", name);
1706	vol = malloc(sizeof(*vol), M_RAID, M_WAITOK | M_ZERO);
1707	vol->v_softc = sc;
1708	strlcpy(vol->v_name, name, G_RAID_MAX_VOLUMENAME);
1709	vol->v_state = G_RAID_VOLUME_S_STARTING;
1710	vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN;
1711	vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_UNKNOWN;
1712	bioq_init(&vol->v_inflight);
1713	bioq_init(&vol->v_locked);
1714	LIST_INIT(&vol->v_locks);
1715	for (i = 0; i < G_RAID_MAX_SUBDISKS; i++) {
1716		vol->v_subdisks[i].sd_softc = sc;
1717		vol->v_subdisks[i].sd_volume = vol;
1718		vol->v_subdisks[i].sd_pos = i;
1719		vol->v_subdisks[i].sd_state = G_RAID_SUBDISK_S_NONE;
1720	}
1721
1722	/* Find free ID for this volume. */
1723	g_topology_lock();
1724	vol1 = vol;
1725	if (id >= 0) {
1726		LIST_FOREACH(vol1, &g_raid_volumes, v_global_next) {
1727			if (vol1->v_global_id == id)
1728				break;
1729		}
1730	}
1731	if (vol1 != NULL) {
1732		for (id = 0; ; id++) {
1733			LIST_FOREACH(vol1, &g_raid_volumes, v_global_next) {
1734				if (vol1->v_global_id == id)
1735					break;
1736			}
1737			if (vol1 == NULL)
1738				break;
1739		}
1740	}
1741	vol->v_global_id = id;
1742	LIST_INSERT_HEAD(&g_raid_volumes, vol, v_global_next);
1743	g_topology_unlock();
1744
1745	/* Delay root mounting. */
1746	vol->v_rootmount = root_mount_hold("GRAID");
1747	G_RAID_DEBUG1(1, sc, "root_mount_hold %p", vol->v_rootmount);
1748	vol->v_starting = 1;
1749	TAILQ_INSERT_TAIL(&sc->sc_volumes, vol, v_next);
1750	return (vol);
1751}
1752
1753struct g_raid_disk *
1754g_raid_create_disk(struct g_raid_softc *sc)
1755{
1756	struct g_raid_disk	*disk;
1757
1758	G_RAID_DEBUG1(1, sc, "Creating disk.");
1759	disk = malloc(sizeof(*disk), M_RAID, M_WAITOK | M_ZERO);
1760	disk->d_softc = sc;
1761	disk->d_state = G_RAID_DISK_S_NONE;
1762	TAILQ_INIT(&disk->d_subdisks);
1763	TAILQ_INSERT_TAIL(&sc->sc_disks, disk, d_next);
1764	return (disk);
1765}
1766
1767int
g_raid_start_volume(struct g_raid_volume *vol)
1768{
1769	struct g_raid_tr_class *class;
1770	struct g_raid_tr_object *obj;
1771	int status;
1772
1773	G_RAID_DEBUG1(2, vol->v_softc, "Starting volume %s.", vol->v_name);
1774	LIST_FOREACH(class, &g_raid_tr_classes, trc_list) {
1775		G_RAID_DEBUG1(2, vol->v_softc,
1776		    "Tasting volume %s for %s transformation.",
1777		    vol->v_name, class->name);
1778		obj = (void *)kobj_create((kobj_class_t)class, M_RAID,
1779		    M_WAITOK);
1780		obj->tro_class = class;
1781		obj->tro_volume = vol;
1782		status = G_RAID_TR_TASTE(obj, vol);
1783		if (status != G_RAID_TR_TASTE_FAIL)
1784			break;
1785		kobj_delete((kobj_t)obj, M_RAID);
1786	}
1787	if (class == NULL) {
1788		G_RAID_DEBUG1(0, vol->v_softc,
1789		    "No transformation module found for %s.",
1790		    vol->v_name);
1791		vol->v_tr = NULL;
1792		g_raid_change_volume_state(vol, G_RAID_VOLUME_S_UNSUPPORTED);
1793		g_raid_event_send(vol, G_RAID_VOLUME_E_DOWN,
1794		    G_RAID_EVENT_VOLUME);
1795		return (-1);
1796	}
1797	G_RAID_DEBUG1(2, vol->v_softc,
1798	    "Transformation module %s chosen for %s.",
1799	    class->name, vol->v_name);
1800	vol->v_tr = obj;
1801	return (0);
1802}
1803
1804int
1805g_raid_destroy_node(struct g_raid_softc *sc, int worker)
1806{
1807	struct g_raid_volume *vol, *tmpv;
1808	struct g_raid_disk *disk, *tmpd;
1809	int error = 0;
1810
1811	sc->sc_stopping = G_RAID_DESTROY_HARD;
1812	TAILQ_FOREACH_SAFE(vol, &sc->sc_volumes, v_next, tmpv) {
1813		if (g_raid_destroy_volume(vol))
1814			error = EBUSY;
1815	}
1816	if (error)
1817		return (error);
1818	TAILQ_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tmpd) {
1819		if (g_raid_destroy_disk(disk))
1820			error = EBUSY;
1821	}
1822	if (error)
1823		return (error);
1824	if (sc->sc_md) {
1825		G_RAID_MD_FREE(sc->sc_md);
1826		kobj_delete((kobj_t)sc->sc_md, M_RAID);
1827		sc->sc_md = NULL;
1828	}
1829	if (sc->sc_geom != NULL) {
1830		G_RAID_DEBUG1(0, sc, "Array %s destroyed.", sc->sc_name);
1831		g_topology_lock();
1832		sc->sc_geom->softc = NULL;
1833		g_wither_geom(sc->sc_geom, ENXIO);
1834		g_topology_unlock();
1835		sc->sc_geom = NULL;
1836	} else
1837		G_RAID_DEBUG(1, "Array destroyed.");
1838	if (worker) {
1839		g_raid_event_cancel(sc, sc);
1840		mtx_destroy(&sc->sc_queue_mtx);
1841		sx_xunlock(&sc->sc_lock);
1842		sx_destroy(&sc->sc_lock);
1843		wakeup(&sc->sc_stopping);
1844		free(sc, M_RAID);
1845		curthread->td_pflags &= ~TDP_GEOM;
1846		G_RAID_DEBUG(1, "Thread exiting.");
1847		kproc_exit(0);
1848	} else {
1849		/* Wake up worker to make it selfdestruct. */
1850		g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0);
1851	}
1852	return (0);
1853}
1854
1855int
1856g_raid_destroy_volume(struct g_raid_volume *vol)
1857{
1858	struct g_raid_softc *sc;
1859	struct g_raid_disk *disk;
1860	int i;
1861
1862	sc = vol->v_softc;
1863	G_RAID_DEBUG1(2, sc, "Destroying volume %s.", vol->v_name);
1864	vol->v_stopping = 1;
1865	if (vol->v_state != G_RAID_VOLUME_S_STOPPED) {
1866		if (vol->v_tr) {
1867			G_RAID_TR_STOP(vol->v_tr);
1868			return (EBUSY);
1869		} else
1870			vol->v_state = G_RAID_VOLUME_S_STOPPED;
1871	}
1872	if (g_raid_event_check(sc, vol) != 0)
1873		return (EBUSY);
1874	if (vol->v_provider != NULL)
1875		return (EBUSY);
1876	if (vol->v_provider_open != 0)
1877		return (EBUSY);
1878	if (vol->v_tr) {
1879		G_RAID_TR_FREE(vol->v_tr);
1880		kobj_delete((kobj_t)vol->v_tr, M_RAID);
1881		vol->v_tr = NULL;
1882	}
1883	if (vol->v_rootmount)
1884		root_mount_rel(vol->v_rootmount);
1885	g_topology_lock();
1886	LIST_REMOVE(vol, v_global_next);
1887	g_topology_unlock();
1888	TAILQ_REMOVE(&sc->sc_volumes, vol, v_next);
1889	for (i = 0; i < G_RAID_MAX_SUBDISKS; i++) {
1890		g_raid_event_cancel(sc, &vol->v_subdisks[i]);
1891		disk = vol->v_subdisks[i].sd_disk;
1892		if (disk == NULL)
1893			continue;
1894		TAILQ_REMOVE(&disk->d_subdisks, &vol->v_subdisks[i], sd_next);
1895	}
1896	G_RAID_DEBUG1(2, sc, "Volume %s destroyed.", vol->v_name);
1897	if (sc->sc_md)
1898		G_RAID_MD_FREE_VOLUME(sc->sc_md, vol);
1899	g_raid_event_cancel(sc, vol);
1900	free(vol, M_RAID);
1901	if (sc->sc_stopping == G_RAID_DESTROY_HARD) {
1902		/* Wake up worker to let it selfdestruct. */
1903		g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0);
1904	}
1905	return (0);
1906}
1907
1908int
1909g_raid_destroy_disk(struct g_raid_disk *disk)
1910{
1911	struct g_raid_softc *sc;
1912	struct g_raid_subdisk *sd, *tmp;
1913
1914	sc = disk->d_softc;
1915	G_RAID_DEBUG1(2, sc, "Destroying disk.");
1916	if (disk->d_consumer) {
1917		g_raid_kill_consumer(sc, disk->d_consumer);
1918		disk->d_consumer = NULL;
1919	}
1920	TAILQ_FOREACH_SAFE(sd, &disk->d_subdisks, sd_next, tmp) {
1921		g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE);
1922		g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED,
1923		    G_RAID_EVENT_SUBDISK);
1924		TAILQ_REMOVE(&disk->d_subdisks, sd, sd_next);
1925		sd->sd_disk = NULL;
1926	}
1927	TAILQ_REMOVE(&sc->sc_disks, disk, d_next);
1928	if (sc->sc_md)
1929		G_RAID_MD_FREE_DISK(sc->sc_md, disk);
1930	g_raid_event_cancel(sc, disk);
1931	free(disk, M_RAID);
1932	return (0);
1933}
1934
1935int
1936g_raid_destroy(struct g_raid_softc *sc, int how)
1937{
1938	int opens;
1939
1940	g_topology_assert_not();
1941	if (sc == NULL)
1942		return (ENXIO);
1943	sx_assert(&sc->sc_lock, SX_XLOCKED);
1944
1945	/* Count open volumes. */
1946	opens = g_raid_nopens(sc);
1947
1948	/* React to volumes that are still open. */
1949	if (opens > 0) {
1950		switch (how) {
1951		case G_RAID_DESTROY_SOFT:
1952			G_RAID_DEBUG1(1, sc,
1953			    "%d volumes are still open.",
1954			    opens);
1955			return (EBUSY);
1956		case G_RAID_DESTROY_DELAYED:
1957			G_RAID_DEBUG1(1, sc,
1958			    "Array will be destroyed on last close.");
1959			sc->sc_stopping = G_RAID_DESTROY_DELAYED;
1960			return (EBUSY);
1961		case G_RAID_DESTROY_HARD:
1962			G_RAID_DEBUG1(1, sc,
1963			    "%d volumes are still open.",
1964			    opens);
1965		}
1966	}
1967
1968	/* Mark node for destruction. */
1969	sc->sc_stopping = G_RAID_DESTROY_HARD;
1970	/* Wake up the worker to let it self-destruct. */
1971	g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0);
1972	/* Sleep until node destroyed. */
1973	sx_sleep(&sc->sc_stopping, &sc->sc_lock,
1974	    PRIBIO | PDROP, "r:destroy", 0);
1975	return (0);
1976}
1977
1978static void
1979g_raid_taste_orphan(struct g_consumer *cp)
1980{
1981
1982	KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
1983	    cp->provider->name));
1984}
1985
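/*
 * Taste a provider: attach a temporary consumer and let every
 * registered metadata class probe it for on-disk metadata.  The first
 * class that does not return G_RAID_MD_TASTE_FAIL stops the search;
 * the metadata object is kept only when a new node was created
 * (G_RAID_MD_TASTE_NEW).
 */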
1986static struct g_geom *
1987g_raid_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
1988{
1989	struct g_consumer *cp;
1990	struct g_geom *gp, *geom;
1991	struct g_raid_md_class *class;
1992	struct g_raid_md_object *obj;
1993	int status;
1994
1995	g_topology_assert();
1996	g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
1997	G_RAID_DEBUG(2, "Tasting provider %s.", pp->name);
1998
1999	gp = g_new_geomf(mp, "raid:taste");
2000	/*
2001	 * This orphan function should never be called.
2002	 */
2003	gp->orphan = g_raid_taste_orphan;
2004	cp = g_new_consumer(gp);
2005	g_attach(cp, pp);
2006
2007	geom = NULL;
2008	LIST_FOREACH(class, &g_raid_md_classes, mdc_list) {
2009		G_RAID_DEBUG(2, "Tasting provider %s for %s metadata.",
2010		    pp->name, class->name);
2011		obj = (void *)kobj_create((kobj_class_t)class, M_RAID,
2012		    M_WAITOK);
2013		obj->mdo_class = class;
2014		status = G_RAID_MD_TASTE(obj, mp, cp, &geom);
2015		if (status != G_RAID_MD_TASTE_NEW)
2016			kobj_delete((kobj_t)obj, M_RAID);
2017		if (status != G_RAID_MD_TASTE_FAIL)
2018			break;
2019	}
2020
2021	g_detach(cp);
2022	g_destroy_consumer(cp);
2023	g_destroy_geom(gp);
2024	G_RAID_DEBUG(2, "Tasting provider %s done.", pp->name);
2025	return (geom);
2026}
2027
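/*
 * Create an empty array node for the metadata format named by 'format'
 * (matched case-insensitively against the registered metadata classes).
 * Returns G_RAID_MD_TASTE_FAIL if no class supports the format.
 */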
2028int
2029g_raid_create_node_format(const char *format, struct g_geom **gp)
2030{
2031	struct g_raid_md_class *class;
2032	struct g_raid_md_object *obj;
2033	int status;
2034
2035	G_RAID_DEBUG(2, "Creating array for %s metadata.", format);
2036	LIST_FOREACH(class, &g_raid_md_classes, mdc_list) {
2037		if (strcasecmp(class->name, format) == 0)
2038			break;
2039	}
2040	if (class == NULL) {
2041		G_RAID_DEBUG(1, "No support for %s metadata.", format);
2042		return (G_RAID_MD_TASTE_FAIL);
2043	}
2044	obj = (void *)kobj_create((kobj_class_t)class, M_RAID,
2045	    M_WAITOK);
2046	obj->mdo_class = class;
2047	status = G_RAID_MD_CREATE(obj, &g_raid_class, gp);
2048	if (status != G_RAID_MD_TASTE_NEW)
2049		kobj_delete((kobj_t)obj, M_RAID);
2050	return (status);
2051}
2052
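/*
 * GEOM control request to destroy the geom.  The node is soft-destroyed,
 * so the request fails with EBUSY while any volume is still open.
 */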
2053static int
2054g_raid_destroy_geom(struct gctl_req *req __unused,
2055    struct g_class *mp __unused, struct g_geom *gp)
2056{
2057	struct g_raid_softc *sc;
2058	int error;
2059
2060	g_topology_unlock();
2061	sc = gp->softc;
2062	sx_xlock(&sc->sc_lock);
2063	g_cancel_event(sc);
2064	error = g_raid_destroy(gp->softc, G_RAID_DESTROY_SOFT);
2065	if (error != 0)
2066		sx_xunlock(&sc->sc_lock);
2067	g_topology_lock();
2068	return (error);
2069}
2070
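/*
 * Ask the metadata module to update on-disk metadata for the given
 * volume, subdisk and/or disk, unless the node is being hard-destroyed.
 */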
2071void g_raid_write_metadata(struct g_raid_softc *sc, struct g_raid_volume *vol,
2072    struct g_raid_subdisk *sd, struct g_raid_disk *disk)
2073{
2074
2075	if (sc->sc_stopping == G_RAID_DESTROY_HARD)
2076		return;
2077	if (sc->sc_md)
2078		G_RAID_MD_WRITE(sc->sc_md, vol, sd, disk);
2079}
2080
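/*
 * Mark a disk as failed.  The request is forwarded to the metadata
 * module; requests for an absent disk or a disk that is not active are
 * rejected with a warning.
 */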
2081void g_raid_fail_disk(struct g_raid_softc *sc,
2082    struct g_raid_subdisk *sd, struct g_raid_disk *disk)
2083{
2084
2085	if (disk == NULL)
2086		disk = sd->sd_disk;
2087	if (disk == NULL) {
2088		G_RAID_DEBUG1(0, sc, "Warning! Fail request to an absent disk!");
2089		return;
2090	}
2091	if (disk->d_state != G_RAID_DISK_S_ACTIVE) {
2092		G_RAID_DEBUG1(0, sc, "Warning! Fail request to a disk in the "
2093		    "wrong state (%s)!", g_raid_disk_state2str(disk->d_state));
2094		return;
2095	}
2096	if (sc->sc_md)
2097		G_RAID_MD_FAIL_DISK(sc->sc_md, sd, disk);
2098}
2099
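/*
 * Dump configuration as XML.  Depending on whether a provider, a
 * consumer or only the geom is passed in, per-volume, per-disk or
 * per-node state is reported.
 */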
2100static void
2101g_raid_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
2102    struct g_consumer *cp, struct g_provider *pp)
2103{
2104	struct g_raid_softc *sc;
2105	struct g_raid_volume *vol;
2106	struct g_raid_subdisk *sd;
2107	struct g_raid_disk *disk;
2108	int i, s;
2109
2110	g_topology_assert();
2111
2112	sc = gp->softc;
2113	if (sc == NULL)
2114		return;
2115	if (pp != NULL) {
2116		vol = pp->private;
2117		g_topology_unlock();
2118		sx_xlock(&sc->sc_lock);
2119		sbuf_printf(sb, "%s<Label>%s</Label>\n", indent,
2120		    vol->v_name);
2121		sbuf_printf(sb, "%s<RAIDLevel>%s</RAIDLevel>\n", indent,
2122		    g_raid_volume_level2str(vol->v_raid_level,
2123		    vol->v_raid_level_qualifier));
2124		sbuf_printf(sb,
2125		    "%s<Transformation>%s</Transformation>\n", indent,
2126		    vol->v_tr ? vol->v_tr->tro_class->name : "NONE");
2127		sbuf_printf(sb, "%s<Components>%u</Components>\n", indent,
2128		    vol->v_disks_count);
2129		sbuf_printf(sb, "%s<Strip>%u</Strip>\n", indent,
2130		    vol->v_strip_size);
2131		sbuf_printf(sb, "%s<State>%s</State>\n", indent,
2132		    g_raid_volume_state2str(vol->v_state));
2133		sbuf_printf(sb, "%s<Dirty>%s</Dirty>\n", indent,
2134		    vol->v_dirty ? "Yes" : "No");
2135		sbuf_printf(sb, "%s<Subdisks>", indent);
2136		for (i = 0; i < vol->v_disks_count; i++) {
2137			sd = &vol->v_subdisks[i];
2138			if (sd->sd_disk != NULL &&
2139			    sd->sd_disk->d_consumer != NULL) {
2140				sbuf_printf(sb, "%s ",
2141				    g_raid_get_diskname(sd->sd_disk));
2142			} else {
2143				sbuf_printf(sb, "NONE ");
2144			}
2145			sbuf_printf(sb, "(%s",
2146			    g_raid_subdisk_state2str(sd->sd_state));
2147			if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
2148			    sd->sd_state == G_RAID_SUBDISK_S_RESYNC) {
2149				sbuf_printf(sb, " %d%%",
2150				    (int)(sd->sd_rebuild_pos * 100 /
2151				     sd->sd_size));
2152			}
2153			sbuf_printf(sb, ")");
2154			if (i + 1 < vol->v_disks_count)
2155				sbuf_printf(sb, ", ");
2156		}
2157		sbuf_printf(sb, "</Subdisks>\n");
2158		sx_xunlock(&sc->sc_lock);
2159		g_topology_lock();
2160	} else if (cp != NULL) {
2161		disk = cp->private;
2162		if (disk == NULL)
2163			return;
2164		g_topology_unlock();
2165		sx_xlock(&sc->sc_lock);
2166		sbuf_printf(sb, "%s<State>%s", indent,
2167		    g_raid_disk_state2str(disk->d_state));
2168		if (!TAILQ_EMPTY(&disk->d_subdisks)) {
2169			sbuf_printf(sb, " (");
2170			TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
2171				sbuf_printf(sb, "%s",
2172				    g_raid_subdisk_state2str(sd->sd_state));
2173				if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
2174				    sd->sd_state == G_RAID_SUBDISK_S_RESYNC) {
2175					sbuf_printf(sb, " %d%%",
2176					    (int)(sd->sd_rebuild_pos * 100 /
2177					     sd->sd_size));
2178				}
2179				if (TAILQ_NEXT(sd, sd_next))
2180					sbuf_printf(sb, ", ");
2181			}
2182			sbuf_printf(sb, ")");
2183		}
2184		sbuf_printf(sb, "</State>\n");
2185		sbuf_printf(sb, "%s<Subdisks>", indent);
2186		TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
2187			sbuf_printf(sb, "r%d(%s):%d@%ju",
2188			    sd->sd_volume->v_global_id,
2189			    sd->sd_volume->v_name,
2190			    sd->sd_pos, (uintmax_t)sd->sd_offset);
2191			if (TAILQ_NEXT(sd, sd_next))
2192				sbuf_printf(sb, ", ");
2193		}
2194		sbuf_printf(sb, "</Subdisks>\n");
2195		sbuf_printf(sb, "%s<ReadErrors>%d</ReadErrors>\n", indent,
2196		    disk->d_read_errs);
2197		sx_xunlock(&sc->sc_lock);
2198		g_topology_lock();
2199	} else {
2200		g_topology_unlock();
2201		sx_xlock(&sc->sc_lock);
2202		if (sc->sc_md) {
2203			sbuf_printf(sb, "%s<Metadata>%s</Metadata>\n", indent,
2204			    sc->sc_md->mdo_class->name);
2205		}
2206		if (!TAILQ_EMPTY(&sc->sc_volumes)) {
2207			s = 0xff;
2208			TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
2209				if (vol->v_state < s)
2210					s = vol->v_state;
2211			}
2212			sbuf_printf(sb, "%s<State>%s</State>\n", indent,
2213			    g_raid_volume_state2str(s));
2214		}
2215		sx_xunlock(&sc->sc_lock);
2216		g_topology_lock();
2217	}
2218}
2219
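/*
 * Shutdown hook: request delayed destruction of every array so each
 * node is torn down when its last provider is closed, before the
 * system goes down.
 */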
2220static void
2221g_raid_shutdown_pre_sync(void *arg, int howto)
2222{
2223	struct g_class *mp;
2224	struct g_geom *gp, *gp2;
2225	struct g_raid_softc *sc;
2226	int error;
2227
2228	mp = arg;
2229	DROP_GIANT();
2230	g_topology_lock();
2231	LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) {
2232		if ((sc = gp->softc) == NULL)
2233			continue;
2234		g_topology_unlock();
2235		sx_xlock(&sc->sc_lock);
2236		g_cancel_event(sc);
2237		error = g_raid_destroy(sc, G_RAID_DESTROY_DELAYED);
2238		if (error != 0)
2239			sx_xunlock(&sc->sc_lock);
2240		g_topology_lock();
2241	}
2242	g_topology_unlock();
2243	PICKUP_GIANT();
2244}
2245
2246static void
2247g_raid_init(struct g_class *mp)
2248{
2249
2250	g_raid_pre_sync = EVENTHANDLER_REGISTER(shutdown_pre_sync,
2251	    g_raid_shutdown_pre_sync, mp, SHUTDOWN_PRI_FIRST);
2252	if (g_raid_pre_sync == NULL)
2253		G_RAID_DEBUG(0, "Warning! Cannot register shutdown event.");
2254	g_raid_started = 1;
2255}
2256
2257static void
2258g_raid_fini(struct g_class *mp)
2259{
2260
2261	if (g_raid_pre_sync != NULL)
2262		EVENTHANDLER_DEREGISTER(shutdown_pre_sync, g_raid_pre_sync);
2263	g_raid_started = 0;
2264}
2265
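/*
 * Module event handler for metadata submodules.  On load the class is
 * inserted into g_raid_md_classes in order of ascending priority and,
 * if the main module has already started, existing providers are
 * retasted so the new class can claim them.
 */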
2266int
2267g_raid_md_modevent(module_t mod, int type, void *arg)
2268{
2269	struct g_raid_md_class *class, *c, *nc;
2270	int error;
2271
2272	error = 0;
2273	class = arg;
2274	switch (type) {
2275	case MOD_LOAD:
2276		c = LIST_FIRST(&g_raid_md_classes);
2277		if (c == NULL || c->mdc_priority > class->mdc_priority)
2278			LIST_INSERT_HEAD(&g_raid_md_classes, class, mdc_list);
2279		else {
2280			while ((nc = LIST_NEXT(c, mdc_list)) != NULL &&
2281			    nc->mdc_priority < class->mdc_priority)
2282				c = nc;
2283			LIST_INSERT_AFTER(c, class, mdc_list);
2284		}
2285		if (g_raid_started)
2286			g_retaste(&g_raid_class);
2287		break;
2288	case MOD_UNLOAD:
2289		LIST_REMOVE(class, mdc_list);
2290		break;
2291	default:
2292		error = EOPNOTSUPP;
2293		break;
2294	}
2295
2296	return (error);
2297}
2298
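/*
 * Module event handler for transformation submodules.  Keeps
 * g_raid_tr_classes sorted by ascending priority on load and removes
 * the class again on unload.
 */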
2299int
2300g_raid_tr_modevent(module_t mod, int type, void *arg)
2301{
2302	struct g_raid_tr_class *class, *c, *nc;
2303	int error;
2304
2305	error = 0;
2306	class = arg;
2307	switch (type) {
2308	case MOD_LOAD:
2309		c = LIST_FIRST(&g_raid_tr_classes);
2310		if (c == NULL || c->trc_priority > class->trc_priority)
2311			LIST_INSERT_HEAD(&g_raid_tr_classes, class, trc_list);
2312		else {
2313			while ((nc = LIST_NEXT(c, trc_list)) != NULL &&
2314			    nc->trc_priority < class->trc_priority)
2315				c = nc;
2316			LIST_INSERT_AFTER(c, class, trc_list);
2317		}
2318		break;
2319	case MOD_UNLOAD:
2320		LIST_REMOVE(class, trc_list);
2321		break;
2322	default:
2323		error = EOPNOTSUPP;
2324		break;
2325	}
2326
2327	return (error);
2328}
2329
2330/*
2331 * Use local implementation of DECLARE_GEOM_CLASS(g_raid_class, g_raid)
2332 * to reduce module priority, allowing submodules to register themselves first.
2333 */
2334static moduledata_t g_raid_mod = {
2335	"g_raid",
2336	g_modevent,
2337	&g_raid_class
2338};
2339DECLARE_MODULE(g_raid, g_raid_mod, SI_SUB_DRIVERS, SI_ORDER_THIRD);
2340MODULE_VERSION(geom_raid, 0);
2341