/*-
 * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/geom/raid/g_raid.c 234603 2012-04-23 13:04:02Z mav $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/bio.h>
#include <sys/sbuf.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/eventhandler.h>
#include <vm/uma.h>
#include <geom/geom.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/sched.h>
#include <geom/raid/g_raid.h>
#include "g_raid_md_if.h"
#include "g_raid_tr_if.h"
static MALLOC_DEFINE(M_RAID, "raid_data", "GEOM_RAID Data");

SYSCTL_DECL(_kern_geom);
SYSCTL_NODE(_kern_geom, OID_AUTO, raid, CTLFLAG_RW, 0, "GEOM_RAID stuff");
u_int g_raid_aggressive_spare = 0;
TUNABLE_INT("kern.geom.raid.aggressive_spare", &g_raid_aggressive_spare);
SYSCTL_UINT(_kern_geom_raid, OID_AUTO, aggressive_spare, CTLFLAG_RW,
    &g_raid_aggressive_spare, 0, "Use disks without metadata as spare");
u_int g_raid_debug = 0;
TUNABLE_INT("kern.geom.raid.debug", &g_raid_debug);
SYSCTL_UINT(_kern_geom_raid, OID_AUTO, debug, CTLFLAG_RW, &g_raid_debug, 0,
    "Debug level");
int g_raid_read_err_thresh = 10;
TUNABLE_INT("kern.geom.raid.read_err_thresh", &g_raid_read_err_thresh);
SYSCTL_UINT(_kern_geom_raid, OID_AUTO, read_err_thresh, CTLFLAG_RW,
    &g_raid_read_err_thresh, 0,
    "Number of read errors equated to disk failure");
u_int g_raid_start_timeout = 30;
TUNABLE_INT("kern.geom.raid.start_timeout", &g_raid_start_timeout);
SYSCTL_UINT(_kern_geom_raid, OID_AUTO, start_timeout, CTLFLAG_RW,
    &g_raid_start_timeout, 0,
    "Time to wait for all array components");
static u_int g_raid_clean_time = 5;
TUNABLE_INT("kern.geom.raid.clean_time", &g_raid_clean_time);
SYSCTL_UINT(_kern_geom_raid, OID_AUTO, clean_time, CTLFLAG_RW,
    &g_raid_clean_time, 0, "Mark volume as clean when idling");
static u_int g_raid_disconnect_on_failure = 1;
TUNABLE_INT("kern.geom.raid.disconnect_on_failure",
    &g_raid_disconnect_on_failure);
SYSCTL_UINT(_kern_geom_raid, OID_AUTO, disconnect_on_failure, CTLFLAG_RW,
    &g_raid_disconnect_on_failure, 0, "Disconnect component on I/O failure.");
static u_int g_raid_name_format = 0;
TUNABLE_INT("kern.geom.raid.name_format", &g_raid_name_format);
SYSCTL_UINT(_kern_geom_raid, OID_AUTO, name_format, CTLFLAG_RW,
    &g_raid_name_format, 0, "Providers name format.");
static u_int g_raid_idle_threshold = 1000000;
TUNABLE_INT("kern.geom.raid.idle_threshold", &g_raid_idle_threshold);
SYSCTL_UINT(_kern_geom_raid, OID_AUTO, idle_threshold, CTLFLAG_RW,
    &g_raid_idle_threshold, 1000000,
    "Time in microseconds to consider a volume idle.");

#define	MSLEEP(rv, ident, mtx, priority, wmesg, timeout)	do {	\
	G_RAID_DEBUG(4, "%s: Sleeping %p.", __func__, (ident));		\
	rv = msleep((ident), (mtx), (priority), (wmesg), (timeout));	\
	G_RAID_DEBUG(4, "%s: Woken up %p.", __func__, (ident));		\
} while (0)

LIST_HEAD(, g_raid_md_class) g_raid_md_classes =
    LIST_HEAD_INITIALIZER(g_raid_md_classes);

LIST_HEAD(, g_raid_tr_class) g_raid_tr_classes =
    LIST_HEAD_INITIALIZER(g_raid_tr_classes);

LIST_HEAD(, g_raid_volume) g_raid_volumes =
    LIST_HEAD_INITIALIZER(g_raid_volumes);

static eventhandler_tag g_raid_pre_sync = NULL;
static int g_raid_started = 0;

static int g_raid_destroy_geom(struct gctl_req *req, struct g_class *mp,
    struct g_geom *gp);
static g_taste_t g_raid_taste;
static void g_raid_init(struct g_class *mp);
static void g_raid_fini(struct g_class *mp);

struct g_class g_raid_class = {
	.name = G_RAID_CLASS_NAME,
	.version = G_VERSION,
	.ctlreq = g_raid_ctl,
	.taste = g_raid_taste,
	.destroy_geom = g_raid_destroy_geom,
	.init = g_raid_init,
	.fini = g_raid_fini
};

static void g_raid_destroy_provider(struct g_raid_volume *vol);
static int g_raid_update_disk(struct g_raid_disk *disk, u_int event);
static int g_raid_update_subdisk(struct g_raid_subdisk *subdisk, u_int event);
static int g_raid_update_volume(struct g_raid_volume *vol, u_int event);
static int g_raid_update_node(struct g_raid_softc *sc, u_int event);
static void g_raid_dumpconf(struct sbuf *sb, const char *indent,
    struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp);
static void g_raid_start(struct bio *bp);
static void g_raid_start_request(struct bio *bp);
static void g_raid_disk_done(struct bio *bp);
static void g_raid_poll(struct g_raid_softc *sc);

static const char *
g_raid_node_event2str(int event)
{

	switch (event) {
	case G_RAID_NODE_E_WAKE:
		return ("WAKE");
	case G_RAID_NODE_E_START:
		return ("START");
	default:
		return ("INVALID");
	}
}

const char *
g_raid_disk_state2str(int state)
{

	switch (state) {
	case G_RAID_DISK_S_NONE:
		return ("NONE");
	case G_RAID_DISK_S_OFFLINE:
		return ("OFFLINE");
	case G_RAID_DISK_S_FAILED:
		return ("FAILED");
	case G_RAID_DISK_S_STALE_FAILED:
		return ("STALE_FAILED");
	case G_RAID_DISK_S_SPARE:
		return ("SPARE");
	case G_RAID_DISK_S_STALE:
		return ("STALE");
	case G_RAID_DISK_S_ACTIVE:
		return ("ACTIVE");
	default:
		return ("INVALID");
	}
}

static const char *
g_raid_disk_event2str(int event)
{

	switch (event) {
	case G_RAID_DISK_E_DISCONNECTED:
		return ("DISCONNECTED");
	default:
		return ("INVALID");
	}
}

const char *
g_raid_subdisk_state2str(int state)
{

	switch (state) {
	case G_RAID_SUBDISK_S_NONE:
		return ("NONE");
	case G_RAID_SUBDISK_S_FAILED:
		return ("FAILED");
	case G_RAID_SUBDISK_S_NEW:
		return ("NEW");
	case G_RAID_SUBDISK_S_REBUILD:
		return ("REBUILD");
	case G_RAID_SUBDISK_S_UNINITIALIZED:
		return ("UNINITIALIZED");
	case G_RAID_SUBDISK_S_STALE:
		return ("STALE");
	case G_RAID_SUBDISK_S_RESYNC:
		return ("RESYNC");
	case G_RAID_SUBDISK_S_ACTIVE:
		return ("ACTIVE");
	default:
		return ("INVALID");
	}
}

static const char *
g_raid_subdisk_event2str(int event)
{

	switch (event) {
	case G_RAID_SUBDISK_E_NEW:
		return ("NEW");
	case G_RAID_SUBDISK_E_DISCONNECTED:
		return ("DISCONNECTED");
	default:
		return ("INVALID");
	}
}

const char *
g_raid_volume_state2str(int state)
{

	switch (state) {
	case G_RAID_VOLUME_S_STARTING:
		return ("STARTING");
	case G_RAID_VOLUME_S_BROKEN:
		return ("BROKEN");
	case G_RAID_VOLUME_S_DEGRADED:
		return ("DEGRADED");
	case G_RAID_VOLUME_S_SUBOPTIMAL:
		return ("SUBOPTIMAL");
	case G_RAID_VOLUME_S_OPTIMAL:
		return ("OPTIMAL");
	case G_RAID_VOLUME_S_UNSUPPORTED:
		return ("UNSUPPORTED");
	case G_RAID_VOLUME_S_STOPPED:
		return ("STOPPED");
	default:
		return ("INVALID");
	}
}

static const char *
g_raid_volume_event2str(int event)
{

	switch (event) {
	case G_RAID_VOLUME_E_UP:
		return ("UP");
	case G_RAID_VOLUME_E_DOWN:
		return ("DOWN");
	case G_RAID_VOLUME_E_START:
		return ("START");
	case G_RAID_VOLUME_E_STARTMD:
		return ("STARTMD");
	default:
		return ("INVALID");
	}
}

const char *
g_raid_volume_level2str(int level, int qual)
{

	switch (level) {
	case G_RAID_VOLUME_RL_RAID0:
		return ("RAID0");
	case G_RAID_VOLUME_RL_RAID1:
		return ("RAID1");
	case G_RAID_VOLUME_RL_RAID3:
		if (qual == G_RAID_VOLUME_RLQ_R3P0)
			return ("RAID3-P0");
		if (qual == G_RAID_VOLUME_RLQ_R3PN)
			return ("RAID3-PN");
		return ("RAID3");
	case G_RAID_VOLUME_RL_RAID4:
		if (qual == G_RAID_VOLUME_RLQ_R4P0)
			return ("RAID4-P0");
		if (qual == G_RAID_VOLUME_RLQ_R4PN)
			return ("RAID4-PN");
		return ("RAID4");
	case G_RAID_VOLUME_RL_RAID5:
		if (qual == G_RAID_VOLUME_RLQ_R5RA)
			return ("RAID5-RA");
		if (qual == G_RAID_VOLUME_RLQ_R5RS)
			return ("RAID5-RS");
		if (qual == G_RAID_VOLUME_RLQ_R5LA)
			return ("RAID5-LA");
		if (qual == G_RAID_VOLUME_RLQ_R5LS)
			return ("RAID5-LS");
		return ("RAID5");
	case G_RAID_VOLUME_RL_RAID6:
		if (qual == G_RAID_VOLUME_RLQ_R6RA)
			return ("RAID6-RA");
		if (qual == G_RAID_VOLUME_RLQ_R6RS)
			return ("RAID6-RS");
		if (qual == G_RAID_VOLUME_RLQ_R6LA)
			return ("RAID6-LA");
		if (qual == G_RAID_VOLUME_RLQ_R6LS)
			return ("RAID6-LS");
		return ("RAID6");
	case G_RAID_VOLUME_RL_RAIDMDF:
		if (qual == G_RAID_VOLUME_RLQ_RMDFRA)
			return ("RAIDMDF-RA");
		if (qual == G_RAID_VOLUME_RLQ_RMDFRS)
			return ("RAIDMDF-RS");
		if (qual == G_RAID_VOLUME_RLQ_RMDFLA)
			return ("RAIDMDF-LA");
		if (qual == G_RAID_VOLUME_RLQ_RMDFLS)
			return ("RAIDMDF-LS");
		return ("RAIDMDF");
	case G_RAID_VOLUME_RL_RAID1E:
		if (qual == G_RAID_VOLUME_RLQ_R1EA)
			return ("RAID1E-A");
		if (qual == G_RAID_VOLUME_RLQ_R1EO)
			return ("RAID1E-O");
		return ("RAID1E");
	case G_RAID_VOLUME_RL_SINGLE:
		return ("SINGLE");
	case G_RAID_VOLUME_RL_CONCAT:
		return ("CONCAT");
	case G_RAID_VOLUME_RL_RAID5E:
		if (qual == G_RAID_VOLUME_RLQ_R5ERA)
			return ("RAID5E-RA");
		if (qual == G_RAID_VOLUME_RLQ_R5ERS)
			return ("RAID5E-RS");
		if (qual == G_RAID_VOLUME_RLQ_R5ELA)
			return ("RAID5E-LA");
		if (qual == G_RAID_VOLUME_RLQ_R5ELS)
			return ("RAID5E-LS");
		return ("RAID5E");
	case G_RAID_VOLUME_RL_RAID5EE:
		if (qual == G_RAID_VOLUME_RLQ_R5EERA)
			return ("RAID5EE-RA");
		if (qual == G_RAID_VOLUME_RLQ_R5EERS)
			return ("RAID5EE-RS");
		if (qual == G_RAID_VOLUME_RLQ_R5EELA)
			return ("RAID5EE-LA");
		if (qual == G_RAID_VOLUME_RLQ_R5EELS)
			return ("RAID5EE-LS");
		return ("RAID5EE");
	case G_RAID_VOLUME_RL_RAID5R:
		if (qual == G_RAID_VOLUME_RLQ_R5RRA)
			return ("RAID5R-RA");
		if (qual == G_RAID_VOLUME_RLQ_R5RRS)
			return ("RAID5R-RS");
		if (qual == G_RAID_VOLUME_RLQ_R5RLA)
			return ("RAID5R-LA");
		if (qual == G_RAID_VOLUME_RLQ_R5RLS)
			return ("RAID5R-LS");
		return ("RAID5R");
	default:
		return ("UNKNOWN");
	}
}

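/*
 * Parse a textual RAID level, with optional qualifier suffix (e.g.
 * "RAID5-LS"), into numeric level and qualifier constants.
 * Returns -1 if the string matches no known level.
 */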
int
g_raid_volume_str2level(const char *str, int *level, int *qual)
{

	*level = G_RAID_VOLUME_RL_UNKNOWN;
	*qual = G_RAID_VOLUME_RLQ_NONE;
	if (strcasecmp(str, "RAID0") == 0)
		*level = G_RAID_VOLUME_RL_RAID0;
	else if (strcasecmp(str, "RAID1") == 0)
		*level = G_RAID_VOLUME_RL_RAID1;
	else if (strcasecmp(str, "RAID3-P0") == 0) {
		*level = G_RAID_VOLUME_RL_RAID3;
		*qual = G_RAID_VOLUME_RLQ_R3P0;
	} else if (strcasecmp(str, "RAID3-PN") == 0 ||
		   strcasecmp(str, "RAID3") == 0) {
		*level = G_RAID_VOLUME_RL_RAID3;
		*qual = G_RAID_VOLUME_RLQ_R3PN;
	} else if (strcasecmp(str, "RAID4-P0") == 0) {
		*level = G_RAID_VOLUME_RL_RAID4;
		*qual = G_RAID_VOLUME_RLQ_R4P0;
	} else if (strcasecmp(str, "RAID4-PN") == 0 ||
		   strcasecmp(str, "RAID4") == 0) {
		*level = G_RAID_VOLUME_RL_RAID4;
		*qual = G_RAID_VOLUME_RLQ_R4PN;
	} else if (strcasecmp(str, "RAID5-RA") == 0) {
		*level = G_RAID_VOLUME_RL_RAID5;
		*qual = G_RAID_VOLUME_RLQ_R5RA;
	} else if (strcasecmp(str, "RAID5-RS") == 0) {
		*level = G_RAID_VOLUME_RL_RAID5;
		*qual = G_RAID_VOLUME_RLQ_R5RS;
	} else if (strcasecmp(str, "RAID5") == 0 ||
		   strcasecmp(str, "RAID5-LA") == 0) {
		*level = G_RAID_VOLUME_RL_RAID5;
		*qual = G_RAID_VOLUME_RLQ_R5LA;
	} else if (strcasecmp(str, "RAID5-LS") == 0) {
		*level = G_RAID_VOLUME_RL_RAID5;
		*qual = G_RAID_VOLUME_RLQ_R5LS;
	} else if (strcasecmp(str, "RAID6-RA") == 0) {
		*level = G_RAID_VOLUME_RL_RAID6;
		*qual = G_RAID_VOLUME_RLQ_R6RA;
	} else if (strcasecmp(str, "RAID6-RS") == 0) {
		*level = G_RAID_VOLUME_RL_RAID6;
		*qual = G_RAID_VOLUME_RLQ_R6RS;
	} else if (strcasecmp(str, "RAID6") == 0 ||
		   strcasecmp(str, "RAID6-LA") == 0) {
		*level = G_RAID_VOLUME_RL_RAID6;
		*qual = G_RAID_VOLUME_RLQ_R6LA;
	} else if (strcasecmp(str, "RAID6-LS") == 0) {
		*level = G_RAID_VOLUME_RL_RAID6;
		*qual = G_RAID_VOLUME_RLQ_R6LS;
	} else if (strcasecmp(str, "RAIDMDF-RA") == 0) {
		*level = G_RAID_VOLUME_RL_RAIDMDF;
		*qual = G_RAID_VOLUME_RLQ_RMDFRA;
	} else if (strcasecmp(str, "RAIDMDF-RS") == 0) {
		*level = G_RAID_VOLUME_RL_RAIDMDF;
		*qual = G_RAID_VOLUME_RLQ_RMDFRS;
	} else if (strcasecmp(str, "RAIDMDF") == 0 ||
		   strcasecmp(str, "RAIDMDF-LA") == 0) {
		*level = G_RAID_VOLUME_RL_RAIDMDF;
		*qual = G_RAID_VOLUME_RLQ_RMDFLA;
	} else if (strcasecmp(str, "RAIDMDF-LS") == 0) {
		*level = G_RAID_VOLUME_RL_RAIDMDF;
		*qual = G_RAID_VOLUME_RLQ_RMDFLS;
	} else if (strcasecmp(str, "RAID10") == 0 ||
		   strcasecmp(str, "RAID1E") == 0 ||
		   strcasecmp(str, "RAID1E-A") == 0) {
		*level = G_RAID_VOLUME_RL_RAID1E;
		*qual = G_RAID_VOLUME_RLQ_R1EA;
	} else if (strcasecmp(str, "RAID1E-O") == 0) {
		*level = G_RAID_VOLUME_RL_RAID1E;
		*qual = G_RAID_VOLUME_RLQ_R1EO;
	} else if (strcasecmp(str, "SINGLE") == 0)
		*level = G_RAID_VOLUME_RL_SINGLE;
	else if (strcasecmp(str, "CONCAT") == 0)
		*level = G_RAID_VOLUME_RL_CONCAT;
	else if (strcasecmp(str, "RAID5E-RA") == 0) {
		*level = G_RAID_VOLUME_RL_RAID5E;
		*qual = G_RAID_VOLUME_RLQ_R5ERA;
	} else if (strcasecmp(str, "RAID5E-RS") == 0) {
		*level = G_RAID_VOLUME_RL_RAID5E;
		*qual = G_RAID_VOLUME_RLQ_R5ERS;
	} else if (strcasecmp(str, "RAID5E") == 0 ||
		   strcasecmp(str, "RAID5E-LA") == 0) {
		*level = G_RAID_VOLUME_RL_RAID5E;
		*qual = G_RAID_VOLUME_RLQ_R5ELA;
	} else if (strcasecmp(str, "RAID5E-LS") == 0) {
		*level = G_RAID_VOLUME_RL_RAID5E;
		*qual = G_RAID_VOLUME_RLQ_R5ELS;
	} else if (strcasecmp(str, "RAID5EE-RA") == 0) {
		*level = G_RAID_VOLUME_RL_RAID5EE;
		*qual = G_RAID_VOLUME_RLQ_R5EERA;
	} else if (strcasecmp(str, "RAID5EE-RS") == 0) {
		*level = G_RAID_VOLUME_RL_RAID5EE;
		*qual = G_RAID_VOLUME_RLQ_R5EERS;
	} else if (strcasecmp(str, "RAID5EE") == 0 ||
		   strcasecmp(str, "RAID5EE-LA") == 0) {
		*level = G_RAID_VOLUME_RL_RAID5EE;
		*qual = G_RAID_VOLUME_RLQ_R5EELA;
	} else if (strcasecmp(str, "RAID5EE-LS") == 0) {
		*level = G_RAID_VOLUME_RL_RAID5EE;
		*qual = G_RAID_VOLUME_RLQ_R5EELS;
	} else if (strcasecmp(str, "RAID5R-RA") == 0) {
		*level = G_RAID_VOLUME_RL_RAID5R;
		*qual = G_RAID_VOLUME_RLQ_R5RRA;
	} else if (strcasecmp(str, "RAID5R-RS") == 0) {
		*level = G_RAID_VOLUME_RL_RAID5R;
		*qual = G_RAID_VOLUME_RLQ_R5RRS;
	} else if (strcasecmp(str, "RAID5R") == 0 ||
		   strcasecmp(str, "RAID5R-LA") == 0) {
		*level = G_RAID_VOLUME_RL_RAID5R;
		*qual = G_RAID_VOLUME_RLQ_R5RLA;
	} else if (strcasecmp(str, "RAID5R-LS") == 0) {
		*level = G_RAID_VOLUME_RL_RAID5R;
		*qual = G_RAID_VOLUME_RLQ_R5RLS;
	} else
		return (-1);
	return (0);
}

const char *
g_raid_get_diskname(struct g_raid_disk *disk)
{

	if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL)
		return ("[unknown]");
	return (disk->d_consumer->provider->name);
}

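/*
 * Report the aggregated state of the disk's subdisks to the consumer
 * via the GEOM::setstate attribute, so that lower layers can react to
 * FAILED/REBUILD/RESYNC/ACTIVE transitions (for example, hardware able
 * to drive status LEDs).
 */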
void
g_raid_report_disk_state(struct g_raid_disk *disk)
{
	struct g_raid_subdisk *sd;
	int len, state;
	uint32_t s;

	if (disk->d_consumer == NULL)
		return;
	if (disk->d_state == G_RAID_DISK_S_FAILED ||
	    disk->d_state == G_RAID_DISK_S_STALE_FAILED) {
		s = G_STATE_FAILED;
	} else {
		state = G_RAID_SUBDISK_S_ACTIVE;
		TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
			if (sd->sd_state < state)
				state = sd->sd_state;
		}
		if (state == G_RAID_SUBDISK_S_FAILED)
			s = G_STATE_FAILED;
		else if (state == G_RAID_SUBDISK_S_NEW ||
		    state == G_RAID_SUBDISK_S_REBUILD)
			s = G_STATE_REBUILD;
		else if (state == G_RAID_SUBDISK_S_STALE ||
		    state == G_RAID_SUBDISK_S_RESYNC)
			s = G_STATE_RESYNC;
		else
			s = G_STATE_ACTIVE;
	}
	len = sizeof(s);
	g_io_getattr("GEOM::setstate", disk->d_consumer, &len, &s);
	G_RAID_DEBUG1(2, disk->d_softc, "Disk %s state reported as %d.",
	    g_raid_get_diskname(disk), s);
}

void
g_raid_change_disk_state(struct g_raid_disk *disk, int state)
{

	G_RAID_DEBUG1(0, disk->d_softc, "Disk %s state changed from %s to %s.",
	    g_raid_get_diskname(disk),
	    g_raid_disk_state2str(disk->d_state),
	    g_raid_disk_state2str(state));
	disk->d_state = state;
	g_raid_report_disk_state(disk);
}

void
g_raid_change_subdisk_state(struct g_raid_subdisk *sd, int state)
{

	G_RAID_DEBUG1(0, sd->sd_softc,
	    "Subdisk %s:%d-%s state changed from %s to %s.",
	    sd->sd_volume->v_name, sd->sd_pos,
	    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]",
	    g_raid_subdisk_state2str(sd->sd_state),
	    g_raid_subdisk_state2str(state));
	sd->sd_state = state;
	if (sd->sd_disk)
		g_raid_report_disk_state(sd->sd_disk);
}

void
g_raid_change_volume_state(struct g_raid_volume *vol, int state)
{

	G_RAID_DEBUG1(0, vol->v_softc,
	    "Volume %s state changed from %s to %s.",
	    vol->v_name,
	    g_raid_volume_state2str(vol->v_state),
	    g_raid_volume_state2str(state));
	vol->v_state = state;
}
/*
 * --- Event handling functions ---
 * Events in geom_raid are used to maintain the status of subdisks and
 * volumes from a single thread, which simplifies locking.
 */
static void
g_raid_event_free(struct g_raid_event *ep)
{

	free(ep, M_RAID);
}

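/*
 * Queue an event for the worker thread.  The target argument is
 * interpreted according to the G_RAID_EVENT_* type flag.  With
 * G_RAID_EVENT_WAIT the caller must hold the node lock exclusively;
 * it is dropped while sleeping until the worker marks the event done,
 * and the event's error code is returned.
 */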
int
g_raid_event_send(void *arg, int event, int flags)
{
	struct g_raid_softc *sc;
	struct g_raid_event *ep;
	int error;

	if ((flags & G_RAID_EVENT_VOLUME) != 0) {
		sc = ((struct g_raid_volume *)arg)->v_softc;
	} else if ((flags & G_RAID_EVENT_DISK) != 0) {
		sc = ((struct g_raid_disk *)arg)->d_softc;
	} else if ((flags & G_RAID_EVENT_SUBDISK) != 0) {
		sc = ((struct g_raid_subdisk *)arg)->sd_softc;
	} else {
		sc = arg;
	}
	ep = malloc(sizeof(*ep), M_RAID,
	    sx_xlocked(&sc->sc_lock) ? M_WAITOK : M_NOWAIT);
	if (ep == NULL)
		return (ENOMEM);
	ep->e_tgt = arg;
	ep->e_event = event;
	ep->e_flags = flags;
	ep->e_error = 0;
	G_RAID_DEBUG1(4, sc, "Sending event %p. Waking up %p.", ep, sc);
	mtx_lock(&sc->sc_queue_mtx);
	TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next);
	mtx_unlock(&sc->sc_queue_mtx);
	wakeup(sc);

	if ((flags & G_RAID_EVENT_WAIT) == 0)
		return (0);

	sx_assert(&sc->sc_lock, SX_XLOCKED);
	G_RAID_DEBUG1(4, sc, "Sleeping on %p.", ep);
	sx_xunlock(&sc->sc_lock);
	while ((ep->e_flags & G_RAID_EVENT_DONE) == 0) {
		mtx_lock(&sc->sc_queue_mtx);
		MSLEEP(error, ep, &sc->sc_queue_mtx, PRIBIO | PDROP, "m:event",
		    hz * 5);
	}
	error = ep->e_error;
	g_raid_event_free(ep);
	sx_xlock(&sc->sc_lock);
	return (error);
}

static void
g_raid_event_cancel(struct g_raid_softc *sc, void *tgt)
{
	struct g_raid_event *ep, *tmpep;

	sx_assert(&sc->sc_lock, SX_XLOCKED);

	mtx_lock(&sc->sc_queue_mtx);
	TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) {
		if (ep->e_tgt != tgt)
			continue;
		TAILQ_REMOVE(&sc->sc_events, ep, e_next);
		if ((ep->e_flags & G_RAID_EVENT_WAIT) == 0)
			g_raid_event_free(ep);
		else {
			ep->e_error = ECANCELED;
			wakeup(ep);
		}
	}
	mtx_unlock(&sc->sc_queue_mtx);
}

static int
g_raid_event_check(struct g_raid_softc *sc, void *tgt)
{
	struct g_raid_event *ep;
	int	res = 0;

	sx_assert(&sc->sc_lock, SX_XLOCKED);

	mtx_lock(&sc->sc_queue_mtx);
	TAILQ_FOREACH(ep, &sc->sc_events, e_next) {
		if (ep->e_tgt != tgt)
			continue;
		res = 1;
		break;
	}
	mtx_unlock(&sc->sc_queue_mtx);
	return (res);
}

/*
 * Return the number of disks in given state.
 * If state is equal to -1, count all connected disks.
 */
u_int
g_raid_ndisks(struct g_raid_softc *sc, int state)
{
	struct g_raid_disk *disk;
	u_int n;

	sx_assert(&sc->sc_lock, SX_LOCKED);

	n = 0;
	TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
		if (disk->d_state == state || state == -1)
			n++;
	}
	return (n);
}

/*
 * Return the number of subdisks in given state.
 * If state is equal to -1, count all connected subdisks.
 */
u_int
g_raid_nsubdisks(struct g_raid_volume *vol, int state)
{
	struct g_raid_subdisk *subdisk;
	struct g_raid_softc *sc;
	u_int i, n;

	sc = vol->v_softc;
	sx_assert(&sc->sc_lock, SX_LOCKED);

	n = 0;
	for (i = 0; i < vol->v_disks_count; i++) {
		subdisk = &vol->v_subdisks[i];
		if ((state == -1 &&
		     subdisk->sd_state != G_RAID_SUBDISK_S_NONE) ||
		    subdisk->sd_state == state)
			n++;
	}
	return (n);
}

/*
 * Return the first subdisk in given state.
 * If state is equal to -1, return the first connected subdisk.
 */
struct g_raid_subdisk *
g_raid_get_subdisk(struct g_raid_volume *vol, int state)
{
	struct g_raid_subdisk *sd;
	struct g_raid_softc *sc;
	u_int i;

	sc = vol->v_softc;
	sx_assert(&sc->sc_lock, SX_LOCKED);

	for (i = 0; i < vol->v_disks_count; i++) {
		sd = &vol->v_subdisks[i];
		if ((state == -1 &&
		     sd->sd_state != G_RAID_SUBDISK_S_NONE) ||
		    sd->sd_state == state)
			return (sd);
	}
	return (NULL);
}

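/*
 * Attach and open (r1w1e1) a consumer for the named provider on behalf
 * of the node's geom.  Returns NULL if the provider does not exist or
 * cannot be opened.
 */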
struct g_consumer *
g_raid_open_consumer(struct g_raid_softc *sc, const char *name)
{
	struct g_consumer *cp;
	struct g_provider *pp;

	g_topology_assert();

	if (strncmp(name, "/dev/", 5) == 0)
		name += 5;
	pp = g_provider_by_name(name);
	if (pp == NULL)
		return (NULL);
	cp = g_new_consumer(sc->sc_geom);
	if (g_attach(cp, pp) != 0) {
		g_destroy_consumer(cp);
		return (NULL);
	}
	if (g_access(cp, 1, 1, 1) != 0) {
		g_detach(cp);
		g_destroy_consumer(cp);
		return (NULL);
	}
	return (cp);
}

static u_int
g_raid_nrequests(struct g_raid_softc *sc, struct g_consumer *cp)
{
	struct bio *bp;
	u_int nreqs = 0;

	mtx_lock(&sc->sc_queue_mtx);
	TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) {
		if (bp->bio_from == cp)
			nreqs++;
	}
	mtx_unlock(&sc->sc_queue_mtx);
	return (nreqs);
}

u_int
g_raid_nopens(struct g_raid_softc *sc)
{
	struct g_raid_volume *vol;
	u_int opens;

	opens = 0;
	TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
		if (vol->v_provider_open != 0)
			opens++;
	}
	return (opens);
}

static int
g_raid_consumer_is_busy(struct g_raid_softc *sc, struct g_consumer *cp)
{

	if (cp->index > 0) {
		G_RAID_DEBUG1(2, sc,
		    "I/O requests for %s exist, can't destroy it now.",
		    cp->provider->name);
		return (1);
	}
	if (g_raid_nrequests(sc, cp) > 0) {
		G_RAID_DEBUG1(2, sc,
		    "I/O requests for %s in queue, can't destroy it now.",
		    cp->provider->name);
		return (1);
	}
	return (0);
}

static void
g_raid_destroy_consumer(void *arg, int flags __unused)
{
	struct g_consumer *cp;

	g_topology_assert();

	cp = arg;
	G_RAID_DEBUG(1, "Consumer %s destroyed.", cp->provider->name);
	g_detach(cp);
	g_destroy_consumer(cp);
}

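/*
 * Close and destroy a consumer, unless it still has requests in flight
 * or queued.  A consumer closed for write is destroyed from a separate
 * event instead, so that the retaste triggered by our own close is not
 * picked up again.
 */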
void
g_raid_kill_consumer(struct g_raid_softc *sc, struct g_consumer *cp)
{
	struct g_provider *pp;
	int retaste_wait;

	g_topology_assert_not();

	g_topology_lock();
	cp->private = NULL;
	if (g_raid_consumer_is_busy(sc, cp))
		goto out;
	pp = cp->provider;
	retaste_wait = 0;
	if (cp->acw == 1) {
		if ((pp->geom->flags & G_GEOM_WITHER) == 0)
			retaste_wait = 1;
	}
	if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0)
		g_access(cp, -cp->acr, -cp->acw, -cp->ace);
	if (retaste_wait) {
		/*
		 * After the retaste event was sent (inside g_access()), we
		 * can send the event to detach and destroy the consumer.
		 * A class that has a consumer attached to the given provider
		 * will not receive the retaste event for that provider.
		 * This is how we ignore retaste events for consumers opened
		 * for write: the consumer is detached and destroyed after
		 * the retaste event is sent.
		 */
		g_post_event(g_raid_destroy_consumer, cp, M_WAITOK, NULL);
		goto out;
	}
	G_RAID_DEBUG(1, "Consumer %s destroyed.", pp->name);
	g_detach(cp);
	g_destroy_consumer(cp);
out:
	g_topology_unlock();
}

static void
g_raid_orphan(struct g_consumer *cp)
{
	struct g_raid_disk *disk;

	g_topology_assert();

	disk = cp->private;
	if (disk == NULL)
		return;
	g_raid_event_send(disk, G_RAID_DISK_E_DISCONNECTED,
	    G_RAID_EVENT_DISK);
}

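/*
 * Try to mark the volume as clean in metadata once it is idle.  If the
 * volume is still open for writing and was written to recently, return
 * the number of seconds after which to retry instead.
 */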
static int
g_raid_clean(struct g_raid_volume *vol, int acw)
{
	struct g_raid_softc *sc;
	int timeout;

	sc = vol->v_softc;
	g_topology_assert_not();
	sx_assert(&sc->sc_lock, SX_XLOCKED);

//	if ((sc->sc_flags & G_RAID_DEVICE_FLAG_NOFAILSYNC) != 0)
//		return (0);
	if (!vol->v_dirty)
		return (0);
	if (vol->v_writes > 0)
		return (0);
	if (acw > 0 || (acw == -1 &&
	    vol->v_provider != NULL && vol->v_provider->acw > 0)) {
		timeout = g_raid_clean_time - (time_uptime - vol->v_last_write);
		if (timeout > 0)
			return (timeout);
	}
	vol->v_dirty = 0;
	G_RAID_DEBUG1(1, sc, "Volume %s marked as clean.",
	    vol->v_name);
	g_raid_write_metadata(sc, vol, NULL, NULL);
	return (0);
}

static void
g_raid_dirty(struct g_raid_volume *vol)
{
	struct g_raid_softc *sc;

	sc = vol->v_softc;
	g_topology_assert_not();
	sx_assert(&sc->sc_lock, SX_XLOCKED);

//	if ((sc->sc_flags & G_RAID_DEVICE_FLAG_NOFAILSYNC) != 0)
//		return;
	vol->v_dirty = 1;
	G_RAID_DEBUG1(1, sc, "Volume %s marked as dirty.",
	    vol->v_name);
	g_raid_write_metadata(sc, vol, NULL, NULL);
}

void
g_raid_tr_flush_common(struct g_raid_tr_object *tr, struct bio *bp)
{
	struct g_raid_softc *sc;
	struct g_raid_volume *vol;
	struct g_raid_subdisk *sd;
	struct bio_queue_head queue;
	struct bio *cbp;
	int i;

	vol = tr->tro_volume;
	sc = vol->v_softc;

	/*
	 * Allocate all bios before sending any request, so we can return
	 * ENOMEM in a nice and clean way.
	 */
	bioq_init(&queue);
	for (i = 0; i < vol->v_disks_count; i++) {
		sd = &vol->v_subdisks[i];
		if (sd->sd_state == G_RAID_SUBDISK_S_NONE ||
		    sd->sd_state == G_RAID_SUBDISK_S_FAILED)
			continue;
		cbp = g_clone_bio(bp);
		if (cbp == NULL)
			goto failure;
		cbp->bio_caller1 = sd;
		bioq_insert_tail(&queue, cbp);
	}
	for (cbp = bioq_first(&queue); cbp != NULL;
	    cbp = bioq_first(&queue)) {
		bioq_remove(&queue, cbp);
		sd = cbp->bio_caller1;
		cbp->bio_caller1 = NULL;
		g_raid_subdisk_iostart(sd, cbp);
	}
	return;
failure:
	for (cbp = bioq_first(&queue); cbp != NULL;
	    cbp = bioq_first(&queue)) {
		bioq_remove(&queue, cbp);
		g_destroy_bio(cbp);
	}
	if (bp->bio_error == 0)
		bp->bio_error = ENOMEM;
	g_raid_iodone(bp, bp->bio_error);
}

static void
g_raid_tr_kerneldump_common_done(struct bio *bp)
{

	bp->bio_flags |= BIO_DONE;
}

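/*
 * Common transformation-layer kerneldump handler: issue a synthetic
 * BIO_WRITE through g_raid_start() and poll the node for completion,
 * since the normal I/O path is unavailable while dumping.
 */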
int
g_raid_tr_kerneldump_common(struct g_raid_tr_object *tr,
    void *virtual, vm_offset_t physical, off_t offset, size_t length)
{
	struct g_raid_softc *sc;
	struct g_raid_volume *vol;
	struct bio bp;

	vol = tr->tro_volume;
	sc = vol->v_softc;

	bzero(&bp, sizeof(bp));
	bp.bio_cmd = BIO_WRITE;
	bp.bio_done = g_raid_tr_kerneldump_common_done;
	bp.bio_attribute = NULL;
	bp.bio_offset = offset;
	bp.bio_length = length;
	bp.bio_data = virtual;
	bp.bio_to = vol->v_provider;

	g_raid_start(&bp);
	while (!(bp.bio_flags & BIO_DONE)) {
		G_RAID_DEBUG1(4, sc, "Poll...");
		g_raid_poll(sc);
		DELAY(10);
	}

	return (bp.bio_error != 0 ? EIO : 0);
}

static int
g_raid_dump(void *arg,
    void *virtual, vm_offset_t physical, off_t offset, size_t length)
{
	struct g_raid_volume *vol;
	int error;

	vol = (struct g_raid_volume *)arg;
	G_RAID_DEBUG1(3, vol->v_softc, "Dumping at off %llu len %llu.",
	    (long long unsigned)offset, (long long unsigned)length);

	error = G_RAID_TR_KERNELDUMP(vol->v_tr,
	    virtual, physical, offset, length);
	return (error);
}

static void
g_raid_kerneldump(struct g_raid_softc *sc, struct bio *bp)
{
	struct g_kerneldump *gkd;
	struct g_provider *pp;
	struct g_raid_volume *vol;

	gkd = (struct g_kerneldump*)bp->bio_data;
	pp = bp->bio_to;
	vol = pp->private;
	g_trace(G_T_TOPOLOGY, "g_raid_kerneldump(%s, %jd, %jd)",
		pp->name, (intmax_t)gkd->offset, (intmax_t)gkd->length);
	gkd->di.dumper = g_raid_dump;
	gkd->di.priv = vol;
	gkd->di.blocksize = vol->v_sectorsize;
	gkd->di.maxiosize = DFLTPHYS;
	gkd->di.mediaoffset = gkd->offset;
	if ((gkd->offset + gkd->length) > vol->v_mediasize)
		gkd->length = vol->v_mediasize - gkd->offset;
	gkd->di.mediasize = gkd->length;
	g_io_deliver(bp, 0);
}

static void
g_raid_start(struct bio *bp)
{
	struct g_raid_softc *sc;

	sc = bp->bio_to->geom->softc;
	/*
	 * If sc == NULL or there are no valid disks, provider's error
	 * should be set and g_raid_start() should not be called at all.
	 */
//	KASSERT(sc != NULL && sc->sc_state == G_RAID_VOLUME_S_RUNNING,
//	    ("Provider's error should be set (error=%d)(mirror=%s).",
//	    bp->bio_to->error, bp->bio_to->name));
	G_RAID_LOGREQ(3, bp, "Request received.");

	switch (bp->bio_cmd) {
	case BIO_READ:
	case BIO_WRITE:
	case BIO_DELETE:
	case BIO_FLUSH:
		break;
	case BIO_GETATTR:
		if (!strcmp(bp->bio_attribute, "GEOM::kerneldump"))
			g_raid_kerneldump(sc, bp);
		else
			g_io_deliver(bp, EOPNOTSUPP);
		return;
	default:
		g_io_deliver(bp, EOPNOTSUPP);
		return;
	}
	mtx_lock(&sc->sc_queue_mtx);
	bioq_disksort(&sc->sc_queue, bp);
	mtx_unlock(&sc->sc_queue_mtx);
	if (!dumping) {
		G_RAID_DEBUG1(4, sc, "Waking up %p.", sc);
		wakeup(sc);
	}
}

static int
g_raid_bio_overlaps(const struct bio *bp, off_t lstart, off_t len)
{
	/*
	 * 5 cases:
	 * (1) bp entirely below NO
	 * (2) bp entirely above NO
	 * (3) bp start below, but end in range YES
	 * (4) bp entirely within YES
	 * (5) bp starts within, ends above YES
	 *
	 * lock range 10-19 (offset 10 length 10)
	 * (1) 1-5: first if kicks it out
	 * (2) 30-35: second if kicks it out
	 * (3) 5-15: passes both ifs
	 * (4) 12-14: passes both ifs
	 * (5) 19-20: passes both
	 */
	off_t lend = lstart + len - 1;
	off_t bstart = bp->bio_offset;
	off_t bend = bp->bio_offset + bp->bio_length - 1;

	if (bend < lstart)
		return (0);
	if (lend < bstart)
		return (0);
	return (1);
}

static int
g_raid_is_in_locked_range(struct g_raid_volume *vol, const struct bio *bp)
{
	struct g_raid_lock *lp;

	sx_assert(&vol->v_softc->sc_lock, SX_LOCKED);

	LIST_FOREACH(lp, &vol->v_locks, l_next) {
		if (g_raid_bio_overlaps(bp, lp->l_offset, lp->l_length))
			return (1);
	}
	return (0);
}

static void
g_raid_start_request(struct bio *bp)
{
	struct g_raid_softc *sc;
	struct g_raid_volume *vol;

	sc = bp->bio_to->geom->softc;
	sx_assert(&sc->sc_lock, SX_LOCKED);
	vol = bp->bio_to->private;

	/*
	 * Check to see if this item is in a locked range.  If so,
	 * queue it to our locked queue and return.  We'll requeue
	 * it when the range is unlocked.  Internal I/O for the
	 * rebuild/rescan/recovery process is excluded from this
	 * check so we can actually do the recovery.
	 */
	if (!(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL) &&
	    g_raid_is_in_locked_range(vol, bp)) {
		G_RAID_LOGREQ(3, bp, "Defer request.");
		bioq_insert_tail(&vol->v_locked, bp);
		return;
	}

	/*
	 * If we're actually going to do the write/delete, then
	 * update the idle stats for the volume.
	 */
	if (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_DELETE) {
		if (!vol->v_dirty)
			g_raid_dirty(vol);
		vol->v_writes++;
	}

	/*
	 * Put request onto inflight queue, so we can check if new
	 * synchronization requests don't collide with it.  Then tell
	 * the transformation layer to start the I/O.
	 */
	bioq_insert_tail(&vol->v_inflight, bp);
	G_RAID_LOGREQ(4, bp, "Request started");
	G_RAID_TR_IOSTART(vol->v_tr, bp);
}

static void
g_raid_finish_with_locked_ranges(struct g_raid_volume *vol, struct bio *bp)
{
	off_t off, len;
	struct bio *nbp;
	struct g_raid_lock *lp;

	vol->v_pending_lock = 0;
	LIST_FOREACH(lp, &vol->v_locks, l_next) {
		if (lp->l_pending) {
			off = lp->l_offset;
			len = lp->l_length;
			lp->l_pending = 0;
			TAILQ_FOREACH(nbp, &vol->v_inflight.queue, bio_queue) {
				if (g_raid_bio_overlaps(nbp, off, len))
					lp->l_pending++;
			}
			if (lp->l_pending) {
				vol->v_pending_lock = 1;
				G_RAID_DEBUG1(4, vol->v_softc,
				    "Deferred lock(%jd, %jd) has %d pending",
				    (intmax_t)off, (intmax_t)(off + len),
				    lp->l_pending);
				continue;
			}
			G_RAID_DEBUG1(4, vol->v_softc,
			    "Deferred lock of %jd to %jd completed",
			    (intmax_t)off, (intmax_t)(off + len));
			G_RAID_TR_LOCKED(vol->v_tr, lp->l_callback_arg);
		}
	}
}

void
g_raid_iodone(struct bio *bp, int error)
{
	struct g_raid_softc *sc;
	struct g_raid_volume *vol;

	sc = bp->bio_to->geom->softc;
	sx_assert(&sc->sc_lock, SX_LOCKED);
	vol = bp->bio_to->private;
	G_RAID_LOGREQ(3, bp, "Request done: %d.", error);

	/* Update stats if we did a write or delete. */
	if (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_DELETE) {
		vol->v_writes--;
		vol->v_last_write = time_uptime;
	}

	bioq_remove(&vol->v_inflight, bp);
	if (vol->v_pending_lock && g_raid_is_in_locked_range(vol, bp))
		g_raid_finish_with_locked_ranges(vol, bp);
	getmicrouptime(&vol->v_last_done);
	g_io_deliver(bp, error);
}

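/*
 * Lock a range of the volume against regular I/O, as used by rebuild
 * and other recovery operations.  If requests overlapping the range are
 * still in flight, the lock is recorded as pending and EBUSY returned;
 * the G_RAID_TR_LOCKED() callback fires once the range gets quiet.
 */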
int
g_raid_lock_range(struct g_raid_volume *vol, off_t off, off_t len,
    struct bio *ignore, void *argp)
{
	struct g_raid_softc *sc;
	struct g_raid_lock *lp;
	struct bio *bp;

	sc = vol->v_softc;
	lp = malloc(sizeof(*lp), M_RAID, M_WAITOK | M_ZERO);
	LIST_INSERT_HEAD(&vol->v_locks, lp, l_next);
	lp->l_offset = off;
	lp->l_length = len;
	lp->l_callback_arg = argp;

	lp->l_pending = 0;
	TAILQ_FOREACH(bp, &vol->v_inflight.queue, bio_queue) {
		if (bp != ignore && g_raid_bio_overlaps(bp, off, len))
			lp->l_pending++;
	}

	/*
	 * If there are any writes that are pending, we return EBUSY.  All
	 * callers will have to wait until all pending writes clear.
	 */
	if (lp->l_pending > 0) {
		vol->v_pending_lock = 1;
		G_RAID_DEBUG1(4, sc, "Locking range %jd to %jd deferred %d pend",
		    (intmax_t)off, (intmax_t)(off+len), lp->l_pending);
		return (EBUSY);
	}
	G_RAID_DEBUG1(4, sc, "Locking range %jd to %jd",
	    (intmax_t)off, (intmax_t)(off+len));
	G_RAID_TR_LOCKED(vol->v_tr, lp->l_callback_arg);
	return (0);
}

int
g_raid_unlock_range(struct g_raid_volume *vol, off_t off, off_t len)
{
	struct g_raid_lock *lp;
	struct g_raid_softc *sc;
	struct bio *bp;

	sc = vol->v_softc;
	LIST_FOREACH(lp, &vol->v_locks, l_next) {
		if (lp->l_offset == off && lp->l_length == len) {
			LIST_REMOVE(lp, l_next);
			/* XXX
			 * Right now we just put them all back on the queue
			 * and hope for the best.  We hope this because any
			 * locked ranges will go right back on this list
			 * when the worker thread runs.
			 * XXX
			 */
			G_RAID_DEBUG1(4, sc, "Unlocked %jd to %jd",
			    (intmax_t)lp->l_offset,
			    (intmax_t)(lp->l_offset+lp->l_length));
			mtx_lock(&sc->sc_queue_mtx);
			while ((bp = bioq_takefirst(&vol->v_locked)) != NULL)
				bioq_disksort(&sc->sc_queue, bp);
			mtx_unlock(&sc->sc_queue_mtx);
			free(lp, M_RAID);
			return (0);
		}
	}
	return (EINVAL);
}

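/*
 * Send an I/O request down to the disk backing a subdisk, translating
 * the offset by the subdisk's position on the disk and updating the
 * per-disk load statistics.  While dumping, writes are performed
 * synchronously via the kerneldump path instead of g_io_request().
 */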
void
g_raid_subdisk_iostart(struct g_raid_subdisk *sd, struct bio *bp)
{
	struct g_consumer *cp;
	struct g_raid_disk *disk, *tdisk;

	bp->bio_caller1 = sd;

	/*
	 * Make sure that the disk is present.  Generally it is a task of
	 * transformation layers to not send requests to absent disks, but
	 * it is better to be safe and report the situation than be sorry.
	 */
	if (sd->sd_disk == NULL) {
		G_RAID_LOGREQ(0, bp, "Warning! I/O request to an absent disk!");
nodisk:
		bp->bio_from = NULL;
		bp->bio_to = NULL;
		bp->bio_error = ENXIO;
		g_raid_disk_done(bp);
		return;
	}
	disk = sd->sd_disk;
	if (disk->d_state != G_RAID_DISK_S_ACTIVE &&
	    disk->d_state != G_RAID_DISK_S_FAILED) {
		G_RAID_LOGREQ(0, bp, "Warning! I/O request to a disk in a "
		    "wrong state (%s)!", g_raid_disk_state2str(disk->d_state));
		goto nodisk;
	}

	cp = disk->d_consumer;
	bp->bio_from = cp;
	bp->bio_to = cp->provider;
	cp->index++;

	/* Update average disks load. */
	TAILQ_FOREACH(tdisk, &sd->sd_softc->sc_disks, d_next) {
		if (tdisk->d_consumer == NULL)
			tdisk->d_load = 0;
		else
			tdisk->d_load = (tdisk->d_consumer->index *
			    G_RAID_SUBDISK_LOAD_SCALE + tdisk->d_load * 7) / 8;
	}

	disk->d_last_offset = bp->bio_offset + bp->bio_length;
	if (dumping) {
		G_RAID_LOGREQ(3, bp, "Sending dumping request.");
		if (bp->bio_cmd == BIO_WRITE) {
			bp->bio_error = g_raid_subdisk_kerneldump(sd,
			    bp->bio_data, 0, bp->bio_offset, bp->bio_length);
		} else
			bp->bio_error = EOPNOTSUPP;
		g_raid_disk_done(bp);
	} else {
		bp->bio_done = g_raid_disk_done;
		bp->bio_offset += sd->sd_offset;
		G_RAID_LOGREQ(3, bp, "Sending request.");
		g_io_request(bp, cp);
	}
}

int
g_raid_subdisk_kerneldump(struct g_raid_subdisk *sd,
    void *virtual, vm_offset_t physical, off_t offset, size_t length)
{

	if (sd->sd_disk == NULL)
		return (ENXIO);
	if (sd->sd_disk->d_kd.di.dumper == NULL)
		return (EOPNOTSUPP);
	return (dump_write(&sd->sd_disk->d_kd.di,
	    virtual, physical,
	    sd->sd_disk->d_kd.di.mediaoffset + sd->sd_offset + offset,
	    length));
}

static void
g_raid_disk_done(struct bio *bp)
{
	struct g_raid_softc *sc;
	struct g_raid_subdisk *sd;

	sd = bp->bio_caller1;
	sc = sd->sd_softc;
	mtx_lock(&sc->sc_queue_mtx);
	bioq_disksort(&sc->sc_queue, bp);
	mtx_unlock(&sc->sc_queue_mtx);
	if (!dumping)
		wakeup(sc);
}

static void
g_raid_disk_done_request(struct bio *bp)
{
	struct g_raid_softc *sc;
	struct g_raid_disk *disk;
	struct g_raid_subdisk *sd;
	struct g_raid_volume *vol;

	g_topology_assert_not();

	G_RAID_LOGREQ(3, bp, "Disk request done: %d.", bp->bio_error);
	sd = bp->bio_caller1;
	sc = sd->sd_softc;
	vol = sd->sd_volume;
	if (bp->bio_from != NULL) {
		bp->bio_from->index--;
		disk = bp->bio_from->private;
		if (disk == NULL)
			g_raid_kill_consumer(sc, bp->bio_from);
	}
	bp->bio_offset -= sd->sd_offset;

	G_RAID_TR_IODONE(vol->v_tr, sd, bp);
}

static void
g_raid_handle_event(struct g_raid_softc *sc, struct g_raid_event *ep)
{

	if ((ep->e_flags & G_RAID_EVENT_VOLUME) != 0)
		ep->e_error = g_raid_update_volume(ep->e_tgt, ep->e_event);
	else if ((ep->e_flags & G_RAID_EVENT_DISK) != 0)
		ep->e_error = g_raid_update_disk(ep->e_tgt, ep->e_event);
	else if ((ep->e_flags & G_RAID_EVENT_SUBDISK) != 0)
		ep->e_error = g_raid_update_subdisk(ep->e_tgt, ep->e_event);
	else
		ep->e_error = g_raid_update_node(ep->e_tgt, ep->e_event);
	if ((ep->e_flags & G_RAID_EVENT_WAIT) == 0) {
		KASSERT(ep->e_error == 0,
		    ("Error cannot be handled."));
		g_raid_event_free(ep);
	} else {
		ep->e_flags |= G_RAID_EVENT_DONE;
		G_RAID_DEBUG1(4, sc, "Waking up %p.", ep);
		mtx_lock(&sc->sc_queue_mtx);
		wakeup(ep);
		mtx_unlock(&sc->sc_queue_mtx);
	}
}

/*
 * Worker thread.
 */
static void
g_raid_worker(void *arg)
{
	struct g_raid_softc *sc;
	struct g_raid_event *ep;
	struct g_raid_volume *vol;
	struct bio *bp;
	struct timeval now, t;
	int timeout, rv;

	sc = arg;
	thread_lock(curthread);
	sched_prio(curthread, PRIBIO);
	thread_unlock(curthread);

	sx_xlock(&sc->sc_lock);
	for (;;) {
		mtx_lock(&sc->sc_queue_mtx);
		/*
		 * First take a look at events.
		 * This is important to handle events before any I/O requests.
		 */
		bp = NULL;
		vol = NULL;
		rv = 0;
		ep = TAILQ_FIRST(&sc->sc_events);
		if (ep != NULL)
			TAILQ_REMOVE(&sc->sc_events, ep, e_next);
		else if ((bp = bioq_takefirst(&sc->sc_queue)) != NULL)
			;
		else {
			getmicrouptime(&now);
			t = now;
			TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
				if (bioq_first(&vol->v_inflight) == NULL &&
				    vol->v_tr &&
				    timevalcmp(&vol->v_last_done, &t, < ))
					t = vol->v_last_done;
			}
			timevalsub(&t, &now);
			timeout = g_raid_idle_threshold +
			    t.tv_sec * 1000000 + t.tv_usec;
			if (timeout > 0) {
				/*
				 * Two steps to avoid overflows at HZ=1000
				 * and idle timeouts > 2.1s.  Some rounding
				 * errors can occur, but they are < 1tick,
				 * which is deemed to be close enough for
				 * this purpose.
				 */
				int micpertic = 1000000 / hz;
				timeout = (timeout + micpertic - 1) / micpertic;
				sx_xunlock(&sc->sc_lock);
				MSLEEP(rv, sc, &sc->sc_queue_mtx,
				    PRIBIO | PDROP, "-", timeout);
				sx_xlock(&sc->sc_lock);
				goto process;
			} else
				rv = EWOULDBLOCK;
		}
		mtx_unlock(&sc->sc_queue_mtx);
process:
		if (ep != NULL) {
			g_raid_handle_event(sc, ep);
		} else if (bp != NULL) {
			if (bp->bio_to != NULL &&
			    bp->bio_to->geom == sc->sc_geom)
				g_raid_start_request(bp);
			else
				g_raid_disk_done_request(bp);
		} else if (rv == EWOULDBLOCK) {
			TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
				if (vol->v_writes == 0 && vol->v_dirty)
					g_raid_clean(vol, -1);
				if (bioq_first(&vol->v_inflight) == NULL &&
				    vol->v_tr) {
					t.tv_sec = g_raid_idle_threshold / 1000000;
					t.tv_usec = g_raid_idle_threshold % 1000000;
					timevaladd(&t, &vol->v_last_done);
					getmicrouptime(&now);
					if (timevalcmp(&t, &now, <= )) {
						G_RAID_TR_IDLE(vol->v_tr);
						vol->v_last_done = now;
					}
				}
			}
		}
		if (sc->sc_stopping == G_RAID_DESTROY_HARD)
			g_raid_destroy_node(sc, 1);	/* May not return. */
	}
}

static void
g_raid_poll(struct g_raid_softc *sc)
{
	struct g_raid_event *ep;
	struct bio *bp;

	sx_xlock(&sc->sc_lock);
	mtx_lock(&sc->sc_queue_mtx);
	/*
	 * First take a look at events.
	 * This is important to handle events before any I/O requests.
	 */
	ep = TAILQ_FIRST(&sc->sc_events);
	if (ep != NULL) {
		TAILQ_REMOVE(&sc->sc_events, ep, e_next);
		mtx_unlock(&sc->sc_queue_mtx);
		g_raid_handle_event(sc, ep);
		goto out;
	}
	bp = bioq_takefirst(&sc->sc_queue);
	if (bp != NULL) {
		mtx_unlock(&sc->sc_queue_mtx);
		if (bp->bio_from == NULL ||
		    bp->bio_from->geom != sc->sc_geom)
			g_raid_start_request(bp);
		else
			g_raid_disk_done_request(bp);
	}
out:
	sx_xunlock(&sc->sc_lock);
}

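/*
 * Create and announce the GEOM provider for a volume.  The provider is
 * named after the volume if possible, otherwise by its sequential
 * global ID; stripe size and offset hints are derived from the
 * underlying disks where that makes sense for the RAID level.
 */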
static void
g_raid_launch_provider(struct g_raid_volume *vol)
{
	struct g_raid_disk *disk;
	struct g_raid_softc *sc;
	struct g_provider *pp;
	char name[G_RAID_MAX_VOLUMENAME];
	off_t off;

	sc = vol->v_softc;
	sx_assert(&sc->sc_lock, SX_LOCKED);

	g_topology_lock();
	/* Try to name provider with volume name. */
	snprintf(name, sizeof(name), "raid/%s", vol->v_name);
	if (g_raid_name_format == 0 || vol->v_name[0] == 0 ||
	    g_provider_by_name(name) != NULL) {
		/* Otherwise use sequential volume number. */
		snprintf(name, sizeof(name), "raid/r%d", vol->v_global_id);
	}
	pp = g_new_providerf(sc->sc_geom, "%s", name);
	pp->private = vol;
	pp->mediasize = vol->v_mediasize;
	pp->sectorsize = vol->v_sectorsize;
	pp->stripesize = 0;
	pp->stripeoffset = 0;
	if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 ||
	    vol->v_raid_level == G_RAID_VOLUME_RL_RAID3 ||
	    vol->v_raid_level == G_RAID_VOLUME_RL_SINGLE ||
	    vol->v_raid_level == G_RAID_VOLUME_RL_CONCAT) {
		if ((disk = vol->v_subdisks[0].sd_disk) != NULL &&
		    disk->d_consumer != NULL &&
		    disk->d_consumer->provider != NULL) {
			pp->stripesize = disk->d_consumer->provider->stripesize;
			off = disk->d_consumer->provider->stripeoffset;
			pp->stripeoffset = off + vol->v_subdisks[0].sd_offset;
			if (off > 0)
				pp->stripeoffset %= off;
		}
		if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID3) {
			pp->stripesize *= (vol->v_disks_count - 1);
			pp->stripeoffset *= (vol->v_disks_count - 1);
		}
	} else
		pp->stripesize = vol->v_strip_size;
	vol->v_provider = pp;
	g_error_provider(pp, 0);
	g_topology_unlock();
	G_RAID_DEBUG1(0, sc, "Provider %s for volume %s created.",
	    pp->name, vol->v_name);
}

static void
g_raid_destroy_provider(struct g_raid_volume *vol)
{
	struct g_raid_softc *sc;
	struct g_provider *pp;
	struct bio *bp, *tmp;

	g_topology_assert_not();
	sc = vol->v_softc;
	pp = vol->v_provider;
	KASSERT(pp != NULL, ("NULL provider (volume=%s).", vol->v_name));

	g_topology_lock();
	g_error_provider(pp, ENXIO);
	mtx_lock(&sc->sc_queue_mtx);
	TAILQ_FOREACH_SAFE(bp, &sc->sc_queue.queue, bio_queue, tmp) {
		if (bp->bio_to != pp)
			continue;
		bioq_remove(&sc->sc_queue, bp);
		g_io_deliver(bp, ENXIO);
	}
	mtx_unlock(&sc->sc_queue_mtx);
	G_RAID_DEBUG1(0, sc, "Provider %s for volume %s destroyed.",
	    pp->name, vol->v_name);
	g_wither_provider(pp, ENXIO);
	g_topology_unlock();
	vol->v_provider = NULL;
}

/*
 * Update device state.
 */
static int
g_raid_update_volume(struct g_raid_volume *vol, u_int event)
{
	struct g_raid_softc *sc;

	sc = vol->v_softc;
	sx_assert(&sc->sc_lock, SX_XLOCKED);

	G_RAID_DEBUG1(2, sc, "Event %s for volume %s.",
	    g_raid_volume_event2str(event),
	    vol->v_name);
	switch (event) {
	case G_RAID_VOLUME_E_DOWN:
		if (vol->v_provider != NULL)
			g_raid_destroy_provider(vol);
		break;
	case G_RAID_VOLUME_E_UP:
		if (vol->v_provider == NULL)
			g_raid_launch_provider(vol);
		break;
	case G_RAID_VOLUME_E_START:
		if (vol->v_tr)
			G_RAID_TR_START(vol->v_tr);
		return (0);
	default:
		if (sc->sc_md)
			G_RAID_MD_VOLUME_EVENT(sc->sc_md, vol, event);
		return (0);
	}

	/* Manage root mount release. */
	if (vol->v_starting) {
		vol->v_starting = 0;
		G_RAID_DEBUG1(1, sc, "root_mount_rel %p", vol->v_rootmount);
		root_mount_rel(vol->v_rootmount);
		vol->v_rootmount = NULL;
	}
	if (vol->v_stopping && vol->v_provider_open == 0)
		g_raid_destroy_volume(vol);
	return (0);
}

/*
 * Update subdisk state.
 */
static int
g_raid_update_subdisk(struct g_raid_subdisk *sd, u_int event)
{
	struct g_raid_softc *sc;
	struct g_raid_volume *vol;

	sc = sd->sd_softc;
	vol = sd->sd_volume;
	sx_assert(&sc->sc_lock, SX_XLOCKED);

	G_RAID_DEBUG1(2, sc, "Event %s for subdisk %s:%d-%s.",
	    g_raid_subdisk_event2str(event),
	    vol->v_name, sd->sd_pos,
	    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
	if (vol->v_tr)
		G_RAID_TR_EVENT(vol->v_tr, sd, event);

	return (0);
}

/*
 * Update disk state.
 */
static int
g_raid_update_disk(struct g_raid_disk *disk, u_int event)
{
	struct g_raid_softc *sc;

	sc = disk->d_softc;
	sx_assert(&sc->sc_lock, SX_XLOCKED);

	G_RAID_DEBUG1(2, sc, "Event %s for disk %s.",
	    g_raid_disk_event2str(event),
	    g_raid_get_diskname(disk));

	if (sc->sc_md)
		G_RAID_MD_EVENT(sc->sc_md, disk, event);
	return (0);
}

/*
 * Node event.
 */
static int
g_raid_update_node(struct g_raid_softc *sc, u_int event)
{
	sx_assert(&sc->sc_lock, SX_XLOCKED);

	G_RAID_DEBUG1(2, sc, "Event %s for the array.",
	    g_raid_node_event2str(event));

	if (event == G_RAID_NODE_E_WAKE)
		return (0);
	if (sc->sc_md)
		G_RAID_MD_EVENT(sc->sc_md, NULL, event);
	return (0);
}

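/*
 * GEOM access method for volume providers: track open counts, deny new
 * opens while the node is stopping, mark the volume clean on last
 * write close, and finish delayed node/volume destruction once the
 * last open goes away.
 */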
static int
g_raid_access(struct g_provider *pp, int acr, int acw, int ace)
{
	struct g_raid_volume *vol;
	struct g_raid_softc *sc;
	int dcw, opens, error = 0;

	g_topology_assert();
	sc = pp->geom->softc;
	vol = pp->private;
	KASSERT(sc != NULL, ("NULL softc (provider=%s).", pp->name));
	KASSERT(vol != NULL, ("NULL volume (provider=%s).", pp->name));

	G_RAID_DEBUG1(2, sc, "Access request for %s: r%dw%de%d.", pp->name,
	    acr, acw, ace);
	dcw = pp->acw + acw;

	g_topology_unlock();
	sx_xlock(&sc->sc_lock);
	/* Deny new opens while dying. */
	if (sc->sc_stopping != 0 && (acr > 0 || acw > 0 || ace > 0)) {
		error = ENXIO;
		goto out;
	}
	if (dcw == 0 && vol->v_dirty)
		g_raid_clean(vol, dcw);
	vol->v_provider_open += acr + acw + ace;
	/* Handle delayed node destruction. */
	if (sc->sc_stopping == G_RAID_DESTROY_DELAYED &&
	    vol->v_provider_open == 0) {
		/* Count open volumes. */
		opens = g_raid_nopens(sc);
		if (opens == 0) {
			sc->sc_stopping = G_RAID_DESTROY_HARD;
			/* Wake up worker to make it selfdestruct. */
			g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0);
		}
	}
	/* Handle open volume destruction. */
	if (vol->v_stopping && vol->v_provider_open == 0)
		g_raid_destroy_volume(vol);
out:
	sx_xunlock(&sc->sc_lock);
	g_topology_lock();
	return (error);
}

struct g_raid_softc *
g_raid_create_node(struct g_class *mp,
    const char *name, struct g_raid_md_object *md)
{
	struct g_raid_softc *sc;
	struct g_geom *gp;
	int error;

	g_topology_assert();
	G_RAID_DEBUG(1, "Creating array %s.", name);

	gp = g_new_geomf(mp, "%s", name);
	sc = malloc(sizeof(*sc), M_RAID, M_WAITOK | M_ZERO);
	gp->start = g_raid_start;
	gp->orphan = g_raid_orphan;
	gp->access = g_raid_access;
	gp->dumpconf = g_raid_dumpconf;

	sc->sc_md = md;
	sc->sc_geom = gp;
	sc->sc_flags = 0;
	TAILQ_INIT(&sc->sc_volumes);
	TAILQ_INIT(&sc->sc_disks);
	sx_init(&sc->sc_lock, "graid:lock");
	mtx_init(&sc->sc_queue_mtx, "graid:queue", NULL, MTX_DEF);
	TAILQ_INIT(&sc->sc_events);
	bioq_init(&sc->sc_queue);
	gp->softc = sc;
	error = kproc_create(g_raid_worker, sc, &sc->sc_worker, 0, 0,
	    "g_raid %s", name);
	if (error != 0) {
		G_RAID_DEBUG(0, "Cannot create kernel thread for %s.", name);
		mtx_destroy(&sc->sc_queue_mtx);
		sx_destroy(&sc->sc_lock);
		g_destroy_geom(sc->sc_geom);
		free(sc, M_RAID);
		return (NULL);
	}

	G_RAID_DEBUG1(0, sc, "Array %s created.", name);
	return (sc);
}

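/*
 * Allocate and initialize a volume on the node.  A non-negative and
 * still unused id is kept; otherwise the first free global ID is
 * assigned.  Root mounting is held until the volume finishes starting.
 */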
struct g_raid_volume *
g_raid_create_volume(struct g_raid_softc *sc, const char *name, int id)
{
	struct g_raid_volume	*vol, *vol1;
	int i;

	G_RAID_DEBUG1(1, sc, "Creating volume %s.", name);
	vol = malloc(sizeof(*vol), M_RAID, M_WAITOK | M_ZERO);
	vol->v_softc = sc;
	strlcpy(vol->v_name, name, G_RAID_MAX_VOLUMENAME);
	vol->v_state = G_RAID_VOLUME_S_STARTING;
	vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN;
	vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_UNKNOWN;
	bioq_init(&vol->v_inflight);
	bioq_init(&vol->v_locked);
	LIST_INIT(&vol->v_locks);
	for (i = 0; i < G_RAID_MAX_SUBDISKS; i++) {
		vol->v_subdisks[i].sd_softc = sc;
		vol->v_subdisks[i].sd_volume = vol;
		vol->v_subdisks[i].sd_pos = i;
		vol->v_subdisks[i].sd_state = G_RAID_SUBDISK_S_NONE;
1867	}
1868
1869	/* Find free ID for this volume. */
1870	g_topology_lock();
1871	vol1 = vol;
1872	if (id >= 0) {
1873		LIST_FOREACH(vol1, &g_raid_volumes, v_global_next) {
1874			if (vol1->v_global_id == id)
1875				break;
1876		}
1877	}
1878	if (vol1 != NULL) {
1879		for (id = 0; ; id++) {
1880			LIST_FOREACH(vol1, &g_raid_volumes, v_global_next) {
1881				if (vol1->v_global_id == id)
1882					break;
1883			}
1884			if (vol1 == NULL)
1885				break;
1886		}
1887	}
1888	vol->v_global_id = id;
1889	LIST_INSERT_HEAD(&g_raid_volumes, vol, v_global_next);
1890	g_topology_unlock();
1891
1892	/* Delay root mounting. */
1893	vol->v_rootmount = root_mount_hold("GRAID");
1894	G_RAID_DEBUG1(1, sc, "root_mount_hold %p", vol->v_rootmount);
1895	vol->v_starting = 1;
1896	TAILQ_INSERT_TAIL(&sc->sc_volumes, vol, v_next);
1897	return (vol);
1898}
1899
struct g_raid_disk *
g_raid_create_disk(struct g_raid_softc *sc)
{
	struct g_raid_disk	*disk;

	G_RAID_DEBUG1(1, sc, "Creating disk.");
	disk = malloc(sizeof(*disk), M_RAID, M_WAITOK | M_ZERO);
	disk->d_softc = sc;
	disk->d_state = G_RAID_DISK_S_NONE;
	TAILQ_INIT(&disk->d_subdisks);
	TAILQ_INSERT_TAIL(&sc->sc_disks, disk, d_next);
	return (disk);
}

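/*
 * Start a volume: offer it to every registered transformation (TR) module
 * in priority order and attach the first one that does not refuse it.
 */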
int
g_raid_start_volume(struct g_raid_volume *vol)
{
	struct g_raid_tr_class *class;
	struct g_raid_tr_object *obj;
	int status;

	G_RAID_DEBUG1(2, vol->v_softc, "Starting volume %s.", vol->v_name);
	LIST_FOREACH(class, &g_raid_tr_classes, trc_list) {
		G_RAID_DEBUG1(2, vol->v_softc,
		    "Tasting volume %s for %s transformation.",
		    vol->v_name, class->name);
		obj = (void *)kobj_create((kobj_class_t)class, M_RAID,
		    M_WAITOK);
		obj->tro_class = class;
		obj->tro_volume = vol;
		status = G_RAID_TR_TASTE(obj, vol);
		if (status != G_RAID_TR_TASTE_FAIL)
			break;
		kobj_delete((kobj_t)obj, M_RAID);
	}
	if (class == NULL) {
		G_RAID_DEBUG1(0, vol->v_softc,
		    "No transformation module found for %s.",
		    vol->v_name);
		vol->v_tr = NULL;
		g_raid_change_volume_state(vol, G_RAID_VOLUME_S_UNSUPPORTED);
		g_raid_event_send(vol, G_RAID_VOLUME_E_DOWN,
		    G_RAID_EVENT_VOLUME);
		return (-1);
	}
	G_RAID_DEBUG1(2, vol->v_softc,
	    "Transformation module %s chosen for %s.",
	    class->name, vol->v_name);
	vol->v_tr = obj;
	return (0);
}

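/*
 * Destroy a whole array node: tear down all volumes and disks, release
 * the metadata module and wither the geom.  When called from the worker
 * thread itself, the node is freed and the thread exits; otherwise the
 * worker is woken up to finish the destruction.
 */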
int
g_raid_destroy_node(struct g_raid_softc *sc, int worker)
{
	struct g_raid_volume *vol, *tmpv;
	struct g_raid_disk *disk, *tmpd;
	int error = 0;

	sc->sc_stopping = G_RAID_DESTROY_HARD;
	TAILQ_FOREACH_SAFE(vol, &sc->sc_volumes, v_next, tmpv) {
		if (g_raid_destroy_volume(vol))
			error = EBUSY;
	}
	if (error)
		return (error);
	TAILQ_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tmpd) {
		if (g_raid_destroy_disk(disk))
			error = EBUSY;
	}
	if (error)
		return (error);
	if (sc->sc_md) {
		G_RAID_MD_FREE(sc->sc_md);
		kobj_delete((kobj_t)sc->sc_md, M_RAID);
		sc->sc_md = NULL;
	}
	if (sc->sc_geom != NULL) {
		G_RAID_DEBUG1(0, sc, "Array %s destroyed.", sc->sc_name);
		g_topology_lock();
		sc->sc_geom->softc = NULL;
		g_wither_geom(sc->sc_geom, ENXIO);
		g_topology_unlock();
		sc->sc_geom = NULL;
	} else
		G_RAID_DEBUG(1, "Array destroyed.");
	if (worker) {
		g_raid_event_cancel(sc, sc);
		mtx_destroy(&sc->sc_queue_mtx);
		sx_xunlock(&sc->sc_lock);
		sx_destroy(&sc->sc_lock);
		wakeup(&sc->sc_stopping);
		free(sc, M_RAID);
		curthread->td_pflags &= ~TDP_GEOM;
		G_RAID_DEBUG(1, "Thread exiting.");
		kproc_exit(0);
	} else {
		/* Wake up worker to make it self-destruct. */
		g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0);
	}
	return (0);
}

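/*
 * Destroy a single volume.  Returns EBUSY and defers the work while the
 * transformation module is still stopping, events are pending, or the
 * provider is still present or open.
 */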
int
g_raid_destroy_volume(struct g_raid_volume *vol)
{
	struct g_raid_softc *sc;
	struct g_raid_disk *disk;
	int i;

	sc = vol->v_softc;
	G_RAID_DEBUG1(2, sc, "Destroying volume %s.", vol->v_name);
	vol->v_stopping = 1;
	if (vol->v_state != G_RAID_VOLUME_S_STOPPED) {
		if (vol->v_tr) {
			G_RAID_TR_STOP(vol->v_tr);
			return (EBUSY);
		} else
			vol->v_state = G_RAID_VOLUME_S_STOPPED;
	}
	if (g_raid_event_check(sc, vol) != 0)
		return (EBUSY);
	if (vol->v_provider != NULL)
		return (EBUSY);
	if (vol->v_provider_open != 0)
		return (EBUSY);
	if (vol->v_tr) {
		G_RAID_TR_FREE(vol->v_tr);
		kobj_delete((kobj_t)vol->v_tr, M_RAID);
		vol->v_tr = NULL;
	}
	if (vol->v_rootmount)
		root_mount_rel(vol->v_rootmount);
	g_topology_lock();
	LIST_REMOVE(vol, v_global_next);
	g_topology_unlock();
	TAILQ_REMOVE(&sc->sc_volumes, vol, v_next);
	for (i = 0; i < G_RAID_MAX_SUBDISKS; i++) {
		g_raid_event_cancel(sc, &vol->v_subdisks[i]);
		disk = vol->v_subdisks[i].sd_disk;
		if (disk == NULL)
			continue;
		TAILQ_REMOVE(&disk->d_subdisks, &vol->v_subdisks[i], sd_next);
	}
	G_RAID_DEBUG1(2, sc, "Volume %s destroyed.", vol->v_name);
	if (sc->sc_md)
		G_RAID_MD_FREE_VOLUME(sc->sc_md, vol);
	g_raid_event_cancel(sc, vol);
	free(vol, M_RAID);
	if (sc->sc_stopping == G_RAID_DESTROY_HARD) {
		/* Wake up worker to let it self-destruct. */
		g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0);
	}
	return (0);
}

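/*
 * Destroy a disk descriptor: detach its consumer, disconnect all of its
 * subdisks and free the structure.
 */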
int
g_raid_destroy_disk(struct g_raid_disk *disk)
{
	struct g_raid_softc *sc;
	struct g_raid_subdisk *sd, *tmp;

	sc = disk->d_softc;
	G_RAID_DEBUG1(2, sc, "Destroying disk.");
	if (disk->d_consumer) {
		g_raid_kill_consumer(sc, disk->d_consumer);
		disk->d_consumer = NULL;
	}
	TAILQ_FOREACH_SAFE(sd, &disk->d_subdisks, sd_next, tmp) {
		g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE);
		g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED,
		    G_RAID_EVENT_SUBDISK);
		TAILQ_REMOVE(&disk->d_subdisks, sd, sd_next);
		sd->sd_disk = NULL;
	}
	TAILQ_REMOVE(&sc->sc_disks, disk, d_next);
	if (sc->sc_md)
		G_RAID_MD_FREE_DISK(sc->sc_md, disk);
	g_raid_event_cancel(sc, disk);
	free(disk, M_RAID);
	return (0);
}

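/*
 * Request array destruction with the given severity (soft, delayed or
 * hard) and, unless still-open volumes prevent it, sleep until the
 * worker thread has destroyed the node.
 */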
int
g_raid_destroy(struct g_raid_softc *sc, int how)
{
	int opens;

	g_topology_assert_not();
	if (sc == NULL)
		return (ENXIO);
	sx_assert(&sc->sc_lock, SX_XLOCKED);

	/* Count open volumes. */
	opens = g_raid_nopens(sc);

	/* React to volumes that are still open. */
	if (opens > 0) {
		switch (how) {
		case G_RAID_DESTROY_SOFT:
			G_RAID_DEBUG1(1, sc,
			    "%d volumes are still open.",
			    opens);
			return (EBUSY);
		case G_RAID_DESTROY_DELAYED:
			G_RAID_DEBUG1(1, sc,
			    "Array will be destroyed on last close.");
			sc->sc_stopping = G_RAID_DESTROY_DELAYED;
			return (EBUSY);
		case G_RAID_DESTROY_HARD:
			G_RAID_DEBUG1(1, sc,
			    "%d volumes are still open.",
			    opens);
		}
	}

	/* Mark node for destruction. */
	sc->sc_stopping = G_RAID_DESTROY_HARD;
	/* Wake up worker to let it self-destruct. */
	g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0);
	/* Sleep until the node is destroyed. */
	sx_sleep(&sc->sc_stopping, &sc->sc_lock,
	    PRIBIO | PDROP, "r:destroy", 0);
	return (0);
}

static void
g_raid_taste_orphan(struct g_consumer *cp)
{

	KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
	    cp->provider->name));
}

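/*
 * GEOM taste method: probe the provider with every registered metadata
 * module in priority order until one of them claims it.
 */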
static struct g_geom *
g_raid_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
{
	struct g_consumer *cp;
	struct g_geom *gp, *geom;
	struct g_raid_md_class *class;
	struct g_raid_md_object *obj;
	int status;

	g_topology_assert();
	g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
	G_RAID_DEBUG(2, "Tasting provider %s.", pp->name);

	gp = g_new_geomf(mp, "raid:taste");
	/*
	 * This orphan function should never be called.
	 */
	gp->orphan = g_raid_taste_orphan;
	cp = g_new_consumer(gp);
	g_attach(cp, pp);

	geom = NULL;
	LIST_FOREACH(class, &g_raid_md_classes, mdc_list) {
		G_RAID_DEBUG(2, "Tasting provider %s for %s metadata.",
		    pp->name, class->name);
		obj = (void *)kobj_create((kobj_class_t)class, M_RAID,
		    M_WAITOK);
		obj->mdo_class = class;
		status = G_RAID_MD_TASTE(obj, mp, cp, &geom);
		if (status != G_RAID_MD_TASTE_NEW)
			kobj_delete((kobj_t)obj, M_RAID);
		if (status != G_RAID_MD_TASTE_FAIL)
			break;
	}

	g_detach(cp);
	g_destroy_consumer(cp);
	g_destroy_geom(gp);
	G_RAID_DEBUG(2, "Tasting provider %s done.", pp->name);
	return (geom);
}

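/*
 * Create a new empty array node using the metadata format given by name.
 */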
int
g_raid_create_node_format(const char *format, struct g_geom **gp)
{
	struct g_raid_md_class *class;
	struct g_raid_md_object *obj;
	int status;

	G_RAID_DEBUG(2, "Creating array for %s metadata.", format);
	LIST_FOREACH(class, &g_raid_md_classes, mdc_list) {
		if (strcasecmp(class->name, format) == 0)
			break;
	}
	if (class == NULL) {
		G_RAID_DEBUG(1, "No support for %s metadata.", format);
		return (G_RAID_MD_TASTE_FAIL);
	}
	obj = (void *)kobj_create((kobj_class_t)class, M_RAID,
	    M_WAITOK);
	obj->mdo_class = class;
	status = G_RAID_MD_CREATE(obj, &g_raid_class, gp);
	if (status != G_RAID_MD_TASTE_NEW)
		kobj_delete((kobj_t)obj, M_RAID);
	return (status);
}

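/*
 * GEOM destroy_geom method: request soft destruction of the array node.
 */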
static int
g_raid_destroy_geom(struct gctl_req *req __unused,
    struct g_class *mp __unused, struct g_geom *gp)
{
	struct g_raid_softc *sc;
	int error;

	g_topology_unlock();
	sc = gp->softc;
	sx_xlock(&sc->sc_lock);
	g_cancel_event(sc);
	error = g_raid_destroy(gp->softc, G_RAID_DESTROY_SOFT);
	if (error != 0)
		sx_xunlock(&sc->sc_lock);
	g_topology_lock();
	return (error);
}

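/*
 * Ask the metadata module to update on-disk metadata, unless the node is
 * already being hard-destroyed.
 */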
void
g_raid_write_metadata(struct g_raid_softc *sc, struct g_raid_volume *vol,
    struct g_raid_subdisk *sd, struct g_raid_disk *disk)
{

	if (sc->sc_stopping == G_RAID_DESTROY_HARD)
		return;
	if (sc->sc_md)
		G_RAID_MD_WRITE(sc->sc_md, vol, sd, disk);
}

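/*
 * Report a disk as failed to the metadata module, after checking that
 * the disk is present and currently active.
 */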
void
g_raid_fail_disk(struct g_raid_softc *sc,
    struct g_raid_subdisk *sd, struct g_raid_disk *disk)
{

	if (disk == NULL)
		disk = sd->sd_disk;
	if (disk == NULL) {
		G_RAID_DEBUG1(0, sc, "Warning! Fail request to an absent disk!");
		return;
	}
	if (disk->d_state != G_RAID_DISK_S_ACTIVE) {
		G_RAID_DEBUG1(0, sc, "Warning! Fail request to a disk in a "
		    "wrong state (%s)!", g_raid_disk_state2str(disk->d_state));
		return;
	}
	if (sc->sc_md)
		G_RAID_MD_FAIL_DISK(sc->sc_md, sd, disk);
}

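/*
 * GEOM dumpconf method: report volume, disk or whole-node status,
 * depending on whether a provider, a consumer or neither is given.
 */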
static void
g_raid_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
    struct g_consumer *cp, struct g_provider *pp)
{
	struct g_raid_softc *sc;
	struct g_raid_volume *vol;
	struct g_raid_subdisk *sd;
	struct g_raid_disk *disk;
	int i, s;

	g_topology_assert();

	sc = gp->softc;
	if (sc == NULL)
		return;
	if (pp != NULL) {
		vol = pp->private;
		g_topology_unlock();
		sx_xlock(&sc->sc_lock);
		sbuf_printf(sb, "%s<Label>%s</Label>\n", indent,
		    vol->v_name);
		sbuf_printf(sb, "%s<RAIDLevel>%s</RAIDLevel>\n", indent,
		    g_raid_volume_level2str(vol->v_raid_level,
		    vol->v_raid_level_qualifier));
		sbuf_printf(sb,
		    "%s<Transformation>%s</Transformation>\n", indent,
		    vol->v_tr ? vol->v_tr->tro_class->name : "NONE");
		sbuf_printf(sb, "%s<Components>%u</Components>\n", indent,
		    vol->v_disks_count);
		sbuf_printf(sb, "%s<Strip>%u</Strip>\n", indent,
		    vol->v_strip_size);
		sbuf_printf(sb, "%s<State>%s</State>\n", indent,
		    g_raid_volume_state2str(vol->v_state));
		sbuf_printf(sb, "%s<Dirty>%s</Dirty>\n", indent,
		    vol->v_dirty ? "Yes" : "No");
		sbuf_printf(sb, "%s<Subdisks>", indent);
		for (i = 0; i < vol->v_disks_count; i++) {
			sd = &vol->v_subdisks[i];
			if (sd->sd_disk != NULL &&
			    sd->sd_disk->d_consumer != NULL) {
				sbuf_printf(sb, "%s ",
				    g_raid_get_diskname(sd->sd_disk));
			} else {
				sbuf_printf(sb, "NONE ");
			}
			sbuf_printf(sb, "(%s",
			    g_raid_subdisk_state2str(sd->sd_state));
			if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
			    sd->sd_state == G_RAID_SUBDISK_S_RESYNC) {
				sbuf_printf(sb, " %d%%",
				    (int)(sd->sd_rebuild_pos * 100 /
				     sd->sd_size));
			}
			sbuf_printf(sb, ")");
			if (i + 1 < vol->v_disks_count)
				sbuf_printf(sb, ", ");
		}
		sbuf_printf(sb, "</Subdisks>\n");
		sx_xunlock(&sc->sc_lock);
		g_topology_lock();
	} else if (cp != NULL) {
		disk = cp->private;
		if (disk == NULL)
			return;
		g_topology_unlock();
		sx_xlock(&sc->sc_lock);
		sbuf_printf(sb, "%s<State>%s", indent,
		    g_raid_disk_state2str(disk->d_state));
		if (!TAILQ_EMPTY(&disk->d_subdisks)) {
			sbuf_printf(sb, " (");
			TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
				sbuf_printf(sb, "%s",
				    g_raid_subdisk_state2str(sd->sd_state));
				if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
				    sd->sd_state == G_RAID_SUBDISK_S_RESYNC) {
					sbuf_printf(sb, " %d%%",
					    (int)(sd->sd_rebuild_pos * 100 /
					     sd->sd_size));
				}
				if (TAILQ_NEXT(sd, sd_next))
					sbuf_printf(sb, ", ");
			}
			sbuf_printf(sb, ")");
		}
		sbuf_printf(sb, "</State>\n");
		sbuf_printf(sb, "%s<Subdisks>", indent);
		TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
			sbuf_printf(sb, "r%d(%s):%d@%ju",
			    sd->sd_volume->v_global_id,
			    sd->sd_volume->v_name,
			    sd->sd_pos, sd->sd_offset);
			if (TAILQ_NEXT(sd, sd_next))
				sbuf_printf(sb, ", ");
		}
		sbuf_printf(sb, "</Subdisks>\n");
		sbuf_printf(sb, "%s<ReadErrors>%d</ReadErrors>\n", indent,
		    disk->d_read_errs);
		sx_xunlock(&sc->sc_lock);
		g_topology_lock();
	} else {
		g_topology_unlock();
		sx_xlock(&sc->sc_lock);
		if (sc->sc_md) {
			sbuf_printf(sb, "%s<Metadata>%s</Metadata>\n", indent,
			    sc->sc_md->mdo_class->name);
		}
		if (!TAILQ_EMPTY(&sc->sc_volumes)) {
			s = 0xff;
			TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
				if (vol->v_state < s)
					s = vol->v_state;
			}
			sbuf_printf(sb, "%s<State>%s</State>\n", indent,
			    g_raid_volume_state2str(s));
		}
		sx_xunlock(&sc->sc_lock);
		g_topology_lock();
	}
}

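/*
 * shutdown_pre_sync event handler: request delayed destruction of every
 * array, so idle arrays are torn down at once and busy ones on last
 * close, before the system goes down.
 */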
static void
g_raid_shutdown_pre_sync(void *arg, int howto)
{
	struct g_class *mp;
	struct g_geom *gp, *gp2;
	struct g_raid_softc *sc;
	int error;

	mp = arg;
	DROP_GIANT();
	g_topology_lock();
	LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) {
		if ((sc = gp->softc) == NULL)
			continue;
		g_topology_unlock();
		sx_xlock(&sc->sc_lock);
		g_cancel_event(sc);
		error = g_raid_destroy(sc, G_RAID_DESTROY_DELAYED);
		if (error != 0)
			sx_xunlock(&sc->sc_lock);
		g_topology_lock();
	}
	g_topology_unlock();
	PICKUP_GIANT();
}

static void
g_raid_init(struct g_class *mp)
{

	g_raid_pre_sync = EVENTHANDLER_REGISTER(shutdown_pre_sync,
	    g_raid_shutdown_pre_sync, mp, SHUTDOWN_PRI_FIRST);
	if (g_raid_pre_sync == NULL)
		G_RAID_DEBUG(0, "Warning! Cannot register shutdown event.");
	g_raid_started = 1;
}

static void
g_raid_fini(struct g_class *mp)
{

	if (g_raid_pre_sync != NULL)
		EVENTHANDLER_DEREGISTER(shutdown_pre_sync, g_raid_pre_sync);
	g_raid_started = 0;
}

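/*
 * Module event handler for metadata modules: keep the class list sorted
 * by priority on load, and retaste providers if GEOM is already running.
 */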
int
g_raid_md_modevent(module_t mod, int type, void *arg)
{
	struct g_raid_md_class *class, *c, *nc;
	int error;

	error = 0;
	class = arg;
	switch (type) {
	case MOD_LOAD:
		c = LIST_FIRST(&g_raid_md_classes);
		if (c == NULL || c->mdc_priority > class->mdc_priority)
			LIST_INSERT_HEAD(&g_raid_md_classes, class, mdc_list);
		else {
			while ((nc = LIST_NEXT(c, mdc_list)) != NULL &&
			    nc->mdc_priority < class->mdc_priority)
				c = nc;
			LIST_INSERT_AFTER(c, class, mdc_list);
		}
		if (g_raid_started)
			g_retaste(&g_raid_class);
		break;
	case MOD_UNLOAD:
		LIST_REMOVE(class, mdc_list);
		break;
	default:
		error = EOPNOTSUPP;
		break;
	}

	return (error);
}

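/*
 * Module event handler for transformation modules: keep the class list
 * sorted by priority on load.
 */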
int
g_raid_tr_modevent(module_t mod, int type, void *arg)
{
	struct g_raid_tr_class *class, *c, *nc;
	int error;

	error = 0;
	class = arg;
	switch (type) {
	case MOD_LOAD:
		c = LIST_FIRST(&g_raid_tr_classes);
		if (c == NULL || c->trc_priority > class->trc_priority)
			LIST_INSERT_HEAD(&g_raid_tr_classes, class, trc_list);
		else {
			while ((nc = LIST_NEXT(c, trc_list)) != NULL &&
			    nc->trc_priority < class->trc_priority)
				c = nc;
			LIST_INSERT_AFTER(c, class, trc_list);
		}
		break;
	case MOD_UNLOAD:
		LIST_REMOVE(class, trc_list);
		break;
	default:
		error = EOPNOTSUPP;
		break;
	}

	return (error);
}

/*
 * Use a local implementation of DECLARE_GEOM_CLASS(g_raid_class, g_raid)
 * to reduce module priority, allowing submodules to register themselves
 * first.
 */
static moduledata_t g_raid_mod = {
	"g_raid",
	g_modevent,
	&g_raid_class
};
DECLARE_MODULE(g_raid, g_raid_mod, SI_SUB_DRIVERS, SI_ORDER_THIRD);
MODULE_VERSION(geom_raid, 0);