1/*-
2 * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27#include <sys/cdefs.h>
28__FBSDID("$FreeBSD$");
29
30#include <sys/param.h>
31#include <sys/systm.h>
32#include <sys/kernel.h>
33#include <sys/module.h>
34#include <sys/limits.h>
35#include <sys/lock.h>
36#include <sys/mutex.h>
37#include <sys/bio.h>
38#include <sys/sbuf.h>
39#include <sys/sysctl.h>
40#include <sys/malloc.h>
41#include <sys/eventhandler.h>
42#include <vm/uma.h>
43#include <geom/geom.h>
44#include <sys/proc.h>
45#include <sys/kthread.h>
46#include <sys/sched.h>
47#include <geom/raid/g_raid.h>
48#include "g_raid_md_if.h"
49#include "g_raid_tr_if.h"
50
51static MALLOC_DEFINE(M_RAID, "raid_data", "GEOM_RAID Data");
52
53SYSCTL_DECL(_kern_geom);
54SYSCTL_NODE(_kern_geom, OID_AUTO, raid, CTLFLAG_RW, 0, "GEOM_RAID stuff");
55int g_raid_enable = 1;
56TUNABLE_INT("kern.geom.raid.enable", &g_raid_enable);
57SYSCTL_INT(_kern_geom_raid, OID_AUTO, enable, CTLFLAG_RW,
58    &g_raid_enable, 0, "Enable on-disk metadata taste");
59u_int g_raid_aggressive_spare = 0;
60TUNABLE_INT("kern.geom.raid.aggressive_spare", &g_raid_aggressive_spare);
61SYSCTL_UINT(_kern_geom_raid, OID_AUTO, aggressive_spare, CTLFLAG_RW,
62    &g_raid_aggressive_spare, 0, "Use disks without metadata as spare");
63u_int g_raid_debug = 0;
64TUNABLE_INT("kern.geom.raid.debug", &g_raid_debug);
65SYSCTL_UINT(_kern_geom_raid, OID_AUTO, debug, CTLFLAG_RW, &g_raid_debug, 0,
66    "Debug level");
67int g_raid_read_err_thresh = 10;
68TUNABLE_INT("kern.geom.raid.read_err_thresh", &g_raid_read_err_thresh);
69SYSCTL_UINT(_kern_geom_raid, OID_AUTO, read_err_thresh, CTLFLAG_RW,
70    &g_raid_read_err_thresh, 0,
71    "Number of read errors equated to disk failure");
72u_int g_raid_start_timeout = 30;
73TUNABLE_INT("kern.geom.raid.start_timeout", &g_raid_start_timeout);
74SYSCTL_UINT(_kern_geom_raid, OID_AUTO, start_timeout, CTLFLAG_RW,
75    &g_raid_start_timeout, 0,
76    "Time to wait for all array components");
77static u_int g_raid_clean_time = 5;
78TUNABLE_INT("kern.geom.raid.clean_time", &g_raid_clean_time);
79SYSCTL_UINT(_kern_geom_raid, OID_AUTO, clean_time, CTLFLAG_RW,
80    &g_raid_clean_time, 0, "Mark volume as clean when idling");
81static u_int g_raid_disconnect_on_failure = 1;
82TUNABLE_INT("kern.geom.raid.disconnect_on_failure",
83    &g_raid_disconnect_on_failure);
84SYSCTL_UINT(_kern_geom_raid, OID_AUTO, disconnect_on_failure, CTLFLAG_RW,
85    &g_raid_disconnect_on_failure, 0, "Disconnect component on I/O failure.");
86static u_int g_raid_name_format = 0;
87TUNABLE_INT("kern.geom.raid.name_format", &g_raid_name_format);
88SYSCTL_UINT(_kern_geom_raid, OID_AUTO, name_format, CTLFLAG_RW,
89    &g_raid_name_format, 0, "Providers name format.");
90static u_int g_raid_idle_threshold = 1000000;
91TUNABLE_INT("kern.geom.raid.idle_threshold", &g_raid_idle_threshold);
92SYSCTL_UINT(_kern_geom_raid, OID_AUTO, idle_threshold, CTLFLAG_RW,
93    &g_raid_idle_threshold, 1000000,
94    "Time in microseconds to consider a volume idle.");
95static u_int ar_legacy_aliases = 1;
96SYSCTL_INT(_kern_geom_raid, OID_AUTO, legacy_aliases, CTLFLAG_RW,
97           &ar_legacy_aliases, 0, "Create aliases named as the legacy ataraid style.");
98TUNABLE_INT("kern.geom_raid.legacy_aliases", &ar_legacy_aliases);
99
100
101#define	MSLEEP(rv, ident, mtx, priority, wmesg, timeout)	do {	\
102	G_RAID_DEBUG(4, "%s: Sleeping %p.", __func__, (ident));		\
103	rv = msleep((ident), (mtx), (priority), (wmesg), (timeout));	\
104	G_RAID_DEBUG(4, "%s: Woken up %p.", __func__, (ident));		\
105} while (0)
106
107LIST_HEAD(, g_raid_md_class) g_raid_md_classes =
108    LIST_HEAD_INITIALIZER(g_raid_md_classes);
109
110LIST_HEAD(, g_raid_tr_class) g_raid_tr_classes =
111    LIST_HEAD_INITIALIZER(g_raid_tr_classes);
112
113LIST_HEAD(, g_raid_volume) g_raid_volumes =
114    LIST_HEAD_INITIALIZER(g_raid_volumes);
115
116static eventhandler_tag g_raid_post_sync = NULL;
117static int g_raid_started = 0;
118static int g_raid_shutdown = 0;
119
120static int g_raid_destroy_geom(struct gctl_req *req, struct g_class *mp,
121    struct g_geom *gp);
122static g_taste_t g_raid_taste;
123static void g_raid_init(struct g_class *mp);
124static void g_raid_fini(struct g_class *mp);
125
126struct g_class g_raid_class = {
127	.name = G_RAID_CLASS_NAME,
128	.version = G_VERSION,
129	.ctlreq = g_raid_ctl,
130	.taste = g_raid_taste,
131	.destroy_geom = g_raid_destroy_geom,
132	.init = g_raid_init,
133	.fini = g_raid_fini
134};
135
136static void g_raid_destroy_provider(struct g_raid_volume *vol);
137static int g_raid_update_disk(struct g_raid_disk *disk, u_int event);
138static int g_raid_update_subdisk(struct g_raid_subdisk *subdisk, u_int event);
139static int g_raid_update_volume(struct g_raid_volume *vol, u_int event);
140static int g_raid_update_node(struct g_raid_softc *sc, u_int event);
141static void g_raid_dumpconf(struct sbuf *sb, const char *indent,
142    struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp);
143static void g_raid_start(struct bio *bp);
144static void g_raid_start_request(struct bio *bp);
145static void g_raid_disk_done(struct bio *bp);
146static void g_raid_poll(struct g_raid_softc *sc);
147
148static const char *
149g_raid_node_event2str(int event)
150{
151
152	switch (event) {
153	case G_RAID_NODE_E_WAKE:
154		return ("WAKE");
155	case G_RAID_NODE_E_START:
156		return ("START");
157	default:
158		return ("INVALID");
159	}
160}
161
162const char *
163g_raid_disk_state2str(int state)
164{
165
166	switch (state) {
167	case G_RAID_DISK_S_NONE:
168		return ("NONE");
169	case G_RAID_DISK_S_OFFLINE:
170		return ("OFFLINE");
171	case G_RAID_DISK_S_DISABLED:
172		return ("DISABLED");
173	case G_RAID_DISK_S_FAILED:
174		return ("FAILED");
175	case G_RAID_DISK_S_STALE_FAILED:
176		return ("STALE_FAILED");
177	case G_RAID_DISK_S_SPARE:
178		return ("SPARE");
179	case G_RAID_DISK_S_STALE:
180		return ("STALE");
181	case G_RAID_DISK_S_ACTIVE:
182		return ("ACTIVE");
183	default:
184		return ("INVALID");
185	}
186}
187
188static const char *
189g_raid_disk_event2str(int event)
190{
191
192	switch (event) {
193	case G_RAID_DISK_E_DISCONNECTED:
194		return ("DISCONNECTED");
195	default:
196		return ("INVALID");
197	}
198}
199
200const char *
201g_raid_subdisk_state2str(int state)
202{
203
204	switch (state) {
205	case G_RAID_SUBDISK_S_NONE:
206		return ("NONE");
207	case G_RAID_SUBDISK_S_FAILED:
208		return ("FAILED");
209	case G_RAID_SUBDISK_S_NEW:
210		return ("NEW");
211	case G_RAID_SUBDISK_S_REBUILD:
212		return ("REBUILD");
213	case G_RAID_SUBDISK_S_UNINITIALIZED:
214		return ("UNINITIALIZED");
215	case G_RAID_SUBDISK_S_STALE:
216		return ("STALE");
217	case G_RAID_SUBDISK_S_RESYNC:
218		return ("RESYNC");
219	case G_RAID_SUBDISK_S_ACTIVE:
220		return ("ACTIVE");
221	default:
222		return ("INVALID");
223	}
224}
225
226static const char *
227g_raid_subdisk_event2str(int event)
228{
229
230	switch (event) {
231	case G_RAID_SUBDISK_E_NEW:
232		return ("NEW");
233	case G_RAID_SUBDISK_E_FAILED:
234		return ("FAILED");
235	case G_RAID_SUBDISK_E_DISCONNECTED:
236		return ("DISCONNECTED");
237	default:
238		return ("INVALID");
239	}
240}
241
242const char *
243g_raid_volume_state2str(int state)
244{
245
246	switch (state) {
247	case G_RAID_VOLUME_S_STARTING:
248		return ("STARTING");
249	case G_RAID_VOLUME_S_BROKEN:
250		return ("BROKEN");
251	case G_RAID_VOLUME_S_DEGRADED:
252		return ("DEGRADED");
253	case G_RAID_VOLUME_S_SUBOPTIMAL:
254		return ("SUBOPTIMAL");
255	case G_RAID_VOLUME_S_OPTIMAL:
256		return ("OPTIMAL");
257	case G_RAID_VOLUME_S_UNSUPPORTED:
258		return ("UNSUPPORTED");
259	case G_RAID_VOLUME_S_STOPPED:
260		return ("STOPPED");
261	default:
262		return ("INVALID");
263	}
264}
265
266static const char *
267g_raid_volume_event2str(int event)
268{
269
270	switch (event) {
271	case G_RAID_VOLUME_E_UP:
272		return ("UP");
273	case G_RAID_VOLUME_E_DOWN:
274		return ("DOWN");
275	case G_RAID_VOLUME_E_START:
276		return ("START");
277	case G_RAID_VOLUME_E_STARTMD:
278		return ("STARTMD");
279	default:
280		return ("INVALID");
281	}
282}
283
284const char *
285g_raid_volume_level2str(int level, int qual)
286{
287
288	switch (level) {
289	case G_RAID_VOLUME_RL_RAID0:
290		return ("RAID0");
291	case G_RAID_VOLUME_RL_RAID1:
292		return ("RAID1");
293	case G_RAID_VOLUME_RL_RAID3:
294		if (qual == G_RAID_VOLUME_RLQ_R3P0)
295			return ("RAID3-P0");
296		if (qual == G_RAID_VOLUME_RLQ_R3PN)
297			return ("RAID3-PN");
298		return ("RAID3");
299	case G_RAID_VOLUME_RL_RAID4:
300		if (qual == G_RAID_VOLUME_RLQ_R4P0)
301			return ("RAID4-P0");
302		if (qual == G_RAID_VOLUME_RLQ_R4PN)
303			return ("RAID4-PN");
304		return ("RAID4");
305	case G_RAID_VOLUME_RL_RAID5:
306		if (qual == G_RAID_VOLUME_RLQ_R5RA)
307			return ("RAID5-RA");
308		if (qual == G_RAID_VOLUME_RLQ_R5RS)
309			return ("RAID5-RS");
310		if (qual == G_RAID_VOLUME_RLQ_R5LA)
311			return ("RAID5-LA");
312		if (qual == G_RAID_VOLUME_RLQ_R5LS)
313			return ("RAID5-LS");
314		return ("RAID5");
315	case G_RAID_VOLUME_RL_RAID6:
316		if (qual == G_RAID_VOLUME_RLQ_R6RA)
317			return ("RAID6-RA");
318		if (qual == G_RAID_VOLUME_RLQ_R6RS)
319			return ("RAID6-RS");
320		if (qual == G_RAID_VOLUME_RLQ_R6LA)
321			return ("RAID6-LA");
322		if (qual == G_RAID_VOLUME_RLQ_R6LS)
323			return ("RAID6-LS");
324		return ("RAID6");
325	case G_RAID_VOLUME_RL_RAIDMDF:
326		if (qual == G_RAID_VOLUME_RLQ_RMDFRA)
327			return ("RAIDMDF-RA");
328		if (qual == G_RAID_VOLUME_RLQ_RMDFRS)
329			return ("RAIDMDF-RS");
330		if (qual == G_RAID_VOLUME_RLQ_RMDFLA)
331			return ("RAIDMDF-LA");
332		if (qual == G_RAID_VOLUME_RLQ_RMDFLS)
333			return ("RAIDMDF-LS");
334		return ("RAIDMDF");
335	case G_RAID_VOLUME_RL_RAID1E:
336		if (qual == G_RAID_VOLUME_RLQ_R1EA)
337			return ("RAID1E-A");
338		if (qual == G_RAID_VOLUME_RLQ_R1EO)
339			return ("RAID1E-O");
340		return ("RAID1E");
341	case G_RAID_VOLUME_RL_SINGLE:
342		return ("SINGLE");
343	case G_RAID_VOLUME_RL_CONCAT:
344		return ("CONCAT");
345	case G_RAID_VOLUME_RL_RAID5E:
346		if (qual == G_RAID_VOLUME_RLQ_R5ERA)
347			return ("RAID5E-RA");
348		if (qual == G_RAID_VOLUME_RLQ_R5ERS)
349			return ("RAID5E-RS");
350		if (qual == G_RAID_VOLUME_RLQ_R5ELA)
351			return ("RAID5E-LA");
352		if (qual == G_RAID_VOLUME_RLQ_R5ELS)
353			return ("RAID5E-LS");
354		return ("RAID5E");
355	case G_RAID_VOLUME_RL_RAID5EE:
356		if (qual == G_RAID_VOLUME_RLQ_R5EERA)
357			return ("RAID5EE-RA");
358		if (qual == G_RAID_VOLUME_RLQ_R5EERS)
359			return ("RAID5EE-RS");
360		if (qual == G_RAID_VOLUME_RLQ_R5EELA)
361			return ("RAID5EE-LA");
362		if (qual == G_RAID_VOLUME_RLQ_R5EELS)
363			return ("RAID5EE-LS");
364		return ("RAID5EE");
365	case G_RAID_VOLUME_RL_RAID5R:
366		if (qual == G_RAID_VOLUME_RLQ_R5RRA)
367			return ("RAID5R-RA");
368		if (qual == G_RAID_VOLUME_RLQ_R5RRS)
369			return ("RAID5R-RS");
370		if (qual == G_RAID_VOLUME_RLQ_R5RLA)
371			return ("RAID5R-LA");
372		if (qual == G_RAID_VOLUME_RLQ_R5RLS)
373			return ("RAID5R-LS");
374		return ("RAID5E");
375	default:
376		return ("UNKNOWN");
377	}
378}
379
380int
381g_raid_volume_str2level(const char *str, int *level, int *qual)
382{
383
384	*level = G_RAID_VOLUME_RL_UNKNOWN;
385	*qual = G_RAID_VOLUME_RLQ_NONE;
386	if (strcasecmp(str, "RAID0") == 0)
387		*level = G_RAID_VOLUME_RL_RAID0;
388	else if (strcasecmp(str, "RAID1") == 0)
389		*level = G_RAID_VOLUME_RL_RAID1;
390	else if (strcasecmp(str, "RAID3-P0") == 0) {
391		*level = G_RAID_VOLUME_RL_RAID3;
392		*qual = G_RAID_VOLUME_RLQ_R3P0;
393	} else if (strcasecmp(str, "RAID3-PN") == 0 ||
394		   strcasecmp(str, "RAID3") == 0) {
395		*level = G_RAID_VOLUME_RL_RAID3;
396		*qual = G_RAID_VOLUME_RLQ_R3PN;
397	} else if (strcasecmp(str, "RAID4-P0") == 0) {
398		*level = G_RAID_VOLUME_RL_RAID4;
399		*qual = G_RAID_VOLUME_RLQ_R4P0;
400	} else if (strcasecmp(str, "RAID4-PN") == 0 ||
401		   strcasecmp(str, "RAID4") == 0) {
402		*level = G_RAID_VOLUME_RL_RAID4;
403		*qual = G_RAID_VOLUME_RLQ_R4PN;
404	} else if (strcasecmp(str, "RAID5-RA") == 0) {
405		*level = G_RAID_VOLUME_RL_RAID5;
406		*qual = G_RAID_VOLUME_RLQ_R5RA;
407	} else if (strcasecmp(str, "RAID5-RS") == 0) {
408		*level = G_RAID_VOLUME_RL_RAID5;
409		*qual = G_RAID_VOLUME_RLQ_R5RS;
410	} else if (strcasecmp(str, "RAID5") == 0 ||
411		   strcasecmp(str, "RAID5-LA") == 0) {
412		*level = G_RAID_VOLUME_RL_RAID5;
413		*qual = G_RAID_VOLUME_RLQ_R5LA;
414	} else if (strcasecmp(str, "RAID5-LS") == 0) {
415		*level = G_RAID_VOLUME_RL_RAID5;
416		*qual = G_RAID_VOLUME_RLQ_R5LS;
417	} else if (strcasecmp(str, "RAID6-RA") == 0) {
418		*level = G_RAID_VOLUME_RL_RAID6;
419		*qual = G_RAID_VOLUME_RLQ_R6RA;
420	} else if (strcasecmp(str, "RAID6-RS") == 0) {
421		*level = G_RAID_VOLUME_RL_RAID6;
422		*qual = G_RAID_VOLUME_RLQ_R6RS;
423	} else if (strcasecmp(str, "RAID6") == 0 ||
424		   strcasecmp(str, "RAID6-LA") == 0) {
425		*level = G_RAID_VOLUME_RL_RAID6;
426		*qual = G_RAID_VOLUME_RLQ_R6LA;
427	} else if (strcasecmp(str, "RAID6-LS") == 0) {
428		*level = G_RAID_VOLUME_RL_RAID6;
429		*qual = G_RAID_VOLUME_RLQ_R6LS;
430	} else if (strcasecmp(str, "RAIDMDF-RA") == 0) {
431		*level = G_RAID_VOLUME_RL_RAIDMDF;
432		*qual = G_RAID_VOLUME_RLQ_RMDFRA;
433	} else if (strcasecmp(str, "RAIDMDF-RS") == 0) {
434		*level = G_RAID_VOLUME_RL_RAIDMDF;
435		*qual = G_RAID_VOLUME_RLQ_RMDFRS;
436	} else if (strcasecmp(str, "RAIDMDF") == 0 ||
437		   strcasecmp(str, "RAIDMDF-LA") == 0) {
438		*level = G_RAID_VOLUME_RL_RAIDMDF;
439		*qual = G_RAID_VOLUME_RLQ_RMDFLA;
440	} else if (strcasecmp(str, "RAIDMDF-LS") == 0) {
441		*level = G_RAID_VOLUME_RL_RAIDMDF;
442		*qual = G_RAID_VOLUME_RLQ_RMDFLS;
443	} else if (strcasecmp(str, "RAID10") == 0 ||
444		   strcasecmp(str, "RAID1E") == 0 ||
445		   strcasecmp(str, "RAID1E-A") == 0) {
446		*level = G_RAID_VOLUME_RL_RAID1E;
447		*qual = G_RAID_VOLUME_RLQ_R1EA;
448	} else if (strcasecmp(str, "RAID1E-O") == 0) {
449		*level = G_RAID_VOLUME_RL_RAID1E;
450		*qual = G_RAID_VOLUME_RLQ_R1EO;
451	} else if (strcasecmp(str, "SINGLE") == 0)
452		*level = G_RAID_VOLUME_RL_SINGLE;
453	else if (strcasecmp(str, "CONCAT") == 0)
454		*level = G_RAID_VOLUME_RL_CONCAT;
455	else if (strcasecmp(str, "RAID5E-RA") == 0) {
456		*level = G_RAID_VOLUME_RL_RAID5E;
457		*qual = G_RAID_VOLUME_RLQ_R5ERA;
458	} else if (strcasecmp(str, "RAID5E-RS") == 0) {
459		*level = G_RAID_VOLUME_RL_RAID5E;
460		*qual = G_RAID_VOLUME_RLQ_R5ERS;
461	} else if (strcasecmp(str, "RAID5E") == 0 ||
462		   strcasecmp(str, "RAID5E-LA") == 0) {
463		*level = G_RAID_VOLUME_RL_RAID5E;
464		*qual = G_RAID_VOLUME_RLQ_R5ELA;
465	} else if (strcasecmp(str, "RAID5E-LS") == 0) {
466		*level = G_RAID_VOLUME_RL_RAID5E;
467		*qual = G_RAID_VOLUME_RLQ_R5ELS;
468	} else if (strcasecmp(str, "RAID5EE-RA") == 0) {
469		*level = G_RAID_VOLUME_RL_RAID5EE;
470		*qual = G_RAID_VOLUME_RLQ_R5EERA;
471	} else if (strcasecmp(str, "RAID5EE-RS") == 0) {
472		*level = G_RAID_VOLUME_RL_RAID5EE;
473		*qual = G_RAID_VOLUME_RLQ_R5EERS;
474	} else if (strcasecmp(str, "RAID5EE") == 0 ||
475		   strcasecmp(str, "RAID5EE-LA") == 0) {
476		*level = G_RAID_VOLUME_RL_RAID5EE;
477		*qual = G_RAID_VOLUME_RLQ_R5EELA;
478	} else if (strcasecmp(str, "RAID5EE-LS") == 0) {
479		*level = G_RAID_VOLUME_RL_RAID5EE;
480		*qual = G_RAID_VOLUME_RLQ_R5EELS;
481	} else if (strcasecmp(str, "RAID5R-RA") == 0) {
482		*level = G_RAID_VOLUME_RL_RAID5R;
483		*qual = G_RAID_VOLUME_RLQ_R5RRA;
484	} else if (strcasecmp(str, "RAID5R-RS") == 0) {
485		*level = G_RAID_VOLUME_RL_RAID5R;
486		*qual = G_RAID_VOLUME_RLQ_R5RRS;
487	} else if (strcasecmp(str, "RAID5R") == 0 ||
488		   strcasecmp(str, "RAID5R-LA") == 0) {
489		*level = G_RAID_VOLUME_RL_RAID5R;
490		*qual = G_RAID_VOLUME_RLQ_R5RLA;
491	} else if (strcasecmp(str, "RAID5R-LS") == 0) {
492		*level = G_RAID_VOLUME_RL_RAID5R;
493		*qual = G_RAID_VOLUME_RLQ_R5RLS;
494	} else
495		return (-1);
496	return (0);
497}
498
499const char *
500g_raid_get_diskname(struct g_raid_disk *disk)
501{
502
503	if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL)
504		return ("[unknown]");
505	return (disk->d_consumer->provider->name);
506}
507
508void
509g_raid_get_disk_info(struct g_raid_disk *disk)
510{
511	struct g_consumer *cp = disk->d_consumer;
512	int error, len;
513
514	/* Read kernel dumping information. */
515	disk->d_kd.offset = 0;
516	disk->d_kd.length = OFF_MAX;
517	len = sizeof(disk->d_kd);
518	error = g_io_getattr("GEOM::kerneldump", cp, &len, &disk->d_kd);
519	if (error)
520		disk->d_kd.di.dumper = NULL;
521	if (disk->d_kd.di.dumper == NULL)
522		G_RAID_DEBUG1(2, disk->d_softc,
523		    "Dumping not supported by %s: %d.",
524		    cp->provider->name, error);
525
526	/* Read BIO_DELETE support. */
527	error = g_getattr("GEOM::candelete", cp, &disk->d_candelete);
528	if (error)
529		disk->d_candelete = 0;
530	if (!disk->d_candelete)
531		G_RAID_DEBUG1(2, disk->d_softc,
532		    "BIO_DELETE not supported by %s: %d.",
533		    cp->provider->name, error);
534}
535
536void
537g_raid_report_disk_state(struct g_raid_disk *disk)
538{
539	struct g_raid_subdisk *sd;
540	int len, state;
541	uint32_t s;
542
543	if (disk->d_consumer == NULL)
544		return;
545	if (disk->d_state == G_RAID_DISK_S_DISABLED) {
546		s = G_STATE_ACTIVE; /* XXX */
547	} else if (disk->d_state == G_RAID_DISK_S_FAILED ||
548	    disk->d_state == G_RAID_DISK_S_STALE_FAILED) {
549		s = G_STATE_FAILED;
550	} else {
551		state = G_RAID_SUBDISK_S_ACTIVE;
552		TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
553			if (sd->sd_state < state)
554				state = sd->sd_state;
555		}
556		if (state == G_RAID_SUBDISK_S_FAILED)
557			s = G_STATE_FAILED;
558		else if (state == G_RAID_SUBDISK_S_NEW ||
559		    state == G_RAID_SUBDISK_S_REBUILD)
560			s = G_STATE_REBUILD;
561		else if (state == G_RAID_SUBDISK_S_STALE ||
562		    state == G_RAID_SUBDISK_S_RESYNC)
563			s = G_STATE_RESYNC;
564		else
565			s = G_STATE_ACTIVE;
566	}
567	len = sizeof(s);
568	g_io_getattr("GEOM::setstate", disk->d_consumer, &len, &s);
569	G_RAID_DEBUG1(2, disk->d_softc, "Disk %s state reported as %d.",
570	    g_raid_get_diskname(disk), s);
571}
572
573void
574g_raid_change_disk_state(struct g_raid_disk *disk, int state)
575{
576
577	G_RAID_DEBUG1(0, disk->d_softc, "Disk %s state changed from %s to %s.",
578	    g_raid_get_diskname(disk),
579	    g_raid_disk_state2str(disk->d_state),
580	    g_raid_disk_state2str(state));
581	disk->d_state = state;
582	g_raid_report_disk_state(disk);
583}
584
585void
586g_raid_change_subdisk_state(struct g_raid_subdisk *sd, int state)
587{
588
589	G_RAID_DEBUG1(0, sd->sd_softc,
590	    "Subdisk %s:%d-%s state changed from %s to %s.",
591	    sd->sd_volume->v_name, sd->sd_pos,
592	    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]",
593	    g_raid_subdisk_state2str(sd->sd_state),
594	    g_raid_subdisk_state2str(state));
595	sd->sd_state = state;
596	if (sd->sd_disk)
597		g_raid_report_disk_state(sd->sd_disk);
598}
599
600void
601g_raid_change_volume_state(struct g_raid_volume *vol, int state)
602{
603
604	G_RAID_DEBUG1(0, vol->v_softc,
605	    "Volume %s state changed from %s to %s.",
606	    vol->v_name,
607	    g_raid_volume_state2str(vol->v_state),
608	    g_raid_volume_state2str(state));
609	vol->v_state = state;
610}
611
612/*
613 * --- Events handling functions ---
614 * Events in geom_raid are used to maintain subdisks and volumes status
615 * from one thread to simplify locking.
616 */
617static void
618g_raid_event_free(struct g_raid_event *ep)
619{
620
621	free(ep, M_RAID);
622}
623
624int
625g_raid_event_send(void *arg, int event, int flags)
626{
627	struct g_raid_softc *sc;
628	struct g_raid_event *ep;
629	int error;
630
631	if ((flags & G_RAID_EVENT_VOLUME) != 0) {
632		sc = ((struct g_raid_volume *)arg)->v_softc;
633	} else if ((flags & G_RAID_EVENT_DISK) != 0) {
634		sc = ((struct g_raid_disk *)arg)->d_softc;
635	} else if ((flags & G_RAID_EVENT_SUBDISK) != 0) {
636		sc = ((struct g_raid_subdisk *)arg)->sd_softc;
637	} else {
638		sc = arg;
639	}
640	ep = malloc(sizeof(*ep), M_RAID,
641	    sx_xlocked(&sc->sc_lock) ? M_WAITOK : M_NOWAIT);
642	if (ep == NULL)
643		return (ENOMEM);
644	ep->e_tgt = arg;
645	ep->e_event = event;
646	ep->e_flags = flags;
647	ep->e_error = 0;
648	G_RAID_DEBUG1(4, sc, "Sending event %p. Waking up %p.", ep, sc);
649	mtx_lock(&sc->sc_queue_mtx);
650	TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next);
651	mtx_unlock(&sc->sc_queue_mtx);
652	wakeup(sc);
653
654	if ((flags & G_RAID_EVENT_WAIT) == 0)
655		return (0);
656
657	sx_assert(&sc->sc_lock, SX_XLOCKED);
658	G_RAID_DEBUG1(4, sc, "Sleeping on %p.", ep);
659	sx_xunlock(&sc->sc_lock);
660	while ((ep->e_flags & G_RAID_EVENT_DONE) == 0) {
661		mtx_lock(&sc->sc_queue_mtx);
662		MSLEEP(error, ep, &sc->sc_queue_mtx, PRIBIO | PDROP, "m:event",
663		    hz * 5);
664	}
665	error = ep->e_error;
666	g_raid_event_free(ep);
667	sx_xlock(&sc->sc_lock);
668	return (error);
669}
670
671static void
672g_raid_event_cancel(struct g_raid_softc *sc, void *tgt)
673{
674	struct g_raid_event *ep, *tmpep;
675
676	sx_assert(&sc->sc_lock, SX_XLOCKED);
677
678	mtx_lock(&sc->sc_queue_mtx);
679	TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) {
680		if (ep->e_tgt != tgt)
681			continue;
682		TAILQ_REMOVE(&sc->sc_events, ep, e_next);
683		if ((ep->e_flags & G_RAID_EVENT_WAIT) == 0)
684			g_raid_event_free(ep);
685		else {
686			ep->e_error = ECANCELED;
687			wakeup(ep);
688		}
689	}
690	mtx_unlock(&sc->sc_queue_mtx);
691}
692
693static int
694g_raid_event_check(struct g_raid_softc *sc, void *tgt)
695{
696	struct g_raid_event *ep;
697	int	res = 0;
698
699	sx_assert(&sc->sc_lock, SX_XLOCKED);
700
701	mtx_lock(&sc->sc_queue_mtx);
702	TAILQ_FOREACH(ep, &sc->sc_events, e_next) {
703		if (ep->e_tgt != tgt)
704			continue;
705		res = 1;
706		break;
707	}
708	mtx_unlock(&sc->sc_queue_mtx);
709	return (res);
710}
711
712/*
713 * Return the number of disks in given state.
714 * If state is equal to -1, count all connected disks.
715 */
716u_int
717g_raid_ndisks(struct g_raid_softc *sc, int state)
718{
719	struct g_raid_disk *disk;
720	u_int n;
721
722	sx_assert(&sc->sc_lock, SX_LOCKED);
723
724	n = 0;
725	TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
726		if (disk->d_state == state || state == -1)
727			n++;
728	}
729	return (n);
730}
731
732/*
733 * Return the number of subdisks in given state.
734 * If state is equal to -1, count all connected disks.
735 */
736u_int
737g_raid_nsubdisks(struct g_raid_volume *vol, int state)
738{
739	struct g_raid_subdisk *subdisk;
740	struct g_raid_softc *sc;
741	u_int i, n ;
742
743	sc = vol->v_softc;
744	sx_assert(&sc->sc_lock, SX_LOCKED);
745
746	n = 0;
747	for (i = 0; i < vol->v_disks_count; i++) {
748		subdisk = &vol->v_subdisks[i];
749		if ((state == -1 &&
750		     subdisk->sd_state != G_RAID_SUBDISK_S_NONE) ||
751		    subdisk->sd_state == state)
752			n++;
753	}
754	return (n);
755}
756
757/*
758 * Return the first subdisk in given state.
759 * If state is equal to -1, then the first connected disks.
760 */
761struct g_raid_subdisk *
762g_raid_get_subdisk(struct g_raid_volume *vol, int state)
763{
764	struct g_raid_subdisk *sd;
765	struct g_raid_softc *sc;
766	u_int i;
767
768	sc = vol->v_softc;
769	sx_assert(&sc->sc_lock, SX_LOCKED);
770
771	for (i = 0; i < vol->v_disks_count; i++) {
772		sd = &vol->v_subdisks[i];
773		if ((state == -1 &&
774		     sd->sd_state != G_RAID_SUBDISK_S_NONE) ||
775		    sd->sd_state == state)
776			return (sd);
777	}
778	return (NULL);
779}
780
781struct g_consumer *
782g_raid_open_consumer(struct g_raid_softc *sc, const char *name)
783{
784	struct g_consumer *cp;
785	struct g_provider *pp;
786
787	g_topology_assert();
788
789	if (strncmp(name, "/dev/", 5) == 0)
790		name += 5;
791	pp = g_provider_by_name(name);
792	if (pp == NULL)
793		return (NULL);
794	cp = g_new_consumer(sc->sc_geom);
795	if (g_attach(cp, pp) != 0) {
796		g_destroy_consumer(cp);
797		return (NULL);
798	}
799	if (g_access(cp, 1, 1, 1) != 0) {
800		g_detach(cp);
801		g_destroy_consumer(cp);
802		return (NULL);
803	}
804	return (cp);
805}
806
807static u_int
808g_raid_nrequests(struct g_raid_softc *sc, struct g_consumer *cp)
809{
810	struct bio *bp;
811	u_int nreqs = 0;
812
813	mtx_lock(&sc->sc_queue_mtx);
814	TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) {
815		if (bp->bio_from == cp)
816			nreqs++;
817	}
818	mtx_unlock(&sc->sc_queue_mtx);
819	return (nreqs);
820}
821
822u_int
823g_raid_nopens(struct g_raid_softc *sc)
824{
825	struct g_raid_volume *vol;
826	u_int opens;
827
828	opens = 0;
829	TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
830		if (vol->v_provider_open != 0)
831			opens++;
832	}
833	return (opens);
834}
835
836static int
837g_raid_consumer_is_busy(struct g_raid_softc *sc, struct g_consumer *cp)
838{
839
840	if (cp->index > 0) {
841		G_RAID_DEBUG1(2, sc,
842		    "I/O requests for %s exist, can't destroy it now.",
843		    cp->provider->name);
844		return (1);
845	}
846	if (g_raid_nrequests(sc, cp) > 0) {
847		G_RAID_DEBUG1(2, sc,
848		    "I/O requests for %s in queue, can't destroy it now.",
849		    cp->provider->name);
850		return (1);
851	}
852	return (0);
853}
854
855static void
856g_raid_destroy_consumer(void *arg, int flags __unused)
857{
858	struct g_consumer *cp;
859
860	g_topology_assert();
861
862	cp = arg;
863	G_RAID_DEBUG(1, "Consumer %s destroyed.", cp->provider->name);
864	g_detach(cp);
865	g_destroy_consumer(cp);
866}
867
868void
869g_raid_kill_consumer(struct g_raid_softc *sc, struct g_consumer *cp)
870{
871	struct g_provider *pp;
872	int retaste_wait;
873
874	g_topology_assert_not();
875
876	g_topology_lock();
877	cp->private = NULL;
878	if (g_raid_consumer_is_busy(sc, cp))
879		goto out;
880	pp = cp->provider;
881	retaste_wait = 0;
882	if (cp->acw == 1) {
883		if ((pp->geom->flags & G_GEOM_WITHER) == 0)
884			retaste_wait = 1;
885	}
886	if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0)
887		g_access(cp, -cp->acr, -cp->acw, -cp->ace);
888	if (retaste_wait) {
889		/*
890		 * After retaste event was send (inside g_access()), we can send
891		 * event to detach and destroy consumer.
892		 * A class, which has consumer to the given provider connected
893		 * will not receive retaste event for the provider.
894		 * This is the way how I ignore retaste events when I close
895		 * consumers opened for write: I detach and destroy consumer
896		 * after retaste event is sent.
897		 */
898		g_post_event(g_raid_destroy_consumer, cp, M_WAITOK, NULL);
899		goto out;
900	}
901	G_RAID_DEBUG(1, "Consumer %s destroyed.", pp->name);
902	g_detach(cp);
903	g_destroy_consumer(cp);
904out:
905	g_topology_unlock();
906}
907
908static void
909g_raid_orphan(struct g_consumer *cp)
910{
911	struct g_raid_disk *disk;
912
913	g_topology_assert();
914
915	disk = cp->private;
916	if (disk == NULL)
917		return;
918	g_raid_event_send(disk, G_RAID_DISK_E_DISCONNECTED,
919	    G_RAID_EVENT_DISK);
920}
921
922static void
923g_raid_clean(struct g_raid_volume *vol, int acw)
924{
925	struct g_raid_softc *sc;
926	int timeout;
927
928	sc = vol->v_softc;
929	g_topology_assert_not();
930	sx_assert(&sc->sc_lock, SX_XLOCKED);
931
932//	if ((sc->sc_flags & G_RAID_DEVICE_FLAG_NOFAILSYNC) != 0)
933//		return;
934	if (!vol->v_dirty)
935		return;
936	if (vol->v_writes > 0)
937		return;
938	if (acw > 0 || (acw == -1 &&
939	    vol->v_provider != NULL && vol->v_provider->acw > 0)) {
940		timeout = g_raid_clean_time - (time_uptime - vol->v_last_write);
941		if (!g_raid_shutdown && timeout > 0)
942			return;
943	}
944	vol->v_dirty = 0;
945	G_RAID_DEBUG1(1, sc, "Volume %s marked as clean.",
946	    vol->v_name);
947	g_raid_write_metadata(sc, vol, NULL, NULL);
948}
949
950static void
951g_raid_dirty(struct g_raid_volume *vol)
952{
953	struct g_raid_softc *sc;
954
955	sc = vol->v_softc;
956	g_topology_assert_not();
957	sx_assert(&sc->sc_lock, SX_XLOCKED);
958
959//	if ((sc->sc_flags & G_RAID_DEVICE_FLAG_NOFAILSYNC) != 0)
960//		return;
961	vol->v_dirty = 1;
962	G_RAID_DEBUG1(1, sc, "Volume %s marked as dirty.",
963	    vol->v_name);
964	g_raid_write_metadata(sc, vol, NULL, NULL);
965}
966
967void
968g_raid_tr_flush_common(struct g_raid_tr_object *tr, struct bio *bp)
969{
970	struct g_raid_softc *sc;
971	struct g_raid_volume *vol;
972	struct g_raid_subdisk *sd;
973	struct bio_queue_head queue;
974	struct bio *cbp;
975	int i;
976
977	vol = tr->tro_volume;
978	sc = vol->v_softc;
979
980	/*
981	 * Allocate all bios before sending any request, so we can return
982	 * ENOMEM in nice and clean way.
983	 */
984	bioq_init(&queue);
985	for (i = 0; i < vol->v_disks_count; i++) {
986		sd = &vol->v_subdisks[i];
987		if (sd->sd_state == G_RAID_SUBDISK_S_NONE ||
988		    sd->sd_state == G_RAID_SUBDISK_S_FAILED)
989			continue;
990		cbp = g_clone_bio(bp);
991		if (cbp == NULL)
992			goto failure;
993		cbp->bio_caller1 = sd;
994		bioq_insert_tail(&queue, cbp);
995	}
996	while ((cbp = bioq_takefirst(&queue)) != NULL) {
997		sd = cbp->bio_caller1;
998		cbp->bio_caller1 = NULL;
999		g_raid_subdisk_iostart(sd, cbp);
1000	}
1001	return;
1002failure:
1003	while ((cbp = bioq_takefirst(&queue)) != NULL)
1004		g_destroy_bio(cbp);
1005	if (bp->bio_error == 0)
1006		bp->bio_error = ENOMEM;
1007	g_raid_iodone(bp, bp->bio_error);
1008}
1009
1010static void
1011g_raid_tr_kerneldump_common_done(struct bio *bp)
1012{
1013
1014	bp->bio_flags |= BIO_DONE;
1015}
1016
1017int
1018g_raid_tr_kerneldump_common(struct g_raid_tr_object *tr,
1019    void *virtual, vm_offset_t physical, off_t offset, size_t length)
1020{
1021	struct g_raid_softc *sc;
1022	struct g_raid_volume *vol;
1023	struct bio bp;
1024
1025	vol = tr->tro_volume;
1026	sc = vol->v_softc;
1027
1028	bzero(&bp, sizeof(bp));
1029	bp.bio_cmd = BIO_WRITE;
1030	bp.bio_done = g_raid_tr_kerneldump_common_done;
1031	bp.bio_attribute = NULL;
1032	bp.bio_offset = offset;
1033	bp.bio_length = length;
1034	bp.bio_data = virtual;
1035	bp.bio_to = vol->v_provider;
1036
1037	g_raid_start(&bp);
1038	while (!(bp.bio_flags & BIO_DONE)) {
1039		G_RAID_DEBUG1(4, sc, "Poll...");
1040		g_raid_poll(sc);
1041		DELAY(10);
1042	}
1043
1044	return (bp.bio_error != 0 ? EIO : 0);
1045}
1046
1047static int
1048g_raid_dump(void *arg,
1049    void *virtual, vm_offset_t physical, off_t offset, size_t length)
1050{
1051	struct g_raid_volume *vol;
1052	int error;
1053
1054	vol = (struct g_raid_volume *)arg;
1055	G_RAID_DEBUG1(3, vol->v_softc, "Dumping at off %llu len %llu.",
1056	    (long long unsigned)offset, (long long unsigned)length);
1057
1058	error = G_RAID_TR_KERNELDUMP(vol->v_tr,
1059	    virtual, physical, offset, length);
1060	return (error);
1061}
1062
1063static void
1064g_raid_kerneldump(struct g_raid_softc *sc, struct bio *bp)
1065{
1066	struct g_kerneldump *gkd;
1067	struct g_provider *pp;
1068	struct g_raid_volume *vol;
1069
1070	gkd = (struct g_kerneldump*)bp->bio_data;
1071	pp = bp->bio_to;
1072	vol = pp->private;
1073	g_trace(G_T_TOPOLOGY, "g_raid_kerneldump(%s, %jd, %jd)",
1074		pp->name, (intmax_t)gkd->offset, (intmax_t)gkd->length);
1075	gkd->di.dumper = g_raid_dump;
1076	gkd->di.priv = vol;
1077	gkd->di.blocksize = vol->v_sectorsize;
1078	gkd->di.maxiosize = DFLTPHYS;
1079	gkd->di.mediaoffset = gkd->offset;
1080	if ((gkd->offset + gkd->length) > vol->v_mediasize)
1081		gkd->length = vol->v_mediasize - gkd->offset;
1082	gkd->di.mediasize = gkd->length;
1083	g_io_deliver(bp, 0);
1084}
1085
1086static void
1087g_raid_candelete(struct g_raid_softc *sc, struct bio *bp)
1088{
1089	struct g_provider *pp;
1090	struct g_raid_volume *vol;
1091	struct g_raid_subdisk *sd;
1092	int *val;
1093	int i;
1094
1095	val = (int *)bp->bio_data;
1096	pp = bp->bio_to;
1097	vol = pp->private;
1098	*val = 0;
1099	for (i = 0; i < vol->v_disks_count; i++) {
1100		sd = &vol->v_subdisks[i];
1101		if (sd->sd_state == G_RAID_SUBDISK_S_NONE)
1102			continue;
1103		if (sd->sd_disk->d_candelete) {
1104			*val = 1;
1105			break;
1106		}
1107	}
1108	g_io_deliver(bp, 0);
1109}
1110
1111static void
1112g_raid_start(struct bio *bp)
1113{
1114	struct g_raid_softc *sc;
1115
1116	sc = bp->bio_to->geom->softc;
1117	/*
1118	 * If sc == NULL or there are no valid disks, provider's error
1119	 * should be set and g_raid_start() should not be called at all.
1120	 */
1121//	KASSERT(sc != NULL && sc->sc_state == G_RAID_VOLUME_S_RUNNING,
1122//	    ("Provider's error should be set (error=%d)(mirror=%s).",
1123//	    bp->bio_to->error, bp->bio_to->name));
1124	G_RAID_LOGREQ(3, bp, "Request received.");
1125
1126	switch (bp->bio_cmd) {
1127	case BIO_READ:
1128	case BIO_WRITE:
1129	case BIO_DELETE:
1130	case BIO_FLUSH:
1131		break;
1132	case BIO_GETATTR:
1133		if (!strcmp(bp->bio_attribute, "GEOM::candelete"))
1134			g_raid_candelete(sc, bp);
1135		else if (!strcmp(bp->bio_attribute, "GEOM::kerneldump"))
1136			g_raid_kerneldump(sc, bp);
1137		else
1138			g_io_deliver(bp, EOPNOTSUPP);
1139		return;
1140	default:
1141		g_io_deliver(bp, EOPNOTSUPP);
1142		return;
1143	}
1144	mtx_lock(&sc->sc_queue_mtx);
1145	bioq_disksort(&sc->sc_queue, bp);
1146	mtx_unlock(&sc->sc_queue_mtx);
1147	if (!dumping) {
1148		G_RAID_DEBUG1(4, sc, "Waking up %p.", sc);
1149		wakeup(sc);
1150	}
1151}
1152
1153static int
1154g_raid_bio_overlaps(const struct bio *bp, off_t lstart, off_t len)
1155{
1156	/*
1157	 * 5 cases:
1158	 * (1) bp entirely below NO
1159	 * (2) bp entirely above NO
1160	 * (3) bp start below, but end in range YES
1161	 * (4) bp entirely within YES
1162	 * (5) bp starts within, ends above YES
1163	 *
1164	 * lock range 10-19 (offset 10 length 10)
1165	 * (1) 1-5: first if kicks it out
1166	 * (2) 30-35: second if kicks it out
1167	 * (3) 5-15: passes both ifs
1168	 * (4) 12-14: passes both ifs
1169	 * (5) 19-20: passes both
1170	 */
1171	off_t lend = lstart + len - 1;
1172	off_t bstart = bp->bio_offset;
1173	off_t bend = bp->bio_offset + bp->bio_length - 1;
1174
1175	if (bend < lstart)
1176		return (0);
1177	if (lend < bstart)
1178		return (0);
1179	return (1);
1180}
1181
1182static int
1183g_raid_is_in_locked_range(struct g_raid_volume *vol, const struct bio *bp)
1184{
1185	struct g_raid_lock *lp;
1186
1187	sx_assert(&vol->v_softc->sc_lock, SX_LOCKED);
1188
1189	LIST_FOREACH(lp, &vol->v_locks, l_next) {
1190		if (g_raid_bio_overlaps(bp, lp->l_offset, lp->l_length))
1191			return (1);
1192	}
1193	return (0);
1194}
1195
1196static void
1197g_raid_start_request(struct bio *bp)
1198{
1199	struct g_raid_softc *sc;
1200	struct g_raid_volume *vol;
1201
1202	sc = bp->bio_to->geom->softc;
1203	sx_assert(&sc->sc_lock, SX_LOCKED);
1204	vol = bp->bio_to->private;
1205
1206	/*
1207	 * Check to see if this item is in a locked range.  If so,
1208	 * queue it to our locked queue and return.  We'll requeue
1209	 * it when the range is unlocked.  Internal I/O for the
1210	 * rebuild/rescan/recovery process is excluded from this
1211	 * check so we can actually do the recovery.
1212	 */
1213	if (!(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL) &&
1214	    g_raid_is_in_locked_range(vol, bp)) {
1215		G_RAID_LOGREQ(3, bp, "Defer request.");
1216		bioq_insert_tail(&vol->v_locked, bp);
1217		return;
1218	}
1219
1220	/*
1221	 * If we're actually going to do the write/delete, then
1222	 * update the idle stats for the volume.
1223	 */
1224	if (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_DELETE) {
1225		if (!vol->v_dirty)
1226			g_raid_dirty(vol);
1227		vol->v_writes++;
1228	}
1229
1230	/*
1231	 * Put request onto inflight queue, so we can check if new
1232	 * synchronization requests don't collide with it.  Then tell
1233	 * the transformation layer to start the I/O.
1234	 */
1235	bioq_insert_tail(&vol->v_inflight, bp);
1236	G_RAID_LOGREQ(4, bp, "Request started");
1237	G_RAID_TR_IOSTART(vol->v_tr, bp);
1238}
1239
1240static void
1241g_raid_finish_with_locked_ranges(struct g_raid_volume *vol, struct bio *bp)
1242{
1243	off_t off, len;
1244	struct bio *nbp;
1245	struct g_raid_lock *lp;
1246
1247	vol->v_pending_lock = 0;
1248	LIST_FOREACH(lp, &vol->v_locks, l_next) {
1249		if (lp->l_pending) {
1250			off = lp->l_offset;
1251			len = lp->l_length;
1252			lp->l_pending = 0;
1253			TAILQ_FOREACH(nbp, &vol->v_inflight.queue, bio_queue) {
1254				if (g_raid_bio_overlaps(nbp, off, len))
1255					lp->l_pending++;
1256			}
1257			if (lp->l_pending) {
1258				vol->v_pending_lock = 1;
1259				G_RAID_DEBUG1(4, vol->v_softc,
1260				    "Deferred lock(%jd, %jd) has %d pending",
1261				    (intmax_t)off, (intmax_t)(off + len),
1262				    lp->l_pending);
1263				continue;
1264			}
1265			G_RAID_DEBUG1(4, vol->v_softc,
1266			    "Deferred lock of %jd to %jd completed",
1267			    (intmax_t)off, (intmax_t)(off + len));
1268			G_RAID_TR_LOCKED(vol->v_tr, lp->l_callback_arg);
1269		}
1270	}
1271}
1272
1273void
1274g_raid_iodone(struct bio *bp, int error)
1275{
1276	struct g_raid_softc *sc;
1277	struct g_raid_volume *vol;
1278
1279	sc = bp->bio_to->geom->softc;
1280	sx_assert(&sc->sc_lock, SX_LOCKED);
1281	vol = bp->bio_to->private;
1282	G_RAID_LOGREQ(3, bp, "Request done: %d.", error);
1283
1284	/* Update stats if we done write/delete. */
1285	if (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_DELETE) {
1286		vol->v_writes--;
1287		vol->v_last_write = time_uptime;
1288	}
1289
1290	bioq_remove(&vol->v_inflight, bp);
1291	if (vol->v_pending_lock && g_raid_is_in_locked_range(vol, bp))
1292		g_raid_finish_with_locked_ranges(vol, bp);
1293	getmicrouptime(&vol->v_last_done);
1294	g_io_deliver(bp, error);
1295}
1296
1297int
1298g_raid_lock_range(struct g_raid_volume *vol, off_t off, off_t len,
1299    struct bio *ignore, void *argp)
1300{
1301	struct g_raid_softc *sc;
1302	struct g_raid_lock *lp;
1303	struct bio *bp;
1304
1305	sc = vol->v_softc;
1306	lp = malloc(sizeof(*lp), M_RAID, M_WAITOK | M_ZERO);
1307	LIST_INSERT_HEAD(&vol->v_locks, lp, l_next);
1308	lp->l_offset = off;
1309	lp->l_length = len;
1310	lp->l_callback_arg = argp;
1311
1312	lp->l_pending = 0;
1313	TAILQ_FOREACH(bp, &vol->v_inflight.queue, bio_queue) {
1314		if (bp != ignore && g_raid_bio_overlaps(bp, off, len))
1315			lp->l_pending++;
1316	}
1317
1318	/*
1319	 * If there are any writes that are pending, we return EBUSY.  All
1320	 * callers will have to wait until all pending writes clear.
1321	 */
1322	if (lp->l_pending > 0) {
1323		vol->v_pending_lock = 1;
1324		G_RAID_DEBUG1(4, sc, "Locking range %jd to %jd deferred %d pend",
1325		    (intmax_t)off, (intmax_t)(off+len), lp->l_pending);
1326		return (EBUSY);
1327	}
1328	G_RAID_DEBUG1(4, sc, "Locking range %jd to %jd",
1329	    (intmax_t)off, (intmax_t)(off+len));
1330	G_RAID_TR_LOCKED(vol->v_tr, lp->l_callback_arg);
1331	return (0);
1332}
1333
1334int
1335g_raid_unlock_range(struct g_raid_volume *vol, off_t off, off_t len)
1336{
1337	struct g_raid_lock *lp;
1338	struct g_raid_softc *sc;
1339	struct bio *bp;
1340
1341	sc = vol->v_softc;
1342	LIST_FOREACH(lp, &vol->v_locks, l_next) {
1343		if (lp->l_offset == off && lp->l_length == len) {
1344			LIST_REMOVE(lp, l_next);
1345			/* XXX
1346			 * Right now we just put them all back on the queue
1347			 * and hope for the best.  We hope this because any
1348			 * locked ranges will go right back on this list
1349			 * when the worker thread runs.
1350			 * XXX
1351			 */
1352			G_RAID_DEBUG1(4, sc, "Unlocked %jd to %jd",
1353			    (intmax_t)lp->l_offset,
1354			    (intmax_t)(lp->l_offset+lp->l_length));
1355			mtx_lock(&sc->sc_queue_mtx);
1356			while ((bp = bioq_takefirst(&vol->v_locked)) != NULL)
1357				bioq_disksort(&sc->sc_queue, bp);
1358			mtx_unlock(&sc->sc_queue_mtx);
1359			free(lp, M_RAID);
1360			return (0);
1361		}
1362	}
1363	return (EINVAL);
1364}
1365
1366void
1367g_raid_subdisk_iostart(struct g_raid_subdisk *sd, struct bio *bp)
1368{
1369	struct g_consumer *cp;
1370	struct g_raid_disk *disk, *tdisk;
1371
1372	bp->bio_caller1 = sd;
1373
1374	/*
1375	 * Make sure that the disk is present. Generally it is a task of
1376	 * transformation layers to not send requests to absent disks, but
1377	 * it is better to be safe and report situation then sorry.
1378	 */
1379	if (sd->sd_disk == NULL) {
1380		G_RAID_LOGREQ(0, bp, "Warning! I/O request to an absent disk!");
1381nodisk:
1382		bp->bio_from = NULL;
1383		bp->bio_to = NULL;
1384		bp->bio_error = ENXIO;
1385		g_raid_disk_done(bp);
1386		return;
1387	}
1388	disk = sd->sd_disk;
1389	if (disk->d_state != G_RAID_DISK_S_ACTIVE &&
1390	    disk->d_state != G_RAID_DISK_S_FAILED) {
1391		G_RAID_LOGREQ(0, bp, "Warning! I/O request to a disk in a "
1392		    "wrong state (%s)!", g_raid_disk_state2str(disk->d_state));
1393		goto nodisk;
1394	}
1395
1396	cp = disk->d_consumer;
1397	bp->bio_from = cp;
1398	bp->bio_to = cp->provider;
1399	cp->index++;
1400
1401	/* Update average disks load. */
1402	TAILQ_FOREACH(tdisk, &sd->sd_softc->sc_disks, d_next) {
1403		if (tdisk->d_consumer == NULL)
1404			tdisk->d_load = 0;
1405		else
1406			tdisk->d_load = (tdisk->d_consumer->index *
1407			    G_RAID_SUBDISK_LOAD_SCALE + tdisk->d_load * 7) / 8;
1408	}
1409
1410	disk->d_last_offset = bp->bio_offset + bp->bio_length;
1411	if (dumping) {
1412		G_RAID_LOGREQ(3, bp, "Sending dumping request.");
1413		if (bp->bio_cmd == BIO_WRITE) {
1414			bp->bio_error = g_raid_subdisk_kerneldump(sd,
1415			    bp->bio_data, 0, bp->bio_offset, bp->bio_length);
1416		} else
1417			bp->bio_error = EOPNOTSUPP;
1418		g_raid_disk_done(bp);
1419	} else {
1420		bp->bio_done = g_raid_disk_done;
1421		bp->bio_offset += sd->sd_offset;
1422		G_RAID_LOGREQ(3, bp, "Sending request.");
1423		g_io_request(bp, cp);
1424	}
1425}
1426
1427int
1428g_raid_subdisk_kerneldump(struct g_raid_subdisk *sd,
1429    void *virtual, vm_offset_t physical, off_t offset, size_t length)
1430{
1431
1432	if (sd->sd_disk == NULL)
1433		return (ENXIO);
1434	if (sd->sd_disk->d_kd.di.dumper == NULL)
1435		return (EOPNOTSUPP);
1436	return (dump_write(&sd->sd_disk->d_kd.di,
1437	    virtual, physical,
1438	    sd->sd_disk->d_kd.di.mediaoffset + sd->sd_offset + offset,
1439	    length));
1440}
1441
1442static void
1443g_raid_disk_done(struct bio *bp)
1444{
1445	struct g_raid_softc *sc;
1446	struct g_raid_subdisk *sd;
1447
1448	sd = bp->bio_caller1;
1449	sc = sd->sd_softc;
1450	mtx_lock(&sc->sc_queue_mtx);
1451	bioq_disksort(&sc->sc_queue, bp);
1452	mtx_unlock(&sc->sc_queue_mtx);
1453	if (!dumping)
1454		wakeup(sc);
1455}
1456
1457static void
1458g_raid_disk_done_request(struct bio *bp)
1459{
1460	struct g_raid_softc *sc;
1461	struct g_raid_disk *disk;
1462	struct g_raid_subdisk *sd;
1463	struct g_raid_volume *vol;
1464
1465	g_topology_assert_not();
1466
1467	G_RAID_LOGREQ(3, bp, "Disk request done: %d.", bp->bio_error);
1468	sd = bp->bio_caller1;
1469	sc = sd->sd_softc;
1470	vol = sd->sd_volume;
1471	if (bp->bio_from != NULL) {
1472		bp->bio_from->index--;
1473		disk = bp->bio_from->private;
1474		if (disk == NULL)
1475			g_raid_kill_consumer(sc, bp->bio_from);
1476	}
1477	bp->bio_offset -= sd->sd_offset;
1478
1479	G_RAID_TR_IODONE(vol->v_tr, sd, bp);
1480}
1481
1482static void
1483g_raid_handle_event(struct g_raid_softc *sc, struct g_raid_event *ep)
1484{
1485
1486	if ((ep->e_flags & G_RAID_EVENT_VOLUME) != 0)
1487		ep->e_error = g_raid_update_volume(ep->e_tgt, ep->e_event);
1488	else if ((ep->e_flags & G_RAID_EVENT_DISK) != 0)
1489		ep->e_error = g_raid_update_disk(ep->e_tgt, ep->e_event);
1490	else if ((ep->e_flags & G_RAID_EVENT_SUBDISK) != 0)
1491		ep->e_error = g_raid_update_subdisk(ep->e_tgt, ep->e_event);
1492	else
1493		ep->e_error = g_raid_update_node(ep->e_tgt, ep->e_event);
1494	if ((ep->e_flags & G_RAID_EVENT_WAIT) == 0) {
1495		KASSERT(ep->e_error == 0,
1496		    ("Error cannot be handled."));
1497		g_raid_event_free(ep);
1498	} else {
1499		ep->e_flags |= G_RAID_EVENT_DONE;
1500		G_RAID_DEBUG1(4, sc, "Waking up %p.", ep);
1501		mtx_lock(&sc->sc_queue_mtx);
1502		wakeup(ep);
1503		mtx_unlock(&sc->sc_queue_mtx);
1504	}
1505}
1506
1507/*
1508 * Worker thread.
1509 */
1510static void
1511g_raid_worker(void *arg)
1512{
1513	struct g_raid_softc *sc;
1514	struct g_raid_event *ep;
1515	struct g_raid_volume *vol;
1516	struct bio *bp;
1517	struct timeval now, t;
1518	int timeout, rv;
1519
1520	sc = arg;
1521	thread_lock(curthread);
1522	sched_prio(curthread, PRIBIO);
1523	thread_unlock(curthread);
1524
1525	sx_xlock(&sc->sc_lock);
1526	for (;;) {
1527		mtx_lock(&sc->sc_queue_mtx);
1528		/*
1529		 * First take a look at events.
1530		 * This is important to handle events before any I/O requests.
1531		 */
1532		bp = NULL;
1533		vol = NULL;
1534		rv = 0;
1535		ep = TAILQ_FIRST(&sc->sc_events);
1536		if (ep != NULL)
1537			TAILQ_REMOVE(&sc->sc_events, ep, e_next);
1538		else if ((bp = bioq_takefirst(&sc->sc_queue)) != NULL)
1539			;
1540		else {
1541			getmicrouptime(&now);
1542			t = now;
1543			TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
1544				if (bioq_first(&vol->v_inflight) == NULL &&
1545				    vol->v_tr &&
1546				    timevalcmp(&vol->v_last_done, &t, < ))
1547					t = vol->v_last_done;
1548			}
1549			timevalsub(&t, &now);
1550			timeout = g_raid_idle_threshold +
1551			    t.tv_sec * 1000000 + t.tv_usec;
1552			if (timeout > 0) {
1553				/*
1554				 * Two steps to avoid overflows at HZ=1000
1555				 * and idle timeouts > 2.1s.  Some rounding
1556				 * errors can occur, but they are < 1tick,
1557				 * which is deemed to be close enough for
1558				 * this purpose.
1559				 */
1560				int micpertic = 1000000 / hz;
1561				timeout = (timeout + micpertic - 1) / micpertic;
1562				sx_xunlock(&sc->sc_lock);
1563				MSLEEP(rv, sc, &sc->sc_queue_mtx,
1564				    PRIBIO | PDROP, "-", timeout);
1565				sx_xlock(&sc->sc_lock);
1566				goto process;
1567			} else
1568				rv = EWOULDBLOCK;
1569		}
1570		mtx_unlock(&sc->sc_queue_mtx);
1571process:
1572		if (ep != NULL) {
1573			g_raid_handle_event(sc, ep);
1574		} else if (bp != NULL) {
1575			if (bp->bio_to != NULL &&
1576			    bp->bio_to->geom == sc->sc_geom)
1577				g_raid_start_request(bp);
1578			else
1579				g_raid_disk_done_request(bp);
1580		} else if (rv == EWOULDBLOCK) {
1581			TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
1582				g_raid_clean(vol, -1);
1583				if (bioq_first(&vol->v_inflight) == NULL &&
1584				    vol->v_tr) {
1585					t.tv_sec = g_raid_idle_threshold / 1000000;
1586					t.tv_usec = g_raid_idle_threshold % 1000000;
1587					timevaladd(&t, &vol->v_last_done);
1588					getmicrouptime(&now);
1589					if (timevalcmp(&t, &now, <= )) {
1590						G_RAID_TR_IDLE(vol->v_tr);
1591						vol->v_last_done = now;
1592					}
1593				}
1594			}
1595		}
1596		if (sc->sc_stopping == G_RAID_DESTROY_HARD)
1597			g_raid_destroy_node(sc, 1);	/* May not return. */
1598	}
1599}
1600
1601static void
1602g_raid_poll(struct g_raid_softc *sc)
1603{
1604	struct g_raid_event *ep;
1605	struct bio *bp;
1606
1607	sx_xlock(&sc->sc_lock);
1608	mtx_lock(&sc->sc_queue_mtx);
1609	/*
1610	 * First take a look at events.
1611	 * This is important to handle events before any I/O requests.
1612	 */
1613	ep = TAILQ_FIRST(&sc->sc_events);
1614	if (ep != NULL) {
1615		TAILQ_REMOVE(&sc->sc_events, ep, e_next);
1616		mtx_unlock(&sc->sc_queue_mtx);
1617		g_raid_handle_event(sc, ep);
1618		goto out;
1619	}
1620	bp = bioq_takefirst(&sc->sc_queue);
1621	if (bp != NULL) {
1622		mtx_unlock(&sc->sc_queue_mtx);
1623		if (bp->bio_from == NULL ||
1624		    bp->bio_from->geom != sc->sc_geom)
1625			g_raid_start_request(bp);
1626		else
1627			g_raid_disk_done_request(bp);
1628	}
1629out:
1630	sx_xunlock(&sc->sc_lock);
1631}
1632
1633static void
1634g_raid_launch_provider(struct g_raid_volume *vol)
1635{
1636	struct g_raid_disk *disk;
1637	struct g_raid_subdisk *sd;
1638	struct g_raid_softc *sc;
1639	struct g_provider *pp;
1640	char name[G_RAID_MAX_VOLUMENAME];
1641	char   announce_buf[80], buf1[32];
1642	off_t off;
1643	int i;
1644
1645	sc = vol->v_softc;
1646	sx_assert(&sc->sc_lock, SX_LOCKED);
1647
1648	g_topology_lock();
1649	/* Try to name provider with volume name. */
1650	snprintf(name, sizeof(name), "raid/%s", vol->v_name);
1651	if (g_raid_name_format == 0 || vol->v_name[0] == 0 ||
1652	    g_provider_by_name(name) != NULL) {
1653		/* Otherwise use sequential volume number. */
1654		snprintf(name, sizeof(name), "raid/r%d", vol->v_global_id);
1655	}
1656
1657	/*
1658	 * Create a /dev/ar%d that the old ataraid(4) stack once
1659	 * created as an alias for /dev/raid/r%d if requested.
1660	 * This helps going from stable/7 ataraid devices to newer
1661	 * FreeBSD releases. sbruno 07 MAY 2013
1662	 */
1663
1664        if (ar_legacy_aliases) {
1665		snprintf(announce_buf, sizeof(announce_buf),
1666                        "kern.devalias.%s", name);
1667                snprintf(buf1, sizeof(buf1),
1668                        "ar%d", vol->v_global_id);
1669                setenv(announce_buf, buf1);
1670        }
1671
1672	pp = g_new_providerf(sc->sc_geom, "%s", name);
1673	if (vol->v_tr->tro_class->trc_accept_unmapped) {
1674		pp->flags |= G_PF_ACCEPT_UNMAPPED;
1675		for (i = 0; i < vol->v_disks_count; i++) {
1676			sd = &vol->v_subdisks[i];
1677			if (sd->sd_state == G_RAID_SUBDISK_S_NONE)
1678				continue;
1679			if ((sd->sd_disk->d_consumer->provider->flags &
1680			    G_PF_ACCEPT_UNMAPPED) == 0)
1681				pp->flags &= ~G_PF_ACCEPT_UNMAPPED;
1682		}
1683	}
1684	pp->private = vol;
1685	pp->mediasize = vol->v_mediasize;
1686	pp->sectorsize = vol->v_sectorsize;
1687	pp->stripesize = 0;
1688	pp->stripeoffset = 0;
1689	if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 ||
1690	    vol->v_raid_level == G_RAID_VOLUME_RL_RAID3 ||
1691	    vol->v_raid_level == G_RAID_VOLUME_RL_SINGLE ||
1692	    vol->v_raid_level == G_RAID_VOLUME_RL_CONCAT) {
1693		if ((disk = vol->v_subdisks[0].sd_disk) != NULL &&
1694		    disk->d_consumer != NULL &&
1695		    disk->d_consumer->provider != NULL) {
1696			pp->stripesize = disk->d_consumer->provider->stripesize;
1697			off = disk->d_consumer->provider->stripeoffset;
1698			pp->stripeoffset = off + vol->v_subdisks[0].sd_offset;
1699			if (off > 0)
1700				pp->stripeoffset %= off;
1701		}
1702		if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID3) {
1703			pp->stripesize *= (vol->v_disks_count - 1);
1704			pp->stripeoffset *= (vol->v_disks_count - 1);
1705		}
1706	} else
1707		pp->stripesize = vol->v_strip_size;
1708	vol->v_provider = pp;
1709	g_error_provider(pp, 0);
1710	g_topology_unlock();
1711	G_RAID_DEBUG1(0, sc, "Provider %s for volume %s created.",
1712	    pp->name, vol->v_name);
1713}
1714
1715static void
1716g_raid_destroy_provider(struct g_raid_volume *vol)
1717{
1718	struct g_raid_softc *sc;
1719	struct g_provider *pp;
1720	struct bio *bp, *tmp;
1721
1722	g_topology_assert_not();
1723	sc = vol->v_softc;
1724	pp = vol->v_provider;
1725	KASSERT(pp != NULL, ("NULL provider (volume=%s).", vol->v_name));
1726
1727	g_topology_lock();
1728	g_error_provider(pp, ENXIO);
1729	mtx_lock(&sc->sc_queue_mtx);
1730	TAILQ_FOREACH_SAFE(bp, &sc->sc_queue.queue, bio_queue, tmp) {
1731		if (bp->bio_to != pp)
1732			continue;
1733		bioq_remove(&sc->sc_queue, bp);
1734		g_io_deliver(bp, ENXIO);
1735	}
1736	mtx_unlock(&sc->sc_queue_mtx);
1737	G_RAID_DEBUG1(0, sc, "Provider %s for volume %s destroyed.",
1738	    pp->name, vol->v_name);
1739	g_wither_provider(pp, ENXIO);
1740	g_topology_unlock();
1741	vol->v_provider = NULL;
1742}
1743
1744/*
1745 * Update device state.
1746 */
1747static int
1748g_raid_update_volume(struct g_raid_volume *vol, u_int event)
1749{
1750	struct g_raid_softc *sc;
1751
1752	sc = vol->v_softc;
1753	sx_assert(&sc->sc_lock, SX_XLOCKED);
1754
1755	G_RAID_DEBUG1(2, sc, "Event %s for volume %s.",
1756	    g_raid_volume_event2str(event),
1757	    vol->v_name);
1758	switch (event) {
1759	case G_RAID_VOLUME_E_DOWN:
1760		if (vol->v_provider != NULL)
1761			g_raid_destroy_provider(vol);
1762		break;
1763	case G_RAID_VOLUME_E_UP:
1764		if (vol->v_provider == NULL)
1765			g_raid_launch_provider(vol);
1766		break;
1767	case G_RAID_VOLUME_E_START:
1768		if (vol->v_tr)
1769			G_RAID_TR_START(vol->v_tr);
1770		return (0);
1771	default:
1772		if (sc->sc_md)
1773			G_RAID_MD_VOLUME_EVENT(sc->sc_md, vol, event);
1774		return (0);
1775	}
1776
1777	/* Manage root mount release. */
1778	if (vol->v_starting) {
1779		vol->v_starting = 0;
1780		G_RAID_DEBUG1(1, sc, "root_mount_rel %p", vol->v_rootmount);
1781		root_mount_rel(vol->v_rootmount);
1782		vol->v_rootmount = NULL;
1783	}
1784	if (vol->v_stopping && vol->v_provider_open == 0)
1785		g_raid_destroy_volume(vol);
1786	return (0);
1787}
1788
1789/*
1790 * Update subdisk state.
1791 */
1792static int
1793g_raid_update_subdisk(struct g_raid_subdisk *sd, u_int event)
1794{
1795	struct g_raid_softc *sc;
1796	struct g_raid_volume *vol;
1797
1798	sc = sd->sd_softc;
1799	vol = sd->sd_volume;
1800	sx_assert(&sc->sc_lock, SX_XLOCKED);
1801
1802	G_RAID_DEBUG1(2, sc, "Event %s for subdisk %s:%d-%s.",
1803	    g_raid_subdisk_event2str(event),
1804	    vol->v_name, sd->sd_pos,
1805	    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
1806	if (vol->v_tr)
1807		G_RAID_TR_EVENT(vol->v_tr, sd, event);
1808
1809	return (0);
1810}
1811
1812/*
1813 * Update disk state.
1814 */
1815static int
1816g_raid_update_disk(struct g_raid_disk *disk, u_int event)
1817{
1818	struct g_raid_softc *sc;
1819
1820	sc = disk->d_softc;
1821	sx_assert(&sc->sc_lock, SX_XLOCKED);
1822
1823	G_RAID_DEBUG1(2, sc, "Event %s for disk %s.",
1824	    g_raid_disk_event2str(event),
1825	    g_raid_get_diskname(disk));
1826
1827	if (sc->sc_md)
1828		G_RAID_MD_EVENT(sc->sc_md, disk, event);
1829	return (0);
1830}
1831
1832/*
1833 * Node event.
1834 */
1835static int
1836g_raid_update_node(struct g_raid_softc *sc, u_int event)
1837{
1838	sx_assert(&sc->sc_lock, SX_XLOCKED);
1839
1840	G_RAID_DEBUG1(2, sc, "Event %s for the array.",
1841	    g_raid_node_event2str(event));
1842
1843	if (event == G_RAID_NODE_E_WAKE)
1844		return (0);
1845	if (sc->sc_md)
1846		G_RAID_MD_EVENT(sc->sc_md, NULL, event);
1847	return (0);
1848}
1849
1850static int
1851g_raid_access(struct g_provider *pp, int acr, int acw, int ace)
1852{
1853	struct g_raid_volume *vol;
1854	struct g_raid_softc *sc;
1855	int dcw, opens, error = 0;
1856
1857	g_topology_assert();
1858	sc = pp->geom->softc;
1859	vol = pp->private;
1860	KASSERT(sc != NULL, ("NULL softc (provider=%s).", pp->name));
1861	KASSERT(vol != NULL, ("NULL volume (provider=%s).", pp->name));
1862
1863	G_RAID_DEBUG1(2, sc, "Access request for %s: r%dw%de%d.", pp->name,
1864	    acr, acw, ace);
1865	dcw = pp->acw + acw;
1866
1867	g_topology_unlock();
1868	sx_xlock(&sc->sc_lock);
1869	/* Deny new opens while dying. */
1870	if (sc->sc_stopping != 0 && (acr > 0 || acw > 0 || ace > 0)) {
1871		error = ENXIO;
1872		goto out;
1873	}
1874	/* Deny write opens for read-only volumes. */
1875	if (vol->v_read_only && acw > 0) {
1876		error = EROFS;
1877		goto out;
1878	}
1879	if (dcw == 0)
1880		g_raid_clean(vol, dcw);
1881	vol->v_provider_open += acr + acw + ace;
1882	/* Handle delayed node destruction. */
1883	if (sc->sc_stopping == G_RAID_DESTROY_DELAYED &&
1884	    vol->v_provider_open == 0) {
1885		/* Count open volumes. */
1886		opens = g_raid_nopens(sc);
1887		if (opens == 0) {
1888			sc->sc_stopping = G_RAID_DESTROY_HARD;
1889			/* Wake up worker to make it selfdestruct. */
1890			g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0);
1891		}
1892	}
1893	/* Handle open volume destruction. */
1894	if (vol->v_stopping && vol->v_provider_open == 0)
1895		g_raid_destroy_volume(vol);
1896out:
1897	sx_xunlock(&sc->sc_lock);
1898	g_topology_lock();
1899	return (error);
1900}
1901
1902struct g_raid_softc *
1903g_raid_create_node(struct g_class *mp,
1904    const char *name, struct g_raid_md_object *md)
1905{
1906	struct g_raid_softc *sc;
1907	struct g_geom *gp;
1908	int error;
1909
1910	g_topology_assert();
1911	G_RAID_DEBUG(1, "Creating array %s.", name);
1912
1913	gp = g_new_geomf(mp, "%s", name);
1914	sc = malloc(sizeof(*sc), M_RAID, M_WAITOK | M_ZERO);
1915	gp->start = g_raid_start;
1916	gp->orphan = g_raid_orphan;
1917	gp->access = g_raid_access;
1918	gp->dumpconf = g_raid_dumpconf;
1919
1920	sc->sc_md = md;
1921	sc->sc_geom = gp;
1922	sc->sc_flags = 0;
1923	TAILQ_INIT(&sc->sc_volumes);
1924	TAILQ_INIT(&sc->sc_disks);
1925	sx_init(&sc->sc_lock, "graid:lock");
1926	mtx_init(&sc->sc_queue_mtx, "graid:queue", NULL, MTX_DEF);
1927	TAILQ_INIT(&sc->sc_events);
1928	bioq_init(&sc->sc_queue);
1929	gp->softc = sc;
1930	error = kproc_create(g_raid_worker, sc, &sc->sc_worker, 0, 0,
1931	    "g_raid %s", name);
1932	if (error != 0) {
1933		G_RAID_DEBUG(0, "Cannot create kernel thread for %s.", name);
1934		mtx_destroy(&sc->sc_queue_mtx);
1935		sx_destroy(&sc->sc_lock);
1936		g_destroy_geom(sc->sc_geom);
1937		free(sc, M_RAID);
1938		return (NULL);
1939	}
1940
1941	G_RAID_DEBUG1(0, sc, "Array %s created.", name);
1942	return (sc);
1943}
1944
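/*
 * Create a new volume in the array and assign it a unique global ID.
 */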
1945struct g_raid_volume *
1946g_raid_create_volume(struct g_raid_softc *sc, const char *name, int id)
1947{
1948	struct g_raid_volume	*vol, *vol1;
1949	int i;
1950
1951	G_RAID_DEBUG1(1, sc, "Creating volume %s.", name);
1952	vol = malloc(sizeof(*vol), M_RAID, M_WAITOK | M_ZERO);
1953	vol->v_softc = sc;
1954	strlcpy(vol->v_name, name, G_RAID_MAX_VOLUMENAME);
1955	vol->v_state = G_RAID_VOLUME_S_STARTING;
1956	vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN;
1957	vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_UNKNOWN;
1958	vol->v_rotate_parity = 1;
1959	bioq_init(&vol->v_inflight);
1960	bioq_init(&vol->v_locked);
1961	LIST_INIT(&vol->v_locks);
1962	for (i = 0; i < G_RAID_MAX_SUBDISKS; i++) {
1963		vol->v_subdisks[i].sd_softc = sc;
1964		vol->v_subdisks[i].sd_volume = vol;
1965		vol->v_subdisks[i].sd_pos = i;
1966		vol->v_subdisks[i].sd_state = G_RAID_DISK_S_NONE;
1967	}
1968
1969	/* Find free ID for this volume. */
1970	g_topology_lock();
1971	vol1 = vol;
1972	if (id >= 0) {
1973		LIST_FOREACH(vol1, &g_raid_volumes, v_global_next) {
1974			if (vol1->v_global_id == id)
1975				break;
1976		}
1977	}
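	/* Requested ID is taken or unspecified; find the first free one. */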
1978	if (vol1 != NULL) {
1979		for (id = 0; ; id++) {
1980			LIST_FOREACH(vol1, &g_raid_volumes, v_global_next) {
1981				if (vol1->v_global_id == id)
1982					break;
1983			}
1984			if (vol1 == NULL)
1985				break;
1986		}
1987	}
1988	vol->v_global_id = id;
1989	LIST_INSERT_HEAD(&g_raid_volumes, vol, v_global_next);
1990	g_topology_unlock();
1991
1992	/* Delay root mounting. */
1993	vol->v_rootmount = root_mount_hold("GRAID");
1994	G_RAID_DEBUG1(1, sc, "root_mount_hold %p", vol->v_rootmount);
1995	vol->v_starting = 1;
1996	TAILQ_INSERT_TAIL(&sc->sc_volumes, vol, v_next);
1997	return (vol);
1998}
1999
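/*
 * Create a new disk in the array.
 */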
2000struct g_raid_disk *
2001g_raid_create_disk(struct g_raid_softc *sc)
2002{
2003	struct g_raid_disk	*disk;
2004
2005	G_RAID_DEBUG1(1, sc, "Creating disk.");
2006	disk = malloc(sizeof(*disk), M_RAID, M_WAITOK | M_ZERO);
2007	disk->d_softc = sc;
2008	disk->d_state = G_RAID_DISK_S_NONE;
2009	TAILQ_INIT(&disk->d_subdisks);
2010	TAILQ_INSERT_TAIL(&sc->sc_disks, disk, d_next);
2011	return (disk);
2012}
2013
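/*
 * Start the volume by choosing a suitable transformation module.
 */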
int
g_raid_start_volume(struct g_raid_volume *vol)
2015{
2016	struct g_raid_tr_class *class;
2017	struct g_raid_tr_object *obj;
2018	int status;
2019
2020	G_RAID_DEBUG1(2, vol->v_softc, "Starting volume %s.", vol->v_name);
2021	LIST_FOREACH(class, &g_raid_tr_classes, trc_list) {
2022		if (!class->trc_enable)
2023			continue;
2024		G_RAID_DEBUG1(2, vol->v_softc,
2025		    "Tasting volume %s for %s transformation.",
2026		    vol->v_name, class->name);
2027		obj = (void *)kobj_create((kobj_class_t)class, M_RAID,
2028		    M_WAITOK);
2029		obj->tro_class = class;
2030		obj->tro_volume = vol;
2031		status = G_RAID_TR_TASTE(obj, vol);
2032		if (status != G_RAID_TR_TASTE_FAIL)
2033			break;
2034		kobj_delete((kobj_t)obj, M_RAID);
2035	}
2036	if (class == NULL) {
2037		G_RAID_DEBUG1(0, vol->v_softc,
2038		    "No transformation module found for %s.",
2039		    vol->v_name);
2040		vol->v_tr = NULL;
2041		g_raid_change_volume_state(vol, G_RAID_VOLUME_S_UNSUPPORTED);
2042		g_raid_event_send(vol, G_RAID_VOLUME_E_DOWN,
2043		    G_RAID_EVENT_VOLUME);
2044		return (-1);
2045	}
2046	G_RAID_DEBUG1(2, vol->v_softc,
2047	    "Transformation module %s chosen for %s.",
2048	    class->name, vol->v_name);
2049	vol->v_tr = obj;
2050	return (0);
2051}
2052
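/*
 * Destroy the array node together with all its volumes and disks.
 */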
2053int
2054g_raid_destroy_node(struct g_raid_softc *sc, int worker)
2055{
2056	struct g_raid_volume *vol, *tmpv;
2057	struct g_raid_disk *disk, *tmpd;
2058	int error = 0;
2059
2060	sc->sc_stopping = G_RAID_DESTROY_HARD;
2061	TAILQ_FOREACH_SAFE(vol, &sc->sc_volumes, v_next, tmpv) {
2062		if (g_raid_destroy_volume(vol))
2063			error = EBUSY;
2064	}
2065	if (error)
2066		return (error);
2067	TAILQ_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tmpd) {
2068		if (g_raid_destroy_disk(disk))
2069			error = EBUSY;
2070	}
2071	if (error)
2072		return (error);
2073	if (sc->sc_md) {
2074		G_RAID_MD_FREE(sc->sc_md);
2075		kobj_delete((kobj_t)sc->sc_md, M_RAID);
2076		sc->sc_md = NULL;
2077	}
2078	if (sc->sc_geom != NULL) {
2079		G_RAID_DEBUG1(0, sc, "Array %s destroyed.", sc->sc_name);
2080		g_topology_lock();
2081		sc->sc_geom->softc = NULL;
2082		g_wither_geom(sc->sc_geom, ENXIO);
2083		g_topology_unlock();
2084		sc->sc_geom = NULL;
2085	} else
2086		G_RAID_DEBUG(1, "Array destroyed.");
2087	if (worker) {
2088		g_raid_event_cancel(sc, sc);
2089		mtx_destroy(&sc->sc_queue_mtx);
2090		sx_xunlock(&sc->sc_lock);
2091		sx_destroy(&sc->sc_lock);
2092		wakeup(&sc->sc_stopping);
2093		free(sc, M_RAID);
2094		curthread->td_pflags &= ~TDP_GEOM;
2095		G_RAID_DEBUG(1, "Thread exiting.");
2096		kproc_exit(0);
2097	} else {
		/* Wake up worker to make it self-destruct. */
2099		g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0);
2100	}
2101	return (0);
2102}
2103
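/*
 * Destroy a volume, or return EBUSY if it is not yet ready for that.
 */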
2104int
2105g_raid_destroy_volume(struct g_raid_volume *vol)
2106{
2107	struct g_raid_softc *sc;
2108	struct g_raid_disk *disk;
2109	int i;
2110
2111	sc = vol->v_softc;
2112	G_RAID_DEBUG1(2, sc, "Destroying volume %s.", vol->v_name);
2113	vol->v_stopping = 1;
2114	if (vol->v_state != G_RAID_VOLUME_S_STOPPED) {
2115		if (vol->v_tr) {
2116			G_RAID_TR_STOP(vol->v_tr);
2117			return (EBUSY);
2118		} else
2119			vol->v_state = G_RAID_VOLUME_S_STOPPED;
2120	}
2121	if (g_raid_event_check(sc, vol) != 0)
2122		return (EBUSY);
2123	if (vol->v_provider != NULL)
2124		return (EBUSY);
2125	if (vol->v_provider_open != 0)
2126		return (EBUSY);
2127	if (vol->v_tr) {
2128		G_RAID_TR_FREE(vol->v_tr);
2129		kobj_delete((kobj_t)vol->v_tr, M_RAID);
2130		vol->v_tr = NULL;
2131	}
2132	if (vol->v_rootmount)
2133		root_mount_rel(vol->v_rootmount);
2134	g_topology_lock();
2135	LIST_REMOVE(vol, v_global_next);
2136	g_topology_unlock();
2137	TAILQ_REMOVE(&sc->sc_volumes, vol, v_next);
2138	for (i = 0; i < G_RAID_MAX_SUBDISKS; i++) {
2139		g_raid_event_cancel(sc, &vol->v_subdisks[i]);
2140		disk = vol->v_subdisks[i].sd_disk;
2141		if (disk == NULL)
2142			continue;
2143		TAILQ_REMOVE(&disk->d_subdisks, &vol->v_subdisks[i], sd_next);
2144	}
2145	G_RAID_DEBUG1(2, sc, "Volume %s destroyed.", vol->v_name);
2146	if (sc->sc_md)
2147		G_RAID_MD_FREE_VOLUME(sc->sc_md, vol);
2148	g_raid_event_cancel(sc, vol);
2149	free(vol, M_RAID);
2150	if (sc->sc_stopping == G_RAID_DESTROY_HARD) {
		/* Wake up worker to let it self-destruct. */
2152		g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0);
2153	}
2154	return (0);
2155}
2156
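/*
 * Destroy a disk after disconnecting all of its subdisks.
 */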
2157int
2158g_raid_destroy_disk(struct g_raid_disk *disk)
2159{
2160	struct g_raid_softc *sc;
2161	struct g_raid_subdisk *sd, *tmp;
2162
2163	sc = disk->d_softc;
2164	G_RAID_DEBUG1(2, sc, "Destroying disk.");
2165	if (disk->d_consumer) {
2166		g_raid_kill_consumer(sc, disk->d_consumer);
2167		disk->d_consumer = NULL;
2168	}
2169	TAILQ_FOREACH_SAFE(sd, &disk->d_subdisks, sd_next, tmp) {
2170		g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE);
2171		g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED,
2172		    G_RAID_EVENT_SUBDISK);
2173		TAILQ_REMOVE(&disk->d_subdisks, sd, sd_next);
2174		sd->sd_disk = NULL;
2175	}
2176	TAILQ_REMOVE(&sc->sc_disks, disk, d_next);
2177	if (sc->sc_md)
2178		G_RAID_MD_FREE_DISK(sc->sc_md, disk);
2179	g_raid_event_cancel(sc, disk);
2180	free(disk, M_RAID);
2181	return (0);
2182}
2183
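/*
 * Destroy the array node using the requested method (soft, delayed or hard).
 */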
2184int
2185g_raid_destroy(struct g_raid_softc *sc, int how)
2186{
2187	int error, opens;
2188
2189	g_topology_assert_not();
2190	if (sc == NULL)
2191		return (ENXIO);
2192	sx_assert(&sc->sc_lock, SX_XLOCKED);
2193
2194	/* Count open volumes. */
2195	opens = g_raid_nopens(sc);
2196
	/* React to volumes that are still open. */
2198	if (opens > 0) {
2199		switch (how) {
2200		case G_RAID_DESTROY_SOFT:
2201			G_RAID_DEBUG1(1, sc,
2202			    "%d volumes are still open.",
2203			    opens);
2204			sx_xunlock(&sc->sc_lock);
2205			return (EBUSY);
2206		case G_RAID_DESTROY_DELAYED:
2207			G_RAID_DEBUG1(1, sc,
2208			    "Array will be destroyed on last close.");
2209			sc->sc_stopping = G_RAID_DESTROY_DELAYED;
2210			sx_xunlock(&sc->sc_lock);
2211			return (EBUSY);
2212		case G_RAID_DESTROY_HARD:
2213			G_RAID_DEBUG1(1, sc,
2214			    "%d volumes are still open.",
2215			    opens);
2216		}
2217	}
2218
2219	/* Mark node for destruction. */
2220	sc->sc_stopping = G_RAID_DESTROY_HARD;
	/* Wake up worker to let it self-destruct. */
2222	g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0);
	/* Sleep until the node is destroyed. */
2224	error = sx_sleep(&sc->sc_stopping, &sc->sc_lock,
2225	    PRIBIO | PDROP, "r:destroy", hz * 3);
2226	return (error == EWOULDBLOCK ? EBUSY : 0);
2227}
2228
2229static void
2230g_raid_taste_orphan(struct g_consumer *cp)
2231{
2232
2233	KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
2234	    cp->provider->name));
2235}
2236
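/*
 * Taste a provider: probe it with every enabled metadata class.
 */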
2237static struct g_geom *
2238g_raid_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
2239{
2240	struct g_consumer *cp;
2241	struct g_geom *gp, *geom;
2242	struct g_raid_md_class *class;
2243	struct g_raid_md_object *obj;
2244	int status;
2245
2246	g_topology_assert();
2247	g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
2248	if (!g_raid_enable)
2249		return (NULL);
2250	G_RAID_DEBUG(2, "Tasting provider %s.", pp->name);
2251
2252	geom = NULL;
2253	status = G_RAID_MD_TASTE_FAIL;
2254	gp = g_new_geomf(mp, "raid:taste");
2255	/*
	 * This orphan function should never be called.
2257	 */
2258	gp->orphan = g_raid_taste_orphan;
2259	cp = g_new_consumer(gp);
2260	g_attach(cp, pp);
2261	if (g_access(cp, 1, 0, 0) != 0)
2262		goto ofail;
2263
2264	LIST_FOREACH(class, &g_raid_md_classes, mdc_list) {
2265		if (!class->mdc_enable)
2266			continue;
2267		G_RAID_DEBUG(2, "Tasting provider %s for %s metadata.",
2268		    pp->name, class->name);
2269		obj = (void *)kobj_create((kobj_class_t)class, M_RAID,
2270		    M_WAITOK);
2271		obj->mdo_class = class;
2272		status = G_RAID_MD_TASTE(obj, mp, cp, &geom);
2273		if (status != G_RAID_MD_TASTE_NEW)
2274			kobj_delete((kobj_t)obj, M_RAID);
2275		if (status != G_RAID_MD_TASTE_FAIL)
2276			break;
2277	}
2278
2279	if (status == G_RAID_MD_TASTE_FAIL)
2280		(void)g_access(cp, -1, 0, 0);
2281ofail:
2282	g_detach(cp);
2283	g_destroy_consumer(cp);
2284	g_destroy_geom(gp);
2285	G_RAID_DEBUG(2, "Tasting provider %s done.", pp->name);
2286	return (geom);
2287}
2288
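/*
 * Create a new array node for the metadata format requested by name.
 */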
2289int
2290g_raid_create_node_format(const char *format, struct gctl_req *req,
2291    struct g_geom **gp)
2292{
2293	struct g_raid_md_class *class;
2294	struct g_raid_md_object *obj;
2295	int status;
2296
2297	G_RAID_DEBUG(2, "Creating array for %s metadata.", format);
2298	LIST_FOREACH(class, &g_raid_md_classes, mdc_list) {
2299		if (strcasecmp(class->name, format) == 0)
2300			break;
2301	}
2302	if (class == NULL) {
2303		G_RAID_DEBUG(1, "No support for %s metadata.", format);
2304		return (G_RAID_MD_TASTE_FAIL);
2305	}
2306	obj = (void *)kobj_create((kobj_class_t)class, M_RAID,
2307	    M_WAITOK);
2308	obj->mdo_class = class;
2309	status = G_RAID_MD_CREATE_REQ(obj, &g_raid_class, req, gp);
2310	if (status != G_RAID_MD_TASTE_NEW)
2311		kobj_delete((kobj_t)obj, M_RAID);
2312	return (status);
2313}
2314
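/*
 * GEOM class method: try a soft destroy of the array node.
 */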
2315static int
2316g_raid_destroy_geom(struct gctl_req *req __unused,
2317    struct g_class *mp __unused, struct g_geom *gp)
2318{
2319	struct g_raid_softc *sc;
2320	int error;
2321
2322	g_topology_unlock();
2323	sc = gp->softc;
2324	sx_xlock(&sc->sc_lock);
2325	g_cancel_event(sc);
2326	error = g_raid_destroy(gp->softc, G_RAID_DESTROY_SOFT);
2327	g_topology_lock();
2328	return (error);
2329}
2330
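/*
 * Ask the metadata module to write updated on-disk metadata.
 */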
void
g_raid_write_metadata(struct g_raid_softc *sc, struct g_raid_volume *vol,
    struct g_raid_subdisk *sd, struct g_raid_disk *disk)
2333{
2334
2335	if (sc->sc_stopping == G_RAID_DESTROY_HARD)
2336		return;
2337	if (sc->sc_md)
2338		G_RAID_MD_WRITE(sc->sc_md, vol, sd, disk);
2339}
2340
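/*
 * Report a disk failure to the metadata module.
 */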
void
g_raid_fail_disk(struct g_raid_softc *sc,
    struct g_raid_subdisk *sd, struct g_raid_disk *disk)
2343{
2344
2345	if (disk == NULL)
2346		disk = sd->sd_disk;
2347	if (disk == NULL) {
		G_RAID_DEBUG1(0, sc, "Warning! Fail request for an absent disk!");
2349		return;
2350	}
2351	if (disk->d_state != G_RAID_DISK_S_ACTIVE) {
		G_RAID_DEBUG1(0, sc, "Warning! Fail request for a disk in the "
		    "wrong state (%s)!", g_raid_disk_state2str(disk->d_state));
2354		return;
2355	}
2356	if (sc->sc_md)
2357		G_RAID_MD_FAIL_DISK(sc->sc_md, sd, disk);
2358}
2359
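/*
 * Dump volume, disk and node state into the GEOM configuration XML.
 */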
2360static void
2361g_raid_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
2362    struct g_consumer *cp, struct g_provider *pp)
2363{
2364	struct g_raid_softc *sc;
2365	struct g_raid_volume *vol;
2366	struct g_raid_subdisk *sd;
2367	struct g_raid_disk *disk;
2368	int i, s;
2369
2370	g_topology_assert();
2371
2372	sc = gp->softc;
2373	if (sc == NULL)
2374		return;
2375	if (pp != NULL) {
2376		vol = pp->private;
2377		g_topology_unlock();
2378		sx_xlock(&sc->sc_lock);
2379		sbuf_printf(sb, "%s<descr>%s %s volume</descr>\n", indent,
2380		    sc->sc_md->mdo_class->name,
2381		    g_raid_volume_level2str(vol->v_raid_level,
2382		    vol->v_raid_level_qualifier));
2383		sbuf_printf(sb, "%s<Label>%s</Label>\n", indent,
2384		    vol->v_name);
2385		sbuf_printf(sb, "%s<RAIDLevel>%s</RAIDLevel>\n", indent,
2386		    g_raid_volume_level2str(vol->v_raid_level,
2387		    vol->v_raid_level_qualifier));
2388		sbuf_printf(sb,
2389		    "%s<Transformation>%s</Transformation>\n", indent,
2390		    vol->v_tr ? vol->v_tr->tro_class->name : "NONE");
2391		sbuf_printf(sb, "%s<Components>%u</Components>\n", indent,
2392		    vol->v_disks_count);
2393		sbuf_printf(sb, "%s<Strip>%u</Strip>\n", indent,
2394		    vol->v_strip_size);
2395		sbuf_printf(sb, "%s<State>%s</State>\n", indent,
2396		    g_raid_volume_state2str(vol->v_state));
2397		sbuf_printf(sb, "%s<Dirty>%s</Dirty>\n", indent,
2398		    vol->v_dirty ? "Yes" : "No");
2399		sbuf_printf(sb, "%s<Subdisks>", indent);
2400		for (i = 0; i < vol->v_disks_count; i++) {
2401			sd = &vol->v_subdisks[i];
2402			if (sd->sd_disk != NULL &&
2403			    sd->sd_disk->d_consumer != NULL) {
2404				sbuf_printf(sb, "%s ",
2405				    g_raid_get_diskname(sd->sd_disk));
2406			} else {
2407				sbuf_printf(sb, "NONE ");
2408			}
2409			sbuf_printf(sb, "(%s",
2410			    g_raid_subdisk_state2str(sd->sd_state));
2411			if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
2412			    sd->sd_state == G_RAID_SUBDISK_S_RESYNC) {
2413				sbuf_printf(sb, " %d%%",
2414				    (int)(sd->sd_rebuild_pos * 100 /
2415				     sd->sd_size));
2416			}
2417			sbuf_printf(sb, ")");
2418			if (i + 1 < vol->v_disks_count)
2419				sbuf_printf(sb, ", ");
2420		}
2421		sbuf_printf(sb, "</Subdisks>\n");
2422		sx_xunlock(&sc->sc_lock);
2423		g_topology_lock();
2424	} else if (cp != NULL) {
2425		disk = cp->private;
2426		if (disk == NULL)
2427			return;
2428		g_topology_unlock();
2429		sx_xlock(&sc->sc_lock);
2430		sbuf_printf(sb, "%s<State>%s", indent,
2431		    g_raid_disk_state2str(disk->d_state));
2432		if (!TAILQ_EMPTY(&disk->d_subdisks)) {
2433			sbuf_printf(sb, " (");
2434			TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
2435				sbuf_printf(sb, "%s",
2436				    g_raid_subdisk_state2str(sd->sd_state));
2437				if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
2438				    sd->sd_state == G_RAID_SUBDISK_S_RESYNC) {
2439					sbuf_printf(sb, " %d%%",
2440					    (int)(sd->sd_rebuild_pos * 100 /
2441					     sd->sd_size));
2442				}
2443				if (TAILQ_NEXT(sd, sd_next))
2444					sbuf_printf(sb, ", ");
2445			}
2446			sbuf_printf(sb, ")");
2447		}
2448		sbuf_printf(sb, "</State>\n");
2449		sbuf_printf(sb, "%s<Subdisks>", indent);
2450		TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
			sbuf_printf(sb, "r%d(%s):%d@%ju",
			    sd->sd_volume->v_global_id,
			    sd->sd_volume->v_name,
			    sd->sd_pos, (uintmax_t)sd->sd_offset);
2455			if (TAILQ_NEXT(sd, sd_next))
2456				sbuf_printf(sb, ", ");
2457		}
2458		sbuf_printf(sb, "</Subdisks>\n");
2459		sbuf_printf(sb, "%s<ReadErrors>%d</ReadErrors>\n", indent,
2460		    disk->d_read_errs);
2461		sx_xunlock(&sc->sc_lock);
2462		g_topology_lock();
2463	} else {
2464		g_topology_unlock();
2465		sx_xlock(&sc->sc_lock);
2466		if (sc->sc_md) {
2467			sbuf_printf(sb, "%s<Metadata>%s</Metadata>\n", indent,
2468			    sc->sc_md->mdo_class->name);
2469		}
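		/* Report the lowest of the volume states as the node state. */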
2470		if (!TAILQ_EMPTY(&sc->sc_volumes)) {
2471			s = 0xff;
2472			TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
2473				if (vol->v_state < s)
2474					s = vol->v_state;
2475			}
2476			sbuf_printf(sb, "%s<State>%s</State>\n", indent,
2477			    g_raid_volume_state2str(s));
2478		}
2479		sx_xunlock(&sc->sc_lock);
2480		g_topology_lock();
2481	}
2482}
2483
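/*
 * Shutdown handler: mark volumes clean and request delayed destruction
 * of all array nodes.
 */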
2484static void
2485g_raid_shutdown_post_sync(void *arg, int howto)
2486{
2487	struct g_class *mp;
2488	struct g_geom *gp, *gp2;
2489	struct g_raid_softc *sc;
2490	struct g_raid_volume *vol;
2491
2492	mp = arg;
2493	DROP_GIANT();
2494	g_topology_lock();
2495	g_raid_shutdown = 1;
2496	LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) {
2497		if ((sc = gp->softc) == NULL)
2498			continue;
2499		g_topology_unlock();
2500		sx_xlock(&sc->sc_lock);
2501		TAILQ_FOREACH(vol, &sc->sc_volumes, v_next)
2502			g_raid_clean(vol, -1);
2503		g_cancel_event(sc);
2504		g_raid_destroy(sc, G_RAID_DESTROY_DELAYED);
2505		g_topology_lock();
2506	}
2507	g_topology_unlock();
2508	PICKUP_GIANT();
2509}
2510
2511static void
2512g_raid_init(struct g_class *mp)
2513{
2514
2515	g_raid_post_sync = EVENTHANDLER_REGISTER(shutdown_post_sync,
2516	    g_raid_shutdown_post_sync, mp, SHUTDOWN_PRI_FIRST);
2517	if (g_raid_post_sync == NULL)
2518		G_RAID_DEBUG(0, "Warning! Cannot register shutdown event.");
2519	g_raid_started = 1;
2520}
2521
2522static void
2523g_raid_fini(struct g_class *mp)
2524{
2525
2526	if (g_raid_post_sync != NULL)
2527		EVENTHANDLER_DEREGISTER(shutdown_post_sync, g_raid_post_sync);
2528	g_raid_started = 0;
2529}
2530
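/*
 * Handle metadata class module load/unload, keeping the class list
 * sorted by priority.
 */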
2531int
2532g_raid_md_modevent(module_t mod, int type, void *arg)
2533{
2534	struct g_raid_md_class *class, *c, *nc;
2535	int error;
2536
2537	error = 0;
2538	class = arg;
2539	switch (type) {
2540	case MOD_LOAD:
2541		c = LIST_FIRST(&g_raid_md_classes);
2542		if (c == NULL || c->mdc_priority > class->mdc_priority)
2543			LIST_INSERT_HEAD(&g_raid_md_classes, class, mdc_list);
2544		else {
2545			while ((nc = LIST_NEXT(c, mdc_list)) != NULL &&
2546			    nc->mdc_priority < class->mdc_priority)
2547				c = nc;
2548			LIST_INSERT_AFTER(c, class, mdc_list);
2549		}
2550		if (g_raid_started)
2551			g_retaste(&g_raid_class);
2552		break;
2553	case MOD_UNLOAD:
2554		LIST_REMOVE(class, mdc_list);
2555		break;
2556	default:
2557		error = EOPNOTSUPP;
2558		break;
2559	}
2560
2561	return (error);
2562}
2563
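/*
 * Handle transformation class module load/unload, keeping the class list
 * sorted by priority.
 */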
2564int
2565g_raid_tr_modevent(module_t mod, int type, void *arg)
2566{
2567	struct g_raid_tr_class *class, *c, *nc;
2568	int error;
2569
2570	error = 0;
2571	class = arg;
2572	switch (type) {
2573	case MOD_LOAD:
2574		c = LIST_FIRST(&g_raid_tr_classes);
2575		if (c == NULL || c->trc_priority > class->trc_priority)
2576			LIST_INSERT_HEAD(&g_raid_tr_classes, class, trc_list);
2577		else {
2578			while ((nc = LIST_NEXT(c, trc_list)) != NULL &&
2579			    nc->trc_priority < class->trc_priority)
2580				c = nc;
2581			LIST_INSERT_AFTER(c, class, trc_list);
2582		}
2583		break;
2584	case MOD_UNLOAD:
2585		LIST_REMOVE(class, trc_list);
2586		break;
2587	default:
2588		error = EOPNOTSUPP;
2589		break;
2590	}
2591
2592	return (error);
2593}
2594
/*
 * Use a local implementation of DECLARE_GEOM_CLASS(g_raid_class, g_raid)
 * to reduce module priority, allowing submodules to register first.
 */
2599static moduledata_t g_raid_mod = {
2600	"g_raid",
2601	g_modevent,
2602	&g_raid_class
2603};
2604DECLARE_MODULE(g_raid, g_raid_mod, SI_SUB_DRIVERS, SI_ORDER_THIRD);
2605MODULE_VERSION(geom_raid, 0);
2606