g_raid.c revision 280757
1/*-
2 * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27#include <sys/cdefs.h>
28__FBSDID("$FreeBSD: head/sys/geom/raid/g_raid.c 280757 2015-03-27 12:44:28Z mav $");
29
30#include <sys/param.h>
31#include <sys/systm.h>
32#include <sys/kernel.h>
33#include <sys/module.h>
34#include <sys/limits.h>
35#include <sys/lock.h>
36#include <sys/mutex.h>
37#include <sys/bio.h>
38#include <sys/sbuf.h>
39#include <sys/sysctl.h>
40#include <sys/malloc.h>
41#include <sys/eventhandler.h>
42#include <vm/uma.h>
43#include <geom/geom.h>
44#include <sys/proc.h>
45#include <sys/kthread.h>
46#include <sys/sched.h>
47#include <geom/raid/g_raid.h>
48#include "g_raid_md_if.h"
49#include "g_raid_tr_if.h"
50
51static MALLOC_DEFINE(M_RAID, "raid_data", "GEOM_RAID Data");
52
53SYSCTL_DECL(_kern_geom);
54SYSCTL_NODE(_kern_geom, OID_AUTO, raid, CTLFLAG_RW, 0, "GEOM_RAID stuff");
55int g_raid_enable = 1;
56SYSCTL_INT(_kern_geom_raid, OID_AUTO, enable, CTLFLAG_RWTUN,
57    &g_raid_enable, 0, "Enable on-disk metadata taste");
58u_int g_raid_aggressive_spare = 0;
59SYSCTL_UINT(_kern_geom_raid, OID_AUTO, aggressive_spare, CTLFLAG_RWTUN,
60    &g_raid_aggressive_spare, 0, "Use disks without metadata as spare");
61u_int g_raid_debug = 0;
62SYSCTL_UINT(_kern_geom_raid, OID_AUTO, debug, CTLFLAG_RWTUN, &g_raid_debug, 0,
63    "Debug level");
64int g_raid_read_err_thresh = 10;
65SYSCTL_UINT(_kern_geom_raid, OID_AUTO, read_err_thresh, CTLFLAG_RWTUN,
66    &g_raid_read_err_thresh, 0,
67    "Number of read errors equated to disk failure");
68u_int g_raid_start_timeout = 30;
69SYSCTL_UINT(_kern_geom_raid, OID_AUTO, start_timeout, CTLFLAG_RWTUN,
70    &g_raid_start_timeout, 0,
71    "Time to wait for all array components");
72static u_int g_raid_clean_time = 5;
73SYSCTL_UINT(_kern_geom_raid, OID_AUTO, clean_time, CTLFLAG_RWTUN,
74    &g_raid_clean_time, 0, "Mark volume as clean when idling");
75static u_int g_raid_disconnect_on_failure = 1;
76SYSCTL_UINT(_kern_geom_raid, OID_AUTO, disconnect_on_failure, CTLFLAG_RWTUN,
77    &g_raid_disconnect_on_failure, 0, "Disconnect component on I/O failure.");
78static u_int g_raid_name_format = 0;
79SYSCTL_UINT(_kern_geom_raid, OID_AUTO, name_format, CTLFLAG_RWTUN,
80    &g_raid_name_format, 0, "Providers name format.");
81static u_int g_raid_idle_threshold = 1000000;
82SYSCTL_UINT(_kern_geom_raid, OID_AUTO, idle_threshold, CTLFLAG_RWTUN,
83    &g_raid_idle_threshold, 1000000,
84    "Time in microseconds to consider a volume idle.");
85static u_int ar_legacy_aliases = 1;
86SYSCTL_INT(_kern_geom_raid, OID_AUTO, legacy_aliases, CTLFLAG_RWTUN,
87           &ar_legacy_aliases, 0, "Create aliases named as the legacy ataraid style.");
88
89
/*
 * msleep() wrapper that logs a level-4 debug message naming the sleep
 * channel before going to sleep and again after waking up.  The value
 * returned by msleep() is stored in rv.
 */
#define	MSLEEP(rv, ident, mtx, priority, wmesg, timeout)	do {	\
	G_RAID_DEBUG(4, "%s: Sleeping %p.", __func__, (ident));		\
	rv = msleep((ident), (mtx), (priority), (wmesg), (timeout));	\
	G_RAID_DEBUG(4, "%s: Woken up %p.", __func__, (ident));		\
} while (0)
95
/* List of g_raid_md_class entries. */
LIST_HEAD(, g_raid_md_class) g_raid_md_classes =
    LIST_HEAD_INITIALIZER(g_raid_md_classes);

/* List of g_raid_tr_class entries. */
LIST_HEAD(, g_raid_tr_class) g_raid_tr_classes =
    LIST_HEAD_INITIALIZER(g_raid_tr_classes);

/* List of g_raid_volume entries. */
LIST_HEAD(, g_raid_volume) g_raid_volumes =
    LIST_HEAD_INITIALIZER(g_raid_volumes);

/* Module-wide state; g_raid_shutdown is consulted by g_raid_clean(). */
static eventhandler_tag g_raid_post_sync = NULL;
static int g_raid_started = 0;
static int g_raid_shutdown = 0;

/* Forward declarations of the GEOM class methods wired up below. */
static int g_raid_destroy_geom(struct gctl_req *req, struct g_class *mp,
    struct g_geom *gp);
static g_taste_t g_raid_taste;
static void g_raid_init(struct g_class *mp);
static void g_raid_fini(struct g_class *mp);

/* GEOM class descriptor for the RAID class. */
struct g_class g_raid_class = {
	.name = G_RAID_CLASS_NAME,
	.version = G_VERSION,
	.ctlreq = g_raid_ctl,
	.taste = g_raid_taste,
	.destroy_geom = g_raid_destroy_geom,
	.init = g_raid_init,
	.fini = g_raid_fini
};

/* Forward declarations of file-local helpers. */
static void g_raid_destroy_provider(struct g_raid_volume *vol);
static int g_raid_update_disk(struct g_raid_disk *disk, u_int event);
static int g_raid_update_subdisk(struct g_raid_subdisk *subdisk, u_int event);
static int g_raid_update_volume(struct g_raid_volume *vol, u_int event);
static int g_raid_update_node(struct g_raid_softc *sc, u_int event);
static void g_raid_dumpconf(struct sbuf *sb, const char *indent,
    struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp);
static void g_raid_start(struct bio *bp);
static void g_raid_start_request(struct bio *bp);
static void g_raid_disk_done(struct bio *bp);
static void g_raid_poll(struct g_raid_softc *sc);
136
137static const char *
138g_raid_node_event2str(int event)
139{
140
141	switch (event) {
142	case G_RAID_NODE_E_WAKE:
143		return ("WAKE");
144	case G_RAID_NODE_E_START:
145		return ("START");
146	default:
147		return ("INVALID");
148	}
149}
150
151const char *
152g_raid_disk_state2str(int state)
153{
154
155	switch (state) {
156	case G_RAID_DISK_S_NONE:
157		return ("NONE");
158	case G_RAID_DISK_S_OFFLINE:
159		return ("OFFLINE");
160	case G_RAID_DISK_S_DISABLED:
161		return ("DISABLED");
162	case G_RAID_DISK_S_FAILED:
163		return ("FAILED");
164	case G_RAID_DISK_S_STALE_FAILED:
165		return ("STALE_FAILED");
166	case G_RAID_DISK_S_SPARE:
167		return ("SPARE");
168	case G_RAID_DISK_S_STALE:
169		return ("STALE");
170	case G_RAID_DISK_S_ACTIVE:
171		return ("ACTIVE");
172	default:
173		return ("INVALID");
174	}
175}
176
177static const char *
178g_raid_disk_event2str(int event)
179{
180
181	switch (event) {
182	case G_RAID_DISK_E_DISCONNECTED:
183		return ("DISCONNECTED");
184	default:
185		return ("INVALID");
186	}
187}
188
189const char *
190g_raid_subdisk_state2str(int state)
191{
192
193	switch (state) {
194	case G_RAID_SUBDISK_S_NONE:
195		return ("NONE");
196	case G_RAID_SUBDISK_S_FAILED:
197		return ("FAILED");
198	case G_RAID_SUBDISK_S_NEW:
199		return ("NEW");
200	case G_RAID_SUBDISK_S_REBUILD:
201		return ("REBUILD");
202	case G_RAID_SUBDISK_S_UNINITIALIZED:
203		return ("UNINITIALIZED");
204	case G_RAID_SUBDISK_S_STALE:
205		return ("STALE");
206	case G_RAID_SUBDISK_S_RESYNC:
207		return ("RESYNC");
208	case G_RAID_SUBDISK_S_ACTIVE:
209		return ("ACTIVE");
210	default:
211		return ("INVALID");
212	}
213}
214
215static const char *
216g_raid_subdisk_event2str(int event)
217{
218
219	switch (event) {
220	case G_RAID_SUBDISK_E_NEW:
221		return ("NEW");
222	case G_RAID_SUBDISK_E_FAILED:
223		return ("FAILED");
224	case G_RAID_SUBDISK_E_DISCONNECTED:
225		return ("DISCONNECTED");
226	default:
227		return ("INVALID");
228	}
229}
230
231const char *
232g_raid_volume_state2str(int state)
233{
234
235	switch (state) {
236	case G_RAID_VOLUME_S_STARTING:
237		return ("STARTING");
238	case G_RAID_VOLUME_S_BROKEN:
239		return ("BROKEN");
240	case G_RAID_VOLUME_S_DEGRADED:
241		return ("DEGRADED");
242	case G_RAID_VOLUME_S_SUBOPTIMAL:
243		return ("SUBOPTIMAL");
244	case G_RAID_VOLUME_S_OPTIMAL:
245		return ("OPTIMAL");
246	case G_RAID_VOLUME_S_UNSUPPORTED:
247		return ("UNSUPPORTED");
248	case G_RAID_VOLUME_S_STOPPED:
249		return ("STOPPED");
250	default:
251		return ("INVALID");
252	}
253}
254
255static const char *
256g_raid_volume_event2str(int event)
257{
258
259	switch (event) {
260	case G_RAID_VOLUME_E_UP:
261		return ("UP");
262	case G_RAID_VOLUME_E_DOWN:
263		return ("DOWN");
264	case G_RAID_VOLUME_E_START:
265		return ("START");
266	case G_RAID_VOLUME_E_STARTMD:
267		return ("STARTMD");
268	default:
269		return ("INVALID");
270	}
271}
272
273const char *
274g_raid_volume_level2str(int level, int qual)
275{
276
277	switch (level) {
278	case G_RAID_VOLUME_RL_RAID0:
279		return ("RAID0");
280	case G_RAID_VOLUME_RL_RAID1:
281		return ("RAID1");
282	case G_RAID_VOLUME_RL_RAID3:
283		if (qual == G_RAID_VOLUME_RLQ_R3P0)
284			return ("RAID3-P0");
285		if (qual == G_RAID_VOLUME_RLQ_R3PN)
286			return ("RAID3-PN");
287		return ("RAID3");
288	case G_RAID_VOLUME_RL_RAID4:
289		if (qual == G_RAID_VOLUME_RLQ_R4P0)
290			return ("RAID4-P0");
291		if (qual == G_RAID_VOLUME_RLQ_R4PN)
292			return ("RAID4-PN");
293		return ("RAID4");
294	case G_RAID_VOLUME_RL_RAID5:
295		if (qual == G_RAID_VOLUME_RLQ_R5RA)
296			return ("RAID5-RA");
297		if (qual == G_RAID_VOLUME_RLQ_R5RS)
298			return ("RAID5-RS");
299		if (qual == G_RAID_VOLUME_RLQ_R5LA)
300			return ("RAID5-LA");
301		if (qual == G_RAID_VOLUME_RLQ_R5LS)
302			return ("RAID5-LS");
303		return ("RAID5");
304	case G_RAID_VOLUME_RL_RAID6:
305		if (qual == G_RAID_VOLUME_RLQ_R6RA)
306			return ("RAID6-RA");
307		if (qual == G_RAID_VOLUME_RLQ_R6RS)
308			return ("RAID6-RS");
309		if (qual == G_RAID_VOLUME_RLQ_R6LA)
310			return ("RAID6-LA");
311		if (qual == G_RAID_VOLUME_RLQ_R6LS)
312			return ("RAID6-LS");
313		return ("RAID6");
314	case G_RAID_VOLUME_RL_RAIDMDF:
315		if (qual == G_RAID_VOLUME_RLQ_RMDFRA)
316			return ("RAIDMDF-RA");
317		if (qual == G_RAID_VOLUME_RLQ_RMDFRS)
318			return ("RAIDMDF-RS");
319		if (qual == G_RAID_VOLUME_RLQ_RMDFLA)
320			return ("RAIDMDF-LA");
321		if (qual == G_RAID_VOLUME_RLQ_RMDFLS)
322			return ("RAIDMDF-LS");
323		return ("RAIDMDF");
324	case G_RAID_VOLUME_RL_RAID1E:
325		if (qual == G_RAID_VOLUME_RLQ_R1EA)
326			return ("RAID1E-A");
327		if (qual == G_RAID_VOLUME_RLQ_R1EO)
328			return ("RAID1E-O");
329		return ("RAID1E");
330	case G_RAID_VOLUME_RL_SINGLE:
331		return ("SINGLE");
332	case G_RAID_VOLUME_RL_CONCAT:
333		return ("CONCAT");
334	case G_RAID_VOLUME_RL_RAID5E:
335		if (qual == G_RAID_VOLUME_RLQ_R5ERA)
336			return ("RAID5E-RA");
337		if (qual == G_RAID_VOLUME_RLQ_R5ERS)
338			return ("RAID5E-RS");
339		if (qual == G_RAID_VOLUME_RLQ_R5ELA)
340			return ("RAID5E-LA");
341		if (qual == G_RAID_VOLUME_RLQ_R5ELS)
342			return ("RAID5E-LS");
343		return ("RAID5E");
344	case G_RAID_VOLUME_RL_RAID5EE:
345		if (qual == G_RAID_VOLUME_RLQ_R5EERA)
346			return ("RAID5EE-RA");
347		if (qual == G_RAID_VOLUME_RLQ_R5EERS)
348			return ("RAID5EE-RS");
349		if (qual == G_RAID_VOLUME_RLQ_R5EELA)
350			return ("RAID5EE-LA");
351		if (qual == G_RAID_VOLUME_RLQ_R5EELS)
352			return ("RAID5EE-LS");
353		return ("RAID5EE");
354	case G_RAID_VOLUME_RL_RAID5R:
355		if (qual == G_RAID_VOLUME_RLQ_R5RRA)
356			return ("RAID5R-RA");
357		if (qual == G_RAID_VOLUME_RLQ_R5RRS)
358			return ("RAID5R-RS");
359		if (qual == G_RAID_VOLUME_RLQ_R5RLA)
360			return ("RAID5R-LA");
361		if (qual == G_RAID_VOLUME_RLQ_R5RLS)
362			return ("RAID5R-LS");
363		return ("RAID5E");
364	default:
365		return ("UNKNOWN");
366	}
367}
368
369int
370g_raid_volume_str2level(const char *str, int *level, int *qual)
371{
372
373	*level = G_RAID_VOLUME_RL_UNKNOWN;
374	*qual = G_RAID_VOLUME_RLQ_NONE;
375	if (strcasecmp(str, "RAID0") == 0)
376		*level = G_RAID_VOLUME_RL_RAID0;
377	else if (strcasecmp(str, "RAID1") == 0)
378		*level = G_RAID_VOLUME_RL_RAID1;
379	else if (strcasecmp(str, "RAID3-P0") == 0) {
380		*level = G_RAID_VOLUME_RL_RAID3;
381		*qual = G_RAID_VOLUME_RLQ_R3P0;
382	} else if (strcasecmp(str, "RAID3-PN") == 0 ||
383		   strcasecmp(str, "RAID3") == 0) {
384		*level = G_RAID_VOLUME_RL_RAID3;
385		*qual = G_RAID_VOLUME_RLQ_R3PN;
386	} else if (strcasecmp(str, "RAID4-P0") == 0) {
387		*level = G_RAID_VOLUME_RL_RAID4;
388		*qual = G_RAID_VOLUME_RLQ_R4P0;
389	} else if (strcasecmp(str, "RAID4-PN") == 0 ||
390		   strcasecmp(str, "RAID4") == 0) {
391		*level = G_RAID_VOLUME_RL_RAID4;
392		*qual = G_RAID_VOLUME_RLQ_R4PN;
393	} else if (strcasecmp(str, "RAID5-RA") == 0) {
394		*level = G_RAID_VOLUME_RL_RAID5;
395		*qual = G_RAID_VOLUME_RLQ_R5RA;
396	} else if (strcasecmp(str, "RAID5-RS") == 0) {
397		*level = G_RAID_VOLUME_RL_RAID5;
398		*qual = G_RAID_VOLUME_RLQ_R5RS;
399	} else if (strcasecmp(str, "RAID5") == 0 ||
400		   strcasecmp(str, "RAID5-LA") == 0) {
401		*level = G_RAID_VOLUME_RL_RAID5;
402		*qual = G_RAID_VOLUME_RLQ_R5LA;
403	} else if (strcasecmp(str, "RAID5-LS") == 0) {
404		*level = G_RAID_VOLUME_RL_RAID5;
405		*qual = G_RAID_VOLUME_RLQ_R5LS;
406	} else if (strcasecmp(str, "RAID6-RA") == 0) {
407		*level = G_RAID_VOLUME_RL_RAID6;
408		*qual = G_RAID_VOLUME_RLQ_R6RA;
409	} else if (strcasecmp(str, "RAID6-RS") == 0) {
410		*level = G_RAID_VOLUME_RL_RAID6;
411		*qual = G_RAID_VOLUME_RLQ_R6RS;
412	} else if (strcasecmp(str, "RAID6") == 0 ||
413		   strcasecmp(str, "RAID6-LA") == 0) {
414		*level = G_RAID_VOLUME_RL_RAID6;
415		*qual = G_RAID_VOLUME_RLQ_R6LA;
416	} else if (strcasecmp(str, "RAID6-LS") == 0) {
417		*level = G_RAID_VOLUME_RL_RAID6;
418		*qual = G_RAID_VOLUME_RLQ_R6LS;
419	} else if (strcasecmp(str, "RAIDMDF-RA") == 0) {
420		*level = G_RAID_VOLUME_RL_RAIDMDF;
421		*qual = G_RAID_VOLUME_RLQ_RMDFRA;
422	} else if (strcasecmp(str, "RAIDMDF-RS") == 0) {
423		*level = G_RAID_VOLUME_RL_RAIDMDF;
424		*qual = G_RAID_VOLUME_RLQ_RMDFRS;
425	} else if (strcasecmp(str, "RAIDMDF") == 0 ||
426		   strcasecmp(str, "RAIDMDF-LA") == 0) {
427		*level = G_RAID_VOLUME_RL_RAIDMDF;
428		*qual = G_RAID_VOLUME_RLQ_RMDFLA;
429	} else if (strcasecmp(str, "RAIDMDF-LS") == 0) {
430		*level = G_RAID_VOLUME_RL_RAIDMDF;
431		*qual = G_RAID_VOLUME_RLQ_RMDFLS;
432	} else if (strcasecmp(str, "RAID10") == 0 ||
433		   strcasecmp(str, "RAID1E") == 0 ||
434		   strcasecmp(str, "RAID1E-A") == 0) {
435		*level = G_RAID_VOLUME_RL_RAID1E;
436		*qual = G_RAID_VOLUME_RLQ_R1EA;
437	} else if (strcasecmp(str, "RAID1E-O") == 0) {
438		*level = G_RAID_VOLUME_RL_RAID1E;
439		*qual = G_RAID_VOLUME_RLQ_R1EO;
440	} else if (strcasecmp(str, "SINGLE") == 0)
441		*level = G_RAID_VOLUME_RL_SINGLE;
442	else if (strcasecmp(str, "CONCAT") == 0)
443		*level = G_RAID_VOLUME_RL_CONCAT;
444	else if (strcasecmp(str, "RAID5E-RA") == 0) {
445		*level = G_RAID_VOLUME_RL_RAID5E;
446		*qual = G_RAID_VOLUME_RLQ_R5ERA;
447	} else if (strcasecmp(str, "RAID5E-RS") == 0) {
448		*level = G_RAID_VOLUME_RL_RAID5E;
449		*qual = G_RAID_VOLUME_RLQ_R5ERS;
450	} else if (strcasecmp(str, "RAID5E") == 0 ||
451		   strcasecmp(str, "RAID5E-LA") == 0) {
452		*level = G_RAID_VOLUME_RL_RAID5E;
453		*qual = G_RAID_VOLUME_RLQ_R5ELA;
454	} else if (strcasecmp(str, "RAID5E-LS") == 0) {
455		*level = G_RAID_VOLUME_RL_RAID5E;
456		*qual = G_RAID_VOLUME_RLQ_R5ELS;
457	} else if (strcasecmp(str, "RAID5EE-RA") == 0) {
458		*level = G_RAID_VOLUME_RL_RAID5EE;
459		*qual = G_RAID_VOLUME_RLQ_R5EERA;
460	} else if (strcasecmp(str, "RAID5EE-RS") == 0) {
461		*level = G_RAID_VOLUME_RL_RAID5EE;
462		*qual = G_RAID_VOLUME_RLQ_R5EERS;
463	} else if (strcasecmp(str, "RAID5EE") == 0 ||
464		   strcasecmp(str, "RAID5EE-LA") == 0) {
465		*level = G_RAID_VOLUME_RL_RAID5EE;
466		*qual = G_RAID_VOLUME_RLQ_R5EELA;
467	} else if (strcasecmp(str, "RAID5EE-LS") == 0) {
468		*level = G_RAID_VOLUME_RL_RAID5EE;
469		*qual = G_RAID_VOLUME_RLQ_R5EELS;
470	} else if (strcasecmp(str, "RAID5R-RA") == 0) {
471		*level = G_RAID_VOLUME_RL_RAID5R;
472		*qual = G_RAID_VOLUME_RLQ_R5RRA;
473	} else if (strcasecmp(str, "RAID5R-RS") == 0) {
474		*level = G_RAID_VOLUME_RL_RAID5R;
475		*qual = G_RAID_VOLUME_RLQ_R5RRS;
476	} else if (strcasecmp(str, "RAID5R") == 0 ||
477		   strcasecmp(str, "RAID5R-LA") == 0) {
478		*level = G_RAID_VOLUME_RL_RAID5R;
479		*qual = G_RAID_VOLUME_RLQ_R5RLA;
480	} else if (strcasecmp(str, "RAID5R-LS") == 0) {
481		*level = G_RAID_VOLUME_RL_RAID5R;
482		*qual = G_RAID_VOLUME_RLQ_R5RLS;
483	} else
484		return (-1);
485	return (0);
486}
487
488const char *
489g_raid_get_diskname(struct g_raid_disk *disk)
490{
491
492	if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL)
493		return ("[unknown]");
494	return (disk->d_consumer->provider->name);
495}
496
497void
498g_raid_get_disk_info(struct g_raid_disk *disk)
499{
500	struct g_consumer *cp = disk->d_consumer;
501	int error, len;
502
503	/* Read kernel dumping information. */
504	disk->d_kd.offset = 0;
505	disk->d_kd.length = OFF_MAX;
506	len = sizeof(disk->d_kd);
507	error = g_io_getattr("GEOM::kerneldump", cp, &len, &disk->d_kd);
508	if (error)
509		disk->d_kd.di.dumper = NULL;
510	if (disk->d_kd.di.dumper == NULL)
511		G_RAID_DEBUG1(2, disk->d_softc,
512		    "Dumping not supported by %s: %d.",
513		    cp->provider->name, error);
514
515	/* Read BIO_DELETE support. */
516	error = g_getattr("GEOM::candelete", cp, &disk->d_candelete);
517	if (error)
518		disk->d_candelete = 0;
519	if (!disk->d_candelete)
520		G_RAID_DEBUG1(2, disk->d_softc,
521		    "BIO_DELETE not supported by %s: %d.",
522		    cp->provider->name, error);
523}
524
525void
526g_raid_report_disk_state(struct g_raid_disk *disk)
527{
528	struct g_raid_subdisk *sd;
529	int len, state;
530	uint32_t s;
531
532	if (disk->d_consumer == NULL)
533		return;
534	if (disk->d_state == G_RAID_DISK_S_DISABLED) {
535		s = G_STATE_ACTIVE; /* XXX */
536	} else if (disk->d_state == G_RAID_DISK_S_FAILED ||
537	    disk->d_state == G_RAID_DISK_S_STALE_FAILED) {
538		s = G_STATE_FAILED;
539	} else {
540		state = G_RAID_SUBDISK_S_ACTIVE;
541		TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
542			if (sd->sd_state < state)
543				state = sd->sd_state;
544		}
545		if (state == G_RAID_SUBDISK_S_FAILED)
546			s = G_STATE_FAILED;
547		else if (state == G_RAID_SUBDISK_S_NEW ||
548		    state == G_RAID_SUBDISK_S_REBUILD)
549			s = G_STATE_REBUILD;
550		else if (state == G_RAID_SUBDISK_S_STALE ||
551		    state == G_RAID_SUBDISK_S_RESYNC)
552			s = G_STATE_RESYNC;
553		else
554			s = G_STATE_ACTIVE;
555	}
556	len = sizeof(s);
557	g_io_getattr("GEOM::setstate", disk->d_consumer, &len, &s);
558	G_RAID_DEBUG1(2, disk->d_softc, "Disk %s state reported as %d.",
559	    g_raid_get_diskname(disk), s);
560}
561
562void
563g_raid_change_disk_state(struct g_raid_disk *disk, int state)
564{
565
566	G_RAID_DEBUG1(0, disk->d_softc, "Disk %s state changed from %s to %s.",
567	    g_raid_get_diskname(disk),
568	    g_raid_disk_state2str(disk->d_state),
569	    g_raid_disk_state2str(state));
570	disk->d_state = state;
571	g_raid_report_disk_state(disk);
572}
573
574void
575g_raid_change_subdisk_state(struct g_raid_subdisk *sd, int state)
576{
577
578	G_RAID_DEBUG1(0, sd->sd_softc,
579	    "Subdisk %s:%d-%s state changed from %s to %s.",
580	    sd->sd_volume->v_name, sd->sd_pos,
581	    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]",
582	    g_raid_subdisk_state2str(sd->sd_state),
583	    g_raid_subdisk_state2str(state));
584	sd->sd_state = state;
585	if (sd->sd_disk)
586		g_raid_report_disk_state(sd->sd_disk);
587}
588
589void
590g_raid_change_volume_state(struct g_raid_volume *vol, int state)
591{
592
593	G_RAID_DEBUG1(0, vol->v_softc,
594	    "Volume %s state changed from %s to %s.",
595	    vol->v_name,
596	    g_raid_volume_state2str(vol->v_state),
597	    g_raid_volume_state2str(state));
598	vol->v_state = state;
599}
600
601/*
602 * --- Events handling functions ---
603 * Events in geom_raid are used to maintain subdisks and volumes status
604 * from one thread to simplify locking.
605 */
/*
 * Release an event structure back to the M_RAID malloc type.
 */
static void
g_raid_event_free(struct g_raid_event *ep)
{

	free(ep, M_RAID);
}
612
/*
 * Queue an event for the node that owns the target and wake the node up.
 *
 * arg is interpreted according to flags: a volume, disk or subdisk when
 * G_RAID_EVENT_VOLUME, G_RAID_EVENT_DISK or G_RAID_EVENT_SUBDISK is set,
 * otherwise the softc itself.
 *
 * Returns ENOMEM if the event cannot be allocated.  Without
 * G_RAID_EVENT_WAIT it returns 0 as soon as the event is queued.  With
 * G_RAID_EVENT_WAIT the caller must hold sc_lock exclusively; the lock is
 * dropped while waiting for G_RAID_EVENT_DONE and re-taken before the
 * event's e_error is returned.
 */
int
g_raid_event_send(void *arg, int event, int flags)
{
	struct g_raid_softc *sc;
	struct g_raid_event *ep;
	int error;

	/* Find the softc that owns the event target. */
	if ((flags & G_RAID_EVENT_VOLUME) != 0) {
		sc = ((struct g_raid_volume *)arg)->v_softc;
	} else if ((flags & G_RAID_EVENT_DISK) != 0) {
		sc = ((struct g_raid_disk *)arg)->d_softc;
	} else if ((flags & G_RAID_EVENT_SUBDISK) != 0) {
		sc = ((struct g_raid_subdisk *)arg)->sd_softc;
	} else {
		sc = arg;
	}
	/* Sleep for memory only when the caller holds sc_lock exclusively. */
	ep = malloc(sizeof(*ep), M_RAID,
	    sx_xlocked(&sc->sc_lock) ? M_WAITOK : M_NOWAIT);
	if (ep == NULL)
		return (ENOMEM);
	ep->e_tgt = arg;
	ep->e_event = event;
	ep->e_flags = flags;
	ep->e_error = 0;
	G_RAID_DEBUG1(4, sc, "Sending event %p. Waking up %p.", ep, sc);
	mtx_lock(&sc->sc_queue_mtx);
	TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next);
	mtx_unlock(&sc->sc_queue_mtx);
	wakeup(sc);

	/* Fire-and-forget: the event is no longer ours to free. */
	if ((flags & G_RAID_EVENT_WAIT) == 0)
		return (0);

	sx_assert(&sc->sc_lock, SX_XLOCKED);
	G_RAID_DEBUG1(4, sc, "Sleeping on %p.", ep);
	sx_xunlock(&sc->sc_lock);
	/*
	 * The mutex is taken before every sleep and PDROP is passed to
	 * msleep(), so it is not held when the flag is re-tested.  The
	 * 5 second timeout bounds each individual sleep.
	 * NOTE(review): the DONE test is done without sc_queue_mtx; the
	 * timeout presumably covers a wakeup missed in that window.
	 */
	while ((ep->e_flags & G_RAID_EVENT_DONE) == 0) {
		mtx_lock(&sc->sc_queue_mtx);
		MSLEEP(error, ep, &sc->sc_queue_mtx, PRIBIO | PDROP, "m:event",
		    hz * 5);
	}
	/* The waiter owns the event: collect the result and free it. */
	error = ep->e_error;
	g_raid_event_free(ep);
	sx_xlock(&sc->sc_lock);
	return (error);
}
659
660static void
661g_raid_event_cancel(struct g_raid_softc *sc, void *tgt)
662{
663	struct g_raid_event *ep, *tmpep;
664
665	sx_assert(&sc->sc_lock, SX_XLOCKED);
666
667	mtx_lock(&sc->sc_queue_mtx);
668	TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) {
669		if (ep->e_tgt != tgt)
670			continue;
671		TAILQ_REMOVE(&sc->sc_events, ep, e_next);
672		if ((ep->e_flags & G_RAID_EVENT_WAIT) == 0)
673			g_raid_event_free(ep);
674		else {
675			ep->e_error = ECANCELED;
676			wakeup(ep);
677		}
678	}
679	mtx_unlock(&sc->sc_queue_mtx);
680}
681
682static int
683g_raid_event_check(struct g_raid_softc *sc, void *tgt)
684{
685	struct g_raid_event *ep;
686	int	res = 0;
687
688	sx_assert(&sc->sc_lock, SX_XLOCKED);
689
690	mtx_lock(&sc->sc_queue_mtx);
691	TAILQ_FOREACH(ep, &sc->sc_events, e_next) {
692		if (ep->e_tgt != tgt)
693			continue;
694		res = 1;
695		break;
696	}
697	mtx_unlock(&sc->sc_queue_mtx);
698	return (res);
699}
700
701/*
702 * Return the number of disks in given state.
703 * If state is equal to -1, count all connected disks.
704 */
705u_int
706g_raid_ndisks(struct g_raid_softc *sc, int state)
707{
708	struct g_raid_disk *disk;
709	u_int n;
710
711	sx_assert(&sc->sc_lock, SX_LOCKED);
712
713	n = 0;
714	TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
715		if (disk->d_state == state || state == -1)
716			n++;
717	}
718	return (n);
719}
720
721/*
722 * Return the number of subdisks in given state.
723 * If state is equal to -1, count all connected disks.
724 */
725u_int
726g_raid_nsubdisks(struct g_raid_volume *vol, int state)
727{
728	struct g_raid_subdisk *subdisk;
729	struct g_raid_softc *sc;
730	u_int i, n ;
731
732	sc = vol->v_softc;
733	sx_assert(&sc->sc_lock, SX_LOCKED);
734
735	n = 0;
736	for (i = 0; i < vol->v_disks_count; i++) {
737		subdisk = &vol->v_subdisks[i];
738		if ((state == -1 &&
739		     subdisk->sd_state != G_RAID_SUBDISK_S_NONE) ||
740		    subdisk->sd_state == state)
741			n++;
742	}
743	return (n);
744}
745
746/*
747 * Return the first subdisk in given state.
748 * If state is equal to -1, then the first connected disks.
749 */
750struct g_raid_subdisk *
751g_raid_get_subdisk(struct g_raid_volume *vol, int state)
752{
753	struct g_raid_subdisk *sd;
754	struct g_raid_softc *sc;
755	u_int i;
756
757	sc = vol->v_softc;
758	sx_assert(&sc->sc_lock, SX_LOCKED);
759
760	for (i = 0; i < vol->v_disks_count; i++) {
761		sd = &vol->v_subdisks[i];
762		if ((state == -1 &&
763		     sd->sd_state != G_RAID_SUBDISK_S_NONE) ||
764		    sd->sd_state == state)
765			return (sd);
766	}
767	return (NULL);
768}
769
770struct g_consumer *
771g_raid_open_consumer(struct g_raid_softc *sc, const char *name)
772{
773	struct g_consumer *cp;
774	struct g_provider *pp;
775
776	g_topology_assert();
777
778	if (strncmp(name, "/dev/", 5) == 0)
779		name += 5;
780	pp = g_provider_by_name(name);
781	if (pp == NULL)
782		return (NULL);
783	cp = g_new_consumer(sc->sc_geom);
784	cp->flags |= G_CF_DIRECT_RECEIVE;
785	if (g_attach(cp, pp) != 0) {
786		g_destroy_consumer(cp);
787		return (NULL);
788	}
789	if (g_access(cp, 1, 1, 1) != 0) {
790		g_detach(cp);
791		g_destroy_consumer(cp);
792		return (NULL);
793	}
794	return (cp);
795}
796
797static u_int
798g_raid_nrequests(struct g_raid_softc *sc, struct g_consumer *cp)
799{
800	struct bio *bp;
801	u_int nreqs = 0;
802
803	mtx_lock(&sc->sc_queue_mtx);
804	TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) {
805		if (bp->bio_from == cp)
806			nreqs++;
807	}
808	mtx_unlock(&sc->sc_queue_mtx);
809	return (nreqs);
810}
811
812u_int
813g_raid_nopens(struct g_raid_softc *sc)
814{
815	struct g_raid_volume *vol;
816	u_int opens;
817
818	opens = 0;
819	TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
820		if (vol->v_provider_open != 0)
821			opens++;
822	}
823	return (opens);
824}
825
826static int
827g_raid_consumer_is_busy(struct g_raid_softc *sc, struct g_consumer *cp)
828{
829
830	if (cp->index > 0) {
831		G_RAID_DEBUG1(2, sc,
832		    "I/O requests for %s exist, can't destroy it now.",
833		    cp->provider->name);
834		return (1);
835	}
836	if (g_raid_nrequests(sc, cp) > 0) {
837		G_RAID_DEBUG1(2, sc,
838		    "I/O requests for %s in queue, can't destroy it now.",
839		    cp->provider->name);
840		return (1);
841	}
842	return (0);
843}
844
845static void
846g_raid_destroy_consumer(void *arg, int flags __unused)
847{
848	struct g_consumer *cp;
849
850	g_topology_assert();
851
852	cp = arg;
853	G_RAID_DEBUG(1, "Consumer %s destroyed.", cp->provider->name);
854	g_detach(cp);
855	g_destroy_consumer(cp);
856}
857
858void
859g_raid_kill_consumer(struct g_raid_softc *sc, struct g_consumer *cp)
860{
861	struct g_provider *pp;
862	int retaste_wait;
863
864	g_topology_assert_not();
865
866	g_topology_lock();
867	cp->private = NULL;
868	if (g_raid_consumer_is_busy(sc, cp))
869		goto out;
870	pp = cp->provider;
871	retaste_wait = 0;
872	if (cp->acw == 1) {
873		if ((pp->geom->flags & G_GEOM_WITHER) == 0)
874			retaste_wait = 1;
875	}
876	if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0)
877		g_access(cp, -cp->acr, -cp->acw, -cp->ace);
878	if (retaste_wait) {
879		/*
880		 * After retaste event was send (inside g_access()), we can send
881		 * event to detach and destroy consumer.
882		 * A class, which has consumer to the given provider connected
883		 * will not receive retaste event for the provider.
884		 * This is the way how I ignore retaste events when I close
885		 * consumers opened for write: I detach and destroy consumer
886		 * after retaste event is sent.
887		 */
888		g_post_event(g_raid_destroy_consumer, cp, M_WAITOK, NULL);
889		goto out;
890	}
891	G_RAID_DEBUG(1, "Consumer %s destroyed.", pp->name);
892	g_detach(cp);
893	g_destroy_consumer(cp);
894out:
895	g_topology_unlock();
896}
897
898static void
899g_raid_orphan(struct g_consumer *cp)
900{
901	struct g_raid_disk *disk;
902
903	g_topology_assert();
904
905	disk = cp->private;
906	if (disk == NULL)
907		return;
908	g_raid_event_send(disk, G_RAID_DISK_E_DISCONNECTED,
909	    G_RAID_EVENT_DISK);
910}
911
912static void
913g_raid_clean(struct g_raid_volume *vol, int acw)
914{
915	struct g_raid_softc *sc;
916	int timeout;
917
918	sc = vol->v_softc;
919	g_topology_assert_not();
920	sx_assert(&sc->sc_lock, SX_XLOCKED);
921
922//	if ((sc->sc_flags & G_RAID_DEVICE_FLAG_NOFAILSYNC) != 0)
923//		return;
924	if (!vol->v_dirty)
925		return;
926	if (vol->v_writes > 0)
927		return;
928	if (acw > 0 || (acw == -1 &&
929	    vol->v_provider != NULL && vol->v_provider->acw > 0)) {
930		timeout = g_raid_clean_time - (time_uptime - vol->v_last_write);
931		if (!g_raid_shutdown && timeout > 0)
932			return;
933	}
934	vol->v_dirty = 0;
935	G_RAID_DEBUG1(1, sc, "Volume %s marked as clean.",
936	    vol->v_name);
937	g_raid_write_metadata(sc, vol, NULL, NULL);
938}
939
940static void
941g_raid_dirty(struct g_raid_volume *vol)
942{
943	struct g_raid_softc *sc;
944
945	sc = vol->v_softc;
946	g_topology_assert_not();
947	sx_assert(&sc->sc_lock, SX_XLOCKED);
948
949//	if ((sc->sc_flags & G_RAID_DEVICE_FLAG_NOFAILSYNC) != 0)
950//		return;
951	vol->v_dirty = 1;
952	G_RAID_DEBUG1(1, sc, "Volume %s marked as dirty.",
953	    vol->v_name);
954	g_raid_write_metadata(sc, vol, NULL, NULL);
955}
956
957void
958g_raid_tr_flush_common(struct g_raid_tr_object *tr, struct bio *bp)
959{
960	struct g_raid_softc *sc;
961	struct g_raid_volume *vol;
962	struct g_raid_subdisk *sd;
963	struct bio_queue_head queue;
964	struct bio *cbp;
965	int i;
966
967	vol = tr->tro_volume;
968	sc = vol->v_softc;
969
970	/*
971	 * Allocate all bios before sending any request, so we can return
972	 * ENOMEM in nice and clean way.
973	 */
974	bioq_init(&queue);
975	for (i = 0; i < vol->v_disks_count; i++) {
976		sd = &vol->v_subdisks[i];
977		if (sd->sd_state == G_RAID_SUBDISK_S_NONE ||
978		    sd->sd_state == G_RAID_SUBDISK_S_FAILED)
979			continue;
980		cbp = g_clone_bio(bp);
981		if (cbp == NULL)
982			goto failure;
983		cbp->bio_caller1 = sd;
984		bioq_insert_tail(&queue, cbp);
985	}
986	while ((cbp = bioq_takefirst(&queue)) != NULL) {
987		sd = cbp->bio_caller1;
988		cbp->bio_caller1 = NULL;
989		g_raid_subdisk_iostart(sd, cbp);
990	}
991	return;
992failure:
993	while ((cbp = bioq_takefirst(&queue)) != NULL)
994		g_destroy_bio(cbp);
995	if (bp->bio_error == 0)
996		bp->bio_error = ENOMEM;
997	g_raid_iodone(bp, bp->bio_error);
998}
999
/*
 * Completion callback installed by g_raid_tr_kerneldump_common(): flag
 * the bio as done so that the polling loop there can stop.
 */
static void
g_raid_tr_kerneldump_common_done(struct bio *bp)
{

	bp->bio_flags |= BIO_DONE;
}
1006
1007int
1008g_raid_tr_kerneldump_common(struct g_raid_tr_object *tr,
1009    void *virtual, vm_offset_t physical, off_t offset, size_t length)
1010{
1011	struct g_raid_softc *sc;
1012	struct g_raid_volume *vol;
1013	struct bio bp;
1014
1015	vol = tr->tro_volume;
1016	sc = vol->v_softc;
1017
1018	bzero(&bp, sizeof(bp));
1019	bp.bio_cmd = BIO_WRITE;
1020	bp.bio_done = g_raid_tr_kerneldump_common_done;
1021	bp.bio_attribute = NULL;
1022	bp.bio_offset = offset;
1023	bp.bio_length = length;
1024	bp.bio_data = virtual;
1025	bp.bio_to = vol->v_provider;
1026
1027	g_raid_start(&bp);
1028	while (!(bp.bio_flags & BIO_DONE)) {
1029		G_RAID_DEBUG1(4, sc, "Poll...");
1030		g_raid_poll(sc);
1031		DELAY(10);
1032	}
1033
1034	return (bp.bio_error != 0 ? EIO : 0);
1035}
1036
1037static int
1038g_raid_dump(void *arg,
1039    void *virtual, vm_offset_t physical, off_t offset, size_t length)
1040{
1041	struct g_raid_volume *vol;
1042	int error;
1043
1044	vol = (struct g_raid_volume *)arg;
1045	G_RAID_DEBUG1(3, vol->v_softc, "Dumping at off %llu len %llu.",
1046	    (long long unsigned)offset, (long long unsigned)length);
1047
1048	error = G_RAID_TR_KERNELDUMP(vol->v_tr,
1049	    virtual, physical, offset, length);
1050	return (error);
1051}
1052
1053static void
1054g_raid_kerneldump(struct g_raid_softc *sc, struct bio *bp)
1055{
1056	struct g_kerneldump *gkd;
1057	struct g_provider *pp;
1058	struct g_raid_volume *vol;
1059
1060	gkd = (struct g_kerneldump*)bp->bio_data;
1061	pp = bp->bio_to;
1062	vol = pp->private;
1063	g_trace(G_T_TOPOLOGY, "g_raid_kerneldump(%s, %jd, %jd)",
1064		pp->name, (intmax_t)gkd->offset, (intmax_t)gkd->length);
1065	gkd->di.dumper = g_raid_dump;
1066	gkd->di.priv = vol;
1067	gkd->di.blocksize = vol->v_sectorsize;
1068	gkd->di.maxiosize = DFLTPHYS;
1069	gkd->di.mediaoffset = gkd->offset;
1070	if ((gkd->offset + gkd->length) > vol->v_mediasize)
1071		gkd->length = vol->v_mediasize - gkd->offset;
1072	gkd->di.mediasize = gkd->length;
1073	g_io_deliver(bp, 0);
1074}
1075
1076static void
1077g_raid_candelete(struct g_raid_softc *sc, struct bio *bp)
1078{
1079	struct g_provider *pp;
1080	struct g_raid_volume *vol;
1081	struct g_raid_subdisk *sd;
1082	int *val;
1083	int i;
1084
1085	val = (int *)bp->bio_data;
1086	pp = bp->bio_to;
1087	vol = pp->private;
1088	*val = 0;
1089	for (i = 0; i < vol->v_disks_count; i++) {
1090		sd = &vol->v_subdisks[i];
1091		if (sd->sd_state == G_RAID_SUBDISK_S_NONE)
1092			continue;
1093		if (sd->sd_disk->d_candelete) {
1094			*val = 1;
1095			break;
1096		}
1097	}
1098	g_io_deliver(bp, 0);
1099}
1100
1101static void
1102g_raid_start(struct bio *bp)
1103{
1104	struct g_raid_softc *sc;
1105
1106	sc = bp->bio_to->geom->softc;
1107	/*
1108	 * If sc == NULL or there are no valid disks, provider's error
1109	 * should be set and g_raid_start() should not be called at all.
1110	 */
1111//	KASSERT(sc != NULL && sc->sc_state == G_RAID_VOLUME_S_RUNNING,
1112//	    ("Provider's error should be set (error=%d)(mirror=%s).",
1113//	    bp->bio_to->error, bp->bio_to->name));
1114	G_RAID_LOGREQ(3, bp, "Request received.");
1115
1116	switch (bp->bio_cmd) {
1117	case BIO_READ:
1118	case BIO_WRITE:
1119	case BIO_DELETE:
1120	case BIO_FLUSH:
1121		break;
1122	case BIO_GETATTR:
1123		if (!strcmp(bp->bio_attribute, "GEOM::candelete"))
1124			g_raid_candelete(sc, bp);
1125		else if (!strcmp(bp->bio_attribute, "GEOM::kerneldump"))
1126			g_raid_kerneldump(sc, bp);
1127		else
1128			g_io_deliver(bp, EOPNOTSUPP);
1129		return;
1130	default:
1131		g_io_deliver(bp, EOPNOTSUPP);
1132		return;
1133	}
1134	mtx_lock(&sc->sc_queue_mtx);
1135	bioq_insert_tail(&sc->sc_queue, bp);
1136	mtx_unlock(&sc->sc_queue_mtx);
1137	if (!dumping) {
1138		G_RAID_DEBUG1(4, sc, "Waking up %p.", sc);
1139		wakeup(sc);
1140	}
1141}
1142
1143static int
1144g_raid_bio_overlaps(const struct bio *bp, off_t lstart, off_t len)
1145{
1146	/*
1147	 * 5 cases:
1148	 * (1) bp entirely below NO
1149	 * (2) bp entirely above NO
1150	 * (3) bp start below, but end in range YES
1151	 * (4) bp entirely within YES
1152	 * (5) bp starts within, ends above YES
1153	 *
1154	 * lock range 10-19 (offset 10 length 10)
1155	 * (1) 1-5: first if kicks it out
1156	 * (2) 30-35: second if kicks it out
1157	 * (3) 5-15: passes both ifs
1158	 * (4) 12-14: passes both ifs
1159	 * (5) 19-20: passes both
1160	 */
1161	off_t lend = lstart + len - 1;
1162	off_t bstart = bp->bio_offset;
1163	off_t bend = bp->bio_offset + bp->bio_length - 1;
1164
1165	if (bend < lstart)
1166		return (0);
1167	if (lend < bstart)
1168		return (0);
1169	return (1);
1170}
1171
1172static int
1173g_raid_is_in_locked_range(struct g_raid_volume *vol, const struct bio *bp)
1174{
1175	struct g_raid_lock *lp;
1176
1177	sx_assert(&vol->v_softc->sc_lock, SX_LOCKED);
1178
1179	LIST_FOREACH(lp, &vol->v_locks, l_next) {
1180		if (g_raid_bio_overlaps(bp, lp->l_offset, lp->l_length))
1181			return (1);
1182	}
1183	return (0);
1184}
1185
1186static void
1187g_raid_start_request(struct bio *bp)
1188{
1189	struct g_raid_softc *sc;
1190	struct g_raid_volume *vol;
1191
1192	sc = bp->bio_to->geom->softc;
1193	sx_assert(&sc->sc_lock, SX_LOCKED);
1194	vol = bp->bio_to->private;
1195
1196	/*
1197	 * Check to see if this item is in a locked range.  If so,
1198	 * queue it to our locked queue and return.  We'll requeue
1199	 * it when the range is unlocked.  Internal I/O for the
1200	 * rebuild/rescan/recovery process is excluded from this
1201	 * check so we can actually do the recovery.
1202	 */
1203	if (!(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL) &&
1204	    g_raid_is_in_locked_range(vol, bp)) {
1205		G_RAID_LOGREQ(3, bp, "Defer request.");
1206		bioq_insert_tail(&vol->v_locked, bp);
1207		return;
1208	}
1209
1210	/*
1211	 * If we're actually going to do the write/delete, then
1212	 * update the idle stats for the volume.
1213	 */
1214	if (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_DELETE) {
1215		if (!vol->v_dirty)
1216			g_raid_dirty(vol);
1217		vol->v_writes++;
1218	}
1219
1220	/*
1221	 * Put request onto inflight queue, so we can check if new
1222	 * synchronization requests don't collide with it.  Then tell
1223	 * the transformation layer to start the I/O.
1224	 */
1225	bioq_insert_tail(&vol->v_inflight, bp);
1226	G_RAID_LOGREQ(4, bp, "Request started");
1227	G_RAID_TR_IOSTART(vol->v_tr, bp);
1228}
1229
1230static void
1231g_raid_finish_with_locked_ranges(struct g_raid_volume *vol, struct bio *bp)
1232{
1233	off_t off, len;
1234	struct bio *nbp;
1235	struct g_raid_lock *lp;
1236
1237	vol->v_pending_lock = 0;
1238	LIST_FOREACH(lp, &vol->v_locks, l_next) {
1239		if (lp->l_pending) {
1240			off = lp->l_offset;
1241			len = lp->l_length;
1242			lp->l_pending = 0;
1243			TAILQ_FOREACH(nbp, &vol->v_inflight.queue, bio_queue) {
1244				if (g_raid_bio_overlaps(nbp, off, len))
1245					lp->l_pending++;
1246			}
1247			if (lp->l_pending) {
1248				vol->v_pending_lock = 1;
1249				G_RAID_DEBUG1(4, vol->v_softc,
1250				    "Deferred lock(%jd, %jd) has %d pending",
1251				    (intmax_t)off, (intmax_t)(off + len),
1252				    lp->l_pending);
1253				continue;
1254			}
1255			G_RAID_DEBUG1(4, vol->v_softc,
1256			    "Deferred lock of %jd to %jd completed",
1257			    (intmax_t)off, (intmax_t)(off + len));
1258			G_RAID_TR_LOCKED(vol->v_tr, lp->l_callback_arg);
1259		}
1260	}
1261}
1262
1263void
1264g_raid_iodone(struct bio *bp, int error)
1265{
1266	struct g_raid_softc *sc;
1267	struct g_raid_volume *vol;
1268
1269	sc = bp->bio_to->geom->softc;
1270	sx_assert(&sc->sc_lock, SX_LOCKED);
1271	vol = bp->bio_to->private;
1272	G_RAID_LOGREQ(3, bp, "Request done: %d.", error);
1273
1274	/* Update stats if we done write/delete. */
1275	if (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_DELETE) {
1276		vol->v_writes--;
1277		vol->v_last_write = time_uptime;
1278	}
1279
1280	bioq_remove(&vol->v_inflight, bp);
1281	if (vol->v_pending_lock && g_raid_is_in_locked_range(vol, bp))
1282		g_raid_finish_with_locked_ranges(vol, bp);
1283	getmicrouptime(&vol->v_last_done);
1284	g_io_deliver(bp, error);
1285}
1286
1287int
1288g_raid_lock_range(struct g_raid_volume *vol, off_t off, off_t len,
1289    struct bio *ignore, void *argp)
1290{
1291	struct g_raid_softc *sc;
1292	struct g_raid_lock *lp;
1293	struct bio *bp;
1294
1295	sc = vol->v_softc;
1296	lp = malloc(sizeof(*lp), M_RAID, M_WAITOK | M_ZERO);
1297	LIST_INSERT_HEAD(&vol->v_locks, lp, l_next);
1298	lp->l_offset = off;
1299	lp->l_length = len;
1300	lp->l_callback_arg = argp;
1301
1302	lp->l_pending = 0;
1303	TAILQ_FOREACH(bp, &vol->v_inflight.queue, bio_queue) {
1304		if (bp != ignore && g_raid_bio_overlaps(bp, off, len))
1305			lp->l_pending++;
1306	}
1307
1308	/*
1309	 * If there are any writes that are pending, we return EBUSY.  All
1310	 * callers will have to wait until all pending writes clear.
1311	 */
1312	if (lp->l_pending > 0) {
1313		vol->v_pending_lock = 1;
1314		G_RAID_DEBUG1(4, sc, "Locking range %jd to %jd deferred %d pend",
1315		    (intmax_t)off, (intmax_t)(off+len), lp->l_pending);
1316		return (EBUSY);
1317	}
1318	G_RAID_DEBUG1(4, sc, "Locking range %jd to %jd",
1319	    (intmax_t)off, (intmax_t)(off+len));
1320	G_RAID_TR_LOCKED(vol->v_tr, lp->l_callback_arg);
1321	return (0);
1322}
1323
1324int
1325g_raid_unlock_range(struct g_raid_volume *vol, off_t off, off_t len)
1326{
1327	struct g_raid_lock *lp;
1328	struct g_raid_softc *sc;
1329	struct bio *bp;
1330
1331	sc = vol->v_softc;
1332	LIST_FOREACH(lp, &vol->v_locks, l_next) {
1333		if (lp->l_offset == off && lp->l_length == len) {
1334			LIST_REMOVE(lp, l_next);
1335			/* XXX
1336			 * Right now we just put them all back on the queue
1337			 * and hope for the best.  We hope this because any
1338			 * locked ranges will go right back on this list
1339			 * when the worker thread runs.
1340			 * XXX
1341			 */
1342			G_RAID_DEBUG1(4, sc, "Unlocked %jd to %jd",
1343			    (intmax_t)lp->l_offset,
1344			    (intmax_t)(lp->l_offset+lp->l_length));
1345			mtx_lock(&sc->sc_queue_mtx);
1346			while ((bp = bioq_takefirst(&vol->v_locked)) != NULL)
1347				bioq_insert_tail(&sc->sc_queue, bp);
1348			mtx_unlock(&sc->sc_queue_mtx);
1349			free(lp, M_RAID);
1350			return (0);
1351		}
1352	}
1353	return (EINVAL);
1354}
1355
1356void
1357g_raid_subdisk_iostart(struct g_raid_subdisk *sd, struct bio *bp)
1358{
1359	struct g_consumer *cp;
1360	struct g_raid_disk *disk, *tdisk;
1361
1362	bp->bio_caller1 = sd;
1363
1364	/*
1365	 * Make sure that the disk is present. Generally it is a task of
1366	 * transformation layers to not send requests to absent disks, but
1367	 * it is better to be safe and report situation then sorry.
1368	 */
1369	if (sd->sd_disk == NULL) {
1370		G_RAID_LOGREQ(0, bp, "Warning! I/O request to an absent disk!");
1371nodisk:
1372		bp->bio_from = NULL;
1373		bp->bio_to = NULL;
1374		bp->bio_error = ENXIO;
1375		g_raid_disk_done(bp);
1376		return;
1377	}
1378	disk = sd->sd_disk;
1379	if (disk->d_state != G_RAID_DISK_S_ACTIVE &&
1380	    disk->d_state != G_RAID_DISK_S_FAILED) {
1381		G_RAID_LOGREQ(0, bp, "Warning! I/O request to a disk in a "
1382		    "wrong state (%s)!", g_raid_disk_state2str(disk->d_state));
1383		goto nodisk;
1384	}
1385
1386	cp = disk->d_consumer;
1387	bp->bio_from = cp;
1388	bp->bio_to = cp->provider;
1389	cp->index++;
1390
1391	/* Update average disks load. */
1392	TAILQ_FOREACH(tdisk, &sd->sd_softc->sc_disks, d_next) {
1393		if (tdisk->d_consumer == NULL)
1394			tdisk->d_load = 0;
1395		else
1396			tdisk->d_load = (tdisk->d_consumer->index *
1397			    G_RAID_SUBDISK_LOAD_SCALE + tdisk->d_load * 7) / 8;
1398	}
1399
1400	disk->d_last_offset = bp->bio_offset + bp->bio_length;
1401	if (dumping) {
1402		G_RAID_LOGREQ(3, bp, "Sending dumping request.");
1403		if (bp->bio_cmd == BIO_WRITE) {
1404			bp->bio_error = g_raid_subdisk_kerneldump(sd,
1405			    bp->bio_data, 0, bp->bio_offset, bp->bio_length);
1406		} else
1407			bp->bio_error = EOPNOTSUPP;
1408		g_raid_disk_done(bp);
1409	} else {
1410		bp->bio_done = g_raid_disk_done;
1411		bp->bio_offset += sd->sd_offset;
1412		G_RAID_LOGREQ(3, bp, "Sending request.");
1413		g_io_request(bp, cp);
1414	}
1415}
1416
1417int
1418g_raid_subdisk_kerneldump(struct g_raid_subdisk *sd,
1419    void *virtual, vm_offset_t physical, off_t offset, size_t length)
1420{
1421
1422	if (sd->sd_disk == NULL)
1423		return (ENXIO);
1424	if (sd->sd_disk->d_kd.di.dumper == NULL)
1425		return (EOPNOTSUPP);
1426	return (dump_write(&sd->sd_disk->d_kd.di,
1427	    virtual, physical,
1428	    sd->sd_disk->d_kd.di.mediaoffset + sd->sd_offset + offset,
1429	    length));
1430}
1431
1432static void
1433g_raid_disk_done(struct bio *bp)
1434{
1435	struct g_raid_softc *sc;
1436	struct g_raid_subdisk *sd;
1437
1438	sd = bp->bio_caller1;
1439	sc = sd->sd_softc;
1440	mtx_lock(&sc->sc_queue_mtx);
1441	bioq_insert_tail(&sc->sc_queue, bp);
1442	mtx_unlock(&sc->sc_queue_mtx);
1443	if (!dumping)
1444		wakeup(sc);
1445}
1446
1447static void
1448g_raid_disk_done_request(struct bio *bp)
1449{
1450	struct g_raid_softc *sc;
1451	struct g_raid_disk *disk;
1452	struct g_raid_subdisk *sd;
1453	struct g_raid_volume *vol;
1454
1455	g_topology_assert_not();
1456
1457	G_RAID_LOGREQ(3, bp, "Disk request done: %d.", bp->bio_error);
1458	sd = bp->bio_caller1;
1459	sc = sd->sd_softc;
1460	vol = sd->sd_volume;
1461	if (bp->bio_from != NULL) {
1462		bp->bio_from->index--;
1463		disk = bp->bio_from->private;
1464		if (disk == NULL)
1465			g_raid_kill_consumer(sc, bp->bio_from);
1466	}
1467	bp->bio_offset -= sd->sd_offset;
1468
1469	G_RAID_TR_IODONE(vol->v_tr, sd, bp);
1470}
1471
1472static void
1473g_raid_handle_event(struct g_raid_softc *sc, struct g_raid_event *ep)
1474{
1475
1476	if ((ep->e_flags & G_RAID_EVENT_VOLUME) != 0)
1477		ep->e_error = g_raid_update_volume(ep->e_tgt, ep->e_event);
1478	else if ((ep->e_flags & G_RAID_EVENT_DISK) != 0)
1479		ep->e_error = g_raid_update_disk(ep->e_tgt, ep->e_event);
1480	else if ((ep->e_flags & G_RAID_EVENT_SUBDISK) != 0)
1481		ep->e_error = g_raid_update_subdisk(ep->e_tgt, ep->e_event);
1482	else
1483		ep->e_error = g_raid_update_node(ep->e_tgt, ep->e_event);
1484	if ((ep->e_flags & G_RAID_EVENT_WAIT) == 0) {
1485		KASSERT(ep->e_error == 0,
1486		    ("Error cannot be handled."));
1487		g_raid_event_free(ep);
1488	} else {
1489		ep->e_flags |= G_RAID_EVENT_DONE;
1490		G_RAID_DEBUG1(4, sc, "Waking up %p.", ep);
1491		mtx_lock(&sc->sc_queue_mtx);
1492		wakeup(ep);
1493		mtx_unlock(&sc->sc_queue_mtx);
1494	}
1495}
1496
/*
 * Worker thread.
 *
 * One instance runs per node ('arg' is the node's softc).  The thread
 * holds sc_lock exclusively for its whole life and releases it only
 * around the sleep.  Each loop iteration handles exactly one of, in
 * priority order: a queued event, a queued bio, or idle processing.
 * The loop is left only through g_raid_destroy_node(), which is called
 * once sc_stopping reaches G_RAID_DESTROY_HARD.
 */
static void
g_raid_worker(void *arg)
{
	struct g_raid_softc *sc;
	struct g_raid_event *ep;
	struct g_raid_volume *vol;
	struct bio *bp;
	struct timeval now, t;
	int timeout, rv;

	sc = arg;
	thread_lock(curthread);
	sched_prio(curthread, PRIBIO);
	thread_unlock(curthread);

	sx_xlock(&sc->sc_lock);
	for (;;) {
		mtx_lock(&sc->sc_queue_mtx);
		/*
		 * First take a look at events.
		 * This is important to handle events before any I/O requests.
		 */
		bp = NULL;
		vol = NULL;
		rv = 0;
		ep = TAILQ_FIRST(&sc->sc_events);
		if (ep != NULL)
			TAILQ_REMOVE(&sc->sc_events, ep, e_next);
		else if ((bp = bioq_takefirst(&sc->sc_queue)) != NULL)
			;
		else {
			/*
			 * Nothing queued.  Find the oldest completion time
			 * among volumes that have a transformation module and
			 * no in-flight requests; 't' ends up as that time
			 * minus 'now' (zero or negative).
			 */
			getmicrouptime(&now);
			t = now;
			TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
				if (bioq_first(&vol->v_inflight) == NULL &&
				    vol->v_tr &&
				    timevalcmp(&vol->v_last_done, &t, < ))
					t = vol->v_last_done;
			}
			timevalsub(&t, &now);
			/*
			 * Microseconds left until that volume has been idle
			 * for g_raid_idle_threshold.
			 */
			timeout = g_raid_idle_threshold +
			    t.tv_sec * 1000000 + t.tv_usec;
			if (timeout > 0) {
				/*
				 * Two steps to avoid overflows at HZ=1000
				 * and idle timeouts > 2.1s.  Some rounding
				 * errors can occur, but they are < 1tick,
				 * which is deemed to be close enough for
				 * this purpose.
				 */
				int micpertic = 1000000 / hz;
				timeout = (timeout + micpertic - 1) / micpertic;
				/*
				 * Sleep without sc_lock.  PDROP is passed for
				 * the queue mutex, so this path jumps past the
				 * mtx_unlock() below.  NOTE(review): MSLEEP is
				 * defined elsewhere; it presumably stores the
				 * sleep result in 'rv' - confirm.
				 */
				sx_xunlock(&sc->sc_lock);
				MSLEEP(rv, sc, &sc->sc_queue_mtx,
				    PRIBIO | PDROP, "-", timeout);
				sx_xlock(&sc->sc_lock);
				goto process;
			} else
				/* Threshold already reached: go idle now. */
				rv = EWOULDBLOCK;
		}
		mtx_unlock(&sc->sc_queue_mtx);
process:
		if (ep != NULL) {
			g_raid_handle_event(sc, ep);
		} else if (bp != NULL) {
			/*
			 * A bio addressed to one of our own providers is a
			 * new request; anything else is a completion coming
			 * back from a member disk.
			 */
			if (bp->bio_to != NULL &&
			    bp->bio_to->geom == sc->sc_geom)
				g_raid_start_request(bp);
			else
				g_raid_disk_done_request(bp);
		} else if (rv == EWOULDBLOCK) {
			/*
			 * Idle processing: call g_raid_clean() on every
			 * volume, and notify the transformation module of
			 * each volume that has had no in-flight requests for
			 * at least g_raid_idle_threshold.
			 */
			TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
				g_raid_clean(vol, -1);
				if (bioq_first(&vol->v_inflight) == NULL &&
				    vol->v_tr) {
					t.tv_sec = g_raid_idle_threshold / 1000000;
					t.tv_usec = g_raid_idle_threshold % 1000000;
					timevaladd(&t, &vol->v_last_done);
					getmicrouptime(&now);
					if (timevalcmp(&t, &now, <= )) {
						G_RAID_TR_IDLE(vol->v_tr);
						/* Restart the idle timer. */
						vol->v_last_done = now;
					}
				}
			}
		}
		if (sc->sc_stopping == G_RAID_DESTROY_HARD)
			g_raid_destroy_node(sc, 1);	/* May not return. */
	}
}
1590
1591static void
1592g_raid_poll(struct g_raid_softc *sc)
1593{
1594	struct g_raid_event *ep;
1595	struct bio *bp;
1596
1597	sx_xlock(&sc->sc_lock);
1598	mtx_lock(&sc->sc_queue_mtx);
1599	/*
1600	 * First take a look at events.
1601	 * This is important to handle events before any I/O requests.
1602	 */
1603	ep = TAILQ_FIRST(&sc->sc_events);
1604	if (ep != NULL) {
1605		TAILQ_REMOVE(&sc->sc_events, ep, e_next);
1606		mtx_unlock(&sc->sc_queue_mtx);
1607		g_raid_handle_event(sc, ep);
1608		goto out;
1609	}
1610	bp = bioq_takefirst(&sc->sc_queue);
1611	if (bp != NULL) {
1612		mtx_unlock(&sc->sc_queue_mtx);
1613		if (bp->bio_from == NULL ||
1614		    bp->bio_from->geom != sc->sc_geom)
1615			g_raid_start_request(bp);
1616		else
1617			g_raid_disk_done_request(bp);
1618	}
1619out:
1620	sx_xunlock(&sc->sc_lock);
1621}
1622
1623static void
1624g_raid_launch_provider(struct g_raid_volume *vol)
1625{
1626	struct g_raid_disk *disk;
1627	struct g_raid_subdisk *sd;
1628	struct g_raid_softc *sc;
1629	struct g_provider *pp;
1630	char name[G_RAID_MAX_VOLUMENAME];
1631	char   announce_buf[80], buf1[32];
1632	off_t off;
1633	int i;
1634
1635	sc = vol->v_softc;
1636	sx_assert(&sc->sc_lock, SX_LOCKED);
1637
1638	g_topology_lock();
1639	/* Try to name provider with volume name. */
1640	snprintf(name, sizeof(name), "raid/%s", vol->v_name);
1641	if (g_raid_name_format == 0 || vol->v_name[0] == 0 ||
1642	    g_provider_by_name(name) != NULL) {
1643		/* Otherwise use sequential volume number. */
1644		snprintf(name, sizeof(name), "raid/r%d", vol->v_global_id);
1645	}
1646
1647	/*
1648	 * Create a /dev/ar%d that the old ataraid(4) stack once
1649	 * created as an alias for /dev/raid/r%d if requested.
1650	 * This helps going from stable/7 ataraid devices to newer
1651	 * FreeBSD releases. sbruno 07 MAY 2013
1652	 */
1653
1654        if (ar_legacy_aliases) {
1655		snprintf(announce_buf, sizeof(announce_buf),
1656                        "kern.devalias.%s", name);
1657                snprintf(buf1, sizeof(buf1),
1658                        "ar%d", vol->v_global_id);
1659                kern_setenv(announce_buf, buf1);
1660        }
1661
1662	pp = g_new_providerf(sc->sc_geom, "%s", name);
1663	pp->flags |= G_PF_DIRECT_RECEIVE;
1664	if (vol->v_tr->tro_class->trc_accept_unmapped) {
1665		pp->flags |= G_PF_ACCEPT_UNMAPPED;
1666		for (i = 0; i < vol->v_disks_count; i++) {
1667			sd = &vol->v_subdisks[i];
1668			if (sd->sd_state == G_RAID_SUBDISK_S_NONE)
1669				continue;
1670			if ((sd->sd_disk->d_consumer->provider->flags &
1671			    G_PF_ACCEPT_UNMAPPED) == 0)
1672				pp->flags &= ~G_PF_ACCEPT_UNMAPPED;
1673		}
1674	}
1675	pp->private = vol;
1676	pp->mediasize = vol->v_mediasize;
1677	pp->sectorsize = vol->v_sectorsize;
1678	pp->stripesize = 0;
1679	pp->stripeoffset = 0;
1680	if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 ||
1681	    vol->v_raid_level == G_RAID_VOLUME_RL_RAID3 ||
1682	    vol->v_raid_level == G_RAID_VOLUME_RL_SINGLE ||
1683	    vol->v_raid_level == G_RAID_VOLUME_RL_CONCAT) {
1684		if ((disk = vol->v_subdisks[0].sd_disk) != NULL &&
1685		    disk->d_consumer != NULL &&
1686		    disk->d_consumer->provider != NULL) {
1687			pp->stripesize = disk->d_consumer->provider->stripesize;
1688			off = disk->d_consumer->provider->stripeoffset;
1689			pp->stripeoffset = off + vol->v_subdisks[0].sd_offset;
1690			if (off > 0)
1691				pp->stripeoffset %= off;
1692		}
1693		if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID3) {
1694			pp->stripesize *= (vol->v_disks_count - 1);
1695			pp->stripeoffset *= (vol->v_disks_count - 1);
1696		}
1697	} else
1698		pp->stripesize = vol->v_strip_size;
1699	vol->v_provider = pp;
1700	g_error_provider(pp, 0);
1701	g_topology_unlock();
1702	G_RAID_DEBUG1(0, sc, "Provider %s for volume %s created.",
1703	    pp->name, vol->v_name);
1704}
1705
1706static void
1707g_raid_destroy_provider(struct g_raid_volume *vol)
1708{
1709	struct g_raid_softc *sc;
1710	struct g_provider *pp;
1711	struct bio *bp, *tmp;
1712
1713	g_topology_assert_not();
1714	sc = vol->v_softc;
1715	pp = vol->v_provider;
1716	KASSERT(pp != NULL, ("NULL provider (volume=%s).", vol->v_name));
1717
1718	g_topology_lock();
1719	g_error_provider(pp, ENXIO);
1720	mtx_lock(&sc->sc_queue_mtx);
1721	TAILQ_FOREACH_SAFE(bp, &sc->sc_queue.queue, bio_queue, tmp) {
1722		if (bp->bio_to != pp)
1723			continue;
1724		bioq_remove(&sc->sc_queue, bp);
1725		g_io_deliver(bp, ENXIO);
1726	}
1727	mtx_unlock(&sc->sc_queue_mtx);
1728	G_RAID_DEBUG1(0, sc, "Provider %s for volume %s destroyed.",
1729	    pp->name, vol->v_name);
1730	g_wither_provider(pp, ENXIO);
1731	g_topology_unlock();
1732	vol->v_provider = NULL;
1733}
1734
1735/*
1736 * Update device state.
1737 */
1738static int
1739g_raid_update_volume(struct g_raid_volume *vol, u_int event)
1740{
1741	struct g_raid_softc *sc;
1742
1743	sc = vol->v_softc;
1744	sx_assert(&sc->sc_lock, SX_XLOCKED);
1745
1746	G_RAID_DEBUG1(2, sc, "Event %s for volume %s.",
1747	    g_raid_volume_event2str(event),
1748	    vol->v_name);
1749	switch (event) {
1750	case G_RAID_VOLUME_E_DOWN:
1751		if (vol->v_provider != NULL)
1752			g_raid_destroy_provider(vol);
1753		break;
1754	case G_RAID_VOLUME_E_UP:
1755		if (vol->v_provider == NULL)
1756			g_raid_launch_provider(vol);
1757		break;
1758	case G_RAID_VOLUME_E_START:
1759		if (vol->v_tr)
1760			G_RAID_TR_START(vol->v_tr);
1761		return (0);
1762	default:
1763		if (sc->sc_md)
1764			G_RAID_MD_VOLUME_EVENT(sc->sc_md, vol, event);
1765		return (0);
1766	}
1767
1768	/* Manage root mount release. */
1769	if (vol->v_starting) {
1770		vol->v_starting = 0;
1771		G_RAID_DEBUG1(1, sc, "root_mount_rel %p", vol->v_rootmount);
1772		root_mount_rel(vol->v_rootmount);
1773		vol->v_rootmount = NULL;
1774	}
1775	if (vol->v_stopping && vol->v_provider_open == 0)
1776		g_raid_destroy_volume(vol);
1777	return (0);
1778}
1779
1780/*
1781 * Update subdisk state.
1782 */
1783static int
1784g_raid_update_subdisk(struct g_raid_subdisk *sd, u_int event)
1785{
1786	struct g_raid_softc *sc;
1787	struct g_raid_volume *vol;
1788
1789	sc = sd->sd_softc;
1790	vol = sd->sd_volume;
1791	sx_assert(&sc->sc_lock, SX_XLOCKED);
1792
1793	G_RAID_DEBUG1(2, sc, "Event %s for subdisk %s:%d-%s.",
1794	    g_raid_subdisk_event2str(event),
1795	    vol->v_name, sd->sd_pos,
1796	    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
1797	if (vol->v_tr)
1798		G_RAID_TR_EVENT(vol->v_tr, sd, event);
1799
1800	return (0);
1801}
1802
1803/*
1804 * Update disk state.
1805 */
1806static int
1807g_raid_update_disk(struct g_raid_disk *disk, u_int event)
1808{
1809	struct g_raid_softc *sc;
1810
1811	sc = disk->d_softc;
1812	sx_assert(&sc->sc_lock, SX_XLOCKED);
1813
1814	G_RAID_DEBUG1(2, sc, "Event %s for disk %s.",
1815	    g_raid_disk_event2str(event),
1816	    g_raid_get_diskname(disk));
1817
1818	if (sc->sc_md)
1819		G_RAID_MD_EVENT(sc->sc_md, disk, event);
1820	return (0);
1821}
1822
1823/*
1824 * Node event.
1825 */
1826static int
1827g_raid_update_node(struct g_raid_softc *sc, u_int event)
1828{
1829	sx_assert(&sc->sc_lock, SX_XLOCKED);
1830
1831	G_RAID_DEBUG1(2, sc, "Event %s for the array.",
1832	    g_raid_node_event2str(event));
1833
1834	if (event == G_RAID_NODE_E_WAKE)
1835		return (0);
1836	if (sc->sc_md)
1837		G_RAID_MD_EVENT(sc->sc_md, NULL, event);
1838	return (0);
1839}
1840
/*
 * GEOM access method for volume providers.
 *
 * Called with the topology lock held.  The topology lock is dropped while
 * sc_lock is held and re-taken before returning.  acr/acw/ace are the
 * read/write/exclusive count deltas of the request.
 *
 * Returns 0 on success, ENXIO for a new open on a node that is being
 * stopped, or EROFS for a write open on a read-only volume.
 */
static int
g_raid_access(struct g_provider *pp, int acr, int acw, int ace)
{
	struct g_raid_volume *vol;
	struct g_raid_softc *sc;
	int dcw, opens, error = 0;

	g_topology_assert();
	sc = pp->geom->softc;
	vol = pp->private;
	KASSERT(sc != NULL, ("NULL softc (provider=%s).", pp->name));
	KASSERT(vol != NULL, ("NULL volume (provider=%s).", pp->name));

	G_RAID_DEBUG1(2, sc, "Access request for %s: r%dw%de%d.", pp->name,
	    acr, acw, ace);
	/* Writer count the provider will have once this request applies. */
	dcw = pp->acw + acw;

	g_topology_unlock();
	sx_xlock(&sc->sc_lock);
	/* Deny new opens while dying. */
	if (sc->sc_stopping != 0 && (acr > 0 || acw > 0 || ace > 0)) {
		error = ENXIO;
		goto out;
	}
	/* Deny write opens for read-only volumes. */
	if (vol->v_read_only && acw > 0) {
		error = EROFS;
		goto out;
	}
	/* No writers will remain after this request. */
	if (dcw == 0)
		g_raid_clean(vol, dcw);
	vol->v_provider_open += acr + acw + ace;
	/* Handle delayed node destruction. */
	if (sc->sc_stopping == G_RAID_DESTROY_DELAYED &&
	    vol->v_provider_open == 0) {
		/* Count open volumes. */
		opens = g_raid_nopens(sc);
		if (opens == 0) {
			sc->sc_stopping = G_RAID_DESTROY_HARD;
			/* Wake up worker to make it selfdestruct. */
			g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0);
		}
	}
	/* Handle open volume destruction. */
	if (vol->v_stopping && vol->v_provider_open == 0)
		g_raid_destroy_volume(vol);
out:
	sx_xunlock(&sc->sc_lock);
	g_topology_lock();
	return (error);
}
1892
1893struct g_raid_softc *
1894g_raid_create_node(struct g_class *mp,
1895    const char *name, struct g_raid_md_object *md)
1896{
1897	struct g_raid_softc *sc;
1898	struct g_geom *gp;
1899	int error;
1900
1901	g_topology_assert();
1902	G_RAID_DEBUG(1, "Creating array %s.", name);
1903
1904	gp = g_new_geomf(mp, "%s", name);
1905	sc = malloc(sizeof(*sc), M_RAID, M_WAITOK | M_ZERO);
1906	gp->start = g_raid_start;
1907	gp->orphan = g_raid_orphan;
1908	gp->access = g_raid_access;
1909	gp->dumpconf = g_raid_dumpconf;
1910
1911	sc->sc_md = md;
1912	sc->sc_geom = gp;
1913	sc->sc_flags = 0;
1914	TAILQ_INIT(&sc->sc_volumes);
1915	TAILQ_INIT(&sc->sc_disks);
1916	sx_init(&sc->sc_lock, "graid:lock");
1917	mtx_init(&sc->sc_queue_mtx, "graid:queue", NULL, MTX_DEF);
1918	TAILQ_INIT(&sc->sc_events);
1919	bioq_init(&sc->sc_queue);
1920	gp->softc = sc;
1921	error = kproc_create(g_raid_worker, sc, &sc->sc_worker, 0, 0,
1922	    "g_raid %s", name);
1923	if (error != 0) {
1924		G_RAID_DEBUG(0, "Cannot create kernel thread for %s.", name);
1925		mtx_destroy(&sc->sc_queue_mtx);
1926		sx_destroy(&sc->sc_lock);
1927		g_destroy_geom(sc->sc_geom);
1928		free(sc, M_RAID);
1929		return (NULL);
1930	}
1931
1932	G_RAID_DEBUG1(0, sc, "Array %s created.", name);
1933	return (sc);
1934}
1935
1936struct g_raid_volume *
1937g_raid_create_volume(struct g_raid_softc *sc, const char *name, int id)
1938{
1939	struct g_raid_volume	*vol, *vol1;
1940	int i;
1941
1942	G_RAID_DEBUG1(1, sc, "Creating volume %s.", name);
1943	vol = malloc(sizeof(*vol), M_RAID, M_WAITOK | M_ZERO);
1944	vol->v_softc = sc;
1945	strlcpy(vol->v_name, name, G_RAID_MAX_VOLUMENAME);
1946	vol->v_state = G_RAID_VOLUME_S_STARTING;
1947	vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN;
1948	vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_UNKNOWN;
1949	vol->v_rotate_parity = 1;
1950	bioq_init(&vol->v_inflight);
1951	bioq_init(&vol->v_locked);
1952	LIST_INIT(&vol->v_locks);
1953	for (i = 0; i < G_RAID_MAX_SUBDISKS; i++) {
1954		vol->v_subdisks[i].sd_softc = sc;
1955		vol->v_subdisks[i].sd_volume = vol;
1956		vol->v_subdisks[i].sd_pos = i;
1957		vol->v_subdisks[i].sd_state = G_RAID_DISK_S_NONE;
1958	}
1959
1960	/* Find free ID for this volume. */
1961	g_topology_lock();
1962	vol1 = vol;
1963	if (id >= 0) {
1964		LIST_FOREACH(vol1, &g_raid_volumes, v_global_next) {
1965			if (vol1->v_global_id == id)
1966				break;
1967		}
1968	}
1969	if (vol1 != NULL) {
1970		for (id = 0; ; id++) {
1971			LIST_FOREACH(vol1, &g_raid_volumes, v_global_next) {
1972				if (vol1->v_global_id == id)
1973					break;
1974			}
1975			if (vol1 == NULL)
1976				break;
1977		}
1978	}
1979	vol->v_global_id = id;
1980	LIST_INSERT_HEAD(&g_raid_volumes, vol, v_global_next);
1981	g_topology_unlock();
1982
1983	/* Delay root mounting. */
1984	vol->v_rootmount = root_mount_hold("GRAID");
1985	G_RAID_DEBUG1(1, sc, "root_mount_hold %p", vol->v_rootmount);
1986	vol->v_starting = 1;
1987	TAILQ_INSERT_TAIL(&sc->sc_volumes, vol, v_next);
1988	return (vol);
1989}
1990
1991struct g_raid_disk *
1992g_raid_create_disk(struct g_raid_softc *sc)
1993{
1994	struct g_raid_disk	*disk;
1995
1996	G_RAID_DEBUG1(1, sc, "Creating disk.");
1997	disk = malloc(sizeof(*disk), M_RAID, M_WAITOK | M_ZERO);
1998	disk->d_softc = sc;
1999	disk->d_state = G_RAID_DISK_S_NONE;
2000	TAILQ_INIT(&disk->d_subdisks);
2001	TAILQ_INSERT_TAIL(&sc->sc_disks, disk, d_next);
2002	return (disk);
2003}
2004
2005int g_raid_start_volume(struct g_raid_volume *vol)
2006{
2007	struct g_raid_tr_class *class;
2008	struct g_raid_tr_object *obj;
2009	int status;
2010
2011	G_RAID_DEBUG1(2, vol->v_softc, "Starting volume %s.", vol->v_name);
2012	LIST_FOREACH(class, &g_raid_tr_classes, trc_list) {
2013		if (!class->trc_enable)
2014			continue;
2015		G_RAID_DEBUG1(2, vol->v_softc,
2016		    "Tasting volume %s for %s transformation.",
2017		    vol->v_name, class->name);
2018		obj = (void *)kobj_create((kobj_class_t)class, M_RAID,
2019		    M_WAITOK);
2020		obj->tro_class = class;
2021		obj->tro_volume = vol;
2022		status = G_RAID_TR_TASTE(obj, vol);
2023		if (status != G_RAID_TR_TASTE_FAIL)
2024			break;
2025		kobj_delete((kobj_t)obj, M_RAID);
2026	}
2027	if (class == NULL) {
2028		G_RAID_DEBUG1(0, vol->v_softc,
2029		    "No transformation module found for %s.",
2030		    vol->v_name);
2031		vol->v_tr = NULL;
2032		g_raid_change_volume_state(vol, G_RAID_VOLUME_S_UNSUPPORTED);
2033		g_raid_event_send(vol, G_RAID_VOLUME_E_DOWN,
2034		    G_RAID_EVENT_VOLUME);
2035		return (-1);
2036	}
2037	G_RAID_DEBUG1(2, vol->v_softc,
2038	    "Transformation module %s chosen for %s.",
2039	    class->name, vol->v_name);
2040	vol->v_tr = obj;
2041	return (0);
2042}
2043
/*
 * Tear down a node.
 *
 * Marks the node for hard destruction, then tries to destroy all volumes
 * and all disks.  If any of them cannot be destroyed yet, EBUSY is
 * returned and the remaining steps are not performed.  Otherwise the
 * metadata module and the geom are released.
 *
 * 'worker' is nonzero when called from the node's own worker thread (see
 * g_raid_worker()): in that case the softc itself is freed and the
 * process exits, so the call does not return.  When 'worker' is zero the
 * worker is only woken up so that it performs that final step itself,
 * and 0 is returned.
 */
int
g_raid_destroy_node(struct g_raid_softc *sc, int worker)
{
	struct g_raid_volume *vol, *tmpv;
	struct g_raid_disk *disk, *tmpd;
	int error = 0;

	sc->sc_stopping = G_RAID_DESTROY_HARD;
	/* Try every volume even if an earlier one is still busy. */
	TAILQ_FOREACH_SAFE(vol, &sc->sc_volumes, v_next, tmpv) {
		if (g_raid_destroy_volume(vol))
			error = EBUSY;
	}
	if (error)
		return (error);
	TAILQ_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tmpd) {
		if (g_raid_destroy_disk(disk))
			error = EBUSY;
	}
	if (error)
		return (error);
	if (sc->sc_md) {
		G_RAID_MD_FREE(sc->sc_md);
		kobj_delete((kobj_t)sc->sc_md, M_RAID);
		sc->sc_md = NULL;
	}
	if (sc->sc_geom != NULL) {
		G_RAID_DEBUG1(0, sc, "Array %s destroyed.", sc->sc_name);
		g_topology_lock();
		/* Detach the softc from the geom before withering it. */
		sc->sc_geom->softc = NULL;
		g_wither_geom(sc->sc_geom, ENXIO);
		g_topology_unlock();
		sc->sc_geom = NULL;
	} else
		G_RAID_DEBUG(1, "Array destroyed.");
	if (worker) {
		g_raid_event_cancel(sc, sc);
		mtx_destroy(&sc->sc_queue_mtx);
		sx_xunlock(&sc->sc_lock);
		sx_destroy(&sc->sc_lock);
		/* Release g_raid_destroy(), which sleeps on &sc_stopping. */
		wakeup(&sc->sc_stopping);
		free(sc, M_RAID);
		curthread->td_pflags &= ~TDP_GEOM;
		G_RAID_DEBUG(1, "Thread exiting.");
		kproc_exit(0);
	} else {
		/* Wake up worker to make it selfdestruct. */
		g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0);
	}
	return (0);
}
2094
/*
 * Destroy a volume, or start stopping it if that is not yet possible.
 *
 * The volume is always marked as stopping.  EBUSY is returned, with the
 * volume left in place, when:
 *  - the volume is not STOPPED and has a transformation module (the
 *    module is asked to stop);
 *  - g_raid_event_check() reports events for the volume;
 *  - the provider still exists or is still open.
 * Otherwise the transformation object, the root-mount hold, the list
 * linkage, the subdisks' links to their disks and the volume itself are
 * released and 0 is returned.
 */
int
g_raid_destroy_volume(struct g_raid_volume *vol)
{
	struct g_raid_softc *sc;
	struct g_raid_disk *disk;
	int i;

	sc = vol->v_softc;
	G_RAID_DEBUG1(2, sc, "Destroying volume %s.", vol->v_name);
	vol->v_stopping = 1;
	if (vol->v_state != G_RAID_VOLUME_S_STOPPED) {
		if (vol->v_tr) {
			/* Ask the module to stop; the caller must retry. */
			G_RAID_TR_STOP(vol->v_tr);
			return (EBUSY);
		} else
			vol->v_state = G_RAID_VOLUME_S_STOPPED;
	}
	if (g_raid_event_check(sc, vol) != 0)
		return (EBUSY);
	if (vol->v_provider != NULL)
		return (EBUSY);
	if (vol->v_provider_open != 0)
		return (EBUSY);
	if (vol->v_tr) {
		G_RAID_TR_FREE(vol->v_tr);
		kobj_delete((kobj_t)vol->v_tr, M_RAID);
		vol->v_tr = NULL;
	}
	/* Drop the root-mount hold if it has not been released yet. */
	if (vol->v_rootmount)
		root_mount_rel(vol->v_rootmount);
	g_topology_lock();
	LIST_REMOVE(vol, v_global_next);
	g_topology_unlock();
	TAILQ_REMOVE(&sc->sc_volumes, vol, v_next);
	for (i = 0; i < G_RAID_MAX_SUBDISKS; i++) {
		g_raid_event_cancel(sc, &vol->v_subdisks[i]);
		disk = vol->v_subdisks[i].sd_disk;
		if (disk == NULL)
			continue;
		TAILQ_REMOVE(&disk->d_subdisks, &vol->v_subdisks[i], sd_next);
	}
	G_RAID_DEBUG1(2, sc, "Volume %s destroyed.", vol->v_name);
	if (sc->sc_md)
		G_RAID_MD_FREE_VOLUME(sc->sc_md, vol);
	g_raid_event_cancel(sc, vol);
	free(vol, M_RAID);
	if (sc->sc_stopping == G_RAID_DESTROY_HARD) {
		/* Wake up worker to let it selfdestruct. */
		g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0);
	}
	return (0);
}
2147
2148int
2149g_raid_destroy_disk(struct g_raid_disk *disk)
2150{
2151	struct g_raid_softc *sc;
2152	struct g_raid_subdisk *sd, *tmp;
2153
2154	sc = disk->d_softc;
2155	G_RAID_DEBUG1(2, sc, "Destroying disk.");
2156	if (disk->d_consumer) {
2157		g_raid_kill_consumer(sc, disk->d_consumer);
2158		disk->d_consumer = NULL;
2159	}
2160	TAILQ_FOREACH_SAFE(sd, &disk->d_subdisks, sd_next, tmp) {
2161		g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE);
2162		g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED,
2163		    G_RAID_EVENT_SUBDISK);
2164		TAILQ_REMOVE(&disk->d_subdisks, sd, sd_next);
2165		sd->sd_disk = NULL;
2166	}
2167	TAILQ_REMOVE(&sc->sc_disks, disk, d_next);
2168	if (sc->sc_md)
2169		G_RAID_MD_FREE_DISK(sc->sc_md, disk);
2170	g_raid_event_cancel(sc, disk);
2171	free(disk, M_RAID);
2172	return (0);
2173}
2174
2175int
2176g_raid_destroy(struct g_raid_softc *sc, int how)
2177{
2178	int error, opens;
2179
2180	g_topology_assert_not();
2181	if (sc == NULL)
2182		return (ENXIO);
2183	sx_assert(&sc->sc_lock, SX_XLOCKED);
2184
2185	/* Count open volumes. */
2186	opens = g_raid_nopens(sc);
2187
2188	/* React on some opened volumes. */
2189	if (opens > 0) {
2190		switch (how) {
2191		case G_RAID_DESTROY_SOFT:
2192			G_RAID_DEBUG1(1, sc,
2193			    "%d volumes are still open.",
2194			    opens);
2195			sx_xunlock(&sc->sc_lock);
2196			return (EBUSY);
2197		case G_RAID_DESTROY_DELAYED:
2198			G_RAID_DEBUG1(1, sc,
2199			    "Array will be destroyed on last close.");
2200			sc->sc_stopping = G_RAID_DESTROY_DELAYED;
2201			sx_xunlock(&sc->sc_lock);
2202			return (EBUSY);
2203		case G_RAID_DESTROY_HARD:
2204			G_RAID_DEBUG1(1, sc,
2205			    "%d volumes are still open.",
2206			    opens);
2207		}
2208	}
2209
2210	/* Mark node for destruction. */
2211	sc->sc_stopping = G_RAID_DESTROY_HARD;
2212	/* Wake up worker to let it selfdestruct. */
2213	g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0);
2214	/* Sleep until node destroyed. */
2215	error = sx_sleep(&sc->sc_stopping, &sc->sc_lock,
2216	    PRIBIO | PDROP, "r:destroy", hz * 3);
2217	return (error == EWOULDBLOCK ? EBUSY : 0);
2218}
2219
2220static void
2221g_raid_taste_orphan(struct g_consumer *cp)
2222{
2223
2224	KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
2225	    cp->provider->name));
2226}
2227
2228static struct g_geom *
2229g_raid_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
2230{
2231	struct g_consumer *cp;
2232	struct g_geom *gp, *geom;
2233	struct g_raid_md_class *class;
2234	struct g_raid_md_object *obj;
2235	int status;
2236
2237	g_topology_assert();
2238	g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
2239	if (!g_raid_enable)
2240		return (NULL);
2241	G_RAID_DEBUG(2, "Tasting provider %s.", pp->name);
2242
2243	geom = NULL;
2244	status = G_RAID_MD_TASTE_FAIL;
2245	gp = g_new_geomf(mp, "raid:taste");
2246	/*
2247	 * This orphan function should be never called.
2248	 */
2249	gp->orphan = g_raid_taste_orphan;
2250	cp = g_new_consumer(gp);
2251	cp->flags |= G_CF_DIRECT_RECEIVE;
2252	g_attach(cp, pp);
2253	if (g_access(cp, 1, 0, 0) != 0)
2254		goto ofail;
2255
2256	LIST_FOREACH(class, &g_raid_md_classes, mdc_list) {
2257		if (!class->mdc_enable)
2258			continue;
2259		G_RAID_DEBUG(2, "Tasting provider %s for %s metadata.",
2260		    pp->name, class->name);
2261		obj = (void *)kobj_create((kobj_class_t)class, M_RAID,
2262		    M_WAITOK);
2263		obj->mdo_class = class;
2264		status = G_RAID_MD_TASTE(obj, mp, cp, &geom);
2265		if (status != G_RAID_MD_TASTE_NEW)
2266			kobj_delete((kobj_t)obj, M_RAID);
2267		if (status != G_RAID_MD_TASTE_FAIL)
2268			break;
2269	}
2270
2271	if (status == G_RAID_MD_TASTE_FAIL)
2272		(void)g_access(cp, -1, 0, 0);
2273ofail:
2274	g_detach(cp);
2275	g_destroy_consumer(cp);
2276	g_destroy_geom(gp);
2277	G_RAID_DEBUG(2, "Tasting provider %s done.", pp->name);
2278	return (geom);
2279}
2280
2281int
2282g_raid_create_node_format(const char *format, struct gctl_req *req,
2283    struct g_geom **gp)
2284{
2285	struct g_raid_md_class *class;
2286	struct g_raid_md_object *obj;
2287	int status;
2288
2289	G_RAID_DEBUG(2, "Creating array for %s metadata.", format);
2290	LIST_FOREACH(class, &g_raid_md_classes, mdc_list) {
2291		if (strcasecmp(class->name, format) == 0)
2292			break;
2293	}
2294	if (class == NULL) {
2295		G_RAID_DEBUG(1, "No support for %s metadata.", format);
2296		return (G_RAID_MD_TASTE_FAIL);
2297	}
2298	obj = (void *)kobj_create((kobj_class_t)class, M_RAID,
2299	    M_WAITOK);
2300	obj->mdo_class = class;
2301	status = G_RAID_MD_CREATE_REQ(obj, &g_raid_class, req, gp);
2302	if (status != G_RAID_MD_TASTE_NEW)
2303		kobj_delete((kobj_t)obj, M_RAID);
2304	return (status);
2305}
2306
2307static int
2308g_raid_destroy_geom(struct gctl_req *req __unused,
2309    struct g_class *mp __unused, struct g_geom *gp)
2310{
2311	struct g_raid_softc *sc;
2312	int error;
2313
2314	g_topology_unlock();
2315	sc = gp->softc;
2316	sx_xlock(&sc->sc_lock);
2317	g_cancel_event(sc);
2318	error = g_raid_destroy(gp->softc, G_RAID_DESTROY_SOFT);
2319	g_topology_lock();
2320	return (error);
2321}
2322
2323void g_raid_write_metadata(struct g_raid_softc *sc, struct g_raid_volume *vol,
2324    struct g_raid_subdisk *sd, struct g_raid_disk *disk)
2325{
2326
2327	if (sc->sc_stopping == G_RAID_DESTROY_HARD)
2328		return;
2329	if (sc->sc_md)
2330		G_RAID_MD_WRITE(sc->sc_md, vol, sd, disk);
2331}
2332
2333void g_raid_fail_disk(struct g_raid_softc *sc,
2334    struct g_raid_subdisk *sd, struct g_raid_disk *disk)
2335{
2336
2337	if (disk == NULL)
2338		disk = sd->sd_disk;
2339	if (disk == NULL) {
2340		G_RAID_DEBUG1(0, sc, "Warning! Fail request to an absent disk!");
2341		return;
2342	}
2343	if (disk->d_state != G_RAID_DISK_S_ACTIVE) {
2344		G_RAID_DEBUG1(0, sc, "Warning! Fail request to a disk in a "
2345		    "wrong state (%s)!", g_raid_disk_state2str(disk->d_state));
2346		return;
2347	}
2348	if (sc->sc_md)
2349		G_RAID_MD_FAIL_DISK(sc->sc_md, sd, disk);
2350}
2351
2352static void
2353g_raid_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
2354    struct g_consumer *cp, struct g_provider *pp)
2355{
2356	struct g_raid_softc *sc;
2357	struct g_raid_volume *vol;
2358	struct g_raid_subdisk *sd;
2359	struct g_raid_disk *disk;
2360	int i, s;
2361
2362	g_topology_assert();
2363
2364	sc = gp->softc;
2365	if (sc == NULL)
2366		return;
2367	if (pp != NULL) {
2368		vol = pp->private;
2369		g_topology_unlock();
2370		sx_xlock(&sc->sc_lock);
2371		sbuf_printf(sb, "%s<descr>%s %s volume</descr>\n", indent,
2372		    sc->sc_md->mdo_class->name,
2373		    g_raid_volume_level2str(vol->v_raid_level,
2374		    vol->v_raid_level_qualifier));
2375		sbuf_printf(sb, "%s<Label>%s</Label>\n", indent,
2376		    vol->v_name);
2377		sbuf_printf(sb, "%s<RAIDLevel>%s</RAIDLevel>\n", indent,
2378		    g_raid_volume_level2str(vol->v_raid_level,
2379		    vol->v_raid_level_qualifier));
2380		sbuf_printf(sb,
2381		    "%s<Transformation>%s</Transformation>\n", indent,
2382		    vol->v_tr ? vol->v_tr->tro_class->name : "NONE");
2383		sbuf_printf(sb, "%s<Components>%u</Components>\n", indent,
2384		    vol->v_disks_count);
2385		sbuf_printf(sb, "%s<Strip>%u</Strip>\n", indent,
2386		    vol->v_strip_size);
2387		sbuf_printf(sb, "%s<State>%s</State>\n", indent,
2388		    g_raid_volume_state2str(vol->v_state));
2389		sbuf_printf(sb, "%s<Dirty>%s</Dirty>\n", indent,
2390		    vol->v_dirty ? "Yes" : "No");
2391		sbuf_printf(sb, "%s<Subdisks>", indent);
2392		for (i = 0; i < vol->v_disks_count; i++) {
2393			sd = &vol->v_subdisks[i];
2394			if (sd->sd_disk != NULL &&
2395			    sd->sd_disk->d_consumer != NULL) {
2396				sbuf_printf(sb, "%s ",
2397				    g_raid_get_diskname(sd->sd_disk));
2398			} else {
2399				sbuf_printf(sb, "NONE ");
2400			}
2401			sbuf_printf(sb, "(%s",
2402			    g_raid_subdisk_state2str(sd->sd_state));
2403			if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
2404			    sd->sd_state == G_RAID_SUBDISK_S_RESYNC) {
2405				sbuf_printf(sb, " %d%%",
2406				    (int)(sd->sd_rebuild_pos * 100 /
2407				     sd->sd_size));
2408			}
2409			sbuf_printf(sb, ")");
2410			if (i + 1 < vol->v_disks_count)
2411				sbuf_printf(sb, ", ");
2412		}
2413		sbuf_printf(sb, "</Subdisks>\n");
2414		sx_xunlock(&sc->sc_lock);
2415		g_topology_lock();
2416	} else if (cp != NULL) {
2417		disk = cp->private;
2418		if (disk == NULL)
2419			return;
2420		g_topology_unlock();
2421		sx_xlock(&sc->sc_lock);
2422		sbuf_printf(sb, "%s<State>%s", indent,
2423		    g_raid_disk_state2str(disk->d_state));
2424		if (!TAILQ_EMPTY(&disk->d_subdisks)) {
2425			sbuf_printf(sb, " (");
2426			TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
2427				sbuf_printf(sb, "%s",
2428				    g_raid_subdisk_state2str(sd->sd_state));
2429				if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
2430				    sd->sd_state == G_RAID_SUBDISK_S_RESYNC) {
2431					sbuf_printf(sb, " %d%%",
2432					    (int)(sd->sd_rebuild_pos * 100 /
2433					     sd->sd_size));
2434				}
2435				if (TAILQ_NEXT(sd, sd_next))
2436					sbuf_printf(sb, ", ");
2437			}
2438			sbuf_printf(sb, ")");
2439		}
2440		sbuf_printf(sb, "</State>\n");
2441		sbuf_printf(sb, "%s<Subdisks>", indent);
2442		TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
2443			sbuf_printf(sb, "r%d(%s):%d@%ju",
2444			    sd->sd_volume->v_global_id,
2445			    sd->sd_volume->v_name,
2446			    sd->sd_pos, sd->sd_offset);
2447			if (TAILQ_NEXT(sd, sd_next))
2448				sbuf_printf(sb, ", ");
2449		}
2450		sbuf_printf(sb, "</Subdisks>\n");
2451		sbuf_printf(sb, "%s<ReadErrors>%d</ReadErrors>\n", indent,
2452		    disk->d_read_errs);
2453		sx_xunlock(&sc->sc_lock);
2454		g_topology_lock();
2455	} else {
2456		g_topology_unlock();
2457		sx_xlock(&sc->sc_lock);
2458		if (sc->sc_md) {
2459			sbuf_printf(sb, "%s<Metadata>%s</Metadata>\n", indent,
2460			    sc->sc_md->mdo_class->name);
2461		}
2462		if (!TAILQ_EMPTY(&sc->sc_volumes)) {
2463			s = 0xff;
2464			TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
2465				if (vol->v_state < s)
2466					s = vol->v_state;
2467			}
2468			sbuf_printf(sb, "%s<State>%s</State>\n", indent,
2469			    g_raid_volume_state2str(s));
2470		}
2471		sx_xunlock(&sc->sc_lock);
2472		g_topology_lock();
2473	}
2474}
2475
2476static void
2477g_raid_shutdown_post_sync(void *arg, int howto)
2478{
2479	struct g_class *mp;
2480	struct g_geom *gp, *gp2;
2481	struct g_raid_softc *sc;
2482	struct g_raid_volume *vol;
2483
2484	mp = arg;
2485	DROP_GIANT();
2486	g_topology_lock();
2487	g_raid_shutdown = 1;
2488	LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) {
2489		if ((sc = gp->softc) == NULL)
2490			continue;
2491		g_topology_unlock();
2492		sx_xlock(&sc->sc_lock);
2493		TAILQ_FOREACH(vol, &sc->sc_volumes, v_next)
2494			g_raid_clean(vol, -1);
2495		g_cancel_event(sc);
2496		g_raid_destroy(sc, G_RAID_DESTROY_DELAYED);
2497		g_topology_lock();
2498	}
2499	g_topology_unlock();
2500	PICKUP_GIANT();
2501}
2502
2503static void
2504g_raid_init(struct g_class *mp)
2505{
2506
2507	g_raid_post_sync = EVENTHANDLER_REGISTER(shutdown_post_sync,
2508	    g_raid_shutdown_post_sync, mp, SHUTDOWN_PRI_FIRST);
2509	if (g_raid_post_sync == NULL)
2510		G_RAID_DEBUG(0, "Warning! Cannot register shutdown event.");
2511	g_raid_started = 1;
2512}
2513
2514static void
2515g_raid_fini(struct g_class *mp)
2516{
2517
2518	if (g_raid_post_sync != NULL)
2519		EVENTHANDLER_DEREGISTER(shutdown_post_sync, g_raid_post_sync);
2520	g_raid_started = 0;
2521}
2522
2523int
2524g_raid_md_modevent(module_t mod, int type, void *arg)
2525{
2526	struct g_raid_md_class *class, *c, *nc;
2527	int error;
2528
2529	error = 0;
2530	class = arg;
2531	switch (type) {
2532	case MOD_LOAD:
2533		c = LIST_FIRST(&g_raid_md_classes);
2534		if (c == NULL || c->mdc_priority > class->mdc_priority)
2535			LIST_INSERT_HEAD(&g_raid_md_classes, class, mdc_list);
2536		else {
2537			while ((nc = LIST_NEXT(c, mdc_list)) != NULL &&
2538			    nc->mdc_priority < class->mdc_priority)
2539				c = nc;
2540			LIST_INSERT_AFTER(c, class, mdc_list);
2541		}
2542		if (g_raid_started)
2543			g_retaste(&g_raid_class);
2544		break;
2545	case MOD_UNLOAD:
2546		LIST_REMOVE(class, mdc_list);
2547		break;
2548	default:
2549		error = EOPNOTSUPP;
2550		break;
2551	}
2552
2553	return (error);
2554}
2555
2556int
2557g_raid_tr_modevent(module_t mod, int type, void *arg)
2558{
2559	struct g_raid_tr_class *class, *c, *nc;
2560	int error;
2561
2562	error = 0;
2563	class = arg;
2564	switch (type) {
2565	case MOD_LOAD:
2566		c = LIST_FIRST(&g_raid_tr_classes);
2567		if (c == NULL || c->trc_priority > class->trc_priority)
2568			LIST_INSERT_HEAD(&g_raid_tr_classes, class, trc_list);
2569		else {
2570			while ((nc = LIST_NEXT(c, trc_list)) != NULL &&
2571			    nc->trc_priority < class->trc_priority)
2572				c = nc;
2573			LIST_INSERT_AFTER(c, class, trc_list);
2574		}
2575		break;
2576	case MOD_UNLOAD:
2577		LIST_REMOVE(class, trc_list);
2578		break;
2579	default:
2580		error = EOPNOTSUPP;
2581		break;
2582	}
2583
2584	return (error);
2585}
2586
/*
 * Use a local implementation of DECLARE_GEOM_CLASS(g_raid_class, g_raid)
 * to reduce the module priority, allowing submodules to register
 * themselves first.
 */
2591static moduledata_t g_raid_mod = {
2592	"g_raid",
2593	g_modevent,
2594	&g_raid_class
2595};
2596DECLARE_MODULE(g_raid, g_raid_mod, SI_SUB_DRIVERS, SI_ORDER_THIRD);
2597MODULE_VERSION(geom_raid, 0);
2598